@@ -166,13 +166,21 @@ template <typename ImgType>
166166void getGradientCuda (const PixelData<ImgType> &image, PixelData<float > &local_scale_temp,
167167 ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp,
168168 BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary,
169+ bool &isErrorDetected, ScopedCudaMemHandler<bool *, JUST_ALLOC>& isErrorDetectedCuda,
169170 float bspline_offset, const APRParameters &par, cudaStream_t aStream) {
170171
171172 // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim
172173
173- if (image.y_num > 2 ) runBsplineYdir (cudaImage, image.getDimension (), py, boundary, aStream);
174+ isErrorDetected = false ;
175+ isErrorDetectedCuda.copyH2D ();
176+ if (image.y_num > 2 ) runBsplineYdir (cudaImage, image.getDimension (), py, boundary, isErrorDetectedCuda.get (), aStream);
174177 if (image.x_num > 2 ) runBsplineXdir (cudaImage, image.getDimension (), px, aStream);
175178 if (image.z_num > 2 ) runBsplineZdir (cudaImage, image.getDimension (), pz, aStream);
179+ isErrorDetectedCuda.copyD2H ();
180+ if (isErrorDetected) {
181+ throw std::invalid_argument (" integer under-/overflow encountered in CUDA bspline(XYZ)dir - "
182+ " try squashing the input image to a narrower range or use APRConverter<float>" );
183+ }
176184
177185
178186 runKernelGradient (cudaImage, cudaGrad, image.getDimension (), local_scale_temp.getDimension (), par.dx , par.dy , par.dz , aStream);
@@ -273,6 +281,8 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
273281 BsplineParamsCuda splineCudaX;
274282 BsplineParamsCuda splineCudaY;
275283 BsplineParamsCuda splineCudaZ;
284+ bool isErrorDetected;
285+ ScopedCudaMemHandler<bool *, JUST_ALLOC> isErrorDetectedCuda;
276286
277287 const size_t boundaryLen;
278288 ScopedCudaMemHandler<float *, JUST_ALLOC> boundary;
@@ -311,6 +321,7 @@ public:
311321 cudax (transferSpline(prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance), iStream)),
312322 cuday (transferSpline(prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance), iStream)),
313323 cudaz (transferSpline(prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance), iStream)),
324+ isErrorDetectedCuda (&isErrorDetected, 1 , iStream),
314325 boundaryLen{(2 /* two first elements*/ + 2 /* two last elements */ ) * (size_t )inputImage.x_num * (size_t )inputImage.z_num },
315326 boundary{nullptr , boundaryLen, iStream},
316327 pctc (iAprInfo, iStream),
@@ -347,7 +358,7 @@ public:
347358 CudaTimer time (false , " PIPELINE" );
348359 time.start_timer (" getgradient" );
349360 getGradientCuda (iCpuImage, iCpuLevels, image.get (), gradient.get (), local_scale_temp.get (),
350- splineCudaX, splineCudaY, splineCudaZ, boundary.get (),
361+ splineCudaX, splineCudaY, splineCudaZ, boundary.get (), isErrorDetected, isErrorDetectedCuda,
351362 iBsplineOffset, iParameters, iStream);
352363 time.stop_timer ();
353364 time.start_timer (" intensity" );
@@ -420,14 +431,16 @@ void cudaFilterBsplineFull(PixelData<ImgType> &input, float lambda, float tolera
420431 ScopedCudaMemHandler<PixelData<ImgType>, D2H | H2D> cudaInput (input, aStream);
421432
422433 APRTimer timer (false );
434+ bool isErrorDetected = false ;
435+ ScopedCudaMemHandler<bool *, H2D | D2H> error (&isErrorDetected, 1 , aStream);
423436 timer.start_timer (" GpuDeviceTimeFull" );
424437 if (flags & BSPLINE_Y_DIR) {
425438 BsplineParams p = prepareBsplineStuff ((size_t )input.y_num , lambda, tolerance, maxFilterLen);
426439 auto cuda = transferSpline (p, aStream);
427440 auto splineCuda = cuda.first ;
428441 int boundaryLen = (2 /* two first elements*/ + 2 /* two last elements */ ) * input.x_num * input.z_num ;
429442 ScopedCudaMemHandler<float *, JUST_ALLOC> boundary (nullptr , boundaryLen, aStream); // allocate memory on device
430- runBsplineYdir (cudaInput.get (), input.getDimension (), splineCuda, boundary.get (), aStream);
443+ runBsplineYdir (cudaInput.get (), input.getDimension (), splineCuda, boundary.get (), error. get (), aStream);
431444 }
432445 if (flags & BSPLINE_X_DIR) {
433446 BsplineParams p = prepareBsplineStuff ((size_t )input.x_num , lambda, tolerance, maxFilterLen);
@@ -441,6 +454,14 @@ void cudaFilterBsplineFull(PixelData<ImgType> &input, float lambda, float tolera
441454 auto splineCuda = cuda.first ;
442455 runBsplineZdir (cudaInput.get (), input.getDimension (), splineCuda, aStream);
443456 }
457+
458+ waitForCuda ();
459+
460+ if (isErrorDetected) {
461+ throw std::invalid_argument (" integer under-/overflow encountered in CUDA bspline(XYZ)dir - "
462+ " try squashing the input image to a narrower range or use APRConverter<float>" );
463+ }
464+
444465 timer.stop_timer ();
445466}
446467
@@ -510,9 +531,12 @@ void getGradient(PixelData<ImgType> &image, PixelData<ImgType> &grad_temp, Pixel
510531 BsplineParams pz = prepareBsplineStuff (image.z_num , par.lambda , tolerance);
511532 auto cudaz = transferSpline (pz, aStream);
512533 auto splineCudaZ = cudaz.first ;
513-
514- getGradientCuda (image, local_scale_temp, cudaImage.get (), cudaGrad.get (), cudalocal_scale_temp.get (),
515- splineCudaX, splineCudaY, splineCudaZ, boundary.get (), bspline_offset, par, aStream);
534+ bool isErrorDetected = false ;
535+ {
536+ ScopedCudaMemHandler<bool *, JUST_ALLOC> isErrorDetectedCuda (&isErrorDetected, 1 , aStream);
537+ getGradientCuda (image, local_scale_temp, cudaImage.get (), cudaGrad.get (), cudalocal_scale_temp.get (),
538+ splineCudaX, splineCudaY, splineCudaZ, boundary.get (), isErrorDetected, isErrorDetectedCuda, bspline_offset, par, aStream);
539+ }
516540}
517541
518542void cudaDownsampledGradient (PixelData<float > &input, PixelData<float > &grad, const float hx, const float hy, const float hz) {
0 commit comments