Skip to content

Commit 9572e10

Browse files
committed
added error handling for bspline y-dir
1 parent 9604c63 commit 9572e10

File tree

2 files changed

+34
-19
lines changed

2 files changed

+34
-19
lines changed

src/algorithm/ComputeGradientCuda.cu

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -166,13 +166,21 @@ template <typename ImgType>
166166
void getGradientCuda(const PixelData<ImgType> &image, PixelData<float> &local_scale_temp,
167167
ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp,
168168
BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary,
169+
bool &isErrorDetected, ScopedCudaMemHandler<bool *, JUST_ALLOC>& isErrorDetectedCuda,
169170
float bspline_offset, const APRParameters &par, cudaStream_t aStream) {
170171

171172
// TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim
172173

173-
if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream);
174+
isErrorDetected = false;
175+
isErrorDetectedCuda.copyH2D();
176+
if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, isErrorDetectedCuda.get(), aStream);
174177
if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream);
175178
if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
179+
isErrorDetectedCuda.copyD2H();
180+
if (isErrorDetected) {
181+
throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - "
182+
"try squashing the input image to a narrower range or use APRConverter<float>");
183+
}
176184

177185

178186
runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream);
@@ -273,6 +281,8 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
273281
BsplineParamsCuda splineCudaX;
274282
BsplineParamsCuda splineCudaY;
275283
BsplineParamsCuda splineCudaZ;
284+
bool isErrorDetected;
285+
ScopedCudaMemHandler<bool *, JUST_ALLOC> isErrorDetectedCuda;
276286

277287
const size_t boundaryLen;
278288
ScopedCudaMemHandler<float*, JUST_ALLOC> boundary;
@@ -311,6 +321,7 @@ public:
311321
cudax(transferSpline(prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance), iStream)),
312322
cuday(transferSpline(prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance), iStream)),
313323
cudaz(transferSpline(prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance), iStream)),
324+
isErrorDetectedCuda(&isErrorDetected, 1, iStream),
314325
boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
315326
boundary{nullptr, boundaryLen, iStream},
316327
pctc(iAprInfo, iStream),
@@ -347,7 +358,7 @@ public:
347358
CudaTimer time(false, "PIPELINE");
348359
time.start_timer("getgradient");
349360
getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(),
350-
splineCudaX, splineCudaY, splineCudaZ, boundary.get(),
361+
splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda,
351362
iBsplineOffset, iParameters, iStream);
352363
time.stop_timer();
353364
time.start_timer("intensity");
@@ -420,14 +431,16 @@ void cudaFilterBsplineFull(PixelData<ImgType> &input, float lambda, float tolera
420431
ScopedCudaMemHandler<PixelData<ImgType>, D2H | H2D> cudaInput(input, aStream);
421432

422433
APRTimer timer(false);
434+
bool isErrorDetected = false;
435+
ScopedCudaMemHandler<bool*, H2D | D2H> error(&isErrorDetected, 1, aStream);
423436
timer.start_timer("GpuDeviceTimeFull");
424437
if (flags & BSPLINE_Y_DIR) {
425438
BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen);
426439
auto cuda = transferSpline(p, aStream);
427440
auto splineCuda = cuda.first;
428441
int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num;
429442
ScopedCudaMemHandler<float*, JUST_ALLOC> boundary(nullptr, boundaryLen, aStream); // allocate memory on device
430-
runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), aStream);
443+
runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), error.get(), aStream);
431444
}
432445
if (flags & BSPLINE_X_DIR) {
433446
BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen);
@@ -441,6 +454,14 @@ void cudaFilterBsplineFull(PixelData<ImgType> &input, float lambda, float tolera
441454
auto splineCuda = cuda.first;
442455
runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream);
443456
}
457+
458+
waitForCuda();
459+
460+
if (isErrorDetected) {
461+
throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - "
462+
"try squashing the input image to a narrower range or use APRConverter<float>");
463+
}
464+
444465
timer.stop_timer();
445466
}
446467

@@ -510,9 +531,12 @@ void getGradient(PixelData<ImgType> &image, PixelData<ImgType> &grad_temp, Pixel
510531
BsplineParams pz = prepareBsplineStuff(image.z_num, par.lambda, tolerance);
511532
auto cudaz = transferSpline(pz, aStream);
512533
auto splineCudaZ = cudaz.first;
513-
514-
getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(),
515-
splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, aStream);
534+
bool isErrorDetected = false;
535+
{
536+
ScopedCudaMemHandler<bool*, JUST_ALLOC> isErrorDetectedCuda(&isErrorDetected, 1, aStream);
537+
getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(),
538+
splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda, bspline_offset, par, aStream);
539+
}
516540
}
517541

518542
void cudaDownsampledGradient(PixelData<float> &input, PixelData<float> &grad, const float hx, const float hy, const float hz) {

src/algorithm/bsplineYdir.cuh

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -242,22 +242,13 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, BsplinePara
242242
* Function for launching a kernel
243243
*/
244244
template <typename T>
245-
void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float *boundary, cudaStream_t aStream) {
245+
void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float *boundary, bool *error, cudaStream_t aStream) {
246246

247247
dim3 threadsPerBlock(numOfThreads);
248248
dim3 numBlocks((dim.x * dim.z + threadsPerBlock.x - 1) / threadsPerBlock.x);
249249
size_t sharedMemSize = (2 /*bc vectors*/) * (p.k0) * sizeof(float) + numOfThreads * (p.k0) * sizeof(float);
250-
bool isErrorDetected = false;
251-
{
252-
ScopedCudaMemHandler<bool *, H2D | D2H> error(&isErrorDetected, 1, aStream);
253-
bsplineYdirBoundary<T> <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get());
254-
sharedMemSize = numOfThreads * blockWidth * sizeof(float);
255-
bsplineYdirProcess<T> <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get());
256-
}
257-
258-
if (isErrorDetected) {
259-
throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineYdir - "
260-
"try squashing the input image to a narrower range or use APRConverter<float>");
261-
}
250+
bsplineYdirBoundary<T> <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error);
251+
sharedMemSize = numOfThreads * blockWidth * sizeof(float);
252+
bsplineYdirProcess<T> <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error);
262253
}
263254
#endif

0 commit comments

Comments
 (0)