Skip to content

Commit 005a4ba

Browse files
committed
added error handling for bspline x-dir, other steps temporarily blocked
1 parent 514c03c commit 005a4ba

File tree

6 files changed

+161
-54
lines changed

6 files changed

+161
-54
lines changed

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,10 @@ set_property(TARGET aprObjLib PROPERTY POSITION_INDEPENDENT_CODE ON)
209209

210210
if(APR_USE_CUDA)
211211
message(STATUS "APR: Building CUDA for APR")
212+
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
212213
set(CMAKE_CUDA_STANDARD 14)
213214
set(CMAKE_CUDA_RUNTIME_LIBRARY "Static")
214-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA")
215+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Wno-deprecated-gpu-targets -Xptxas -v -DAPR_USE_CUDA")
215216
set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling
216217
set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G")
217218
if(APR_BENCHMARK)

src/algorithm/APRConverter.hpp

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef __APR_CONVERTER_HPP__
1010
#define __APR_CONVERTER_HPP__
1111

12+
#include <future>
1213
#include <list>
1314

1415
#include "AutoParameters.hpp"
@@ -74,6 +75,8 @@ class APRConverter {
7475
#ifdef APR_USE_CUDA
7576
template <typename T>
7677
bool get_apr_cuda(APR &aAPR, PixelData<T> &input_image);
78+
template <typename T>
79+
bool get_apr_cuda_streams(APR &aAPR, PixelData<T> &input_image);
7780
#endif
7881

7982
bool verbose = true;
@@ -438,6 +441,118 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
438441
}
439442
#endif
440443

444+
#ifdef APR_USE_CUDA
445+
/**
446+
* Implementation of pipeline for GPU/CUDA and multiple streams
447+
* NOTE: Currently only one image is processed multiple times just get an idea how fast it can be.
448+
* Finally, it should be able to process incoming stream of data (sequence of images).
449+
*
450+
* @param aAPR - the APR data structure
451+
* @param input_image - input image
452+
*/
453+
template<typename ImageType> template<typename T>
454+
inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T>& input_image) {
455+
456+
if (!initPipelineAPR(aAPR, input_image)) return false;
457+
458+
initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
459+
460+
computation_timer.start_timer("init_mem");
461+
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
462+
463+
/////////////////////////////////
464+
/// Pipeline
465+
////////////////////////
466+
// offset image by factor (this is required if there are zero areas in the background with
467+
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
468+
// Warning both of these could result in over-flow!
469+
470+
if (std::is_same<uint16_t, ImageType>::value) {
471+
bspline_offset = 100;
472+
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
473+
} else if (std::is_same<uint8_t, ImageType>::value) {
474+
bspline_offset = 5;
475+
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
476+
} else {
477+
image_temp.copyFromMesh(input_image);
478+
}
479+
480+
481+
482+
constexpr int numOfStreams = 3;
483+
constexpr int repetitionsPerStream = 3; //
484+
APRTimer ttt(true);
485+
ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
486+
{
487+
std::vector<GpuProcessingTask<ImageType>> gpts;
488+
489+
//std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
490+
for (int i = 0; i < numOfStreams; ++i) {
491+
gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
492+
}
493+
494+
APRTimer t(true);
495+
t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
496+
{
497+
498+
APRTimer tt(false);
499+
// Create streams and send initial task to do
500+
for (int i = 0; i < numOfStreams; ++i) {
501+
// gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
502+
tt.start_timer("SEND");
503+
gpts[i].sendDataToGpu();
504+
tt.stop_timer();
505+
// std::cout << "Send " << i << std::endl;
506+
// gpts.back().processOnGpu();
507+
// std::cout << "Proc " << i << std::endl;
508+
}
509+
// Create streams and send initial task to do
510+
for (int i = 0; i < numOfStreams; ++i) {
511+
// gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
512+
tt.start_timer("Process");
513+
gpts[i].processOnGpu();
514+
tt.stop_timer();
515+
// std::cout << "Proc " << i << std::endl;
516+
}
517+
std::cout << "=========" << std::endl;
518+
519+
for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
520+
int c = i % numOfStreams;
521+
522+
// get data from previous task
523+
// gpts_futures[c].get();
524+
auto linearAccessGpu = gpts[c].getDataFromGpu();
525+
// std::cout << "Get " << c << std::endl;
526+
527+
// in theory, we get new data and send them to task
528+
if (i < numOfStreams * (repetitionsPerStream - 1)) {
529+
gpts[c].sendDataToGpu();
530+
// std::cout << "Send " << c << std::endl;
531+
gpts[c].processOnGpu();
532+
// gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
533+
// std::cout << "Proc " << c << std::endl;
534+
}
535+
536+
aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
537+
538+
// generateDatastructures(aAPR) for linearAcceess for CUDA
539+
aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
540+
aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
541+
aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
542+
aAPR.apr_initialized = true;
543+
544+
// std::cout << "CUDA pipeline finished!\n";
545+
}
546+
}
547+
auto allT = t.stop_timer();
548+
std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
549+
}
550+
auto allT = ttt.stop_timer();
551+
std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
552+
553+
return false; //TODO: change it back to true
554+
}
555+
#endif
441556

442557
/**
443558
* Implementation of pipeline for CPU
@@ -509,7 +624,8 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
509624
#ifndef APR_USE_CUDA
510625
return get_apr_cpu(aAPR, input_image);
511626
#else
512-
return get_apr_cuda(aAPR, input_image);
627+
// return get_apr_cuda(aAPR, input_image);
628+
return get_apr_cuda_streams(aAPR, input_image);
513629
#endif
514630
}
515631

src/algorithm/APRParameters.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class APRParameters {
5757
os << "sigma_th_max=" << obj.sigma_th_max << "\n";
5858
os << "auto_parameters=" << (obj.auto_parameters ? "true" : "false") << "\n";
5959
os << "neighborhood_optimization=" << (obj.neighborhood_optimization ? "true" : "false") << "\n";
60+
os << "constant_intensity_scale=" << (obj.constant_intensity_scale ? "true" : "false") << "\n";
6061
os << "output_steps=" << (obj.output_steps ? "true" : "false") << "\n";
6162

6263
return os;

src/algorithm/ComputeGradientCuda.cu

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -174,22 +174,22 @@ void getGradientCuda(const PixelData<ImgType> &image, PixelData<float> &local_sc
174174
isErrorDetected = false;
175175
isErrorDetectedCuda.copyH2D();
176176
if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, isErrorDetectedCuda.get(), aStream);
177-
if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream);
178-
if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
179-
isErrorDetectedCuda.copyD2H();
180-
if (isErrorDetected) {
181-
throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - "
182-
"try squashing the input image to a narrower range or use APRConverter<float>");
183-
}
184-
185-
186-
runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream);
187-
188-
runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream);
189-
190-
if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
191-
if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
192-
if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
177+
if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, isErrorDetectedCuda.get(), aStream);
178+
// if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
179+
// isErrorDetectedCuda.copyD2H();
180+
// if (isErrorDetected) {
181+
// throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - "
182+
// "try squashing the input image to a narrower range or use APRConverter<float>");
183+
// }
184+
//
185+
//
186+
// runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream);
187+
//
188+
// runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream);
189+
//
190+
// if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
191+
// if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
192+
// if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
193193
}
194194

195195
class CurrentTime {
@@ -361,27 +361,27 @@ public:
361361
splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda,
362362
iBsplineOffset, iParameters, iStream);
363363
time.stop_timer();
364-
time.start_timer("intensity");
365-
runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream);
366-
time.stop_timer();
367-
368-
369-
// Apply parameters from APRConverter:
370-
time.start_timer("runs....");
371-
runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
372-
runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
373-
runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
374-
// TODO: automatic parameters are not implemented for GPU pipeline (yet)
375-
time.stop_timer();
376-
377-
time.start_timer("compute lev");
378-
float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
379-
float level_factor = pow(2, iMaxLevel) * min_dim;
380-
const float mult_const = level_factor/iParameters.rel_error;
381-
runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream);
382-
time.stop_timer();
383-
computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
384-
computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream);
364+
// time.start_timer("intensity");
365+
// runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream);
366+
// time.stop_timer();
367+
//
368+
//
369+
// // Apply parameters from APRConverter:
370+
// time.start_timer("runs....");
371+
// runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
372+
// runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
373+
// runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
374+
// // TODO: automatic parameters are not implemented for GPU pipeline (yet)
375+
// time.stop_timer();
376+
//
377+
// time.start_timer("compute lev");
378+
// float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
379+
// float level_factor = pow(2, iMaxLevel) * min_dim;
380+
// const float mult_const = level_factor/iParameters.rel_error;
381+
// runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream);
382+
// time.stop_timer();
383+
// computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
384+
// computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream);
385385
}
386386

387387
~GpuProcessingTaskImpl() {
@@ -446,7 +446,7 @@ void cudaFilterBsplineFull(PixelData<ImgType> &input, float lambda, float tolera
446446
BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen);
447447
auto cuda = transferSpline(p, aStream);
448448
auto splineCuda = cuda.first;
449-
runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream);
449+
runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, error.get(), aStream);
450450
}
451451
if (flags & BSPLINE_Z_DIR) {
452452
BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen);

src/algorithm/LocalIntensityScale.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -503,7 +503,7 @@ void runLocalIntensityScalePipeline(const PixelData<T> &image, const APRParamete
503503
bool constant_scale = false;
504504

505505
if (par.constant_intensity_scale || (lis.number_active_dimensions == 0)) {
506-
// include the case where the local intensity scale doesn't make sense due to the image being to small.
506+
// include the case where the local intensity scale doesn't make sense due to the image being too small.
507507
// (This is for just edge cases and sanity checking)
508508
constant_scale = true;
509509
}

src/algorithm/bsplineXdir.cuh

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -127,24 +127,13 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, BsplineParamsCuda p, boo
127127
* Function for launching a kernel
128128
*/
129129
template<typename T>
130-
void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) {
130+
void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, bool *error, cudaStream_t aStream) {
131131
constexpr int numOfWorkersYdir = 128;
132132
dim3 threadsPerBlockX(1, numOfWorkersYdir, 1);
133133
dim3 numBlocksX(1,
134134
(dim.y + threadsPerBlockX.y - 1) / threadsPerBlockX.y,
135135
(dim.z + threadsPerBlockX.z - 1) / threadsPerBlockX.z);
136-
// In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more then one kernel
137-
// access it but this is enough for us to know that somewhere in one on more kernels overflow was detected.
138-
bool isErrorDetected = false;
139-
{
140-
ScopedCudaMemHandler<bool*, H2D | D2H> error(&isErrorDetected, 1, aStream);
141-
bsplineXdir<T> <<<numBlocksX, threadsPerBlockX, 0, aStream>>>(cudaImage, dim, p, error.get());
142-
}
143-
144-
if (isErrorDetected) {
145-
throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineXdir - "
146-
"try squashing the input image to a narrower range or use APRConverter<float>");
147-
}
136+
bsplineXdir<T> <<<numBlocksX, threadsPerBlockX, 0, aStream>>>(cudaImage, dim, p, error);
148137
}
149138

150139
#endif

0 commit comments

Comments
 (0)