Added error handling for bspline x-dir; other pipeline steps temporarily disabled (commented out)

krzysg · krzysg · commit 005a4ba53354 · 2025-06-25T10:29:09.000+02:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -209,9 +209,10 @@ set_property(TARGET aprObjLib PROPERTY POSITION_INDEPENDENT_CODE ON)
 
 if(APR_USE_CUDA)
     message(STATUS "APR: Building CUDA for APR")
+    set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
     set(CMAKE_CUDA_STANDARD 14)
     set(CMAKE_CUDA_RUNTIME_LIBRARY "Static")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Wno-deprecated-gpu-targets -Xptxas -v -DAPR_USE_CUDA")
     set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling
     set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G")
     if(APR_BENCHMARK)
diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp
@@ -9,6 +9,7 @@
 #ifndef __APR_CONVERTER_HPP__
 #define __APR_CONVERTER_HPP__
 
+#include <future>
 #include <list>
 
 #include "AutoParameters.hpp"
@@ -74,6 +75,8 @@ class APRConverter {
 #ifdef APR_USE_CUDA
     template <typename T>
     bool get_apr_cuda(APR &aAPR, PixelData<T> &input_image);
+    template <typename T>
+    bool get_apr_cuda_streams(APR &aAPR, PixelData<T> &input_image);
 #endif
 
     bool verbose = true;
@@ -438,6 +441,118 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
 }
 #endif
 
+#ifdef APR_USE_CUDA
+/**
+ * Implementation of pipeline for GPU/CUDA and multiple streams
+ * NOTE: Currently only one image is processed multiple times, just to get an idea of how fast it can be.
+ *       Eventually, it should be able to process an incoming stream of data (a sequence of images).
+ *
+ * @param aAPR - the APR data structure
+ * @param input_image - input image
+ */
+template<typename ImageType> template<typename T>
+inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T>& input_image) {
+
+    if (!initPipelineAPR(aAPR, input_image)) return false;
+
+    initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
+
+    computation_timer.start_timer("init_mem");
+    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
+
+    /////////////////////////////////
+    /// Pipeline
+    ////////////////////////
+    // Offset the image by a constant (this is required if there are zero areas in the background with
+    // uint16_t and uint8_t images, as the B-spline coefficients may otherwise be negative!)
+    // Warning: both of these offsets could result in overflow!
+
+    if (std::is_same<uint16_t, ImageType>::value) {
+        bspline_offset = 100;
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
+    } else if (std::is_same<uint8_t, ImageType>::value) {
+        bspline_offset = 5;
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
+    } else {
+        image_temp.copyFromMesh(input_image);
+    }
+
+
+
+    constexpr int numOfStreams = 3;
+    constexpr int repetitionsPerStream = 3; //
+    APRTimer ttt(true);
+    ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
+    {
+        std::vector<GpuProcessingTask<ImageType>> gpts;
+
+        //std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
+        for (int i = 0; i < numOfStreams; ++i) {
+            gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+        }
+
+        APRTimer t(true);
+        t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+        {
+
+            APRTimer tt(false);
+            // Create streams and send initial task to do
+            for (int i = 0; i < numOfStreams; ++i) {
+                // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+                tt.start_timer("SEND");
+                gpts[i].sendDataToGpu();
+                tt.stop_timer();
+                // std::cout << "Send " << i << std::endl;
+                // gpts.back().processOnGpu();
+                // std::cout << "Proc " << i << std::endl;
+            }
+            // Start GPU processing of the initial task for each stream
+            for (int i = 0; i < numOfStreams; ++i) {
+                // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
+                tt.start_timer("Process");
+                gpts[i].processOnGpu();
+                tt.stop_timer();
+                // std::cout << "Proc " << i << std::endl;
+            }
+            std::cout << "=========" << std::endl;
+
+            for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
+                int c = i % numOfStreams;
+
+                // get data from previous task
+                // gpts_futures[c].get();
+                auto linearAccessGpu = gpts[c].getDataFromGpu();
+                // std::cout << "Get  " << c << std::endl;
+
+                // in theory, we get new data and send them to task
+                if (i  < numOfStreams * (repetitionsPerStream - 1)) {
+                    gpts[c].sendDataToGpu();
+                    // std::cout << "Send " << c << std::endl;
+                    gpts[c].processOnGpu();
+                    // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
+                    // std::cout << "Proc " << c << std::endl;
+                }
+
+                aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+
+                // generateDatastructures(aAPR) for linearAccess for CUDA
+                aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+                aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+                aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+                aAPR.apr_initialized = true;
+
+                // std::cout << "CUDA pipeline finished!\n";
+            }
+        }
+        auto allT = t.stop_timer();
+        std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+    }
+    auto allT = ttt.stop_timer();
+    std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+
+    return false; //TODO: change it back to true
+}
+#endif
 
 /**
  * Implementation of pipeline for CPU
@@ -509,7 +624,8 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
 #ifndef APR_USE_CUDA
     return get_apr_cpu(aAPR, input_image);
 #else
-    return get_apr_cuda(aAPR, input_image);
+    // return get_apr_cuda(aAPR, input_image);
+    return get_apr_cuda_streams(aAPR, input_image);
 #endif
 }
 
diff --git a/src/algorithm/APRParameters.hpp b/src/algorithm/APRParameters.hpp
@@ -57,6 +57,7 @@ class APRParameters {
         os << "sigma_th_max=" << obj.sigma_th_max << "\n";
         os << "auto_parameters=" << (obj.auto_parameters ? "true" : "false") << "\n";
         os << "neighborhood_optimization=" << (obj.neighborhood_optimization ? "true" : "false") << "\n";
+        os << "constant_intensity_scale=" << (obj.constant_intensity_scale ? "true" : "false") << "\n";
         os << "output_steps=" << (obj.output_steps ? "true" : "false") << "\n";
 
 	    return os;
diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu
@@ -174,22 +174,22 @@ void getGradientCuda(const PixelData<ImgType> &image, PixelData<float> &local_sc
     isErrorDetected = false;
     isErrorDetectedCuda.copyH2D();
     if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, isErrorDetectedCuda.get(), aStream);
-    if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream);
-    if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
-    isErrorDetectedCuda.copyD2H();
-    if (isErrorDetected) {
-        throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - "
-                                    "try squashing the input image to a narrower range or use APRConverter<float>");
-    }
-
-
-    runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream);
-
-    runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream);
-
-    if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
-    if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
-    if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, isErrorDetectedCuda.get(), aStream);
+    // if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
+    // isErrorDetectedCuda.copyD2H();
+    // if (isErrorDetected) {
+    //     throw std::invalid_argument("integer under-/overflow encountered in CUDA bspline(XYZ)dir - "
+    //                                 "try squashing the input image to a narrower range or use APRConverter<float>");
+    // }
+    //
+    //
+    // runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream);
+    //
+    // runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream);
+    //
+    // if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    // if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    // if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
 }
 
 class CurrentTime {
@@ -361,27 +361,27 @@ public:
                          splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetected, isErrorDetectedCuda,
                         iBsplineOffset, iParameters, iStream);
         time.stop_timer();
-        time.start_timer("intensity");
-        runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream);
-        time.stop_timer();
-
-
-        // Apply parameters from APRConverter:
-        time.start_timer("runs....");
-        runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
-        runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
-        runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
-        // TODO: automatic parameters are not implemented for GPU pipeline (yet)
-        time.stop_timer();
-
-        time.start_timer("compute lev");
-        float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
-        float level_factor = pow(2, iMaxLevel) * min_dim;
-        const float mult_const = level_factor/iParameters.rel_error;
-        runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream);
-        time.stop_timer();
-        computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
-        computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream);
+        // time.start_timer("intensity");
+        // runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream);
+        // time.stop_timer();
+        //
+        //
+        // // Apply parameters from APRConverter:
+        // time.start_timer("runs....");
+        // runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
+        // runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
+        // runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
+        // // TODO: automatic parameters are not implemented for GPU pipeline (yet)
+        // time.stop_timer();
+        //
+        // time.start_timer("compute lev");
+        // float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
+        // float level_factor = pow(2, iMaxLevel) * min_dim;
+        // const float mult_const = level_factor/iParameters.rel_error;
+        // runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream);
+        // time.stop_timer();
+        // computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
+        // computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream);
     }
 
     ~GpuProcessingTaskImpl() {
@@ -446,7 +446,7 @@ void cudaFilterBsplineFull(PixelData<ImgType> &input, float lambda, float tolera
         BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen);
         auto cuda = transferSpline(p, aStream);
         auto splineCuda = cuda.first;
-        runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream);
+        runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, error.get(), aStream);
     }
     if (flags & BSPLINE_Z_DIR) {
         BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen);
diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu
@@ -503,7 +503,7 @@ void runLocalIntensityScalePipeline(const PixelData<T> &image, const APRParamete
     bool constant_scale = false;
 
     if (par.constant_intensity_scale || (lis.number_active_dimensions == 0)) {
-        // include the case where the local intensity scale doesn't make sense due to the image being to small.
+        // include the case where the local intensity scale doesn't make sense due to the image being too small.
         // (This is for just edge cases and sanity checking)
         constant_scale = true;
     }
diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh
@@ -127,24 +127,13 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, BsplineParamsCuda p, boo
  * Function for launching a kernel
  */
 template<typename T>
-void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) {
+void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, bool *error, cudaStream_t aStream) {
     constexpr int numOfWorkersYdir = 128;
     dim3 threadsPerBlockX(1, numOfWorkersYdir, 1);
     dim3 numBlocksX(1,
                     (dim.y + threadsPerBlockX.y - 1) / threadsPerBlockX.y,
                     (dim.z + threadsPerBlockX.z - 1) / threadsPerBlockX.z);
-    // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more then one kernel
-    // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected.
-    bool isErrorDetected = false;
-    {
-        ScopedCudaMemHandler<bool*, H2D | D2H> error(&isErrorDetected, 1, aStream);
-        bsplineXdir<T> <<<numBlocksX, threadsPerBlockX, 0, aStream>>>(cudaImage, dim, p, error.get());
-    }
-
-    if (isErrorDetected) {
-        throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineXdir - "
-                                    "try squashing the input image to a narrower range or use APRConverter<float>");
-    }
+    bsplineXdir<T> <<<numBlocksX, threadsPerBlockX, 0, aStream>>>(cudaImage, dim, p, error);
 }
 
 #endif

Original file line number	Diff line number	Diff line change
`@@ -503,7 +503,7 @@ void runLocalIntensityScalePipeline(const PixelData<T> &image, const APRParamete`
`503`	`503`	`bool constant_scale = false;`
`504`	`504`
`505`	`505`	`if (par.constant_intensity_scale \|\| (lis.number_active_dimensions == 0)) {`
`506`		`- // include the case where the local intensity scale doesn't make sense due to the image being to small.`
	`506`	`+ // include the case where the local intensity scale doesn't make sense due to the image being too small.`
`507`	`507`	`// (This is for just edge cases and sanity checking)`
`508`	`508`	`constant_scale = true;`
`509`	`509`	`}`