Bspline Offset is now computed on GPU + copy of original image for sampling

krzysg · krzysg · commit 4093552d786d · 2025-09-09T13:59:49.000+02:00
diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp
@@ -76,8 +76,6 @@ class APRConverter {
     template <typename T>
     bool get_apr_cuda(APR &aAPR, PixelData<T> &input_image);
     template <typename T>
-    bool get_apr_cuda_streams(APR &aAPR, PixelData<T> &input_image);
-    template <typename T>
     bool get_apr_cuda_multistreams(APR &aAPR, const std::vector<PixelData<T> *> &input_images, int numOfStreams = 3);
 #endif
 
@@ -406,25 +404,13 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
     initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
 
     computation_timer.start_timer("init_mem");
-    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
+    PixelData<ImageType> image_temp(input_image, true /* copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
 
     /////////////////////////////////
     /// Pipeline
     ////////////////////////
-    // offset image by factor (this is required if there are zero areas in the background with
-    // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
-    // Warning both of these could result in over-flow!
-
-    if (std::is_floating_point<ImageType>::value) {
-        image_temp.copyFromMesh(input_image);
-    } else {
-        bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    }
 
     GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, aAPR.level_max());
-    // std::cout << "after gpt \n";
-    gpt.setBsplineOffset(bspline_offset);
     gpt.processOnGpu();
     auto linearAccessGpu = gpt.getDataFromGpu();
 
@@ -462,7 +448,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
         return false;
     }
 
-    // Reduce number of streams to number of images if there are less images than streams
+    // Reduce number of streams to number of images if there are fewer images than streams
     if (numOfImages < numOfStreams) numOfStreams = numOfImages;
 
     // Use first image to initialize the APR - all other images should have the same dimensions
@@ -476,7 +462,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
     std::vector<PixelData<ImageType>> tempImages;
     std::cout << "allocating PixelData for " << numOfStreams << " streams" << std::endl;
     for (int i = 0; i < numOfStreams; ++i) {
-        tempImages.emplace_back(PixelData<T>(*input_image, false /* don't copy */, true /* pinned memory */));
+        tempImages.emplace_back(PixelData<T>(*input_image, true /* copy */, true /* pinned memory */));
     }
 
      /////////////////////////////////
@@ -497,22 +483,10 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
     t.start_timer("GPU processing...");
     // Saturate all the streams with first images
     for (int i = 0; i < numOfStreams; ++i) {
-
-        // offset image by factor (this is required if there are zero areas in the background with
-        // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
-        // Warning both of these could result in over-flow!
-        if (std::is_floating_point<ImageType>::value) {
-            tempImages[i].copyFromMesh(*input_images[i]);
-        } else {
-            bspline_offset = compute_bspline_offset<ImageType>(*input_images[i], par.lambda);
-            tempImages[i].copyFromMeshWithUnaryOp(*input_images[i], [=](const auto &a) { return (a + bspline_offset); });
-        }
         std::cout << "Processing image " << i << " on stream " << i  << std::endl;
-        gpts[i].setBsplineOffset(bspline_offset);
         gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
     }
 
-
     // Main loop - get results from GPU and send new images to the streams (if any left)
     for (int s = 0; s < numOfImages; ++s) {
         int streamNum = s % numOfStreams;
@@ -525,14 +499,8 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
         // We have 'numOfImages - numOfStreams' left to process after saturating the streams with first images
         if (s  < numOfImages - numOfStreams) {
             int imageToProcess = s + numOfStreams;
-            if (std::is_floating_point<ImageType>::value) {
-                tempImages[streamNum].copyFromMesh(*input_images[imageToProcess]);
-            } else {
-                bspline_offset = compute_bspline_offset<ImageType>(*input_images[imageToProcess], par.lambda);
-                tempImages[streamNum].copyFromMeshWithUnaryOp(*input_images[imageToProcess], [=](const auto &a) { return (a + bspline_offset); });
-            }
+            tempImages[streamNum].copyFromMesh(*input_images[imageToProcess]);
             std::cout << "Processing image " << imageToProcess << " on stream " << streamNum << std::endl;
-            gpts[streamNum].setBsplineOffset(bspline_offset);
             gpts_futures[streamNum] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[streamNum]);
         }
 
@@ -554,97 +522,6 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
     std::cout << "CUDA multistream pipeline finished!\n";
     return true;
 }
-
-/**
- * Implementation of pipeline for GPU/CUDA and multiple streams
- * NOTE: Currently only one image is processed multiple times just get an idea how fast it can be.
- *       Finally, it should be able to process incoming stream of data (sequence of images).
- *
- * @param aAPR - the APR data structure
- * @param input_image - input image
- */
-template<typename ImageType> template<typename T>
-inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T>& input_image) {
-    // Initialize APR and memory for the pipeline
-    if (!initPipelineAPR(aAPR, input_image)) return false;
-    initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
-    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full size copy of the image)
-
-    /////////////////////////////////
-    /// Pipeline
-    /////////////////////////////////
-
-    // offset image by factor (this is required if there are zero areas in the background with
-    // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
-    // Warning both of these could result in over-flow!
-    if (std::is_floating_point<ImageType>::value) {
-        image_temp.copyFromMesh(input_image);
-    } else {
-        bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    }
-
-    // Run input on the GPU streams
-    constexpr int numOfStreams = 3; // number of streams to use for parallel processing
-    constexpr int repetitionsPerStream = 3; // number of repetitions per stream to simulate processing of multiple images
-
-    APRTimer ttt(true);
-
-    ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
-    {
-        APRTimer t(true);
-        std::vector<GpuProcessingTask<ImageType>> gpts;
-
-        t.start_timer("Creating GPTS");
-        std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
-        for (int i = 0; i < numOfStreams; ++i) {
-            gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, aAPR.level_max()));
-        }
-        t.stop_timer();
-
-        t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
-        {
-            APRTimer tt(false);
-            // Run processOnGpu() asynchronously - it will handle transfering data from CPU to GPU and run whole pipeline
-            for (int i = 0; i < numOfStreams; ++i) {
-                gpts[i].setBsplineOffset(bspline_offset);
-                gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
-            }
-
-            for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
-                int c = i % numOfStreams;
-
-                // Get data from GpuProcessingTask - get() will block until the task is finished
-                gpts_futures[c].get();
-                auto linearAccessGpu = gpts[c].getDataFromGpu();
-
-                // in theory, we get new data and send them to task
-                if (i  < numOfStreams * (repetitionsPerStream - 1)) {
-                    gpts[c].setBsplineOffset(bspline_offset);
-                    gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
-                }
-
-                // Fill APR data structure with data from GPU
-                aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
-                aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec);
-                aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec);
-                aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec);
-
-                aAPR.apr_initialized = true;
-            }
-        }
-        auto allT = t.stop_timer();
-        std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
-        std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n";
-    }
-    auto allT = ttt.stop_timer();
-    float tpi = allT / (numOfStreams*repetitionsPerStream);
-    std::cout << "Time per image: " << tpi << " seconds\n";
-    std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n";
-    std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n";
-
-    return true;
-}
 #endif
 
 
@@ -719,8 +596,7 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
     return get_apr_cpu(aAPR, input_image);
 #else
     // return get_apr_cuda(aAPR, input_image);
-    // return get_apr_cuda_streams(aAPR, input_image);
-    std::vector<PixelData<T> *> input_images(3*66, &input_image);
+    std::vector<PixelData<T> *> input_images(1, &input_image);
     return get_apr_cuda_multistreams(aAPR, input_images, 3);
 #endif
 }
diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu
@@ -289,6 +289,55 @@ void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cu
     rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax);
 }
 
+/**
+ * Compute bspline offset for APRConverter of integer type ImageType
+ */
+template<typename T>
+float computeBsplineOffset(T *cudaImage, PixelDataDim dim, float lambda, int numOfBlocks, ScopedCudaMemHandler<T*, JUST_ALLOC>  &resultsMin, ScopedCudaMemHandler<T*, JUST_ALLOC> &resultsMax, VectorData<T> &minVector, VectorData<T> &maxVector, cudaStream_t aStream) {
+
+    // if bspline smoothing is disabled, there is no need for an offset
+    if(lambda <= 0) return 0;
+
+    // Run kernel and copy data back to CPU
+    runFindMinMax(cudaImage, dim, aStream, resultsMin.get(), resultsMax.get(), numOfBlocks, numOfThreads);
+    resultsMin.copyD2H();
+    resultsMax.copyD2H();
+    checkCuda(cudaStreamSynchronize(aStream));
+
+    // compute offset to center the intensities in the ImageType range (can be negative)
+    float offset = (std::numeric_limits<T>::max() - (maxVector[0] - minVector[0])) / 2 - minVector[0];
+
+    // clamp the offset to [-100, 100]
+    return std::max(std::min(offset, 100.f), -100.f);
+}
+
+
+/**
+ * Copies each input element to 'copy' and then adds bspline_offset to that input element in place.
+ * @param input - image data, overwritten with the offset values
+ * @param copy - receives the original (un-offset) input values
+ * @param length - number of elements in the input/copy arrays
+ * @param bspline_offset - value added to every input element
+ */
+template <typename T>
+__global__ void bsplineOffsetAndCopyOriginal(T *input, T *copy, size_t length, float bspline_offset) {
+    size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (idx < length) {
+        auto v = input[idx];
+        copy[idx] = v;
+        input[idx] = v + bspline_offset;
+    }
+}
+
+template <typename ImgType>
+void runBsplineOffsetAndCopyOriginal(ImgType *cudaImage, ImgType *cudaCopy, float bspline_offset, const PixelDataDim &dim, cudaStream_t aStream) {
+    dim3 threadsPerBlock(128);
+    dim3 numBlocks((dim.size() + threadsPerBlock.x - 1)/threadsPerBlock.x);
+    bsplineOffsetAndCopyOriginal<<<numBlocks,threadsPerBlock, 0, aStream>>>(cudaImage, cudaCopy, dim.size(), bspline_offset);
+};
+
+
 class CudaStream {
     cudaStream_t iStream;
 
@@ -332,6 +381,7 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
 
     // cuda stuff - memory and stream to be used
     ScopedCudaMemHandler<const PixelData<ImgType>, JUST_ALLOC> image;
+    ScopedCudaMemHandler<const PixelData<ImgType>, JUST_ALLOC> imageSampling;
     ScopedCudaMemHandler<PixelData<ImgType>, JUST_ALLOC> gradient;
     ScopedCudaMemHandler<PixelData<float>, JUST_ALLOC> local_scale_temp;
     ScopedCudaMemHandler<PixelData<float>, JUST_ALLOC> local_scale_temp2;
@@ -373,6 +423,13 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
     GenInfoGpuAccess giga;
     uint64_t counter_total = 1;
 
+    // Preallocated memory for bspline shift computation
+    VectorData<ImgType> minVector{true};
+    VectorData<ImgType> maxVector{true};
+    ScopedCudaMemHandler<ImgType*, JUST_ALLOC> resultsMin;
+    ScopedCudaMemHandler<ImgType*, JUST_ALLOC> resultsMax;
+    int numOfBlocks;
+
 public:
 
     // TODO: Remove need for passing 'levels' to GpuProcessingTask
@@ -383,6 +440,7 @@ public:
         iCpuLevels(levels),
         iStream(cudaStream.get()),
         image (inputImage, iStream),
+        imageSampling (inputImage, iStream),
         gradient (levels, iStream),
         local_scale_temp (levels, iStream),
         local_scale_temp2 (levels, iStream),
@@ -435,13 +493,35 @@ public:
 
         isErrorDetectedPinned.resize(1);
         isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream);
+
+
+
+        // On NVIDIA GPUs the maximum number of threads per SM is a multiple of 512 (usually 1536 or 2048)
+        // Calculate the number of blocks needed to saturate all SMs
+        // Multiply it by 8 to get more blocks, for better load balancing in case the GPU is busy with other tasks
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, 0);
+        const int smCount = deviceProp.multiProcessorCount;
+        const int numOfThreadsPerSM = deviceProp.maxThreadsPerMultiProcessor;
+        constexpr int numOfThreads = 512;
+        const int numOfBlocksPerSM = numOfThreadsPerSM / 512;
+        const int maxNumberOfBlocks = smCount * numOfBlocksPerSM * 8;
+        const size_t numOfElements = inputImage.getDimension().size();
+        numOfBlocks = std::min(maxNumberOfBlocks, static_cast<int>((numOfElements + numOfThreads -1) / numOfThreads) );
+
+        minVector.resize(numOfBlocks);
+        maxVector.resize(numOfBlocks);
+        resultsMin.initialize(minVector.data(), numOfBlocks, iStream);
+        resultsMax.initialize(maxVector.data(), numOfBlocks, iStream);
     }
 
     LinearAccessCudaStructs getDataFromGpu() {
         return std::move(lacs);
     }
 
     void processOnGpu() {
+
+
         // Set it and copy first before copying the image
         // It improves *a lot* performance even though it is needed later in computeLinearStructureCuda()
         iAprInfo.total_number_particles = 0; // reset total_number_particles to 0
@@ -450,6 +530,17 @@ public:
 
         image.copyH2D();
 
+        // offset image by factor (this is required if there are zero areas in the background with
+        // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
+        // Warning both of these could result in over-flow!
+        if (std::is_floating_point<ImgType>::value) {
+            iBsplineOffset = 0;
+        } else {
+            iBsplineOffset = computeBsplineOffset(image.get(), iCpuImage.getDimension(), iParameters.lambda, numOfBlocks, resultsMin, resultsMax, minVector, maxVector, iStream);
+        }
+        runBsplineOffsetAndCopyOriginal(image.get(), imageSampling.get(), iBsplineOffset /*bspline_offset*/, iCpuImage.getDimension(), iStream);
+
+
         getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(),
                          splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetectedPinned[0], isErrorDetectedCuda,
                         iBsplineOffset, iParameters, iStream);
@@ -490,8 +581,6 @@ public:
         lacs.y_vec.copy(y_vec);
     }
 
-    void setBsplineOffset(float offset) {iBsplineOffset = offset;}
-
     ~GpuProcessingTaskImpl() {}
 };
 
@@ -511,9 +600,6 @@ LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() {return imp
 template <typename ImgType>
 void GpuProcessingTask<ImgType>::processOnGpu() {impl->processOnGpu();}
 
-template <typename ImgType>
-void GpuProcessingTask<ImgType>::setBsplineOffset(float offset) {impl->setBsplineOffset(offset);}
-
 // explicit instantiation of handled types
 template class GpuProcessingTask<uint8_t>;
 template class GpuProcessingTask<int>;
diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp
@@ -340,7 +340,6 @@ namespace {
             // Calculate pipeline on GPU
             timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
             GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, maxLevel);
-            gpt.setBsplineOffset(bspline_offset);
             gpt.processOnGpu();
             auto linearAccessGpu = gpt.getDataFromGpu();
             giGpu.total_number_particles = linearAccessGpu.y_vec.size();