#ifndef FIND_MIN_MAX_CUH
#define FIND_MIN_MAX_CUH

#include "misc/CudaTools.cuh"
#include <cuda/std/limits>

/**
 * This kernel finds the minimum and maximum values in the input data array.
 * Each block processes a portion of the data and writes the minimum and maximum
 * values it finds to the resultsMin and resultsMax arrays.
 *
 * It requires 2*numOfThreads*sizeof(T) bytes of dynamic shared memory.
 *
 * @param in - input data
 * @param len - length of input data
 * @param resultsMin - output array for minimum values per block
 * @param resultsMax - output array for maximum values per block
 */
template<typename T>
__global__ void findMinMax(const T *in, const size_t len, T* resultsMin, T* resultsMax) {
    // Compute initial indices
    const int numOfThreads = blockDim.x;
    size_t idx = threadIdx.x;
    size_t globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Set pointers to shared memory for all needed buffers - the backing array is declared
    // as uint64_t so that every type used by APR (uint16_t, float, etc.) is properly aligned
    extern __shared__ uint64_t array[];
    T *minValPerThread = reinterpret_cast<T *>(array);
    T *maxValPerThread = reinterpret_cast<T *>(array) + numOfThreads;

    // Set initial values for min and max - note that lowest() is used rather than min(),
    // since for floating-point types min() is the smallest positive value, not the lowest one
    minValPerThread[idx] = cuda::std::numeric_limits<T>::max();
    maxValPerThread[idx] = cuda::std::numeric_limits<T>::lowest();

    // Read from global memory with a grid-stride loop and track a per-thread min and max
    for (size_t i = globalIdx; i < len; i += gridDim.x * blockDim.x) {
        auto val = in[i];
        if (val < minValPerThread[idx]) minValPerThread[idx] = val;
        if (val > maxValPerThread[idx]) maxValPerThread[idx] = val;
    }

    // Wait for all threads in the block to finish
    __syncthreads();

    // The first thread goes through the shared memory and finds the block-wide min and max.
    // All of that work is done by a single thread, but it is fast enough to keep the code
    // simple (see the tree-reduction sketch after this kernel for the usual alternative).
    if (idx == 0) {
        T globalMin = minValPerThread[0];
        T globalMax = maxValPerThread[0];
        for (int i = 1; i < numOfThreads; ++i) {
            auto vmin = minValPerThread[i];
            if (vmin < globalMin) globalMin = vmin;
            auto vmax = maxValPerThread[i];
            if (vmax > globalMax) globalMax = vmax;
        }

        // Store results to global memory
        resultsMin[blockIdx.x] = globalMin;
        resultsMax[blockIdx.x] = globalMax;
    }
}
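/*
 * For reference, the single-thread scan at the end of findMinMax could be replaced by a
 * standard shared-memory tree reduction. This is only an illustrative sketch (not used by
 * the kernels in this file) and it assumes blockDim.x is a power of two:
 *
 *   for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
 *       if (idx < s) {
 *           if (minValPerThread[idx + s] < minValPerThread[idx]) minValPerThread[idx] = minValPerThread[idx + s];
 *           if (maxValPerThread[idx + s] > maxValPerThread[idx]) maxValPerThread[idx] = maxValPerThread[idx + s];
 *       }
 *       __syncthreads();
 *   }
 *   if (idx == 0) { resultsMin[blockIdx.x] = minValPerThread[0]; resultsMax[blockIdx.x] = maxValPerThread[0]; }
 */
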
/**
 * This kernel takes the intermediate min and max results from each block and computes the final
 * minimum and maximum values across all blocks. Results are stored in the first element of resultsMin and resultsMax.
 *
 * It requires 2*numOfBlocks*sizeof(T) bytes of dynamic shared memory.
 *
 * @param resultsMin - intermediate minimum values from each block
 * @param resultsMax - intermediate maximum values from each block
 * @param numOfBlocks - number of blocks used in the 'findMinMax' kernel (size of resultsMin and resultsMax)
 */
template<typename T>
__global__ void findMinMaxFinal(T* resultsMin, T* resultsMax, int numOfBlocks) {

    // Set pointers to shared memory for all needed buffers - the backing array is declared
    // as uint64_t so that every type used by APR (uint16_t, float, etc.) is properly aligned
    extern __shared__ uint64_t array2[];
    T *minValPerThread = reinterpret_cast<T *>(array2);
    T *maxValPerThread = reinterpret_cast<T *>(array2) + numOfBlocks;

    size_t idx = threadIdx.x;

    // Read all per-block results into shared memory, striding across the threads
    for (size_t i = idx; i < numOfBlocks; i += blockDim.x) {
        minValPerThread[i] = resultsMin[i];
        maxValPerThread[i] = resultsMax[i];
    }

    // Wait for all threads to finish
    __syncthreads();

    // The first thread goes through the shared memory and finds the global min and max
    if (idx == 0) {
        T globalMin = minValPerThread[0];
        T globalMax = maxValPerThread[0];
        for (int i = 1; i < numOfBlocks; ++i) {
            auto vmin = minValPerThread[i];
            if (vmin < globalMin) globalMin = vmin;
            auto vmax = maxValPerThread[i];
            if (vmax > globalMax) globalMax = vmax;
        }

        // Store results to global memory
        resultsMin[0] = globalMin;
        resultsMax[0] = globalMax;
    }
}

/**
 * Compute min and max values in the cudaInput array.
 *
 * numOfBlocks and numOfThreads are computed outside of this function so that optimal values
 * (based on, e.g., the number of SMs) can be chosen, and so that the resultsMin and resultsMax
 * arrays can be allocated only once and then reused.
 *
 * @param cudaInput - input data in device memory
 * @param inputDim - dimensions of the input data
 * @param aStream - cuda stream to use
 * @param resultsMin - output array for per-block minimum values (final result in element 0); should have numOfBlocks elements
 * @param resultsMax - output array for per-block maximum values (final result in element 0); should have numOfBlocks elements
 * @param numOfBlocks - number of blocks to use
 * @param numOfThreads - number of threads per block
 */
template<typename T>
void runFindMinMax(const T *cudaInput, PixelDataDim inputDim, cudaStream_t aStream, T* resultsMin, T* resultsMax, int numOfBlocks, int numOfThreads) {
    const size_t numOfElements = inputDim.size();

    findMinMax<<<numOfBlocks, numOfThreads, 2*numOfThreads*sizeof(T), aStream>>> (cudaInput, numOfElements, resultsMin, resultsMax);
    findMinMaxFinal<<<1, 1024, 2*numOfBlocks*sizeof(T), aStream>>> (resultsMin, resultsMax, numOfBlocks);
}
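
/*
 * Example usage (an illustrative sketch, not part of this header): the "few blocks per SM"
 * heuristic and the buffer setup below are assumptions, and cudaInput/inputDim/stream are
 * presumed to come from the caller's context:
 *
 *   cudaDeviceProp prop;
 *   cudaGetDeviceProperties(&prop, 0);
 *   const int numOfBlocks = 2 * prop.multiProcessorCount;  // heuristic: ~2 blocks per SM
 *   const int numOfThreads = 256;
 *
 *   float *dMin = nullptr, *dMax = nullptr;
 *   cudaMalloc(&dMin, numOfBlocks * sizeof(float));
 *   cudaMalloc(&dMax, numOfBlocks * sizeof(float));
 *
 *   runFindMinMax(cudaInput, inputDim, stream, dMin, dMax, numOfBlocks, numOfThreads);
 *
 *   float minVal = 0, maxVal = 0;
 *   cudaMemcpyAsync(&minVal, dMin, sizeof(float), cudaMemcpyDeviceToHost, stream);
 *   cudaMemcpyAsync(&maxVal, dMax, sizeof(float), cudaMemcpyDeviceToHost, stream);
 *   cudaStreamSynchronize(stream);
 */
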
#endif