Commit 0a44736

CUDA version of getMinMax added (for finding bspline offset)
1 parent 0ba2f45 commit 0a44736

4 files changed: +227 -0 lines changed


src/algorithm/ComputeGradientCuda.cu

Lines changed: 44 additions & 0 deletions
@@ -26,6 +26,7 @@
 #include "bsplineXdir.cuh"
 #include "bsplineYdir.cuh"
 #include "bsplineZdir.cuh"
+#include "findMinMax.cuh"
 
 #include "data_structures/APR/access/GenInfoGpuAccess.cuh"
 
@@ -653,3 +654,46 @@ void cudaDownsampledGradient(PixelData<float> &input, PixelData<float> &grad, co
 
     runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, aStream);
 }
+
+
+template<typename T>
+std::pair<T,T> cudaRunMinMax(PixelData<T> &input_image) {
+    cudaStream_t aStream = nullptr;
+
+    // Copy the CPU image to CUDA memory
+    ScopedCudaMemHandler<PixelData<T>, H2D> cudaImage(input_image, aStream);
+
+    // On NVIDIA GPUs the maximum number of threads per SM is a multiple of 512 (usually 1536 or 2048).
+    // Calculate the number of blocks needed to saturate all SMs, then multiply by 8 to get more, smaller
+    // blocks for better load balancing in case the GPU is busy with other tasks.
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    const int smCount = deviceProp.multiProcessorCount;
+    const int numOfThreadsPerSM = deviceProp.maxThreadsPerMultiProcessor;
+    constexpr int numOfThreads = 512;
+    const int numOfBlocksPerSM = numOfThreadsPerSM / numOfThreads;
+    const int maxNumberOfBlocks = smCount * numOfBlocksPerSM * 8;
+    const size_t numOfElements = input_image.getDimension().size();
+    const int numOfBlocks = std::min(maxNumberOfBlocks, static_cast<int>((numOfElements + numOfThreads - 1) / numOfThreads));
+
+    // Allocate memory for the results, both on the CPU and on the GPU
+    VectorData<T> minVector(true);
+    VectorData<T> maxVector(true);
+    minVector.resize(numOfBlocks);
+    maxVector.resize(numOfBlocks);
+    ScopedCudaMemHandler<T*, JUST_ALLOC> resultsMin(minVector.data(), numOfBlocks, aStream);
+    ScopedCudaMemHandler<T*, JUST_ALLOC> resultsMax(maxVector.data(), numOfBlocks, aStream);
+
+    // Run the kernels and copy the data back to the CPU
+    runFindMinMax(cudaImage.get(), input_image.getDimension(), aStream, resultsMin.get(), resultsMax.get(), numOfBlocks, numOfThreads);
+    resultsMin.copyD2H();
+    resultsMax.copyD2H();
+    waitForCuda();
+
+    // The first elements of minVector and maxVector contain the min and max of all the data
+    return std::pair<T, T>(minVector[0], maxVector[0]);
+}
+
+template std::pair<uint16_t, uint16_t> cudaRunMinMax(PixelData<uint16_t> &);
+template std::pair<int, int> cudaRunMinMax(PixelData<int> &);

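For intuition about the launch-configuration arithmetic in cudaRunMinMax above, here is a minimal standalone sketch of the same calculation. The hardware numbers (80 SMs, 2048 threads per SM) are assumed illustrative values, not queried from a real device, and the sketch is not part of the commit.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main() {
        // Assumed values standing in for deviceProp.multiProcessorCount and
        // deviceProp.maxThreadsPerMultiProcessor (hypothetical GPU).
        const int smCount = 80;
        const int numOfThreadsPerSM = 2048;
        const int numOfThreads = 512;
        const int numOfBlocksPerSM = numOfThreadsPerSM / numOfThreads;   // 4 blocks of 512 threads fill one SM
        const int maxNumberOfBlocks = smCount * numOfBlocksPerSM * 8;    // 80 * 4 * 8 = 2560
        const size_t numOfElements = 512ull * 512 * 512;                 // e.g. the 512^3 test image
        const int numOfBlocks = std::min(maxNumberOfBlocks,
                                         static_cast<int>((numOfElements + numOfThreads - 1) / numOfThreads));
        // For this large image the block count is capped at 2560 and each thread strides over many
        // elements; a 1000-element image would instead get ceil(1000/512) = 2 blocks.
        std::printf("numOfBlocks = %d\n", numOfBlocks);
        return 0;
    }
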
src/algorithm/ComputeGradientCuda.hpp

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ void computeLevelsCuda(const PixelData<ImageType> &grad_temp, PixelData<float> &
 template <typename ImgType>
 void getGradient(PixelData<ImgType> &image, PixelData<ImgType> &grad_temp, PixelData<float> &local_scale_temp, PixelData<float> &local_scale_temp2, float bspline_offset, const APRParameters &par);
 void cudaDownsampledGradient(PixelData<float> &input, PixelData<float> &grad, const float hx, const float hy, const float hz);
+template<typename T> std::pair<T,T> cudaRunMinMax(PixelData<T> &input_image);
 
 template <typename ImgType>
 class GpuProcessingTask {

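The commit message says the new routine is meant for finding the bspline offset. A hypothetical caller of the declaration above could look like the sketch below; pickBsplineOffset is an illustrative name introduced here, it is not part of this commit, and the actual offset logic in the pipeline may differ.

    // Hypothetical helper (not in this commit): use the GPU min/max to pick an offset.
    template <typename T>
    float pickBsplineOffset(PixelData<T> &image) {
        // cudaRunMinMax returns {min, max} over all pixels, computed on the GPU.
        auto minMax = cudaRunMinMax(image);
        // One plausible choice: shift the data by its global minimum.
        return static_cast<float>(minMax.first);
    }
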
src/algorithm/findMinMax.cuh

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+#ifndef FIND_MIN_MAX_CUH
+#define FIND_MIN_MAX_CUH
+
+#include "misc/CudaTools.cuh"
+#include <cuda/std/limits>
+
+/**
+ * This kernel finds the minimum and maximum values in the input data array.
+ * Each block processes a portion of the data and writes the minimum and maximum
+ * values it finds to the resultsMin and resultsMax arrays.
+ *
+ * It requires 2*numOfThreads*sizeof(T) bytes of shared memory.
+ *
+ * @param in - input data
+ * @param len - length of the input data
+ * @param resultsMin - output array for the minimum value found by each block
+ * @param resultsMax - output array for the maximum value found by each block
+ */
+template<typename T>
+__global__ void findMinMax(const T *in, const size_t len, T* resultsMin, T* resultsMax) {
+    // Compute initial indices
+    const int numOfThreads = blockDim.x;
+    size_t idx = threadIdx.x;
+    size_t globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Set pointers into shared memory for all needed buffers - uint64_t is used to avoid alignment
+    // issues, so that all types used by APR (uint16_t, float, etc.) are aligned properly
+    extern __shared__ uint64_t array[];
+    T *minValPerThread = reinterpret_cast<T *>(array);
+    T *maxValPerThread = reinterpret_cast<T *>(array) + numOfThreads;
+
+    // Set initial values for min and max
+    minValPerThread[idx] = cuda::std::numeric_limits<T>::max();
+    maxValPerThread[idx] = cuda::std::numeric_limits<T>::lowest();
+
+    // Read from global memory and compute the per-thread min and max
+    for (size_t i = globalIdx; i < len; i += gridDim.x * blockDim.x) {
+        auto val = in[i];
+        if (val < minValPerThread[idx]) minValPerThread[idx] = val;
+        if (val > maxValPerThread[idx]) maxValPerThread[idx] = val;
+    }
+
+    // Wait for all threads in the block to finish
+    __syncthreads();
+
+    // The first thread goes through the shared memory and finds the per-block min and max.
+    // All of this work is done by a single thread, but it is fast enough to keep the code simple.
+    if (idx == 0) {
+        T globalMin = minValPerThread[0];
+        T globalMax = maxValPerThread[0];
+        for (int i = 1; i < numOfThreads; ++i) {
+            auto vmin = minValPerThread[i];
+            if (vmin < globalMin) globalMin = vmin;
+            auto vmax = maxValPerThread[i];
+            if (vmax > globalMax) globalMax = vmax;
+        }
+
+        // Store the results to global memory
+        resultsMin[blockIdx.x] = globalMin;
+        resultsMax[blockIdx.x] = globalMax;
+    }
+}
+
+/**
+ * This kernel takes the intermediate min and max results from each block and computes the final
+ * minimum and maximum values across all blocks. The results are stored in the first elements of
+ * resultsMin and resultsMax.
+ *
+ * This kernel requires 2*numOfBlocks*sizeof(T) bytes of shared memory.
+ *
+ * @param resultsMin - intermediate minimum values from each block
+ * @param resultsMax - intermediate maximum values from each block
+ * @param numOfBlocks - number of blocks used in the 'findMinMax' kernel (size of resultsMin and resultsMax)
+ */
+template<typename T>
+__global__ void findMinMaxFinal(T* resultsMin, T* resultsMax, int numOfBlocks) {
+
+    // Set pointers into shared memory for all needed buffers - uint64_t is used to avoid alignment
+    // issues, so that all types used by APR (uint16_t, float, etc.) are aligned properly
+    extern __shared__ uint64_t array2[];
+    T *minValPerThread = reinterpret_cast<T *>(array2);
+    T *maxValPerThread = reinterpret_cast<T *>(array2) + numOfBlocks;
+
+    size_t idx = threadIdx.x;
+
+    // Read all data into shared memory using all threads
+    for (size_t i = idx; i < numOfBlocks; i += blockDim.x) {
+        minValPerThread[i] = resultsMin[i];
+        maxValPerThread[i] = resultsMax[i];
+    }
+
+    // Wait for all threads to finish
+    __syncthreads();
+
+    // The first thread goes through the shared memory and finds the global min and max
+    if (idx == 0) {
+        T globalMin = minValPerThread[0];
+        T globalMax = maxValPerThread[0];
+        for (int i = 1; i < numOfBlocks; ++i) {
+            auto vmin = minValPerThread[i];
+            if (vmin < globalMin) globalMin = vmin;
+            auto vmax = maxValPerThread[i];
+            if (vmax > globalMax) globalMax = vmax;
+        }
+        // Store the results to global memory
+        resultsMin[0] = globalMin;
+        resultsMax[0] = globalMax;
+    }
+}
+
+
+/**
+ * Compute the min and max values of the cudaInput array.
+ *
+ * numOfBlocks and numOfThreads are computed outside of this function so that the optimal values
+ * (based on the number of SMs) can be determined once and the resultsMin/resultsMax arrays can be
+ * allocated once and reused.
+ *
+ * @param cudaInput - input data in device memory
+ * @param inputDim - dimensions of the input data
+ * @param aStream - CUDA stream to use
+ * @param resultsMin - output array for the minimum value, must have numOfBlocks elements
+ * @param resultsMax - output array for the maximum value, must have numOfBlocks elements
+ * @param numOfBlocks - number of blocks to use
+ * @param numOfThreads - number of threads per block
+ */
+template<typename T>
+void runFindMinMax(const T *cudaInput, PixelDataDim inputDim, cudaStream_t aStream, T* resultsMin, T* resultsMax, int numOfBlocks, int numOfThreads) {
+    const size_t numOfElements = inputDim.size();
+
+    findMinMax<<<numOfBlocks, numOfThreads, 2 * numOfThreads * sizeof(T), aStream>>>(cudaInput, numOfElements, resultsMin, resultsMax);
+    findMinMaxFinal<<<1, 1024, 2 * numOfBlocks * sizeof(T), aStream>>>(resultsMin, resultsMax, numOfBlocks);
+}
+
+
+#endif

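For reference, the two-pass reduction above computes the same result as a single sequential pass over the data on the host. Below is a minimal CPU sketch of that equivalent, using only the standard library; hostMinMax is an illustrative name and the snippet is not part of the commit.

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Sequential reference for findMinMax + findMinMaxFinal: one pass over the data on the CPU.
    // Assumes 'data' is non-empty.
    template <typename T>
    std::pair<T, T> hostMinMax(const std::vector<T> &data) {
        auto mm = std::minmax_element(data.begin(), data.end());
        return {*mm.first, *mm.second};
    }

    // Example: hostMinMax(std::vector<uint16_t>{1001, 666, 5000, 1001}) returns {666, 5000}.
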
test/ComputeGradientCudaTest.cpp

Lines changed: 48 additions & 0 deletions
@@ -359,6 +359,54 @@ namespace {
         EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0);
     }
 
+
+
+    TEST(ComputeThreshold, TEST_FIND_MIN_MAX) {
+        // Sizes of the input data to test
+        std::vector<std::tuple<int, int, int>> allSizes = {{2, 1, 1},
+                                                           {146, 321, 137},
+                                                           {512, 512, 512},
+                                                           {127, 1, 1},
+                                                           {129, 1, 1}};
+
+        for (auto &p : allSizes) {
+            int yLen = std::get<0>(p);
+            int xLen = std::get<1>(p);
+            int zLen = std::get<2>(p);
+
+            // Generate the input image
+            using ImageType = uint16_t;
+            PixelData<ImageType> input_image = getRandInitializedMesh<ImageType>(yLen, xLen, zLen, 15, 20, true);
+            // Set the whole input_image to 1001
+            for (size_t i = 0; i < input_image.mesh.size(); ++i) {
+                input_image.mesh[i] = 1001;
+            }
+
+            const int hiValue = 5000;
+            const int lowValue = 666;
+
+            // Put the max and min values at two random pixels
+            srand((unsigned)time(0));
+            int randIndexMax = rand() % input_image.mesh.size();
+            input_image.mesh[randIndexMax] = hiValue;
+            int randIndexMin = rand() % input_image.mesh.size();
+            // Make sure the min and max indices are not the same
+            if (randIndexMin == randIndexMax) {
+                randIndexMin = (randIndexMin + 1) % input_image.mesh.size();
+            }
+            input_image.mesh[randIndexMin] = lowValue;
+            // Print the indices in case debugging is needed
+            std::cout << "Position of max and min values: " << randIndexMax << " " << randIndexMin << std::endl;
+
+            // Function under test
+            auto res = cudaRunMinMax(input_image);
+
+            EXPECT_EQ(res.first, lowValue);
+            EXPECT_EQ(res.second, hiValue);
+        }
+    }
 #endif // APR_USE_CUDA
 
 }
