#ifndef FIND_MIN_MAX_CUH
#define FIND_MIN_MAX_CUH

#include "misc/CudaTools.cuh"
#include <cuda/std/limits>

/**
 * This kernel finds the minimum and maximum values in the input data array.
 * Each block processes a portion of the data and writes the minimum and maximum
 * values it finds to the resultsMin and resultsMax arrays.
 *
 * It requires 2*numOfThreads*sizeof(T) bytes of dynamic shared memory.
 *
 * @param in - input data
 * @param len - length of input data
 * @param resultsMin - output array for minimum values per block
 * @param resultsMax - output array for maximum values per block
 */
template<typename T>
__global__ void findMinMax(const T *in, const size_t len, T* resultsMin, T* resultsMax) {
    // Compute initial indices
    const int numOfThreads = blockDim.x;
    size_t idx = threadIdx.x;
    size_t globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Set pointers to shared memory for all needed buffers - the backing array is declared
    // as uint64_t so that every type used by APR (uint16_t, float, etc.) is properly aligned
    extern __shared__ uint64_t array[];
    T *minValPerThread = reinterpret_cast<T *>(array);
    T *maxValPerThread = reinterpret_cast<T *>(array) + numOfThreads;

    // Set initial values for min and max - note that lowest() is used rather than min(),
    // since for floating-point types min() is the smallest positive value, not the lowest one
    minValPerThread[idx] = cuda::std::numeric_limits<T>::max();
    maxValPerThread[idx] = cuda::std::numeric_limits<T>::lowest();

    // Read from global memory with a grid-stride loop and track a per-thread min and max
    for (size_t i = globalIdx; i < len; i += gridDim.x * blockDim.x) {
        auto val = in[i];
        if (val < minValPerThread[idx]) minValPerThread[idx] = val;
        if (val > maxValPerThread[idx]) maxValPerThread[idx] = val;
    }

    // Wait for all threads in the block to finish
    __syncthreads();

    // The first thread goes through the shared memory and finds the block-wide min and max.
    // All of that work is done by a single thread, but it is fast enough to keep the code
    // simple (see the tree-reduction sketch after this kernel for the usual alternative).
    if (idx == 0) {
        T globalMin = minValPerThread[0];
        T globalMax = maxValPerThread[0];
        for (int i = 1; i < numOfThreads; ++i) {
            auto vmin = minValPerThread[i];
            if (vmin < globalMin) globalMin = vmin;
            auto vmax = maxValPerThread[i];
            if (vmax > globalMax) globalMax = vmax;
        }

        // Store results to global memory
        resultsMin[blockIdx.x] = globalMin;
        resultsMax[blockIdx.x] = globalMax;
    }
}
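/*
 * For reference, the single-thread scan at the end of findMinMax could be replaced by a
 * standard shared-memory tree reduction. This is only an illustrative sketch (not used by
 * the kernels in this file) and it assumes blockDim.x is a power of two:
 *
 *   for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
 *       if (idx < s) {
 *           if (minValPerThread[idx + s] < minValPerThread[idx]) minValPerThread[idx] = minValPerThread[idx + s];
 *           if (maxValPerThread[idx + s] > maxValPerThread[idx]) maxValPerThread[idx] = maxValPerThread[idx + s];
 *       }
 *       __syncthreads();
 *   }
 *   if (idx == 0) { resultsMin[blockIdx.x] = minValPerThread[0]; resultsMax[blockIdx.x] = maxValPerThread[0]; }
 */
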
/**
 * This kernel takes the intermediate min and max results from each block and computes the final
 * minimum and maximum values across all blocks. Results are stored in the first element of resultsMin and resultsMax.
 *
 * It requires 2*numOfBlocks*sizeof(T) bytes of dynamic shared memory.
 *
 * @param resultsMin - intermediate minimum values from each block
 * @param resultsMax - intermediate maximum values from each block
 * @param numOfBlocks - number of blocks used in the 'findMinMax' kernel (size of resultsMin and resultsMax)
 */
template<typename T>
__global__ void findMinMaxFinal(T* resultsMin, T* resultsMax, int numOfBlocks) {

    // Set pointers to shared memory for all needed buffers - the backing array is declared
    // as uint64_t so that every type used by APR (uint16_t, float, etc.) is properly aligned
    extern __shared__ uint64_t array2[];
    T *minValPerThread = reinterpret_cast<T *>(array2);
    T *maxValPerThread = reinterpret_cast<T *>(array2) + numOfBlocks;

    size_t idx = threadIdx.x;

    // Read all per-block results into shared memory, striding across the threads
    for (size_t i = idx; i < numOfBlocks; i += blockDim.x) {
        minValPerThread[i] = resultsMin[i];
        maxValPerThread[i] = resultsMax[i];
    }

    // Wait for all threads to finish
    __syncthreads();

    // The first thread goes through the shared memory and finds the global min and max
    if (idx == 0) {
        T globalMin = minValPerThread[0];
        T globalMax = maxValPerThread[0];
        for (int i = 1; i < numOfBlocks; ++i) {
            auto vmin = minValPerThread[i];
            if (vmin < globalMin) globalMin = vmin;
            auto vmax = maxValPerThread[i];
            if (vmax > globalMax) globalMax = vmax;
        }

        // Store results to global memory
        resultsMin[0] = globalMin;
        resultsMax[0] = globalMax;
    }
}

/**
 * Compute min and max values in the cudaInput array.
 *
 * numOfBlocks and numOfThreads are computed outside of this function so that optimal values
 * (based on, e.g., the number of SMs) can be chosen, and so that the resultsMin and resultsMax
 * arrays can be allocated only once and then reused.
 *
 * @param cudaInput - input data in device memory
 * @param inputDim - dimensions of the input data
 * @param aStream - cuda stream to use
 * @param resultsMin - output array for per-block minimum values (final result in element 0); should have numOfBlocks elements
 * @param resultsMax - output array for per-block maximum values (final result in element 0); should have numOfBlocks elements
 * @param numOfBlocks - number of blocks to use
 * @param numOfThreads - number of threads per block
 */
template<typename T>
void runFindMinMax(const T *cudaInput, PixelDataDim inputDim, cudaStream_t aStream, T* resultsMin, T* resultsMax, int numOfBlocks, int numOfThreads) {
    const size_t numOfElements = inputDim.size();

    findMinMax<<<numOfBlocks, numOfThreads, 2*numOfThreads*sizeof(T), aStream>>> (cudaInput, numOfElements, resultsMin, resultsMax);
    findMinMaxFinal<<<1, 1024, 2*numOfBlocks*sizeof(T), aStream>>> (resultsMin, resultsMax, numOfBlocks);
}
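
/*
 * Example usage (an illustrative sketch, not part of this header): the "few blocks per SM"
 * heuristic and the buffer setup below are assumptions, and cudaInput/inputDim/stream are
 * presumed to come from the caller's context:
 *
 *   cudaDeviceProp prop;
 *   cudaGetDeviceProperties(&prop, 0);
 *   const int numOfBlocks = 2 * prop.multiProcessorCount;  // heuristic: ~2 blocks per SM
 *   const int numOfThreads = 256;
 *
 *   float *dMin = nullptr, *dMax = nullptr;
 *   cudaMalloc(&dMin, numOfBlocks * sizeof(float));
 *   cudaMalloc(&dMax, numOfBlocks * sizeof(float));
 *
 *   runFindMinMax(cudaInput, inputDim, stream, dMin, dMax, numOfBlocks, numOfThreads);
 *
 *   float minVal = 0, maxVal = 0;
 *   cudaMemcpyAsync(&minVal, dMin, sizeof(float), cudaMemcpyDeviceToHost, stream);
 *   cudaMemcpyAsync(&maxVal, dMax, sizeof(float), cudaMemcpyDeviceToHost, stream);
 *   cudaStreamSynchronize(stream);
 */
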
#endif