|
4 | 4 |
|
5 | 5 | #include "APRNumericsGPU.hpp" |
6 | 6 |
|
| 7 | + |
| 8 | +template<typename InputType> |
| 9 | +void APRNumericsGPU::gradient_magnitude(GPUAccessHelper &access, GPUAccessHelper &tree_access, |
| 10 | + VectorData<InputType> &inputParticles, VectorData<float> &outputParticles, |
| 11 | + VectorData<float> &stencil_vec_y, VectorData<float> &stencil_vec_x, |
| 12 | + VectorData<float> &stencil_vec_z) { |
| 13 | + |
| 14 | + // initialize GPU access data |
| 15 | + tree_access.init_gpu(); |
| 16 | + access.init_gpu(tree_access); |
| 17 | + |
| 18 | + // check stencils |
| 19 | + assert(stencil_vec_y.size() >= 27); |
| 20 | + assert(stencil_vec_x.size() >= 27); |
| 21 | + assert(stencil_vec_z.size() >= 27); |
| 22 | + const bool downsampled_y = (stencil_vec_y.size() >= 27 * (access.level_max() - access.level_min())); |
| 23 | + const bool downsampled_x = (stencil_vec_x.size() >= 27 * (access.level_max() - access.level_min())); |
| 24 | + const bool downsampled_z = (stencil_vec_z.size() >= 27 * (access.level_max() - access.level_min())); |
| 25 | + |
| 26 | + // initialize output |
| 27 | + outputParticles.resize(access.total_number_particles()); |
| 28 | + |
| 29 | + // find non-empty rows |
| 30 | + VectorData<int> ne_counter_ds; |
| 31 | + VectorData<int> ne_counter_333; |
| 32 | + ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_ds_gpu; |
| 33 | + ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_333_gpu; |
| 34 | + compute_ne_rows_tree_cuda<16, 32>(tree_access, ne_counter_ds, ne_rows_ds_gpu); |
| 35 | + compute_ne_rows_cuda<16, 32>(access, ne_counter_333, ne_rows_333_gpu, 2); |
| 36 | + |
| 37 | + // allocate GPU memory |
| 38 | + ScopedCudaMemHandler<InputType*, H2D> input_gpu(inputParticles.data(), inputParticles.size()); |
| 39 | + ScopedCudaMemHandler<float*, JUST_ALLOC> output_gpu(outputParticles.data(), outputParticles.size()); |
| 40 | + ScopedCudaMemHandler<float*, JUST_ALLOC> tmp_output(NULL, access.total_number_particles()); |
| 41 | + ScopedCudaMemHandler<float*, JUST_ALLOC> tree_gpu(NULL, tree_access.total_number_particles()); |
| 42 | + ScopedCudaMemHandler<float*, H2D> stencil_y_gpu(stencil_vec_y.data(), stencil_vec_y.size()); |
| 43 | + ScopedCudaMemHandler<float*, H2D> stencil_x_gpu(stencil_vec_x.data(), stencil_vec_x.size()); |
| 44 | + ScopedCudaMemHandler<float*, H2D> stencil_z_gpu(stencil_vec_z.data(), stencil_vec_z.size()); |
| 45 | + |
| 46 | + // compute block and grid size for elementwise operations |
| 47 | + const size_t N = access.total_number_particles(); |
| 48 | + int blockSize, minGridSize, gridSize; |
| 49 | + cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, addSquare, 0, 0); |
| 50 | + gridSize = (N + blockSize - 1) / blockSize; |
| 51 | + |
| 52 | + // fill tree by average downsampling |
| 53 | + downsample_avg(access, tree_access, input_gpu.get(), tree_gpu.get(), ne_rows_ds_gpu.get(), ne_counter_ds); |
| 54 | + error_check( cudaDeviceSynchronize() ) |
| 55 | + |
| 56 | + // compute y gradient |
| 57 | + isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), output_gpu.get(), |
| 58 | + stencil_y_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(), |
| 59 | + ne_counter_333, downsampled_y); |
| 60 | + error_check( cudaDeviceSynchronize() ) |
| 61 | + |
| 62 | + // square y gradient |
| 63 | + elementWiseMult<<<blockSize, gridSize>>>(output_gpu.get(), output_gpu.get(), N); |
| 64 | + |
| 65 | + // compute x gradient |
| 66 | + isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), tmp_output.get(), |
| 67 | + stencil_x_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(), |
| 68 | + ne_counter_333, downsampled_x); |
| 69 | + |
| 70 | + error_check( cudaDeviceSynchronize() ) |
| 71 | + |
| 72 | + // add square of x gradient to output |
| 73 | + addSquare<<<blockSize, gridSize>>>(output_gpu.get(), tmp_output.get(), N); |
| 74 | + |
| 75 | + error_check( cudaDeviceSynchronize() ) |
| 76 | + |
| 77 | + // compute z gradient |
| 78 | + isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), tmp_output.get(), |
| 79 | + stencil_z_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(), |
| 80 | + ne_counter_333, downsampled_z); |
| 81 | + |
| 82 | + error_check( cudaDeviceSynchronize() ) |
| 83 | + |
| 84 | + // add square of x gradient to output |
| 85 | + addSquare<<<blockSize, gridSize>>>(output_gpu.get(), tmp_output.get(), N); |
| 86 | + |
| 87 | + error_check( cudaDeviceSynchronize() ) |
| 88 | + |
| 89 | + elementWiseSqrt<<<blockSize, gridSize>>>(output_gpu.get(), N); |
| 90 | + |
| 91 | + error_check( cudaDeviceSynchronize() ) |
| 92 | + |
| 93 | + output_gpu.copyD2H(); |
| 94 | +} |
| 95 | + |
| 96 | + |
7 | 97 | template<typename inputType, typename stencilType> |
8 | 98 | void APRNumericsGPU::richardson_lucy(GPUAccessHelper& access, GPUAccessHelper& tree_access, inputType* input, |
9 | 99 | stencilType* output, stencilType* psf, stencilType* psf_flipped, int kernel_size, |
@@ -159,6 +249,11 @@ void APRNumericsGPU::richardson_lucy(GPUAccessHelper& access, GPUAccessHelper& t |
159 | 249 | } |
160 | 250 |
|
161 | 251 |
|
| 252 | +template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&); |
| 253 | +template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&); |
| 254 | +template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&); |
| 255 | +template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&); |
| 256 | + |
162 | 257 | template void APRNumericsGPU::richardson_lucy(GPUAccessHelper&, GPUAccessHelper&, uint16_t*, float*, float*, float*, int, int, bool, bool); |
163 | 258 | template void APRNumericsGPU::richardson_lucy(GPUAccessHelper&, GPUAccessHelper&, float*, float*, float*, float*, int, int, bool, bool); |
164 | 259 |
|
|
0 commit comments