Skip to content

Commit 06da7ec

Browse files
committed
add cuda gradient/sobel magnitude methods + tests
1 parent ba2207e commit 06da7ec

File tree

3 files changed

+254
-0
lines changed

3 files changed

+254
-0
lines changed

src/numerics/APRNumericsGPU.cu

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,96 @@
44

55
#include "APRNumericsGPU.hpp"
66

7+
8+
template<typename InputType>
9+
void APRNumericsGPU::gradient_magnitude(GPUAccessHelper &access, GPUAccessHelper &tree_access,
10+
VectorData<InputType> &inputParticles, VectorData<float> &outputParticles,
11+
VectorData<float> &stencil_vec_y, VectorData<float> &stencil_vec_x,
12+
VectorData<float> &stencil_vec_z) {
13+
14+
// initialize GPU access data
15+
tree_access.init_gpu();
16+
access.init_gpu(tree_access);
17+
18+
// check stencils
19+
assert(stencil_vec_y.size() >= 27);
20+
assert(stencil_vec_x.size() >= 27);
21+
assert(stencil_vec_z.size() >= 27);
22+
const bool downsampled_y = (stencil_vec_y.size() >= 27 * (access.level_max() - access.level_min()));
23+
const bool downsampled_x = (stencil_vec_x.size() >= 27 * (access.level_max() - access.level_min()));
24+
const bool downsampled_z = (stencil_vec_z.size() >= 27 * (access.level_max() - access.level_min()));
25+
26+
// initialize output
27+
outputParticles.resize(access.total_number_particles());
28+
29+
// find non-empty rows
30+
VectorData<int> ne_counter_ds;
31+
VectorData<int> ne_counter_333;
32+
ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_ds_gpu;
33+
ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_333_gpu;
34+
compute_ne_rows_tree_cuda<16, 32>(tree_access, ne_counter_ds, ne_rows_ds_gpu);
35+
compute_ne_rows_cuda<16, 32>(access, ne_counter_333, ne_rows_333_gpu, 2);
36+
37+
// allocate GPU memory
38+
ScopedCudaMemHandler<InputType*, H2D> input_gpu(inputParticles.data(), inputParticles.size());
39+
ScopedCudaMemHandler<float*, JUST_ALLOC> output_gpu(outputParticles.data(), outputParticles.size());
40+
ScopedCudaMemHandler<float*, JUST_ALLOC> tmp_output(NULL, access.total_number_particles());
41+
ScopedCudaMemHandler<float*, JUST_ALLOC> tree_gpu(NULL, tree_access.total_number_particles());
42+
ScopedCudaMemHandler<float*, H2D> stencil_y_gpu(stencil_vec_y.data(), stencil_vec_y.size());
43+
ScopedCudaMemHandler<float*, H2D> stencil_x_gpu(stencil_vec_x.data(), stencil_vec_x.size());
44+
ScopedCudaMemHandler<float*, H2D> stencil_z_gpu(stencil_vec_z.data(), stencil_vec_z.size());
45+
46+
// compute block and grid size for elementwise operations
47+
const size_t N = access.total_number_particles();
48+
int blockSize, minGridSize, gridSize;
49+
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, addSquare, 0, 0);
50+
gridSize = (N + blockSize - 1) / blockSize;
51+
52+
// fill tree by average downsampling
53+
downsample_avg(access, tree_access, input_gpu.get(), tree_gpu.get(), ne_rows_ds_gpu.get(), ne_counter_ds);
54+
error_check( cudaDeviceSynchronize() )
55+
56+
// compute y gradient
57+
isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), output_gpu.get(),
58+
stencil_y_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(),
59+
ne_counter_333, downsampled_y);
60+
error_check( cudaDeviceSynchronize() )
61+
62+
// square y gradient
63+
elementWiseMult<<<blockSize, gridSize>>>(output_gpu.get(), output_gpu.get(), N);
64+
65+
// compute x gradient
66+
isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), tmp_output.get(),
67+
stencil_x_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(),
68+
ne_counter_333, downsampled_x);
69+
70+
error_check( cudaDeviceSynchronize() )
71+
72+
// add square of x gradient to output
73+
addSquare<<<blockSize, gridSize>>>(output_gpu.get(), tmp_output.get(), N);
74+
75+
error_check( cudaDeviceSynchronize() )
76+
77+
// compute z gradient
78+
isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), tmp_output.get(),
79+
stencil_z_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(),
80+
ne_counter_333, downsampled_z);
81+
82+
error_check( cudaDeviceSynchronize() )
83+
84+
// add square of x gradient to output
85+
addSquare<<<blockSize, gridSize>>>(output_gpu.get(), tmp_output.get(), N);
86+
87+
error_check( cudaDeviceSynchronize() )
88+
89+
elementWiseSqrt<<<blockSize, gridSize>>>(output_gpu.get(), N);
90+
91+
error_check( cudaDeviceSynchronize() )
92+
93+
output_gpu.copyD2H();
94+
}
95+
96+
797
template<typename inputType, typename stencilType>
898
void APRNumericsGPU::richardson_lucy(GPUAccessHelper& access, GPUAccessHelper& tree_access, inputType* input,
999
stencilType* output, stencilType* psf, stencilType* psf_flipped, int kernel_size,
@@ -159,6 +249,11 @@ void APRNumericsGPU::richardson_lucy(GPUAccessHelper& access, GPUAccessHelper& t
159249
}
160250

161251

252+
template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&);
253+
template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&);
254+
template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&);
255+
template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&);
256+
162257
template void APRNumericsGPU::richardson_lucy(GPUAccessHelper&, GPUAccessHelper&, uint16_t*, float*, float*, float*, int, int, bool, bool);
163258
template void APRNumericsGPU::richardson_lucy(GPUAccessHelper&, GPUAccessHelper&, float*, float*, float*, float*, int, int, bool, bool);
164259

src/numerics/APRNumericsGPU.hpp

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,57 @@ namespace APRNumericsGPU {
4949
float delta = 1.0f);
5050

5151

52+
/**
53+
* Apply 3x3x3 convolution using three input stencils and compute the magnitude sqrt(dz*dz + dy*dy + dx*dx)
54+
* of the results.
55+
* @tparam InputType
56+
* @param access
57+
* @param tree_access
58+
* @param inputParticles
59+
* @param outputParticles
60+
* @param stencil_vec_y Stencil (vectors) to be applied. Should be of size 27 or `27 * (access.level_max() - access.level_min())`
61+
* @param stencil_vec_x
62+
* @param stencil_vec_z
63+
*/
64+
template<typename InputType>
65+
void gradient_magnitude(GPUAccessHelper &access, GPUAccessHelper &tree_access, VectorData<InputType> &inputParticles,
66+
VectorData<float> &outputParticles, VectorData<float> &stencil_vec_y,
67+
VectorData<float> &stencil_vec_x, VectorData<float> &stencil_vec_z);
68+
69+
70+
/**
71+
* Compute the gradient magnitude using level-adaptive central finite differences.
72+
* Note: uses 3x3x3 convolutions instead of e.g. 1x1x3, which is quite inefficient.
73+
* @tparam InputType
74+
* @tparam GradientType
75+
* @param apr
76+
* @param inputParticles
77+
* @param outputParticles
78+
* @param deltas pixel size in each dimension, used to scale the gradients (default: {1, 1, 1})
79+
*/
80+
template<typename InputType>
81+
void gradient_magnitude_cfd(GPUAccessHelper &access,
82+
GPUAccessHelper &tree_access,
83+
VectorData<InputType> &inputParticles,
84+
VectorData<float> &outputParticles,
85+
const std::vector<float> &deltas = {1.0f, 1.0f, 1.0f});
86+
87+
88+
/**
89+
* Compute the gradient magnitude using level-adaptive Sobel filters.
90+
* @tparam InputType
91+
* @tparam GradientType
92+
* @param apr
93+
* @param inputParticles
94+
* @param outputParticles
95+
* @param deltas pixel size in each dimension, used to scale the gradients (default: {1, 1, 1})
96+
*/
97+
template<typename InputType>
98+
void gradient_magnitude_sobel(GPUAccessHelper &access,
99+
GPUAccessHelper &tree_access,
100+
VectorData<InputType>& inputParticles,
101+
VectorData<float>& outputParticles,
102+
const std::vector<float>& deltas = {1.0f, 1.0f, 1.0f});
52103

53104

54105
template<typename inputType, typename stencilType>
@@ -96,4 +147,51 @@ void APRNumericsGPU::gradient_sobel(GPUAccessHelper &access, GPUAccessHelper &tr
96147
}
97148

98149

150+
template<typename InputType>
151+
void APRNumericsGPU::gradient_magnitude_cfd(GPUAccessHelper &access, GPUAccessHelper &tree_access,
152+
VectorData<InputType> &inputParticles, VectorData<float> &outputParticles,
153+
const std::vector<float> &deltas) {
154+
155+
// generate cfd stencils
156+
PixelData<float> stencil_y(3, 3, 3, 0);
157+
PixelData<float> stencil_x(3, 3, 3, 0);
158+
PixelData<float> stencil_z(3, 3, 3, 0);
159+
160+
stencil_y.at(0, 1, 1) = -1.f/(2*deltas[0]); stencil_y.at(2, 1, 1) = 1.f/(2*deltas[0]);
161+
stencil_x.at(1, 0, 1) = -1.f/(2*deltas[1]); stencil_x.at(1, 2, 1) = 1.f/(2*deltas[1]);
162+
stencil_z.at(1, 1, 0) = -1.f/(2*deltas[2]); stencil_z.at(1, 1, 2) = 1.f/(2*deltas[2]);
163+
164+
// rescale stencils for each level
165+
VectorData<float> stencil_vec_y, stencil_vec_x, stencil_vec_z;
166+
APRStencil::get_rescaled_stencils(stencil_y, stencil_vec_y, access.level_max()-access.level_min());
167+
APRStencil::get_rescaled_stencils(stencil_x, stencil_vec_x, access.level_max()-access.level_min());
168+
APRStencil::get_rescaled_stencils(stencil_z, stencil_vec_z, access.level_max()-access.level_min());
169+
170+
// compute gradient magnitude
171+
gradient_magnitude(access, tree_access, inputParticles, outputParticles, stencil_vec_y, stencil_vec_x, stencil_vec_z);
172+
}
173+
174+
175+
template<typename InputType>
176+
void APRNumericsGPU::gradient_magnitude_sobel(GPUAccessHelper &access, GPUAccessHelper &tree_access,
177+
VectorData<InputType> &inputParticles, VectorData<float> &outputParticles,
178+
const std::vector<float> &deltas) {
179+
// generate Sobel stencils
180+
auto stencil_y = APRStencil::create_sobel_filter<float>(0, deltas[0]);
181+
auto stencil_x = APRStencil::create_sobel_filter<float>(1, deltas[1]);
182+
auto stencil_z = APRStencil::create_sobel_filter<float>(2, deltas[2]);
183+
184+
// rescale stencils for each level
185+
VectorData<float> stencil_vec_y, stencil_vec_x, stencil_vec_z;
186+
APRStencil::get_rescaled_stencils(stencil_y, stencil_vec_y, access.level_max()-access.level_min());
187+
APRStencil::get_rescaled_stencils(stencil_x, stencil_vec_x, access.level_max()-access.level_min());
188+
APRStencil::get_rescaled_stencils(stencil_z, stencil_vec_z, access.level_max()-access.level_min());
189+
190+
// compute gradient magnitude
191+
gradient_magnitude(access, tree_access, inputParticles, outputParticles, stencil_vec_y, stencil_vec_x, stencil_vec_z);
192+
}
193+
194+
195+
196+
99197
#endif //LIBAPR_APRNUMERICSGPU_HPP

test/APRTestCuda.cpp

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1471,6 +1471,67 @@ TEST_F(CreateCR124, TEST_SOBEL_CUDA) {
14711471
}
14721472

14731473

1474+
bool test_gradient_magnitude_cuda(TestDataGPU &test_data) {
1475+
1476+
APRTimer timer(true);
1477+
const std::vector<float> deltas = {0.9, 1.1, 1.3};
1478+
test_data.apr.init_cuda(true);
1479+
auto access = test_data.apr.gpuAPRHelper();
1480+
auto tree_access = test_data.apr.gpuTreeHelper();
1481+
1482+
timer.start_timer("gradient magnitude CUDA");
1483+
ParticleData<float> output;
1484+
APRNumericsGPU::gradient_magnitude_cfd(access, tree_access, test_data.particles_intensities.data, output.data, deltas);
1485+
timer.stop_timer();
1486+
1487+
timer.start_timer("gradient magnitude CPU");
1488+
ParticleData<float> output_gt;
1489+
APRNumerics::gradient_magnitude_cfd(test_data.apr, test_data.particles_intensities, output_gt, deltas);
1490+
timer.stop_timer();
1491+
1492+
size_t err_count = compareParticles(output_gt, output, 1e-2, 20);
1493+
return err_count == 0;
1494+
}
1495+
1496+
TEST_F(CreateSmallSphereTest, TEST_GRADIENT_MAGNITUDE_CUDA) {
1497+
ASSERT_TRUE(test_gradient_magnitude_cuda(test_data));
1498+
}
1499+
1500+
TEST_F(CreateCR124, TEST_GRADIENT_MAGNITUDE_CUDA) {
1501+
ASSERT_TRUE(test_gradient_magnitude_cuda(test_data));
1502+
}
1503+
1504+
1505+
bool test_sobel_magnitude_cuda(TestDataGPU &test_data) {
1506+
1507+
APRTimer timer(true);
1508+
const std::vector<float> deltas = {0.9, 1.1, 1.3};
1509+
test_data.apr.init_cuda(true);
1510+
auto access = test_data.apr.gpuAPRHelper();
1511+
auto tree_access = test_data.apr.gpuTreeHelper();
1512+
1513+
timer.start_timer("sobel magnitude CUDA");
1514+
ParticleData<float> output;
1515+
APRNumericsGPU::gradient_magnitude_sobel(access, tree_access, test_data.particles_intensities.data, output.data, deltas);
1516+
timer.stop_timer();
1517+
1518+
timer.start_timer("sobel magnitude CPU");
1519+
ParticleData<float> output_gt;
1520+
APRNumerics::gradient_magnitude_sobel(test_data.apr, test_data.particles_intensities, output_gt, deltas);
1521+
timer.stop_timer();
1522+
1523+
size_t err_count = compareParticles(output_gt, output, 1e-2, 20);
1524+
return err_count == 0;
1525+
}
1526+
1527+
TEST_F(CreateSmallSphereTest, TEST_SOBEL_MAGNITUDE_CUDA) {
1528+
ASSERT_TRUE(test_sobel_magnitude_cuda(test_data));
1529+
}
1530+
1531+
TEST_F(CreateCR124, TEST_SOBEL_MAGNITUDE_CUDA) {
1532+
ASSERT_TRUE(test_sobel_magnitude_cuda(test_data));
1533+
}
1534+
14741535

14751536
#endif // APR_USE_CUDA
14761537

0 commit comments

Comments
 (0)