Skip to content

Commit c869596

Browse files
authored
Merge pull request #175 from AdaptiveParticles/cuda_numerics
Fix issue with APRStencil methods, expand cuda numerics functionality
2 parents 10fe707 + 96039d6 commit c869596

19 files changed

+807
-471
lines changed

benchmarks/FilterBenchmarks.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -817,13 +817,13 @@ inline void bench_richardson_lucy_apr(APR& apr, ParticleData<partsType>& parts,
817817

818818
// burn-in
819819
for(int i = 0; i < num_rep/10; ++i) {
820-
richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);
820+
APRNumericsGPU::richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);
821821
}
822822
cudaDeviceSynchronize();
823823

824824
timer.start_timer("richardson lucy");
825825
for(int i = 0; i < num_rep; ++i) {
826-
richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);
826+
APRNumericsGPU::richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);
827827
}
828828
cudaDeviceSynchronize();
829829
timer.stop_timer();

examples/Example_apr_deconvolution.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ int main(int argc, char **argv) {
6262
auto tree_access = apr.gpuTreeHelper();
6363
ParticleData<float> tree_data;
6464

65-
richardson_lucy(access, tree_access, parts.data, output.data, stencil, options.number_iterations,
66-
/*downsample stencil*/ true, /*normalize stencils*/ true, /*resume*/false);
65+
APRNumericsGPU::richardson_lucy(access, tree_access, parts.data, output.data, stencil, options.number_iterations,
66+
/*downsample stencil*/ true, /*normalize stencils*/ true, /*resume*/false);
6767

6868
done = true;
6969
timer.stop_timer();

examples/Example_apr_filter.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,8 @@ int main(int argc, char **argv) {
5858
timer.start_timer("APR Convolution CUDA");
5959
auto access = apr.gpuAPRHelper();
6060
auto tree_access = apr.gpuTreeHelper();
61-
VectorData<float> stencil_vd;
62-
stencil_vd.resize(125); // stencil must be 5x5x5!
63-
std::copy(stencil.mesh.begin(), stencil.mesh.end(), stencil_vd.begin());
6461
ParticleData<float> tree_data;
65-
isotropic_convolve_555(access, tree_access, parts.data, output.data, stencil_vd, tree_data.data,
62+
isotropic_convolve_555(access, tree_access, parts.data, output.data, stencil, tree_data.data,
6663
/*reflect boundary*/true, /*downsample stencil*/true, /*normalize stencils*/true);
6764

6865
done = true;

examples/Example_compute_gradient.cpp

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,29 @@ int main(int argc, char **argv) {
5959
ParticleData<float> output;
6060
std::vector<float> deltas = {options.dy, options.dx, options.dz};
6161

62-
if(options.sobel) {
63-
APRNumerics::gradient_magnitude_sobel(apr, parts, output, deltas);
64-
} else {
65-
APRNumerics::gradient_magnitude_cfd(apr, parts, output, deltas);
62+
bool done = false;
63+
64+
if(options.use_cuda) {
65+
#ifdef APR_USE_CUDA
66+
auto access = apr.gpuAPRHelper();
67+
auto tree_access = apr.gpuTreeHelper();
68+
if(options.sobel) {
69+
APRNumericsGPU::gradient_magnitude_sobel(access, tree_access, parts.data, output.data, deltas);
70+
} else {
71+
APRNumericsGPU::gradient_magnitude_cfd(access, tree_access, parts.data, output.data, deltas);
72+
}
73+
done = true;
74+
#else
75+
std::cout << "Option -use_cuda was given, but LibAPR was not built with CUDA enabled. Using CPU implementation." << std::endl;
76+
#endif
77+
}
78+
79+
if(!done) {
80+
if (options.sobel) {
81+
APRNumerics::gradient_magnitude_sobel(apr, parts, output, deltas);
82+
} else {
83+
APRNumerics::gradient_magnitude_cfd(apr, parts, output, deltas);
84+
}
6685
}
6786
timer.stop_timer();
6887

@@ -144,6 +163,11 @@ cmdLineOptions read_command_line_options(int argc, char **argv){
144163
result.dz = std::stof(get_command_option(argv, argv + argc, "-dz"));
145164
}
146165

166+
if(command_option_exists(argv, argv + argc, "-use_cuda"))
167+
{
168+
result.use_cuda = true;
169+
}
170+
147171

148172
return result;
149173

examples/Example_compute_gradient.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
#include "numerics/APRNumerics.hpp"
1818
#include "numerics/APRReconstruction.hpp"
1919

20+
#ifdef APR_USE_CUDA
21+
#include "numerics/APRNumericsGPU.hpp"
22+
#endif
23+
2024
struct cmdLineOptions{
2125
std::string output = "";
2226
std::string stats = "";
@@ -26,6 +30,7 @@ struct cmdLineOptions{
2630
float dx = 1.0f;
2731
float dy = 1.0f;
2832
float dz = 1.0f;
33+
bool use_cuda = false;
2934
};
3035

3136
cmdLineOptions read_command_line_options(int argc, char **argv);

src/data_structures/APR/access/GPUAccess.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,12 @@ template class ParticleDataGpu<int16_t>;
163163
template class ParticleDataGpu<int64_t>;
164164

165165

166-
__global__ void fill_y_vec_max_level(const uint64_t* level_xz_vec,
167-
const uint64_t* xz_end_vec,
168-
uint16_t* y_vec,
169-
const uint64_t* level_xz_vec_tree,
170-
const uint64_t* xz_end_vec_tree,
171-
const uint16_t* y_vec_tree,
166+
__global__ void fill_y_vec_max_level(const uint64_t* __restrict__ level_xz_vec,
167+
const uint64_t* __restrict__ xz_end_vec,
168+
uint16_t* __restrict__ y_vec,
169+
const uint64_t* __restrict__ level_xz_vec_tree,
170+
const uint64_t* __restrict__ xz_end_vec_tree,
171+
const uint16_t* __restrict__ y_vec_tree,
172172
const int z_num,
173173
const int x_num,
174174
const uint16_t y_num,

src/numerics/APRDownsampleGPU.cu

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,13 +1207,17 @@ void downsample_avg(GPUAccessHelper& access, GPUAccessHelper& tree_access, Vecto
12071207
}
12081208

12091209
/// instantiate templates
1210+
template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&);
12101211
template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&);
1211-
template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&);
1212+
template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&);
12121213
template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&);
1214+
template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&);
12131215

1216+
template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&);
12141217
template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&);
1215-
template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&);
1218+
template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&);
12161219
template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&);
1220+
template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&);
12171221

12181222
template void compute_ne_rows_tree_cuda<8, 32>(GPUAccessHelper&, VectorData<int>&, ScopedCudaMemHandler<int*, JUST_ALLOC>&);
12191223
template void compute_ne_rows_tree_cuda<16, 32>(GPUAccessHelper&, VectorData<int>&, ScopedCudaMemHandler<int*, JUST_ALLOC>&);

src/numerics/APRIsoConvGPU333.cu

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1506,48 +1506,35 @@ void isotropic_convolve_333_alt(GPUAccessHelper& access, GPUAccessHelper& tree_a
15061506

15071507

15081508
template<typename inputType, typename outputType, typename stencilType, typename treeType>
1509-
void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input, VectorData<outputType>& output,
1510-
VectorData<stencilType>& stencil, VectorData<treeType>& tree_data, bool reflective_bc, bool use_stencil_downsample, bool normalize_stencil) {
1511-
/*
1512-
* Perform APR Isotropic Convolution Operation on the GPU with a 3x3x3 kernel
1513-
* conv_stencil needs to have 27 entries, with element (x, y, z) corresponding to index z*9 + x*3 + y
1514-
*/
1509+
void isotropic_convolve_333_direct(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
1510+
VectorData<outputType>& output, VectorData<stencilType>& stencil,
1511+
VectorData<treeType>& tree_data, bool reflective_bc) {
15151512

15161513
tree_access.init_gpu();
15171514
access.init_gpu(tree_access);
15181515

15191516
assert(input.size() == access.total_number_particles());
1520-
assert(stencil.size() == 27);
1517+
assert(stencil.size() >= 27);
1518+
1519+
const bool downsampled_stencil = (stencil.size() >= 27 * (access.level_max() - access.level_min()));
15211520

15221521
tree_data.resize(tree_access.total_number_particles());
15231522
output.resize(access.total_number_particles());
15241523

15251524
/// compute nonempty rows
1526-
VectorData<int> ne_counter_ds; //non empty rows
1525+
VectorData<int> ne_counter_ds;
15271526
VectorData<int> ne_counter;
15281527
ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_ds_gpu;
15291528
ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_gpu;
15301529

15311530
compute_ne_rows_tree_cuda<16, 32>(tree_access, ne_counter_ds, ne_rows_ds_gpu);
15321531
compute_ne_rows_cuda<16, 32>(access, ne_counter, ne_rows_gpu, 2);
15331532

1534-
/// downsample the stencil
1535-
VectorData<stencilType> stencil_vec;
1536-
if(use_stencil_downsample) {
1537-
APRStencil::get_downsampled_stencils(stencil, stencil_vec, access.level_max() - access.level_min(), normalize_stencil);
1538-
}
1539-
15401533
/// allocate GPU memory
15411534
ScopedCudaMemHandler<inputType*, JUST_ALLOC> input_gpu(input.data(), input.size());
15421535
ScopedCudaMemHandler<treeType*, JUST_ALLOC> tree_data_gpu(tree_data.data(), tree_data.size());
15431536
ScopedCudaMemHandler<outputType*, JUST_ALLOC> output_gpu(output.data(), output.size());
1544-
ScopedCudaMemHandler<stencilType*, JUST_ALLOC> stencil_gpu;
1545-
1546-
if(use_stencil_downsample) {
1547-
stencil_gpu.initialize(stencil_vec.data(), stencil_vec.size());
1548-
} else {
1549-
stencil_gpu.initialize(stencil.data(), stencil.size());
1550-
}
1537+
ScopedCudaMemHandler<stencilType*, JUST_ALLOC> stencil_gpu(stencil.data(), stencil.size());
15511538

15521539
/// copy input particles and stencil(s) to device
15531540
input_gpu.copyH2D();
@@ -1560,10 +1547,10 @@ void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_acces
15601547
/// perform the convolution operation
15611548
if(reflective_bc) {
15621549
isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(),
1563-
tree_data_gpu.get(), ne_rows_gpu.get(), ne_counter, use_stencil_downsample);
1550+
tree_data_gpu.get(), ne_rows_gpu.get(), ne_counter, downsampled_stencil);
15641551
} else {
15651552
isotropic_convolve_333(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(),
1566-
tree_data_gpu.get(), ne_rows_gpu.get(), ne_counter, use_stencil_downsample);
1553+
tree_data_gpu.get(), ne_rows_gpu.get(), ne_counter, downsampled_stencil);
15671554
}
15681555
error_check( cudaDeviceSynchronize() )
15691556

@@ -1626,11 +1613,15 @@ void isotropic_convolve_333_alt(GPUAccessHelper& access, GPUAccessHelper& tree_a
16261613

16271614

16281615
/// instantiate templates
1629-
template void isotropic_convolve_333(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool, bool);
1630-
template void isotropic_convolve_333(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool, bool);
1631-
template void isotropic_convolve_333(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&, VectorData<double>&, VectorData<double>&, bool, bool, bool);
1616+
template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool);
1617+
template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool);
1618+
template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool);
1619+
template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool);
1620+
template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&, VectorData<double>&, VectorData<double>&, bool);
16321621

1622+
template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool);
16331623
template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool);
1624+
template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool);
16341625
template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool);
16351626
template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&, VectorData<double>&, VectorData<double>&, bool, bool);
16361627

src/numerics/APRIsoConvGPU333.hpp

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,42 @@
1414
/// high-level functions including data transfer
1515

1616
template<typename inputType, typename outputType, typename stencilType, typename treeType>
17-
void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input, VectorData<outputType>& output,
18-
VectorData<stencilType>& stencil, VectorData<treeType>& tree_data, bool reflective_bc = false,
19-
bool use_stencil_downsample = false, bool normalize_stencil = false);
17+
void isotropic_convolve_333_direct(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
18+
VectorData<outputType>& output, VectorData<stencilType>& stencil,
19+
VectorData<treeType>& tree_data, bool reflective_bc);
20+
21+
22+
template<typename inputType, typename outputType, typename stencilType, typename treeType>
23+
void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
24+
VectorData<outputType>& output, VectorData<stencilType>& stencil, VectorData<treeType>& tree_data,
25+
bool reflective_bc=false, bool use_stencil_downsample=false, bool normalize_stencil=false) {
26+
tree_access.init_gpu();
27+
access.init_gpu(tree_access);
28+
29+
assert(stencil.size() == 27);
30+
VectorData<stencilType> stencil_vec;
31+
const int nlevels = use_stencil_downsample ? access.level_max() - access.level_min() : 1;
32+
APRStencil::get_downsampled_stencils(stencil, stencil_vec, nlevels, normalize_stencil);
33+
isotropic_convolve_333_direct(access, tree_access, input, output, stencil_vec, tree_data, reflective_bc);
34+
}
35+
36+
37+
template<typename inputType, typename outputType, typename stencilType, typename treeType>
38+
void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
39+
VectorData<outputType>& output, PixelData<stencilType>& stencil, VectorData<treeType>& tree_data,
40+
bool reflective_bc=false, bool use_stencil_downsample=false, bool normalize_stencil=false) {
41+
tree_access.init_gpu();
42+
access.init_gpu(tree_access);
43+
44+
assert(stencil.z_num == 3);
45+
assert(stencil.x_num == 3);
46+
assert(stencil.y_num == 3);
47+
VectorData<stencilType> stencil_vec;
48+
const int nlevels = use_stencil_downsample ? access.level_max() - access.level_min() : 1;
49+
APRStencil::get_downsampled_stencils(stencil, stencil_vec, nlevels, normalize_stencil);
50+
isotropic_convolve_333_direct(access, tree_access, input, output, stencil_vec, tree_data, reflective_bc);
51+
}
52+
2053

2154
template<typename inputType, typename outputType, typename stencilType, typename treeType>
2255
void isotropic_convolve_333_alt(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input, VectorData<outputType>& output,

0 commit comments

Comments
 (0)