AdaptiveParticles
diff --git a/benchmarks/FilterBenchmarks.hpp b/benchmarks/FilterBenchmarks.hpp
Lines changed: 2 additions & 2 deletions
diff --git a/examples/Example_apr_deconvolution.cpp b/examples/Example_apr_deconvolution.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/examples/Example_apr_filter.cpp b/examples/Example_apr_filter.cpp
Lines changed: 1 addition & 4 deletions
diff --git a/examples/Example_compute_gradient.cpp b/examples/Example_compute_gradient.cpp
Lines changed: 28 additions & 4 deletions
diff --git a/examples/Example_compute_gradient.hpp b/examples/Example_compute_gradient.hpp
Lines changed: 5 additions & 0 deletions
diff --git a/src/data_structures/APR/access/GPUAccess.cu b/src/data_structures/APR/access/GPUAccess.cu
Lines changed: 6 additions & 6 deletions
diff --git a/src/numerics/APRDownsampleGPU.cu b/src/numerics/APRDownsampleGPU.cu
Lines changed: 6 additions & 2 deletions
diff --git a/src/numerics/APRIsoConvGPU333.cu b/src/numerics/APRIsoConvGPU333.cu
Lines changed: 17 additions & 26 deletions
diff --git a/src/numerics/APRIsoConvGPU333.hpp b/src/numerics/APRIsoConvGPU333.hpp
Lines changed: 36 additions & 3 deletions
@@ -817,13 +817,13 @@ inline void bench_richardson_lucy_apr(APR& apr, ParticleData<partsType>& parts,
 
     // burn-in
     for(int i = 0; i < num_rep/10; ++i) {
-        richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);
+        APRNumericsGPU::richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);
     }
     cudaDeviceSynchronize();
 
     timer.start_timer("richardson lucy");
     for(int i = 0; i < num_rep; ++i) {
-        richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);
+        APRNumericsGPU::richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);
     }
     cudaDeviceSynchronize();
     timer.stop_timer();
 
@@ -62,8 +62,8 @@ int main(int argc, char **argv) {
         auto tree_access = apr.gpuTreeHelper();
         ParticleData<float> tree_data;
 
-        richardson_lucy(access, tree_access, parts.data, output.data, stencil, options.number_iterations,
-                        /*downsample stencil*/ true, /*normalize stencils*/ true, /*resume*/false);
+        APRNumericsGPU::richardson_lucy(access, tree_access, parts.data, output.data, stencil, options.number_iterations,
+                                        /*downsample stencil*/ true, /*normalize stencils*/ true, /*resume*/false);
 
         done = true;
         timer.stop_timer();
 
@@ -58,11 +58,8 @@ int main(int argc, char **argv) {
         timer.start_timer("APR Convolution CUDA");
         auto access = apr.gpuAPRHelper();
         auto tree_access = apr.gpuTreeHelper();
-        VectorData<float> stencil_vd;
-        stencil_vd.resize(125); // stencil must be 5x5x5!
-        std::copy(stencil.mesh.begin(), stencil.mesh.end(), stencil_vd.begin());
         ParticleData<float> tree_data;
-        isotropic_convolve_555(access, tree_access, parts.data, output.data, stencil_vd, tree_data.data,
+        isotropic_convolve_555(access, tree_access, parts.data, output.data, stencil, tree_data.data,
                                /*reflect boundary*/true, /*downsample stencil*/true, /*normalize stencils*/true);
 
         done = true;
 
@@ -59,10 +59,29 @@ int main(int argc, char **argv) {
     ParticleData<float> output;
     std::vector<float> deltas = {options.dy, options.dx, options.dz};
 
-    if(options.sobel) {
-        APRNumerics::gradient_magnitude_sobel(apr, parts, output, deltas);
-    } else {
-        APRNumerics::gradient_magnitude_cfd(apr, parts, output, deltas);
+    bool done = false;
+
+    if(options.use_cuda) {
+#ifdef APR_USE_CUDA
+        auto access = apr.gpuAPRHelper();
+        auto tree_access = apr.gpuTreeHelper();
+        if(options.sobel) {
+            APRNumericsGPU::gradient_magnitude_sobel(access, tree_access, parts.data, output.data, deltas);
+        } else {
+            APRNumericsGPU::gradient_magnitude_cfd(access, tree_access, parts.data, output.data, deltas);
+        }
+        done = true;
+#else
+        std::cout << "Option -use_cuda was given, but LibAPR was not built with CUDA enabled. Using CPU implementation." << std::endl;
+#endif
+    }
+
+    if(!done) {
+        if (options.sobel) {
+            APRNumerics::gradient_magnitude_sobel(apr, parts, output, deltas);
+        } else {
+            APRNumerics::gradient_magnitude_cfd(apr, parts, output, deltas);
+        }
     }
     timer.stop_timer();
 
@@ -144,6 +163,11 @@ cmdLineOptions read_command_line_options(int argc, char **argv){
         result.dz =  std::stof(get_command_option(argv, argv + argc, "-dz"));
     }
 
+    if(command_option_exists(argv, argv + argc, "-use_cuda"))
+    {
+        result.use_cuda = true;
+    }
+
 
     return result;
 
 
@@ -17,6 +17,10 @@
 #include "numerics/APRNumerics.hpp"
 #include "numerics/APRReconstruction.hpp"
 
+#ifdef APR_USE_CUDA
+#include "numerics/APRNumericsGPU.hpp"
+#endif
+
 struct cmdLineOptions{
     std::string output = "";
     std::string stats = "";
@@ -26,6 +30,7 @@ struct cmdLineOptions{
     float dx = 1.0f;
     float dy = 1.0f;
     float dz = 1.0f;
+    bool use_cuda = false;
 };
 
 cmdLineOptions read_command_line_options(int argc, char **argv);
 
@@ -163,12 +163,12 @@ template class ParticleDataGpu<int16_t>;
 template class ParticleDataGpu<int64_t>;
 
 
-__global__ void fill_y_vec_max_level(const uint64_t* level_xz_vec,
-                                     const uint64_t* xz_end_vec,
-                                     uint16_t* y_vec,
-                                     const uint64_t* level_xz_vec_tree,
-                                     const uint64_t* xz_end_vec_tree,
-                                     const uint16_t* y_vec_tree,
+__global__ void fill_y_vec_max_level(const uint64_t* __restrict__ level_xz_vec,
+                                     const uint64_t* __restrict__ xz_end_vec,
+                                     uint16_t* __restrict__ y_vec,
+                                     const uint64_t* __restrict__ level_xz_vec_tree,
+                                     const uint64_t* __restrict__ xz_end_vec_tree,
+                                     const uint16_t* __restrict__ y_vec_tree,
                                      const int z_num,
                                      const int x_num,
                                      const uint16_t y_num,
 
@@ -1207,13 +1207,17 @@ void downsample_avg(GPUAccessHelper& access, GPUAccessHelper& tree_access, Vecto
 }
 
 /// instantiate templates
+template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&);
 template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&);
-template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&);
+template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&);
 template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&);
+template void downsample_avg(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&);
 
+template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&);
 template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&);
-template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&);
+template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&);
 template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&);
+template void downsample_avg_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&);
 
 template void compute_ne_rows_tree_cuda<8, 32>(GPUAccessHelper&, VectorData<int>&, ScopedCudaMemHandler<int*, JUST_ALLOC>&);
 template void compute_ne_rows_tree_cuda<16, 32>(GPUAccessHelper&, VectorData<int>&, ScopedCudaMemHandler<int*, JUST_ALLOC>&);
 
@@ -1506,48 +1506,35 @@ void isotropic_convolve_333_alt(GPUAccessHelper& access, GPUAccessHelper& tree_a
 
 
 template<typename inputType, typename outputType, typename stencilType, typename treeType>
-void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input, VectorData<outputType>& output,
-                            VectorData<stencilType>& stencil, VectorData<treeType>& tree_data, bool reflective_bc, bool use_stencil_downsample, bool normalize_stencil) {
-    /*
-     *  Perform APR Isotropic Convolution Operation on the GPU with a 3x3x3 kernel
-     *  conv_stencil needs to have 27 entries, with element (x, y, z) corresponding to index z*9 + x*3 + y
-     */
+void isotropic_convolve_333_direct(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
+                                   VectorData<outputType>& output, VectorData<stencilType>& stencil,
+                                   VectorData<treeType>& tree_data, bool reflective_bc) {
 
     tree_access.init_gpu();
     access.init_gpu(tree_access);
 
     assert(input.size() == access.total_number_particles());
-    assert(stencil.size() == 27);
+    assert(stencil.size() >= 27);
+
+    const bool downsampled_stencil = (stencil.size() >= 27 * (access.level_max() - access.level_min()));
 
     tree_data.resize(tree_access.total_number_particles());
     output.resize(access.total_number_particles());
 
     /// compute nonempty rows
-    VectorData<int> ne_counter_ds; //non empty rows
+    VectorData<int> ne_counter_ds;
     VectorData<int> ne_counter;
     ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_ds_gpu;
     ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_gpu;
 
     compute_ne_rows_tree_cuda<16, 32>(tree_access, ne_counter_ds, ne_rows_ds_gpu);
     compute_ne_rows_cuda<16, 32>(access, ne_counter, ne_rows_gpu, 2);
 
-    /// downsample the stencil
-    VectorData<stencilType> stencil_vec;
-    if(use_stencil_downsample) {
-        APRStencil::get_downsampled_stencils(stencil, stencil_vec, access.level_max() - access.level_min(), normalize_stencil);
-    }
-
     /// allocate GPU memory
     ScopedCudaMemHandler<inputType*, JUST_ALLOC> input_gpu(input.data(), input.size());
     ScopedCudaMemHandler<treeType*, JUST_ALLOC> tree_data_gpu(tree_data.data(), tree_data.size());
     ScopedCudaMemHandler<outputType*, JUST_ALLOC> output_gpu(output.data(), output.size());
-    ScopedCudaMemHandler<stencilType*, JUST_ALLOC> stencil_gpu;
-
-    if(use_stencil_downsample) {
-        stencil_gpu.initialize(stencil_vec.data(), stencil_vec.size());
-    } else {
-        stencil_gpu.initialize(stencil.data(), stencil.size());
-    }
+    ScopedCudaMemHandler<stencilType*, JUST_ALLOC> stencil_gpu(stencil.data(), stencil.size());
 
     /// copy input particles and stencil(s) to device
     input_gpu.copyH2D();
@@ -1560,10 +1547,10 @@ void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_acces
     /// perform the convolution operation
     if(reflective_bc) {
         isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(),
-                                          tree_data_gpu.get(), ne_rows_gpu.get(), ne_counter, use_stencil_downsample);
+                                          tree_data_gpu.get(), ne_rows_gpu.get(), ne_counter, downsampled_stencil);
     } else {
         isotropic_convolve_333(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(),
-                               tree_data_gpu.get(), ne_rows_gpu.get(), ne_counter, use_stencil_downsample);
+                               tree_data_gpu.get(), ne_rows_gpu.get(), ne_counter, downsampled_stencil);
     }
     error_check( cudaDeviceSynchronize() )
 
@@ -1626,11 +1613,15 @@ void isotropic_convolve_333_alt(GPUAccessHelper& access, GPUAccessHelper& tree_a
 
 
 /// instantiate templates
-template void isotropic_convolve_333(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool, bool);
-template void isotropic_convolve_333(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool, bool);
-template void isotropic_convolve_333(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&, VectorData<double>&, VectorData<double>&, bool, bool, bool);
+template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool);
+template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool);
+template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool);
+template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool);
+template void isotropic_convolve_333_direct(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&, VectorData<double>&, VectorData<double>&, bool);
 
+template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool);
 template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool);
+template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool);
 template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, bool, bool);
 template void isotropic_convolve_333_alt(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<double>&, VectorData<double>&, VectorData<double>&, bool, bool);
 
@@ -14,9 +14,42 @@
 /// high-level functions including data transfer
 
 template<typename inputType, typename outputType, typename stencilType, typename treeType>
-void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input, VectorData<outputType>& output,
-                            VectorData<stencilType>& stencil, VectorData<treeType>& tree_data, bool reflective_bc = false,
-                            bool use_stencil_downsample = false, bool normalize_stencil = false);
+void isotropic_convolve_333_direct(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
+                                   VectorData<outputType>& output, VectorData<stencilType>& stencil,
+                                   VectorData<treeType>& tree_data, bool reflective_bc);
+
+
+template<typename inputType, typename outputType, typename stencilType, typename treeType> // 3x3x3 convolution entry point taking a flat 27-element stencil
+void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
+                            VectorData<outputType>& output, VectorData<stencilType>& stencil, VectorData<treeType>& tree_data,
+                            bool reflective_bc=false, bool use_stencil_downsample=false, bool normalize_stencil=false) {
+    tree_access.init_gpu(); // the tree helper is initialised first because the next call takes it as an argument
+    access.init_gpu(tree_access); // done here, before level_max()/level_min() are queried below
+
+    assert(stencil.size() == 27); // exactly 3*3*3 values expected; the check disappears when NDEBUG is defined
+    VectorData<stencilType> stencil_vec; // receives the stencil(s) that are actually handed to the _direct routine
+    const int nlevels = use_stencil_downsample ? access.level_max() - access.level_min() : 1; // stencil count requested: one per level step if downsampling, otherwise a single one
+    APRStencil::get_downsampled_stencils(stencil, stencil_vec, nlevels, normalize_stencil); // presumably fills stencil_vec from stencil -- TODO confirm against APRStencil
+    isotropic_convolve_333_direct(access, tree_access, input, output, stencil_vec, tree_data, reflective_bc); // forwards the prepared stencil_vec; the two stencil flags are consumed here and not passed on
+}
+
+
+template<typename inputType, typename outputType, typename stencilType, typename treeType> // overload of the 3x3x3 convolution entry point taking the stencil as a PixelData volume
+void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
+                            VectorData<outputType>& output, PixelData<stencilType>& stencil, VectorData<treeType>& tree_data,
+                            bool reflective_bc=false, bool use_stencil_downsample=false, bool normalize_stencil=false) {
+    tree_access.init_gpu(); // the tree helper is initialised first because the next call takes it as an argument
+    access.init_gpu(tree_access); // done here, before level_max()/level_min() are queried below
+
+    assert(stencil.z_num == 3); // the three asserts together require a 3x3x3 stencil volume (debug builds only)
+    assert(stencil.x_num == 3);
+    assert(stencil.y_num == 3);
+    VectorData<stencilType> stencil_vec; // receives the stencil(s) that are actually handed to the _direct routine
+    const int nlevels = use_stencil_downsample ? access.level_max() - access.level_min() : 1; // stencil count requested: one per level step if downsampling, otherwise a single one
+    APRStencil::get_downsampled_stencils(stencil, stencil_vec, nlevels, normalize_stencil); // presumably fills stencil_vec from the PixelData stencil -- TODO confirm against APRStencil
+    isotropic_convolve_333_direct(access, tree_access, input, output, stencil_vec, tree_data, reflective_bc); // forwards the prepared stencil_vec; the two stencil flags are consumed here and not passed on
+}
+
 
 template<typename inputType, typename outputType, typename stencilType, typename treeType>
 void isotropic_convolve_333_alt(GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input, VectorData<outputType>& output,
Original file line number	Diff line number	Diff line change
`@@ -817,13 +817,13 @@ inline void bench_richardson_lucy_apr(APR& apr, ParticleData<partsType>& parts,`
`817`	`817`
`818`	`818`	`// burn-in`
`819`	`819`	`for(int i = 0; i < num_rep/10; ++i) {`
`820`		`- richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);`
	`820`	`+ APRNumericsGPU::richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);`
`821`	`821`	`}`
`822`	`822`	`cudaDeviceSynchronize();`
`823`	`823`
`824`	`824`	`timer.start_timer("richardson lucy");`
`825`	`825`	`for(int i = 0; i < num_rep; ++i) {`
`826`		`- richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);`
	`826`	`+ APRNumericsGPU::richardson_lucy(access, tree_access, input_gpu.get(), output_gpu.get(), stencil_gpu.get(), stencil_gpu.get(), 5, niter, true);`
`827`	`827`	`}`
`828`	`828`	`cudaDeviceSynchronize();`
`829`	`829`	`timer.stop_timer();`