add cuda gradient/sobel magnitude methods + tests

joeljonsson · joeljonsson · commit 06da7ec37883 · 2022-06-01T15:57:02.000+02:00
diff --git a/src/numerics/APRNumericsGPU.cu b/src/numerics/APRNumericsGPU.cu
@@ -4,6 +4,96 @@
 
 #include "APRNumericsGPU.hpp"
 
+
+// Per-particle magnitude sqrt(gy^2 + gx^2 + gz^2) of three 3x3x3 stencil responses, written to outputParticles.
+template<typename InputType>
+void APRNumericsGPU::gradient_magnitude(GPUAccessHelper &access, GPUAccessHelper &tree_access,
+                                        VectorData<InputType> &inputParticles, VectorData<float> &outputParticles,
+                                        VectorData<float> &stencil_vec_y, VectorData<float> &stencil_vec_x,
+                                        VectorData<float> &stencil_vec_z) {
+    // initialize GPU access data
+    tree_access.init_gpu();
+    access.init_gpu(tree_access);
+
+    // check stencils: each vector must hold at least one 3x3x3 (= 27 element) stencil
+    assert(stencil_vec_y.size() >= 27);
+    assert(stencil_vec_x.size() >= 27);
+    assert(stencil_vec_z.size() >= 27);
+    // flag is set when a vector is long enough to hold 27 values per level step (level_max - level_min)
+    const bool downsampled_y = (stencil_vec_y.size() >= 27 * (access.level_max() - access.level_min()));
+    const bool downsampled_x = (stencil_vec_x.size() >= 27 * (access.level_max() - access.level_min()));
+    const bool downsampled_z = (stencil_vec_z.size() >= 27 * (access.level_max() - access.level_min()));
+
+    // initialize output; with no particles there is nothing to compute (and a zero-sized launch is invalid)
+    const size_t N = access.total_number_particles();
+    outputParticles.resize(N);
+    if (N == 0) { return; }
+
+    // find non-empty rows
+    VectorData<int> ne_counter_ds;
+    VectorData<int> ne_counter_333;
+    ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_ds_gpu;
+    ScopedCudaMemHandler<int*, JUST_ALLOC> ne_rows_333_gpu;
+    compute_ne_rows_tree_cuda<16, 32>(tree_access, ne_counter_ds, ne_rows_ds_gpu);
+    compute_ne_rows_cuda<16, 32>(access, ne_counter_333, ne_rows_333_gpu, 2);
+
+    // allocate GPU memory (input and stencils use H2D handlers, all other buffers are JUST_ALLOC)
+    ScopedCudaMemHandler<InputType*, H2D> input_gpu(inputParticles.data(), inputParticles.size());
+    ScopedCudaMemHandler<float*, JUST_ALLOC> output_gpu(outputParticles.data(), outputParticles.size());
+    ScopedCudaMemHandler<float*, JUST_ALLOC> tmp_output(NULL, access.total_number_particles());
+    ScopedCudaMemHandler<float*, JUST_ALLOC> tree_gpu(NULL, tree_access.total_number_particles());
+    ScopedCudaMemHandler<float*, H2D> stencil_y_gpu(stencil_vec_y.data(), stencil_vec_y.size());
+    ScopedCudaMemHandler<float*, H2D> stencil_x_gpu(stencil_vec_x.data(), stencil_vec_x.size());
+    ScopedCudaMemHandler<float*, H2D> stencil_z_gpu(stencil_vec_z.data(), stencil_vec_z.size());
+
+    // compute block and grid size for elementwise operations (enough blocks to cover N, rounded up)
+    int blockSize, minGridSize;
+    error_check( cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, addSquare, 0, 0) )
+    const int gridSize = static_cast<int>((N + blockSize - 1) / blockSize);
+
+    // fill tree by average downsampling
+    downsample_avg(access, tree_access, input_gpu.get(), tree_gpu.get(), ne_rows_ds_gpu.get(), ne_counter_ds);
+    error_check( cudaDeviceSynchronize() )
+
+    // compute y gradient
+    isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), output_gpu.get(),
+                                      stencil_y_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(),
+                                      ne_counter_333, downsampled_y);
+    error_check( cudaDeviceSynchronize() )
+    // square y gradient (execution configuration is <<<grid, block>>>, grid dimension first)
+    elementWiseMult<<<gridSize, blockSize>>>(output_gpu.get(), output_gpu.get(), N);
+    error_check( cudaGetLastError() )
+
+    // compute x gradient
+    isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), tmp_output.get(),
+                                      stencil_x_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(),
+                                      ne_counter_333, downsampled_x);
+    error_check( cudaDeviceSynchronize() )
+
+    // add square of x gradient to output
+    addSquare<<<gridSize, blockSize>>>(output_gpu.get(), tmp_output.get(), N);
+    error_check( cudaGetLastError() )
+    error_check( cudaDeviceSynchronize() )
+
+    // compute z gradient
+    isotropic_convolve_333_reflective(access, tree_access, input_gpu.get(), tmp_output.get(),
+                                      stencil_z_gpu.get(), tree_gpu.get(), ne_rows_333_gpu.get(),
+                                      ne_counter_333, downsampled_z);
+    error_check( cudaDeviceSynchronize() )
+
+    // add square of z gradient to output
+    addSquare<<<gridSize, blockSize>>>(output_gpu.get(), tmp_output.get(), N);
+    error_check( cudaGetLastError() )
+    error_check( cudaDeviceSynchronize() )
+
+    // take the square root of the accumulated squares, then copy the result back to the host
+    elementWiseSqrt<<<gridSize, blockSize>>>(output_gpu.get(), N);
+    error_check( cudaGetLastError() )
+    error_check( cudaDeviceSynchronize() )
+    output_gpu.copyD2H();
+}
+
+
 template<typename inputType, typename stencilType>
 void APRNumericsGPU::richardson_lucy(GPUAccessHelper& access, GPUAccessHelper& tree_access, inputType* input,
                                      stencilType* output, stencilType* psf, stencilType* psf_flipped, int kernel_size,
@@ -159,6 +249,11 @@ void APRNumericsGPU::richardson_lucy(GPUAccessHelper& access, GPUAccessHelper& t
 }
 
 
+template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&);  // explicit instantiation, InputType = uint8_t (the template is defined in this .cu file)
+template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&);  // InputType = uint16_t
+template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&);  // InputType = uint64_t
+template void APRNumericsGPU::gradient_magnitude(GPUAccessHelper&, GPUAccessHelper&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&, VectorData<float>&);  // InputType = float
+
 template void APRNumericsGPU::richardson_lucy(GPUAccessHelper&, GPUAccessHelper&, uint16_t*, float*, float*, float*, int, int, bool, bool);
 template void APRNumericsGPU::richardson_lucy(GPUAccessHelper&, GPUAccessHelper&, float*, float*, float*, float*, int, int, bool, bool);
 
diff --git a/src/numerics/APRNumericsGPU.hpp b/src/numerics/APRNumericsGPU.hpp
@@ -49,6 +49,57 @@ namespace APRNumericsGPU {
                         float delta = 1.0f);
 
 
+    /**
+     * Apply 3x3x3 convolution using three input stencils and compute the magnitude sqrt(dz*dz + dy*dy + dx*dx)
+     * of the results.
+     * @tparam InputType        element type of the input particle values
+     * @param access            GPU access helper for the particles; the implementation calls init_gpu on it
+     * @param tree_access       GPU access helper for the tree; the implementation calls init_gpu on it
+     * @param inputParticles    input particle values
+     * @param outputParticles   output; resized to access.total_number_particles() and filled with the magnitudes
+     * @param stencil_vec_y     Stencil (vectors) to be applied. Should be of size 27 or `27 * (access.level_max() - access.level_min())`
+     * @param stencil_vec_x     second stencil vector, same size convention (the implementation asserts size >= 27)
+     * @param stencil_vec_z     third stencil vector, same size convention (the implementation asserts size >= 27)
+     */
+    template<typename InputType>
+    void gradient_magnitude(GPUAccessHelper &access, GPUAccessHelper &tree_access, VectorData<InputType> &inputParticles,
+                            VectorData<float> &outputParticles, VectorData<float> &stencil_vec_y,
+                            VectorData<float> &stencil_vec_x, VectorData<float> &stencil_vec_z);
+
+
+    /**
+     * Compute the gradient magnitude using level-adaptive central finite differences.
+     * Note: uses 3x3x3 convolutions instead of e.g. 1x1x3, which is quite inefficient.
+     * @tparam InputType            element type of the input particle values
+     * @param access                GPU access helper for the particles (forwarded to gradient_magnitude)
+     * @param tree_access           GPU access helper for the tree (forwarded to gradient_magnitude)
+     * @param inputParticles        input particle values
+     * @param outputParticles       output gradient magnitudes
+     * @param deltas                pixel size in each dimension, read as {y, x, z}; needs at least 3 entries (default: {1, 1, 1})
+     */
+    template<typename InputType>
+    void gradient_magnitude_cfd(GPUAccessHelper &access,
+                                GPUAccessHelper &tree_access,
+                                VectorData<InputType> &inputParticles,
+                                VectorData<float> &outputParticles,
+                                const std::vector<float> &deltas = {1.0f, 1.0f, 1.0f});
+
+
+    /**
+     * Compute the gradient magnitude using level-adaptive Sobel filters.
+     * @tparam InputType            element type of the input particle values
+     * @param access                GPU access helper for the particles (forwarded to gradient_magnitude)
+     * @param tree_access           GPU access helper for the tree (forwarded to gradient_magnitude)
+     * @param inputParticles        input particle values
+     * @param outputParticles       output gradient magnitudes
+     * @param deltas                pixel size in each dimension, read as {y, x, z}; needs at least 3 entries (default: {1, 1, 1})
+     */
+    template<typename InputType>
+    void gradient_magnitude_sobel(GPUAccessHelper &access,
+                                  GPUAccessHelper &tree_access,
+                                  VectorData<InputType>& inputParticles,
+                                  VectorData<float>& outputParticles,
+                                  const std::vector<float>& deltas = {1.0f, 1.0f, 1.0f});
 
 
     template<typename inputType, typename stencilType>
@@ -96,4 +147,51 @@ void APRNumericsGPU::gradient_sobel(GPUAccessHelper &access, GPUAccessHelper &tr
 }
 
 
+template<typename InputType>
+void APRNumericsGPU::gradient_magnitude_cfd(GPUAccessHelper &access, GPUAccessHelper &tree_access,
+                                            VectorData<InputType> &inputParticles, VectorData<float> &outputParticles,
+                                            const std::vector<float> &deltas) {
+    // central finite difference weight 1/(2*delta) per direction; deltas is read as {y, x, z}
+    const float w_y = 1.f/(2*deltas[0]), w_x = 1.f/(2*deltas[1]), w_z = 1.f/(2*deltas[2]);
+
+    // 3x3x3 stencils: -w and +w on either side of the centre element, each along a different index
+    PixelData<float> cfd_y(3, 3, 3, 0);
+    PixelData<float> cfd_x(3, 3, 3, 0);
+    PixelData<float> cfd_z(3, 3, 3, 0);
+    cfd_y.at(0, 1, 1) = -w_y;   cfd_y.at(2, 1, 1) = w_y;
+    cfd_x.at(1, 0, 1) = -w_x;   cfd_x.at(1, 2, 1) = w_x;
+    cfd_z.at(1, 1, 0) = -w_z;   cfd_z.at(1, 1, 2) = w_z;
+
+    // rescale the stencils for each level, then run the generic three-stencil magnitude computation
+    const auto num_levels = access.level_max() - access.level_min();
+    VectorData<float> cfd_vec_y, cfd_vec_x, cfd_vec_z;
+    APRStencil::get_rescaled_stencils(cfd_y, cfd_vec_y, num_levels);
+    APRStencil::get_rescaled_stencils(cfd_x, cfd_vec_x, num_levels);
+    APRStencil::get_rescaled_stencils(cfd_z, cfd_vec_z, num_levels);
+    gradient_magnitude(access, tree_access, inputParticles, outputParticles, cfd_vec_y, cfd_vec_x, cfd_vec_z);
+}
+
+
+template<typename InputType>
+void APRNumericsGPU::gradient_magnitude_sobel(GPUAccessHelper &access, GPUAccessHelper &tree_access,
+                                              VectorData<InputType> &inputParticles, VectorData<float> &outputParticles,
+                                              const std::vector<float> &deltas) {
+    // one Sobel filter per dimension index (0, 1, 2 for y, x, z), each created with its matching delta
+    auto sobel_y = APRStencil::create_sobel_filter<float>(0, deltas[0]);
+    auto sobel_x = APRStencil::create_sobel_filter<float>(1, deltas[1]);
+    auto sobel_z = APRStencil::create_sobel_filter<float>(2, deltas[2]);
+
+    // rescale the filters for each level, then run the generic three-stencil magnitude computation
+    const auto num_levels = access.level_max() - access.level_min();
+    VectorData<float> sobel_vec_y, sobel_vec_x, sobel_vec_z;
+    APRStencil::get_rescaled_stencils(sobel_y, sobel_vec_y, num_levels);
+    APRStencil::get_rescaled_stencils(sobel_x, sobel_vec_x, num_levels);
+    APRStencil::get_rescaled_stencils(sobel_z, sobel_vec_z, num_levels);
+
+    gradient_magnitude(access, tree_access, inputParticles, outputParticles, sobel_vec_y, sobel_vec_x, sobel_vec_z);
+}
+
+
+
+
 #endif //LIBAPR_APRNUMERICSGPU_HPP
diff --git a/test/APRTestCuda.cpp b/test/APRTestCuda.cpp
@@ -1471,6 +1471,67 @@ TEST_F(CreateCR124, TEST_SOBEL_CUDA) {
 }
 
 
+bool test_gradient_magnitude_cuda(TestDataGPU &test_data) {
+    // Runs the CUDA and the CPU gradient_magnitude_cfd on the same data and reports whether they agree.
+    APRTimer timer(true);
+    const std::vector<float> deltas = {0.9, 1.1, 1.3};
+    test_data.apr.init_cuda(true);
+    auto gpu_access = test_data.apr.gpuAPRHelper();
+    auto gpu_tree_access = test_data.apr.gpuTreeHelper();
+
+    // GPU result
+    timer.start_timer("gradient magnitude CUDA");
+    ParticleData<float> result_gpu;
+    APRNumericsGPU::gradient_magnitude_cfd(gpu_access, gpu_tree_access, test_data.particles_intensities.data, result_gpu.data, deltas);
+    timer.stop_timer();
+
+    // CPU reference
+    timer.start_timer("gradient magnitude CPU");
+    ParticleData<float> result_cpu;
+    APRNumerics::gradient_magnitude_cfd(test_data.apr, test_data.particles_intensities, result_cpu, deltas);
+    timer.stop_timer();
+    return compareParticles(result_cpu, result_gpu, 1e-2, 20) == 0;    // pass only if no mismatch is counted
+}
+
+TEST_F(CreateSmallSphereTest, TEST_GRADIENT_MAGNITUDE_CUDA) {
+    ASSERT_TRUE(test_gradient_magnitude_cuda(test_data));    // CUDA cfd gradient magnitude must match the CPU result
+}
+
+TEST_F(CreateCR124, TEST_GRADIENT_MAGNITUDE_CUDA) {
+    ASSERT_TRUE(test_gradient_magnitude_cuda(test_data));    // same comparison, using the CreateCR124 fixture's test_data
+}
+
+
+bool test_sobel_magnitude_cuda(TestDataGPU &test_data) {
+    // Runs the CUDA and the CPU gradient_magnitude_sobel on the same data and reports whether they agree.
+    APRTimer timer(true);
+    const std::vector<float> deltas = {0.9, 1.1, 1.3};
+    test_data.apr.init_cuda(true);
+    auto gpu_access = test_data.apr.gpuAPRHelper();
+    auto gpu_tree_access = test_data.apr.gpuTreeHelper();
+
+    // GPU result
+    timer.start_timer("sobel magnitude CUDA");
+    ParticleData<float> result_gpu;
+    APRNumericsGPU::gradient_magnitude_sobel(gpu_access, gpu_tree_access, test_data.particles_intensities.data, result_gpu.data, deltas);
+    timer.stop_timer();
+
+    // CPU reference
+    timer.start_timer("sobel magnitude CPU");
+    ParticleData<float> result_cpu;
+    APRNumerics::gradient_magnitude_sobel(test_data.apr, test_data.particles_intensities, result_cpu, deltas);
+    timer.stop_timer();
+    return compareParticles(result_cpu, result_gpu, 1e-2, 20) == 0;    // pass only if no mismatch is counted
+}
+
+TEST_F(CreateSmallSphereTest, TEST_SOBEL_MAGNITUDE_CUDA) {
+    ASSERT_TRUE(test_sobel_magnitude_cuda(test_data));    // CUDA Sobel gradient magnitude must match the CPU result
+}
+
+TEST_F(CreateCR124, TEST_SOBEL_MAGNITUDE_CUDA) {
+    ASSERT_TRUE(test_sobel_magnitude_cuda(test_data));    // same comparison, using the CreateCR124 fixture's test_data
+}
+
 
 #endif  // APR_USE_CUDA