GPU pipeline now works for APRConverter!

krzysg · krzysg · commit d2fd1d0f4f5a · 2024-08-22T13:24:39.000+02:00
diff --git a/examples/Example_get_apr.h b/examples/Example_get_apr.h
@@ -30,7 +30,7 @@ struct cmdLineOptions{
     bool auto_parameters = false;
 
     float Ip_th = 0;
-    float lambda = -1;
+    float lambda = 3.0;
     float sigma_th = 0;
     float rel_error = 0.1;
     float grad_th = 1;
diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp
@@ -117,7 +117,7 @@ class APRConverter {
     PixelData<float> local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
     PixelData<float> local_scale_temp2;
 
-    void applyParameters(APR& aAPR,APRParameters& aprParameters);
+    void applyParameters(APRParameters& aprParameters);
 
     template<typename T>
     void computeL(APR& aAPR,PixelData<T>& input_image);
@@ -184,7 +184,7 @@ void APRConverter<ImageType>::get_apr_custom_grad_scale(APR& aAPR,PixelData<Imag
     }
 
     aAPR.parameters = par;
-    applyParameters(aAPR,par);
+    applyParameters(par);
     solveForAPR(aAPR);
     generateDatastructures(aAPR);
 
@@ -251,7 +251,7 @@ void APRConverter<ImageType>::computeL(APR& aAPR,PixelData<T>& input_image){
 }
 
 template<typename ImageType>
-void APRConverter<ImageType>::applyParameters(APR& aAPR,APRParameters& aprParameters) {
+void APRConverter<ImageType>::applyParameters(APRParameters& aprParameters) {
     //
     //  Apply the main parameters
     //
@@ -265,39 +265,7 @@ void APRConverter<ImageType>::applyParameters(APR& aAPR,APRParameters& aprParame
     }
     fine_grained_timer.stop_timer();
 
-    fine_grained_timer.start_timer("threshold");
-    iComputeGradient.threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset);
-    fine_grained_timer.stop_timer();
-
-    float max_th = 60000;
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel for default(shared)
-#endif
-    for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
-
-        float rescaled = local_scale_temp.mesh[i];
-        if (rescaled < aprParameters.sigma_th) {
-            rescaled = (rescaled < aprParameters.sigma_th_max) ? max_th : par.sigma_th;
-            local_scale_temp.mesh[i] = rescaled;
-        }
-    }
-
-#ifdef HAVE_LIBTIFF
-    if(par.output_steps) {
-        TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_rescaled.tif", local_scale_temp);
-    }
-#endif
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel for default(shared)
-#endif
-    for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
-
-        if(grad_temp.mesh[i] < aprParameters.grad_th){
-            grad_temp.mesh[i] = 0;
-        }
-    }
+    iComputeGradient.applyParameters(grad_temp, local_scale_temp, local_scale_temp2, aprParameters, bspline_offset);
 }
 
 
@@ -405,7 +373,7 @@ inline bool APRConverter<ImageType>::get_lrf(APR &aAPR, PixelData<T>& input_imag
 template<typename ImageType>
 inline bool APRConverter<ImageType>::get_ds(APR &aAPR) {
 
-    applyParameters(aAPR,par);
+    applyParameters(par);
     aAPR.parameters = par;
 
     solveForAPR(aAPR);
@@ -426,104 +394,47 @@ inline bool APRConverter<ImageType>::get_ds(APR &aAPR) {
  */
 template<typename ImageType> template<typename T>
 inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input_image) {
-    if (!initPipelineAPR(aAPR, input_image)) return false;
 
+    if (!initPipelineAPR(aAPR, input_image)) return false;
 
     initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
 
-    method_timer.start_timer("compute_gradient_magnitude_using_bsplines and local instensity scale CUDA");
-    APRTimer t(true);
-    APRTimer d(true);
-    t.start_timer(" =========== ALL");
-    {
-
-        computation_timer.start_timer("init_mem");
-        PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
-
-        /////////////////////////////////
-        /// Pipeline
-        ////////////////////////
-        // offset image by factor (this is required if there are zero areas in the background with
-        // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
-        // Warning both of these could result in over-flow!
-
-        if (std::is_same<uint16_t, ImageType>::value) {
-            bspline_offset = 100;
-            image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-        } else if (std::is_same<uint8_t, ImageType>::value) {
-            bspline_offset = 5;
-            image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-        } else {
-            image_temp.copyFromMesh(input_image);
-        }
-
-        computation_timer.stop_timer();
-
-        std::vector<GpuProcessingTask<ImageType>> gpts;
-
-        int numOfStreams = 1;
-        int repetitionsPerStream = 1;
-
-        computation_timer.start_timer("compute_L");
-        // Create streams and send initial task to do
-        for (int i = 0; i < numOfStreams; ++i) {
-            gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-            gpts.back().sendDataToGpu();
-            gpts.back().processOnGpu();
-        }
-        computation_timer.stop_timer();
-
-
-        for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
-            int c = i % numOfStreams;
-
-            computation_timer.start_timer("apply_parameters");
-            // get data from previous task
-            gpts[c].getDataFromGpu();
-
-            computation_timer.stop_timer();
-
-            // in theory we get new data and send them to task
-            if (i  < numOfStreams * (repetitionsPerStream - 1)) {
-                gpts[c].sendDataToGpu();
-                gpts[c].processOnGpu();
-            }
-
-            // Postprocess on CPU
-            std::cout << "--------- start CPU processing ---------- " << i << std::endl;
-
-            computation_timer.start_timer("solve_for_apr");
-            iPullingScheme.initialize_particle_cell_tree(aAPR.aprInfo);
-
-            PixelData<float> lst(local_scale_temp, true);
-
-#ifdef HAVE_LIBTIFF
-            if (par.output_steps){
-                TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_step.tif", lst);
-            }
-#endif
+    computation_timer.start_timer("init_mem");
+    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
 
-#ifdef HAVE_LIBTIFF
-            if (par.output_steps){
-                TiffUtils::saveMeshAsTiff(par.output_dir + "gradient_step.tif", grad_temp);
-            }
-#endif
+    /////////////////////////////////
+    /// Pipeline
+    ////////////////////////
+    // offset image by factor (this is required if there are zero areas in the background with
+    // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
+    // Warning both of these could result in over-flow!
 
-            iLocalParticleSet.get_local_particle_cell_set(iPullingScheme,lst, local_scale_temp2,par);
+    if (std::is_same<uint16_t, ImageType>::value) {
+        bspline_offset = 100;
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
+    } else if (std::is_same<uint8_t, ImageType>::value) {
+        bspline_offset = 5;
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
+    } else {
+        image_temp.copyFromMesh(input_image);
+    }
+    computation_timer.stop_timer(); // closes "init_mem"; without it the timer started above is never stopped
 
-            iPullingScheme.pulling_scheme_main();
+    // Whole pipeline runs on the GPU: upload the image, process it, then fetch the resulting linear access structures.
+    GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
+    gpt.sendDataToGpu();
+    gpt.processOnGpu();
+    auto linearAccessGpu = gpt.getDataFromGpu();
 
-            computation_timer.stop_timer();
+    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
 
-            computation_timer.start_timer("generate_data_structures");
-            generateDatastructures(aAPR);
-            computation_timer.stop_timer();
-        }
-        std::cout << "Total n ENDED" << std::endl;
+    // CUDA counterpart of generateDatastructures(aAPR): copy the linear access structures returned by the GPU task
+    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+    aAPR.apr_initialized = true;
 
-    }
-    t.stop_timer();
-    method_timer.stop_timer();
+    std::cout << "CUDA pipeline finished!\n";
 
     return true;
 }
@@ -565,7 +474,7 @@ inline bool APRConverter<ImageType>::get_apr_cpu(APR &aAPR, PixelData<T> &input_
         method_timer.stop_timer();
     }
 
-    applyParameters(aAPR,par);
+    applyParameters(par);
 
     computation_timer.stop_timer();
 
@@ -597,7 +506,7 @@ template<typename ImageType> template<typename T>
 inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_image) {
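-// TODO: CUDA pipeline is temporarily turned off and CPU version is always chosen.
-//       After revising a CUDA pipeline remove "#if true // " part.
+// The pipeline is selected at compile time: the CUDA version when APR_USE_CUDA is defined,
+// the CPU version otherwise.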
-#if true // #ifndef APR_USE_CUDA
+#ifndef APR_USE_CUDA
     return get_apr_cpu(aAPR, input_image);
 #else
     return get_apr_cuda(aAPR, input_image);
diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp
@@ -38,6 +38,35 @@ class ComputeGradient {
     template<typename T>
     void calc_inv_bspline_z(PixelData<T> &input);
 
+    /// Applies the user thresholds in place: gradient via threshold_gradient (Ip_th + bspline_offset) and grad_th, local intensity scale via sigma_th / sigma_th_max.
+    template<typename T>
+    void applyParameters(PixelData<T> &grad_temp, PixelData<float> &local_scale_temp, PixelData<float> &local_scale_temp2, APRParameters &aprParameters, float bspline_offset) {
+        threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset);
+
+        // Value substituted for local scale entries that are below sigma_th_max.
+        const float max_th = 60000;
+        const size_t scale_len = local_scale_temp.mesh.size(); // bound the loop by the mesh that is actually indexed
+#ifdef HAVE_OPENMP
+#pragma omp parallel for default(shared)
+#endif
+        for (size_t i = 0; i < scale_len; ++i) {
+            const float scale = local_scale_temp.mesh[i];
+            if (scale < aprParameters.sigma_th) {
+                local_scale_temp.mesh[i] = (scale < aprParameters.sigma_th_max) ? max_th : aprParameters.sigma_th;
+            }
+        }
+
+        // Zero every gradient value that is below grad_th.
+#ifdef HAVE_OPENMP
+#pragma omp parallel for default(shared)
+#endif
+        for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
+            if (grad_temp.mesh[i] < aprParameters.grad_th) {
+                grad_temp.mesh[i] = 0;
+            }
+        }
+    }
+
     struct three_temps {
         float temp_1, temp_2, temp_3;
     };
diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu
@@ -57,6 +57,7 @@ namespace {
     }
 
     BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) {
+
         // Recursive Filter Implimentation for Smoothing BSplines
         // B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993
 
@@ -79,8 +80,8 @@ namespace {
 
         const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2);
   
-        //std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1
-        //          << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl;
+//        std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1
+//                  << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << " lambda=" << lambda << " tol=" << tol << std::endl;
 
         // ------- Calculating boundary conditions
 
@@ -169,18 +170,18 @@ void getGradientCuda(const PixelData<ImgType> &image, PixelData<float> &local_sc
 
     // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim
 
-    runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream);
-    runBsplineXdir(cudaImage, image.getDimension(), px, aStream);
-    runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
+    if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream);
+    if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream);
+    if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
 
 
     runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream);
 
     runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream);
 
-    runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
-    runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
-    runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
 }
 
 class CurrentTime {
@@ -202,6 +203,49 @@ public:
 };
 
 
+/**
+ * Zeroes output[i] wherever input[i] <= thresholdLevel; every other element of output is left unchanged.
+ * @param input - values compared against thresholdLevel (the caller may pass the same array as output)
+ * @param output - array whose elements are zeroed
+ * @param length - number of elements in the input/output arrays
+ * @param thresholdLevel - inclusive upper bound of the input values that cause zeroing
+ */
+template <typename T, typename S>
+__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) {
+    const size_t i = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= length) return; // threads past the end of the arrays have nothing to do
+
+    if (input[i] <= thresholdLevel) output[i] = 0;
+}
+
+/// Launches threshold() on aStream, one thread per element: zeroes cudaGrad[i] where cudaImage[i] <= Ip_th.
+template <typename ImgType, typename T>
+void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) {
+    const size_t length = x_num * y_num * z_num, threadsPerBlock = 64; // block count below is rounded up to cover length
+    threshold<<<dim3((length + threadsPerBlock - 1) / threadsPerBlock), dim3(threadsPerBlock), 0, aStream>>>(cudaImage, cudaGrad, length, Ip_th);
+}
+
+/// In-place rescale of data[0..len): a value below sigmaThreshold becomes 60000 if it is also below sigmaThresholdMax, otherwise sigmaThreshold.
+template<typename T>
+__global__ void rescaleAndThreshold(T *data, size_t len, float sigmaThreshold, float sigmaThresholdMax) {
+    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= len) return; // threads past the end of the array have nothing to do
+
+    const float max_th = 60000.0;
+    const float value = data[i];
+    float result = value;
+    if (value < sigmaThreshold) result = (value < sigmaThresholdMax) ? max_th : sigmaThreshold;
+    data[i] = result;
+}
+
+/// Launches rescaleAndThreshold() on aStream with one thread per element of data (sigma -> sigmaThreshold, sigmaMax -> sigmaThresholdMax).
+template <typename T>
+void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cudaStream_t aStream) {
+    const dim3 blockSize(64), gridSize((len + blockSize.x - 1) / blockSize.x); // grid rounded up to cover len
+    rescaleAndThreshold<<<gridSize, blockSize, 0, aStream>>>(data, len, sigma, sigmaMax);
+}
+
+
 template <typename U>
 template <typename ImgType>
 class GpuProcessingTask<U>::GpuProcessingTaskImpl {
@@ -264,11 +308,11 @@ public:
         iMaxLevel(maxLevel),
         // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension.
         //       Should be fixed when other parts of pipeline are ready.
-        params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)),
-        bc1(params.bc1.get(), params.k0, iStream),
-        bc2(params.bc2.get(), params.k0, iStream),
-        bc3(params.bc3.get(), params.k0, iStream),
-        bc4(params.bc4.get(), params.k0, iStream),
+//        params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)),
+//        bc1(params.bc1.get(), params.k0, iStream),
+//        bc2(params.bc2.get(), params.k0, iStream),
+//        bc3(params.bc3.get(), params.k0, iStream),
+//        bc4(params.bc4.get(), params.k0, iStream),
         boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
         boundary{nullptr, boundaryLen, iStream},
         pctc(iAprInfo, iStream),
@@ -317,6 +361,13 @@ public:
                          splineCudaX, splineCudaY, splineCudaZ, boundary.get(),
                         iBsplineOffset, iParameters, iStream);
         runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream);
+
+        // Apply parameters from APRConverter:
+        runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
+        runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
+        runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
+        // TODO: automatic parameters are not implemented for GPU pipeline (yet)
+
         float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
         float level_factor = pow(2, iMaxLevel) * min_dim;
         const float mult_const = level_factor/iParameters.rel_error;
diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp
diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp