Fixed CUDA-streams sync issues when copying back to CPU

krzysg · krzysg · commit b01df3115439 · 2025-08-05T16:04:22.000+02:00
diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp
@@ -400,10 +400,11 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
 
     if (!initPipelineAPR(aAPR, input_image)) return false;
 
+    total_timer.start_timer("full_pipeline");
     initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
 
     computation_timer.start_timer("init_mem");
-    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
+    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
 
     /////////////////////////////////
     /// Pipeline
@@ -435,6 +436,8 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
 
     std::cout << "CUDA pipeline finished!\n";
 
+    total_timer.stop_timer();
+
     return true;
 }
 #endif
@@ -465,14 +468,11 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
     // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
     // Warning both of these could result in over-flow!
 
-    if (std::is_same<uint16_t, ImageType>::value) {
-        bspline_offset = 100;
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    } else if (std::is_same<uint8_t, ImageType>::value) {
-        bspline_offset = 5;
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    } else {
+    if (std::is_floating_point<ImageType>::value) {
         image_temp.copyFromMesh(input_image);
+    } else {
+        bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
     }
 
 
diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu
@@ -447,48 +447,46 @@ public:
     }
 
     void processOnGpu() {
+        // Set it and copy first before copying the image
+        // It improves *a lot* performance even though it is needed later in computeLinearStructureCuda()
+        iAprInfo.total_number_particles = 0; // reset total_number_particles to 0
+        giga.copyHtoD();
+        level_xz_vec_cuda.copyH2D();
+
         image.copyH2D();
-        CurrentTime ct{};
-        uint64_t start = ct.microseconds();
 
-        CudaTimer time(false, "PIPELINE");
-        time.start_timer("getgradient");
         getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(),
                          splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetectedPinned[0], isErrorDetectedCuda,
                         iBsplineOffset, iParameters, iStream);
-        time.stop_timer();
-        time.start_timer("intensity");
+
         runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), lstPadded.get(), lst2Padded.get(), iStream);
-        time.stop_timer();
 
         // Apply parameters from APRConverter:
-        time.start_timer("runs....");
         runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
         runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
         runThresholdOpen(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
         // TODO: automatic parameters are not implemented for GPU pipeline (yet)
-        time.stop_timer();
 
-        time.start_timer("compute lev");
         float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
         float level_factor = pow(2, iMaxLevel) * min_dim;
         const float mult_const = level_factor/iParameters.rel_error;
         runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream);
-        time.stop_timer();
         computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
 
-
-        level_xz_vec_cuda.copyH2D();
-        iAprInfo.total_number_particles = 0; // reset total_number_particles to 0
-        giga.copyHtoD();
         computeLinearStructureCuda(y_vec_cuda.get(), xz_end_vec_cuda.get(), level_xz_vec_cuda.get(), pctc, iAprInfo, giga, iParameters, counter_total, iStream);
 
-        xz_end_vec_cuda.copyD2H();
+        // Get data from GPU - first we need to get number of particles to resize y_vec and have idea how many particles to copy - that is why we need to synchronize first time
+        giga.copyDtoH();
+        checkCuda(cudaStreamSynchronize(iStream));
 
-        // Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image)
+        // Start copying the data from GPU to CPU
+        xz_end_vec_cuda.copyD2H();
+        // Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image) and copy data from GPU
         y_vec.resize(iAprInfo.total_number_particles);
-
+        // Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures
         checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream));
+
+        // Synchronize last time - at that moment all data from GPU is copied to CPU
         checkCuda(cudaStreamSynchronize(iStream));
 
         // Prepare CPU structures
diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu
@@ -548,5 +548,4 @@ void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda,
         runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, aStream);
         runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, counter_total, aStream);
     }
-    giga.copyDtoH();
 }

Original file line number	Diff line number	Diff line change
`@@ -548,5 +548,4 @@ void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda,`
`548`	`548`	`runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, aStream);`
`549`	`549`	`runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, counter_total, aStream);`
`550`	`550`	`}`
`551`		`- giga.copyDtoH();`
`552`	`551`	`}`