Skip to content

Commit b01df31

Browse files
committed
Fixed CUDA-streams sync issues when copying back to CPU
1 parent 7f6e2d3 commit b01df31

File tree

3 files changed

+24
-27
lines changed

3 files changed

+24
-27
lines changed

src/algorithm/APRConverter.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,11 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
400400

401401
if (!initPipelineAPR(aAPR, input_image)) return false;
402402

403+
total_timer.start_timer("full_pipeline");
403404
initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
404405

405406
computation_timer.start_timer("init_mem");
406-
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
407+
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
407408

408409
/////////////////////////////////
409410
/// Pipeline
@@ -435,6 +436,8 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
435436

436437
std::cout << "CUDA pipeline finished!\n";
437438

439+
total_timer.stop_timer();
440+
438441
return true;
439442
}
440443
#endif
@@ -465,14 +468,11 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
465468
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
466469
// Warning both of these could result in over-flow!
467470

468-
if (std::is_same<uint16_t, ImageType>::value) {
469-
bspline_offset = 100;
470-
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
471-
} else if (std::is_same<uint8_t, ImageType>::value) {
472-
bspline_offset = 5;
473-
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
474-
} else {
471+
if (std::is_floating_point<ImageType>::value) {
475472
image_temp.copyFromMesh(input_image);
473+
} else {
474+
bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
475+
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
476476
}
477477

478478

src/algorithm/ComputeGradientCuda.cu

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -447,48 +447,46 @@ public:
447447
}
448448

449449
void processOnGpu() {
450+
// Set it and copy first before copying the image
451+
// It improves *a lot* performance even though it is needed later in computeLinearStructureCuda()
452+
iAprInfo.total_number_particles = 0; // reset total_number_particles to 0
453+
giga.copyHtoD();
454+
level_xz_vec_cuda.copyH2D();
455+
450456
image.copyH2D();
451-
CurrentTime ct{};
452-
uint64_t start = ct.microseconds();
453457

454-
CudaTimer time(false, "PIPELINE");
455-
time.start_timer("getgradient");
456458
getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(),
457459
splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetectedPinned[0], isErrorDetectedCuda,
458460
iBsplineOffset, iParameters, iStream);
459-
time.stop_timer();
460-
time.start_timer("intensity");
461+
461462
runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), lstPadded.get(), lst2Padded.get(), iStream);
462-
time.stop_timer();
463463

464464
// Apply parameters from APRConverter:
465-
time.start_timer("runs....");
466465
runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
467466
runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
468467
runThresholdOpen(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
469468
// TODO: automatic parameters are not implemented for GPU pipeline (yet)
470-
time.stop_timer();
471469

472-
time.start_timer("compute lev");
473470
float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
474471
float level_factor = pow(2, iMaxLevel) * min_dim;
475472
const float mult_const = level_factor/iParameters.rel_error;
476473
runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream);
477-
time.stop_timer();
478474
computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
479475

480-
481-
level_xz_vec_cuda.copyH2D();
482-
iAprInfo.total_number_particles = 0; // reset total_number_particles to 0
483-
giga.copyHtoD();
484476
computeLinearStructureCuda(y_vec_cuda.get(), xz_end_vec_cuda.get(), level_xz_vec_cuda.get(), pctc, iAprInfo, giga, iParameters, counter_total, iStream);
485477

486-
xz_end_vec_cuda.copyD2H();
478+
// Get data from GPU - first we need to get number of particles to resize y_vec and have idea how many particles to copy - that is why we need to synchronize first time
479+
giga.copyDtoH();
480+
checkCuda(cudaStreamSynchronize(iStream));
487481

488-
// Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image)
482+
// Start copying the data from GPU to CPU
483+
xz_end_vec_cuda.copyD2H();
484+
// Trim buffer to calculated size (initially it is allocated to worst case - same number of particles as pixels in input image) and copy data from GPU
489485
y_vec.resize(iAprInfo.total_number_particles);
490-
486+
// Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures
491487
checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream));
488+
489+
// Synchornize last time - at that moment all data from GPU is copied to CPU
492490
checkCuda(cudaStreamSynchronize(iStream));
493491

494492
// Prepare CPU structures

src/data_structures/APR/access/LinearAccessCuda.cu

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -548,5 +548,4 @@ void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda,
548548
runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, aStream);
549549
runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda, xz_end_vec_cuda, y_vec_cuda, counter_total, aStream);
550550
}
551-
giga.copyDtoH();
552551
}

0 commit comments

Comments
 (0)