
Commit 9604c63

Linear access now uses the correct CUDA stream; bspline params are computed in the constructor and memory for them is preallocated
1 parent d2fd1d0 commit 9604c63

2 files changed: +35, -39 lines

src/algorithm/ComputeGradientCuda.cu

Lines changed: 30 additions & 34 deletions
@@ -134,7 +134,7 @@ namespace {
     };
 }
 
-auto transferSpline(BsplineParams &aParams, cudaStream_t aStream) {
+auto transferSpline(const BsplineParams &aParams, cudaStream_t aStream) {
     ScopedCudaMemHandler<float*, H2D> bc1(aParams.bc1.get(), aParams.k0, aStream);
     ScopedCudaMemHandler<float*, H2D> bc2(aParams.bc2.get(), aParams.k0, aStream);
     ScopedCudaMemHandler<float*, H2D> bc3(aParams.bc3.get(), aParams.k0, aStream);
@@ -267,11 +267,13 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
 
     // bspline stuff
    const float tolerance = 0.0001;
-    BsplineParams params;
-    ScopedCudaMemHandler<float*, H2D> bc1;
-    ScopedCudaMemHandler<float*, H2D> bc2;
-    ScopedCudaMemHandler<float*, H2D> bc3;
-    ScopedCudaMemHandler<float*, H2D> bc4;
+    std::pair<BsplineParamsCuda, BsplineParamsCudaMemoryHandlers> cudax;
+    std::pair<BsplineParamsCuda, BsplineParamsCudaMemoryHandlers> cuday;
+    std::pair<BsplineParamsCuda, BsplineParamsCudaMemoryHandlers> cudaz;
+    BsplineParamsCuda splineCudaX;
+    BsplineParamsCuda splineCudaY;
+    BsplineParamsCuda splineCudaZ;
+
     const size_t boundaryLen;
     ScopedCudaMemHandler<float*, JUST_ALLOC> boundary;
 
@@ -306,19 +308,18 @@ public:
         iAprInfo(iCpuImage.getDimension()),
         iBsplineOffset(bspline_offset),
         iMaxLevel(maxLevel),
-        // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension.
-        // Should be fixed when other parts of pipeline are ready.
-        // params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)),
-        // bc1(params.bc1.get(), params.k0, iStream),
-        // bc2(params.bc2.get(), params.k0, iStream),
-        // bc3(params.bc3.get(), params.k0, iStream),
-        // bc4(params.bc4.get(), params.k0, iStream),
+        cudax(transferSpline(prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance), iStream)),
+        cuday(transferSpline(prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance), iStream)),
+        cudaz(transferSpline(prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance), iStream)),
         boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
         boundary{nullptr, boundaryLen, iStream},
         pctc(iAprInfo, iStream),
         y_vec(nullptr, iAprInfo.getSize(), iStream)
     {
-        // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n";
+        splineCudaX = cudax.first;
+        splineCudaY = cuday.first;
+        splineCudaZ = cudaz.first;
+        std::cout << "\n=============== GpuProcessingTaskImpl ===================" << iStream << "\n\n";
         // std::cout << iCpuImage << std::endl;
         // std::cout << iCpuLevels << std::endl;
     }
@@ -332,47 +333,42 @@ public:
     }
 
     LinearAccessCudaStructs getDataFromGpu() {
-        // CurrentTime ct;
-        // uint64_t start = ct.microseconds();
-        // local_scale_temp.copyD2H();
-        // checkCuda(cudaStreamSynchronize(iStream));
-        // std::cout << "RCV time: " << ct.microseconds() - start << std::endl;
+        // TODO: Temporarily turned off here since synchronized already in computeLinearStructureCuda
+        // checkCuda(cudaStreamSynchronize(iStream));
+
         return std::move(lacs);
     }
 
     void processOnGpu() {
-        CurrentTime ct;
+        // image.copyH2D();
+        CurrentTime ct{};
         uint64_t start = ct.microseconds();
 
-        // TODO: temporarily bspline params are generated here
-        // In principle this is OK and correct but would be faster (for processing series of same size images) if
-        // they would be calculated in constructor of GpuProcessingTaskImpl class (once).
-        BsplineParams px = prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance);
-        auto cudax = transferSpline(px, iStream);
-        auto splineCudaX = cudax.first;
-        BsplineParams py = prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance);
-        auto cuday = transferSpline(py, iStream);
-        auto splineCudaY = cuday.first;
-        BsplineParams pz = prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance);
-        auto cudaz = transferSpline(pz, iStream);
-        auto splineCudaZ = cudaz.first;
-
+        CudaTimer time(false, "PIPELINE");
+        time.start_timer("getgradient");
         getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(),
                         splineCudaX, splineCudaY, splineCudaZ, boundary.get(),
                         iBsplineOffset, iParameters, iStream);
+        time.stop_timer();
+        time.start_timer("intensity");
         runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream);
+        time.stop_timer();
+
 
         // Apply parameters from APRConverter:
+        time.start_timer("runs....");
         runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
         runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
         runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
         // TODO: automatic parameters are not implemented for GPU pipeline (yet)
+        time.stop_timer();
 
+        time.start_timer("compute lev");
         float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
         float level_factor = pow(2, iMaxLevel) * min_dim;
         const float mult_const = level_factor/iParameters.rel_error;
         runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream);
-
+        time.stop_timer();
         computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
         computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream);
     }
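The ComputeGradientCuda.cu change moves the per-dimension bspline parameter preparation and its host-to-device transfer out of processOnGpu() and into the GpuProcessingTaskImpl constructor, so the device buffers are allocated and filled once and merely reused on every call. A minimal, self-contained sketch of that pattern follows; the names (PrecomputedGpuTask, deviceParams, process) are hypothetical stand-ins that mirror the structure of the change, not LibAPR's actual API.

    // Hypothetical sketch: parameters are computed on the host and uploaded once,
    // on the task's stream, in the constructor; process() only reuses them.
    #include <cuda_runtime.h>
    #include <cstddef>
    #include <vector>

    class PrecomputedGpuTask {
    public:
        PrecomputedGpuTask(size_t len, cudaStream_t stream)
            : stream(stream), len(len), hostParams(len, 1.0f) {
            // Allocate the device buffer once (the "memory ... preallocated" part of the commit).
            cudaMalloc(&deviceParams, len * sizeof(float));
            // Upload asynchronously on the same stream the later kernels will use.
            cudaMemcpyAsync(deviceParams, hostParams.data(), len * sizeof(float),
                            cudaMemcpyHostToDevice, stream);
        }

        ~PrecomputedGpuTask() { cudaFree(deviceParams); }

        void process() {
            // Kernels launched on `stream` would read deviceParams here; nothing is
            // recomputed or re-transferred per call, unlike the old processOnGpu().
        }

    private:
        cudaStream_t stream;
        size_t len;
        std::vector<float> hostParams;   // computed on the host in the constructor
        float *deviceParams = nullptr;   // device copy kept alive for the task's lifetime
    };

In the real class the std::pair<BsplineParamsCuda, BsplineParamsCudaMemoryHandlers> members play the role of deviceParams: the .second handlers presumably keep the transferred device copies alive across calls, while the .first values (splineCudaX/Y/Z) are the lightweight views handed to getGradientCuda.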

src/data_structures/APR/access/LinearAccessCuda.cu

Lines changed: 5 additions & 5 deletions
@@ -560,9 +560,9 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara
 
 
     {
-        ScopedCudaMemHandler<uint16_t *, D2H> y_vec_cuda(y_vec.data(), y_vec.size());
-        ScopedCudaMemHandler<uint64_t *, D2H> xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size());
-        ScopedCudaMemHandler<uint64_t *, H2D | D2H> level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size());
+        ScopedCudaMemHandler<uint16_t *, D2H> y_vec_cuda(y_vec.data(), y_vec.size(), aStream);
+        ScopedCudaMemHandler<uint64_t *, D2H> xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size(), aStream);
+        ScopedCudaMemHandler<uint64_t *, H2D | D2H> level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size(), aStream);
         GenInfoGpuAccess giga(gi, aStream);
         if (gi.l_max <= 2) {
             runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), gi, giga, aStream);
@@ -612,8 +612,8 @@ void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_ma
 
 
     {
-        ScopedCudaMemHandler<uint64_t *, D2H> xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size());
-        ScopedCudaMemHandler<uint64_t *, H2D | D2H> level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size());
+        ScopedCudaMemHandler<uint64_t *, D2H> xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size(), aStream);
+        ScopedCudaMemHandler<uint64_t *, H2D | D2H> level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size(), aStream);
         GenInfoGpuAccess giga(gi, aStream);
         if (gi.l_max <= 2) {
             runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream);
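The LinearAccessCuda.cu part is the "correct cuda stream" half of the commit: the ScopedCudaMemHandler instances now receive aStream, so their transfers are enqueued on the same stream as the kernels rather than, presumably, the default stream. A minimal self-contained sketch of why per-stream ordering matters, using hypothetical names (processOnStream, hostData) rather than LibAPR's API:

    // Hypothetical sketch: copies issued on the caller's stream are ordered with the
    // kernels on that stream, so no device-wide synchronization is needed and work on
    // other streams is not serialized against this task.
    #include <cuda_runtime.h>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void processOnStream(std::vector<uint16_t> &hostData, cudaStream_t stream) {
        const size_t bytes = hostData.size() * sizeof(uint16_t);

        uint16_t *deviceData = nullptr;
        cudaMalloc(&deviceData, bytes);

        // H2D copy enqueued on the caller's stream; it runs before any kernel launched
        // later on the same stream. (For real overlap the host buffer should be pinned.)
        cudaMemcpyAsync(deviceData, hostData.data(), bytes, cudaMemcpyHostToDevice, stream);

        // ... kernels operating on deviceData would be launched on `stream` here ...

        // D2H copy back, again on the same stream.
        cudaMemcpyAsync(hostData.data(), deviceData, bytes, cudaMemcpyDeviceToHost, stream);

        // Wait only for this stream before reading hostData or freeing the buffer.
        cudaStreamSynchronize(stream);
        cudaFree(deviceData);
    }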
