
Commit 1b54cd0

committed
Stream operations on the GPU now work as expected, and all tests are fixed. The stream variant of APRConverter still needs to be fixed/improved, since it is draft code.
1 parent 5534860 commit 1b54cd0

15 files changed: +619 −328 lines

src/algorithm/APRConverter.hpp

Lines changed: 158 additions & 67 deletions
@@ -182,7 +182,7 @@ void APRConverter<ImageType>::get_apr_custom_grad_scale(APR& aAPR,PixelData<Imag
 
     } else {
         // To be done. The L(y) needs to be computed then max downsampled.
-        std::cerr << "Not implimented" << std::endl;
+        std::cerr << "Not implemented" << std::endl;
 
     }
 
@@ -412,17 +412,15 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
     // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
     // Warning both of these could result in over-flow!
 
-    if (std::is_same<uint16_t, ImageType>::value) {
-        bspline_offset = 100;
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    } else if (std::is_same<uint8_t, ImageType>::value) {
-        bspline_offset = 5;
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    } else {
+    if (std::is_floating_point<ImageType>::value) {
         image_temp.copyFromMesh(input_image);
+    } else {
+        bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
     }
 
     GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
+    // std::cout << "after gpt \n";
     gpt.sendDataToGpu();
     gpt.processOnGpu();
     auto linearAccessGpu = gpt.getDataFromGpu();
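
Note: the hard-coded offsets (100 for uint16_t, 5 for uint8_t) are replaced by a data-driven compute_bspline_offset call whose definition lies outside this hunk. A minimal sketch of what such a helper could look like, assuming it derives the offset from the image minimum so that (value + offset) stays non-negative after smoothing — the heuristic actually used in the repository may differ:

    #include <algorithm>
    #include <vector>

    // Hypothetical sketch only: the real compute_bspline_offset<ImageType>(input_image, par.lambda)
    // lives elsewhere in the repository and may use a different heuristic.
    template <typename ImageType, typename T>
    ImageType compute_bspline_offset_sketch(const std::vector<T> &pixels, float lambda) {
        // Shift intensities so that (value + offset) cannot become negative
        // once the B-spline coefficients undershoot around sharp edges.
        // Assumes a non-empty pixel buffer.
        T minVal = *std::min_element(pixels.begin(), pixels.end());
        // Reserve a lambda-dependent safety margin (illustrative choice):
        // stronger smoothing can undershoot more.
        float margin = 1.0f + lambda;
        return static_cast<ImageType>(minVal < margin ? margin - minVal : 0.0f);
    }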
@@ -479,76 +477,169 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
 
 
 
-    constexpr int numOfStreams = 3;
-    constexpr int repetitionsPerStream = 3; //
-    APRTimer ttt(true);
-    ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
-    {
-        std::vector<GpuProcessingTask<ImageType>> gpts;
-
-        //std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
-        for (int i = 0; i < numOfStreams; ++i) {
-            gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-        }
+    constexpr int numOfStreams = 3; // number of streams to use for parallel processing
+    constexpr int repetitionsPerStream = 15; // number of repetitions per stream to simulate processing of multiple images
+    bool useThreads = true;
 
-    APRTimer t(true);
-    t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+    if (useThreads) {
+        std::cout << "\n!!! USING THREADS !!!\n\n";
+        APRTimer ttt(true);
+        std::cout << ">>>>>>>>>>> START\n";
+        ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
         {
+            APRTimer t(true);
+            std::vector<GpuProcessingTask<ImageType>> gpts;
 
-            APRTimer tt(false);
-            // Create streams and send initial task to do
+            t.start_timer("Creating GPTS");
+            std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
             for (int i = 0; i < numOfStreams; ++i) {
-                // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-                tt.start_timer("SEND");
-                gpts[i].sendDataToGpu();
-                tt.stop_timer();
-                // std::cout << "Send " << i << std::endl;
-                // gpts.back().processOnGpu();
-                // std::cout << "Proc " << i << std::endl;
+                gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
             }
-            // Create streams and send initial task to do
-            for (int i = 0; i < numOfStreams; ++i) {
-                // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
-                tt.start_timer("Process");
-                gpts[i].processOnGpu();
-                tt.stop_timer();
-                // std::cout << "Proc " << i << std::endl;
-            }
-            std::cout << "=========" << std::endl;
-
-            for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
-                int c = i % numOfStreams;
-
-                // get data from previous task
-                // gpts_futures[c].get();
-                auto linearAccessGpu = gpts[c].getDataFromGpu();
-                // std::cout << "Get " << c << std::endl;
-
-                // in theory, we get new data and send them to task
-                if (i < numOfStreams * (repetitionsPerStream - 1)) {
-                    gpts[c].sendDataToGpu();
-                    // std::cout << "Send " << c << std::endl;
-                    gpts[c].processOnGpu();
-                    // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
-                    // std::cout << "Proc " << c << std::endl;
+            t.stop_timer();
+
+            t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+            {
+                APRTimer tt(false);
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+                    tt.start_timer("SEND");
+                    // gpts[i].sendDataToGpu();
+                    // gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Send " << i << std::endl;
+                    // gpts.back().processOnGpu();
+                    // std::cout << "Proc " << i << std::endl;
                 }
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
+                    // tt.start_timer("Process");
+                    // gpts[i].processOnGpu();
+                    // tt.stop_timer();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                std::cout << "=========" << std::endl;
+
+                for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
+                    int c = i % numOfStreams;
 
-                aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+                    // get data from previous task
+                    gpts_futures[c].get();
+                    auto linearAccessGpu = gpts[c].getDataFromGpu();
 
-                // generateDatastructures(aAPR) for linearAcceess for CUDA
-                aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
-                aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
-                aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
-                aAPR.apr_initialized = true;
+                    // in theory, we get new data and send them to task
+                    if (i < numOfStreams * (repetitionsPerStream - 1)) {
+                        // gpts[c].sendDataToGpu();
+                        // std::cout << "Send " << c << std::endl;
+                        // gpts[c].processOnGpu();
+                        gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
+                        // std::cout << "Proc " << c << std::endl;
+                    }
 
-                // std::cout << "CUDA pipeline finished!\n";
+                    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+
+                    // generateDatastructures(aAPR) for linearAcceess for CUDA
+                    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+                    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+                    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+                    aAPR.apr_initialized = true;
+
+                    // std::cout << "CUDA pipeline finished!\n";
+                }
+                // cudaDeviceSynchronize();
+            }
+            auto allT = t.stop_timer();
+            std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+            std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n";
+        }
+        auto allT = ttt.stop_timer();
+        float tpi = allT / (numOfStreams*repetitionsPerStream);
+        std::cout << "Time per image: " << tpi << " seconds\n";
+        std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n";
+        std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n";
+
+
+        std::cout << "<<<<<<<<<<<< STOP\n";
+    }
+    else {
+        APRTimer ttt(true);
+        std::cout << ">>>>>>>>>>> START\n";
+        ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
+        {
+            APRTimer t(true);
+            std::vector<GpuProcessingTask<ImageType>> gpts;
+
+            t.start_timer("Creating GPTS");
+            //std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
+            for (int i = 0; i < numOfStreams; ++i) {
+                gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+            }
+            // cudaDeviceSynchronize();
+            t.stop_timer();
+
+            t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+            {
+
+                APRTimer tt(false);
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+                    tt.start_timer("SEND");
+                    gpts[i].sendDataToGpu();
+                    gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Send " << i << std::endl;
+                    // gpts.back().processOnGpu();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
+                    tt.start_timer("Process");
+                    // gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                std::cout << "=========" << std::endl;
+
+                for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
+                    int c = i % numOfStreams;
+
+                    // get data from previous task
+                    // gpts_futures[c].get();
+                    auto linearAccessGpu = gpts[c].getDataFromGpu();
+                    // std::cout << "Get " << c << std::endl;
+
+                    // in theory, we get new data and send them to task
+                    if (i < numOfStreams * (repetitionsPerStream - 1)) {
+                        gpts[c].sendDataToGpu();
+                        // std::cout << "Send " << c << std::endl;
+                        gpts[c].processOnGpu();
+                        // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
+                        // std::cout << "Proc " << c << std::endl;
+                    }
+
+                    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+
+                    // generateDatastructures(aAPR) for linearAcceess for CUDA
+                    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+                    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+                    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+                    aAPR.apr_initialized = true;
+
+                    // std::cout << "CUDA pipeline finished!\n";
+                }
+                // cudaDeviceSynchronize();
             }
+            auto allT = t.stop_timer();
+            std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
         }
-        auto allT = t.stop_timer();
+        auto allT = ttt.stop_timer();
         std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+        std::cout << "<<<<<<<<<<<< STOP\n";
     }
-    auto allT = ttt.stop_timer();
-    std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+
 
     return false; //TODO: change it back to true
 }
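
The threaded branch above keeps numOfStreams GPU tasks in flight and refills each slot as soon as its result is collected. The same round-robin pattern, reduced to a self-contained sketch with a stand-in task type (Task here is illustrative, not the library's GpuProcessingTask):

    #include <future>
    #include <iostream>
    #include <vector>

    // Stand-in for GpuProcessingTask: an object with an expensive processing
    // step whose result is collected later.
    struct Task {
        int id;
        void process() { /* e.g. H2D copy + kernels + D2H copy on a dedicated stream */ }
        int result() const { return id; }
    };

    int main() {
        constexpr int numOfStreams = 3;        // jobs kept in flight concurrently
        constexpr int repetitionsPerStream = 15;

        std::vector<Task> tasks;
        tasks.reserve(numOfStreams);           // keep addresses stable for std::async
        std::vector<std::future<void>> futures(numOfStreams);

        // Create all tasks first, then launch one asynchronous job per slot.
        for (int i = 0; i < numOfStreams; ++i) tasks.push_back(Task{i});
        for (int i = 0; i < numOfStreams; ++i)
            futures[i] = std::async(std::launch::async, &Task::process, &tasks[i]);

        // Round-robin: join the oldest job in slot c, use its result,
        // and immediately refill the slot with the next job.
        for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
            int c = i % numOfStreams;
            futures[c].get();
            std::cout << "done: " << tasks[c].result() << "\n";
            if (i < numOfStreams * (repetitionsPerStream - 1))
                futures[c] = std::async(std::launch::async, &Task::process, &tasks[c]);
        }
    }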
@@ -624,8 +715,8 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
 #ifndef APR_USE_CUDA
     return get_apr_cpu(aAPR, input_image);
 #else
-    // return get_apr_cuda(aAPR, input_image);
-    return get_apr_cuda_streams(aAPR, input_image);
+    return get_apr_cuda(aAPR, input_image);
+    // return get_apr_cuda_streams(aAPR, input_image);
 #endif
 }

src/algorithm/APRParameters.hpp

Lines changed: 6 additions & 1 deletion
@@ -55,11 +55,16 @@ class APRParameters {
         os << "rel_error=" << obj.rel_error << "\n";
         os << "sigma_th=" << obj.sigma_th << "\n";
         os << "sigma_th_max=" << obj.sigma_th_max << "\n";
+        os << "grad_th=" << obj.grad_th << "\n";
         os << "auto_parameters=" << (obj.auto_parameters ? "true" : "false") << "\n";
+        os << "reflect_bc_lis=" << (obj.reflect_bc_lis ? "true" : "false") << "\n";
+        os << "check_input=" << (obj.check_input ? "true" : "false") << "\n";
+        os << "swap_dimensions=" << (obj.swap_dimensions ? "true" : "false") << "\n";
         os << "neighborhood_optimization=" << (obj.neighborhood_optimization ? "true" : "false") << "\n";
         os << "constant_intensity_scale=" << (obj.constant_intensity_scale ? "true" : "false") << "\n";
         os << "output_steps=" << (obj.output_steps ? "true" : "false") << "\n";
-
+        os << "dx/dy/dz=" << obj.dx << "/" << obj.dy << "/" << obj.dz << "\n";
+        os << "psfx/psfy/psfz=" << obj.psfx << "/" << obj.psfy << "/" << obj.psfz << "\n";
         return os;
     }
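
With these additions, streaming an APRParameters object now also reports grad_th, the three boolean flags, and the voxel/PSF sizes. A minimal usage sketch (the include path and example values are assumptions, not taken from the repository):

    #include <iostream>
    #include "algorithm/APRParameters.hpp"

    int main() {
        APRParameters par;
        par.grad_th = 10;       // example value
        par.check_input = true; // example value
        std::cout << par;       // prints every field listed above, one per line
    }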

src/algorithm/ComputeGradient.hpp

Lines changed: 1 addition & 1 deletion
@@ -364,7 +364,7 @@ round(float val, size_t &errCount) {
 
     if(val < std::numeric_limits<T>::min() || val > std::numeric_limits<T>::max()) {
         errCount++;
-        std::cout << val << " " << (float)std::numeric_limits<T>::min() << " " << (float)std::numeric_limits<T>::max() << std::endl;
+        // std::cout << val << " " << (float)std::numeric_limits<T>::min() << " " << (float)std::numeric_limits<T>::max() << std::endl;
     }
     return val;
 }
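
The per-pixel diagnostic print is silenced while the error counter is kept, so overflows are still tallied without flooding stdout. A self-contained sketch of the same check (the name checked_round is illustrative, not the library's):

    #include <cstddef>
    #include <limits>

    // Count values that would overflow the target type T instead of printing each one.
    // Note: for integral T, std::numeric_limits<T>::min() is the lowest value,
    // which is what this range check relies on.
    template <typename T>
    float checked_round(float val, std::size_t &errCount) {
        if (val < std::numeric_limits<T>::min() || val > std::numeric_limits<T>::max()) {
            ++errCount;  // tally silently; a debug build could re-enable the print
        }
        return val;
    }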
