
Commit bf4cdb0

Little bit of cleanup of CUDA in APRConverter; added move assignment operator to VectorData
1 parent b01df31 · commit bf4cdb0

File tree

5 files changed: +65 -173 lines

src/algorithm/APRConverter.hpp

Lines changed: 49 additions & 161 deletions
@@ -422,7 +422,6 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
422422

423423
GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
424424
// std::cout << "after gpt \n";
425-
gpt.sendDataToGpu();
426425
gpt.processOnGpu();
427426
auto linearAccessGpu = gpt.getDataFromGpu();
428427

@@ -453,198 +452,87 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
  */
 template<typename ImageType> template<typename T>
 inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T>& input_image) {
-
+    // Initialize APR and memory for the pipeline
     if (!initPipelineAPR(aAPR, input_image)) return false;
-
     initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
-
-    computation_timer.start_timer("init_mem");
-    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
+    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full size copy of the image)

     /////////////////////////////////
     /// Pipeline
-    ////////////////////////
+    /////////////////////////////////
+
     // offset image by factor (this is required if there are zero areas in the background with
     // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
     // Warning both of these could result in over-flow!
-
     if (std::is_floating_point<ImageType>::value) {
         image_temp.copyFromMesh(input_image);
     } else {
         bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
         image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
     }

-
-
+    // Run input on the GPU streams
     constexpr int numOfStreams = 3; // number of streams to use for parallel processing
-    constexpr int repetitionsPerStream = 15; // number of repetitions per stream to simulate processing of multiple images
-    bool useThreads = true;
-
-    if (useThreads) {
-        std::cout << "\n!!! USING THREADS !!!\n\n";
-        APRTimer ttt(true);
-        std::cout << ">>>>>>>>>>> START\n";
-        ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
-        {
-            APRTimer t(true);
-            std::vector<GpuProcessingTask<ImageType>> gpts;
+    constexpr int repetitionsPerStream = 3; // number of repetitions per stream to simulate processing of multiple images

-            t.start_timer("Creating GPTS");
-            std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
-            for (int i = 0; i < numOfStreams; ++i) {
-                gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-            }
-            t.stop_timer();
-
-            t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
-            {
-                APRTimer tt(false);
-                // Create streams and send initial task to do
-                for (int i = 0; i < numOfStreams; ++i) {
-                    // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-                    tt.start_timer("SEND");
-                    // gpts[i].sendDataToGpu();
-                    // gpts[i].processOnGpu();
-                    tt.stop_timer();
-                    // std::cout << "Send " << i << std::endl;
-                    // gpts.back().processOnGpu();
-                    // std::cout << "Proc " << i << std::endl;
-                }
-                // Create streams and send initial task to do
-                for (int i = 0; i < numOfStreams; ++i) {
-                    gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
-                    // tt.start_timer("Process");
-                    // gpts[i].processOnGpu();
-                    // tt.stop_timer();
-                    // std::cout << "Proc " << i << std::endl;
-                }
-                std::cout << "=========" << std::endl;
+    APRTimer ttt(true);

-                for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
-                    int c = i % numOfStreams;
+    ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
+    {
+        APRTimer t(true);
+        std::vector<GpuProcessingTask<ImageType>> gpts;

-                    // get data from previous task
-                    gpts_futures[c].get();
-                    auto linearAccessGpu = gpts[c].getDataFromGpu();
+        t.start_timer("Creating GPTS");
+        std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
+        for (int i = 0; i < numOfStreams; ++i) {
+            gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+        }
+        t.stop_timer();

-                    // in theory, we get new data and send them to task
-                    if (i < numOfStreams * (repetitionsPerStream - 1)) {
-                        // gpts[c].sendDataToGpu();
-                        // std::cout << "Send " << c << std::endl;
-                        // gpts[c].processOnGpu();
-                        gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
-                        // std::cout << "Proc " << c << std::endl;
-                    }
+        t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+        {
+            APRTimer tt(false);
+            // Run processOnGpu() asynchronously - it will handle transfering data from CPU to GPU and run whole pipeline
+            for (int i = 0; i < numOfStreams; ++i) {
+                gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
+            }

-                    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+            for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
+                int c = i % numOfStreams;

-                    // generateDatastructures(aAPR) for linearAcceess for CUDA
-                    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
-                    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
-                    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
-                    aAPR.apr_initialized = true;
+                // Get data from GpuProcessingTask - get() will block until the task is finished
+                gpts_futures[c].get();
+                auto linearAccessGpu = gpts[c].getDataFromGpu();

-                    // std::cout << "CUDA pipeline finished!\n";
+                // in theory, we get new data and send them to task
+                if (i < numOfStreams * (repetitionsPerStream - 1)) {
+                    gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
                 }
-                // cudaDeviceSynchronize();
-            }
-            auto allT = t.stop_timer();
-            std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
-            std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n";
-        }
-        auto allT = ttt.stop_timer();
-        float tpi = allT / (numOfStreams*repetitionsPerStream);
-        std::cout << "Time per image: " << tpi << " seconds\n";
-        std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n";
-        std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n";
-

-        std::cout << "<<<<<<<<<<<< STOP\n";
-    }
-    else {
-        APRTimer ttt(true);
-        std::cout << ">>>>>>>>>>> START\n";
-        ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
-        {
-            APRTimer t(true);
-            std::vector<GpuProcessingTask<ImageType>> gpts;
+                // Fill APR data structure with data from GPU
+                aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+                aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec);
+                aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec);
+                aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec);

-            t.start_timer("Creating GPTS");
-            //std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
-            for (int i = 0; i < numOfStreams; ++i) {
-                gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-            }
-            // cudaDeviceSynchronize();
-            t.stop_timer();
-
-            t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
-            {
-
-                APRTimer tt(false);
-                // Create streams and send initial task to do
-                for (int i = 0; i < numOfStreams; ++i) {
-                    // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-                    tt.start_timer("SEND");
-                    gpts[i].sendDataToGpu();
-                    gpts[i].processOnGpu();
-                    tt.stop_timer();
-                    // std::cout << "Send " << i << std::endl;
-                    // gpts.back().processOnGpu();
-                    // std::cout << "Proc " << i << std::endl;
-                }
-                // Create streams and send initial task to do
-                for (int i = 0; i < numOfStreams; ++i) {
-                    // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
-                    tt.start_timer("Process");
-                    // gpts[i].processOnGpu();
-                    tt.stop_timer();
-                    // std::cout << "Proc " << i << std::endl;
-                }
-                std::cout << "=========" << std::endl;
-
-                for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
-                    int c = i % numOfStreams;
-
-                    // get data from previous task
-                    // gpts_futures[c].get();
-                    auto linearAccessGpu = gpts[c].getDataFromGpu();
-                    // std::cout << "Get " << c << std::endl;
-
-                    // in theory, we get new data and send them to task
-                    if (i < numOfStreams * (repetitionsPerStream - 1)) {
-                        gpts[c].sendDataToGpu();
-                        // std::cout << "Send " << c << std::endl;
-                        gpts[c].processOnGpu();
-                        // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
-                        // std::cout << "Proc " << c << std::endl;
-                    }
-
-                    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
-
-                    // generateDatastructures(aAPR) for linearAcceess for CUDA
-                    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
-                    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
-                    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
-                    aAPR.apr_initialized = true;
-
-                    // std::cout << "CUDA pipeline finished!\n";
-                }
-                // cudaDeviceSynchronize();
+                aAPR.apr_initialized = true;
             }
-            auto allT = t.stop_timer();
-            std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
         }
-        auto allT = ttt.stop_timer();
+        auto allT = t.stop_timer();
         std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
-        std::cout << "<<<<<<<<<<<< STOP\n";
+        std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n";
     }
+    auto allT = ttt.stop_timer();
+    float tpi = allT / (numOfStreams*repetitionsPerStream);
+    std::cout << "Time per image: " << tpi << " seconds\n";
+    std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n";
+    std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n";

-
-    return false; //TODO: change it back to true
+    return true;
 }
 #endif

+
 /**
  * Implementation of pipeline for CPU
  *
@@ -715,8 +603,8 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
 #ifndef APR_USE_CUDA
     return get_apr_cpu(aAPR, input_image);
 #else
-    return get_apr_cuda(aAPR, input_image);
-    // return get_apr_cuda_streams(aAPR, input_image);
+    // return get_apr_cuda(aAPR, input_image);
+    return get_apr_cuda_streams(aAPR, input_image);
 #endif
 }

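Note on the pattern above: the rewritten get_apr_cuda_streams boils down to a round-robin pipeline — one GpuProcessingTask per stream, each launched through std::async, with future.get() used to wait for a slot before its result is read and the slot is refilled. Below is a minimal, self-contained sketch of that pattern only; Task, process() and collect() are hypothetical stand-ins for GpuProcessingTask, processOnGpu() and getDataFromGpu(), not part of the library.

    #include <future>
    #include <iostream>
    #include <vector>

    // Hypothetical stand-in for GpuProcessingTask: process() would do the
    // host-to-device transfer plus the kernels, collect() returns the result.
    struct Task {
        int id;
        void process() { /* upload + compute would happen here */ }
        int collect() const { return id; }
    };

    int main() {
        constexpr int numOfStreams = 3;         // concurrent slots (one per CUDA stream)
        constexpr int repetitionsPerStream = 3; // simulated images per slot

        std::vector<Task> tasks;
        std::vector<std::future<void>> futures(numOfStreams);
        for (int i = 0; i < numOfStreams; ++i) tasks.push_back(Task{i});

        // Launch the first wave asynchronously.
        for (int i = 0; i < numOfStreams; ++i)
            futures[i] = std::async(std::launch::async, &Task::process, &tasks[i]);

        // Round-robin over the slots: wait, read the result, then refill the slot.
        for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
            const int c = i % numOfStreams;
            futures[c].get();                                  // blocks until slot c is done
            std::cout << "finished task " << tasks[c].collect() << "\n";
            if (i < numOfStreams * (repetitionsPerStream - 1)) // more work for this slot
                futures[c] = std::async(std::launch::async, &Task::process, &tasks[c]);
        }
        return 0;
    }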
src/algorithm/ComputeGradientCuda.cu

Lines changed: 1 addition & 9 deletions
@@ -406,7 +406,7 @@ public:
         splineCudaX = cudax.first;
         splineCudaY = cuday.first;
         splineCudaZ = cudaz.first;
-        std::cout << "\n=============== GpuProcessingTaskImpl ===================" << iStream << "\n\n";
+        // std::cout << "\n=============== GpuProcessingTaskImpl ===================" << iStream << "\n\n";
         // std::cout << iCpuImage << std::endl;
         // std::cout << iCpuLevels << std::endl;

@@ -437,11 +437,6 @@ public:
         isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream);
     }

-    void sendDataToGpu() {
-        // sends data in processOnGpu()
-        // in multi-stream implementation it is done in threads so is not blocking current operations.
-    }
-
     LinearAccessCudaStructs getDataFromGpu() {
         return std::move(lacs);
     }
@@ -508,9 +503,6 @@ GpuProcessingTask<ImgType>::~GpuProcessingTask() { }
 template <typename ImgType>
 GpuProcessingTask<ImgType>::GpuProcessingTask(GpuProcessingTask&&) = default;

-template <typename ImgType>
-void GpuProcessingTask<ImgType>::sendDataToGpu() {impl->sendDataToGpu();}
-
 template <typename ImgType>
 LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() {return impl->getDataFromGpu();}


src/algorithm/ComputeGradientCuda.hpp

Lines changed: 0 additions & 1 deletion
@@ -46,7 +46,6 @@ class GpuProcessingTask {
     ~GpuProcessingTask();
     GpuProcessingTask(GpuProcessingTask&&);

-    void sendDataToGpu();
     LinearAccessCudaStructs getDataFromGpu();
     void processOnGpu();
 };

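With sendDataToGpu() removed from this interface, a single GpuProcessingTask needs only the three remaining calls; the .cu change above notes that the host-to-device transfer now happens inside processOnGpu(). A short usage sketch, with constructor arguments as used in APRConverter.hpp and the test below:

    // Remaining single-task call sequence after this commit.
    GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
    gpt.processOnGpu();                          // host-to-device transfer + whole GPU pipeline
    auto linearAccessGpu = gpt.getDataFromGpu(); // returns the LinearAccessCudaStructs result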
src/data_structures/Mesh/PixelData.hpp

Lines changed: 14 additions & 0 deletions
@@ -293,6 +293,20 @@ public :
 #endif
     }

+    /**
+     * Move assignment operator
+     * @param aObj
+     */
+    VectorData& operator=(VectorData &&aObj) {
+        usePinnedMemory = aObj.usePinnedMemory;
+        vecMemory.swap(aObj.vecMemory);
+        vec = std::move(aObj.vec);
+#ifdef APR_USE_CUDA
+        vecMemoryPinned = std::move(aObj.vecMemoryPinned);
+#endif
+        return *this;
+    }
+
     /**
      * Apply unary operator to each element in parallel, writing the result to VectorData 'output'.
      * @tparam S

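This move assignment is what lets APRConverter.hpp above replace the element-wise VectorData::copy() calls with aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec): the GPU result buffers are handed over instead of duplicated, and the moved-from vectors are not reused afterwards. A self-contained illustration of the same idea, using std::vector as a stand-in for VectorData:

    #include <cassert>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<int> gpuResult(1 << 20, 42); // pretend this was filled on the GPU
        std::vector<int> aprStorage;

        aprStorage = std::move(gpuResult);       // buffer handover, no per-element copy

        assert(aprStorage.size() == (1u << 20));
        // gpuResult is now in a valid but unspecified (typically empty) state
        // and is not read again.
        return 0;
    }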
test/FullPipelineCudaTest.cpp

Lines changed: 1 addition & 2 deletions
@@ -340,7 +340,6 @@ namespace {
         // Calculate pipeline on GPU
         timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
         GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, bspline_offset, maxLevel);
-        gpt.sendDataToGpu();
         gpt.processOnGpu();
         auto linearAccessGpu = gpt.getDataFromGpu();
         giGpu.total_number_particles = linearAccessGpu.y_vec.size();
@@ -359,7 +358,7 @@ namespace {
     }


-    TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GPU_via_APRConverter) {
+    TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GPU_via_APRConverter) {
         APRTimer timer(true);

         // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors

0 commit comments
