Skip to content

Commit 1f27876

Browse files
committed
Initial impl. of CUDA multistreams, it takes many images but STILL only one APR object - use it only for speed for now
1 parent 0c702f3 commit 1f27876

File tree

4 files changed

+134
-10
lines changed

4 files changed

+134
-10
lines changed

src/algorithm/APRConverter.hpp

Lines changed: 120 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ class APRConverter {
7777
bool get_apr_cuda(APR &aAPR, PixelData<T> &input_image);
7878
template <typename T>
7979
bool get_apr_cuda_streams(APR &aAPR, PixelData<T> &input_image);
80+
template <typename T>
81+
bool get_apr_cuda_multistreams(APR &aAPR, const std::vector<PixelData<T> *> &input_images, int numOfStreams = 3);
8082
#endif
8183

8284
bool verbose = true;
@@ -420,8 +422,9 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
420422
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
421423
}
422424

423-
GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
425+
GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, aAPR.level_max());
424426
// std::cout << "after gpt \n";
427+
gpt.setBsplineOffset(bspline_offset);
425428
gpt.processOnGpu();
426429
auto linearAccessGpu = gpt.getDataFromGpu();
427430

@@ -442,6 +445,116 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
442445
#endif
443446

444447
#ifdef APR_USE_CUDA
448+
/**
449+
* Implementation of pipeline for GPU/CUDA and multiple streams
450+
 * NOTE: Currently only one image is processed multiple times, just to get an idea of how fast it can be.
451+
* Finally, it should be able to process incoming stream of data (sequence of images).
452+
*
453+
* @param aAPR - the APR data structure
454+
* @param input_images - input images
455+
* @param numOfStreams - number of streams to use for parallel processing on GPU
456+
*/
457+
template<typename ImageType> template<typename T>
458+
inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const std::vector<PixelData<T>*> &input_images, int numOfStreams) {
459+
int numOfImages = input_images.size();
460+
if (numOfImages == 0) {
461+
std::cerr << "No input images provided for APR conversion." << std::endl;
462+
return false;
463+
}
464+
465+
// Reduce number of streams to number of images if there are less images than streams
466+
if (numOfImages < numOfStreams) numOfStreams = numOfImages;
467+
468+
// Use first image to initialize the APR - all other images should have the same dimensions
469+
auto input_image = input_images[0];
470+
471+
// Initialize APR and memory for the pipeline
472+
if (!initPipelineAPR(aAPR, *input_image)) return false;
473+
initPipelineMemory(input_image->y_num, input_image->x_num, input_image->z_num);
474+
475+
// Create a temporary image for each stream
476+
std::vector<PixelData<ImageType>> tempImages;
477+
std::cout << "allocating PixelData for " << numOfStreams << " streams" << std::endl;
478+
for (int i = 0; i < numOfStreams; ++i) {
479+
tempImages.emplace_back(PixelData<T>(*input_image, false /* don't copy */, true /* pinned memory */));
480+
}
481+
482+
/////////////////////////////////
483+
/// Pipeline
484+
/////////////////////////////////
485+
APRTimer t(true);
486+
487+
// Create GpuProcessingTask for each stream
488+
std::vector<GpuProcessingTask<ImageType>> gpts;
489+
t.start_timer("Creating GPTS");
490+
std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
491+
for (int i = 0; i < numOfStreams; ++i) {
492+
gpts.emplace_back(GpuProcessingTask<ImageType>(tempImages[i], local_scale_temp, par, aAPR.level_max()));
493+
}
494+
t.stop_timer();
495+
496+
497+
t.start_timer("GPU processing...");
498+
// Saturate all the streams with first images
499+
for (int i = 0; i < numOfStreams; ++i) {
500+
501+
// offset image by factor (this is required if there are zero areas in the background with
502+
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
503+
// Warning both of these could result in over-flow!
504+
if (std::is_floating_point<ImageType>::value) {
505+
tempImages[i].copyFromMesh(*input_images[i]);
506+
} else {
507+
bspline_offset = compute_bspline_offset<ImageType>(*input_images[i], par.lambda);
508+
tempImages[i].copyFromMeshWithUnaryOp(*input_images[i], [=](const auto &a) { return (a + bspline_offset); });
509+
}
510+
std::cout << "Processing image " << i << " on stream " << i << std::endl;
511+
gpts[i].setBsplineOffset(bspline_offset);
512+
gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
513+
}
514+
515+
516+
// Main loop - get results from GPU and send new images to the streams (if any left)
517+
for (int s = 0; s < numOfImages; ++s) {
518+
int streamNum = s % numOfStreams;
519+
520+
// Get data from GpuProcessingTask - get() will block until the task is finished
521+
gpts_futures[streamNum].get();
522+
auto linearAccessGpu = gpts[streamNum].getDataFromGpu();
523+
524+
// Send next images to the stream if there are any left
525+
// We have 'numOfImages - numOfStreams' left to process after saturating the streams with first images
526+
if (s < numOfImages - numOfStreams) {
527+
int imageToProcess = s + numOfStreams;
528+
if (std::is_floating_point<ImageType>::value) {
529+
tempImages[streamNum].copyFromMesh(*input_images[imageToProcess]);
530+
} else {
531+
bspline_offset = compute_bspline_offset<ImageType>(*input_images[imageToProcess], par.lambda);
532+
tempImages[streamNum].copyFromMeshWithUnaryOp(*input_images[imageToProcess], [=](const auto &a) { return (a + bspline_offset); });
533+
}
534+
std::cout << "Processing image " << imageToProcess << " on stream " << streamNum << std::endl;
535+
gpts[streamNum].setBsplineOffset(bspline_offset);
536+
gpts_futures[streamNum] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[streamNum]);
537+
}
538+
539+
// Fill APR data structure with data from GPU
540+
aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
541+
aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec);
542+
aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec);
543+
aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec);
544+
545+
aAPR.apr_initialized = true;
546+
}
547+
548+
auto allT = t.stop_timer();
549+
float tpi = allT / (numOfImages);
550+
std::cout << "Num of images processed: " << numOfImages << "\n";
551+
std::cout << "Time per image: " << tpi << " seconds\n";
552+
std::cout << "Image size: " << (input_images[0]->size() / 1024 / 1024) << " MB\n";
553+
std::cout << "Bandwidth:" << (input_images[0]->size() / tpi / 1024 / 1024) << " MB/s\n";
554+
std::cout << "CUDA multistream pipeline finished!\n";
555+
return true;
556+
}
557+
445558
/**
446559
* Implementation of pipeline for GPU/CUDA and multiple streams
447560
 * NOTE: Currently only one image is processed multiple times, just to get an idea of how fast it can be.
@@ -485,7 +598,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
485598
t.start_timer("Creating GPTS");
486599
std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
487600
for (int i = 0; i < numOfStreams; ++i) {
488-
gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
601+
gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, aAPR.level_max()));
489602
}
490603
t.stop_timer();
491604

@@ -494,6 +607,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
494607
APRTimer tt(false);
495608
// Run processOnGpu() asynchronously - it will handle transfering data from CPU to GPU and run whole pipeline
496609
for (int i = 0; i < numOfStreams; ++i) {
610+
gpts[i].setBsplineOffset(bspline_offset);
497611
gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
498612
}
499613

@@ -506,6 +620,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
506620

507621
// in theory, we get new data and send them to task
508622
if (i < numOfStreams * (repetitionsPerStream - 1)) {
623+
gpts[c].setBsplineOffset(bspline_offset);
509624
gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
510625
}
511626

@@ -604,7 +719,9 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
604719
return get_apr_cpu(aAPR, input_image);
605720
#else
606721
// return get_apr_cuda(aAPR, input_image);
607-
return get_apr_cuda_streams(aAPR, input_image);
722+
// return get_apr_cuda_streams(aAPR, input_image);
723+
std::vector<PixelData<T> *> input_images(3*66, &input_image);
724+
return get_apr_cuda_multistreams(aAPR, input_images, 3);
608725
#endif
609726
}
610727

src/algorithm/ComputeGradientCuda.cu

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
326326
PixelData<float> &iCpuLevels;
327327
const APRParameters &iParameters;
328328
GenInfo iAprInfo;
329-
float iBsplineOffset;
329+
float iBsplineOffset = 0;
330330
int iMaxLevel;
331331

332332
// cuda stuff - memory and stream to be used
@@ -377,7 +377,7 @@ public:
377377
// TODO: Remove need for passing 'levels' to GpuProcessingTask
378378
// It was used during development to control internal computation like filters, gradient, levels etc. but
379379
// once all is done there is no need for it anymore
380-
GpuProcessingTaskImpl(const PixelData<ImgType> &inputImage, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel) :
380+
GpuProcessingTaskImpl(const PixelData<ImgType> &inputImage, PixelData<float> &levels, const APRParameters &parameters, int maxLevel) :
381381
iCpuImage(inputImage),
382382
iCpuLevels(levels),
383383
iStream(cudaStream.get()),
@@ -387,7 +387,6 @@ public:
387387
local_scale_temp2 (levels, iStream),
388388
iParameters(parameters),
389389
iAprInfo(iCpuImage.getDimension()),
390-
iBsplineOffset(bspline_offset),
391390
iMaxLevel(maxLevel),
392391
cudax(transferSpline(prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance), iStream)),
393392
cuday(transferSpline(prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance), iStream)),
@@ -490,12 +489,14 @@ public:
490489
lacs.y_vec.copy(y_vec);
491490
}
492491

492+
void setBsplineOffset(float offset) {iBsplineOffset = offset;}
493+
493494
~GpuProcessingTaskImpl() {}
494495
};
495496

496497
template <typename ImgType>
497-
GpuProcessingTask<ImgType>::GpuProcessingTask(const PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel)
498-
: impl{new GpuProcessingTaskImpl<ImgType>(image, levels, parameters, bspline_offset, maxLevel)} { }
498+
GpuProcessingTask<ImgType>::GpuProcessingTask(const PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, int maxLevel)
499+
: impl{new GpuProcessingTaskImpl<ImgType>(image, levels, parameters, maxLevel)} { }
499500

500501
template <typename ImgType>
501502
GpuProcessingTask<ImgType>::~GpuProcessingTask() { }
@@ -509,6 +510,9 @@ LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() {return imp
509510
template <typename ImgType>
510511
void GpuProcessingTask<ImgType>::processOnGpu() {impl->processOnGpu();}
511512

513+
template <typename ImgType>
514+
void GpuProcessingTask<ImgType>::setBsplineOffset(float offset) {impl->setBsplineOffset(offset);}
515+
512516
// explicit instantiation of handled types
513517
template class GpuProcessingTask<uint8_t>;
514518
template class GpuProcessingTask<int>;

src/algorithm/ComputeGradientCuda.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,14 @@ class GpuProcessingTask {
4242

4343
public:
4444

45-
GpuProcessingTask(const PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel);
45+
GpuProcessingTask(const PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, int maxLevel);
4646
~GpuProcessingTask();
4747
GpuProcessingTask(GpuProcessingTask&&);
4848

4949
LinearAccessCudaStructs getDataFromGpu();
5050
void processOnGpu();
51+
52+
void setBsplineOffset(float bspline_offset);
5153
};
5254

5355
#endif //LIBAPR_COMPUTEGRADIENTCUDA_HPP

test/FullPipelineCudaTest.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,8 @@ namespace {
339339

340340
// Calculate pipeline on GPU
341341
timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
342-
GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, bspline_offset, maxLevel);
342+
GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, maxLevel);
343+
gpt.setBsplineOffset(bspline_offset);
343344
gpt.processOnGpu();
344345
auto linearAccessGpu = gpt.getDataFromGpu();
345346
giGpu.total_number_particles = linearAccessGpu.y_vec.size();

0 commit comments

Comments
 (0)