@@ -77,6 +77,8 @@ class APRConverter {
7777 bool get_apr_cuda (APR &aAPR, PixelData<T> &input_image);
7878 template <typename T>
7979 bool get_apr_cuda_streams (APR &aAPR, PixelData<T> &input_image);
80+ template <typename T>
81+ bool get_apr_cuda_multistreams (APR &aAPR, const std::vector<PixelData<T> *> &input_images, int numOfStreams = 3 );
8082#endif
8183
8284 bool verbose = true ;
@@ -420,8 +422,9 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
420422 image_temp.copyFromMeshWithUnaryOp (input_image, [=](const auto &a) { return (a + bspline_offset); });
421423 }
422424
423- GpuProcessingTask<ImageType> gpt (image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max ());
425+ GpuProcessingTask<ImageType> gpt (image_temp, local_scale_temp, par, aAPR.level_max ());
424426 // std::cout << "after gpt \n";
427+ gpt.setBsplineOffset (bspline_offset);
425428 gpt.processOnGpu ();
426429 auto linearAccessGpu = gpt.getDataFromGpu ();
427430
@@ -442,6 +445,116 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
442445#endif
443446
444447#ifdef APR_USE_CUDA
448+ /* *
449+ * Implementation of pipeline for GPU/CUDA and multiple streams
450+ * NOTE: Currently only one image is processed multiple times just get an idea how fast it can be.
451+ * Finally, it should be able to process incoming stream of data (sequence of images).
452+ *
453+ * @param aAPR - the APR data structure
454+ * @param input_images - input images
455+ * @param numOfStreams - number of streams to use for parallel processing on GPU
456+ */
457+ template <typename ImageType> template <typename T>
458+ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const std::vector<PixelData<T>*> &input_images, int numOfStreams) {
459+ int numOfImages = input_images.size ();
460+ if (numOfImages == 0 ) {
461+ std::cerr << " No input images provided for APR conversion." << std::endl;
462+ return false ;
463+ }
464+
465+ // Reduce number of streams to number of images if there are less images than streams
466+ if (numOfImages < numOfStreams) numOfStreams = numOfImages;
467+
468+ // Use first image to initialize the APR - all other images should have the same dimensions
469+ auto input_image = input_images[0 ];
470+
471+ // Initialize APR and memory for the pipeline
472+ if (!initPipelineAPR (aAPR, *input_image)) return false ;
473+ initPipelineMemory (input_image->y_num , input_image->x_num , input_image->z_num );
474+
475+ // Create a temporary image for each stream
476+ std::vector<PixelData<ImageType>> tempImages;
477+ std::cout << " allocating PixelData for " << numOfStreams << " streams" << std::endl;
478+ for (int i = 0 ; i < numOfStreams; ++i) {
479+ tempImages.emplace_back (PixelData<T>(*input_image, false /* don't copy */ , true /* pinned memory */ ));
480+ }
481+
482+ // ///////////////////////////////
483+ // / Pipeline
484+ // ///////////////////////////////
485+ APRTimer t (true );
486+
487+ // Create GpuProcessingTask for each stream
488+ std::vector<GpuProcessingTask<ImageType>> gpts;
489+ t.start_timer (" Creating GPTS" );
490+ std::vector<std::future<void >> gpts_futures; gpts_futures.resize (numOfStreams);
491+ for (int i = 0 ; i < numOfStreams; ++i) {
492+ gpts.emplace_back (GpuProcessingTask<ImageType>(tempImages[i], local_scale_temp, par, aAPR.level_max ()));
493+ }
494+ t.stop_timer ();
495+
496+
497+ t.start_timer (" GPU processing..." );
498+ // Saturate all the streams with first images
499+ for (int i = 0 ; i < numOfStreams; ++i) {
500+
501+ // offset image by factor (this is required if there are zero areas in the background with
502+ // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
503+ // Warning both of these could result in over-flow!
504+ if (std::is_floating_point<ImageType>::value) {
505+ tempImages[i].copyFromMesh (*input_images[i]);
506+ } else {
507+ bspline_offset = compute_bspline_offset<ImageType>(*input_images[i], par.lambda );
508+ tempImages[i].copyFromMeshWithUnaryOp (*input_images[i], [=](const auto &a) { return (a + bspline_offset); });
509+ }
510+ std::cout << " Processing image " << i << " on stream " << i << std::endl;
511+ gpts[i].setBsplineOffset (bspline_offset);
512+ gpts_futures[i] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
513+ }
514+
515+
516+ // Main loop - get results from GPU and send new images to the streams (if any left)
517+ for (int s = 0 ; s < numOfImages; ++s) {
518+ int streamNum = s % numOfStreams;
519+
520+ // Get data from GpuProcessingTask - get() will block until the task is finished
521+ gpts_futures[streamNum].get ();
522+ auto linearAccessGpu = gpts[streamNum].getDataFromGpu ();
523+
524+ // Send next images to the stream if there are any left
525+ // We have 'numOfImages - numOfStreams' left to process after saturating the streams with first images
526+ if (s < numOfImages - numOfStreams) {
527+ int imageToProcess = s + numOfStreams;
528+ if (std::is_floating_point<ImageType>::value) {
529+ tempImages[streamNum].copyFromMesh (*input_images[imageToProcess]);
530+ } else {
531+ bspline_offset = compute_bspline_offset<ImageType>(*input_images[imageToProcess], par.lambda );
532+ tempImages[streamNum].copyFromMeshWithUnaryOp (*input_images[imageToProcess], [=](const auto &a) { return (a + bspline_offset); });
533+ }
534+ std::cout << " Processing image " << imageToProcess << " on stream " << streamNum << std::endl;
535+ gpts[streamNum].setBsplineOffset (bspline_offset);
536+ gpts_futures[streamNum] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[streamNum]);
537+ }
538+
539+ // Fill APR data structure with data from GPU
540+ aAPR.aprInfo .total_number_particles = linearAccessGpu.y_vec .size ();
541+ aAPR.linearAccess .y_vec = std::move (linearAccessGpu.y_vec );
542+ aAPR.linearAccess .xz_end_vec = std::move (linearAccessGpu.xz_end_vec );
543+ aAPR.linearAccess .level_xz_vec = std::move (linearAccessGpu.level_xz_vec );
544+
545+ aAPR.apr_initialized = true ;
546+ }
547+
548+ auto allT = t.stop_timer ();
549+ float tpi = allT / (numOfImages);
550+ std::cout << " Num of images processed: " << numOfImages << " \n " ;
551+ std::cout << " Time per image: " << tpi << " seconds\n " ;
552+ std::cout << " Image size: " << (input_images[0 ]->size () / 1024 / 1024 ) << " MB\n " ;
553+ std::cout << " Bandwidth:" << (input_images[0 ]->size () / tpi / 1024 / 1024 ) << " MB/s\n " ;
554+ std::cout << " CUDA multistream pipeline finished!\n " ;
555+ return true ;
556+ }
557+
445558/* *
446559 * Implementation of pipeline for GPU/CUDA and multiple streams
447560 * NOTE: Currently only one image is processed multiple times just to get an idea how fast it can be.
@@ -485,7 +598,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
485598 t.start_timer (" Creating GPTS" );
486599 std::vector<std::future<void >> gpts_futures; gpts_futures.resize (numOfStreams);
487600 for (int i = 0 ; i < numOfStreams; ++i) {
488- gpts.emplace_back (GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max ()));
601+ gpts.emplace_back (GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, aAPR.level_max ()));
489602 }
490603 t.stop_timer ();
491604
@@ -494,6 +607,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
494607 APRTimer tt (false );
495608 // Run processOnGpu() asynchronously - it will handle transfering data from CPU to GPU and run whole pipeline
496609 for (int i = 0 ; i < numOfStreams; ++i) {
610+ gpts[i].setBsplineOffset (bspline_offset);
497611 gpts_futures[i] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
498612 }
499613
@@ -506,6 +620,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
506620
507621 // in theory, we get new data and send them to task
508622 if (i < numOfStreams * (repetitionsPerStream - 1 )) {
623+ gpts[c].setBsplineOffset (bspline_offset);
509624 gpts_futures[c] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
510625 }
511626
@@ -604,7 +719,9 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
604719 return get_apr_cpu (aAPR, input_image);
605720#else
606721 // return get_apr_cuda(aAPR, input_image);
607- return get_apr_cuda_streams (aAPR, input_image);
722+ // return get_apr_cuda_streams(aAPR, input_image);
723+ std::vector<PixelData<T> *> input_images (3 *66 , &input_image);
724+ return get_apr_cuda_multistreams (aAPR, input_images, 3 );
608725#endif
609726}
610727
0 commit comments