@@ -76,8 +76,6 @@ class APRConverter {
7676 template <typename T>
7777 bool get_apr_cuda (APR &aAPR, PixelData<T> &input_image);
7878 template <typename T>
79- bool get_apr_cuda_streams (APR &aAPR, PixelData<T> &input_image);
80- template <typename T>
8179 bool get_apr_cuda_multistreams (APR &aAPR, const std::vector<PixelData<T> *> &input_images, int numOfStreams = 3 );
8280#endif
8381
@@ -406,25 +404,13 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
406404 initPipelineMemory (input_image.y_num , input_image.x_num , input_image.z_num );
407405
408406 computation_timer.start_timer (" init_mem" );
409- PixelData<ImageType> image_temp (input_image, false /* don't copy */ , true /* pinned memory */ ); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
407+ PixelData<ImageType> image_temp (input_image, true /* don't copy */ , true /* pinned memory */ ); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
410408
411409 // ///////////////////////////////
412410 // / Pipeline
413411 // //////////////////////
414- // offset image by factor (this is required if there are zero areas in the background with
415- // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
416- // Warning both of these could result in over-flow!
417-
418- if (std::is_floating_point<ImageType>::value) {
419- image_temp.copyFromMesh (input_image);
420- } else {
421- bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda );
422- image_temp.copyFromMeshWithUnaryOp (input_image, [=](const auto &a) { return (a + bspline_offset); });
423- }
424412
425413 GpuProcessingTask<ImageType> gpt (image_temp, local_scale_temp, par, aAPR.level_max ());
426- // std::cout << "after gpt \n";
427- gpt.setBsplineOffset (bspline_offset);
428414 gpt.processOnGpu ();
429415 auto linearAccessGpu = gpt.getDataFromGpu ();
430416
@@ -462,7 +448,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
462448 return false ;
463449 }
464450
465- // Reduce number of streams to number of images if there are less images than streams
451+ // Reduce number of streams to number of images if there are fewer images than streams
466452 if (numOfImages < numOfStreams) numOfStreams = numOfImages;
467453
468454 // Use first image to initialize the APR - all other images should have the same dimensions
@@ -476,7 +462,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
476462 std::vector<PixelData<ImageType>> tempImages;
477463 std::cout << " allocating PixelData for " << numOfStreams << " streams" << std::endl;
478464 for (int i = 0 ; i < numOfStreams; ++i) {
479- tempImages.emplace_back (PixelData<T>(*input_image, false /* don't copy */ , true /* pinned memory */ ));
465+ tempImages.emplace_back (PixelData<T>(*input_image, true /* copy */ , true /* pinned memory */ ));
480466 }
481467
482468 // ///////////////////////////////
@@ -497,22 +483,10 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
497483 t.start_timer (" GPU processing..." );
498484 // Saturate all the streams with first images
499485 for (int i = 0 ; i < numOfStreams; ++i) {
500-
501- // offset image by factor (this is required if there are zero areas in the background with
502- // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
503- // Warning both of these could result in over-flow!
504- if (std::is_floating_point<ImageType>::value) {
505- tempImages[i].copyFromMesh (*input_images[i]);
506- } else {
507- bspline_offset = compute_bspline_offset<ImageType>(*input_images[i], par.lambda );
508- tempImages[i].copyFromMeshWithUnaryOp (*input_images[i], [=](const auto &a) { return (a + bspline_offset); });
509- }
510486 std::cout << " Processing image " << i << " on stream " << i << std::endl;
511- gpts[i].setBsplineOffset (bspline_offset);
512487 gpts_futures[i] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
513488 }
514489
515-
516490 // Main loop - get results from GPU and send new images to the streams (if any left)
517491 for (int s = 0 ; s < numOfImages; ++s) {
518492 int streamNum = s % numOfStreams;
@@ -525,14 +499,8 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
525499 // We have 'numOfImages - numOfStreams' left to process after saturating the streams with first images
526500 if (s < numOfImages - numOfStreams) {
527501 int imageToProcess = s + numOfStreams;
528- if (std::is_floating_point<ImageType>::value) {
529- tempImages[streamNum].copyFromMesh (*input_images[imageToProcess]);
530- } else {
531- bspline_offset = compute_bspline_offset<ImageType>(*input_images[imageToProcess], par.lambda );
532- tempImages[streamNum].copyFromMeshWithUnaryOp (*input_images[imageToProcess], [=](const auto &a) { return (a + bspline_offset); });
533- }
502+ tempImages[streamNum].copyFromMesh (*input_images[imageToProcess]);
534503 std::cout << " Processing image " << imageToProcess << " on stream " << streamNum << std::endl;
535- gpts[streamNum].setBsplineOffset (bspline_offset);
536504 gpts_futures[streamNum] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[streamNum]);
537505 }
538506
@@ -554,97 +522,6 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
554522 std::cout << " CUDA multistream pipeline finished!\n " ;
555523 return true ;
556524}
557-
558- /* *
559- * Implementation of pipeline for GPU/CUDA and multiple streams
560- * NOTE: Currently only one image is processed multiple times just get an idea how fast it can be.
561- * Finally, it should be able to process incoming stream of data (sequence of images).
562- *
563- * @param aAPR - the APR data structure
564- * @param input_image - input image
565- */
566- template <typename ImageType> template <typename T>
567- inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T>& input_image) {
568- // Initialize APR and memory for the pipeline
569- if (!initPipelineAPR (aAPR, input_image)) return false ;
570- initPipelineMemory (input_image.y_num , input_image.x_num , input_image.z_num );
571- PixelData<ImageType> image_temp (input_image, false /* don't copy */ , true /* pinned memory */ ); // global image variable useful for passing between methods, or re-using memory (should be the only full size copy of the image)
572-
573- // ///////////////////////////////
574- // / Pipeline
575- // ///////////////////////////////
576-
577- // offset image by factor (this is required if there are zero areas in the background with
578- // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
579- // Warning both of these could result in over-flow!
580- if (std::is_floating_point<ImageType>::value) {
581- image_temp.copyFromMesh (input_image);
582- } else {
583- bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda );
584- image_temp.copyFromMeshWithUnaryOp (input_image, [=](const auto &a) { return (a + bspline_offset); });
585- }
586-
587- // Run input on the GPU streams
588- constexpr int numOfStreams = 3 ; // number of streams to use for parallel processing
589- constexpr int repetitionsPerStream = 3 ; // number of repetitions per stream to simulate processing of multiple images
590-
591- APRTimer ttt (true );
592-
593- ttt.start_timer (" -----------------------------> Whole GPU pipeline with repetitions and MEMORY" );
594- {
595- APRTimer t (true );
596- std::vector<GpuProcessingTask<ImageType>> gpts;
597-
598- t.start_timer (" Creating GPTS" );
599- std::vector<std::future<void >> gpts_futures; gpts_futures.resize (numOfStreams);
600- for (int i = 0 ; i < numOfStreams; ++i) {
601- gpts.emplace_back (GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, aAPR.level_max ()));
602- }
603- t.stop_timer ();
604-
605- t.start_timer (" -----------------------------> Whole GPU pipeline with repetitions" );
606- {
607- APRTimer tt (false );
608- // Run processOnGpu() asynchronously - it will handle transfering data from CPU to GPU and run whole pipeline
609- for (int i = 0 ; i < numOfStreams; ++i) {
610- gpts[i].setBsplineOffset (bspline_offset);
611- gpts_futures[i] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
612- }
613-
614- for (int i = 0 ; i < numOfStreams * repetitionsPerStream; ++i) {
615- int c = i % numOfStreams;
616-
617- // Get data from GpuProcessingTask - get() will block until the task is finished
618- gpts_futures[c].get ();
619- auto linearAccessGpu = gpts[c].getDataFromGpu ();
620-
621- // in theory, we get new data and send them to task
622- if (i < numOfStreams * (repetitionsPerStream - 1 )) {
623- gpts[c].setBsplineOffset (bspline_offset);
624- gpts_futures[c] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
625- }
626-
627- // Fill APR data structure with data from GPU
628- aAPR.aprInfo .total_number_particles = linearAccessGpu.y_vec .size ();
629- aAPR.linearAccess .y_vec = std::move (linearAccessGpu.y_vec );
630- aAPR.linearAccess .xz_end_vec = std::move (linearAccessGpu.xz_end_vec );
631- aAPR.linearAccess .level_xz_vec = std::move (linearAccessGpu.level_xz_vec );
632-
633- aAPR.apr_initialized = true ;
634- }
635- }
636- auto allT = t.stop_timer ();
637- std::cout << " Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n " ;
638- std::cout << " Bandwidth:" << (input_image.size () / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024 ) << " MB/s\n " ;
639- }
640- auto allT = ttt.stop_timer ();
641- float tpi = allT / (numOfStreams*repetitionsPerStream);
642- std::cout << " Time per image: " << tpi << " seconds\n " ;
643- std::cout << " Image size: " << (input_image.size () / 1024 / 1024 ) << " MB\n " ;
644- std::cout << " Bandwidth:" << (input_image.size () / tpi / 1024 / 1024 ) << " MB/s\n " ;
645-
646- return true ;
647- }
648525#endif
649526
650527
@@ -719,8 +596,7 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
719596 return get_apr_cpu (aAPR, input_image);
720597#else
721598 // return get_apr_cuda(aAPR, input_image);
722- // return get_apr_cuda_streams(aAPR, input_image);
723- std::vector<PixelData<T> *> input_images (3 *66 , &input_image);
599+ std::vector<PixelData<T> *> input_images (1 , &input_image);
724600 return get_apr_cuda_multistreams (aAPR, input_images, 3 );
725601#endif
726602}
0 commit comments