@@ -422,7 +422,6 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
422422
423423 GpuProcessingTask<ImageType> gpt (image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max ());
424424 // std::cout << "after gpt \n";
425- gpt.sendDataToGpu ();
426425 gpt.processOnGpu ();
427426 auto linearAccessGpu = gpt.getDataFromGpu ();
428427
@@ -453,198 +452,87 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
453452 */
454453template <typename ImageType> template <typename T>
455454inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T>& input_image) {
456-
455+ // Initialize APR and memory for the pipeline
457456 if (!initPipelineAPR (aAPR, input_image)) return false ;
458-
459457 initPipelineMemory (input_image.y_num , input_image.x_num , input_image.z_num );
460-
461- computation_timer.start_timer (" init_mem" );
462- PixelData<ImageType> image_temp (input_image, false /* don't copy */ , true /* pinned memory */ ); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
 458+ PixelData<ImageType> image_temp (input_image, false /* don't copy */ , true /* pinned memory */ ); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
463459
464460 // ///////////////////////////////
465461 // / Pipeline
466- // //////////////////////
462+ // ///////////////////////////////
463+
467464 // offset image by factor (this is required if there are zero areas in the background with
468465 // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
469466 // Warning both of these could result in over-flow!
470-
471467 if (std::is_floating_point<ImageType>::value) {
472468 image_temp.copyFromMesh (input_image);
473469 } else {
474470 bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda );
475471 image_temp.copyFromMeshWithUnaryOp (input_image, [=](const auto &a) { return (a + bspline_offset); });
476472 }
477473
478-
479-
474+ // Run input on the GPU streams
480475 constexpr int numOfStreams = 3 ; // number of streams to use for parallel processing
481- constexpr int repetitionsPerStream = 15 ; // number of repetitions per stream to simulate processing of multiple images
482- bool useThreads = true ;
483-
484- if (useThreads) {
485- std::cout << " \n !!! USING THREADS !!!\n\n " ;
486- APRTimer ttt (true );
487- std::cout << " >>>>>>>>>>> START\n " ;
488- ttt.start_timer (" -----------------------------> Whole GPU pipeline with repetitions and MEMORY" );
489- {
490- APRTimer t (true );
491- std::vector<GpuProcessingTask<ImageType>> gpts;
476+ constexpr int repetitionsPerStream = 3 ; // number of repetitions per stream to simulate processing of multiple images
492477
493- t.start_timer (" Creating GPTS" );
494- std::vector<std::future<void >> gpts_futures; gpts_futures.resize (numOfStreams);
495- for (int i = 0 ; i < numOfStreams; ++i) {
496- gpts.emplace_back (GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max ()));
497- }
498- t.stop_timer ();
499-
500- t.start_timer (" -----------------------------> Whole GPU pipeline with repetitions" );
501- {
502- APRTimer tt (false );
503- // Create streams and send initial task to do
504- for (int i = 0 ; i < numOfStreams; ++i) {
505- // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
506- tt.start_timer (" SEND" );
507- // gpts[i].sendDataToGpu();
508- // gpts[i].processOnGpu();
509- tt.stop_timer ();
510- // std::cout << "Send " << i << std::endl;
511- // gpts.back().processOnGpu();
512- // std::cout << "Proc " << i << std::endl;
513- }
514- // Create streams and send initial task to do
515- for (int i = 0 ; i < numOfStreams; ++i) {
516- gpts_futures[i] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
517- // tt.start_timer("Process");
518- // gpts[i].processOnGpu();
519- // tt.stop_timer();
520- // std::cout << "Proc " << i << std::endl;
521- }
522- std::cout << " =========" << std::endl;
478+ APRTimer ttt (true );
523479
524- for (int i = 0 ; i < numOfStreams * repetitionsPerStream; ++i) {
525- int c = i % numOfStreams;
480+ ttt.start_timer (" -----------------------------> Whole GPU pipeline with repetitions and MEMORY" );
481+ {
482+ APRTimer t (true );
483+ std::vector<GpuProcessingTask<ImageType>> gpts;
526484
527- // get data from previous task
528- gpts_futures[c].get ();
529- auto linearAccessGpu = gpts[c].getDataFromGpu ();
485+ t.start_timer (" Creating GPTS" );
486+ std::vector<std::future<void >> gpts_futures; gpts_futures.resize (numOfStreams);
487+ for (int i = 0 ; i < numOfStreams; ++i) {
488+ gpts.emplace_back (GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max ()));
489+ }
490+ t.stop_timer ();
530491
531- // in theory, we get new data and send them to task
532- if (i < numOfStreams * (repetitionsPerStream - 1 )) {
533- // gpts[c].sendDataToGpu();
534- // std::cout << "Send " << c << std::endl;
535- // gpts[c].processOnGpu();
536- gpts_futures[c] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
537- // std::cout << "Proc " << c << std::endl;
538- }
492+ t.start_timer (" -----------------------------> Whole GPU pipeline with repetitions" );
493+ {
494+ APRTimer tt (false );
 495+ // Run processOnGpu() asynchronously - it will handle transferring data from CPU to GPU and run whole pipeline
496+ for (int i = 0 ; i < numOfStreams; ++i) {
497+ gpts_futures[i] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
498+ }
539499
540- aAPR.aprInfo .total_number_particles = linearAccessGpu.y_vec .size ();
500+ for (int i = 0 ; i < numOfStreams * repetitionsPerStream; ++i) {
501+ int c = i % numOfStreams;
541502
542- // generateDatastructures(aAPR) for linearAcceess for CUDA
543- aAPR.linearAccess .y_vec .copy (linearAccessGpu.y_vec );
544- aAPR.linearAccess .xz_end_vec .copy (linearAccessGpu.xz_end_vec );
545- aAPR.linearAccess .level_xz_vec .copy (linearAccessGpu.level_xz_vec );
546- aAPR.apr_initialized = true ;
503+ // Get data from GpuProcessingTask - get() will block until the task is finished
504+ gpts_futures[c].get ();
505+ auto linearAccessGpu = gpts[c].getDataFromGpu ();
547506
548- // std::cout << "CUDA pipeline finished!\n";
 507+ // If more repetitions remain, resubmit this stream's task to process the next image
508+ if (i < numOfStreams * (repetitionsPerStream - 1 )) {
509+ gpts_futures[c] = std::async (std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
549510 }
550- // cudaDeviceSynchronize();
551- }
552- auto allT = t.stop_timer ();
553- std::cout << " Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n " ;
554- std::cout << " Bandwidth:" << (input_image.size () / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024 ) << " MB/s\n " ;
555- }
556- auto allT = ttt.stop_timer ();
557- float tpi = allT / (numOfStreams*repetitionsPerStream);
558- std::cout << " Time per image: " << tpi << " seconds\n " ;
559- std::cout << " Image size: " << (input_image.size () / 1024 / 1024 ) << " MB\n " ;
560- std::cout << " Bandwidth:" << (input_image.size () / tpi / 1024 / 1024 ) << " MB/s\n " ;
561-
562511
563- std::cout << " <<<<<<<<<<<< STOP\n " ;
564- }
565- else {
566- APRTimer ttt (true );
567- std::cout << " >>>>>>>>>>> START\n " ;
568- ttt.start_timer (" -----------------------------> Whole GPU pipeline with repetitions and MEMORY" );
569- {
570- APRTimer t (true );
571- std::vector<GpuProcessingTask<ImageType>> gpts;
512+ // Fill APR data structure with data from GPU
513+ aAPR.aprInfo .total_number_particles = linearAccessGpu.y_vec .size ();
514+ aAPR.linearAccess .y_vec = std::move (linearAccessGpu.y_vec );
515+ aAPR.linearAccess .xz_end_vec = std::move (linearAccessGpu.xz_end_vec );
516+ aAPR.linearAccess .level_xz_vec = std::move (linearAccessGpu.level_xz_vec );
572517
573- t.start_timer (" Creating GPTS" );
574- // std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
575- for (int i = 0 ; i < numOfStreams; ++i) {
576- gpts.emplace_back (GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max ()));
577- }
578- // cudaDeviceSynchronize();
579- t.stop_timer ();
580-
581- t.start_timer (" -----------------------------> Whole GPU pipeline with repetitions" );
582- {
583-
584- APRTimer tt (false );
585- // Create streams and send initial task to do
586- for (int i = 0 ; i < numOfStreams; ++i) {
587- // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
588- tt.start_timer (" SEND" );
589- gpts[i].sendDataToGpu ();
590- gpts[i].processOnGpu ();
591- tt.stop_timer ();
592- // std::cout << "Send " << i << std::endl;
593- // gpts.back().processOnGpu();
594- // std::cout << "Proc " << i << std::endl;
595- }
596- // Create streams and send initial task to do
597- for (int i = 0 ; i < numOfStreams; ++i) {
598- // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
599- tt.start_timer (" Process" );
600- // gpts[i].processOnGpu();
601- tt.stop_timer ();
602- // std::cout << "Proc " << i << std::endl;
603- }
604- std::cout << " =========" << std::endl;
605-
606- for (int i = 0 ; i < numOfStreams * repetitionsPerStream; ++i) {
607- int c = i % numOfStreams;
608-
609- // get data from previous task
610- // gpts_futures[c].get();
611- auto linearAccessGpu = gpts[c].getDataFromGpu ();
612- // std::cout << "Get " << c << std::endl;
613-
614- // in theory, we get new data and send them to task
615- if (i < numOfStreams * (repetitionsPerStream - 1 )) {
616- gpts[c].sendDataToGpu ();
617- // std::cout << "Send " << c << std::endl;
618- gpts[c].processOnGpu ();
619- // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
620- // std::cout << "Proc " << c << std::endl;
621- }
622-
623- aAPR.aprInfo .total_number_particles = linearAccessGpu.y_vec .size ();
624-
625- // generateDatastructures(aAPR) for linearAcceess for CUDA
626- aAPR.linearAccess .y_vec .copy (linearAccessGpu.y_vec );
627- aAPR.linearAccess .xz_end_vec .copy (linearAccessGpu.xz_end_vec );
628- aAPR.linearAccess .level_xz_vec .copy (linearAccessGpu.level_xz_vec );
629- aAPR.apr_initialized = true ;
630-
631- // std::cout << "CUDA pipeline finished!\n";
632- }
633- // cudaDeviceSynchronize();
518+ aAPR.apr_initialized = true ;
634519 }
635- auto allT = t.stop_timer ();
636- std::cout << " Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n " ;
637520 }
638- auto allT = ttt .stop_timer ();
521+ auto allT = t .stop_timer ();
639522 std::cout << " Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n " ;
640- std::cout << " <<<<<<<<<<<< STOP \n " ;
523+ std::cout << " Bandwidth: " << (input_image. size () / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024 ) << " MB/s \n " ;
641524 }
525+ auto allT = ttt.stop_timer ();
526+ float tpi = allT / (numOfStreams*repetitionsPerStream);
527+ std::cout << " Time per image: " << tpi << " seconds\n " ;
528+ std::cout << " Image size: " << (input_image.size () / 1024 / 1024 ) << " MB\n " ;
529+ std::cout << " Bandwidth:" << (input_image.size () / tpi / 1024 / 1024 ) << " MB/s\n " ;
642530
643-
644- return false ; // TODO: change it back to true
531+ return true ;
645532}
646533#endif
647534
535+
648536/* *
649537 * Implementation of pipeline for CPU
650538 *
@@ -715,8 +603,8 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
715603#ifndef APR_USE_CUDA
716604 return get_apr_cpu (aAPR, input_image);
717605#else
718- return get_apr_cuda (aAPR, input_image);
719- // return get_apr_cuda_streams(aAPR, input_image);
606+ // return get_apr_cuda(aAPR, input_image);
607+ return get_apr_cuda_streams (aAPR, input_image);
720608#endif
721609}
722610
0 commit comments