@@ -182,7 +182,7 @@ void APRConverter<ImageType>::get_apr_custom_grad_scale(APR& aAPR,PixelData<Imag
 
     } else {
         // To be done. The L(y) needs to be computed then max downsampled.
-        std::cerr << "Not implimented" << std::endl;
+        std::cerr << "Not implemented" << std::endl;
 
     }
 
@@ -412,17 +412,15 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
     // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
     // Warning both of these could result in over-flow!
 
-    if (std::is_same<uint16_t, ImageType>::value) {
-        bspline_offset = 100;
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    } else if (std::is_same<uint8_t, ImageType>::value) {
-        bspline_offset = 5;
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    } else {
+    if (std::is_floating_point<ImageType>::value) {
         image_temp.copyFromMesh(input_image);
+    } else {
+        bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
     }
 
     GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
+    // std::cout << "after gpt \n";
     gpt.sendDataToGpu();
     gpt.processOnGpu();
     auto linearAccessGpu = gpt.getDataFromGpu();
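Note: B-spline smoothing can undershoot below zero near sharp intensity steps, which would wrap around in unsigned integer images; that is why the hunk above shifts integer inputs up by bspline_offset before filtering, while floating-point images (which can represent negative coefficients directly) are copied unchanged. The change replaces the old hard-coded offsets (100 for uint16_t, 5 for uint8_t) with compute_bspline_offset<ImageType>(input_image, par.lambda), whose implementation is not part of this diff. Below is a minimal standalone sketch of the same type dispatch; the offset rule inside offset_sketch is purely hypothetical and only stands in for the real compute_bspline_offset (C++17, for if constexpr):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <type_traits>

// Placeholder for the library's compute_bspline_offset<ImageType>(image, lambda).
// The rule here (old fixed offsets as a base, scaled by the smoothing parameter
// lambda and clamped to the type's range) is an assumption for illustration only.
template <typename ImageType>
float offset_sketch(float lambda) {
    const float base = (sizeof(ImageType) == 1) ? 5.0f : 100.0f; // old hard-coded values
    const float cap  = static_cast<float>(std::numeric_limits<ImageType>::max()) / 4.0f;
    return std::min(base * std::max(lambda, 1.0f), cap);
}

template <typename ImageType>
float prepare_offset(float lambda) {
    if constexpr (std::is_floating_point<ImageType>::value) {
        return 0.0f; // negative B-spline coefficients are representable directly
    } else {
        return offset_sketch<ImageType>(lambda); // shift unsigned data away from zero
    }
}

int main() {
    std::cout << prepare_offset<uint16_t>(3.0f) << '\n'; // 300 under the assumed rule
    std::cout << prepare_offset<uint8_t>(3.0f)  << '\n'; // 15
    std::cout << prepare_offset<float>(3.0f)    << '\n'; // 0
}

Whatever the real offset rule is, the dispatch shape matters: the offset is computed once on the host and the same value is later passed to GpuProcessingTask, so the shift can be undone after smoothing.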
@@ -479,76 +477,169 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
 
 
 
-    constexpr int numOfStreams = 3;
-    constexpr int repetitionsPerStream = 3; //
-    APRTimer ttt(true);
-    ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
-    {
-        std::vector<GpuProcessingTask<ImageType>> gpts;
-
-        // std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
-        for (int i = 0; i < numOfStreams; ++i) {
-            gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-        }
+    constexpr int numOfStreams = 3; // number of streams to use for parallel processing
+    constexpr int repetitionsPerStream = 15; // number of repetitions per stream to simulate processing of multiple images
+    bool useThreads = true;
 
-        APRTimer t(true);
-        t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+    if (useThreads) {
+        std::cout << "\n!!! USING THREADS !!!\n\n";
+        APRTimer ttt(true);
+        std::cout << ">>>>>>>>>>> START\n";
+        ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
         {
+            APRTimer t(true);
+            std::vector<GpuProcessingTask<ImageType>> gpts;
 
-            APRTimer tt(false);
-            // Create streams and send initial task to do
+            t.start_timer("Creating GPTS");
+            std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
             for (int i = 0; i < numOfStreams; ++i) {
-                // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-                tt.start_timer("SEND");
-                gpts[i].sendDataToGpu();
-                tt.stop_timer();
-                // std::cout << "Send " << i << std::endl;
-                // gpts.back().processOnGpu();
-                // std::cout << "Proc " << i << std::endl;
+                gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
             }
-            // Create streams and send initial task to do
-            for (int i = 0; i < numOfStreams; ++i) {
-                // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
-                tt.start_timer("Process");
-                gpts[i].processOnGpu();
-                tt.stop_timer();
-                // std::cout << "Proc " << i << std::endl;
-            }
-            std::cout << "=========" << std::endl;
-
-            for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
-                int c = i % numOfStreams;
-
-                // get data from previous task
-                // gpts_futures[c].get();
-                auto linearAccessGpu = gpts[c].getDataFromGpu();
-                // std::cout << "Get " << c << std::endl;
-
-                // in theory, we get new data and send them to task
-                if (i < numOfStreams * (repetitionsPerStream - 1)) {
-                    gpts[c].sendDataToGpu();
-                    // std::cout << "Send " << c << std::endl;
-                    gpts[c].processOnGpu();
-                    // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
-                    // std::cout << "Proc " << c << std::endl;
+            t.stop_timer();
+
+            t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+            {
+                APRTimer tt(false);
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+                    tt.start_timer("SEND");
+                    // gpts[i].sendDataToGpu();
+                    // gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Send " << i << std::endl;
+                    // gpts.back().processOnGpu();
+                    // std::cout << "Proc " << i << std::endl;
                 }
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
+                    // tt.start_timer("Process");
+                    // gpts[i].processOnGpu();
+                    // tt.stop_timer();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                std::cout << "=========" << std::endl;
+
+                for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
+                    int c = i % numOfStreams;
 
-                aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+                    // get data from previous task
+                    gpts_futures[c].get();
+                    auto linearAccessGpu = gpts[c].getDataFromGpu();
 
-                // generateDatastructures(aAPR) for linearAcceess for CUDA
-                aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
-                aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
-                aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
-                aAPR.apr_initialized = true;
+                    // in theory, we get new data and send them to task
+                    if (i < numOfStreams * (repetitionsPerStream - 1)) {
+                        // gpts[c].sendDataToGpu();
+                        // std::cout << "Send " << c << std::endl;
+                        // gpts[c].processOnGpu();
+                        gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
+                        // std::cout << "Proc " << c << std::endl;
+                    }
 
-                // std::cout << "CUDA pipeline finished!\n";
+                    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+
+                    // generateDatastructures(aAPR) for linearAcceess for CUDA
+                    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+                    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+                    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+                    aAPR.apr_initialized = true;
+
+                    // std::cout << "CUDA pipeline finished!\n";
+                }
+                // cudaDeviceSynchronize();
+            }
+            auto allT = t.stop_timer();
+            std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+            std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n";
+        }
+        auto allT = ttt.stop_timer();
+        float tpi = allT / (numOfStreams*repetitionsPerStream);
+        std::cout << "Time per image: " << tpi << " seconds\n";
+        std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n";
+        std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n";
+
+
+        std::cout << "<<<<<<<<<<<< STOP\n";
+    }
+    else {
+        APRTimer ttt(true);
+        std::cout << ">>>>>>>>>>> START\n";
+        ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
+        {
+            APRTimer t(true);
+            std::vector<GpuProcessingTask<ImageType>> gpts;
+
+            t.start_timer("Creating GPTS");
+            // std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
+            for (int i = 0; i < numOfStreams; ++i) {
+                gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+            }
+            // cudaDeviceSynchronize();
+            t.stop_timer();
+
+            t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+            {
+
+                APRTimer tt(false);
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+                    tt.start_timer("SEND");
+                    gpts[i].sendDataToGpu();
+                    gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Send " << i << std::endl;
+                    // gpts.back().processOnGpu();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
+                    tt.start_timer("Process");
+                    // gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                std::cout << "=========" << std::endl;
+
+                for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
+                    int c = i % numOfStreams;
+
+                    // get data from previous task
+                    // gpts_futures[c].get();
+                    auto linearAccessGpu = gpts[c].getDataFromGpu();
+                    // std::cout << "Get " << c << std::endl;
+
+                    // in theory, we get new data and send them to task
+                    if (i < numOfStreams * (repetitionsPerStream - 1)) {
+                        gpts[c].sendDataToGpu();
+                        // std::cout << "Send " << c << std::endl;
+                        gpts[c].processOnGpu();
+                        // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
+                        // std::cout << "Proc " << c << std::endl;
+                    }
+
+                    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+
+                    // generateDatastructures(aAPR) for linearAcceess for CUDA
+                    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+                    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+                    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+                    aAPR.apr_initialized = true;
+
+                    // std::cout << "CUDA pipeline finished!\n";
+                }
+                // cudaDeviceSynchronize();
             }
+            auto allT = t.stop_timer();
+            std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
         }
-        auto allT = t.stop_timer();
+        auto allT = ttt.stop_timer();
         std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+        std::cout << "<<<<<<<<<<<< STOP\n";
     }
-    auto allT = ttt.stop_timer();
-    std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+
 
     return false; // TODO: change it back to true
 }
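Note: the useThreads branch above keeps numOfStreams long-lived GpuProcessingTask objects and rotates through them, so that while stream c is being drained and relaunched on the host, the other streams' work stays in flight; the per-image time printed at the end is total elapsed time divided by numOfStreams * repetitionsPerStream. Below is a minimal, GPU-free sketch of that rotation pattern using std::async, with DummyTask standing in for GpuProcessingTask<ImageType> (the sleep duration and task body are fabricated for illustration):

#include <chrono>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

struct DummyTask {
    int id;
    void processOnGpu() {               // placeholder for the real upload/compute work
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }
    int getDataFromGpu() { return id; } // placeholder for the real download
};

int main() {
    constexpr int numOfStreams = 3;
    constexpr int repetitionsPerStream = 5;

    std::vector<DummyTask> tasks;
    std::vector<std::future<void>> futures(numOfStreams);
    for (int i = 0; i < numOfStreams; ++i) tasks.push_back(DummyTask{i});

    // Prime every "stream" once, then rotate: wait for stream c, read its
    // result, and immediately relaunch it while the other streams keep working.
    for (int i = 0; i < numOfStreams; ++i)
        futures[i] = std::async(std::launch::async, &DummyTask::processOnGpu, &tasks[i]);

    for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
        int c = i % numOfStreams;
        futures[c].get();                       // join the oldest in-flight task
        int result = tasks[c].getDataFromGpu(); // consume its output
        if (i < numOfStreams * (repetitionsPerStream - 1))
            futures[c] = std::async(std::launch::async, &DummyTask::processOnGpu, &tasks[c]);
        std::cout << "finished repetition " << i << " on stream " << result << '\n';
    }
}

The design point is that futures[c].get() only joins the oldest in-flight task, so the launch loop stays numOfStreams tasks ahead of the drain loop; that is what lets host/device transfers and compute overlap in the real CUDA version.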
@@ -624,8 +715,8 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
 #ifndef APR_USE_CUDA
     return get_apr_cpu(aAPR, input_image);
 #else
-    // return get_apr_cuda(aAPR, input_image);
-    return get_apr_cuda_streams(aAPR, input_image);
+    return get_apr_cuda(aAPR, input_image);
+    // return get_apr_cuda_streams(aAPR, input_image);
 #endif
 }
 
631722