
Commit 1b54cd0

committed
Stream operations on the GPU now work as expected, and all tests are fixed. The stream variant of APRConverter still needs to be fixed/improved, since it is draft code.
1 parent 5534860 commit 1b54cd0

15 files changed: +619 −328 lines

src/algorithm/APRConverter.hpp

Lines changed: 158 additions & 67 deletions
@@ -182,7 +182,7 @@ void APRConverter<ImageType>::get_apr_custom_grad_scale(APR& aAPR,PixelData<Imag
 
     } else {
         // To be done. The L(y) needs to be computed then max downsampled.
-        std::cerr << "Not implimented" << std::endl;
+        std::cerr << "Not implemented" << std::endl;
 
     }
 
@@ -412,17 +412,15 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
     // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
     // Warning both of these could result in over-flow!
 
-    if (std::is_same<uint16_t, ImageType>::value) {
-        bspline_offset = 100;
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    } else if (std::is_same<uint8_t, ImageType>::value) {
-        bspline_offset = 5;
-        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-    } else {
+    if (std::is_floating_point<ImageType>::value) {
         image_temp.copyFromMesh(input_image);
+    } else {
+        bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
     }
 
     GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
+    // std::cout << "after gpt \n";
     gpt.sendDataToGpu();
     gpt.processOnGpu();
     auto linearAccessGpu = gpt.getDataFromGpu();
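
Note: the hard-coded offsets (100 for uint16_t, 5 for uint8_t) are replaced by a data-driven compute_bspline_offset call whose definition lies outside this hunk. A minimal sketch of what such a helper could look like, assuming it derives the offset from the image minimum so that (value + offset) stays non-negative after smoothing — the heuristic actually used in the repository may differ:

    #include <algorithm>
    #include <vector>

    // Hypothetical sketch only: the real compute_bspline_offset<ImageType>(input_image, par.lambda)
    // lives elsewhere in the repository and may use a different heuristic.
    template <typename ImageType, typename T>
    ImageType compute_bspline_offset_sketch(const std::vector<T> &pixels, float lambda) {
        // Shift intensities so that (value + offset) cannot become negative
        // once the B-spline coefficients undershoot around sharp edges.
        // Assumes a non-empty pixel buffer.
        T minVal = *std::min_element(pixels.begin(), pixels.end());
        // Reserve a lambda-dependent safety margin (illustrative choice):
        // stronger smoothing can undershoot more.
        float margin = 1.0f + lambda;
        return static_cast<ImageType>(minVal < margin ? margin - minVal : 0.0f);
    }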
@@ -479,76 +477,169 @@ inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T
 
 
 
-    constexpr int numOfStreams = 3;
-    constexpr int repetitionsPerStream = 3; //
-    APRTimer ttt(true);
-    ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
-    {
-        std::vector<GpuProcessingTask<ImageType>> gpts;
-
-        //std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
-        for (int i = 0; i < numOfStreams; ++i) {
-            gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-        }
+    constexpr int numOfStreams = 3; // number of streams to use for parallel processing
+    constexpr int repetitionsPerStream = 15; // number of repetitions per stream to simulate processing of multiple images
+    bool useThreads = true;
 
-    APRTimer t(true);
-    t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+    if (useThreads) {
+        std::cout << "\n!!! USING THREADS !!!\n\n";
+        APRTimer ttt(true);
+        std::cout << ">>>>>>>>>>> START\n";
+        ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
         {
+            APRTimer t(true);
+            std::vector<GpuProcessingTask<ImageType>> gpts;
 
-            APRTimer tt(false);
-            // Create streams and send initial task to do
+            t.start_timer("Creating GPTS");
+            std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
             for (int i = 0; i < numOfStreams; ++i) {
-                // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-                tt.start_timer("SEND");
-                gpts[i].sendDataToGpu();
-                tt.stop_timer();
-                // std::cout << "Send " << i << std::endl;
-                // gpts.back().processOnGpu();
-                // std::cout << "Proc " << i << std::endl;
+                gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
             }
-            // Create streams and send initial task to do
-            for (int i = 0; i < numOfStreams; ++i) {
-                // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
-                tt.start_timer("Process");
-                gpts[i].processOnGpu();
-                tt.stop_timer();
-                // std::cout << "Proc " << i << std::endl;
-            }
-            std::cout << "=========" << std::endl;
-
-            for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
-                int c = i % numOfStreams;
-
-                // get data from previous task
-                // gpts_futures[c].get();
-                auto linearAccessGpu = gpts[c].getDataFromGpu();
-                // std::cout << "Get " << c << std::endl;
-
-                // in theory, we get new data and send them to task
-                if (i < numOfStreams * (repetitionsPerStream - 1)) {
-                    gpts[c].sendDataToGpu();
-                    // std::cout << "Send " << c << std::endl;
-                    gpts[c].processOnGpu();
-                    // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
-                    // std::cout << "Proc " << c << std::endl;
+            t.stop_timer();
+
+            t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+            {
+                APRTimer tt(false);
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+                    tt.start_timer("SEND");
+                    // gpts[i].sendDataToGpu();
+                    // gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Send " << i << std::endl;
+                    // gpts.back().processOnGpu();
+                    // std::cout << "Proc " << i << std::endl;
                 }
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
+                    // tt.start_timer("Process");
+                    // gpts[i].processOnGpu();
+                    // tt.stop_timer();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                std::cout << "=========" << std::endl;
+
+                for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
+                    int c = i % numOfStreams;
 
-                aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+                    // get data from previous task
+                    gpts_futures[c].get();
+                    auto linearAccessGpu = gpts[c].getDataFromGpu();
 
-                // generateDatastructures(aAPR) for linearAcceess for CUDA
-                aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
-                aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
-                aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
-                aAPR.apr_initialized = true;
+                    // in theory, we get new data and send them to task
+                    if (i < numOfStreams * (repetitionsPerStream - 1)) {
+                        // gpts[c].sendDataToGpu();
+                        // std::cout << "Send " << c << std::endl;
+                        // gpts[c].processOnGpu();
+                        gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
+                        // std::cout << "Proc " << c << std::endl;
+                    }
 
-                // std::cout << "CUDA pipeline finished!\n";
+                    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+
+                    // generateDatastructures(aAPR) for linearAcceess for CUDA
+                    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+                    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+                    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+                    aAPR.apr_initialized = true;
+
+                    // std::cout << "CUDA pipeline finished!\n";
+                }
+                // cudaDeviceSynchronize();
+            }
+            auto allT = t.stop_timer();
+            std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+            std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n";
+        }
+        auto allT = ttt.stop_timer();
+        float tpi = allT / (numOfStreams*repetitionsPerStream);
+        std::cout << "Time per image: " << tpi << " seconds\n";
+        std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n";
+        std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n";
+
+
+        std::cout << "<<<<<<<<<<<< STOP\n";
+    }
+    else {
+        APRTimer ttt(true);
+        std::cout << ">>>>>>>>>>> START\n";
+        ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
+        {
+            APRTimer t(true);
+            std::vector<GpuProcessingTask<ImageType>> gpts;
+
+            t.start_timer("Creating GPTS");
+            //std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
+            for (int i = 0; i < numOfStreams; ++i) {
+                gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+            }
+            // cudaDeviceSynchronize();
+            t.stop_timer();
+
+            t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
+            {
+
+                APRTimer tt(false);
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
+                    tt.start_timer("SEND");
+                    gpts[i].sendDataToGpu();
+                    gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Send " << i << std::endl;
+                    // gpts.back().processOnGpu();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                // Create streams and send initial task to do
+                for (int i = 0; i < numOfStreams; ++i) {
+                    // gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
+                    tt.start_timer("Process");
+                    // gpts[i].processOnGpu();
+                    tt.stop_timer();
+                    // std::cout << "Proc " << i << std::endl;
+                }
+                std::cout << "=========" << std::endl;
+
+                for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
+                    int c = i % numOfStreams;
+
+                    // get data from previous task
+                    // gpts_futures[c].get();
+                    auto linearAccessGpu = gpts[c].getDataFromGpu();
+                    // std::cout << "Get " << c << std::endl;
+
+                    // in theory, we get new data and send them to task
+                    if (i < numOfStreams * (repetitionsPerStream - 1)) {
+                        gpts[c].sendDataToGpu();
+                        // std::cout << "Send " << c << std::endl;
+                        gpts[c].processOnGpu();
+                        // gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
+                        // std::cout << "Proc " << c << std::endl;
+                    }
+
+                    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
+
+                    // generateDatastructures(aAPR) for linearAcceess for CUDA
+                    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+                    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+                    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+                    aAPR.apr_initialized = true;
+
+                    // std::cout << "CUDA pipeline finished!\n";
+                }
+                // cudaDeviceSynchronize();
             }
+            auto allT = t.stop_timer();
+            std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
         }
-        auto allT = t.stop_timer();
+        auto allT = ttt.stop_timer();
         std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+        std::cout << "<<<<<<<<<<<< STOP\n";
     }
-    auto allT = ttt.stop_timer();
-    std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
+
 
     return false; //TODO: change it back to true
 }
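
The threaded branch above keeps numOfStreams GPU tasks in flight and refills each slot as soon as its result is collected. The same round-robin pattern, reduced to a self-contained sketch with a stand-in task type (Task here is illustrative, not the library's GpuProcessingTask):

    #include <future>
    #include <iostream>
    #include <vector>

    // Stand-in for GpuProcessingTask: an object with an expensive processing
    // step whose result is collected later.
    struct Task {
        int id;
        void process() { /* e.g. H2D copy + kernels + D2H copy on a dedicated stream */ }
        int result() const { return id; }
    };

    int main() {
        constexpr int numOfStreams = 3;        // jobs kept in flight concurrently
        constexpr int repetitionsPerStream = 15;

        std::vector<Task> tasks;
        tasks.reserve(numOfStreams);           // keep addresses stable for std::async
        std::vector<std::future<void>> futures(numOfStreams);

        // Create all tasks first, then launch one asynchronous job per slot.
        for (int i = 0; i < numOfStreams; ++i) tasks.push_back(Task{i});
        for (int i = 0; i < numOfStreams; ++i)
            futures[i] = std::async(std::launch::async, &Task::process, &tasks[i]);

        // Round-robin: join the oldest job in slot c, use its result,
        // and immediately refill the slot with the next job.
        for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
            int c = i % numOfStreams;
            futures[c].get();
            std::cout << "done: " << tasks[c].result() << "\n";
            if (i < numOfStreams * (repetitionsPerStream - 1))
                futures[c] = std::async(std::launch::async, &Task::process, &tasks[c]);
        }
    }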
@@ -624,8 +715,8 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
 #ifndef APR_USE_CUDA
     return get_apr_cpu(aAPR, input_image);
 #else
-    // return get_apr_cuda(aAPR, input_image);
-    return get_apr_cuda_streams(aAPR, input_image);
+    return get_apr_cuda(aAPR, input_image);
+    // return get_apr_cuda_streams(aAPR, input_image);
 #endif
 }

src/algorithm/APRParameters.hpp

Lines changed: 6 additions & 1 deletion
@@ -55,11 +55,16 @@ class APRParameters {
         os << "rel_error=" << obj.rel_error << "\n";
         os << "sigma_th=" << obj.sigma_th << "\n";
         os << "sigma_th_max=" << obj.sigma_th_max << "\n";
+        os << "grad_th=" << obj.grad_th << "\n";
         os << "auto_parameters=" << (obj.auto_parameters ? "true" : "false") << "\n";
+        os << "reflect_bc_lis=" << (obj.reflect_bc_lis ? "true" : "false") << "\n";
+        os << "check_input=" << (obj.check_input ? "true" : "false") << "\n";
+        os << "swap_dimensions=" << (obj.swap_dimensions ? "true" : "false") << "\n";
         os << "neighborhood_optimization=" << (obj.neighborhood_optimization ? "true" : "false") << "\n";
         os << "constant_intensity_scale=" << (obj.constant_intensity_scale ? "true" : "false") << "\n";
         os << "output_steps=" << (obj.output_steps ? "true" : "false") << "\n";
-
+        os << "dx/dy/dz=" << obj.dx << "/" << obj.dy << "/" << obj.dz << "\n";
+        os << "psfx/psfy/psfz=" << obj.psfx << "/" << obj.psfy << "/" << obj.psfz << "\n";
         return os;
     }
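
With these additions, streaming an APRParameters object now also reports grad_th, the three boolean flags, and the voxel/PSF sizes. A minimal usage sketch (the include path and example values are assumptions, not taken from the repository):

    #include <iostream>
    #include "algorithm/APRParameters.hpp"

    int main() {
        APRParameters par;
        par.grad_th = 10;       // example value
        par.check_input = true; // example value
        std::cout << par;       // prints every field listed above, one per line
    }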

src/algorithm/ComputeGradient.hpp

Lines changed: 1 addition & 1 deletion
@@ -364,7 +364,7 @@ round(float val, size_t &errCount) {
 
     if(val < std::numeric_limits<T>::min() || val > std::numeric_limits<T>::max()) {
         errCount++;
-        std::cout << val << " " << (float)std::numeric_limits<T>::min() << " " << (float)std::numeric_limits<T>::max() << std::endl;
+        // std::cout << val << " " << (float)std::numeric_limits<T>::min() << " " << (float)std::numeric_limits<T>::max() << std::endl;
     }
     return val;
 }
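
The per-pixel diagnostic print is silenced while the error counter is kept, so overflows are still tallied without flooding stdout. A self-contained sketch of the same check (the name checked_round is illustrative, not the library's):

    #include <cstddef>
    #include <limits>

    // Count values that would overflow the target type T instead of printing each one.
    // Note: for integral T, std::numeric_limits<T>::min() is the lowest value,
    // which is what this range check relies on.
    template <typename T>
    float checked_round(float val, std::size_t &errCount) {
        if (val < std::numeric_limits<T>::min() || val > std::numeric_limits<T>::max()) {
            ++errCount;  // tally silently; a debug build could re-enable the print
        }
        return val;
    }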
