Full GPU pipeline works1

krzysg · krzysg · commit 9ff058005047 · 2024-08-21T16:51:30.000+02:00
diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu
@@ -15,6 +15,7 @@
 #include "misc/CudaMemory.cuh"
 #include "algorithm/ParticleCellTreeCuda.cuh"
 #include "algorithm/PullingSchemeCuda.hpp"
+#include "data_structures/APR/access/LinearAccessCuda.hpp"
 
 #include "dsGradient.cuh"
 #include "invBspline.cuh"
@@ -232,6 +233,9 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
 
     ParticleCellTreeCuda pctc;
 
+    ScopedCudaMemHandler<uint16_t*, JUST_ALLOC> y_vec; // for LinearAccess
+    LinearAccessCudaStructs lacs;
+
     /**
      * @return newly created stream
      */
@@ -264,7 +268,8 @@ public:
         bc4(params.bc4.get(), params.k0, iStream),
         boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
         boundary{nullptr, boundaryLen, iStream},
-        pctc(iAprInfo, iStream)
+        pctc(iAprInfo, iStream),
+        y_vec(nullptr, iAprInfo.getSize(), iStream)
     {
 //        std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n";
         std::cout << iCpuImage << std::endl;
@@ -279,12 +284,13 @@ public:
         std::cout << "SEND time: " << ct.microseconds() - start << std::endl;
     }
 
-    void getDataFromGpu() {
-        CurrentTime ct;
-        uint64_t start = ct.microseconds();
-        local_scale_temp.copyD2H();
-        checkCuda(cudaStreamSynchronize(iStream));
-        std::cout << "RCV time: " << ct.microseconds() - start << std::endl;
+    LinearAccessCudaStructs getDataFromGpu() { // hands back lacs (the struct passed to computeLinearStructureCuda); no device copy or stream sync is issued here
+//        CurrentTime ct;
+//        uint64_t start = ct.microseconds();
+//        local_scale_temp.copyD2H();
+//        checkCuda(cudaStreamSynchronize(iStream));
+//        std::cout << "RCV time: " << ct.microseconds() - start << std::endl;
+        return std::move(lacs); // moved out of the member: do not rely on the contents of lacs after this call
+    }
 
     void processOnGpu() {
@@ -317,6 +323,8 @@ public:
         std::cout << "3: " << ct.microseconds() - start << std::endl;
 
         computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
+        computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream);
+        std::cout << iAprInfo << std::endl;
     }
 
     ~GpuProcessingTaskImpl() {
@@ -339,7 +347,7 @@ template <typename ImgType>
 void GpuProcessingTask<ImgType>::sendDataToGpu() {impl->sendDataToGpu();}
 
 template <typename ImgType>
-void GpuProcessingTask<ImgType>::getDataFromGpu() {impl->getDataFromGpu();}
+LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() {return impl->getDataFromGpu();} // forwards to impl and returns its result unchanged
 
 template <typename ImgType>
 void GpuProcessingTask<ImgType>::processOnGpu() {impl->processOnGpu();}
diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp
@@ -7,7 +7,7 @@
 
 #include "data_structures/Mesh/PixelData.hpp"
 #include "algorithm/APRParameters.hpp"
-
+#include "data_structures/APR/access/LinearAccessCuda.hpp"
 
 // Test helpers and definitions
 using TypeOfRecBsplineFlags = uint16_t;
@@ -47,7 +47,7 @@ class GpuProcessingTask {
     GpuProcessingTask(GpuProcessingTask&&);
 
     void sendDataToGpu();
-    void getDataFromGpu();
+    LinearAccessCudaStructs getDataFromGpu();
     void processOnGpu();
     void doAll();
 };
diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp
@@ -37,6 +37,8 @@ class GenInfo {
     GenInfo() {}
     GenInfo(const PixelDataDim &dim) { init(dim); }
 
+    size_t getSize() const { return (size_t)y_num[l_max] * x_num[l_max] * z_num[l_max]; } // y*x*z element count at level l_max; first factor is widened to size_t before multiplying
+
     //initialize the information given the original dimensions
     void init(const PixelDataDim &dim) {
         init(dim.y, dim.x, dim.z);
@@ -119,6 +121,7 @@ class GenInfo {
     friend std::ostream & operator<<(std::ostream &os, const GenInfo &gi) {
         os << "GenInfo {\n";
         os << "    Original dimensions(y/x/z): [" << gi.org_dims[0] << ", " << gi.org_dims[1] << ", " << gi.org_dims[2] << "]\n";
+        os << "    Original size: " << gi.getSize() << "\n";
         os << "    Number of dimensions: " << static_cast<int>(gi.number_dimensions) << "\n";
         os << "    l_min, l_max: {" << gi.l_min << " - " << gi.l_max << "}\n";
         os << "    total number of particles: " << gi.total_number_particles << "\n";
diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu
@@ -592,3 +592,51 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara
 
     return lac;
 }
+
+/**
+ * Computes the linear-access vectors (y_vec, xz_end_vec, level_xz_vec) using the GPU steps below
+ * and stores them in lacs.
+ *
+ * @param y_vec_cuda      device buffer handed to the GPU steps; its first gi.total_number_particles
+ *                        elements are copied back to the host at the end
+ * @param p_map           particle cell tree passed to the processing steps
+ * @param gi              l_max, x_num and z_num are read up front; total_number_particles is read after
+ *                        the GPU steps (NOTE(review): presumably updated via GenInfoGpuAccess - confirm)
+ * @param apr_parameters  only neighborhood_optimization is read here (selects min_type 1 or 2)
+ * @param lacs            output; its three vectors are swapped with the newly computed ones
+ * @param aStream         stream passed to the GPU steps and used for the final copy; it is
+ *                        synchronized before the results are swapped into lacs
+ */
+void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream) {
+
+    uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2;
+
+    VectorData<uint64_t> xz_end_vec(true);
+    VectorData<uint64_t> level_xz_vec(true);
+
+    // initialize_xz_linear() - CPU impl.
+    uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking.
+    level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1;
+    level_xz_vec[0] = 1; //allowing for the offset.
+    for (int i = 0; i <= gi.l_max; ++i) {
+        // Widen the first factor so the x*z product is evaluated in 64 bits rather than in the element type.
+        counter_total += static_cast<uint64_t>(gi.x_num[i]) * gi.z_num[i];
+        level_xz_vec[i + 1] = counter_total;
+    }
+    xz_end_vec.resize(counter_total, 0);
+
+
+    {
+        // NOTE(review): both handlers are constructed without a stream argument while every step below
+        // receives aStream - confirm that the resulting copy/kernel ordering is the intended one.
+        ScopedCudaMemHandler<uint64_t *, D2H> xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size());
+        ScopedCudaMemHandler<uint64_t *, H2D | D2H> level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size());
+        GenInfoGpuAccess giga(gi, aStream);
+        if (gi.l_max <= 2) {
+            // Small trees (l_max <= 2) take the dedicated full-resolution path.
+            runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream);
+        }
+        else {
+            runFirstStep(gi, giga, p_map, min_type, aStream);
+            runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream);
+            runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream);
+            runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, aStream);
+            runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, counter_total, aStream);
+        }
+    }
+
+//        auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; };
+//    prt(y_vec);
+//    prt(xz_end_vec);
+//    prt(level_xz_vec);
+    // Copy only the used part of the device y buffer and wait for it before publishing the results.
+    VectorData<uint16_t> y_vec(true);
+    y_vec.resize(gi.total_number_particles);
+    checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream));
+    checkCuda(cudaStreamSynchronize(aStream));
+
+    lacs.y_vec.swap(y_vec);
+    lacs.xz_end_vec.swap(xz_end_vec);
+    lacs.level_xz_vec.swap(level_xz_vec);
+}
diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp
@@ -4,6 +4,7 @@
 #include "algorithm/APRParameters.hpp"
 #include "data_structures/Mesh/PixelData.hpp"
 #include "data_structures/APR/GenInfo.hpp"
+#include "algorithm/ParticleCellTreeCuda.cuh"
 
 typedef struct {
     VectorData<uint16_t> y_vec;
@@ -13,5 +14,7 @@ typedef struct {
 
 LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
 
+void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream);
+
 
 #endif //APR_LINEARACCESSCUDA_HPP
diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp
@@ -277,7 +277,7 @@ namespace {
         }
     }
 
-    TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GpuProcessingTask) {
+    TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GpuProcessingTask) {
         APRTimer timer(true);
 
         // TODO: This test fails if dim of input image is smaller than ~8 (not sure in which direction yet)
@@ -288,11 +288,15 @@ namespace {
         // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors
         using ImageType = float;
         constexpr PixelDataDim dim1{4, 4, 3};
-        constexpr PixelDataDim dim2{163, 123, 555};
+        constexpr PixelDataDim dim2{1024,512,512};
         for (int d = 0; d <= 3; d++) {
             auto &dim = (d % 2 == 0) ? dim1 : dim2;
             PixelData<ImageType> input_image = (d / 2 == 0) ? getRandInitializedMesh<ImageType>(dim, 13) :
-                                                              getMeshWithBlobInMiddle<ImageType>(dim);
+                                               getMeshWithBlobInMiddle<ImageType>(dim);
+
+//            constexpr PixelDataDim dim = dim1;
+//            PixelData<ImageType> input_image = getRandInitializedMesh<ImageType>(dim, 13);
+
             int maxLevel = ceil(std::log2(dim.maxDimSize()));
 
             // Initialize CPU data structures
@@ -321,32 +325,52 @@ namespace {
             par.dz = 1;
             par.neighborhood_optimization = true;
 
+            GenInfo aprInfo(input_image.getDimension());
+            GenInfo giGpu(input_image.getDimension());
+
+            // CPU reference path: gradient, local intensity scale, particle cell set, pulling scheme, linear access
             // Calculate pipeline on CPU
             timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE");
             ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
             LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par);
             LocalParticleCellSet lpcs = LocalParticleCellSet();
             lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
+            PullingScheme ps;
+            ps.initialize_particle_cell_tree(aprInfo);
+            lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par);
+            ps.pulling_scheme_main();
+            LinearAccess linearAccess;
+            linearAccess.genInfo = &aprInfo;
+            linearAccess.initialize_linear_structure(par, ps.getParticleCellTree());
             timer.stop_timer();
 
 
             // Calculate pipeline on GPU
             timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
-            {
-                GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel);
-                gpt.doAll();
-            }
+    //        {
+            GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel);
+            gpt.sendDataToGpu();
+            gpt.processOnGpu();
+            auto linearAccessGpu = gpt.getDataFromGpu();
+            giGpu.total_number_particles = linearAccessGpu.y_vec.size();
+
+    //        }
             timer.stop_timer();
 
             // Compare GPU vs CPU - expect exactly same result
-            EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0);
+            EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
+            EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0);
+            EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0);
+
+            EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles);
+            EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size());
 
         }
     }
+
 #endif // APR_USE_CUDA
 }
 
-
 int main(int argc, char **argv) {
     testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();