@@ -338,6 +338,62 @@ void runBsplineOffsetAndCopyOriginal(ImgType *cudaImage, ImgType *cudaCopy, floa
 };
 
 
+// Debug helper: prints the first 'length' values of a device buffer from a
+// single thread. Values are cast to double so the %g specifier is valid for
+// both integer and floating-point element types.
+template <typename T>
+__global__ void printKernel(T *input, size_t length) {
+    printf("DOWNSAMPLED: ");
+    for (size_t i = 0; i < length; i++) printf("%g ", static_cast<double>(input[i]));
+    printf("\n");
+}
+
+template <typename ImgType>
+void runPrint(ImgType *cudaImage, size_t length, cudaStream_t aStream) {
+    printKernel<<<1, 1, 0, aStream>>>(cudaImage, length);
+}
+
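A note on the debug helper above: device-side printf output is buffered and only appears once the stream (or device) is synchronized, so a dump is typically followed by a sync. A minimal usage sketch, with hypothetical buffer and length names:

    runPrint(devBuffer, n, stream);            // enqueue the single-thread dump kernel
    checkCuda(cudaStreamSynchronize(stream));  // flush device printf to the host console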
+
+// Samples particle values for one APR level: each thread owns one (x, z)
+// column and walks that column's particles serially in y.
+//  - level_xz_vec_cuda[level]           : offset of this level's first (x, z) column
+//  - xz_end_vec_cuda[c - 1], [c]        : particle-index range of column c
+//  - y_vec[idx]                         : y coordinate of particle idx
+template <typename T>
+__global__ void sampleKernel(T *downsampledLevel, T *parts_cuda, int level, int xLen, int yLen, int zLen, uint64_t *level_xz_vec_cuda, uint64_t *xz_end_vec_cuda, uint16_t *y_vec) {
+    const int xi = (blockIdx.x * blockDim.x) + threadIdx.x;
+    const int zi = (blockIdx.z * blockDim.z) + threadIdx.z;
+    if (xi >= xLen || zi >= zLen) return;
+
+    uint64_t level_start = level_xz_vec_cuda[level];
+    uint64_t offset = xi + static_cast<uint64_t>(zi) * xLen;
+    auto xz_start = level_start + offset;
+
+    auto begin_index = xz_end_vec_cuda[xz_start - 1];
+    auto end_index = xz_end_vec_cuda[xz_start];
+
+    for (size_t idx = begin_index; idx < end_index; ++idx) {
+        int y = y_vec[idx];
+        // widen before multiplying so large images cannot overflow int arithmetic
+        size_t imageIdx = (static_cast<size_t>(zi) * xLen + xi) * yLen + y;
+        parts_cuda[idx] = downsampledLevel[imageIdx];
+    }
+}
+
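For readers new to the linear-access layout this kernel assumes (as used by the LinearAccess structure here): level_xz_vec holds, per level, the offset of that level's first (x, z) column; consecutive entries of xz_end_vec bracket the particle-index range of one column; y_vec stores each particle's y coordinate. A single-threaded CPU sketch of the same traversal, with names mirroring the kernel parameters:

    // CPU reference: visits every particle of one level exactly once.
    template <typename T>
    void samplePartsCpu(const T *downsampledLevel, T *parts, int level,
                        int xLen, int yLen, int zLen,
                        const uint64_t *level_xz_vec, const uint64_t *xz_end_vec,
                        const uint16_t *y_vec) {
        const uint64_t level_start = level_xz_vec[level];
        for (int zi = 0; zi < zLen; ++zi)
            for (int xi = 0; xi < xLen; ++xi) {
                const uint64_t xz = level_start + xi + (uint64_t)zi * xLen;
                for (uint64_t idx = xz_end_vec[xz - 1]; idx < xz_end_vec[xz]; ++idx)
                    // y-major image layout: (z * xLen + x) * yLen + y
                    parts[idx] = downsampledLevel[((size_t)zi * xLen + xi) * yLen + y_vec[idx]];
            }
    }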
+// Launches sampleKernel once per level; one thread per (x, z) column.
+template <typename ImgType>
+void runSampleParts(ImgType **downsampled, GenInfo &aprInfo, ImgType *parts_cuda, uint64_t *level_xz_vec_cuda, uint64_t *xz_end_vec_cuda, uint16_t *y_vec, cudaStream_t aStream) {
+    for (int level = aprInfo.l_min; level <= aprInfo.l_max; level++) {
+        dim3 threadsPerBlock(128, 1, 8);
+        dim3 numBlocks((aprInfo.x_num[level] + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                       1,
+                       (aprInfo.z_num[level] + threadsPerBlock.z - 1) / threadsPerBlock.z);
+        sampleKernel<<<numBlocks, threadsPerBlock, 0, aStream>>>(downsampled[level], parts_cuda, level, aprInfo.x_num[level], aprInfo.y_num[level], aprInfo.z_num[level], level_xz_vec_cuda, xz_end_vec_cuda, y_vec);
+    }
+}
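The grid sizing above is the usual ceiling-division idiom: partial blocks cover the remainder of each dimension, and the kernel's bounds check discards the overhang threads. Worked through for a hypothetical level with x_num = 1000 and z_num = 100:

    // threadsPerBlock = (128, 1, 8) -> 1024 threads per block, one per (x, z) column
    // numBlocks.x = (1000 + 127) / 128 = 8  -> 1024 x-threads cover [0, 1000), 24 return early
    // numBlocks.z = (100  +   7) / 8  = 13  ->  104 z-threads cover [0, 100),   4 return early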
+
+
 class CudaStream {
     cudaStream_t iStream;
 
@@ -407,7 +463,7 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
     ParticleCellTreeCuda pctc;
 
     ScopedCudaMemHandler<uint16_t *, JUST_ALLOC> y_vec_cuda; // for LinearAccess
-    LinearAccessCudaStructs lacs;
+    LinearAccessCudaStructs<ImgType> lacs;
 
     // Padded memory for local_scale_temp and local_scale_temp2
     ScopedCudaMemHandler<float *, JUST_ALLOC> lstPadded;
@@ -422,6 +478,8 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
     ScopedCudaMemHandler<uint64_t *, JUST_ALLOC> level_xz_vec_cuda; // (level_xz_vec.data(), level_xz_vec.size(), aStream);
     GenInfoGpuAccess giga;
     uint64_t counter_total = 1;
+    VectorData<ImgType> parts;                              // host-side particle intensities
+    ScopedCudaMemHandler<ImgType *, JUST_ALLOC> parts_cuda; // device-side particle intensities
 
     // Preallocated memory for bspline shift computation
     VectorData<ImgType> minVector{true};
@@ -455,11 +513,13 @@ public:
         boundaryLen{(2 /* two first elements */ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
         boundary{nullptr, boundaryLen, iStream},
         pctc(iAprInfo, iStream),
-        y_vec_cuda(nullptr, iAprInfo.getSize(), iStream),
+        y_vec_cuda(nullptr, iAprInfo.getSize() / 2, iStream), // TODO: only half capacity
         xz_end_vec(true),
         level_xz_vec(true),
         y_vec(true),
-        giga(iAprInfo, iStream)
+        giga(iAprInfo, iStream),
+        parts(true),
+        parts_cuda(nullptr, iAprInfo.getSize() / 2, iStream) // TODO: only half capacity
     {
         splineCudaX = cudax.first;
         splineCudaY = cuday.first;
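Both y_vec_cuda and parts_cuda are now allocated with half the pixel count, and the TODOs flag this bound as provisional. If some input could produce more particles than that, a cheap guard at the point where total_number_particles becomes known would fail loudly instead of overflowing the buffers; a hypothetical sketch:

    // Hypothetical guard, once the particle count is known on the host:
    if (iAprInfo.total_number_particles > iAprInfo.getSize() / 2)
        throw std::runtime_error("particle count exceeds the half-capacity GPU buffers");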
@@ -491,6 +551,9 @@ public:
         xz_end_vec_cuda.initialize(xz_end_vec.data(), xz_end_vec.size(), iStream);
         level_xz_vec_cuda.initialize(level_xz_vec.data(), level_xz_vec.size(), iStream);
 
+        parts.resize(iAprInfo.getSize()); // worst case: as many particles as pixels in the input image
+
         isErrorDetectedPinned.resize(1);
         isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream);
 
@@ -515,7 +578,34 @@ public:
         resultsMax.initialize(maxVector.data(), numOfBlocks, iStream);
     }
 
-    LinearAccessCudaStructs getDataFromGpu() {
+    void sample() {
+        // Prepare memory for the downsampled pyramid. Reuse 'image' as storage
+        // for all levels but the top one, since its contents are destroyed
+        // anyway by the bspline filtering and gradient computation; as the top
+        // level of the pyramid use imageSampling, which is a copy of the
+        // original image at full resolution.
+        int l_max = iAprInfo.l_max;
+        int l_min = iAprInfo.l_min;
+        std::vector<ImgType*> downsampled(l_max + 1);
+        downsampled[l_max] = imageSampling.get();
+        size_t levelOffset = 0;
+        for (int l = l_max - 1; l >= l_min; --l) {
+            downsampled[l] = image.get() + levelOffset;
+            levelOffset += iAprInfo.getSize(l);
+
+            runDownsampleMean(downsampled[l + 1], downsampled[l], iAprInfo.x_num[l + 1], iAprInfo.y_num[l + 1], iAprInfo.z_num[l + 1], iStream);
+        }
+
+        // Sample particle values on every level from the pyramid into parts_cuda.
+        runSampleParts(downsampled.data(), iAprInfo, parts_cuda.get(), level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), iStream);
+    }
+
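The aliasing in sample() packs every level below l_max back to back into the already-consumed 'image' buffer. Assuming each coarser level holds roughly 1/8 of the voxels of the finer one (halving in three dimensions), the packed levels sum to well under the full-resolution size N, so the reuse cannot overrun the buffer:

    // Packed layout inside image.get() (N = full-resolution voxel count):
    //   [ level l_max-1 : ~N/8 ][ level l_max-2 : ~N/64 ] ... [ level l_min ]
    //   offset 0                offset getSize(l_max-1)
    // Geometric series: N/8 + N/64 + ... < N/7 < N.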
+    LinearAccessCudaStructs<ImgType> getDataFromGpu() {
         return std::move(lacs);
     }
 
@@ -572,13 +662,24 @@ public:
         // Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures
         checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream));
 
+
+        // SAMPLE under development
+        sample();
+        // Shrink 'parts' to the actual particle count before the copy
+        parts.resize(iAprInfo.total_number_particles);
+        // Copy parts from GPU to CPU - it is needed before we copy data to CPU structures
+        checkCuda(cudaMemcpyAsync(parts.begin(), parts_cuda.get(), iAprInfo.total_number_particles * sizeof(ImgType), cudaMemcpyDeviceToHost, iStream));
+
         // Synchronize one last time - at this moment all data from the GPU has been copied to the CPU
         checkCuda(cudaStreamSynchronize(iStream));
 
         // Prepare CPU structures
         lacs.xz_end_vec.copy(xz_end_vec);
         lacs.level_xz_vec.copy(level_xz_vec);
         lacs.y_vec.copy(y_vec);
+        lacs.parts.copy(parts);
     }
 
     ~GpuProcessingTaskImpl() {}
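With LinearAccessCudaStructs now templated on the image type, the sampled particle intensities travel back with the access vectors in a single result object. A hedged end-to-end sketch (constructor arguments elided):

    GpuProcessingTask<uint16_t> task(/* input image, APR info, parameters... */);
    task.processOnGpu();               // builds the access structure and samples particles
    auto lacs = task.getDataFromGpu(); // LinearAccessCudaStructs<uint16_t>
    // lacs.level_xz_vec / xz_end_vec / y_vec describe the structure;
    // lacs.parts holds one intensity per particle, indexed like y_vec.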
@@ -595,7 +696,7 @@ template <typename ImgType>
 GpuProcessingTask<ImgType>::GpuProcessingTask(GpuProcessingTask&&) = default;
 
 template <typename ImgType>
-LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() { return impl->getDataFromGpu(); }
+LinearAccessCudaStructs<ImgType> GpuProcessingTask<ImgType>::getDataFromGpu() { return impl->getDataFromGpu(); }
 
 template <typename ImgType>
 void GpuProcessingTask<ImgType>::processOnGpu() { impl->processOnGpu(); }
@@ -606,6 +707,7 @@ template class GpuProcessingTask<int>;
 template class GpuProcessingTask<uint16_t>;
 template class GpuProcessingTask<float>;
 
+
 // ================================== TEST helpers ==============
 // TODO: should be moved somewhere
 