@@ -338,6 +338,62 @@ void runBsplineOffsetAndCopyOriginal(ImgType *cudaImage, ImgType *cudaCopy, floa
 };
 
 
+// Debug helper: prints the first 'length' values of a device buffer from a
+// single thread. Values are cast to double so the %g specifier is valid for
+// both integer and floating-point element types.
+template <typename T>
+__global__ void printKernel(T *input, size_t length) {
+    printf("DOWNSAMPLED: ");
+    for (size_t i = 0; i < length; i++) printf("%g ", static_cast<double>(input[i]));
+    printf("\n");
+}
+
+template <typename ImgType>
+void runPrint(ImgType *cudaImage, size_t length, cudaStream_t aStream) {
+    printKernel<<<1, 1, 0, aStream>>>(cudaImage, length);
+}
+
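A note on the debug helper above: device-side printf output is buffered and only appears once the stream (or device) is synchronized, so a dump is typically followed by a sync. A minimal usage sketch, with hypothetical buffer and length names:

    runPrint(devBuffer, n, stream);            // enqueue the single-thread dump kernel
    checkCuda(cudaStreamSynchronize(stream));  // flush device printf to the host console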
+
+// Samples particle values for one APR level: each thread owns one (x, z)
+// column and walks that column's particles serially in y.
+//  - level_xz_vec_cuda[level]           : offset of this level's first (x, z) column
+//  - xz_end_vec_cuda[c - 1], [c]        : particle-index range of column c
+//  - y_vec[idx]                         : y coordinate of particle idx
+template <typename T>
+__global__ void sampleKernel(T *downsampledLevel, T *parts_cuda, int level, int xLen, int yLen, int zLen, uint64_t *level_xz_vec_cuda, uint64_t *xz_end_vec_cuda, uint16_t *y_vec) {
+    const int xi = (blockIdx.x * blockDim.x) + threadIdx.x;
+    const int zi = (blockIdx.z * blockDim.z) + threadIdx.z;
+    if (xi >= xLen || zi >= zLen) return;
+
+    uint64_t level_start = level_xz_vec_cuda[level];
+    uint64_t offset = xi + static_cast<uint64_t>(zi) * xLen;
+    auto xz_start = level_start + offset;
+
+    auto begin_index = xz_end_vec_cuda[xz_start - 1];
+    auto end_index = xz_end_vec_cuda[xz_start];
+
+    for (size_t idx = begin_index; idx < end_index; ++idx) {
+        int y = y_vec[idx];
+        // widen before multiplying so large images cannot overflow int arithmetic
+        size_t imageIdx = (static_cast<size_t>(zi) * xLen + xi) * yLen + y;
+        parts_cuda[idx] = downsampledLevel[imageIdx];
+    }
+}
+
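For readers new to the linear-access layout this kernel assumes (as used by the LinearAccess structure here): level_xz_vec holds, per level, the offset of that level's first (x, z) column; consecutive entries of xz_end_vec bracket the particle-index range of one column; y_vec stores each particle's y coordinate. A single-threaded CPU sketch of the same traversal, with names mirroring the kernel parameters:

    // CPU reference: visits every particle of one level exactly once.
    template <typename T>
    void samplePartsCpu(const T *downsampledLevel, T *parts, int level,
                        int xLen, int yLen, int zLen,
                        const uint64_t *level_xz_vec, const uint64_t *xz_end_vec,
                        const uint16_t *y_vec) {
        const uint64_t level_start = level_xz_vec[level];
        for (int zi = 0; zi < zLen; ++zi)
            for (int xi = 0; xi < xLen; ++xi) {
                const uint64_t xz = level_start + xi + (uint64_t)zi * xLen;
                for (uint64_t idx = xz_end_vec[xz - 1]; idx < xz_end_vec[xz]; ++idx)
                    // y-major image layout: (z * xLen + x) * yLen + y
                    parts[idx] = downsampledLevel[((size_t)zi * xLen + xi) * yLen + y_vec[idx]];
            }
    }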
+// Launches sampleKernel once per level; one thread per (x, z) column.
+template <typename ImgType>
+void runSampleParts(ImgType **downsampled, GenInfo &aprInfo, ImgType *parts_cuda, uint64_t *level_xz_vec_cuda, uint64_t *xz_end_vec_cuda, uint16_t *y_vec, cudaStream_t aStream) {
+    for (int level = aprInfo.l_min; level <= aprInfo.l_max; level++) {
+        dim3 threadsPerBlock(128, 1, 8);
+        dim3 numBlocks((aprInfo.x_num[level] + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                       1,
+                       (aprInfo.z_num[level] + threadsPerBlock.z - 1) / threadsPerBlock.z);
+        sampleKernel<<<numBlocks, threadsPerBlock, 0, aStream>>>(downsampled[level], parts_cuda, level, aprInfo.x_num[level], aprInfo.y_num[level], aprInfo.z_num[level], level_xz_vec_cuda, xz_end_vec_cuda, y_vec);
+    }
+}
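The grid sizing above is the usual ceiling-division idiom: partial blocks cover the remainder of each dimension, and the kernel's bounds check discards the overhang threads. Worked through for a hypothetical level with x_num = 1000 and z_num = 100:

    // threadsPerBlock = (128, 1, 8) -> 1024 threads per block, one per (x, z) column
    // numBlocks.x = (1000 + 127) / 128 = 8  -> 1024 x-threads cover [0, 1000), 24 return early
    // numBlocks.z = (100  +   7) / 8  = 13  ->  104 z-threads cover [0, 100),   4 return early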
+
+
 class CudaStream {
     cudaStream_t iStream;
 
@@ -407,7 +463,7 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
     ParticleCellTreeCuda pctc;
 
     ScopedCudaMemHandler<uint16_t *, JUST_ALLOC> y_vec_cuda; // for LinearAccess
-    LinearAccessCudaStructs lacs;
+    LinearAccessCudaStructs<ImgType> lacs;
 
     // Padded memory for local_scale_temp and local_scale_temp2
     ScopedCudaMemHandler<float *, JUST_ALLOC> lstPadded;
@@ -422,6 +478,8 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
     ScopedCudaMemHandler<uint64_t *, JUST_ALLOC> level_xz_vec_cuda; // (level_xz_vec.data(), level_xz_vec.size(), aStream);
     GenInfoGpuAccess giga;
     uint64_t counter_total = 1;
+    VectorData<ImgType> parts;                              // host-side particle intensities
+    ScopedCudaMemHandler<ImgType *, JUST_ALLOC> parts_cuda; // device-side particle intensities
 
     // Preallocated memory for bspline shift computation
     VectorData<ImgType> minVector{true};
@@ -455,11 +513,13 @@ public:
         boundaryLen{(2 /* two first elements */ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
         boundary{nullptr, boundaryLen, iStream},
         pctc(iAprInfo, iStream),
-        y_vec_cuda(nullptr, iAprInfo.getSize(), iStream),
+        y_vec_cuda(nullptr, iAprInfo.getSize() / 2, iStream), // TODO: only half capacity
         xz_end_vec(true),
         level_xz_vec(true),
         y_vec(true),
-        giga(iAprInfo, iStream)
+        giga(iAprInfo, iStream),
+        parts(true),
+        parts_cuda(nullptr, iAprInfo.getSize() / 2, iStream) // TODO: only half capacity
     {
         splineCudaX = cudax.first;
         splineCudaY = cuday.first;
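Both y_vec_cuda and parts_cuda are now allocated with half the pixel count, and the TODOs flag this bound as provisional. If some input could produce more particles than that, a cheap guard at the point where total_number_particles becomes known would fail loudly instead of overflowing the buffers; a hypothetical sketch:

    // Hypothetical guard, once the particle count is known on the host:
    if (iAprInfo.total_number_particles > iAprInfo.getSize() / 2)
        throw std::runtime_error("particle count exceeds the half-capacity GPU buffers");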
@@ -491,6 +551,9 @@ public:
         xz_end_vec_cuda.initialize(xz_end_vec.data(), xz_end_vec.size(), iStream);
         level_xz_vec_cuda.initialize(level_xz_vec.data(), level_xz_vec.size(), iStream);
 
+        parts.resize(iAprInfo.getSize()); // worst case: as many particles as pixels in the input image
+
         isErrorDetectedPinned.resize(1);
         isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream);
 
@@ -515,7 +578,34 @@ public:
         resultsMax.initialize(maxVector.data(), numOfBlocks, iStream);
     }
 
-    LinearAccessCudaStructs getDataFromGpu() {
+    void sample() {
+        // Prepare memory for the downsampled pyramid. Reuse 'image' as storage
+        // for all levels but the top one, since its contents are destroyed
+        // anyway by the bspline filtering and gradient computation; as the top
+        // level of the pyramid use imageSampling, which is a copy of the
+        // original image at full resolution.
+        int l_max = iAprInfo.l_max;
+        int l_min = iAprInfo.l_min;
+        std::vector<ImgType*> downsampled(l_max + 1);
+        downsampled[l_max] = imageSampling.get();
+        size_t levelOffset = 0;
+        for (int l = l_max - 1; l >= l_min; --l) {
+            downsampled[l] = image.get() + levelOffset;
+            levelOffset += iAprInfo.getSize(l);
+
+            runDownsampleMean(downsampled[l + 1], downsampled[l], iAprInfo.x_num[l + 1], iAprInfo.y_num[l + 1], iAprInfo.z_num[l + 1], iStream);
+        }
+
+        // Sample particle values on every level from the pyramid into parts_cuda.
+        runSampleParts(downsampled.data(), iAprInfo, parts_cuda.get(), level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), iStream);
+    }
+
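The aliasing in sample() packs every level below l_max back to back into the already-consumed 'image' buffer. Assuming each coarser level holds roughly 1/8 of the voxels of the finer one (halving in three dimensions), the packed levels sum to well under the full-resolution size N, so the reuse cannot overrun the buffer:

    // Packed layout inside image.get() (N = full-resolution voxel count):
    //   [ level l_max-1 : ~N/8 ][ level l_max-2 : ~N/64 ] ... [ level l_min ]
    //   offset 0                offset getSize(l_max-1)
    // Geometric series: N/8 + N/64 + ... < N/7 < N.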
+    LinearAccessCudaStructs<ImgType> getDataFromGpu() {
         return std::move(lacs);
     }
 
@@ -572,13 +662,24 @@ public:
         // Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures
         checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream));
 
+
+        // SAMPLE under development
+        sample();
+        // Shrink 'parts' to the actual particle count before the copy
+        parts.resize(iAprInfo.total_number_particles);
+        // Copy parts from GPU to CPU - it is needed before we copy data to CPU structures
+        checkCuda(cudaMemcpyAsync(parts.begin(), parts_cuda.get(), iAprInfo.total_number_particles * sizeof(ImgType), cudaMemcpyDeviceToHost, iStream));
+
         // Synchronize one last time - at this moment all data from the GPU has been copied to the CPU
         checkCuda(cudaStreamSynchronize(iStream));
 
         // Prepare CPU structures
         lacs.xz_end_vec.copy(xz_end_vec);
         lacs.level_xz_vec.copy(level_xz_vec);
         lacs.y_vec.copy(y_vec);
+        lacs.parts.copy(parts);
     }
 
     ~GpuProcessingTaskImpl() {}
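With LinearAccessCudaStructs now templated on the image type, the sampled particle intensities travel back with the access vectors in a single result object. A hedged end-to-end sketch (constructor arguments elided):

    GpuProcessingTask<uint16_t> task(/* input image, APR info, parameters... */);
    task.processOnGpu();               // builds the access structure and samples particles
    auto lacs = task.getDataFromGpu(); // LinearAccessCudaStructs<uint16_t>
    // lacs.level_xz_vec / xz_end_vec / y_vec describe the structure;
    // lacs.parts holds one intensity per particle, indexed like y_vec.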
@@ -595,7 +696,7 @@ template <typename ImgType>
 GpuProcessingTask<ImgType>::GpuProcessingTask(GpuProcessingTask&&) = default;
 
 template <typename ImgType>
-LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() { return impl->getDataFromGpu(); }
+LinearAccessCudaStructs<ImgType> GpuProcessingTask<ImgType>::getDataFromGpu() { return impl->getDataFromGpu(); }
 
 template <typename ImgType>
 void GpuProcessingTask<ImgType>::processOnGpu() { impl->processOnGpu(); }
@@ -606,6 +707,7 @@ template class GpuProcessingTask<int>;
 template class GpuProcessingTask<uint16_t>;
 template class GpuProcessingTask<float>;
 
+
 // ================================== TEST helpers ==============
 // TODO: should be moved somewhere
 