Skip to content

Commit 1db4abf

Browse files
committed
Sampling on GPU added - THIS CODE is still to be cleaned up - it is pushed for backup reasons only
1 parent e686349 commit 1db4abf

File tree

8 files changed

+152
-16
lines changed

8 files changed

+152
-16
lines changed

examples/Example_get_apr.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,15 @@ int runAPR(cmdLineOptions options) {
8181
ParticleData<uint16_t> particle_intensities;
8282
particle_intensities.sample_image(apr, input_img); // sample your particles from your image
8383
//Below is IO and outputting of the Implied Resolution Function through the Particle Cell level.
84+
std::cout << apr.linearAccess.y_vec.size() << " particles in APR" << std::endl;
85+
std::cout << particle_intensities.size() << " intensities in CPU in APR" << std::endl;
86+
std::cout << aprConverter.parts.size() << " intensities in GPU in APR" << std::endl;
87+
88+
for (int i = 0 ; i < particle_intensities.size(); ++i) {
89+
if (particle_intensities[i] != aprConverter.parts[i]) {
90+
std::cout << "Mismatch at " << i << " CPU: " << particle_intensities[i] << " GPU: " << aprConverter.parts[i] << std::endl;
91+
}
92+
}
8493

8594
//output
8695
std::string save_loc = options.output_dir;

src/algorithm/APRConverter.hpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ class APRConverter {
6666
APRTimer computation_timer;
6767
APRParameters par;
6868

69+
// TODO: this is a temporary place to put particle intensity data. It should be thought over how to move it from GPU
70+
// but for now and tests this is the best place.
71+
VectorData<ImageType> parts;
72+
6973
template <typename T>
7074
bool get_apr(APR &aAPR, PixelData<T> &input_image);
7175

@@ -420,6 +424,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
420424
aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
421425
aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
422426
aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
427+
parts.copy(linearAccessGpu.parts);
423428
aAPR.apr_initialized = true;
424429

425430
std::cout << "CUDA pipeline finished!\n";
@@ -509,6 +514,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
509514
aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec);
510515
aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec);
511516
aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec);
517+
parts = std::move(linearAccessGpu.parts);
512518

513519
aAPR.apr_initialized = true;
514520
}
@@ -596,7 +602,7 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
596602
return get_apr_cpu(aAPR, input_image);
597603
#else
598604
// return get_apr_cuda(aAPR, input_image);
599-
std::vector<PixelData<T> *> input_images(1, &input_image);
605+
std::vector<PixelData<T> *> input_images(3*11, &input_image);
600606
return get_apr_cuda_multistreams(aAPR, input_images, 3);
601607
#endif
602608
}

src/algorithm/ComputeGradientCuda.cu

Lines changed: 107 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,62 @@ void runBsplineOffsetAndCopyOriginal(ImgType *cudaImage, ImgType *cudaCopy, floa
338338
};
339339

340340

341+
template <typename T>
342+
__global__ void printKernel(T *input, size_t length) {
343+
printf("DOWNSAMPLED: ");
344+
for (int i = 0; i < length; i++) printf("%d ", input[i]);
345+
printf("\n");
346+
}
347+
348+
template <typename ImgType>
349+
void runPrint(ImgType *cudaImage, size_t length, cudaStream_t aStream) {
350+
printKernel<<<1,1, 0, aStream>>>(cudaImage, length);
351+
};
352+
353+
354+
template <typename T>
355+
__global__ void sampleKernel(T *downsampledLevel, T *parts_cuda, int level, int xLen, int yLen, int zLen, uint64_t *level_xz_vec_cuda, uint64_t *xz_end_vec_cuda, uint16_t *y_vec) {
356+
const int xi = (blockIdx.x * blockDim.x) + threadIdx.x;
357+
const int zi = (blockIdx.z * blockDim.z) + threadIdx.z;
358+
if (xi >= xLen || zi >= zLen) return;
359+
uint64_t level_start = level_xz_vec_cuda[level];
360+
uint64_t offset = xi + zi * xLen;
361+
auto xz_start = level_start + offset;
362+
363+
auto begin_index = xz_end_vec_cuda[xz_start - 1];
364+
auto end_index = xz_end_vec_cuda[xz_start];
365+
366+
for (size_t idx = begin_index; idx < end_index; ++idx) {
367+
int y = y_vec[idx];
368+
size_t imageIdx = zi * xLen * yLen + xi * yLen + y;
369+
parts_cuda[idx] = downsampledLevel[imageIdx];
370+
}
371+
}
372+
373+
template <typename ImgType>
374+
void runSampleParts(ImgType** downsampled, GenInfo &aprInfo, ImgType *parts_cuda, uint64_t *level_xz_vec_cuda, uint64_t *xz_end_vec_cuda, uint16_t *y_vec, cudaStream_t aStream) {
375+
// std::cout << aprInfo << std::endl;
376+
// Run kernels for each level
377+
for (int level = aprInfo.l_min; level <= aprInfo.l_max; level++) {
378+
// std::cout << "Processing level " << level << std::endl;
379+
dim3 threadsPerBlock(128, 1, 8);
380+
dim3 numBlocks((aprInfo.x_num[level] + threadsPerBlock.x - 1) / threadsPerBlock.x,
381+
1,
382+
(aprInfo.z_num[level] + threadsPerBlock.z - 1) / threadsPerBlock.z);
383+
// std::cout << downsampled[level] << std::endl;
384+
// std::cout << parts_cuda << std::endl;
385+
// std::cout << aprInfo.x_num[level] << std::endl;
386+
// std::cout << aprInfo.y_num[level] << std::endl;
387+
// std::cout << aprInfo.z_num[level] << std::endl;
388+
// std::cout << level_xz_vec_cuda << std::endl;
389+
// std::cout << xz_end_vec_cuda << std::endl;
390+
// std::cout << y_vec << std::endl;
391+
sampleKernel<<<numBlocks, threadsPerBlock, 0, aStream>>>(downsampled[level], parts_cuda, level, aprInfo.x_num[level], aprInfo.y_num[level], aprInfo.z_num[level], level_xz_vec_cuda, xz_end_vec_cuda, y_vec);
392+
}
393+
394+
};
395+
396+
341397
class CudaStream {
342398
cudaStream_t iStream;
343399

@@ -407,7 +463,7 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
407463
ParticleCellTreeCuda pctc;
408464

409465
ScopedCudaMemHandler<uint16_t*, JUST_ALLOC> y_vec_cuda; // for LinearAccess
410-
LinearAccessCudaStructs lacs;
466+
LinearAccessCudaStructs<ImgType> lacs;
411467

412468
// Padded memory for local_scale_temp and local_scale_temp2
413469
ScopedCudaMemHandler<float*, JUST_ALLOC> lstPadded;
@@ -422,6 +478,8 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
422478
ScopedCudaMemHandler<uint64_t *, JUST_ALLOC> level_xz_vec_cuda; //(level_xz_vec.data(), level_xz_vec.size(), aStream);
423479
GenInfoGpuAccess giga;
424480
uint64_t counter_total = 1;
481+
VectorData<ImgType> parts;
482+
ScopedCudaMemHandler<ImgType *, JUST_ALLOC> parts_cuda;
425483

426484
// Preallocated memory for bspline shift computation
427485
VectorData<ImgType> minVector{true};
@@ -455,11 +513,13 @@ public:
455513
boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
456514
boundary{nullptr, boundaryLen, iStream},
457515
pctc(iAprInfo, iStream),
458-
y_vec_cuda(nullptr, iAprInfo.getSize(), iStream),
516+
y_vec_cuda(nullptr, iAprInfo.getSize()/2, iStream), // TODO: only half capacity
459517
xz_end_vec(true),
460518
level_xz_vec(true),
461519
y_vec(true),
462-
giga(iAprInfo, iStream)
520+
giga(iAprInfo, iStream),
521+
parts(true),
522+
parts_cuda(nullptr, iAprInfo.getSize()/2, iStream) // TODO: only half capacity
463523
{
464524
splineCudaX = cudax.first;
465525
splineCudaY = cuday.first;
@@ -491,6 +551,9 @@ public:
491551
xz_end_vec_cuda.initialize(xz_end_vec.data(), xz_end_vec.size(), iStream);
492552
level_xz_vec_cuda.initialize(level_xz_vec.data(), level_xz_vec.size(), iStream);
493553

554+
parts.resize(iAprInfo.getSize()); // resize it to worst case -> same number particles as pixels in input image
555+
556+
494557
isErrorDetectedPinned.resize(1);
495558
isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream);
496559

@@ -515,7 +578,34 @@ public:
515578
resultsMax.initialize(maxVector.data(), numOfBlocks, iStream);
516579
}
517580

518-
LinearAccessCudaStructs getDataFromGpu() {
581+
void sample() {
582+
// Prepare memory for downsampled pyramid
583+
// Use 'image' as a memory for all levels (but max one)
584+
// since data there is 'destroyed' anyway
585+
// via bspline filtering and gradient computation
586+
// and as the highest level of pyramid use imageSampling which is
587+
// a copy of original image at full resolution
588+
int l_max = iAprInfo.l_max;
589+
int l_min = iAprInfo.l_min;
590+
ImgType* downsampled[l_max + 1];
591+
downsampled[l_max] = imageSampling.get();
592+
size_t levelOffset = 0;
593+
for (int l = l_max-1; l >= l_min; --l) {
594+
size_t level_size = iAprInfo.x_num[l] * iAprInfo.y_num[l] * iAprInfo.z_num[l];
595+
// std::cout << l << " dim: " << iAprInfo.getDimension(l) << " " << iAprInfo.getSize(l) << " " << level_size << std::endl;
596+
downsampled[l] = image.get() + levelOffset;
597+
levelOffset += iAprInfo.getSize(l);
598+
599+
runDownsampleMean(downsampled[l+1], downsampled[l], iAprInfo.x_num[l+1], iAprInfo.y_num[l+1], iAprInfo.z_num[l+1], iStream);
600+
}
601+
602+
// VectorData<uint64_t> xz_end_vec;
603+
// VectorData<uint64_t> level_xz_vec;
604+
// VectorData<uint16_t> y_vec;
605+
runSampleParts(downsampled, iAprInfo, parts_cuda.get(), level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), iStream);
606+
}
607+
608+
LinearAccessCudaStructs<ImgType> getDataFromGpu() {
519609
return std::move(lacs);
520610
}
521611

@@ -572,13 +662,24 @@ public:
572662
// Copy y_vec from GPU to CPU and synchronize last time - it is needed before we copy data to CPU structures
573663
checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda.get(), iAprInfo.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, iStream));
574664

665+
666+
// SAMPLE under development
667+
sample();
668+
parts.resize(iAprInfo.total_number_particles);
669+
// Copy sampled particle intensities (parts) from GPU to CPU - it is needed before we copy data to CPU structures
670+
checkCuda(cudaMemcpyAsync(parts.begin(), parts_cuda.get(), iAprInfo.total_number_particles * sizeof(ImgType), cudaMemcpyDeviceToHost, iStream));
671+
672+
673+
674+
575675
// Synchronize last time - at that moment all data from GPU is copied to CPU
576676
checkCuda(cudaStreamSynchronize(iStream));
577677

578678
// Prepare CPU structures
579679
lacs.xz_end_vec.copy(xz_end_vec);
580680
lacs.level_xz_vec.copy(level_xz_vec);
581681
lacs.y_vec.copy(y_vec);
682+
lacs.parts.copy(parts);
582683
}
583684

584685
~GpuProcessingTaskImpl() {}
@@ -595,7 +696,7 @@ template <typename ImgType>
595696
GpuProcessingTask<ImgType>::GpuProcessingTask(GpuProcessingTask&&) = default;
596697

597698
template <typename ImgType>
598-
LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() {return impl->getDataFromGpu();}
699+
LinearAccessCudaStructs<ImgType> GpuProcessingTask<ImgType>::getDataFromGpu() {return impl->getDataFromGpu();}
599700

600701
template <typename ImgType>
601702
void GpuProcessingTask<ImgType>::processOnGpu() {impl->processOnGpu();}
@@ -606,6 +707,7 @@ template class GpuProcessingTask<int>;
606707
template class GpuProcessingTask<uint16_t>;
607708
template class GpuProcessingTask<float>;
608709

710+
609711
// ================================== TEST helpers ==============
610712
// TODO: should be moved somewhere
611713

src/algorithm/ComputeGradientCuda.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class GpuProcessingTask {
4747
~GpuProcessingTask();
4848
GpuProcessingTask(GpuProcessingTask&&);
4949

50-
LinearAccessCudaStructs getDataFromGpu();
50+
LinearAccessCudaStructs<ImgType> getDataFromGpu();
5151
void processOnGpu();
5252

5353
void setBsplineOffset(float bspline_offset);

src/data_structures/APR/access/LinearAccessCuda.cu

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,8 @@ void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCu
460460
* - copy it back to CPU
461461
* - returns all the structure
462462
*/
463-
LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct) {
463+
template <typename ImgType>
464+
LinearAccessCudaStructs<ImgType> initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct) {
464465

465466
cudaStream_t aStream = nullptr;
466467

@@ -526,14 +527,21 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara
526527
p_map.downloadPCTfromGPU(pct);
527528

528529

529-
LinearAccessCudaStructs lac;
530+
LinearAccessCudaStructs<ImgType> lac;
530531
lac.y_vec.swap(y_vec);
531532
lac.xz_end_vec.swap(xz_end_vec);
532533
lac.level_xz_vec.swap(level_xz_vec);
533534

534535
return lac;
535536
}
536537

538+
// explicit instantiation of handled types
539+
template LinearAccessCudaStructs<float> initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
540+
template LinearAccessCudaStructs<uint16_t> initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
541+
template LinearAccessCudaStructs<int> initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
542+
template LinearAccessCudaStructs<uint8_t> initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
543+
544+
537545
void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda, const uint64_t *level_xz_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, GenInfoGpuAccess &giga, const APRParameters &apr_parameters, uint64_t counter_total, cudaStream_t aStream) {
538546

539547
const uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2;

src/data_structures/APR/access/LinearAccessCuda.hpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,27 @@
66
#include "data_structures/APR/GenInfo.hpp"
77
#include "algorithm/ParticleCellTreeCuda.cuh"
88

9-
typedef struct {
9+
template <typename ImgType>
10+
struct LinearAccessCudaStructs {
1011
VectorData<uint16_t> y_vec;
1112
VectorData<uint64_t> xz_end_vec;
1213
VectorData<uint64_t> level_xz_vec;
13-
} LinearAccessCudaStructs;
14+
15+
// temporarily added
16+
VectorData<ImgType> parts;
17+
};
18+
19+
// explicit instantiation of handled types
20+
template class LinearAccessCudaStructs<uint8_t>;
21+
template class LinearAccessCudaStructs<int>;
22+
template class LinearAccessCudaStructs<uint16_t>;
23+
template class LinearAccessCudaStructs<float>;
1424

1525
#include "data_structures/APR/access/GenInfoGpuAccess.cuh"
1626

1727
// This is for testing purposes only
18-
LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
28+
template <typename ImgType>
29+
LinearAccessCudaStructs<ImgType> initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
1930

2031
void computeLinearStructureCuda(uint16_t *y_vec_cuda, uint64_t *xz_end_vec_cuda, const uint64_t *level_xz_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, GenInfoGpuAccess &giga, const APRParameters &apr_parameters, uint64_t counter_total, cudaStream_t aStream);
2132

test/FullPipelineCudaTest.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ namespace {
263263
getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par);
264264
computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
265265
auto pct = computeOvpcCuda(local_scale_temp_GPU, giGpu);
266-
auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct);
266+
auto linearAccessGpu = initializeLinearStructureCuda<ImageType>(giGpu, par, pct);
267267
timer.stop_timer();
268268

269269
// Compare GPU vs CPU - expect exactly same result

test/LinearAccessCudaTest.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ TEST(LinearAccessCudaTest, optimizationForSmallLevels) {
272272
par.neighborhood_optimization = true;
273273

274274
// --- Method under test
275-
auto linearAccess = initializeLinearStructureCuda(gi, par, pct);
275+
auto linearAccess = initializeLinearStructureCuda<uint16_t>(gi, par, pct);
276276

277277
// ---- Verify output
278278
std::vector<uint16_t> expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz
@@ -312,7 +312,7 @@ TEST(LinearAccessCudaTest, optimizationForSmallLevelsVScpu) {
312312

313313
// --- Method under test
314314
linearAccess.initialize_linear_structure(par, pct);
315-
auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu);
315+
auto linearAccessGpu = initializeLinearStructureCuda<uint16_t>(giGpu, par, pctGpu);
316316

317317
EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
318318
EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0);
@@ -359,7 +359,7 @@ TEST(LinearAccessCudaTest, testGPUvsCPUforDifferentSizes) {
359359
t.stop_timer();
360360

361361
t.start_timer("_________________________ GPU");
362-
auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu);
362+
auto linearAccessGpu = initializeLinearStructureCuda<uint16_t>(giGpu, par, pctGpu);
363363
t.stop_timer();
364364

365365

0 commit comments

Comments
 (0)