Skip to content

Commit 9ff0580

Browse files
committed
Full GPU pipeline works1
1 parent 1d4e549 commit 9ff0580

File tree

6 files changed

+105
-19
lines changed

6 files changed

+105
-19
lines changed

src/algorithm/ComputeGradientCuda.cu

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "misc/CudaMemory.cuh"
1616
#include "algorithm/ParticleCellTreeCuda.cuh"
1717
#include "algorithm/PullingSchemeCuda.hpp"
18+
#include "data_structures/APR/access/LinearAccessCuda.hpp"
1819

1920
#include "dsGradient.cuh"
2021
#include "invBspline.cuh"
@@ -232,6 +233,9 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
232233

233234
ParticleCellTreeCuda pctc;
234235

236+
ScopedCudaMemHandler<uint16_t*, JUST_ALLOC> y_vec; // for LinearAccess
237+
LinearAccessCudaStructs lacs;
238+
235239
/**
236240
* @return newly created stream
237241
*/
@@ -264,7 +268,8 @@ public:
264268
bc4(params.bc4.get(), params.k0, iStream),
265269
boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
266270
boundary{nullptr, boundaryLen, iStream},
267-
pctc(iAprInfo, iStream)
271+
pctc(iAprInfo, iStream),
272+
y_vec(nullptr, iAprInfo.getSize(), iStream)
268273
{
269274
// std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n";
270275
std::cout << iCpuImage << std::endl;
@@ -279,12 +284,13 @@ public:
279284
std::cout << "SEND time: " << ct.microseconds() - start << std::endl;
280285
}
281286

282-
void getDataFromGpu() {
283-
CurrentTime ct;
284-
uint64_t start = ct.microseconds();
285-
local_scale_temp.copyD2H();
286-
checkCuda(cudaStreamSynchronize(iStream));
287-
std::cout << "RCV time: " << ct.microseconds() - start << std::endl;
287+
LinearAccessCudaStructs getDataFromGpu() {
288+
// CurrentTime ct;
289+
// uint64_t start = ct.microseconds();
290+
// local_scale_temp.copyD2H();
291+
// checkCuda(cudaStreamSynchronize(iStream));
292+
// std::cout << "RCV time: " << ct.microseconds() - start << std::endl;
293+
return std::move(lacs);
288294
}
289295

290296
void processOnGpu() {
@@ -317,6 +323,8 @@ public:
317323
std::cout << "3: " << ct.microseconds() - start << std::endl;
318324

319325
computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
326+
computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream);
327+
std::cout << iAprInfo << std::endl;
320328
}
321329

322330
~GpuProcessingTaskImpl() {
@@ -339,7 +347,7 @@ template <typename ImgType>
339347
void GpuProcessingTask<ImgType>::sendDataToGpu() {impl->sendDataToGpu();}
340348

341349
template <typename ImgType>
342-
void GpuProcessingTask<ImgType>::getDataFromGpu() {impl->getDataFromGpu();}
350+
LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() {return impl->getDataFromGpu();}
343351

344352
template <typename ImgType>
345353
void GpuProcessingTask<ImgType>::processOnGpu() {impl->processOnGpu();}

src/algorithm/ComputeGradientCuda.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
#include "data_structures/Mesh/PixelData.hpp"
99
#include "algorithm/APRParameters.hpp"
10-
10+
#include "data_structures/APR/access/LinearAccessCuda.hpp"
1111

1212
// Test helpers and definitions
1313
using TypeOfRecBsplineFlags = uint16_t;
@@ -47,7 +47,7 @@ class GpuProcessingTask {
4747
GpuProcessingTask(GpuProcessingTask&&);
4848

4949
void sendDataToGpu();
50-
void getDataFromGpu();
50+
LinearAccessCudaStructs getDataFromGpu();
5151
void processOnGpu();
5252
void doAll();
5353
};

src/data_structures/APR/GenInfo.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class GenInfo {
3737
GenInfo() {}
3838
GenInfo(const PixelDataDim &dim) { init(dim); }
3939

40+
size_t getSize() const { return (size_t)y_num[l_max] * x_num[l_max] * z_num[l_max]; }
41+
4042
//initialize the information given the original dimensions
4143
void init(const PixelDataDim &dim) {
4244
init(dim.y, dim.x, dim.z);
@@ -119,6 +121,7 @@ class GenInfo {
119121
friend std::ostream & operator<<(std::ostream &os, const GenInfo &gi) {
120122
os << "GenInfo {\n";
121123
os << " Original dimensions(y/x/z): [" << gi.org_dims[0] << ", " << gi.org_dims[1] << ", " << gi.org_dims[2] << "]\n";
124+
os << " Original size: " << gi.getSize() << "\n";
122125
os << " Number of dimensions: " << static_cast<int>(gi.number_dimensions) << "\n";
123126
os << " l_min, l_max: {" << gi.l_min << " - " << gi.l_max << "}\n";
124127
os << " total number of particles: " << gi.total_number_particles << "\n";

src/data_structures/APR/access/LinearAccessCuda.cu

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,3 +592,51 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara
592592

593593
return lac;
594594
}
595+
596+
void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream) {
597+
598+
uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2;
599+
600+
VectorData<uint64_t> xz_end_vec(true);
601+
VectorData<uint64_t> level_xz_vec(true);
602+
603+
// initialize_xz_linear() - CPU impl.
604+
uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking.
605+
level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1;
606+
level_xz_vec[0] = 1; //allowing for the offset.
607+
for (int i = 0; i <= gi.l_max; ++i) {
608+
counter_total += gi.x_num[i] * gi.z_num[i];
609+
level_xz_vec[i + 1] = counter_total;
610+
}
611+
xz_end_vec.resize(counter_total, 0);
612+
613+
614+
{
615+
ScopedCudaMemHandler<uint64_t *, D2H> xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size());
616+
ScopedCudaMemHandler<uint64_t *, H2D | D2H> level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size());
617+
GenInfoGpuAccess giga(gi, aStream);
618+
if (gi.l_max <= 2) {
619+
runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream);
620+
}
621+
else {
622+
runFirstStep(gi, giga, p_map, min_type, aStream);
623+
runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream);
624+
runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream);
625+
runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, aStream);
626+
runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, counter_total, aStream);
627+
}
628+
}
629+
630+
// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; };
631+
// prt(y_vec);
632+
// prt(xz_end_vec);
633+
// prt(level_xz_vec);
634+
VectorData<uint16_t> y_vec(true);
635+
y_vec.resize(gi.total_number_particles);
636+
checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream));
637+
checkCuda(cudaStreamSynchronize(aStream));
638+
639+
lacs.y_vec.swap(y_vec);
640+
lacs.xz_end_vec.swap(xz_end_vec);
641+
lacs.level_xz_vec.swap(level_xz_vec);
642+
}

src/data_structures/APR/access/LinearAccessCuda.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "algorithm/APRParameters.hpp"
55
#include "data_structures/Mesh/PixelData.hpp"
66
#include "data_structures/APR/GenInfo.hpp"
7+
#include "algorithm/ParticleCellTreeCuda.cuh"
78

89
typedef struct {
910
VectorData<uint16_t> y_vec;
@@ -13,5 +14,7 @@ typedef struct {
1314

1415
LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
1516

17+
void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream);
18+
1619

1720
#endif //APR_LINEARACCESSCUDA_HPP

test/FullPipelineCudaTest.cpp

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ namespace {
277277
}
278278
}
279279

280-
TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GpuProcessingTask) {
280+
TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GpuProcessingTask) {
281281
APRTimer timer(true);
282282

283283
// TODO: This tets fails if dim of input image is smaller than ~8 (not sure in which direction yet)
@@ -288,11 +288,15 @@ namespace {
288288
// Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors
289289
using ImageType = float;
290290
constexpr PixelDataDim dim1{4, 4, 3};
291-
constexpr PixelDataDim dim2{163, 123, 555};
291+
constexpr PixelDataDim dim2{1024,512,512};
292292
for (int d = 0; d <= 3; d++) {
293293
auto &dim = (d % 2 == 0) ? dim1 : dim2;
294294
PixelData<ImageType> input_image = (d / 2 == 0) ? getRandInitializedMesh<ImageType>(dim, 13) :
295-
getMeshWithBlobInMiddle<ImageType>(dim);
295+
getMeshWithBlobInMiddle<ImageType>(dim);
296+
297+
// constexpr PixelDataDim dim = dim1;
298+
// PixelData<ImageType> input_image = getRandInitializedMesh<ImageType>(dim, 13);
299+
296300
int maxLevel = ceil(std::log2(dim.maxDimSize()));
297301

298302
// Initialize CPU data structures
@@ -321,32 +325,52 @@ namespace {
321325
par.dz = 1;
322326
par.neighborhood_optimization = true;
323327

328+
GenInfo aprInfo(input_image.getDimension());
329+
GenInfo giGpu(input_image.getDimension());
330+
331+
// Calculate pipeline on CPU
324332
// Calculate pipeline on CPU
325333
timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE");
326334
ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
327335
LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par);
328336
LocalParticleCellSet lpcs = LocalParticleCellSet();
329337
lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
338+
PullingScheme ps;
339+
ps.initialize_particle_cell_tree(aprInfo);
340+
lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par);
341+
ps.pulling_scheme_main();
342+
LinearAccess linearAccess;
343+
linearAccess.genInfo = &aprInfo;
344+
linearAccess.initialize_linear_structure(par, ps.getParticleCellTree());
330345
timer.stop_timer();
331346

332347

333348
// Calculate pipeline on GPU
334349
timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
335-
{
336-
GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel);
337-
gpt.doAll();
338-
}
350+
// {
351+
GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel);
352+
gpt.sendDataToGpu();
353+
gpt.processOnGpu();
354+
auto linearAccessGpu = gpt.getDataFromGpu();
355+
giGpu.total_number_particles = linearAccessGpu.y_vec.size();
356+
357+
// }
339358
timer.stop_timer();
340359

341360
// Compare GPU vs CPU - expect exactly same result
342-
EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0);
361+
EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
362+
EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0);
363+
EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0);
364+
365+
EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles);
366+
EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size());
343367

344368
}
345369
}
370+
346371
#endif // APR_USE_CUDA
347372
}
348373

349-
350374
int main(int argc, char **argv) {
351375
testing::InitGoogleTest(&argc, argv);
352376
return RUN_ALL_TESTS();

0 commit comments

Comments
 (0)