Skip to content

Commit 4093552

Browse files
committed
Bspline Offset is now computed on GPU + copy of original image for sampling
1 parent 0a44736 commit 4093552

File tree

3 files changed

+96
-135
lines changed

3 files changed

+96
-135
lines changed

src/algorithm/APRConverter.hpp

Lines changed: 5 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,6 @@ class APRConverter {
7676
template <typename T>
7777
bool get_apr_cuda(APR &aAPR, PixelData<T> &input_image);
7878
template <typename T>
79-
bool get_apr_cuda_streams(APR &aAPR, PixelData<T> &input_image);
80-
template <typename T>
8179
bool get_apr_cuda_multistreams(APR &aAPR, const std::vector<PixelData<T> *> &input_images, int numOfStreams = 3);
8280
#endif
8381

@@ -406,25 +404,13 @@ inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input
406404
initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
407405

408406
computation_timer.start_timer("init_mem");
409-
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
407+
PixelData<ImageType> image_temp(input_image, true /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full-size copy of the image)
410408

411409
/////////////////////////////////
412410
/// Pipeline
413411
////////////////////////
414-
// offset image by factor (this is required if there are zero areas in the background with
415-
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
416-
// Warning both of these could result in over-flow!
417-
418-
if (std::is_floating_point<ImageType>::value) {
419-
image_temp.copyFromMesh(input_image);
420-
} else {
421-
bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
422-
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
423-
}
424412

425413
GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, aAPR.level_max());
426-
// std::cout << "after gpt \n";
427-
gpt.setBsplineOffset(bspline_offset);
428414
gpt.processOnGpu();
429415
auto linearAccessGpu = gpt.getDataFromGpu();
430416

@@ -462,7 +448,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
462448
return false;
463449
}
464450

465-
// Reduce number of streams to number of images if there are less images than streams
451+
// Reduce number of streams to number of images if there are fewer images than streams
466452
if (numOfImages < numOfStreams) numOfStreams = numOfImages;
467453

468454
// Use first image to initialize the APR - all other images should have the same dimensions
@@ -476,7 +462,7 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
476462
std::vector<PixelData<ImageType>> tempImages;
477463
std::cout << "allocating PixelData for " << numOfStreams << " streams" << std::endl;
478464
for (int i = 0; i < numOfStreams; ++i) {
479-
tempImages.emplace_back(PixelData<T>(*input_image, false /* don't copy */, true /* pinned memory */));
465+
tempImages.emplace_back(PixelData<T>(*input_image, true /* copy */, true /* pinned memory */));
480466
}
481467

482468
/////////////////////////////////
@@ -497,22 +483,10 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
497483
t.start_timer("GPU processing...");
498484
// Saturate all the streams with first images
499485
for (int i = 0; i < numOfStreams; ++i) {
500-
501-
// offset image by factor (this is required if there are zero areas in the background with
502-
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
503-
// Warning both of these could result in over-flow!
504-
if (std::is_floating_point<ImageType>::value) {
505-
tempImages[i].copyFromMesh(*input_images[i]);
506-
} else {
507-
bspline_offset = compute_bspline_offset<ImageType>(*input_images[i], par.lambda);
508-
tempImages[i].copyFromMeshWithUnaryOp(*input_images[i], [=](const auto &a) { return (a + bspline_offset); });
509-
}
510486
std::cout << "Processing image " << i << " on stream " << i << std::endl;
511-
gpts[i].setBsplineOffset(bspline_offset);
512487
gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
513488
}
514489

515-
516490
// Main loop - get results from GPU and send new images to the streams (if any left)
517491
for (int s = 0; s < numOfImages; ++s) {
518492
int streamNum = s % numOfStreams;
@@ -525,14 +499,8 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
525499
// We have 'numOfImages - numOfStreams' left to process after saturating the streams with first images
526500
if (s < numOfImages - numOfStreams) {
527501
int imageToProcess = s + numOfStreams;
528-
if (std::is_floating_point<ImageType>::value) {
529-
tempImages[streamNum].copyFromMesh(*input_images[imageToProcess]);
530-
} else {
531-
bspline_offset = compute_bspline_offset<ImageType>(*input_images[imageToProcess], par.lambda);
532-
tempImages[streamNum].copyFromMeshWithUnaryOp(*input_images[imageToProcess], [=](const auto &a) { return (a + bspline_offset); });
533-
}
502+
tempImages[streamNum].copyFromMesh(*input_images[imageToProcess]);
534503
std::cout << "Processing image " << imageToProcess << " on stream " << streamNum << std::endl;
535-
gpts[streamNum].setBsplineOffset(bspline_offset);
536504
gpts_futures[streamNum] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[streamNum]);
537505
}
538506

@@ -554,97 +522,6 @@ inline bool APRConverter<ImageType>::get_apr_cuda_multistreams(APR &aAPR, const
554522
std::cout << "CUDA multistream pipeline finished!\n";
555523
return true;
556524
}
557-
558-
/**
559-
* Implementation of pipeline for GPU/CUDA and multiple streams
560-
* NOTE: Currently only one image is processed multiple times just get an idea how fast it can be.
561-
* Finally, it should be able to process incoming stream of data (sequence of images).
562-
*
563-
* @param aAPR - the APR data structure
564-
* @param input_image - input image
565-
*/
566-
template<typename ImageType> template<typename T>
567-
inline bool APRConverter<ImageType>::get_apr_cuda_streams(APR &aAPR, PixelData<T>& input_image) {
568-
// Initialize APR and memory for the pipeline
569-
if (!initPipelineAPR(aAPR, input_image)) return false;
570-
initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
571-
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full size copy of the image)
572-
573-
/////////////////////////////////
574-
/// Pipeline
575-
/////////////////////////////////
576-
577-
// offset image by factor (this is required if there are zero areas in the background with
578-
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
579-
// Warning both of these could result in over-flow!
580-
if (std::is_floating_point<ImageType>::value) {
581-
image_temp.copyFromMesh(input_image);
582-
} else {
583-
bspline_offset = compute_bspline_offset<ImageType>(input_image, par.lambda);
584-
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
585-
}
586-
587-
// Run input on the GPU streams
588-
constexpr int numOfStreams = 3; // number of streams to use for parallel processing
589-
constexpr int repetitionsPerStream = 3; // number of repetitions per stream to simulate processing of multiple images
590-
591-
APRTimer ttt(true);
592-
593-
ttt.start_timer("-----------------------------> Whole GPU pipeline with repetitions and MEMORY");
594-
{
595-
APRTimer t(true);
596-
std::vector<GpuProcessingTask<ImageType>> gpts;
597-
598-
t.start_timer("Creating GPTS");
599-
std::vector<std::future<void>> gpts_futures; gpts_futures.resize(numOfStreams);
600-
for (int i = 0; i < numOfStreams; ++i) {
601-
gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, aAPR.level_max()));
602-
}
603-
t.stop_timer();
604-
605-
t.start_timer("-----------------------------> Whole GPU pipeline with repetitions");
606-
{
607-
APRTimer tt(false);
608-
// Run processOnGpu() asynchronously - it will handle transfering data from CPU to GPU and run whole pipeline
609-
for (int i = 0; i < numOfStreams; ++i) {
610-
gpts[i].setBsplineOffset(bspline_offset);
611-
gpts_futures[i] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[i]);
612-
}
613-
614-
for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
615-
int c = i % numOfStreams;
616-
617-
// Get data from GpuProcessingTask - get() will block until the task is finished
618-
gpts_futures[c].get();
619-
auto linearAccessGpu = gpts[c].getDataFromGpu();
620-
621-
// in theory, we get new data and send them to task
622-
if (i < numOfStreams * (repetitionsPerStream - 1)) {
623-
gpts[c].setBsplineOffset(bspline_offset);
624-
gpts_futures[c] = std::async(std::launch::async, &GpuProcessingTask<ImageType>::processOnGpu, &gpts[c]);
625-
}
626-
627-
// Fill APR data structure with data from GPU
628-
aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
629-
aAPR.linearAccess.y_vec = std::move(linearAccessGpu.y_vec);
630-
aAPR.linearAccess.xz_end_vec = std::move(linearAccessGpu.xz_end_vec);
631-
aAPR.linearAccess.level_xz_vec = std::move(linearAccessGpu.level_xz_vec);
632-
633-
aAPR.apr_initialized = true;
634-
}
635-
}
636-
auto allT = t.stop_timer();
637-
std::cout << "Time per image: " << allT / (numOfStreams*repetitionsPerStream) << " seconds\n";
638-
std::cout << "Bandwidth:" << (input_image.size() / (allT / (numOfStreams*repetitionsPerStream)) / 1024 / 1024) << " MB/s\n";
639-
}
640-
auto allT = ttt.stop_timer();
641-
float tpi = allT / (numOfStreams*repetitionsPerStream);
642-
std::cout << "Time per image: " << tpi << " seconds\n";
643-
std::cout << "Image size: " << (input_image.size() / 1024 / 1024) << " MB\n";
644-
std::cout << "Bandwidth:" << (input_image.size() / tpi / 1024 / 1024) << " MB/s\n";
645-
646-
return true;
647-
}
648525
#endif
649526

650527

@@ -719,8 +596,7 @@ inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_imag
719596
return get_apr_cpu(aAPR, input_image);
720597
#else
721598
// return get_apr_cuda(aAPR, input_image);
722-
// return get_apr_cuda_streams(aAPR, input_image);
723-
std::vector<PixelData<T> *> input_images(3*66, &input_image);
599+
std::vector<PixelData<T> *> input_images(1, &input_image);
724600
return get_apr_cuda_multistreams(aAPR, input_images, 3);
725601
#endif
726602
}

src/algorithm/ComputeGradientCuda.cu

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,55 @@ void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cu
289289
rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax);
290290
}
291291

292+
/**
293+
* Compute bspline offset for APRConverter of integer type ImageType
294+
*/
295+
template<typename T>
296+
float computeBsplineOffset(T *cudaImage, PixelDataDim dim, float lambda, int numOfBlocks, ScopedCudaMemHandler<T*, JUST_ALLOC> &resultsMin, ScopedCudaMemHandler<T*, JUST_ALLOC> &resultsMax, VectorData<T> &minVector, VectorData<T> &maxVector, cudaStream_t aStream) {
297+
298+
// if bspline smoothing is disabled, there is no need for an offset
299+
if(lambda <= 0) return 0;
300+
301+
// Run kernel and copy data back to CPU
302+
runFindMinMax(cudaImage, dim, aStream, resultsMin.get(), resultsMax.get(), numOfBlocks, numOfThreads);
303+
resultsMin.copyD2H();
304+
resultsMax.copyD2H();
305+
checkCuda(cudaStreamSynchronize(aStream));
306+
307+
// compute offset to center the intensities in the ImageType range (can be negative)
308+
float offset = (std::numeric_limits<T>::max() - (maxVector[0] - minVector[0])) / 2 - minVector[0];
309+
310+
// clamp the offset to [-100, 100]
311+
return std::max(std::min(offset, 100.f), -100.f);
312+
}
313+
314+
315+
/**
316+
* Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise.
317+
* @param input
318+
* @param output
319+
* @param length - len of input/output arrays
320+
* @param thresholdLevel
321+
*/
322+
template <typename T>
323+
__global__ void bsplineOffsetAndCopyOriginal(T *input, T *copy, size_t length, float bspline_offset) {
324+
size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
325+
326+
if (idx < length) {
327+
auto v = input[idx];
328+
copy[idx] = v;
329+
input[idx] = v + bspline_offset;
330+
}
331+
}
332+
333+
template <typename ImgType>
334+
void runBsplineOffsetAndCopyOriginal(ImgType *cudaImage, ImgType *cudaCopy, float bspline_offset, const PixelDataDim &dim, cudaStream_t aStream) {
335+
dim3 threadsPerBlock(128);
336+
dim3 numBlocks((dim.size() + threadsPerBlock.x - 1)/threadsPerBlock.x);
337+
bsplineOffsetAndCopyOriginal<<<numBlocks,threadsPerBlock, 0, aStream>>>(cudaImage, cudaCopy, dim.size(), bspline_offset);
338+
};
339+
340+
292341
class CudaStream {
293342
cudaStream_t iStream;
294343

@@ -332,6 +381,7 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
332381

333382
// cuda stuff - memory and stream to be used
334383
ScopedCudaMemHandler<const PixelData<ImgType>, JUST_ALLOC> image;
384+
ScopedCudaMemHandler<const PixelData<ImgType>, JUST_ALLOC> imageSampling;
335385
ScopedCudaMemHandler<PixelData<ImgType>, JUST_ALLOC> gradient;
336386
ScopedCudaMemHandler<PixelData<float>, JUST_ALLOC> local_scale_temp;
337387
ScopedCudaMemHandler<PixelData<float>, JUST_ALLOC> local_scale_temp2;
@@ -373,6 +423,13 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
373423
GenInfoGpuAccess giga;
374424
uint64_t counter_total = 1;
375425

426+
// Preallocated memory for bspline shift computation
427+
VectorData<ImgType> minVector{true};
428+
VectorData<ImgType> maxVector{true};
429+
ScopedCudaMemHandler<ImgType*, JUST_ALLOC> resultsMin;
430+
ScopedCudaMemHandler<ImgType*, JUST_ALLOC> resultsMax;
431+
int numOfBlocks;
432+
376433
public:
377434

378435
// TODO: Remove need for passing 'levels' to GpuProcessingTask
@@ -383,6 +440,7 @@ public:
383440
iCpuLevels(levels),
384441
iStream(cudaStream.get()),
385442
image (inputImage, iStream),
443+
imageSampling (inputImage, iStream),
386444
gradient (levels, iStream),
387445
local_scale_temp (levels, iStream),
388446
local_scale_temp2 (levels, iStream),
@@ -435,13 +493,35 @@ public:
435493

436494
isErrorDetectedPinned.resize(1);
437495
isErrorDetectedCuda.initialize(isErrorDetectedPinned.data(), 1, iStream);
496+
497+
498+
499+
// In nvidia GPUs maximum number of threads per SM is multiplication of 512 (usually 1536 or 2048)
500+
// Calculate number of blocks to saturate whole SMs
501+
// Multiply it by 8 to have more smaller blocks to have better load balancing in case GPU is busy with other tasks
502+
cudaDeviceProp deviceProp;
503+
cudaGetDeviceProperties(&deviceProp, 0);
504+
const int smCount = deviceProp.multiProcessorCount;
505+
const int numOfThreadsPerSM = deviceProp.maxThreadsPerMultiProcessor;
506+
constexpr int numOfThreads = 512;
507+
const int numOfBlocksPerSM = numOfThreadsPerSM / 512;
508+
const int maxNumberOfBlocks = smCount * numOfBlocksPerSM * 8;
509+
const size_t numOfElements = inputImage.getDimension().size();
510+
numOfBlocks = std::min(maxNumberOfBlocks, static_cast<int>((numOfElements + numOfThreads -1) / numOfThreads) );
511+
512+
minVector.resize(numOfBlocks);
513+
maxVector.resize(numOfBlocks);
514+
resultsMin.initialize(minVector.data(), numOfBlocks, iStream);
515+
resultsMax.initialize(maxVector.data(), numOfBlocks, iStream);
438516
}
439517

440518
LinearAccessCudaStructs getDataFromGpu() {
441519
return std::move(lacs);
442520
}
443521

444522
void processOnGpu() {
523+
524+
445525
// Set it and copy first before copying the image
446526
// It improves *a lot* performance even though it is needed later in computeLinearStructureCuda()
447527
iAprInfo.total_number_particles = 0; // reset total_number_particles to 0
@@ -450,6 +530,17 @@ public:
450530

451531
image.copyH2D();
452532

533+
// offset image by factor (this is required if there are zero areas in the background with
534+
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
535+
// Warning both of these could result in over-flow!
536+
if (std::is_floating_point<ImgType>::value) {
537+
iBsplineOffset = 0;
538+
} else {
539+
iBsplineOffset = computeBsplineOffset(image.get(), iCpuImage.getDimension(), iParameters.lambda, numOfBlocks, resultsMin, resultsMax, minVector, maxVector, iStream);
540+
}
541+
runBsplineOffsetAndCopyOriginal(image.get(), imageSampling.get(), iBsplineOffset /*bspline_offset*/, iCpuImage.getDimension(), iStream);
542+
543+
453544
getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(),
454545
splineCudaX, splineCudaY, splineCudaZ, boundary.get(), isErrorDetectedPinned[0], isErrorDetectedCuda,
455546
iBsplineOffset, iParameters, iStream);
@@ -490,8 +581,6 @@ public:
490581
lacs.y_vec.copy(y_vec);
491582
}
492583

493-
void setBsplineOffset(float offset) {iBsplineOffset = offset;}
494-
495584
~GpuProcessingTaskImpl() {}
496585
};
497586

@@ -511,9 +600,6 @@ LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() {return imp
511600
template <typename ImgType>
512601
void GpuProcessingTask<ImgType>::processOnGpu() {impl->processOnGpu();}
513602

514-
template <typename ImgType>
515-
void GpuProcessingTask<ImgType>::setBsplineOffset(float offset) {impl->setBsplineOffset(offset);}
516-
517603
// explicit instantiation of handled types
518604
template class GpuProcessingTask<uint8_t>;
519605
template class GpuProcessingTask<int>;

test/FullPipelineCudaTest.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,6 @@ namespace {
340340
// Calculate pipeline on GPU
341341
timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
342342
GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, maxLevel);
343-
gpt.setBsplineOffset(bspline_offset);
344343
gpt.processOnGpu();
345344
auto linearAccessGpu = gpt.getDataFromGpu();
346345
giGpu.total_number_particles = linearAccessGpu.y_vec.size();

0 commit comments

Comments
 (0)