Skip to content

Commit d2fd1d0

Browse files
committed
GPU pipeline now works for APRConverter!
1 parent 3c601be commit d2fd1d0

File tree

6 files changed

+290
-300
lines changed

6 files changed

+290
-300
lines changed

examples/Example_get_apr.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ struct cmdLineOptions{
3030
bool auto_parameters = false;
3131

3232
float Ip_th = 0;
33-
float lambda = -1;
33+
float lambda = 3.0;
3434
float sigma_th = 0;
3535
float rel_error = 0.1;
3636
float grad_th = 1;

src/algorithm/APRConverter.hpp

Lines changed: 36 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ class APRConverter {
117117
PixelData<float> local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
118118
PixelData<float> local_scale_temp2;
119119

120-
void applyParameters(APR& aAPR,APRParameters& aprParameters);
120+
void applyParameters(APRParameters& aprParameters);
121121

122122
template<typename T>
123123
void computeL(APR& aAPR,PixelData<T>& input_image);
@@ -184,7 +184,7 @@ void APRConverter<ImageType>::get_apr_custom_grad_scale(APR& aAPR,PixelData<Imag
184184
}
185185

186186
aAPR.parameters = par;
187-
applyParameters(aAPR,par);
187+
applyParameters(par);
188188
solveForAPR(aAPR);
189189
generateDatastructures(aAPR);
190190

@@ -251,7 +251,7 @@ void APRConverter<ImageType>::computeL(APR& aAPR,PixelData<T>& input_image){
251251
}
252252

253253
template<typename ImageType>
254-
void APRConverter<ImageType>::applyParameters(APR& aAPR,APRParameters& aprParameters) {
254+
void APRConverter<ImageType>::applyParameters(APRParameters& aprParameters) {
255255
//
256256
// Apply the main parameters
257257
//
@@ -265,39 +265,7 @@ void APRConverter<ImageType>::applyParameters(APR& aAPR,APRParameters& aprParame
265265
}
266266
fine_grained_timer.stop_timer();
267267

268-
fine_grained_timer.start_timer("threshold");
269-
iComputeGradient.threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset);
270-
fine_grained_timer.stop_timer();
271-
272-
float max_th = 60000;
273-
274-
#ifdef HAVE_OPENMP
275-
#pragma omp parallel for default(shared)
276-
#endif
277-
for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
278-
279-
float rescaled = local_scale_temp.mesh[i];
280-
if (rescaled < aprParameters.sigma_th) {
281-
rescaled = (rescaled < aprParameters.sigma_th_max) ? max_th : par.sigma_th;
282-
local_scale_temp.mesh[i] = rescaled;
283-
}
284-
}
285-
286-
#ifdef HAVE_LIBTIFF
287-
if(par.output_steps) {
288-
TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_rescaled.tif", local_scale_temp);
289-
}
290-
#endif
291-
292-
#ifdef HAVE_OPENMP
293-
#pragma omp parallel for default(shared)
294-
#endif
295-
for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
296-
297-
if(grad_temp.mesh[i] < aprParameters.grad_th){
298-
grad_temp.mesh[i] = 0;
299-
}
300-
}
268+
iComputeGradient.applyParameters(grad_temp, local_scale_temp, local_scale_temp2, aprParameters, bspline_offset);
301269
}
302270

303271

@@ -405,7 +373,7 @@ inline bool APRConverter<ImageType>::get_lrf(APR &aAPR, PixelData<T>& input_imag
405373
template<typename ImageType>
406374
inline bool APRConverter<ImageType>::get_ds(APR &aAPR) {
407375

408-
applyParameters(aAPR,par);
376+
applyParameters(par);
409377
aAPR.parameters = par;
410378

411379
solveForAPR(aAPR);
@@ -426,104 +394,45 @@ inline bool APRConverter<ImageType>::get_ds(APR &aAPR) {
426394
*/
427395
template<typename ImageType> template<typename T>
428396
inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input_image) {
429-
if (!initPipelineAPR(aAPR, input_image)) return false;
430397

398+
if (!initPipelineAPR(aAPR, input_image)) return false;
431399

432400
initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
433401

434-
method_timer.start_timer("compute_gradient_magnitude_using_bsplines and local instensity scale CUDA");
435-
APRTimer t(true);
436-
APRTimer d(true);
437-
t.start_timer(" =========== ALL");
438-
{
439-
440-
computation_timer.start_timer("init_mem");
441-
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
442-
443-
/////////////////////////////////
444-
/// Pipeline
445-
////////////////////////
446-
// offset image by factor (this is required if there are zero areas in the background with
447-
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
448-
// Warning both of these could result in over-flow!
449-
450-
if (std::is_same<uint16_t, ImageType>::value) {
451-
bspline_offset = 100;
452-
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
453-
} else if (std::is_same<uint8_t, ImageType>::value) {
454-
bspline_offset = 5;
455-
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
456-
} else {
457-
image_temp.copyFromMesh(input_image);
458-
}
459-
460-
computation_timer.stop_timer();
461-
462-
std::vector<GpuProcessingTask<ImageType>> gpts;
463-
464-
int numOfStreams = 1;
465-
int repetitionsPerStream = 1;
466-
467-
computation_timer.start_timer("compute_L");
468-
// Create streams and send initial task to do
469-
for (int i = 0; i < numOfStreams; ++i) {
470-
gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
471-
gpts.back().sendDataToGpu();
472-
gpts.back().processOnGpu();
473-
}
474-
computation_timer.stop_timer();
475-
476-
477-
for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
478-
int c = i % numOfStreams;
479-
480-
computation_timer.start_timer("apply_parameters");
481-
// get data from previous task
482-
gpts[c].getDataFromGpu();
483-
484-
computation_timer.stop_timer();
485-
486-
// in theory we get new data and send them to task
487-
if (i < numOfStreams * (repetitionsPerStream - 1)) {
488-
gpts[c].sendDataToGpu();
489-
gpts[c].processOnGpu();
490-
}
491-
492-
// Postprocess on CPU
493-
std::cout << "--------- start CPU processing ---------- " << i << std::endl;
494-
495-
computation_timer.start_timer("solve_for_apr");
496-
iPullingScheme.initialize_particle_cell_tree(aAPR.aprInfo);
497-
498-
PixelData<float> lst(local_scale_temp, true);
499-
500-
#ifdef HAVE_LIBTIFF
501-
if (par.output_steps){
502-
TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_step.tif", lst);
503-
}
504-
#endif
402+
computation_timer.start_timer("init_mem");
403+
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
505404

506-
#ifdef HAVE_LIBTIFF
507-
if (par.output_steps){
508-
TiffUtils::saveMeshAsTiff(par.output_dir + "gradient_step.tif", grad_temp);
509-
}
510-
#endif
405+
/////////////////////////////////
406+
/// Pipeline
407+
////////////////////////
408+
// offset image by factor (this is required if there are zero areas in the background with
409+
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
410+
// Warning both of these could result in over-flow!
511411

512-
iLocalParticleSet.get_local_particle_cell_set(iPullingScheme,lst, local_scale_temp2,par);
412+
if (std::is_same<uint16_t, ImageType>::value) {
413+
bspline_offset = 100;
414+
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
415+
} else if (std::is_same<uint8_t, ImageType>::value) {
416+
bspline_offset = 5;
417+
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
418+
} else {
419+
image_temp.copyFromMesh(input_image);
420+
}
513421

514-
iPullingScheme.pulling_scheme_main();
422+
GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
423+
gpt.sendDataToGpu();
424+
gpt.processOnGpu();
425+
auto linearAccessGpu = gpt.getDataFromGpu();
515426

516-
computation_timer.stop_timer();
427+
aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
517428

518-
computation_timer.start_timer("generate_data_structures");
519-
generateDatastructures(aAPR);
520-
computation_timer.stop_timer();
521-
}
522-
std::cout << "Total n ENDED" << std::endl;
429+
// generateDatastructures(aAPR) for linearAcceess for CUDA
430+
aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
431+
aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
432+
aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
433+
aAPR.apr_initialized = true;
523434

524-
}
525-
t.stop_timer();
526-
method_timer.stop_timer();
435+
std::cout << "CUDA pipeline finished!\n";
527436

528437
return true;
529438
}
@@ -565,7 +474,7 @@ inline bool APRConverter<ImageType>::get_apr_cpu(APR &aAPR, PixelData<T> &input_
565474
method_timer.stop_timer();
566475
}
567476

568-
applyParameters(aAPR,par);
477+
applyParameters(par);
569478

570479
computation_timer.stop_timer();
571480

@@ -597,7 +506,7 @@ template<typename ImageType> template<typename T>
597506
inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_image) {
598507
// TODO: CUDA pipeline is temporarily turned off and CPU version is always chosen.
599508
// After revising a CUDA pipeline remove "#if true // " part.
600-
#if true // #ifndef APR_USE_CUDA
509+
#ifndef APR_USE_CUDA
601510
return get_apr_cpu(aAPR, input_image);
602511
#else
603512
return get_apr_cuda(aAPR, input_image);

src/algorithm/ComputeGradient.hpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,35 @@ class ComputeGradient {
3838
template<typename T>
3939
void calc_inv_bspline_z(PixelData<T> &input);
4040

41+
template<typename T>
42+
void applyParameters(PixelData<T> &grad_temp, PixelData<float> &local_scale_temp, PixelData<float> &local_scale_temp2, APRParameters &aprParameters, float bspline_offset) {
43+
threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset);
44+
45+
float max_th = 60000;
46+
47+
#ifdef HAVE_OPENMP
48+
#pragma omp parallel for default(shared)
49+
#endif
50+
for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
51+
52+
float rescaled = local_scale_temp.mesh[i];
53+
if (rescaled < aprParameters.sigma_th) {
54+
rescaled = (rescaled < aprParameters.sigma_th_max) ? max_th : aprParameters.sigma_th;
55+
local_scale_temp.mesh[i] = rescaled;
56+
}
57+
}
58+
59+
#ifdef HAVE_OPENMP
60+
#pragma omp parallel for default(shared)
61+
#endif
62+
for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
63+
64+
if(grad_temp.mesh[i] < aprParameters.grad_th){
65+
grad_temp.mesh[i] = 0;
66+
}
67+
}
68+
}
69+
4170
struct three_temps {
4271
float temp_1, temp_2, temp_3;
4372
};

src/algorithm/ComputeGradientCuda.cu

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ namespace {
5757
}
5858

5959
BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) {
60+
6061
// Recursive Filter Implimentation for Smoothing BSplines
6162
// B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993
6263

@@ -79,8 +80,8 @@ namespace {
7980

8081
const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2);
8182

82-
//std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1
83-
// << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl;
83+
// std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1
84+
// << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << " lambda=" << lambda << " tol=" << tol << std::endl;
8485

8586
// ------- Calculating boundary conditions
8687

@@ -169,18 +170,18 @@ void getGradientCuda(const PixelData<ImgType> &image, PixelData<float> &local_sc
169170

170171
// TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim
171172

172-
runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream);
173-
runBsplineXdir(cudaImage, image.getDimension(), px, aStream);
174-
runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
173+
if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream);
174+
if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream);
175+
if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
175176

176177

177178
runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream);
178179

179180
runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream);
180181

181-
runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
182-
runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
183-
runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
182+
if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
183+
if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
184+
if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
184185
}
185186

186187
class CurrentTime {
@@ -202,6 +203,49 @@ public:
202203
};
203204

204205

206+
/**
207+
* Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise.
208+
* @param input
209+
* @param output
210+
* @param length - len of input/output arrays
211+
* @param thresholdLevel
212+
*/
213+
template <typename T, typename S>
214+
__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) {
215+
size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
216+
if (idx < length) {
217+
if (input[idx] <= thresholdLevel) { output[idx] = 0; }
218+
}
219+
}
220+
221+
template <typename ImgType, typename T>
222+
void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) {
223+
dim3 threadsPerBlock(64);
224+
dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x);
225+
threshold<<<numBlocks,threadsPerBlock, 0, aStream>>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th);
226+
};
227+
228+
template<typename T>
229+
__global__ void rescaleAndThreshold(T *data, size_t len, float sigmaThreshold, float sigmaThresholdMax) {
230+
const float max_th = 60000.0;
231+
size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
232+
if (idx < len) {
233+
float rescaled = data[idx];
234+
if (rescaled < sigmaThreshold) {
235+
rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold;
236+
}
237+
data[idx] = rescaled;
238+
}
239+
}
240+
241+
template <typename T>
242+
void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cudaStream_t aStream) {
243+
dim3 threadsPerBlock(64);
244+
dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x);
245+
rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax);
246+
}
247+
248+
205249
template <typename U>
206250
template <typename ImgType>
207251
class GpuProcessingTask<U>::GpuProcessingTaskImpl {
@@ -264,11 +308,11 @@ public:
264308
iMaxLevel(maxLevel),
265309
// TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension.
266310
// Should be fixed when other parts of pipeline are ready.
267-
params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)),
268-
bc1(params.bc1.get(), params.k0, iStream),
269-
bc2(params.bc2.get(), params.k0, iStream),
270-
bc3(params.bc3.get(), params.k0, iStream),
271-
bc4(params.bc4.get(), params.k0, iStream),
311+
// params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)),
312+
// bc1(params.bc1.get(), params.k0, iStream),
313+
// bc2(params.bc2.get(), params.k0, iStream),
314+
// bc3(params.bc3.get(), params.k0, iStream),
315+
// bc4(params.bc4.get(), params.k0, iStream),
272316
boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
273317
boundary{nullptr, boundaryLen, iStream},
274318
pctc(iAprInfo, iStream),
@@ -317,6 +361,13 @@ public:
317361
splineCudaX, splineCudaY, splineCudaZ, boundary.get(),
318362
iBsplineOffset, iParameters, iStream);
319363
runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream);
364+
365+
// Apply parameters from APRConverter:
366+
runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
367+
runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
368+
runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
369+
// TODO: automatic parameters are not implemented for GPU pipeline (yet)
370+
320371
float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
321372
float level_factor = pow(2, iMaxLevel) * min_dim;
322373
const float mult_const = level_factor/iParameters.rel_error;

0 commit comments

Comments
 (0)