@@ -1506,48 +1506,35 @@ void isotropic_convolve_333_alt(GPUAccessHelper& access, GPUAccessHelper& tree_a
15061506
15071507
15081508template <typename inputType, typename outputType, typename stencilType, typename treeType>
1509- void isotropic_convolve_333 (GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input, VectorData<outputType>& output,
1510- VectorData<stencilType>& stencil, VectorData<treeType>& tree_data, bool reflective_bc, bool use_stencil_downsample, bool normalize_stencil) {
1511- /*
1512- * Perform APR Isotropic Convolution Operation on the GPU with a 3x3x3 kernel
1513- * conv_stencil needs to have 27 entries, with element (x, y, z) corresponding to index z*9 + x*3 + y
1514- */
1509+ void isotropic_convolve_333_direct (GPUAccessHelper& access, GPUAccessHelper& tree_access, VectorData<inputType>& input,
1510+ VectorData<outputType>& output, VectorData<stencilType>& stencil,
1511+ VectorData<treeType>& tree_data, bool reflective_bc) {
15151512
15161513 tree_access.init_gpu ();
15171514 access.init_gpu (tree_access);
15181515
15191516 assert (input.size () == access.total_number_particles ());
1520- assert (stencil.size () == 27 );
1517+ assert (stencil.size () >= 27 );
1518+
1519+ const bool downsampled_stencil = (stencil.size () >= 27 * (access.level_max () - access.level_min ()));
15211520
15221521 tree_data.resize (tree_access.total_number_particles ());
15231522 output.resize (access.total_number_particles ());
15241523
15251524 // / compute nonempty rows
1526- VectorData<int > ne_counter_ds; // non empty rows
1525+ VectorData<int > ne_counter_ds;
15271526 VectorData<int > ne_counter;
15281527 ScopedCudaMemHandler<int *, JUST_ALLOC> ne_rows_ds_gpu;
15291528 ScopedCudaMemHandler<int *, JUST_ALLOC> ne_rows_gpu;
15301529
15311530 compute_ne_rows_tree_cuda<16 , 32 >(tree_access, ne_counter_ds, ne_rows_ds_gpu);
15321531 compute_ne_rows_cuda<16 , 32 >(access, ne_counter, ne_rows_gpu, 2 );
15331532
1534- // / downsample the stencil
1535- VectorData<stencilType> stencil_vec;
1536- if (use_stencil_downsample) {
1537- APRStencil::get_downsampled_stencils (stencil, stencil_vec, access.level_max () - access.level_min (), normalize_stencil);
1538- }
1539-
15401533 // / allocate GPU memory
15411534 ScopedCudaMemHandler<inputType*, JUST_ALLOC> input_gpu (input.data (), input.size ());
15421535 ScopedCudaMemHandler<treeType*, JUST_ALLOC> tree_data_gpu (tree_data.data (), tree_data.size ());
15431536 ScopedCudaMemHandler<outputType*, JUST_ALLOC> output_gpu (output.data (), output.size ());
1544- ScopedCudaMemHandler<stencilType*, JUST_ALLOC> stencil_gpu;
1545-
1546- if (use_stencil_downsample) {
1547- stencil_gpu.initialize (stencil_vec.data (), stencil_vec.size ());
1548- } else {
1549- stencil_gpu.initialize (stencil.data (), stencil.size ());
1550- }
1537+ ScopedCudaMemHandler<stencilType*, JUST_ALLOC> stencil_gpu (stencil.data (), stencil.size ());
15511538
15521539 // / copy input particles and stencil(s) to device
15531540 input_gpu.copyH2D ();
@@ -1560,10 +1547,10 @@ void isotropic_convolve_333(GPUAccessHelper& access, GPUAccessHelper& tree_acces
15601547 // / perform the convolution operation
15611548 if (reflective_bc) {
15621549 isotropic_convolve_333_reflective (access, tree_access, input_gpu.get (), output_gpu.get (), stencil_gpu.get (),
1563- tree_data_gpu.get (), ne_rows_gpu.get (), ne_counter, use_stencil_downsample );
1550+ tree_data_gpu.get (), ne_rows_gpu.get (), ne_counter, downsampled_stencil );
15641551 } else {
15651552 isotropic_convolve_333 (access, tree_access, input_gpu.get (), output_gpu.get (), stencil_gpu.get (),
1566- tree_data_gpu.get (), ne_rows_gpu.get (), ne_counter, use_stencil_downsample );
1553+ tree_data_gpu.get (), ne_rows_gpu.get (), ne_counter, downsampled_stencil );
15671554 }
15681555 error_check ( cudaDeviceSynchronize () )
15691556
@@ -1626,11 +1613,15 @@ void isotropic_convolve_333_alt(GPUAccessHelper& access, GPUAccessHelper& tree_a
16261613
16271614
16281615// / instantiate templates: explicit instantiations of the convolution function templates, one per supported combination of VectorData element types (NOTE(review): presumably required because the template definitions live in this source file rather than a header - confirm)
1629- template void isotropic_convolve_333 (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool , bool , bool );
1630- template void isotropic_convolve_333 (GPUAccessHelper&, GPUAccessHelper&, VectorData<float >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool , bool , bool );
1631- template void isotropic_convolve_333 (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t >&, VectorData<double >&, VectorData<double >&, VectorData<double >&, bool , bool , bool );
1616+ template void isotropic_convolve_333_direct (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool );
1617+ template void isotropic_convolve_333_direct (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool );
1618+ template void isotropic_convolve_333_direct (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool );
1619+ template void isotropic_convolve_333_direct (GPUAccessHelper&, GPUAccessHelper&, VectorData<float >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool );
1620+ template void isotropic_convolve_333_direct (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t >&, VectorData<double >&, VectorData<double >&, VectorData<double >&, bool );
16321621
1622+ template void isotropic_convolve_333_alt (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint8_t >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool , bool );
16331623template void isotropic_convolve_333_alt (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool , bool );
1624+ template void isotropic_convolve_333_alt (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint64_t >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool , bool );
16341625template void isotropic_convolve_333_alt (GPUAccessHelper&, GPUAccessHelper&, VectorData<float >&, VectorData<float >&, VectorData<float >&, VectorData<float >&, bool , bool );
16351626template void isotropic_convolve_333_alt (GPUAccessHelper&, GPUAccessHelper&, VectorData<uint16_t >&, VectorData<double >&, VectorData<double >&, VectorData<double >&, bool , bool );
16361627
0 commit comments