Skip to content

Commit 4c65788

Browse files
committed
Merge pull request #150 from TimmyLiu/develop_inplace_tranpose_general
bug fix from PR #149
2 parents c098adb + dd116f5 commit 4c65788

File tree

3 files changed

+18
-17
lines changed

3 files changed

+18
-17
lines changed

src/library/action.transpose.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
282282
//general swap kernel takes care of all ratios
283283
OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
284284
}
285-
285+
//std::cout << programCode << std::endl;
286286
cl_int status = CL_SUCCESS;
287287
cl_device_id Device = NULL;
288288
status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);

src/library/generator.transpose.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1734,16 +1734,16 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
17341734
if (i + 256 < LDS_per_WG)
17351735
{
17361736
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
1737-
<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offSet+idx+" << i << ", post_userdata, preValue[idx+" << i
1738-
<< ");" << std::endl;
1737+
<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i
1738+
<< "]);" << std::endl;
17391739
}
17401740
else
17411741
{
17421742
// need to handle boundary
17431743
clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
17441744
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
1745-
<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offSet+idx+" << i << ", post_userdata, preValue[idx+" << i
1746-
<< ");" << std::endl;
1745+
<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i
1746+
<< "]);" << std::endl;
17471747
clKernWrite(transKernel, 6) << "}" << std::endl;
17481748
}
17491749
}
@@ -1848,17 +1848,17 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
18481848
{
18491849
//clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
18501850
//clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
1851-
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "inputA_R - batch_offset*" << smaller_dim * bigger_dim
1851+
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim
18521852
<< ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
1853-
<< "+group_offset+idx+" << i << ", post_userdata, preValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
1853+
<< "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
18541854
}
18551855
else
18561856
{
18571857
// need to handle boundary
18581858
clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1859-
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "inputA_R - batch_offset*" << smaller_dim * bigger_dim
1859+
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim
18601860
<< ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
1861-
<< "+group_offset+idx+" << i << ", post_userdata, preValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
1861+
<< "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
18621862
clKernWrite(transKernel, 6) << "}" << std::endl;
18631863
}
18641864
}

src/library/plan.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,14 @@ static bool pow235(size_t num, size_t &pow2, size_t &pow3, size_t &pow5)
6666
return true;
6767
}
6868

69-
static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums, clfftPrecision precision)
69+
static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums, clfftPrecision precision, size_t threshold)
7070
{
7171
/* a helper function to split big 1D to friendly 2D sizes for inplace transpose kernels
7272
currently only radix 2, 3 and 5 are supported
7373
the algorithm looks for ways to split up the 1D into 2D such that one of the dimensions is a multiple of the other dimension.
7474
And this multiple is radix 2, 3 or 5.
7575
each split dimension should be further split until it is smaller than 4096
7676
*/
77-
size_t threshold = 4096;
78-
if (precision == CLFFT_DOUBLE)
79-
threshold = 2048;
8077
if (num <= threshold)
8178
return true;
8279
if (num % 2 != 0 && num % 3 != 0 && num % 5 != 0)
@@ -174,8 +171,8 @@ static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums,
174171
splitVec.push_back(temp);
175172
splitNums.push_back(splitVec);
176173

177-
status = status && split1D_for_inplace(temp*divide_factor, splitNums, precision);
178-
status = status && split1D_for_inplace(temp, splitNums, precision);
174+
status = status && split1D_for_inplace(temp*divide_factor, splitNums, precision, threshold);
175+
status = status && split1D_for_inplace(temp, splitNums, precision, threshold);
179176
return status;
180177

181178
}
@@ -794,13 +791,17 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
794791
if (fftPlan->length[0] == 354294)
795792
clLengths[1] = 243;
796793
*/
794+
size_t threshold = 4096;
795+
if (fftPlan->precision == CLFFT_DOUBLE)
796+
threshold = 2048;
797797
if (clfftGetRequestLibNoMemAlloc() &&
798798
fftPlan->placeness == CLFFT_INPLACE &&
799-
(fftPlan->inputLayout == fftPlan->outputLayout) )
799+
(fftPlan->inputLayout == fftPlan->outputLayout)
800+
&& fftPlan->length[0] > threshold)
800801
{
801802
//for inplace fft with inplace transpose, the split logic is different
802803
vector<vector<size_t> > splitNums;
803-
bool implemented = split1D_for_inplace(fftPlan->length[0], splitNums, fftPlan->precision);
804+
bool implemented = split1D_for_inplace(fftPlan->length[0], splitNums, fftPlan->precision, threshold);
804805
if (implemented)
805806
clLengths[1] = splitNums[0][0];
806807
}

0 commit comments

Comments
 (0)