@@ -66,17 +66,14 @@ static bool pow235(size_t num, size_t &pow2, size_t &pow3, size_t &pow5)
6666 return true ;
6767}
6868
69- static bool split1D_for_inplace (size_t num, vector<vector<size_t > > &splitNums, clfftPrecision precision)
69+ static bool split1D_for_inplace (size_t num, vector<vector<size_t > > &splitNums, clfftPrecision precision, size_t threshold )
7070{
7171 /* a helper function to split big 1D to friendly 2D sizes for inplace transpose kernels
7272 currently only radix 2, 3 and 5 are supported
7373 the algorithm looks for ways to split up the 1D into 2D such that one of the dimensions is a multiple of the other dimension.
7474 And this multiple is radix 2, 3 or 5.
7575 each split dimension should be further split until it is no larger than the threshold (4096, or 2048 for double precision)
7676 */
77- size_t threshold = 4096 ;
78- if (precision == CLFFT_DOUBLE)
79- threshold = 2048 ;
8077 if (num <= threshold)
8178 return true ;
8279 if (num % 2 != 0 && num % 3 != 0 && num % 5 != 0 )
@@ -174,8 +171,8 @@ static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums,
174171 splitVec.push_back (temp);
175172 splitNums.push_back (splitVec);
176173
177- status = status && split1D_for_inplace (temp*divide_factor, splitNums, precision);
178- status = status && split1D_for_inplace (temp, splitNums, precision);
174+ status = status && split1D_for_inplace (temp*divide_factor, splitNums, precision, threshold );
175+ status = status && split1D_for_inplace (temp, splitNums, precision, threshold );
179176 return status;
180177
181178}
@@ -794,13 +791,17 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
794791 if (fftPlan->length[0] == 354294)
795792 clLengths[1] = 243;
796793 */
794+ size_t threshold = 4096 ;
795+ if (fftPlan->precision == CLFFT_DOUBLE)
796+ threshold = 2048 ;
797797 if (clfftGetRequestLibNoMemAlloc () &&
798798 fftPlan->placeness == CLFFT_INPLACE &&
799- (fftPlan->inputLayout == fftPlan->outputLayout ) )
799+ (fftPlan->inputLayout == fftPlan->outputLayout )
800+ && fftPlan->length [0 ] > threshold)
800801 {
801802 // for inplace fft with inplace transpose, the split logic is different
802803 vector<vector<size_t > > splitNums;
803- bool implemented = split1D_for_inplace (fftPlan->length [0 ], splitNums, fftPlan->precision );
804+ bool implemented = split1D_for_inplace (fftPlan->length [0 ], splitNums, fftPlan->precision , threshold );
804805 if (implemented)
805806 clLengths[1 ] = splitNums[0 ][0 ];
806807 }
0 commit comments