Skip to content

Commit 4854ff2

Browse files
author
Kent Knox
committed
Merge pull request #23 from AMD-FirePro/develop
Workaround to make clFFT run on NVIDIA. This adds a vendor check to generate the constant twiddle factors in a global array on an NVIDIA runtime stack, bypassing what appears to be a runtime bug generating the table in constant memory. If the bug disappears from the runtime stack in the future, the vendor check should be removed again.
2 parents 0def5e1 + cdb2919 commit 4854ff2

File tree

5 files changed

+34
-17
lines changed

5 files changed

+34
-17
lines changed

src/library/generator.copy.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ clfftStatus FFTPlan::GetMax1DLengthPvt<Copy> (size_t * longest) const
445445
using namespace CopyGenerator;
446446

447447
template<>
448-
clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo ) const
448+
clfftStatus FFTPlan::GenerateKernelPvt<Copy>(FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
449449
{
450450
FFTKernelGenKeyParams params;
451451
OPENCL_V( this->GetKernelGenKeyPvt<Copy> (params), _T("GetKernelGenKey() failed!") );

src/library/generator.stockham.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2348,7 +2348,7 @@ namespace StockhamGenerator
23482348

23492349
}
23502350

2351-
void GenerateKernel(std::string &str)
2351+
void GenerateKernel(std::string &str, cl_device_id Dev_ID)
23522352
{
23532353
std::string twType = RegBaseType<PR>(2);
23542354
std::string rType = RegBaseType<PR>(1);
@@ -2501,8 +2501,19 @@ namespace StockhamGenerator
25012501
else str += "fft_back";
25022502
str += "(";
25032503

2504-
// TODO : address this kludge
2505-
str += "__constant cb_t *cb __attribute__((max_constant_size(32))), ";
2504+
// TODO : address this kludge
2505+
size_t SizeParam_ret = 0;
2506+
clGetDeviceInfo(Dev_ID, CL_DEVICE_VENDOR, 0, NULL, &SizeParam_ret);
2507+
char* nameVendor = new char[SizeParam_ret];
2508+
clGetDeviceInfo(Dev_ID, CL_DEVICE_VENDOR, SizeParam_ret, nameVendor, NULL);
2509+
2510+
//nv compiler doesn't support __constant kernel argument
2511+
if (strncmp(nameVendor, "NVIDIA",6)!=0)
2512+
str += "__constant cb_t *cb __attribute__((max_constant_size(32))), ";
2513+
else
2514+
str += "__global cb_t *cb, ";
2515+
2516+
delete [] nameVendor;
25062517

25072518
// Function attributes
25082519
if(params.fft_placeness == CLFFT_INPLACE)
@@ -3218,24 +3229,30 @@ clfftStatus FFTPlan::GetMax1DLengthPvt<Stockham> (size_t * longest) const
32183229
}
32193230

32203231
template<>
3221-
clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo ) const
3232+
clfftStatus FFTPlan::GenerateKernelPvt<Stockham>(FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
32223233
{
32233234
FFTKernelGenKeyParams params;
32243235
OPENCL_V( this->GetKernelGenKeyPvt<Stockham> (params), _T("GetKernelGenKey() failed!") );
32253236

3237+
cl_int status = CL_SUCCESS;
3238+
cl_device_id Device = NULL;
3239+
status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
3240+
3241+
OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
3242+
32263243
std::string programCode;
32273244
Precision pr = (params.fft_precision == CLFFT_SINGLE) ? P_SINGLE : P_DOUBLE;
32283245
switch(pr)
32293246
{
32303247
case P_SINGLE:
32313248
{
32323249
Kernel<P_SINGLE> kernel(params);
3233-
kernel.GenerateKernel(programCode);
3250+
kernel.GenerateKernel(programCode, Device);
32343251
} break;
32353252
case P_DOUBLE:
32363253
{
32373254
Kernel<P_DOUBLE> kernel(params);
3238-
kernel.GenerateKernel(programCode);
3255+
kernel.GenerateKernel(programCode, Device);
32393256
} break;
32403257
}
32413258

src/library/generator.transpose.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -822,7 +822,7 @@ clfftStatus FFTPlan::GetWorkSizesPvt<Transpose> (std::vector<size_t> & globalWS,
822822
// OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
823823
// Feed this generator the FFTPlan, and it returns the generated program as a string
824824
template<>
825-
clfftStatus FFTPlan::GenerateKernelPvt<Transpose> ( FFTRepo& fftRepo ) const
825+
clfftStatus FFTPlan::GenerateKernelPvt<Transpose> ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const
826826
{
827827
FFTKernelGenKeyParams params;
828828
OPENCL_V( this->GetKernelGenKeyPvt<Transpose> (params), _T("GetKernelGenKey() failed!") );

src/library/plan.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
543543

544544
if(fftPlan->gen == Copy)
545545
{
546-
OPENCL_V( fftPlan->GenerateKernel( fftRepo ), _T( "GenerateKernel() failed" ) );
546+
OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateKernel() failed" ) );
547547
OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
548548
fftPlan->baked = true;
549549
return CLFFT_SUCCESS;
@@ -1505,7 +1505,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
15051505
//break;
15061506
if (fftPlan->transflag) //Transpose for 2D
15071507
{
1508-
OPENCL_V( fftPlan->GenerateKernel( fftRepo ), _T( "GenerateTransposeProgram() failed" ) );
1508+
OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateTransposeProgram() failed" ) );
15091509
OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
15101510

15111511
fftPlan->baked = true;
@@ -2445,7 +2445,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
24452445
}
24462446

24472447
// For the radices that we have factored, we need to load/compile and build the appropriate OpenCL kernels
2448-
OPENCL_V( fftPlan->GenerateKernel( fftRepo ), _T( "GenerateKernel() failed" ) );
2448+
OPENCL_V( fftPlan->GenerateKernel( fftRepo, *commQueueFFT ), _T( "GenerateKernel() failed" ) );
24492449

24502450
// For the radices that we have factored, we need to load/compile and build the appropriate OpenCL kernels
24512451
OPENCL_V( CompileKernels( *commQueueFFT, plHandle, fftPlan->gen, fftPlan ), _T( "CompileKernels() failed" ) );
@@ -3265,13 +3265,13 @@ clfftStatus FFTPlan::GetKernelGenKey (FFTKernelGenKeyParams & params) const
32653265
}
32663266
}
32673267

3268-
clfftStatus FFTPlan::GenerateKernel (FFTRepo & fftRepo) const
3268+
clfftStatus FFTPlan::GenerateKernel (FFTRepo & fftRepo, const cl_command_queue commQueueFFT) const
32693269
{
32703270
switch(gen)
32713271
{
3272-
case Stockham: return GenerateKernelPvt<Stockham>(fftRepo);
3273-
case Transpose: return GenerateKernelPvt<Transpose>(fftRepo);
3274-
case Copy: return GenerateKernelPvt<Copy>(fftRepo);
3272+
case Stockham: return GenerateKernelPvt<Stockham>(fftRepo, commQueueFFT);
3273+
case Transpose: return GenerateKernelPvt<Transpose>(fftRepo, commQueueFFT);
3274+
case Copy: return GenerateKernelPvt<Copy>(fftRepo, commQueueFFT);
32753275
default: assert(false); return CLFFT_NOTIMPLEMENTED;
32763276
}
32773277
}

src/library/plan.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ class FFTPlan
202202
clfftStatus GetKernelGenKeyPvt (FFTKernelGenKeyParams & params) const;
203203

204204
template <clfftGenerators G>
205-
clfftStatus GenerateKernelPvt (FFTRepo& fftRepo) const;
205+
clfftStatus GenerateKernelPvt (FFTRepo& fftRepo, const cl_command_queue commQueueFFT ) const;
206206

207207
template <clfftGenerators G>
208208
clfftStatus GetMax1DLengthPvt (size_t *longest ) const;
@@ -338,7 +338,7 @@ class FFTPlan
338338

339339
clfftStatus GetWorkSizes (std::vector<size_t> & globalws, std::vector<size_t> & localws) const;
340340
clfftStatus GetKernelGenKey (FFTKernelGenKeyParams & params) const;
341-
clfftStatus GenerateKernel (FFTRepo & fftRepo) const;
341+
clfftStatus GenerateKernel (FFTRepo & fftRepo, const cl_command_queue commQueueFFT) const;
342342
clfftStatus GetMax1DLength (size_t *longest ) const;
343343

344344
void ResetBinarySizes();

0 commit comments

Comments
 (0)