Skip to content

Commit 449d4f2

Browse files
committed
Merge pull request #30 from accelereyes/develop
Workaround for 2D FFT failures on NVIDIA GPUs
2 parents c12b25c + 3c94e56 commit 449d4f2

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

src/library/plan.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,6 +1515,20 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
15151515
return CLFFT_SUCCESS;
15161516
}
15171517

1518+
// TODO: Find a more robust way to detect NVIDIA devices than matching the vendor string prefix.
1519+
bool isnvidia = false;
1520+
for (size_t Idx = 0; !isnvidia && Idx < numQueues; Idx++)
1521+
{
1522+
cl_command_queue QIdx = commQueueFFT[Idx];
1523+
cl_device_id Device;
1524+
clGetCommandQueueInfo(QIdx, CL_QUEUE_DEVICE, sizeof(Device), &Device, NULL);
1525+
char Vendor[256];
1526+
clGetDeviceInfo(Device, CL_DEVICE_VENDOR, sizeof(Vendor), &Vendor, NULL);
1527+
isnvidia |= (strncmp(Vendor, "NVIDIA", 6) == 0);
1528+
}
1529+
// Workaround: NVIDIA GPUs fail when doing the transpose for 2D FFTs, so bail out here on those devices.
1530+
if (isnvidia) break;
1531+
15181532
if (fftPlan->length.size() != 2) break;
15191533
if (!(IsPo2(fftPlan->length[0])) || !(IsPo2(fftPlan->length[1])))
15201534
break;

0 commit comments

Comments
 (0)