1515function _forindices_gpu (
1616 f,
1717 indices,
18- backend:: GPU ;
18+ backend:: Backend ;
1919
2020 block_size:: Int = 256 ,
2121)
4747 # CPU settings
4848 max_tasks=Threads.nthreads(),
4949 min_elems=1,
50+ prefer_threads::Bool=true,
5051
5152 # GPU settings
5253 block_size=256,
@@ -60,7 +61,8 @@ MtlArray, oneArray - with one GPU thread per index.
6061On CPUs at most `max_tasks` threads are launched, or fewer such that each thread processes at least
6162`min_elems` indices; if a single task ends up being needed, `f` is inlined and no thread is
6263launched. Tune it to your function - the more expensive it is, the fewer elements are needed to
63- amortise the cost of launching a thread (which is a few μs).
64+ amortise the cost of launching a thread (which is a few μs). `prefer_threads` tells AcceleratedKernels (AK) to
65+ prioritise the multithreaded CPU implementation (the default behaviour) over the KernelAbstractions (KA) implementation executed through POCL.
6466
6567# Examples
6668Normally you would write a for loop like this:
@@ -125,11 +127,12 @@ function foreachindex(
125127 # CPU settings
126128 max_tasks= Threads. nthreads (),
127129 min_elems= 1 ,
130+ prefer_threads:: Bool = true ,
128131
129132 # GPU settings
130133 block_size= 256 ,
131134)
132- if backend isa GPU
135+ if use_KA_algo (itr, prefer_threads)
133136 _forindices_gpu (f, eachindex (itr), backend; block_size)
134137 else
135138 _forindices_threads (f, eachindex (itr); max_tasks, min_elems)
144147 # CPU settings
145148 max_tasks=Threads.nthreads(),
146149 min_elems=1,
150+ prefer_threads::Bool=true,
147151
148152 # GPU settings
149153 block_size=256,
@@ -157,7 +161,8 @@ MtlArray, oneArray - with one GPU thread per index.
157161On CPUs at most `max_tasks` threads are launched, or fewer such that each thread processes at least
158162`min_elems` indices; if a single task ends up being needed, `f` is inlined and no thread is
159163launched. Tune it to your function - the more expensive it is, the fewer elements are needed to
160- amortise the cost of launching a thread (which is a few μs).
164+ amortise the cost of launching a thread (which is a few μs). `prefer_threads` tells AcceleratedKernels (AK) to
165+ prioritise the multithreaded CPU implementation (the default behaviour) over the KernelAbstractions (KA) implementation executed through POCL.
161166
162167# Examples
163168Normally you would write a for loop like this:
@@ -218,6 +223,7 @@ function foraxes(
218223 # CPU settings
219224 max_tasks= Threads. nthreads (),
220225 min_elems= 1 ,
226+ prefer_threads:: Bool = true ,
221227
222228 # GPU settings
223229 block_size= 256 ,
@@ -226,11 +232,11 @@ function foraxes(
226232 return foreachindex (
227233 f, itr, backend;
228234 max_tasks, min_elems,
229- block_size,
235+ prefer_threads, block_size,
230236 )
231237 end
232238
233- if backend isa GPU
239+ if use_KA_algo (itr, prefer_threads)
234240 _forindices_gpu (f, axes (itr, dims), backend; block_size)
235241 else
236242 _forindices_threads (f, axes (itr, dims); max_tasks, min_elems)
0 commit comments