Skip to content

Commit 2d86364

Browse files
committed
Determine the algorithm in a KernelAbstractions 0.10-compatible way
1 parent d44b474 commit 2d86364

26 files changed

+330
-247
lines changed

src/AcceleratedKernels.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ module AcceleratedKernels
1212

1313
# Internal dependencies
1414
using ArgCheck: @argcheck
15-
using GPUArraysCore: AbstractGPUArray, @allowscalar
15+
using GPUArraysCore: AnyGPUArray, @allowscalar
1616
using KernelAbstractions
1717

1818

src/accumulate/accumulate.jl

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ include("accumulate_nd.jl")
3838
# CPU settings
3939
max_tasks::Int=Threads.nthreads(),
4040
min_elems::Int=2,
41+
prefer_threads::Bool=true,
4142
4243
# Algorithm choice
4344
alg::AccumulateAlgorithm=DecoupledLookback(),
@@ -58,6 +59,7 @@ include("accumulate_nd.jl")
5859
# CPU settings
5960
max_tasks::Int=Threads.nthreads(),
6061
min_elems::Int=2,
62+
prefer_threads::Bool=true,
6163
6264
# Algorithm choice
6365
alg::AccumulateAlgorithm=DecoupledLookback(),
@@ -80,7 +82,9 @@ we do not need the constraint of `dst` and `src` being different; to minimise me
8082
recommend using the single-array interface (the first one above).
8183
8284
## CPU
83-
Use at most `max_tasks` threads with at least `min_elems` elements per task.
85+
Use at most `max_tasks` threads with at least `min_elems` elements per task. `prefer_threads` tells
86+
AK to prioritize using the CPU algorithm implementation (default behaviour) over the KA algorithm
87+
through POCL.
8488
8589
Note that accumulation is typically a memory-bound operation, so multithreaded accumulation only
8690
becomes faster if it is a more compute-heavy operation to hide memory latency - that includes:
@@ -167,24 +171,34 @@ function _accumulate_impl!(
167171
# CPU settings
168172
max_tasks::Int=Threads.nthreads(),
169173
min_elems::Int=2,
174+
prefer_threads::Bool=true,
170175

171176
# GPU settings
172177
block_size::Int=256,
173178
temp::Union{Nothing, AbstractArray}=nothing,
174179
temp_flags::Union{Nothing, AbstractArray}=nothing,
175180
)
176181
if isnothing(dims)
177-
return accumulate_1d!(
178-
op, v, backend, alg;
179-
init, neutral, inclusive,
180-
max_tasks, min_elems,
181-
block_size, temp, temp_flags,
182-
)
182+
return if use_KA_algo(v, prefer_threads)
183+
accumulate_1d_gpu!(
184+
op, v, backend, alg;
185+
init, neutral, inclusive,
186+
max_tasks, min_elems,
187+
block_size, temp, temp_flags,
188+
)
189+
else
190+
accumulate_1d_cpu!(
191+
op, v, backend, alg;
192+
init, neutral, inclusive,
193+
max_tasks, min_elems,
194+
block_size, temp, temp_flags,
195+
)
196+
end
183197
else
184198
return accumulate_nd!(
185199
op, v, backend;
186200
init, neutral, dims, inclusive,
187-
max_tasks, min_elems,
201+
max_tasks, min_elems, prefer_threads,
188202
block_size,
189203
)
190204
end

src/accumulate/accumulate_1d_cpu.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
function accumulate_1d!(
2-
op, v::AbstractArray, backend::CPU, alg;
1+
function accumulate_1d_cpu!(
2+
op, v::AbstractArray, backend::Backend, alg;
33
init,
44
neutral,
55
inclusive::Bool,

src/accumulate/accumulate_1d_gpu.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,8 @@ end
248248

249249

250250
# DecoupledLookback algorithm
251-
function accumulate_1d!(
252-
op, v::AbstractArray, backend::GPU, ::DecoupledLookback;
251+
function accumulate_1d_gpu!(
252+
op, v::AbstractArray, backend::Backend, ::DecoupledLookback;
253253
init,
254254
neutral,
255255
inclusive::Bool,
@@ -307,8 +307,8 @@ end
307307

308308

309309
# ScanPrefixes algorithm
310-
function accumulate_1d!(
311-
op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
310+
function accumulate_1d_gpu!(
311+
op, v::AbstractArray, backend, ::ScanPrefixes;
312312
init,
313313
neutral,
314314
inclusive::Bool,

src/accumulate/accumulate_nd.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ function accumulate_nd!(
88
# CPU settings
99
max_tasks::Int,
1010
min_elems::Int,
11+
prefer_threads::Bool=true,
1112

1213
# GPU settings
1314
block_size::Int,
@@ -34,7 +35,7 @@ function accumulate_nd!(
3435

3536
# Degenerate cases end
3637

37-
if backend isa CPU
38+
if !use_KA_algo(v, prefer_threads)
3839
_accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
3940
else
4041
# On GPUs we have two parallelisation approaches, based on which dimension has more elements:

src/foreachindex.jl

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ end
1515
function _forindices_gpu(
1616
f,
1717
indices,
18-
backend::GPU;
18+
backend::Backend;
1919

2020
block_size::Int=256,
2121
)
@@ -47,6 +47,7 @@ end
4747
# CPU settings
4848
max_tasks=Threads.nthreads(),
4949
min_elems=1,
50+
prefer_threads::Bool=true,
5051
5152
# GPU settings
5253
block_size=256,
@@ -60,7 +61,8 @@ MtlArray, oneArray - with one GPU thread per index.
6061
On CPUs at most `max_tasks` threads are launched, or fewer such that each thread processes at least
6162
`min_elems` indices; if a single task ends up being needed, `f` is inlined and no thread is
6263
launched. Tune it to your function - the more expensive it is, the fewer elements are needed to
63-
amortise the cost of launching a thread (which is a few μs).
64+
amortise the cost of launching a thread (which is a few μs). `prefer_threads` tells AK to prioritize
65+
using the CPU algorithm implementation (default behaviour) over the KA algorithm through POCL.
6466
6567
# Examples
6668
Normally you would write a for loop like this:
@@ -125,11 +127,12 @@ function foreachindex(
125127
# CPU settings
126128
max_tasks=Threads.nthreads(),
127129
min_elems=1,
130+
prefer_threads::Bool=true,
128131

129132
# GPU settings
130133
block_size=256,
131134
)
132-
if backend isa GPU
135+
if use_KA_algo(itr, prefer_threads)
133136
_forindices_gpu(f, eachindex(itr), backend; block_size)
134137
else
135138
_forindices_threads(f, eachindex(itr); max_tasks, min_elems)
@@ -144,6 +147,7 @@ end
144147
# CPU settings
145148
max_tasks=Threads.nthreads(),
146149
min_elems=1,
150+
prefer_threads::Bool=true,
147151
148152
# GPU settings
149153
block_size=256,
@@ -157,7 +161,8 @@ MtlArray, oneArray - with one GPU thread per index.
157161
On CPUs at most `max_tasks` threads are launched, or fewer such that each thread processes at least
158162
`min_elems` indices; if a single task ends up being needed, `f` is inlined and no thread is
159163
launched. Tune it to your function - the more expensive it is, the fewer elements are needed to
160-
amortise the cost of launching a thread (which is a few μs).
164+
amortise the cost of launching a thread (which is a few μs). `prefer_threads` tells AK to prioritize
165+
using the CPU algorithm implementation (default behaviour) over the KA algorithm through POCL.
161166
162167
# Examples
163168
Normally you would write a for loop like this:
@@ -218,6 +223,7 @@ function foraxes(
218223
# CPU settings
219224
max_tasks=Threads.nthreads(),
220225
min_elems=1,
226+
prefer_threads::Bool=true,
221227

222228
# GPU settings
223229
block_size=256,
@@ -226,11 +232,11 @@ function foraxes(
226232
return foreachindex(
227233
f, itr, backend;
228234
max_tasks, min_elems,
229-
block_size,
235+
prefer_threads, block_size,
230236
)
231237
end
232238

233-
if backend isa GPU
239+
if use_KA_algo(itr, prefer_threads)
234240
_forindices_gpu(f, axes(itr, dims), backend; block_size)
235241
else
236242
_forindices_threads(f, axes(itr, dims); max_tasks, min_elems)

src/map.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# CPU settings
66
max_tasks=Threads.nthreads(),
77
min_elems=1,
8+
prefer_threads::Bool=true,
89
910
# GPU settings
1011
block_size=256,
@@ -53,6 +54,7 @@ end
5354
# CPU settings
5455
max_tasks=Threads.nthreads(),
5556
min_elems=1,
57+
prefer_threads::Bool=true,
5658
5759
# GPU settings
5860
block_size=256,

src/predicates.jl

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ end
3939
# CPU settings
4040
max_tasks=Threads.nthreads(),
4141
min_elems=1,
42+
prefer_threads::Bool=true,
4243
4344
# GPU settings
4445
block_size::Int=256,
@@ -53,7 +54,9 @@ reduction.
5354
## CPU
5455
Multithreaded parallelisation is only worth it for large arrays, relatively expensive predicates,
5556
and/or rare occurrence of true; use `max_tasks` and `min_elems` to only use parallelism when worth
56-
it in your application. When only one thread is needed, there is no overhead.
57+
it in your application. When only one thread is needed, there is no overhead. `prefer_threads`
58+
tells AK to prioritize using the CPU algorithm implementation (default behaviour) over the KA
59+
algorithm through POCL.
5760
5861
## GPU
5962
There are two possible `alg` choices:
@@ -114,11 +117,12 @@ function _any_impl(
114117
# CPU settings
115118
max_tasks=Threads.nthreads(),
116119
min_elems=1,
120+
prefer_threads::Bool=true,
117121

118122
# GPU settings
119123
block_size::Int=256,
120124
)
121-
if backend isa GPU
125+
if use_KA_algo(v, prefer_threads)
122126
@argcheck block_size > 0
123127

124128
# Some platforms crash when multiple threads write to the same memory location in a global
@@ -137,7 +141,8 @@ function _any_impl(
137141
backend;
138142
init=false,
139143
neutral=false,
140-
block_size=block_size,
144+
prefer_threads=true,
145+
block_size,
141146
temp=alg.temp,
142147
switch_below=alg.switch_below,
143148
)
@@ -171,6 +176,7 @@ end
171176
# CPU settings
172177
max_tasks=Threads.nthreads(),
173178
min_elems=1,
179+
prefer_threads::Bool=true,
174180
175181
# GPU settings
176182
block_size::Int=256,
@@ -185,7 +191,9 @@ reduction.
185191
## CPU
186192
Multithreaded parallelisation is only worth it for large arrays, relatively expensive predicates,
187193
and/or rare occurrence of true; use `max_tasks` and `min_elems` to only use parallelism when worth
188-
it in your application. When only one thread is needed, there is no overhead.
194+
it in your application. When only one thread is needed, there is no overhead. `prefer_threads`
195+
tells AK to prioritize using the CPU algorithm implementation (default behaviour) over the KA
196+
algorithm through POCL.
189197
190198
## GPU
191199
There are two possible `alg` choices:
@@ -246,11 +254,12 @@ function _all_impl(
246254
# CPU settings
247255
max_tasks=Threads.nthreads(),
248256
min_elems=1,
257+
prefer_threads::Bool=true,
249258

250259
# GPU settings
251260
block_size::Int=256,
252261
)
253-
if backend isa GPU
262+
if use_KA_algo(v, prefer_threads)
254263
@argcheck block_size > 0
255264

256265
# Some platforms crash when multiple threads write to the same memory location in a global
@@ -269,7 +278,8 @@ function _all_impl(
269278
backend;
270279
init=true,
271280
neutral=true,
272-
block_size=block_size,
281+
prefer_threads=false,
282+
block_size,
273283
temp=alg.temp,
274284
switch_below=alg.switch_below,
275285
)

src/reduce/mapreduce_1d_cpu.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
function mapreduce_1d(
2-
f, op, src::AbstractArray, backend::CPU;
1+
function mapreduce_1d_cpu(
2+
f, op, src::AbstractArray, backend::Backend;
33
init,
44
neutral,
55

src/reduce/mapreduce_1d_gpu.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,8 @@
9999
end
100100

101101

102-
function mapreduce_1d(
103-
f, op, src::AbstractArray, backend::GPU;
102+
function mapreduce_1d_gpu(
103+
f, op, src::AbstractArray, backend::Backend;
104104
init,
105105
neutral,
106106

0 commit comments

Comments
 (0)