Commit d997769

Merge pull request #45 from christiangnrd/ka0.10b

Stop relying on backend type to determine algorithm used

This is in preparation for the PoCL backend becoming the KernelAbstractions default for CPUs. Until the PoCL algorithm implementations are faster than the Julia Base / Threads ones we have in AcceleratedKernels, we will keep those.

2 parents c00ce96 + c62a66c, commit d997769

26 files changed: 336 additions, 250 deletions
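
The central change is a new predicate, use_KA_algo, which replaces the "backend isa GPU" / "backend isa CPU" checks throughout the files below. Its definition is not included in this section; the following is only a minimal sketch of the shape such a predicate could take, given the AnyGPUArray import added below and the prefer_threads keyword threaded through the callers (an illustration, not the commit's actual definition):

    using GPUArraysCore: AnyGPUArray

    # Hypothetical sketch: always use the KernelAbstractions (KA) algorithms
    # for GPU arrays, and use them for CPU arrays only when the caller opts
    # out of the Julia Base / Threads implementations.
    use_KA_algo(v::AnyGPUArray, prefer_threads::Bool) = true
    use_KA_algo(v::AbstractArray, prefer_threads::Bool) = !prefer_threads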

.github/workflows/CI-CPU.yml

Lines changed: 40 additions & 0 deletions

@@ -63,6 +63,46 @@ jobs:
       - uses: julia-actions/julia-runtest@v1
         env:
           JULIA_NUM_THREADS: ${{ matrix.env.JULIA_NUM_THREADS }}
+  OpenCL:
+    name: OpenCL
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+      actions: write
+      contents: read
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: 1
+          arch: x64
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+        with:
+          test_args: '--OpenCL'
+  # cpuKA:
+  #   name: KA CPU Backend
+  #   runs-on: ubuntu-latest
+  #   timeout-minutes: 60
+  #   permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+  #     actions: write
+  #     contents: read
+  #   strategy:
+  #     fail-fast: true
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: julia-actions/setup-julia@v2
+  #       with:
+  #         version: 1
+  #         arch: x64
+  #     - uses: julia-actions/cache@v2
+  #     - uses: julia-actions/julia-buildpkg@v1
+  #     - uses: julia-actions/julia-runtest@v1
+  #       with:
+  #         test_args: '--cpuKA'
   docs:
     name: Documentation
     runs-on: ubuntu-latest

src/AcceleratedKernels.jl

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ module AcceleratedKernels
 
 # Internal dependencies
 using ArgCheck: @argcheck
-using GPUArraysCore: AbstractGPUArray, @allowscalar
+using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
 
 
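
Context for this swap: AbstractGPUArray only matches concrete GPU array types, while AnyGPUArray also matches lazy wrappers (views, transposes, reshapes) whose parent is a GPU array, so the array-based checks keep working after wrapping. A small illustration, assuming the JLArrays reference backend from the GPUArrays test stack is available:

    using GPUArraysCore
    using JLArrays   # reference AbstractGPUArray implementation, assumed available

    v = JLArray(rand(Float32, 16))
    v isa GPUArraysCore.AbstractGPUArray                # true
    view(v, 1:8) isa GPUArraysCore.AbstractGPUArray     # false: SubArray is a wrapper
    view(v, 1:8) isa GPUArraysCore.AnyGPUArray          # true: wrappers are matched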

src/accumulate/accumulate.jl

Lines changed: 17 additions & 7 deletions

@@ -167,24 +167,34 @@ function _accumulate_impl!(
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=2,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
     temp::Union{Nothing, AbstractArray}=nothing,
     temp_flags::Union{Nothing, AbstractArray}=nothing,
 )
     if isnothing(dims)
-        return accumulate_1d!(
-            op, v, backend, alg;
-            init, neutral, inclusive,
-            max_tasks, min_elems,
-            block_size, temp, temp_flags,
-        )
+        return if use_KA_algo(v, prefer_threads)
+            accumulate_1d_gpu!(
+                op, v, backend, alg;
+                init, neutral, inclusive,
+                max_tasks, min_elems,
+                block_size, temp, temp_flags,
+            )
+        else
+            accumulate_1d_cpu!(
+                op, v, backend, alg;
+                init, neutral, inclusive,
+                max_tasks, min_elems,
+                block_size, temp, temp_flags,
+            )
+        end
     else
         return accumulate_nd!(
             op, v, backend;
             init, neutral, dims, inclusive,
-            max_tasks, min_elems,
+            max_tasks, min_elems, prefer_threads,
             block_size,
         )
     end
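
At the call site this keeps the existing default (Julia Base / Threads on CPU arrays) while allowing the KA path to be forced. A hedged usage sketch, assuming the exported accumulate! wrapper forwards prefer_threads to _accumulate_impl! like the other keywords:

    import AcceleratedKernels as AK

    v = ones(Int32, 1_000)

    # Default: a CPU array takes the Julia Base / Threads scan (prefer_threads=true)
    AK.accumulate!(+, v; init=zero(Int32))

    # Force the KernelAbstractions implementation, e.g. to exercise the code
    # path that the PoCL backend will use on CPUs
    AK.accumulate!(+, v; init=zero(Int32), prefer_threads=false)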

src/accumulate/accumulate_1d_cpu.jl

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-function accumulate_1d!(
-    op, v::AbstractArray, backend::CPU, alg;
+function accumulate_1d_cpu!(
+    op, v::AbstractArray, backend::Backend, alg;
     init,
     neutral,
     inclusive::Bool,

src/accumulate/accumulate_1d_gpu.jl

Lines changed: 4 additions & 4 deletions

@@ -248,8 +248,8 @@ end
 
 
 # DecoupledLookback algorithm
-function accumulate_1d!(
-    op, v::AbstractArray, backend::GPU, ::DecoupledLookback;
+function accumulate_1d_gpu!(
+    op, v::AbstractArray, backend::Backend, ::DecoupledLookback;
     init,
     neutral,
     inclusive::Bool,
@@ -307,8 +307,8 @@ end
 
 
 # ScanPrefixes algorithm
-function accumulate_1d!(
-    op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
+function accumulate_1d_gpu!(
+    op, v::AbstractArray, backend, ::ScanPrefixes;
     init,
     neutral,
     inclusive::Bool,

src/accumulate/accumulate_nd.jl

Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@ function accumulate_nd!(
     # CPU settings
     max_tasks::Int,
     min_elems::Int,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int,
@@ -34,7 +35,7 @@
 
     # Degenerate cases end
 
-    if backend isa CPU
+    if !use_KA_algo(v, prefer_threads)
         _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
     else
         # On GPUs we have two parallelisation approaches, based on which dimension has more elements:

src/foreachindex.jl

Lines changed: 6 additions & 4 deletions

@@ -15,7 +15,7 @@ end
 function _forindices_gpu(
     f,
     indices,
-    backend::GPU;
+    backend::Backend;
 
     block_size::Int=256,
 )
@@ -125,11 +125,12 @@ function foreachindex(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size=256,
 )
-    if backend isa GPU
+    if use_KA_algo(itr, prefer_threads)
         _forindices_gpu(f, eachindex(itr), backend; block_size)
     else
         _forindices_threads(f, eachindex(itr); max_tasks, min_elems)
@@ -218,6 +219,7 @@ function foraxes(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size=256,
@@ -226,11 +228,11 @@
         return foreachindex(
             f, itr, backend;
             max_tasks, min_elems,
-            block_size,
+            prefer_threads, block_size,
         )
     end
 
-    if backend isa GPU
+    if use_KA_algo(itr, prefer_threads)
         _forindices_gpu(f, axes(itr, dims), backend; block_size)
     else
         _forindices_threads(f, axes(itr, dims); max_tasks, min_elems)
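
The user-facing effect: foreachindex and foraxes still pick the Threads loop for CPU arrays by default, but prefer_threads=false now routes a CPU array through the KA kernel instead of dispatching on the backend's type. A small sketch of the call site, using the documented do-block form:

    import AcceleratedKernels as AK

    x = rand(Float32, 10_000)
    y = similar(x)

    # Threads-based loop by default on a CPU array...
    AK.foreachindex(x) do i
        y[i] = 2 * x[i]
    end

    # ...or the KernelAbstractions kernel when explicitly requested
    AK.foreachindex(x; prefer_threads=false) do i
        y[i] = 2 * x[i]
    end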

src/map.jl

Lines changed: 2 additions & 9 deletions

@@ -33,19 +33,12 @@ end
 """
 function map!(
     f, dst::AbstractArray, src::AbstractArray, backend::Backend=get_backend(src);
-
-    # CPU settings
-    max_tasks=Threads.nthreads(),
-    min_elems=1,
-
-    # GPU settings
-    block_size=256,
+    kwargs...
 )
     @argcheck length(dst) == length(src)
     foreachindex(
         src, backend;
-        max_tasks, min_elems,
-        block_size,
+        kwargs...
     ) do idx
         dst[idx] = f(src[idx])
     end
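
Design note: collapsing the explicit keyword list into kwargs... means map! forwards whatever foreachindex accepts, so new options such as prefer_threads are picked up without touching map! again. For example (same forwarding assumption as above):

    import AcceleratedKernels as AK

    src = rand(Float32, 1024)
    dst = similar(src)

    # max_tasks, min_elems, block_size and prefer_threads all pass straight
    # through to foreachindex
    AK.map!(abs, dst, src; max_tasks=4, prefer_threads=false, block_size=128)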

src/predicates.jl

Lines changed: 8 additions & 4 deletions

@@ -114,11 +114,12 @@ function _any_impl(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         @argcheck block_size > 0
 
         # Some platforms crash when multiple threads write to the same memory location in a global
@@ -137,7 +138,8 @@
             backend;
             init=false,
             neutral=false,
-            block_size=block_size,
+            prefer_threads=true,
+            block_size,
             temp=alg.temp,
             switch_below=alg.switch_below,
         )
@@ -246,11 +248,12 @@ function _all_impl(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         @argcheck block_size > 0
 
         # Some platforms crash when multiple threads write to the same memory location in a global
@@ -269,7 +272,8 @@
             backend;
             init=true,
             neutral=true,
-            block_size=block_size,
+            prefer_threads=false,
+            block_size,
             temp=alg.temp,
             switch_below=alg.switch_below,
         )
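
One detail visible in the hunks above: the inner mapreduce fallbacks pin prefer_threads explicitly (true in _any_impl, false in _all_impl) rather than forwarding the caller's value. At the user level, a hedged call-site sketch, assuming the exported any/all wrappers forward prefer_threads like the other keywords:

    import AcceleratedKernels as AK

    v = rand(Float32, 100_000)

    AK.any(x -> x > 0.99f0, v)                        # Threads path on CPU arrays
    AK.all(x -> x >= 0.0f0, v; prefer_threads=false)  # force the KA kernel path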

src/reduce/mapreduce_1d_cpu.jl

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-function mapreduce_1d(
-    f, op, src::AbstractArray, backend::CPU;
+function mapreduce_1d_cpu(
+    f, op, src::AbstractArray, backend::Backend;
     init,
     neutral,
 

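Taken together, the renames follow one pattern: methods that used to dispatch on backend::CPU / backend::GPU become explicitly suffixed _cpu / _gpu functions over a generic ::Backend, with the choice made once by use_KA_algo at the caller. A hypothetical caller-side sketch for the reduce family (mapreduce_1d and a mapreduce_1d_gpu counterpart are assumed to mirror the accumulate_1d split shown above; this is illustrative, not the commit's code):

    using KernelAbstractions: Backend

    # Illustrative only; the commit's actual dispatch sites are in the hunks above.
    function mapreduce_1d(f, op, src::AbstractArray, backend::Backend;
                          prefer_threads::Bool=true, kwargs...)
        if use_KA_algo(src, prefer_threads)
            mapreduce_1d_gpu(f, op, src, backend; kwargs...)   # KA kernel path
        else
            mapreduce_1d_cpu(f, op, src, backend; kwargs...)   # Base/Threads path
        end
    end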