Skip to content

Commit ae73e6d

Browse files
committed
Merge remote-tracking branch 'origin/main' into vc/accumulate_alg
2 parents 1b17354 + 60c379f commit ae73e6d

29 files changed

+365
-257
lines changed

.github/workflows/CI-CPU.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,46 @@ jobs:
6363
- uses: julia-actions/julia-runtest@v1
6464
env:
6565
JULIA_NUM_THREADS: ${{ matrix.env.JULIA_NUM_THREADS }}
66+
OpenCL:
67+
name: OpenCL
68+
runs-on: ubuntu-latest
69+
timeout-minutes: 60
70+
permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
71+
actions: write
72+
contents: read
73+
strategy:
74+
fail-fast: true
75+
steps:
76+
- uses: actions/checkout@v4
77+
- uses: julia-actions/setup-julia@v2
78+
with:
79+
version: 1
80+
arch: x64
81+
- uses: julia-actions/cache@v2
82+
- uses: julia-actions/julia-buildpkg@v1
83+
- uses: julia-actions/julia-runtest@v1
84+
with:
85+
test_args: '--OpenCL'
86+
# cpuKA:
87+
# name: KA CPU Backend
88+
# runs-on: ubuntu-latest
89+
# timeout-minutes: 60
90+
# permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
91+
# actions: write
92+
# contents: read
93+
# strategy:
94+
# fail-fast: true
95+
# steps:
96+
# - uses: actions/checkout@v4
97+
# - uses: julia-actions/setup-julia@v2
98+
# with:
99+
# version: 1
100+
# arch: x64
101+
# - uses: julia-actions/cache@v2
102+
# - uses: julia-actions/julia-buildpkg@v1
103+
# - uses: julia-actions/julia-runtest@v1
104+
# with:
105+
# test_args: '--cpuKA'
66106
docs:
67107
name: Documentation
68108
runs-on: ubuntu-latest

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "AcceleratedKernels"
22
uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
33
authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
4-
version = "0.4.1"
4+
version = "0.4.2"
55

66
[deps]
77
ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"

benchmark/accumulate_nd.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ display(@benchmark AK.accumulate(+, v, init=Int64(0), dims=1) setup=(v = ArrayTy
5050

5151
println("\n===\nBenchmarking accumulate(+, dims=2) on $n1 × $n2 Int64 - Base vs. AK")
5252
display(@benchmark Base.accumulate(+, v, init=Int64(0), dims=2) setup=(v = ArrayType(rand(Int64(1):Int64(100), n1, n2))))
53-
display(@benchmark AK.reduce(+, v, init=Int64(0), dims=2) setup=(v = ArrayType(rand(Int64(1):Int64(100), n1, n2))))
53+
display(@benchmark AK.accumulate(+, v, init=Int64(0), dims=2) setup=(v = ArrayType(rand(Int64(1):Int64(100), n1, n2))))
5454

5555

5656

src/AcceleratedKernels.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ module AcceleratedKernels
1212

1313
# Internal dependencies
1414
using ArgCheck: @argcheck
15-
using GPUArraysCore: AbstractGPUArray, @allowscalar
15+
using GPUArraysCore: AnyGPUArray, @allowscalar
1616
using KernelAbstractions
1717
import UnsafeAtomics
1818

src/accumulate/accumulate.jl

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -165,24 +165,34 @@ function _accumulate_impl!(
165165
# CPU settings
166166
max_tasks::Int=Threads.nthreads(),
167167
min_elems::Int=2,
168+
prefer_threads::Bool=true,
168169

169170
# GPU settings
170171
block_size::Int=256,
171172
temp::Union{Nothing, AbstractArray}=nothing,
172173
temp_flags::Union{Nothing, AbstractArray}=nothing,
173174
)
174175
if isnothing(dims)
175-
return accumulate_1d!(
176-
op, v, backend, alg;
177-
init, neutral, inclusive,
178-
max_tasks, min_elems,
179-
block_size, temp, temp_flags,
180-
)
176+
return if use_KA_algo(v, prefer_threads)
177+
accumulate_1d_gpu!(
178+
op, v, backend, alg;
179+
init, neutral, inclusive,
180+
max_tasks, min_elems,
181+
block_size, temp, temp_flags,
182+
)
183+
else
184+
accumulate_1d_cpu!(
185+
op, v, backend, alg;
186+
init, neutral, inclusive,
187+
max_tasks, min_elems,
188+
block_size, temp, temp_flags,
189+
)
190+
end
181191
else
182192
return accumulate_nd!(
183193
op, v, backend;
184194
init, neutral, dims, inclusive,
185-
max_tasks, min_elems,
195+
max_tasks, min_elems, prefer_threads,
186196
block_size,
187197
)
188198
end

src/accumulate/accumulate_1d_cpu.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
function accumulate_1d!(
2-
op, v::AbstractArray, backend::CPU, alg;
1+
function accumulate_1d_cpu!(
2+
op, v::AbstractArray, backend::Backend, alg;
33
init,
44
neutral,
55
inclusive::Bool,

src/accumulate/accumulate_1d_gpu.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,8 @@ end
254254

255255

256256
# DecoupledLookback algorithm
257-
function accumulate_1d!(
258-
op, v::AbstractArray, backend::GPU, ::DecoupledLookback;
257+
function accumulate_1d_gpu!(
258+
op, v::AbstractArray, backend::Backend, ::DecoupledLookback;
259259
init,
260260
neutral,
261261
inclusive::Bool,
@@ -313,8 +313,8 @@ end
313313

314314

315315
# ScanPrefixes algorithm
316-
function accumulate_1d!(
317-
op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
316+
function accumulate_1d_gpu!(
317+
op, v::AbstractArray, backend, ::ScanPrefixes;
318318
init,
319319
neutral,
320320
inclusive::Bool,

src/accumulate/accumulate_nd.jl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ function accumulate_nd!(
88
# CPU settings
99
max_tasks::Int,
1010
min_elems::Int,
11+
prefer_threads::Bool=true,
1112

1213
# GPU settings
1314
block_size::Int,
@@ -34,7 +35,7 @@ function accumulate_nd!(
3435

3536
# Degenerate cases end
3637

37-
if backend isa CPU
38+
if !use_KA_algo(v, prefer_threads)
3839
_accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
3940
else
4041
# On GPUs we have two parallelisation approaches, based on which dimension has more elements:
@@ -227,7 +228,7 @@ end
227228
# We have a block of threads to accumulate along the dims axis; do it in chunks of
228229
# 2 * block_size and keep track of previous chunks' running prefix
229230
ichunk = typeof(iblock)(0)
230-
num_chunks = (length_dims + block_size - 0x1) ÷ block_size
231+
num_chunks = (length_dims + (0x2 * block_size) - 0x1) ÷ (0x2 * block_size)
231232
total = neutral
232233

233234
if ithread == 0x0
@@ -326,7 +327,7 @@ end
326327

327328
# ...and accumulate the last value too
328329
if bi == 0x2 * block_size - 0x1
329-
if iblock < num_chunks - 0x1
330+
if ichunk < num_chunks - 0x1
330331
temp[bi + bank_offset_b + 0x1] = op(t2, v[
331332
input_base_idx +
332333
((ichunk + 0x1) * block_size * 0x2 - 0x1) * vstrides[dims] +

src/foreachindex.jl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ end
1515
function _forindices_gpu(
1616
f,
1717
indices,
18-
backend::GPU;
18+
backend::Backend;
1919

2020
block_size::Int=256,
2121
)
@@ -125,11 +125,12 @@ function foreachindex(
125125
# CPU settings
126126
max_tasks=Threads.nthreads(),
127127
min_elems=1,
128+
prefer_threads::Bool=true,
128129

129130
# GPU settings
130131
block_size=256,
131132
)
132-
if backend isa GPU
133+
if use_KA_algo(itr, prefer_threads)
133134
_forindices_gpu(f, eachindex(itr), backend; block_size)
134135
else
135136
_forindices_threads(f, eachindex(itr); max_tasks, min_elems)
@@ -218,6 +219,7 @@ function foraxes(
218219
# CPU settings
219220
max_tasks=Threads.nthreads(),
220221
min_elems=1,
222+
prefer_threads::Bool=true,
221223

222224
# GPU settings
223225
block_size=256,
@@ -226,11 +228,11 @@ function foraxes(
226228
return foreachindex(
227229
f, itr, backend;
228230
max_tasks, min_elems,
229-
block_size,
231+
prefer_threads, block_size,
230232
)
231233
end
232234

233-
if backend isa GPU
235+
if use_KA_algo(itr, prefer_threads)
234236
_forindices_gpu(f, axes(itr, dims), backend; block_size)
235237
else
236238
_forindices_threads(f, axes(itr, dims); max_tasks, min_elems)

src/map.jl

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,19 +33,12 @@ end
3333
"""
3434
function map!(
3535
f, dst::AbstractArray, src::AbstractArray, backend::Backend=get_backend(src);
36-
37-
# CPU settings
38-
max_tasks=Threads.nthreads(),
39-
min_elems=1,
40-
41-
# GPU settings
42-
block_size=256,
36+
kwargs...
4337
)
4438
@argcheck length(dst) == length(src)
4539
foreachindex(
4640
src, backend;
47-
max_tasks, min_elems,
48-
block_size,
41+
kwargs...
4942
) do idx
5043
dst[idx] = f(src[idx])
5144
end

0 commit comments

Comments
 (0)