Skip to content

Commit a8e7d77

Browse files
committed
sample_sortperm uses by/rev/ord in the right order now. Removed AbstractGPUVector from imports. Full sample_sort benchmark suite
1 parent cf6e08d commit a8e7d77

File tree

4 files changed

+62
-26
lines changed

4 files changed

+62
-26
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,11 @@ Leave out to test the CPU backend:
299299
$> julia -e 'import Pkg; Pkg.test("AcceleratedKernels.jl")'
300300
```
301301

302+
Start Julia with multiple threads to run the tests on a multithreaded CPU backend:
303+
```bash
304+
$> julia --threads=4 -e 'import Pkg; Pkg.test("AcceleratedKernels.jl")'
305+
```
306+
302307

303308
## 8. Issues and Debugging
304309
As the compilation pipeline of GPU kernels is different from that of base Julia, error messages also look different. For example, where Julia would throw an exception when a variable name is not defined (e.g. because of a typo), a GPU kernel that throws exceptions cannot be compiled; instead you'll see cascading errors such as `"[...] compiling [...] resulted in invalid LLVM IR"` caused by `"Reason: unsupported use of an undefined name"` resulting in `"Reason: unsupported dynamic function invocation"`, etc.

prototype/parallel_sample_sort/benchmark.jl

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,30 @@ AK.sort!(v)
1616

1717
# Collect a profile
1818
Profile.clear()
19-
v = rand(1_000_000)
20-
@profile AK.sort!(v)
19+
# v = rand(1_000_000)
20+
# @profile AK.sort!(v)
21+
22+
v = rand(UInt32(0):UInt32(1_000_000), 1_000_000)
23+
ix = Vector{Int}(undef, 1_000_000)
24+
@profile AK.sortperm!(ix, v)
2125
pprof()
2226

2327

24-
println("Base vs AK sort (Int):")
25-
display(@benchmark Base.sort!(v) setup=(v = rand(1:100, 1_000_000)))
26-
display(@benchmark AK.sort!(v) setup=(v = rand(1:100, 1_000_000)))
28+
println("\nBase vs AK sort (Int):")
29+
display(@benchmark Base.sort!(v) setup=(v = rand(1:1_000_000, 1_000_000)))
30+
display(@benchmark AK.sort!(v) setup=(v = rand(1:1_000_000, 1_000_000)))
2731

2832

29-
println("Base vs AK sort (Float64):")
33+
println("\nBase vs AK sort (Float64):")
3034
display(@benchmark Base.sort!(v) setup=(v = rand(Float64, 1_000_000)))
3135
display(@benchmark AK.sort!(v) setup=(v = rand(Float64, 1_000_000)))
36+
37+
38+
println("\nBase vs AK sortperm (UInt32):")
39+
display(@benchmark Base.sortperm!(ix, v) setup=(v = rand(UInt32(0):UInt32(1_000_000), 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
40+
display(@benchmark AK.sortperm!(ix, v) setup=(v = rand(UInt32(0):UInt32(1_000_000), 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
41+
42+
43+
println("\nBase vs AK sortperm (Float64):")
44+
display(@benchmark Base.sortperm!(ix, v) setup=(v = rand(Float64, 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
45+
display(@benchmark AK.sortperm!(ix, v) setup=(v = rand(Float64, 1_000_000); ix = Vector{Int}(undef, 1_000_000)))

src/AcceleratedKernels.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ module AcceleratedKernels
1212

1313
# Internal dependencies
1414
using ArgCheck: @argcheck
15-
using GPUArraysCore: AbstractGPUVector, AbstractGPUArray, @allowscalar
15+
using GPUArraysCore: AbstractGPUArray, @allowscalar
1616
using KernelAbstractions
1717

1818

src/sort/cpu_sample_sort.jl

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@ end
1414

1515

1616
function _sample_sort_compute_offsets!(histograms, max_tasks)
17+
# Not worth parallelising this, as the number of tasks is much smaller than the number of
18+
# elements - in profiling this does not show up
1719
@inbounds begin
20+
1821
# Sum up histograms and compute global offsets for each task
1922
offsets = @view histograms[1:max_tasks, max_tasks + 1]
2023
for itask in 1:max_tasks
@@ -40,7 +43,7 @@ end
4043

4144

4245
function _sample_sort_move_buckets!(
43-
v, dest, ord,
46+
v, temp, ord,
4447
splitters, global_offsets, task_offsets,
4548
itask, max_tasks, irange,
4649
)
@@ -58,7 +61,7 @@ function _sample_sort_move_buckets!(
5861
ibucket = 1 + _searchsortedlast(splitters, v[i], 1, length(splitters), ord)
5962

6063
# Get the current destination index for this element, then increment
61-
dest[offsets[ibucket]] = v[i]
64+
temp[offsets[ibucket]] = v[i]
6265
offsets[ibucket] += 1
6366
end
6467
end
@@ -68,29 +71,29 @@ end
6871

6972

7073
function _sample_sort_sort_bucket!(
71-
v, dest, offsets, itask, max_tasks;
74+
v, temp, offsets, itask, max_tasks;
7275
lt, by, rev, order
7376
)
7477
@inbounds begin
7578
istart = offsets[itask] + 1
76-
istop = itask == max_tasks ? length(dest) : offsets[itask + 1]
79+
istop = itask == max_tasks ? length(temp) : offsets[itask + 1]
7780

7881
if istart == istop
79-
v[istart] = dest[istart]
82+
v[istart] = temp[istart]
8083
return
8184
elseif istart > istop
8285
return
8386
end
8487

85-
# At the end we will have to move elements from dest back to v anyways; for every
88+
# At the end we will have to move elements from temp back to v anyways; for every
8689
# odd-numbered itask, move elements first, to avoid false sharing from threads
8790
if isodd(itask)
88-
copyto!(v, istart, dest, istart, istop - istart + 1)
91+
copyto!(v, istart, temp, istart, istop - istart + 1)
8992
Base.sort!(view(v, istart:istop), lt=lt, by=by, rev=rev, order=order)
9093
else
9194
# For even-numbered itasks, sort first, then move elements back to v
92-
Base.sort!(view(dest, istart:istop), lt=lt, by=by, rev=rev, order=order)
93-
copyto!(v, istart, dest, istart, istop - istart + 1)
95+
Base.sort!(view(temp, istart:istop), lt=lt, by=by, rev=rev, order=order)
96+
copyto!(v, istart, temp, istart, istop - istart + 1)
9497
end
9598
end
9699

@@ -99,7 +102,7 @@ end
99102

100103

101104
function _sample_sort_parallel!(
102-
v, dest, ord,
105+
v, temp, ord,
103106
splitters, histograms,
104107
max_tasks;
105108
lt, by, rev, order,
@@ -121,7 +124,7 @@ function _sample_sort_parallel!(
121124
# Move the elements into the destination buffer
122125
itask_partition(tp) do itask, irange
123126
_sample_sort_move_buckets!(
124-
v, dest, ord,
127+
v, temp, ord,
125128
splitters, offsets, histograms,
126129
itask, max_tasks, irange,
127130
)
@@ -130,7 +133,7 @@ function _sample_sort_parallel!(
130133
# Sort each bucket in parallel
131134
itask_partition(tp) do itask, irange
132135
_sample_sort_sort_bucket!(
133-
v, dest, offsets, itask, max_tasks;
136+
v, temp, offsets, itask, max_tasks;
134137
lt=lt, by=by, rev=rev, order=order,
135138
)
136139
end
@@ -150,14 +153,14 @@ function _sample_sort_parallel!(
150153
# for itask in 1:max_tasks
151154
# irange = tp[itask]
152155
# _sample_sort_move_buckets!(
153-
# v, dest, ord,
156+
# v, temp, ord,
154157
# splitters, offsets, histograms,
155158
# itask, max_tasks, irange,
156159
# )
157160
# end
158161
# for itask in 1:max_tasks
159162
# _sample_sort_sort_bucket!(
160-
# v, dest, offsets, itask, max_tasks;
163+
# v, temp, offsets, itask, max_tasks;
161164
# lt=lt, by=by, rev=rev, order=order,
162165
# )
163166
# end
@@ -285,17 +288,31 @@ function sample_sortperm!(
285288
@argcheck length(ix) == length(v)
286289

287290
# Initialise indices that will be sorted by the keys in v
288-
foreachindex(ix, max_tasks=max_tasks) do i
291+
foreachindex(ix, max_tasks=max_tasks, min_elems=min_elems) do i
289292
@inbounds ix[i] = i
290293
end
291294

292-
# Construct custom comparator indexing into global array v for every index comparison
293-
ilt = (ix, iy) -> lt(v[ix], v[iy])
295+
# The Order may have a type instability for `rev=true`, so we keep this function barrier
296+
ord = Base.Order.ord(lt, by, rev, order)
297+
_sample_sort_barrier!(
298+
ix, v, ord;
299+
max_tasks=max_tasks,
300+
min_elems=min_elems,
301+
temp=temp,
302+
)
303+
end
304+
294305

295-
# Sort with custom comparator
306+
function _sample_sort_barrier!(ix, v, ord; max_tasks, min_elems, temp)
307+
# Construct custom comparator indexing into global array v for every index comparison
308+
comp = (ix, iy) -> Base.Order.lt(ord, v[ix], v[iy])
296309
sample_sort!(
297310
ix;
298-
lt=ilt, by=by, rev=rev, order=order,
311+
lt=comp,
312+
313+
# Leave defaults - we already have a custom comparator
314+
# by=identity, rev=nothing, order=Base.Order.Forward,
315+
299316
max_tasks=max_tasks,
300317
min_elems=min_elems,
301318
temp=temp,

0 commit comments

Comments
 (0)