sample_sortperm! now uses by/rev/ord in the right order. Removed AbstractGPUVector from imports. Added a full sample_sort benchmark suite.

anicusan · anicusan · commit a8e7d77d86b8 · 2025-05-25T01:12:31.000+01:00
diff --git a/README.md b/README.md
@@ -299,6 +299,11 @@ Leave out to test the CPU backend:
 $> julia -e 'import Pkg; Pkg.test("AcceleratedKernels.jl")'
 ```
 
+Start Julia with multiple threads to run the tests on a multithreaded CPU backend:
+```bash
+$> julia --threads=4 -e 'import Pkg; Pkg.test("AcceleratedKernels.jl")'
+```
+
 
 ## 8. Issues and Debugging
 As the compilation pipeline of GPU kernels is different to that of base Julia, error messages also look different - for example, where Julia would insert an exception when a variable name was not defined (e.g. we had a typo), a GPU kernel throwing exceptions cannot be compiled and instead you'll see some cascading errors like `"[...] compiling [...] resulted in invalid LLVM IR"` caused by `"Reason: unsupported use of an undefined name"` resulting in `"Reason: unsupported dynamic function invocation"`, etc.
diff --git a/prototype/parallel_sample_sort/benchmark.jl b/prototype/parallel_sample_sort/benchmark.jl
@@ -16,16 +16,30 @@ AK.sort!(v)
 
 # Collect a profile
 Profile.clear()
-v = rand(1_000_000)
-@profile AK.sort!(v)
+# v = rand(1_000_000)
+# @profile AK.sort!(v)
+
+v = rand(UInt32(0):UInt32(1_000_000), 1_000_000)
+ix = Vector{Int}(undef, 1_000_000)
+@profile AK.sortperm!(ix, v)
 pprof()
 
 
-println("Base vs AK sort (Int):")
-display(@benchmark Base.sort!(v) setup=(v = rand(1:100, 1_000_000)))
-display(@benchmark AK.sort!(v) setup=(v = rand(1:100, 1_000_000)))
+println("\nBase vs AK sort (Int):")
+display(@benchmark Base.sort!(v) setup=(v = rand(1:1_000_000, 1_000_000)))
+display(@benchmark AK.sort!(v) setup=(v = rand(1:1_000_000, 1_000_000)))
 
 
-println("Base vs AK sort (Float64):")
+println("\nBase vs AK sort (Float64):")
 display(@benchmark Base.sort!(v) setup=(v = rand(Float64, 1_000_000)))
 display(@benchmark AK.sort!(v) setup=(v = rand(Float64, 1_000_000)))
+
+
+println("\nBase vs AK sortperm (UInt32):")
+display(@benchmark Base.sortperm!(ix, v) setup=(v = rand(UInt32(0):UInt32(1_000_000), 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
+display(@benchmark AK.sortperm!(ix, v) setup=(v = rand(UInt32(0):UInt32(1_000_000), 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
+
+
+println("\nBase vs AK sortperm (Float64):")
+display(@benchmark Base.sortperm!(ix, v) setup=(v = rand(Float64, 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
+display(@benchmark AK.sortperm!(ix, v) setup=(v = rand(Float64, 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
@@ -12,7 +12,7 @@ module AcceleratedKernels
 
 # Internal dependencies
 using ArgCheck: @argcheck
-using GPUArraysCore: AbstractGPUVector, AbstractGPUArray, @allowscalar
+using GPUArraysCore: AbstractGPUArray, @allowscalar
 using KernelAbstractions
 
 
diff --git a/src/sort/cpu_sample_sort.jl b/src/sort/cpu_sample_sort.jl
@@ -14,7 +14,10 @@ end
 
 
 function _sample_sort_compute_offsets!(histograms, max_tasks)
+    # Not worth parallelising this, as the number of tasks is much smaller than the number of
+    # elements - in profiling this does not show up
     @inbounds begin
+
         # Sum up histograms and compute global offsets for each task
         offsets = @view histograms[1:max_tasks, max_tasks + 1]
         for itask in 1:max_tasks
@@ -40,7 +43,7 @@ end
 
 
 function _sample_sort_move_buckets!(
-    v, dest, ord,
+    v, temp, ord,
     splitters, global_offsets, task_offsets,
     itask, max_tasks, irange,
 )
@@ -58,7 +61,7 @@ function _sample_sort_move_buckets!(
             ibucket = 1 + _searchsortedlast(splitters, v[i], 1, length(splitters), ord)
 
             # Get the current destination index for this element, then increment
-            dest[offsets[ibucket]] = v[i]
+            temp[offsets[ibucket]] = v[i]
             offsets[ibucket] += 1
         end
     end
@@ -68,29 +71,29 @@ end
 
 
 function _sample_sort_sort_bucket!(
-    v, dest, offsets, itask, max_tasks;
+    v, temp, offsets, itask, max_tasks;
     lt, by, rev, order    
 )
     @inbounds begin
         istart = offsets[itask] + 1
-        istop = itask == max_tasks ? length(dest) : offsets[itask + 1]
+        istop = itask == max_tasks ? length(temp) : offsets[itask + 1]
 
         if istart == istop
-            v[istart] = dest[istart]
+            v[istart] = temp[istart]
             return
         elseif istart > istop
             return
         end
 
-        # At the end we will have to move elements from dest back to v anyways; for every
+        # At the end we will have to move elements from temp back to v anyway; for every
         # odd-numbered itask, move elements first, to avoid false sharing from threads
         if isodd(itask)
-            copyto!(v, istart, dest, istart, istop - istart + 1)
+            copyto!(v, istart, temp, istart, istop - istart + 1)
             Base.sort!(view(v, istart:istop), lt=lt, by=by, rev=rev, order=order)
         else
             # For even-numbered itasks, sort first, then move elements back to v
-            Base.sort!(view(dest, istart:istop), lt=lt, by=by, rev=rev, order=order)
-            copyto!(v, istart, dest, istart, istop - istart + 1)
+            Base.sort!(view(temp, istart:istop), lt=lt, by=by, rev=rev, order=order)
+            copyto!(v, istart, temp, istart, istop - istart + 1)
         end
     end
 
@@ -99,7 +102,7 @@ end
 
 
 function _sample_sort_parallel!(
-    v, dest, ord,
+    v, temp, ord,
     splitters, histograms,
     max_tasks;
     lt, by, rev, order,
@@ -121,7 +124,7 @@ function _sample_sort_parallel!(
     # Move the elements into the destination buffer
     itask_partition(tp) do itask, irange
         _sample_sort_move_buckets!(
-            v, dest, ord,
+            v, temp, ord,
             splitters, offsets, histograms,
             itask, max_tasks, irange,
         )
@@ -130,7 +133,7 @@ function _sample_sort_parallel!(
     # Sort each bucket in parallel
     itask_partition(tp) do itask, irange
         _sample_sort_sort_bucket!(
-            v, dest, offsets, itask, max_tasks;
+            v, temp, offsets, itask, max_tasks;
             lt=lt, by=by, rev=rev, order=order,
         )
     end
@@ -150,14 +153,14 @@ function _sample_sort_parallel!(
     # for itask in 1:max_tasks
     #     irange = tp[itask]
     #     _sample_sort_move_buckets!(
-    #         v, dest, ord,
+    #         v, temp, ord,
     #         splitters, offsets, histograms,
     #         itask, max_tasks, irange,
     #     )
     # end
     # for itask in 1:max_tasks
     #     _sample_sort_sort_bucket!(
-    #         v, dest, offsets, itask, max_tasks;
+    #         v, temp, offsets, itask, max_tasks;
     #         lt=lt, by=by, rev=rev, order=order,
     #     )
     # end
@@ -285,17 +288,31 @@ function sample_sortperm!(
     @argcheck length(ix) == length(v)
 
     # Initialise indices that will be sorted by the keys in v
-    foreachindex(ix, max_tasks=max_tasks) do i
+    foreachindex(ix, max_tasks=max_tasks, min_elems=min_elems) do i
         @inbounds ix[i] = i
     end
 
-    # Construct custom comparator indexing into global array v for every index comparison
-    ilt = (ix, iy) -> lt(v[ix], v[iy])
+    # The Order may have a type instability for `rev=true`, so we keep this function barrier
+    ord = Base.Order.ord(lt, by, rev, order)
+    _sample_sort_barrier!(
+        ix, v, ord;
+        max_tasks=max_tasks,
+        min_elems=min_elems,
+        temp=temp,
+    )
+end
+
 
-    # Sort with custom comparator
+function _sample_sort_barrier!(ix, v, ord; max_tasks, min_elems, temp)
+    # Construct custom comparator indexing into global array v for every index comparison
+    comp = (ix, iy) -> Base.Order.lt(ord, v[ix], v[iy])
     sample_sort!(
         ix;
-        lt=ilt, by=by, rev=rev, order=order,
+        lt=comp,
+
+        # Leave by/rev/order at their defaults - comp already applies the ordering via ord
+        # by=identity, rev=nothing, order=Base.Order.Forward,
+
         max_tasks=max_tasks,
         min_elems=min_elems,
         temp=temp,