Skip to content

Commit 2d86364

Browse files
committed
Determine the algorithm in a KernelAbstractions 0.10-compatible way
1 parent d44b474 commit 2d86364

26 files changed

+330
-247
lines changed

src/AcceleratedKernels.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ module AcceleratedKernels
1212

1313
# Internal dependencies
1414
using ArgCheck: @argcheck
15-
using GPUArraysCore: AbstractGPUArray, @allowscalar
15+
using GPUArraysCore: AnyGPUArray, @allowscalar
1616
using KernelAbstractions
1717

1818

src/accumulate/accumulate.jl

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ include("accumulate_nd.jl")
3838
# CPU settings
3939
max_tasks::Int=Threads.nthreads(),
4040
min_elems::Int=2,
41+
prefer_threads::Bool=true,
4142
4243
# Algorithm choice
4344
alg::AccumulateAlgorithm=DecoupledLookback(),
@@ -58,6 +59,7 @@ include("accumulate_nd.jl")
5859
# CPU settings
5960
max_tasks::Int=Threads.nthreads(),
6061
min_elems::Int=2,
62+
prefer_threads::Bool=true,
6163
6264
# Algorithm choice
6365
alg::AccumulateAlgorithm=DecoupledLookback(),
@@ -80,7 +82,9 @@ we do not need the constraint of `dst` and `src` being different; to minimise me
8082
recommend using the single-array interface (the first one above).
8183
8284
## CPU
83-
Use at most `max_tasks` threads with at least `min_elems` elements per task.
85+
Use at most `max_tasks` threads with at least `min_elems` elements per task. `prefer_threads` tells
86+
AK to prioritize using the CPU algorithm implementation (default behaviour) over the KA algorithm
87+
through POCL.
8488
8589
Note that accumulation is typically a memory-bound operation, so multithreaded accumulation only
8690
becomes faster if it is a more compute-heavy operation to hide memory latency - that includes:
@@ -167,24 +171,34 @@ function _accumulate_impl!(
167171
# CPU settings
168172
max_tasks::Int=Threads.nthreads(),
169173
min_elems::Int=2,
174+
prefer_threads::Bool=true,
170175

171176
# GPU settings
172177
block_size::Int=256,
173178
temp::Union{Nothing, AbstractArray}=nothing,
174179
temp_flags::Union{Nothing, AbstractArray}=nothing,
175180
)
176181
if isnothing(dims)
177-
return accumulate_1d!(
178-
op, v, backend, alg;
179-
init, neutral, inclusive,
180-
max_tasks, min_elems,
181-
block_size, temp, temp_flags,
182-
)
182+
return if use_KA_algo(v, prefer_threads)
183+
accumulate_1d_gpu!(
184+
op, v, backend, alg;
185+
init, neutral, inclusive,
186+
max_tasks, min_elems,
187+
block_size, temp, temp_flags,
188+
)
189+
else
190+
accumulate_1d_cpu!(
191+
op, v, backend, alg;
192+
init, neutral, inclusive,
193+
max_tasks, min_elems,
194+
block_size, temp, temp_flags,
195+
)
196+
end
183197
else
184198
return accumulate_nd!(
185199
op, v, backend;
186200
init, neutral, dims, inclusive,
187-
max_tasks, min_elems,
201+
max_tasks, min_elems, prefer_threads,
188202
block_size,
189203
)
190204
end

src/accumulate/accumulate_1d_cpu.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
function accumulate_1d!(
2-
op, v::AbstractArray, backend::CPU, alg;
1+
function accumulate_1d_cpu!(
2+
op, v::AbstractArray, backend::Backend, alg;
33
init,
44
neutral,
55
inclusive::Bool,

src/accumulate/accumulate_1d_gpu.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,8 @@ end
248248

249249

250250
# DecoupledLookback algorithm
251-
function accumulate_1d!(
252-
op, v::AbstractArray, backend::GPU, ::DecoupledLookback;
251+
function accumulate_1d_gpu!(
252+
op, v::AbstractArray, backend::Backend, ::DecoupledLookback;
253253
init,
254254
neutral,
255255
inclusive::Bool,
@@ -307,8 +307,8 @@ end
307307

308308

309309
# ScanPrefixes algorithm
310-
function accumulate_1d!(
311-
op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
310+
function accumulate_1d_gpu!(
311+
op, v::AbstractArray, backend, ::ScanPrefixes;
312312
init,
313313
neutral,
314314
inclusive::Bool,

src/accumulate/accumulate_nd.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ function accumulate_nd!(
88
# CPU settings
99
max_tasks::Int,
1010
min_elems::Int,
11+
prefer_threads::Bool=true,
1112

1213
# GPU settings
1314
block_size::Int,
@@ -34,7 +35,7 @@ function accumulate_nd!(
3435

3536
# Degenerate cases end
3637

37-
if backend isa CPU
38+
if !use_KA_algo(v, prefer_threads)
3839
_accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
3940
else
4041
# On GPUs we have two parallelisation approaches, based on which dimension has more elements:

src/foreachindex.jl

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ end
1515
function _forindices_gpu(
1616
f,
1717
indices,
18-
backend::GPU;
18+
backend::Backend;
1919

2020
block_size::Int=256,
2121
)
@@ -47,6 +47,7 @@ end
4747
# CPU settings
4848
max_tasks=Threads.nthreads(),
4949
min_elems=1,
50+
prefer_threads::Bool=true,
5051
5152
# GPU settings
5253
block_size=256,
@@ -60,7 +61,8 @@ MtlArray, oneArray - with one GPU thread per index.
6061
On CPUs at most `max_tasks` threads are launched, or fewer such that each thread processes at least
6162
`min_elems` indices; if a single task ends up being needed, `f` is inlined and no thread is
6263
launched. Tune it to your function - the more expensive it is, the fewer elements are needed to
63-
amortise the cost of launching a thread (which is a few μs).
64+
amortise the cost of launching a thread (which is a few μs). `prefer_threads` tells AK to prioritize
65+
using the CPU algorithm implementation (default behaviour) over the KA algorithm through POCL.
6466
6567
# Examples
6668
Normally you would write a for loop like this:
@@ -125,11 +127,12 @@ function foreachindex(
125127
# CPU settings
126128
max_tasks=Threads.nthreads(),
127129
min_elems=1,
130+
prefer_threads::Bool=true,
128131

129132
# GPU settings
130133
block_size=256,
131134
)
132-
if backend isa GPU
135+
if use_KA_algo(itr, prefer_threads)
133136
_forindices_gpu(f, eachindex(itr), backend; block_size)
134137
else
135138
_forindices_threads(f, eachindex(itr); max_tasks, min_elems)
@@ -144,6 +147,7 @@ end
144147
# CPU settings
145148
max_tasks=Threads.nthreads(),
146149
min_elems=1,
150+
prefer_threads::Bool=true,
147151
148152
# GPU settings
149153
block_size=256,
@@ -157,7 +161,8 @@ MtlArray, oneArray - with one GPU thread per index.
157161
On CPUs at most `max_tasks` threads are launched, or fewer such that each thread processes at least
158162
`min_elems` indices; if a single task ends up being needed, `f` is inlined and no thread is
159163
launched. Tune it to your function - the more expensive it is, the fewer elements are needed to
160-
amortise the cost of launching a thread (which is a few μs).
164+
amortise the cost of launching a thread (which is a few μs). `prefer_threads` tells AK to prioritize
165+
using the CPU algorithm implementation (default behaviour) over the KA algorithm through POCL.
161166
162167
# Examples
163168
Normally you would write a for loop like this:
@@ -218,6 +223,7 @@ function foraxes(
218223
# CPU settings
219224
max_tasks=Threads.nthreads(),
220225
min_elems=1,
226+
prefer_threads::Bool=true,
221227

222228
# GPU settings
223229
block_size=256,
@@ -226,11 +232,11 @@ function foraxes(
226232
return foreachindex(
227233
f, itr, backend;
228234
max_tasks, min_elems,
229-
block_size,
235+
prefer_threads, block_size,
230236
)
231237
end
232238

233-
if backend isa GPU
239+
if use_KA_algo(itr, prefer_threads)
234240
_forindices_gpu(f, axes(itr, dims), backend; block_size)
235241
else
236242
_forindices_threads(f, axes(itr, dims); max_tasks, min_elems)

src/map.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# CPU settings
66
max_tasks=Threads.nthreads(),
77
min_elems=1,
8+
prefer_threads::Bool=true,
89
910
# GPU settings
1011
block_size=256,
@@ -53,6 +54,7 @@ end
5354
# CPU settings
5455
max_tasks=Threads.nthreads(),
5556
min_elems=1,
57+
prefer_threads::Bool=true,
5658
5759
# GPU settings
5860
block_size=256,

src/predicates.jl

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ end
3939
# CPU settings
4040
max_tasks=Threads.nthreads(),
4141
min_elems=1,
42+
prefer_threads::Bool=true,
4243
4344
# GPU settings
4445
block_size::Int=256,
@@ -53,7 +54,9 @@ reduction.
5354
## CPU
5455
Multithreaded parallelisation is only worth it for large arrays, relatively expensive predicates,
5556
and/or rare occurrence of true; use `max_tasks` and `min_elems` to only use parallelism when worth
56-
it in your application. When only one thread is needed, there is no overhead.
57+
it in your application. When only one thread is needed, there is no overhead. `prefer_threads`
58+
tells AK to prioritize using the CPU algorithm implementation (default behaviour) over the KA
59+
algorithm through POCL.
5760
5861
## GPU
5962
There are two possible `alg` choices:
@@ -114,11 +117,12 @@ function _any_impl(
114117
# CPU settings
115118
max_tasks=Threads.nthreads(),
116119
min_elems=1,
120+
prefer_threads::Bool=true,
117121

118122
# GPU settings
119123
block_size::Int=256,
120124
)
121-
if backend isa GPU
125+
if use_KA_algo(v, prefer_threads)
122126
@argcheck block_size > 0
123127

124128
# Some platforms crash when multiple threads write to the same memory location in a global
@@ -137,7 +141,8 @@ function _any_impl(
137141
backend;
138142
init=false,
139143
neutral=false,
140-
block_size=block_size,
144+
prefer_threads=true,
145+
block_size,
141146
temp=alg.temp,
142147
switch_below=alg.switch_below,
143148
)
@@ -171,6 +176,7 @@ end
171176
# CPU settings
172177
max_tasks=Threads.nthreads(),
173178
min_elems=1,
179+
prefer_threads::Bool=true,
174180
175181
# GPU settings
176182
block_size::Int=256,
@@ -185,7 +191,9 @@ reduction.
185191
## CPU
186192
Multithreaded parallelisation is only worth it for large arrays, relatively expensive predicates,
187193
and/or rare occurrence of true; use `max_tasks` and `min_elems` to only use parallelism when worth
188-
it in your application. When only one thread is needed, there is no overhead.
194+
it in your application. When only one thread is needed, there is no overhead. `prefer_threads`
195+
tells AK to prioritize using the CPU algorithm implementation (default behaviour) over the KA
196+
algorithm through POCL.
189197
190198
## GPU
191199
There are two possible `alg` choices:
@@ -246,11 +254,12 @@ function _all_impl(
246254
# CPU settings
247255
max_tasks=Threads.nthreads(),
248256
min_elems=1,
257+
prefer_threads::Bool=true,
249258

250259
# GPU settings
251260
block_size::Int=256,
252261
)
253-
if backend isa GPU
262+
if use_KA_algo(v, prefer_threads)
254263
@argcheck block_size > 0
255264

256265
# Some platforms crash when multiple threads write to the same memory location in a global
@@ -269,7 +278,8 @@ function _all_impl(
269278
backend;
270279
init=true,
271280
neutral=true,
272-
block_size=block_size,
281+
prefer_threads=false,
282+
block_size,
273283
temp=alg.temp,
274284
switch_below=alg.switch_below,
275285
)

src/reduce/mapreduce_1d_cpu.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
function mapreduce_1d(
2-
f, op, src::AbstractArray, backend::CPU;
1+
function mapreduce_1d_cpu(
2+
f, op, src::AbstractArray, backend::Backend;
33
init,
44
neutral,
55

src/reduce/mapreduce_1d_gpu.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,8 @@
9999
end
100100

101101

102-
function mapreduce_1d(
103-
f, op, src::AbstractArray, backend::GPU;
102+
function mapreduce_1d_gpu(
103+
f, op, src::AbstractArray, backend::Backend;
104104
init,
105105
neutral,
106106

0 commit comments

Comments
 (0)