Commit d997769

Merge pull request #45 from christiangnrd/ka0.10b

Stop relying on backend type to determine algorithm used

This is in preparation for the PoCL backend becoming the KernelAbstractions default for CPUs. Until the PoCL algorithm implementations are faster than the Julia Base / Threads ones we have in AcceleratedKernels, we will keep those.

2 parents c00ce96 + c62a66c, commit d997769

26 files changed: 336 additions, 250 deletions
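
The central change is a new predicate, use_KA_algo, which replaces the "backend isa GPU" / "backend isa CPU" checks throughout the files below. Its definition is not included in this section; the following is only a minimal sketch of the shape such a predicate could take, given the AnyGPUArray import added below and the prefer_threads keyword threaded through the callers (an illustration, not the commit's actual definition):

    using GPUArraysCore: AnyGPUArray

    # Hypothetical sketch: always use the KernelAbstractions (KA) algorithms
    # for GPU arrays, and use them for CPU arrays only when the caller opts
    # out of the Julia Base / Threads implementations.
    use_KA_algo(v::AnyGPUArray, prefer_threads::Bool) = true
    use_KA_algo(v::AbstractArray, prefer_threads::Bool) = !prefer_threads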

.github/workflows/CI-CPU.yml

Lines changed: 40 additions & 0 deletions

@@ -63,6 +63,46 @@ jobs:
       - uses: julia-actions/julia-runtest@v1
         env:
           JULIA_NUM_THREADS: ${{ matrix.env.JULIA_NUM_THREADS }}
+  OpenCL:
+    name: OpenCL
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+      actions: write
+      contents: read
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: 1
+          arch: x64
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+        with:
+          test_args: '--OpenCL'
+  # cpuKA:
+  #   name: KA CPU Backend
+  #   runs-on: ubuntu-latest
+  #   timeout-minutes: 60
+  #   permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+  #     actions: write
+  #     contents: read
+  #   strategy:
+  #     fail-fast: true
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: julia-actions/setup-julia@v2
+  #       with:
+  #         version: 1
+  #         arch: x64
+  #     - uses: julia-actions/cache@v2
+  #     - uses: julia-actions/julia-buildpkg@v1
+  #     - uses: julia-actions/julia-runtest@v1
+  #       with:
+  #         test_args: '--cpuKA'
   docs:
     name: Documentation
     runs-on: ubuntu-latest

src/AcceleratedKernels.jl

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ module AcceleratedKernels
 
 # Internal dependencies
 using ArgCheck: @argcheck
-using GPUArraysCore: AbstractGPUArray, @allowscalar
+using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
 
 
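
Context for this swap: AbstractGPUArray only matches concrete GPU array types, while AnyGPUArray also matches lazy wrappers (views, transposes, reshapes) whose parent is a GPU array, so the array-based checks keep working after wrapping. A small illustration, assuming the JLArrays reference backend from the GPUArrays test stack is available:

    using GPUArraysCore
    using JLArrays   # reference AbstractGPUArray implementation, assumed available

    v = JLArray(rand(Float32, 16))
    v isa GPUArraysCore.AbstractGPUArray                # true
    view(v, 1:8) isa GPUArraysCore.AbstractGPUArray     # false: SubArray is a wrapper
    view(v, 1:8) isa GPUArraysCore.AnyGPUArray          # true: wrappers are matched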

src/accumulate/accumulate.jl

Lines changed: 17 additions & 7 deletions

@@ -167,24 +167,34 @@ function _accumulate_impl!(
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=2,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
     temp::Union{Nothing, AbstractArray}=nothing,
     temp_flags::Union{Nothing, AbstractArray}=nothing,
 )
     if isnothing(dims)
-        return accumulate_1d!(
-            op, v, backend, alg;
-            init, neutral, inclusive,
-            max_tasks, min_elems,
-            block_size, temp, temp_flags,
-        )
+        return if use_KA_algo(v, prefer_threads)
+            accumulate_1d_gpu!(
+                op, v, backend, alg;
+                init, neutral, inclusive,
+                max_tasks, min_elems,
+                block_size, temp, temp_flags,
+            )
+        else
+            accumulate_1d_cpu!(
+                op, v, backend, alg;
+                init, neutral, inclusive,
+                max_tasks, min_elems,
+                block_size, temp, temp_flags,
+            )
+        end
     else
         return accumulate_nd!(
             op, v, backend;
             init, neutral, dims, inclusive,
-            max_tasks, min_elems,
+            max_tasks, min_elems, prefer_threads,
             block_size,
         )
     end
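
At the call site this keeps the existing default (Julia Base / Threads on CPU arrays) while allowing the KA path to be forced. A hedged usage sketch, assuming the exported accumulate! wrapper forwards prefer_threads to _accumulate_impl! like the other keywords:

    import AcceleratedKernels as AK

    v = ones(Int32, 1_000)

    # Default: a CPU array takes the Julia Base / Threads scan (prefer_threads=true)
    AK.accumulate!(+, v; init=zero(Int32))

    # Force the KernelAbstractions implementation, e.g. to exercise the code
    # path that the PoCL backend will use on CPUs
    AK.accumulate!(+, v; init=zero(Int32), prefer_threads=false)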

src/accumulate/accumulate_1d_cpu.jl

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-function accumulate_1d!(
-    op, v::AbstractArray, backend::CPU, alg;
+function accumulate_1d_cpu!(
+    op, v::AbstractArray, backend::Backend, alg;
     init,
     neutral,
     inclusive::Bool,

src/accumulate/accumulate_1d_gpu.jl

Lines changed: 4 additions & 4 deletions

@@ -248,8 +248,8 @@ end
 
 
 # DecoupledLookback algorithm
-function accumulate_1d!(
-    op, v::AbstractArray, backend::GPU, ::DecoupledLookback;
+function accumulate_1d_gpu!(
+    op, v::AbstractArray, backend::Backend, ::DecoupledLookback;
     init,
     neutral,
     inclusive::Bool,
@@ -307,8 +307,8 @@ end
 
 
 # ScanPrefixes algorithm
-function accumulate_1d!(
-    op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
+function accumulate_1d_gpu!(
+    op, v::AbstractArray, backend, ::ScanPrefixes;
     init,
     neutral,
     inclusive::Bool,

src/accumulate/accumulate_nd.jl

Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@ function accumulate_nd!(
     # CPU settings
     max_tasks::Int,
     min_elems::Int,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int,
@@ -34,7 +35,7 @@
 
     # Degenerate cases end
 
-    if backend isa CPU
+    if !use_KA_algo(v, prefer_threads)
         _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
     else
         # On GPUs we have two parallelisation approaches, based on which dimension has more elements:

src/foreachindex.jl

Lines changed: 6 additions & 4 deletions

@@ -15,7 +15,7 @@ end
 function _forindices_gpu(
     f,
     indices,
-    backend::GPU;
+    backend::Backend;
 
     block_size::Int=256,
 )
@@ -125,11 +125,12 @@ function foreachindex(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size=256,
 )
-    if backend isa GPU
+    if use_KA_algo(itr, prefer_threads)
         _forindices_gpu(f, eachindex(itr), backend; block_size)
     else
         _forindices_threads(f, eachindex(itr); max_tasks, min_elems)
@@ -218,6 +219,7 @@ function foraxes(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size=256,
@@ -226,11 +228,11 @@
         return foreachindex(
             f, itr, backend;
             max_tasks, min_elems,
-            block_size,
+            prefer_threads, block_size,
         )
     end
 
-    if backend isa GPU
+    if use_KA_algo(itr, prefer_threads)
         _forindices_gpu(f, axes(itr, dims), backend; block_size)
     else
         _forindices_threads(f, axes(itr, dims); max_tasks, min_elems)
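
The user-facing effect: foreachindex and foraxes still pick the Threads loop for CPU arrays by default, but prefer_threads=false now routes a CPU array through the KA kernel instead of dispatching on the backend's type. A small sketch of the call site, using the documented do-block form:

    import AcceleratedKernels as AK

    x = rand(Float32, 10_000)
    y = similar(x)

    # Threads-based loop by default on a CPU array...
    AK.foreachindex(x) do i
        y[i] = 2 * x[i]
    end

    # ...or the KernelAbstractions kernel when explicitly requested
    AK.foreachindex(x; prefer_threads=false) do i
        y[i] = 2 * x[i]
    end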

src/map.jl

Lines changed: 2 additions & 9 deletions

@@ -33,19 +33,12 @@ end
 """
 function map!(
     f, dst::AbstractArray, src::AbstractArray, backend::Backend=get_backend(src);
-
-    # CPU settings
-    max_tasks=Threads.nthreads(),
-    min_elems=1,
-
-    # GPU settings
-    block_size=256,
+    kwargs...
 )
     @argcheck length(dst) == length(src)
     foreachindex(
         src, backend;
-        max_tasks, min_elems,
-        block_size,
+        kwargs...
     ) do idx
         dst[idx] = f(src[idx])
     end
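
Design note: collapsing the explicit keyword list into kwargs... means map! forwards whatever foreachindex accepts, so new options such as prefer_threads are picked up without touching map! again. For example (same forwarding assumption as above):

    import AcceleratedKernels as AK

    src = rand(Float32, 1024)
    dst = similar(src)

    # max_tasks, min_elems, block_size and prefer_threads all pass straight
    # through to foreachindex
    AK.map!(abs, dst, src; max_tasks=4, prefer_threads=false, block_size=128)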

src/predicates.jl

Lines changed: 8 additions & 4 deletions

@@ -114,11 +114,12 @@ function _any_impl(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         @argcheck block_size > 0
 
         # Some platforms crash when multiple threads write to the same memory location in a global
@@ -137,7 +138,8 @@
             backend;
             init=false,
             neutral=false,
-            block_size=block_size,
+            prefer_threads=true,
+            block_size,
             temp=alg.temp,
             switch_below=alg.switch_below,
         )
@@ -246,11 +248,12 @@ function _all_impl(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         @argcheck block_size > 0
 
         # Some platforms crash when multiple threads write to the same memory location in a global
@@ -269,7 +272,8 @@
             backend;
             init=true,
             neutral=true,
-            block_size=block_size,
+            prefer_threads=false,
+            block_size,
             temp=alg.temp,
             switch_below=alg.switch_below,
         )
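
One detail visible in the hunks above: the inner mapreduce fallbacks pin prefer_threads explicitly (true in _any_impl, false in _all_impl) rather than forwarding the caller's value. At the user level, a hedged call-site sketch, assuming the exported any/all wrappers forward prefer_threads like the other keywords:

    import AcceleratedKernels as AK

    v = rand(Float32, 100_000)

    AK.any(x -> x > 0.99f0, v)                        # Threads path on CPU arrays
    AK.all(x -> x >= 0.0f0, v; prefer_threads=false)  # force the KA kernel path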

src/reduce/mapreduce_1d_cpu.jl

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-function mapreduce_1d(
-    f, op, src::AbstractArray, backend::CPU;
+function mapreduce_1d_cpu(
+    f, op, src::AbstractArray, backend::Backend;
     init,
     neutral,
 

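Taken together, the renames follow one pattern: methods that used to dispatch on backend::CPU / backend::GPU become explicitly suffixed _cpu / _gpu functions over a generic ::Backend, with the choice made once by use_KA_algo at the caller. A hypothetical caller-side sketch for the reduce family (mapreduce_1d and a mapreduce_1d_gpu counterpart are assumed to mirror the accumulate_1d split shown above; this is illustrative, not the commit's code):

    using KernelAbstractions: Backend

    # Illustrative only; the commit's actual dispatch sites are in the hunks above.
    function mapreduce_1d(f, op, src::AbstractArray, backend::Backend;
                          prefer_threads::Bool=true, kwargs...)
        if use_KA_algo(src, prefer_threads)
            mapreduce_1d_gpu(f, op, src, backend; kwargs...)   # KA kernel path
        else
            mapreduce_1d_cpu(f, op, src, backend; kwargs...)   # Base/Threads path
        end
    end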