
Commit b1d4a5e

KernelIntrinsics API (#635)

Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
1 parent 0ece57c commit b1d4a5e

7 files changed: +656 -91 lines changed

examples/histogram.jl
Lines changed: 12 additions & 12 deletions
@@ -1,6 +1,8 @@
 # INCLUDE ROCM
 using KernelAbstractions, Test
 using KernelAbstractions: @atomic, @atomicswap, @atomicreplace
+import KernelAbstractions.KernelIntrinsics as KI
+
 include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
 
 # Function to use as a baseline for CPU metrics
@@ -12,16 +14,15 @@ function create_histogram(input)
     return histogram_output
 end
 
-# This is a 1D histogram kernel where the histogramming happens on shmem
-@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
-    gid = @index(Group, Linear)
-    lid = @index(Local, Linear)
+# This is a 1D histogram kernel where the histogramming happens on static shmem
+function histogram_kernel!(histogram_output, input, ::Val{gs}) where {gs}
+    gid = KI.get_group_id().x
+    lid = KI.get_local_id().x
 
-    @uniform gs = prod(@groupsize())
     tid = (gid - 1) * gs + lid
-    @uniform N = length(histogram_output)
+    N = length(histogram_output)
 
-    shared_histogram = @localmem eltype(input) (gs)
+    shared_histogram = KI.localmemory(eltype(input), gs)
 
     # This will go through all input elements and assign them to a location in
     # shmem. Note that if there is not enough shmem, we create different shmem
@@ -32,7 +33,7 @@ end
 
         # Setting shared_histogram to 0
        @inbounds shared_histogram[lid] = 0
-        @synchronize()
+        KI.barrier()
 
        max_element = min_element + gs
        if max_element > N
@@ -46,21 +47,20 @@ end
            @atomic shared_histogram[bin] += 1
        end
 
-        @synchronize()
+        KI.barrier()
 
        if ((lid + min_element - 1) <= N)
            @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
        end
 
    end
-
+    return
 end
 
 function histogram!(histogram_output, input, groupsize = 256)
    backend = get_backend(histogram_output)
    # Need static block size
-    kernel! = histogram_kernel!(backend, (groupsize,))
-    kernel!(histogram_output, input, ndrange = size(input))
+    KI.@kernel backend workgroupsize = groupsize numworkgroups = cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize))
    return
 end
 
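
Note: the hunks above show both halves of the new API in one place — intrinsics (KI.get_group_id, KI.get_local_id, KI.localmemory, KI.barrier) inside a plain Julia function, and a KI.@kernel launch with explicit workgroupsize/numworkgroups. A minimal sketch of the same pattern, reduced to a vector add; vadd_kernel! and vadd! are hypothetical names, and the KI signatures are assumed from this diff only:

using KernelAbstractions
import KernelAbstractions.KernelIntrinsics as KI

# Hypothetical kernel: one work-item per element.
function vadd_kernel!(c, a, b)
    i = KI.get_global_id().x        # 1-based linear global index (per this diff)
    if i <= length(c)               # guard the ragged final workgroup
        @inbounds c[i] = a[i] + b[i]
    end
    return
end

function vadd!(c, a, b, groupsize = 256)
    backend = get_backend(c)
    # Same launch shape as histogram! above: fixed workgroup size,
    # enough groups to cover the array.
    KI.@kernel backend workgroupsize = groupsize numworkgroups = cld(length(c), groupsize) vadd_kernel!(c, a, b)
    return
end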

examples/performant_matmul.jl
Lines changed: 33 additions & 31 deletions
@@ -1,78 +1,79 @@
 using KernelAbstractions
+import KernelAbstractions.KernelIntrinsics as KI
+
 using StaticArrays
 using Test
 using Random
+
 include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
 
 # We use a TILE_DIM of 16 as a safe value since while
 # most backends support up to 1024 threads per group,
 # Metal sometimes supports fewer.
 const TILE_DIM = 16
 
-@kernel unsafe_indices = true function coalesced_matmul_kernel!(
-        output, @Const(input1), @Const(input2), N, R, M,
-        ::Val{BANK} = Val(1),
-    ) where {BANK}
-    gi, gj = @index(Group, NTuple)
-    i, j = @index(Local, NTuple)
-
-    TILE_DIM = @uniform @groupsize()[1]
+function coalesced_matmul_kernel!(
+        output, input1, input2, N, R, M,
+        ::Val{TDIM}, ::Val{BANK} = Val(1)
+    ) where {TDIM, BANK}
+    gi, gj, _ = KI.get_group_id()
+    i, j, _ = KI.get_local_id()
 
     # +1 to avoid bank conflicts on shared memory
-    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
-    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
+    tile1 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
+    tile2 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
 
-    # private variable for tile output
-    outval = @private eltype(output) 1
-    @inbounds outval[1] = -zero(eltype(output))
+    # variable for tile output
+    outval = -zero(eltype(output))
 
-    @uniform N = size(output, 1)
+    N = size(output, 1)
     # number of tiles depends on inner dimension
-    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
+    NUM_TILES = div(R + TDIM - 1, TDIM)
 
     # loop over all tiles needed for this calculation
     for t in 0:(NUM_TILES - 1)
        # Can't use @index(Global), because we use a smaller ndrange
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
        # load inputs into tiles, with bounds checking for non-square matrices
-        if I <= N && t * TILE_DIM + j <= R
-            @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
+        if I <= N && t * TDIM + j <= R
+            @inbounds tile1[i, j] = input1[I, t * TDIM + j]
        else
            @inbounds tile1[i, j] = 0.0
        end
        if t * TILE_DIM + i <= R && J <= M
-            @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
+            @inbounds tile2[i, j] = input2[t * TDIM + i, J]
        else
            @inbounds tile2[i, j] = 0.0
        end
 
        # wait for all tiles to be loaded
-        @synchronize
+        KI.barrier()
 
        # get global values again
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
        # calculate value of spot in output, use temporary value to allow for vectorization
        out = zero(eltype(output))
-        @simd for k in 1:TILE_DIM
+        @simd for k in 1:TDIM
            @inbounds out += tile1[i, k] * tile2[k, j]
        end
-        outval[1] += out
+        outval += out
 
-        @synchronize
+        KI.barrier()
    end
 
    # get global indices again
-    I = (gi - 1) * TILE_DIM + i
-    J = (gj - 1) * TILE_DIM + j
+    I = (gi - 1) * TDIM + i
+    J = (gj - 1) * TDIM + j
 
    # save if inbounds
    if I <= N && J <= M
-        @inbounds output[I, J] = outval[1]
+        @inbounds output[I, J] = outval
    end
+    return
 end
 
 N = 1024
@@ -82,9 +83,10 @@ A = rand!(allocate(backend, Float32, N, R))
 B = rand!(allocate(backend, Float32, R, M))
 C = KernelAbstractions.zeros(backend, Float32, N, M)
 
-kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
+workgroupsize = (TILE_DIM, TILE_DIM)
+numworkgroups = (cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM))
 
-kern(C, A, B, N, R, M, ndrange = size(C))
+KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM))
 KernelAbstractions.synchronize(backend)
 
 @test isapprox(A * B, C)
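
Note: the tile size moves from @groupsize() to a Val argument because KI.localmemory takes its shape as a compile-time constant. A sketch of that pattern in isolation, assuming the KI API exactly as used above; transpose_tile_kernel! is a hypothetical name and the matrix extents are assumed to be multiples of TDIM:

import KernelAbstractions.KernelIntrinsics as KI

function transpose_tile_kernel!(out, inp, ::Val{TDIM}) where {TDIM}
    gi, gj, _ = KI.get_group_id()
    i, j, _ = KI.get_local_id()
    tile = KI.localmemory(eltype(out), (TDIM, TDIM))   # shape fixed at compile time via Val
    # stage one input tile in local memory
    @inbounds tile[i, j] = inp[(gi - 1) * TDIM + i, (gj - 1) * TDIM + j]
    KI.barrier()                                       # make all tile writes visible to the group
    # read tile[j, i] so the global store stays coalesced
    @inbounds out[(gj - 1) * TDIM + i, (gi - 1) * TDIM + j] = tile[j, i]
    return
end

# launch, as above: one workgroup per tile
# KI.@kernel backend workgroupsize = (TDIM, TDIM) numworkgroups = cld.(size(inp), TDIM) transpose_tile_kernel!(out, inp, Val(TDIM))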

src/KernelAbstractions.jl
Lines changed: 51 additions & 32 deletions
@@ -194,6 +194,15 @@ function unsafe_free! end
 
 unsafe_free!(::AbstractArray) = return
 
+"""
+Abstract type for all KernelAbstractions backends.
+"""
+abstract type Backend end
+
+include("intrinsics.jl")
+import .KernelIntrinsics as KI
+export KernelIntrinsics
+
 ###
 # Kernel language
 # - @localmem
@@ -360,6 +369,25 @@ macro context()
     return esc(:(__ctx__))
 end
 
+# Defined to keep cpu support for `__print`
+@generated function KI._print(items...)
+    str = ""
+    args = []
+
+    for i in 1:length(items)
+        item = :(items[$i])
+        T = items[i]
+        if T <: Val
+            item = QuoteNode(T.parameters[1])
+        end
+        push!(args, item)
+    end
+
+    return quote
+        print($(args...))
+    end
+end
+
 """
     @print(items...)
 
@@ -460,13 +488,27 @@ end
 # Internal kernel functions
 ###
 
-function __index_Local_Linear end
-function __index_Group_Linear end
-function __index_Global_Linear end
+@inline function __index_Local_Linear(ctx)
+    return KI.get_local_id().x
+end
+
+@inline function __index_Group_Linear(ctx)
+    return KI.get_group_id().x
+end
 
-function __index_Local_Cartesian end
-function __index_Group_Cartesian end
-function __index_Global_Cartesian end
+@inline function __index_Global_Linear(ctx)
+    return KI.get_global_id().x
+end
+
+@inline function __index_Local_Cartesian(ctx)
+    return @inbounds workitems(__iterspace(ctx))[KI.get_local_id().x]
+end
+@inline function __index_Group_Cartesian(ctx)
+    return @inbounds blocks(__iterspace(ctx))[KI.get_group_id().x]
+end
+@inline function __index_Global_Cartesian(ctx)
+    return @inbounds expand(__iterspace(ctx), KI.get_group_id().x, KI.get_local_id().x)
+end
 
 @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...))
 @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...))
@@ -482,11 +524,6 @@ constify(arg) = adapt(ConstAdaptor(), arg)
 # Backend hierarchy
 ###
 
-"""
-
-Abstract type for all KernelAbstractions backends.
-"""
-abstract type Backend end
 
 """
 Abstract type for all GPU based KernelAbstractions backends.
@@ -796,29 +833,11 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-function SharedMemory end
-
-function __synchronize()
-    error("@synchronize used outside kernel or not captured")
-end
-
-@generated function __print(items...)
-    str = ""
-    args = []
+SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims)
 
-    for i in 1:length(items)
-        item = :(items[$i])
-        T = items[i]
-        if T <: Val
-            item = QuoteNode(T.parameters[1])
-        end
-        push!(args, item)
-    end
+__synchronize() = KI.barrier()
 
-    return quote
-        print($(args...))
-    end
-end
+__print(args...) = KI._print(args...)
 
 # Utils
 __size(args::Tuple) = Tuple{args...}
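
Note: with the __index_* definitions above delegating to the intrinsics, @index is now a thin layer over KernelIntrinsics and existing @kernel code keeps working unchanged. A sketch of the correspondence (hypothetical kernels, not part of this commit): per the new __index_Global_Linear, @index(Global, Linear) yields KI.get_global_id().x, while in the intrinsics style the bounds check that @kernel's ndrange machinery used to supply becomes the kernel's own responsibility.

using KernelAbstractions
import KernelAbstractions.KernelIntrinsics as KI

# Classic style: launched as scale_ka!(backend)(a, s, ndrange = length(a));
# out-of-range work-items are masked by the ndrange machinery.
@kernel function scale_ka!(a, s)
    i = @index(Global, Linear)      # now implemented as KI.get_global_id().x
    @inbounds a[i] *= s
end

# Intrinsics style: explicit guard replaces the ndrange mask.
function scale_ki!(a, s)
    i = KI.get_global_id().x
    if i <= length(a)
        @inbounds a[i] *= s
    end
    return
end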
