Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ env:
# SECRET_CODECOV_TOKEN: "..."

steps:
- label: "Julia 1.8"
- label: "Julia 1.11"
plugins:
- JuliaCI/julia#v0.5:
version: "1.8"
version: "1.11"
- JuliaCI/julia-test#v0.3: ~
# - JuliaCI/julia-coverage#v0.3:
# codecov: true
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ jobs:
- '1' # automatically expands to the latest stable 1.x release of Julia
# - 'nightly'
steps:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@v1
- uses: actions/checkout@v5
- uses: julia-actions/setup-julia@v2
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v1
- uses: actions/cache@v4
env:
cache-name: cache-artifacts
with:
Expand Down
11 changes: 7 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"

[extensions]
TullioCUDAExt = "CUDA"
TullioMetalExt = "Metal"
TullioChainRulesCoreExt = "ChainRulesCore"
TullioFillArraysExt = "FillArrays"
TullioTrackerExt = "Tracker"
TullioChainRulesCoreExt = "ChainRulesCore"

[compat]
CUDA = "4, 5"
Expand All @@ -29,6 +31,7 @@ FillArrays = "0.11, 0.12, 0.13, 1"
ForwardDiff = "0.10, 1.0"
KernelAbstractions = "0.9"
LoopVectorization = "0.12.101"
Metal = "1.8.1"
NamedDims = "0.2, 1"
OffsetArrays = "1"
Requires = "1"
Expand All @@ -39,8 +42,8 @@ Zygote = "0.6.33, 0.7"
julia = "1.10" # note that this is the minimum Julia version, 1.11 & 1.12 etc. are allowed & should work.

[extras]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Expand All @@ -58,4 +61,4 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[targets]
test = ["Test", "CUDA", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Pkg", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
test = ["Test", "CUDA", "Metal", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Pkg", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
4 changes: 4 additions & 0 deletions ext/TullioCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ else
using Tullio, CUDA
end

# Warn when a second GPU backend (Metal) appears to be present next to CUDA.
# NOTE(review): `@__MODULE__` is the module this file is evaluated in, and the
# only imports visible above are Tullio and CUDA. Unless one of them exports
# `Metal` and `MtlArray`, this condition is presumably never true and the
# warning never fires — confirm, and consider checking the loaded packages
# instead.
if isdefined(@__MODULE__, :Metal) && isdefined(@__MODULE__, :MtlArray)
@warn "Loading multiple GPU backends can lead to unexpected bugs"
end

Tullio.threader(fun!::F, ::Type{T},
Z::AbstractArray, As::Tuple, Is::Tuple, Js::Tuple,
redfun, block=0, keep=nothing) where {F<:Function, T<:CUDA.CuArray} =
Expand Down
21 changes: 21 additions & 0 deletions ext/TullioMetalExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Extension glue between Tullio and Metal: adds `Tullio.threader` and
# `Tullio.∇threader` methods for `Metal.MtlArray` which call the generated
# function directly.
module TullioMetalExt

# Without `Base.get_extension` this file is loaded from inside Tullio, so the
# packages are reached through relative imports; otherwise it is loaded as a
# regular package extension.
if !isdefined(Base, :get_extension)
    using ..Tullio, ..Metal
else
    using Tullio, Metal
end

# Warn when CUDA is already loaded alongside Metal.
#
# The check lives in `__init__` so that it runs when the extension is loaded
# rather than when it is precompiled. It looks at the set of loaded packages:
# testing `isdefined(@__MODULE__, :CUDA)` could never succeed, because this
# module only imports Tullio and Metal.
function __init__()
    if any(pkg -> pkg.name == "CUDA", keys(Base.loaded_modules))
        @warn "Loading multiple GPU backends can lead to unexpected bugs"
    end
    return nothing
end

# Forward pass for Metal arrays: call `fun!` once with the output `Z`, the
# inputs `As`, both index-range tuples and `keep`. `redfun` and `block` are
# accepted for signature compatibility but not used here.
Tullio.threader(fun!::F, ::Type{T},
    Z::AbstractArray, As::Tuple, Is::Tuple, Js::Tuple,
    redfun, block=0, keep=nothing) where {F<:Function, T<:Metal.MtlArray} =
    fun!(T, Z, As..., Is..., Js..., keep)

# Gradient pass for Metal arrays: same direct call, without an output array
# or `keep`. `block` is accepted but not used here.
Tullio.∇threader(fun!::F, ::Type{T},
    As::Tuple, Is::Tuple, Js::Tuple, block=0) where {F<:Function, T<:Metal.MtlArray} =
    fun!(T, As..., Is..., Js...,)

end
14 changes: 14 additions & 0 deletions src/macro.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1175,6 +1175,20 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
end
store.verbose==2 && @info "=====KA===== KernelAbstractions CUDA actor $note" verbosetidy(kex2)
push!(store.outpre, kex2)
elseif isdefined(store.mod, :Metal) && isdefined(store.mod, :MtlArray)
info2 = store.verbose>0 ? :(@info "running KernelAbstractions + Metal actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
kex2 = quote

local @inline function $act!(::Type{<:MtlArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
$info2
mtl_kern! = $kernel(MetalBackend())
$(asserts...)
$ACC = mtl_kern!($(args...), $KEEP, $FINAL; ndrange=tuple($(sizes...)), workgroupsize=$workgroupsize)
end

end
store.verbose==2 && @info "=====KA===== KernelAbstractions Metal actor $note" verbosetidy(kex2)
push!(store.outpre, kex2)
end
info3 = store.verbose>0 ? :(@info "running KernelAbstractions CPU actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
kex3 = quote
Expand Down
14 changes: 13 additions & 1 deletion test/group-2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ _gradient(x...) = Tracker.gradient(x...)
@testset "KernelAbstractions + gradients" begin
A = (rand(3,4));
B = (rand(4,5));
@tullio C[i,k] := A[i,j] * B[j,k] threads=false # verbose=2
@tullio C[i,k] := A[i,j] * B[j,k] threads=false # verbose=2
@test C ≈ A * B

@tullio threads=false # else KernelAbstractions CPU kernels not used
Expand All @@ -23,6 +23,16 @@ _gradient(x...) = Tracker.gradient(x...)
end
end

# Load Metal before CUDA to avoid Kernel generation problems.
#
# Only the availability probe is guarded. If Metal cannot be loaded or
# queried, the reason is logged and the Metal tests are skipped. The test set
# runs outside the `try`, so an exception escaping it is no longer swallowed
# by an empty `catch`.
metal_ok = try
    using Metal
    Metal.versioninfo()
    true
catch err
    @info "Metal is not usable here, skipping Metal tests" exception=(err, catch_backtrace())
    false
end

if metal_ok
    @info "===== found an Apple GPU, starting Metal tests ====="
    @testset "===== Metal tests on GPU =====" begin
        include("metal.jl")
    end
end

using CUDA

if is_buildkite
Expand All @@ -37,6 +47,8 @@ if CUDA.has_cuda_gpu()
end
end



@info @sprintf("KernelAbstractions tests took %.1f seconds", time()-t4)

@tullio cuda=false
54 changes: 54 additions & 0 deletions test/metal.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Metal test script: each section evaluates a @tullio expression on plain
# arrays and on their `mtl(...)` counterparts and checks that the results
# agree.
using Tullio, Test
using Metal, KernelAbstractions
using Tracker, ForwardDiff
# Option-only call, no expression. NOTE(review): presumably selects the
# `grad=Base` gradient mode for the @tullio calls below — confirm against the
# Tullio documentation.
@tullio grad=Base

# Matrix multiplication written as a @tullio contraction over j.
mul(A, B) = @tullio C[i,k] := A[i,j] * B[j,k]

A = rand(3,40); B = rand(40,500);
@test A * B ≈ mul(A, B)
@test mtl(A * B) ≈ mul(mtl(A), mtl(B))

# Gradient of sum(mul(A, B)) with respect to A; expected value is ones(3,500) * B'.
# FIXME: original author's note says this may be broken — confirm which of
# the assertions below fails, if any.
ΔA = Tracker.gradient((A,B) -> sum(mul(A, B)), A, B)[1]
@test ΔA ≈ ones(3,500) * B'
@test mtl(ΔA) ≈ Tracker.gradient((A,B) -> sum(mul(A, B)), mtl(A), mtl(B))[1]

# Shifted index: A has 40 columns and k runs over 0:10, so j is expected to
# cover 1:30 (asserted below).
@tullio D[i,j] := A[i,j+k] k in 0:10
@test axes(D) == (1:3, 1:30)
@tullio D_dev[i,j] := mtl(A)[i,j+k] k in 0:10
@test D_dev isa MtlArray
@test D_dev ≈ mtl(D)

# Reduction with (*) over i, compared with prod along dims=1.
@tullio (*) F[j] := A[i,j]
@test F ≈ vec(prod(A, dims=1))
@tullio (*) F_dev[j] := mtl(A)[i,j]
@test F_dev ≈ mtl(F)

# Reduction with (max) over i, compared with maximum along dims=1.
g(A) = @tullio (max) G[j] := A[i,j]
@test g(A) == vec(maximum(A, dims=1))
# A0 holds 1 at the position of each column's maximum and 0 elsewhere; it is
# the expected gradient of sum∘g.
A0 = zero(A);
A0[findmax(A, dims=1)[2]] .= 1
@test A0 ≈ Tracker.gradient(sum∘g, A)[1]
@test g(mtl(A)) isa MtlArray
@test g(mtl(A)) ≈ mtl(g(A))
@test mtl(A0) ≈ Tracker.gradient(sum∘g, mtl(A))[1]

# Scalar functions (exp, log) inside the expression, plus their gradient.
h(A) = @tullio H[j] := exp(A[i,j]) / log(A[i,j])
@test h(mtl(A)) isa MtlArray
@test h(mtl(A)) ≈ mtl(h(A))
A1 = Tracker.gradient(sum∘h, A)[1]
@test mtl(A1) ≈ Tracker.gradient(sum∘h, mtl(A))[1]

# In-place form (`=`) writing into existing arrays created with Metal.rand.
# Note that A, B and C are rebound here. The second @tullio call reads B
# through a struct field (B2.B) and must give the same result as the first.
A, B, C = Metal.rand(2, 2, 2), Metal.rand(2, 2), Metal.rand(2, 2, 2);
@tullio A[k,i,a] = tanh(B[i,a] + C[k,i,a])
A2 = similar(A)
struct Bee{T}; B::T; end
B2 = Bee(B)
@test A ≈ @tullio A2[k,i,a] = tanh(B2.B[i,a] + C[k,i,a])