From 04291e6bb07597ce10aeb7ace689c6027f9791f2 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Mon, 5 Jul 2021 23:09:18 -0400
Subject: [PATCH 1/5] add FLoops backend, take 1

---
 src/macro.jl    | 24 ++++++++++++++++++++++++
 test/group-2.jl | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/src/macro.jl b/src/macro.jl
index 0f0aa0b..d04004e 100644
--- a/src/macro.jl
+++ b/src/macro.jl
@@ -102,6 +102,7 @@ OPTS = Dict(
     :grad => [false, :Base, :Dual],
     :avx => Integer,
     :cuda => Integer,
+    :floops => [true, false],
     :tensor => [true, false],
 )
 
@@ -111,6 +112,7 @@ _THREADS = Ref{Any}(true)
 _GRAD = Ref{Any}(:Base)
 _AVX = Ref{Any}(true)
 _CUDA = Ref{Any}(true)
+_FLOOPS = Ref{Any}(false)
 
 function parse_options(exs...)
     opts = Dict{Symbol,Any}(
@@ -123,6 +125,7 @@
         :grad => _GRAD[],
         :avx => _AVX[],
         :cuda => _CUDA[],
+        :floops => _FLOOPS[],
         :tensor => false,
     )
     expr = nothing
@@ -178,6 +181,7 @@
         _GRAD[] = opts[:grad]
         _AVX[] = opts[:avx]
         _CUDA[] = opts[:cuda]
+        _FLOOPS[] = opts[:floops]
     end
     opts[:tensor] == false || @warn "option tensor=true is deprecated, try Tullio.@tensor"
     (redfun=opts[:redfun],
@@ -189,6 +193,7 @@
         grad=opts[:grad],
         avx=opts[:avx],
         cuda=opts[:cuda],
+        floops=opts[:floops],
         nograd=nograd,
     ), ranges, expr
 end
@@ -1059,6 +1064,25 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
         store.verbose>0 && @warn "can't parallelise this gradient, no shared indices $note"
     end
 
+    #===== FLoops =====#
+
+    if store.floops != false && isdefined(store.mod, :FLoops)
+        try
+            info0 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+            fex = quote
+                local @inline function $act!(::Type{<:AbstractArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                    $info0
+                    FLoops.@floop begin $ex1; $ex2 end
+                end
+            end
+            store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex)
+            push!(store.outpre, macroexpand(store.mod, fex))
+            store.verbose==2 && @info "success expanding FLoops.@floop"
+        catch err
+            store.verbose>0 && @warn "FLoops failed $note" err
+        end
+    end
+
     #===== LoopVectorization =====#
 
     expre, exloop0, expost = if isempty(outer)

diff --git a/test/group-2.jl b/test/group-2.jl
index 6381206..44792e7 100644
--- a/test/group-2.jl
+++ b/test/group-2.jl
@@ -4,7 +4,6 @@ t4 = time()
 
 using KernelAbstractions
 using Tracker
-
 GRAD = :Tracker
 _gradient(x...) = Tracker.gradient(x...)
 
@@ -40,3 +39,48 @@ end
 @info @sprintf("KernelAbstractions tests took %.1f seconds", time()-t4)
 
 @tullio cuda=false
+
+#===== FLoops =====#
+
+t5 = time()
+using FLoops
+@tullio floops=true
+
+using Tracker
+GRAD = :Tracker
+_gradient(x...) = Tracker.gradient(x...)
+
+@testset "FLoops + parsing + gradients" begin
+    A = (rand(3,4));
+    B = (rand(4,5));
+    @tullio C[i,k] := A[i,j] * B[j,k] threads=false verbose=1
+    @test C ≈ A * B
+
+    @tullio threads=false
+    include("parsing.jl")
+    include("gradients.jl")
+    @tullio threads=true
+
+    for sy in Tullio.SYMBOLS
+        @test !isdefined(@__MODULE__, sy)
+    end
+end
+
+# using CUDA
+
+# if is_buildkite
+#     # If we are on Buildkite, we should assert that we have a CUDA GPU available
+#     @test CUDA.has_cuda_gpu()
+# end
+
+# if CUDA.has_cuda_gpu()
+#     @info "===== found a GPU, starting CUDA tests ====="
+#     @testset "===== CUDA tests on GPU =====" begin
+#         include("cuda.jl")
+#     end
+# end
+
+@info @sprintf("FLoops tests took %.1f seconds", time()-t5)
+
+@tullio floops=false
+

From 573e25383196cf2448b2cbe397f7af6611059ff0 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Mon, 5 Jul 2021 23:20:10 -0400
Subject: [PATCH 2/5] tests

---
 Project.toml    | 6 +++++-
 test/group-2.jl | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index cd47a89..fdb7f3f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -15,6 +15,8 @@ CUDAKernels = "0.1, 0.2"
 ChainRulesCore = "0.10"
 DiffRules = "1"
 FillArrays = "0.11"
+FLoops = "0.1.10"
+FoldsCUDA = "0.1.5"
 ForwardDiff = "0.10"
 KernelAbstractions = "0.6"
 LoopVectorization = "0.12.48"
@@ -31,6 +33,8 @@ julia = "1.5"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
+FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
+FoldsCUDA = "6cd66ae4-5932-4b96-926d-e73e578e42cc"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -46,4 +50,4 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
+test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "FLoops", "FoldsCUDA", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]

diff --git a/test/group-2.jl b/test/group-2.jl
index 44792e7..1bbb17e 100644
--- a/test/group-2.jl
+++ b/test/group-2.jl
@@ -29,7 +29,7 @@ if is_buildkite
     @test CUDA.has_cuda_gpu()
 end
 
-if CUDA.has_cuda_gpu()
+if false # CUDA.has_cuda_gpu()
     @info "===== found a GPU, starting CUDA tests ====="
     @testset "===== CUDA tests on GPU =====" begin
         include("cuda.jl")

From d98f0bffaf70f32c584b8b9217821f3e8b17b209 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Mon, 5 Jul 2021 23:54:48 -0400
Subject: [PATCH 3/5] + cu

---
 README.md    |  4 +++-
 src/macro.jl | 31 +++++++++++++++++++++++--------
 test/cuda.jl |  6 +++++-
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 2178548..6854d5b 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,9 @@ But it also co-operates with various other packages, provided they are loaded be
 
 * It uses [`LoopVectorization.@avx`](https://github.com/chriselrod/LoopVectorization.jl) to speed many things up. (Disable with keyword `avx=false`.)
  On a good day this will match the speed of OpenBLAS for matrix multiplication.
 
-* It uses [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+* It can use [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+
+* It can also use [`FLoops.@floop`](https://github.com/JuliaFolds/FLoops.jl), in particular to execute on the GPU via [FoldsCUDA.jl](https://github.com/JuliaFolds/FoldsCUDA.jl). (Enable with `floops=true`.)
 
 The macro also tries to provide a gradient for use with [Tracker](https://github.com/FluxML/Tracker.jl) or (via [ChainRules](https://github.com/JuliaDiff/ChainRules.jl)) for [Zygote](https://github.com/FluxML/Zygote.jl), [Yota](https://github.com/dfdx/Yota.jl), etc. (Disable with `grad=false`, or `nograd=A`.) This is done in one of two ways:

diff --git a/src/macro.jl b/src/macro.jl
index d04004e..60765aa 100644
--- a/src/macro.jl
+++ b/src/macro.jl
@@ -1068,19 +1068,36 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
 
     if store.floops != false && isdefined(store.mod, :FLoops)
         try
-            info0 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
-            fex = quote
+            info1 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+            fex1 = quote
+
                 local @inline function $act!(::Type{<:AbstractArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
-                    $info0
+                    $info1
                     FLoops.@floop begin $ex1; $ex2 end
                 end
+
+            end
+            store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex1)
+            if store.threads==false
+                # same dodgy switch as for KernelAbstractions, threads=false routes CPU calculation here:
+                push!(store.outpre, macroexpand(store.mod, fex1))
+            end
+            if isdefined(store.mod, :FoldsCUDA) && isdefined(store.mod, :CUDA)
+                info2 = store.verbose>0 ? :(@info "running FLoops + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+                fex2 = quote
+
+                    local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                        $info2
+                        FLoops.@floop FoldsCUDA.CUDAEx() begin $ex1; $ex2 end
+                    end
+
+                end
+                push!(store.outpre, macroexpand(store.mod, fex2))
             end
-            store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex)
-            push!(store.outpre, macroexpand(store.mod, fex))
             store.verbose==2 && @info "success expanding FLoops.@floop"
         catch err
             store.verbose>0 && @warn "FLoops failed $note" err
         end
     end
 
     #===== LoopVectorization =====#
 
     expre, exloop0, expost = if isempty(outer)
@@ -1183,11 +1200,11 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
         end
         store.verbose==2 && @info "=====KA===== KernelAbstractions kernel $note" verbosetidy(kex1)
         push!(store.outpre, macroexpand(store.mod, kex1))
-        if isdefined(store.mod, :CUDA) && isdefined(store.mod, :CuArray) # new-style, CUDA.jl, with CUDADevice()
+        if isdefined(store.mod, :CUDA)
             info2 = store.verbose>0 ? :(@info "running KernelAbstractions + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
             kex2 = quote
-                local @inline function $act!(::Type{<:CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
                     $info2
                     cu_kern! = $kernel(CUDADevice())
                     $(asserts...)
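The CuArray method above hands the whole loop to FoldsCUDA's executor. For context, outside the macro that pattern looks roughly like this — a sketch, assuming a machine with a CUDA GPU and the `CUDAEx` executor documented in FoldsCUDA's README:

    using CUDA, FLoops, FoldsCUDA   # CUDAEx() is exported by FoldsCUDA (assumed from its README)

    xs = CUDA.rand(Float32, 10^6)
    @floop CUDAEx() for x in xs     # run the loop on the GPU
        @reduce(s = 0.0f0 + x)      # parallel sum: init 0.0f0, operator +
    end
    s ≈ sum(xs)                     # true, up to floating-point reordering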
diff --git a/test/cuda.jl b/test/cuda.jl
index 2c1624a..0413a3b 100644
--- a/test/cuda.jl
+++ b/test/cuda.jl
@@ -1,7 +1,11 @@
 using Tullio, Test
-using CUDA, CUDAKernels, KernelAbstractions
+using CUDA
 CUDA.allowscalar(false)
+
+# using CUDAKernels, KernelAbstractions
+# using FoldsCUDA, FLoops
+
 using Tracker, ForwardDiff
 @tullio grad=Base

From eb192890323f67154fa538973424d67aac928e0c Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Tue, 6 Jul 2021 00:21:48 -0400
Subject: [PATCH 4/5] really +cu

---
 test/group-2.jl | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/test/group-2.jl b/test/group-2.jl
index 1bbb17e..3ddaca3 100644
--- a/test/group-2.jl
+++ b/test/group-2.jl
@@ -22,7 +22,7 @@ _gradient(x...) = Tracker.gradient(x...)
     end
 end
 
-using CUDA
+using CUDA, CUDAKernels
 
 if is_buildkite
     # If we are on Buildkite, we should assert that we have a CUDA GPU available
@@ -31,9 +31,11 @@ end
 
 if false # CUDA.has_cuda_gpu()
     @info "===== found a GPU, starting CUDA tests ====="
-    @testset "===== CUDA tests on GPU =====" begin
+    @testset "===== KernelAbstractions CUDA tests on GPU =====" begin
         include("cuda.jl")
     end
+else
+    @info "===== skipping KernelAbstractions + CUDA tests ====="
 end
 
 @info @sprintf("KernelAbstractions tests took %.1f seconds", time()-t4)
@@ -66,19 +68,14 @@ _gradient(x...) = Tracker.gradient(x...)
     end
 end
 
-# using CUDA
+using CUDA, FoldsCUDA
 
-# if is_buildkite
-#     # If we are on Buildkite, we should assert that we have a CUDA GPU available
-#     @test CUDA.has_cuda_gpu()
-# end
-
-# if CUDA.has_cuda_gpu()
-#     @info "===== found a GPU, starting CUDA tests ====="
-#     @testset "===== CUDA tests on GPU =====" begin
-#         include("cuda.jl")
-#     end
-# end
+if CUDA.has_cuda_gpu()
+    @info "===== found a GPU, starting CUDA tests ====="
+    @testset "===== FLoops + FoldsCUDA tests on GPU =====" begin
+        include("cuda.jl")
+    end
+end
 
 @info @sprintf("FLoops tests took %.1f seconds", time()-t5)

From b16244f096a0109790968a4a95dd742307152c87 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Tue, 6 Jul 2021 08:40:26 -0400
Subject: [PATCH 5/5] versions

---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index fdb7f3f..7fd9f0d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,14 +11,14 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]
 CUDA = "2, 3"
-CUDAKernels = "0.1, 0.2"
+CUDAKernels = "0.1, 0.2, 0.3"
 ChainRulesCore = "0.10"
 DiffRules = "1"
 FillArrays = "0.11"
 FLoops = "0.1.10"
 FoldsCUDA = "0.1.5"
 ForwardDiff = "0.10"
-KernelAbstractions = "0.6"
+KernelAbstractions = "0.6, 0.7"
 LoopVectorization = "0.12.48"
 NamedDims = "0.2"
 OffsetArrays = "1"
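Putting the series together, a usage sketch — not part of the patches themselves; the option names come from the diffs above, and the `floops=true` plus `threads=false` combination follows the "dodgy switch" comment in patch 3:

    using Tullio, FLoops            # FLoops must be loaded when @tullio expands

    A, B = rand(3,4), rand(4,5)

    # floops=true opts in to the new actor (the _FLOOPS default is false);
    # threads=false then routes the CPU calculation through FLoops.@floop:
    @tullio C[i,k] := A[i,j] * B[j,k]  floops=true threads=false
    C ≈ A * B                       # true

On a machine with a GPU, loading CUDA and FoldsCUDA as well should let the same `@tullio` call dispatch CuArrays to the FoldsCUDA actor added in patch 3.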