From 04291e6bb07597ce10aeb7ace689c6027f9791f2 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Mon, 5 Jul 2021 23:09:18 -0400
Subject: [PATCH 1/5] add FLoops backend, take 1

---
 src/macro.jl    | 24 ++++++++++++++++++++++++
 test/group-2.jl | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/src/macro.jl b/src/macro.jl
index 0f0aa0b..d04004e 100644
--- a/src/macro.jl
+++ b/src/macro.jl
@@ -102,6 +102,7 @@ OPTS = Dict(
     :grad => [false, :Base, :Dual],
     :avx => Integer,
     :cuda => Integer,
+    :floops => [true, false],
     :tensor => [true, false],
 )
 
@@ -111,6 +112,7 @@ _THREADS = Ref{Any}(true)
 _GRAD = Ref{Any}(:Base)
 _AVX = Ref{Any}(true)
 _CUDA = Ref{Any}(true)
+_FLOOPS = Ref{Any}(false)
 
 function parse_options(exs...)
     opts = Dict{Symbol,Any}(
@@ -123,6 +125,7 @@
         :grad => _GRAD[],
         :avx => _AVX[],
         :cuda => _CUDA[],
+        :floops => _FLOOPS[],
         :tensor => false,
     )
     expr = nothing
@@ -178,6 +181,7 @@
         _GRAD[] = opts[:grad]
         _AVX[] = opts[:avx]
         _CUDA[] = opts[:cuda]
+        _FLOOPS[] = opts[:floops]
     end
     opts[:tensor] == false || @warn "option tensor=true is deprecated, try Tullio.@tensor"
     (redfun=opts[:redfun],
@@ -189,6 +193,7 @@
         grad=opts[:grad],
         avx=opts[:avx],
         cuda=opts[:cuda],
+        floops=opts[:floops],
         nograd=nograd,
     ), ranges, expr
 end
@@ -1059,6 +1064,25 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
         store.verbose>0 && @warn "can't parallelise this gradient, no shared indices $note"
     end
 
+    #===== FLoops =====#
+
+    if store.floops != false && isdefined(store.mod, :FLoops)
+        try
+            info0 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+            fex = quote
+                local @inline function $act!(::Type{<:AbstractArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                    $info0
+                    FLoops.@floop begin $ex1; $ex2 end
+                end
+            end
+            store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex)
+            push!(store.outpre, macroexpand(store.mod, fex))
+            store.verbose==2 && @info "success expanding FLoops.@floop"
+        catch err
+            store.verbose>0 && @warn "FLoops failed $note" err
+        end
+    end
+
     #===== LoopVectorization =====#
 
     expre, exloop0, expost = if isempty(outer)

diff --git a/test/group-2.jl b/test/group-2.jl
index 6381206..44792e7 100644
--- a/test/group-2.jl
+++ b/test/group-2.jl
@@ -4,7 +4,6 @@ t4 = time()
 
 using KernelAbstractions
 using Tracker
-
 GRAD = :Tracker
 _gradient(x...) = Tracker.gradient(x...)
 
@@ -40,3 +39,48 @@ end
 @info @sprintf("KernelAbstractions tests took %.1f seconds", time()-t4)
 
 @tullio cuda=false
+
+#===== FLoops =====#
+
+t5 = time()
+using FLoops
+@tullio floops=true
+
+using Tracker
+GRAD = :Tracker
+_gradient(x...) = Tracker.gradient(x...)
+
+@testset "FLoops + parsing + gradients" begin
+    A = (rand(3,4));
+    B = (rand(4,5));
+    @tullio C[i,k] := A[i,j] * B[j,k] threads=false verbose=1
+    @test C ≈ A * B
+
+    @tullio threads=false
+    include("parsing.jl")
+    include("gradients.jl")
+    @tullio threads=true
+
+    for sy in Tullio.SYMBOLS
+        @test !isdefined(@__MODULE__, sy)
+    end
+end
+
+# using CUDA
+
+# if is_buildkite
+#     # If we are on Buildkite, we should assert that we have a CUDA GPU available
+#     @test CUDA.has_cuda_gpu()
+# end
+
+# if CUDA.has_cuda_gpu()
+#     @info "===== found a GPU, starting CUDA tests ====="
+#     @testset "===== CUDA tests on GPU =====" begin
+#         include("cuda.jl")
+#     end
+# end
+
+@info @sprintf("FLoops tests took %.1f seconds", time()-t5)
+
+@tullio floops=false
+

From 573e25383196cf2448b2cbe397f7af6611059ff0 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Mon, 5 Jul 2021 23:20:10 -0400
Subject: [PATCH 2/5] tests

---
 Project.toml    | 6 +++++-
 test/group-2.jl | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index cd47a89..fdb7f3f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -15,6 +15,8 @@ CUDAKernels = "0.1, 0.2"
 ChainRulesCore = "0.10"
 DiffRules = "1"
 FillArrays = "0.11"
+FLoops = "0.1.10"
+FoldsCUDA = "0.1.5"
 ForwardDiff = "0.10"
 KernelAbstractions = "0.6"
 LoopVectorization = "0.12.48"
@@ -31,6 +33,8 @@ julia = "1.5"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
+FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
+FoldsCUDA = "6cd66ae4-5932-4b96-926d-e73e578e42cc"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -46,4 +50,4 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
+test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "FLoops", "FoldsCUDA", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]

diff --git a/test/group-2.jl b/test/group-2.jl
index 44792e7..1bbb17e 100644
--- a/test/group-2.jl
+++ b/test/group-2.jl
@@ -29,7 +29,7 @@ if is_buildkite
     @test CUDA.has_cuda_gpu()
 end
 
-if CUDA.has_cuda_gpu()
+if false # CUDA.has_cuda_gpu()
     @info "===== found a GPU, starting CUDA tests ====="
     @testset "===== CUDA tests on GPU =====" begin
         include("cuda.jl")

From d98f0bffaf70f32c584b8b9217821f3e8b17b209 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Mon, 5 Jul 2021 23:54:48 -0400
Subject: [PATCH 3/5] + cu

---
 README.md    |  4 +++-
 src/macro.jl | 31 +++++++++++++++++++++++--------
 test/cuda.jl |  6 +++++-
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 2178548..6854d5b 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,9 @@ But it also co-operates with various other packages, provided they are loaded be
 
 * It uses [`LoopVectorization.@avx`](https://github.com/chriselrod/LoopVectorization.jl) to speed many things up. (Disable with keyword `avx=false`.)
  On a good day this will match the speed of OpenBLAS for matrix multiplication.
 
-* It uses [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+* It can use [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+
+* It can also use [`FLoops.@floop`](https://github.com/JuliaFolds/FLoops.jl), in particular to execute on the GPU via [FoldsCUDA.jl](https://github.com/JuliaFolds/FoldsCUDA.jl). (Enable with `floops=true`.)
 
 The macro also tries to provide a gradient for use with [Tracker](https://github.com/FluxML/Tracker.jl) or (via [ChainRules](https://github.com/JuliaDiff/ChainRules.jl)) for [Zygote](https://github.com/FluxML/Zygote.jl), [Yota](https://github.com/dfdx/Yota.jl), etc. (Disable with `grad=false`, or `nograd=A`.) This is done in one of two ways:

diff --git a/src/macro.jl b/src/macro.jl
index d04004e..60765aa 100644
--- a/src/macro.jl
+++ b/src/macro.jl
@@ -1068,19 +1068,36 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
 
     if store.floops != false && isdefined(store.mod, :FLoops)
         try
-            info0 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
-            fex = quote
+            info1 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+            fex1 = quote
+
                 local @inline function $act!(::Type{<:AbstractArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
-                    $info0
+                    $info1
                     FLoops.@floop begin $ex1; $ex2 end
                 end
+
+            end
+            store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex1)
+            if store.threads==false
+                # same dodgy switch as for KernelAbstractions, threads=false routes CPU calculation here:
+                push!(store.outpre, macroexpand(store.mod, fex1))
+            end
+            if isdefined(store.mod, :FoldsCUDA) && isdefined(store.mod, :CUDA)
+                info2 = store.verbose>0 ? :(@info "running FLoops + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+                fex2 = quote
+
+                    local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                        $info2
+                        FLoops.@floop FoldsCUDA.CUDAEx() begin $ex1; $ex2 end
+                    end
+
+                end
+                push!(store.outpre, macroexpand(store.mod, fex2))
             end
-            store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex)
-            push!(store.outpre, macroexpand(store.mod, fex))
             store.verbose==2 && @info "success expanding FLoops.@floop"
         catch err
             store.verbose>0 && @warn "FLoops failed $note" err
         end
     end
 
     #===== LoopVectorization =====#
 
     expre, exloop0, expost = if isempty(outer)
@@ -1183,11 +1200,11 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
         end
         store.verbose==2 && @info "=====KA===== KernelAbstractions kernel $note" verbosetidy(kex1)
         push!(store.outpre, macroexpand(store.mod, kex1))
-        if isdefined(store.mod, :CUDA) && isdefined(store.mod, :CuArray) # new-style, CUDA.jl, with CUDADevice()
+        if isdefined(store.mod, :CUDA)
             info2 = store.verbose>0 ? :(@info "running KernelAbstractions + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
             kex2 = quote
-                local @inline function $act!(::Type{<:CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
                     $info2
                     cu_kern! = $kernel(CUDADevice())
                     $(asserts...)
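The CuArray method above hands the whole loop to FoldsCUDA's executor. For context, outside the macro that pattern looks roughly like this — a sketch, assuming a machine with a CUDA GPU and the `CUDAEx` executor documented in FoldsCUDA's README:

    using CUDA, FLoops, FoldsCUDA   # CUDAEx() is exported by FoldsCUDA (assumed from its README)

    xs = CUDA.rand(Float32, 10^6)
    @floop CUDAEx() for x in xs     # run the loop on the GPU
        @reduce(s = 0.0f0 + x)      # parallel sum: init 0.0f0, operator +
    end
    s ≈ sum(xs)                     # true, up to floating-point reordering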
diff --git a/test/cuda.jl b/test/cuda.jl
index 2c1624a..0413a3b 100644
--- a/test/cuda.jl
+++ b/test/cuda.jl
@@ -1,7 +1,11 @@
 using Tullio, Test
-using CUDA, CUDAKernels, KernelAbstractions
+using CUDA
 CUDA.allowscalar(false)
+
+# using CUDAKernels, KernelAbstractions
+# using FoldsCUDA, FLoops
+
 using Tracker, ForwardDiff
 @tullio grad=Base

From eb192890323f67154fa538973424d67aac928e0c Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Tue, 6 Jul 2021 00:21:48 -0400
Subject: [PATCH 4/5] really +cu

---
 test/group-2.jl | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/test/group-2.jl b/test/group-2.jl
index 1bbb17e..3ddaca3 100644
--- a/test/group-2.jl
+++ b/test/group-2.jl
@@ -22,7 +22,7 @@ _gradient(x...) = Tracker.gradient(x...)
     end
 end
 
-using CUDA
+using CUDA, CUDAKernels
 
 if is_buildkite
     # If we are on Buildkite, we should assert that we have a CUDA GPU available
@@ -31,9 +31,11 @@ end
 
 if false # CUDA.has_cuda_gpu()
     @info "===== found a GPU, starting CUDA tests ====="
-    @testset "===== CUDA tests on GPU =====" begin
+    @testset "===== KernelAbstractions CUDA tests on GPU =====" begin
         include("cuda.jl")
     end
+else
+    @info "===== skipping KernelAbstractions + CUDA tests ====="
 end
 
 @info @sprintf("KernelAbstractions tests took %.1f seconds", time()-t4)
@@ -66,19 +68,14 @@ _gradient(x...) = Tracker.gradient(x...)
     end
 end
 
-# using CUDA
+using CUDA, FoldsCUDA
 
-# if is_buildkite
-#     # If we are on Buildkite, we should assert that we have a CUDA GPU available
-#     @test CUDA.has_cuda_gpu()
-# end
-
-# if CUDA.has_cuda_gpu()
-#     @info "===== found a GPU, starting CUDA tests ====="
-#     @testset "===== CUDA tests on GPU =====" begin
-#         include("cuda.jl")
-#     end
-# end
+if CUDA.has_cuda_gpu()
+    @info "===== found a GPU, starting CUDA tests ====="
+    @testset "===== FLoops + FoldsCUDA tests on GPU =====" begin
+        include("cuda.jl")
+    end
+end
 
 @info @sprintf("FLoops tests took %.1f seconds", time()-t5)

From b16244f096a0109790968a4a95dd742307152c87 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Tue, 6 Jul 2021 08:40:26 -0400
Subject: [PATCH 5/5] versions

---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index fdb7f3f..7fd9f0d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,14 +11,14 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]
 CUDA = "2, 3"
-CUDAKernels = "0.1, 0.2"
+CUDAKernels = "0.1, 0.2, 0.3"
 ChainRulesCore = "0.10"
 DiffRules = "1"
 FillArrays = "0.11"
 FLoops = "0.1.10"
 FoldsCUDA = "0.1.5"
 ForwardDiff = "0.10"
-KernelAbstractions = "0.6"
+KernelAbstractions = "0.6, 0.7"
 LoopVectorization = "0.12.48"
 NamedDims = "0.2"
 OffsetArrays = "1"
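Putting the series together, a usage sketch — not part of the patches themselves; the option names come from the diffs above, and the `floops=true` plus `threads=false` combination follows the "dodgy switch" comment in patch 3:

    using Tullio, FLoops            # FLoops must be loaded when @tullio expands

    A, B = rand(3,4), rand(4,5)

    # floops=true opts in to the new actor (the _FLOOPS default is false);
    # threads=false then routes the CPU calculation through FLoops.@floop:
    @tullio C[i,k] := A[i,j] * B[j,k]  floops=true threads=false
    C ≈ A * B                       # true

On a machine with a GPU, loading CUDA and FoldsCUDA as well should let the same `@tullio` call dispatch CuArrays to the FoldsCUDA actor added in patch 3.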