Merge pull request #9 from thomvet/Support-nested-duals

ChrisRackauckas · web-flow · commit 389e261b3a14 · 2021-11-03T12:59:40.000-04:00
Support nested duals
diff --git a/Project.toml b/Project.toml
@@ -19,10 +19,12 @@ julia = "1.6"
 [extras]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Optim = "429524aa-4258-5aef-a3af-852621145aeb"
+GalacticOptim = "a75be94c-b780-496d-a8a9-0878b188d577"
 
 [targets]
-test = ["LinearAlgebra", "OrdinaryDiffEq", "Test", "RecursiveArrayTools", "Pkg", "SafeTestsets"]
+test = ["LinearAlgebra", "OrdinaryDiffEq", "Test", "RecursiveArrayTools", "Pkg", "SafeTestsets", "GalacticOptim", "Optim"]
diff --git a/src/PreallocationTools.jl b/src/PreallocationTools.jl
@@ -7,21 +7,24 @@ struct DiffCache{T<:AbstractArray, S<:AbstractArray}
     dual_du::S
 end
 
-function DiffCache(u::AbstractArray{T}, siz, chunk_size) where {T}
-    x = adapt(ArrayInterface.parameterless_type(u), zeros(T,(chunk_size+1)*prod(siz)))
+function DiffCache(u::AbstractArray{T}, siz, chunk_sizes) where {T}
+    x = adapt(ArrayInterface.parameterless_type(u), zeros(T, prod(chunk_sizes .+ 1)*prod(siz)))
     DiffCache(u, x)
 end
 
 """
 
-`dualcache(u::AbstractArray, N = default_cache_size(length(u)))`
+`dualcache(u::AbstractArray, N::Int = ForwardDiff.pickchunksize(length(u)); levels::Int = 1)`
+`dualcache(u::AbstractArray, N::AbstractArray{<:Int})`
 
 Builds a `DualCache` object that stores both a version of the cache for `u`
 and for the `Dual` version of `u`, allowing use of pre-cached vectors with
-forward-mode automatic differentiation.
+forward-mode automatic differentiation. Supports nested AD via the keyword `levels`
+or by passing an array of chunk sizes, one per differentiation level.
 
 """
-dualcache(u::AbstractArray, N=ForwardDiff.pickchunksize(length(u))) = DiffCache(u, size(u), N)
+dualcache(u::AbstractArray, N::Int=ForwardDiff.pickchunksize(length(u)); levels::Int = 1) = DiffCache(u, size(u), N*ones(Int, levels))
+dualcache(u::AbstractArray, N::AbstractArray{<:Int}) = DiffCache(u, size(u), N)
 
 """
 
diff --git a/test/core_nesteddual.jl b/test/core_nesteddual.jl
@@ -0,0 +1,104 @@
+using LinearAlgebra, OrdinaryDiffEq, Test, PreallocationTools, ForwardDiff, GalacticOptim, Optim
+
+randmat = rand(5, 3)
+sto = similar(randmat)
+function claytonsample!(sto, τ, α; randmat=randmat)
+    sto = get_tmp(sto, τ)
+    sto .= randmat
+    τ == 0 && return sto
+
+    n = size(sto, 1)
+    for i in 1:n
+        v = sto[i, 2]
+        u = sto[i, 1]
+        sto[i, 1] = (1 - u^(-τ) + u^(-τ)*v^(-(τ/(1 + τ))))^(-1/τ)*α
+        sto[i, 2] = (1 - u^(-τ) + u^(-τ)*v^(-(τ/(1 + τ))))^(-1/τ)
+    end
+    return sto
+end
+
+#= Taking the second derivative of claytonsample! with respect to τ with manually specified chunk sizes.
+When setting up the dualcache, we set the chunk sizes to [1, 1] (one entry per differentiation level),
+because each level differentiates only with respect to τ. This allocates the minimum memory needed. =#
+stod = dualcache(sto, [1, 1]) 
+df3 = ForwardDiff.derivative(τ -> ForwardDiff.derivative(ξ -> claytonsample!(stod, ξ, 0.0), τ), 0.3)
+
+#= Taking the second derivative of claytonsample! with respect to τ with auto-detected chunk size.
+For the given size of sto, ForwardDiff's heuristic chooses chunk_size = 8, so the cache holds 8+1 = 9 values
+per element. Since this exceeds the (1+1)*(1+1) = 4 values needed, the auto-allocated cache is big enough to
+handle the nested dual numbers even without the keyword argument levels = 2. Do not rely on this in general,
+especially if more levels of nesting occur (see the optimization example below). =#
+stod = dualcache(sto) 
+df4 = ForwardDiff.derivative(τ -> ForwardDiff.derivative(ξ -> claytonsample!(stod, ξ, 0.0), τ), 0.3)
+
+@test df3 ≈ df4
+
+#= Taking the second derivative of claytonsample! with respect to τ with auto-detected chunk size.
+For the given size of sto, ForwardDiff's heuristic chooses chunk_size = 8; with the keyword argument
+levels = 2, the created cache is larger than needed (even more so than in the previous example). =#
+stod = dualcache(sto, levels = 2) 
+df5 = ForwardDiff.derivative(τ -> ForwardDiff.derivative(ξ -> claytonsample!(stod, ξ, 0.0), τ), 0.3)
+
+@test df3 ≈ df5
+
+#= Checking nested dual numbers using an optimization problem solved with Optim.jl's Newton(), which requires
+Hessians; we therefore need one level of AD for the ODE solver (TRBDF2) and two more to calculate the Hessian. =#
+function foo(du, u, p, t)
+    tmp = p[2]
+    A = reshape(p[1], size(tmp.du))
+    tmp = get_tmp(tmp, u)
+    mul!(tmp, A, u)
+    @. du = u + tmp
+    nothing
+end
+
+ps = 3 #use to specify problem size; don't go crazy on this, because of the compilation time...
+coeffs = -rand(ps,ps)
+cache = dualcache(zeros(ps,ps), levels = 3)
+prob = ODEProblem(foo, ones(ps, ps), (0., 1.0), (coeffs, cache))
+realsol = solve(prob, TRBDF2(), saveat = 0.0:0.1:10.0, reltol = 1e-8)
+u0 = rand(length(coeffs))
+
+function objfun(x, prob, realsol, cache)
+    prob = remake(prob, u0 = eltype(x).(prob.u0), p = (x, cache))
+    sol = solve(prob, TRBDF2(), saveat = 0.0:0.1:10.0, reltol = 1e-8)
+
+    ofv = 0.0
+    if any((s.retcode != :Success for s in sol))
+      ofv = 1e12
+    else
+      ofv = sum((sol.-realsol).^2)
+    end    
+    return ofv
+end
+fn(x,p) = objfun(x, p[1], p[2], p[3])
+optfun = OptimizationFunction(fn, GalacticOptim.AutoForwardDiff())
+optprob = OptimizationProblem(optfun, -rand(length(coeffs)), (prob, realsol, cache), chunk_size = 2)
+newtonsol = solve(optprob, Newton())
+
+@test all(abs.(coeffs[:] .- newtonsol.u) .< 1e-2)
+
+#an example where chunk_sizes are not the same on all differentiation levels:
+cache = dualcache(zeros(ps,ps), [9, 9, 2])
+realsol = solve(prob, TRBDF2(chunk_size = 2), saveat = 0.0:0.1:10.0, reltol = 1e-8)
+
+function objfun(x, prob, realsol, cache)
+    prob = remake(prob, u0 = eltype(x).(prob.u0), p = (x, cache))
+    sol = solve(prob, TRBDF2(chunk_size = 2), saveat = 0.0:0.1:10.0, reltol = 1e-8)
+
+    ofv = 0.0
+    if any((s.retcode != :Success for s in sol))
+      ofv = 1e12
+    else
+      ofv = sum((sol.-realsol).^2)
+    end    
+    return ofv
+end
+
+fn(x,p) = objfun(x, p[1], p[2], p[3])
+
+optfun = OptimizationFunction(fn, GalacticOptim.AutoForwardDiff())
+optprob = OptimizationProblem(optfun, -rand(length(coeffs)), (prob, realsol, cache))
+newtonsol2 = solve(optprob, Newton())
+
+@test all(abs.(coeffs[:] .- newtonsol2.u) .< 1e-2)
diff --git a/test/gpu_all.jl b/test/gpu_all.jl
@@ -1,14 +1,16 @@
-using LinearAlgebra, OrdinaryDiffEq, Test, PreallocationTools, CUDA
+using LinearAlgebra, OrdinaryDiffEq, Test, PreallocationTools, CUDA, ForwardDiff, ArrayInterface
 
 #Dispatch tests
+chunk_size = 5
 u0_CU = cu(ones(5,5))
-dual_CU = cu(zeros(ForwardDiff.Dual{ForwardDiff.Tag{typeof(something), Float64}, Float64, chunk_size}, 2, 2))
+dual_CU = cu(zeros(ForwardDiff.Dual{ForwardDiff.Tag{typeof(something), Float32}, Float32, chunk_size}, 2, 2))
+dual_N = ForwardDiff.Dual{ForwardDiff.Tag{typeof(something), Float32}, Float32, 5}(0)
 cache_CU = dualcache(u0_CU, chunk_size)
 tmp_du_CUA = get_tmp(cache_CU, u0_CU)
 tmp_dual_du_CUA = get_tmp(cache_CU, dual_CU)
-tmp_du_CUN = get_tmp(cache_CU, u0_CU[1])
-tmp_dual_du_CUN = get_tmp(cache_CU, dual_CU[1])
-@test typeof(cache_CU.dual_du) == typeof(u0_CU) #check that dual cache array is a GPU array for performance reasons.
+tmp_du_CUN = get_tmp(cache_CU, 0.0)
+tmp_dual_du_CUN = get_tmp(cache_CU, dual_N)
+@test ArrayInterface.parameterless_type(typeof(cache_CU.dual_du)) == ArrayInterface.parameterless_type(typeof(u0_CU)) #check that dual cache array is a GPU array for performance reasons.
 @test size(tmp_du_CUA) == size(u0_CU)                
 @test typeof(tmp_du_CUA) == typeof(u0_CU)
 @test eltype(tmp_du_CUA) == eltype(u0_CU)
@@ -33,37 +35,34 @@ end
 chunk_size = 10
 u0 = cu(rand(10,10)) #example kept small for test purposes.
 A  = cu(-randn(10,10))                  
-cache = dualcache(A, chunk_size)
-prob = ODEProblem(foo, u0, (0.0f0,1.0f0), (A, cache))
+cache = dualcache(cu(zeros(10, 10)), chunk_size)
+prob = ODEProblem(foo, u0, (0.0f0, 1.0f0), (A, cache))
 sol = solve(prob, TRBDF2(chunk_size = chunk_size))
 @test sol.retcode == :Success
 
 #with auto-detected chunk_size
 u0 = cu(rand(10,10)) #example kept small for test purposes.
 A  = cu(-randn(10,10))                  
-cache = dualcache(A)
+cache = dualcache(cu(zeros(10, 10)))
 prob = ODEProblem(foo, u0, (0.0f0,1.0f0), (A, cache))
 sol = solve(prob, TRBDF2())
 @test sol.retcode == :Success
 
+#resizing tests
 randmat = cu(rand(5, 3))
 sto = similar(randmat)
 stod = dualcache(sto)
 function claytonsample!(sto, τ, α; randmat=randmat)
     sto = get_tmp(sto, τ)
-        sto .= randmat
+    sto .= randmat
     τ == 0 && return sto
-    n = size(sto, 1)
-    for i in 1:n
-        v = sto[i, 2]
-        u = sto[i, 1]
-        sto[i, 1] = (1 - u^(-τ) + u^(-τ)*v^(-(τ/(1 + τ))))^(-1/τ)*α
-        sto[i, 2] = (1 - u^(-τ) + u^(-τ)*v^(-(τ/(1 + τ))))^(-1/τ)
-    end
+    v = @view sto[:, 2]
+    u = @view sto[:, 1]
+    @. v = (1 - u^(-τ) + u^(-τ)*v^(-(τ/(1 + τ))))^(-1/τ)*α
+    @. u = (1 - u^(-τ) + u^(-τ)*v^(-(τ/(1 + τ))))^(-1/τ)
     return sto
 end
 
-#resizing tests
 #taking the derivative of claytonsample! with respect to τ only
 df1 = ForwardDiff.derivative(τ -> claytonsample!(stod, τ, 0.0), 0.3)
 @test size(randmat) == size(df1)
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -13,7 +13,8 @@ end
 if GROUP == "All" || GROUP == "Core"
     @safetestset "Dispatch" begin include("core_dispatch.jl") end
     @safetestset "ODE tests" begin include("core_odes.jl") end
-    @safetestset "Base Array Resizing" begin include("core_resizing.jl") end
+    @safetestset "Resizing" begin include("core_resizing.jl") end
+    @safetestset "Nested Duals" begin include("core_nesteddual.jl") end
 end
 
 if !is_APPVEYOR && GROUP == "GPU"