Skip to content

Commit a424184

Browse files
committed
support for nested duals; tests included
1 parent 8b289b6 commit a424184

File tree

2 files changed

+35
-38
lines changed

2 files changed

+35
-38
lines changed

src/PreallocationTools.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ end
1919
2020
Builds a `DualCache` object that stores both a version of the cache for `u`
2121
and for the `Dual` version of `u`, allowing use of pre-cached vectors with
22-
forward-mode automatic differentiation. Supports nested AD.
22+
forward-mode automatic differentiation. Supports nested AD via keyword `levels`
23+
or specifying an array of chunk_sizes.
2324
2425
"""
2526
dualcache(u::AbstractArray, N::Int=ForwardDiff.pickchunksize(length(u)); levels::Int = 1) = DiffCache(u, size(u), N*ones(Int, levels))

test/core_nesteddual.jl

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,32 @@ function claytonsample!(sto, τ, α; randmat=randmat)
1717
return sto
1818
end
1919

20-
#= taking the second derivative of claytonsample! with respect to τ with manual chunk_sizes. In setting up the dualcache,
21-
we are setting chunk_size to [1, 1], because we differentiate only with respect to τ.
22-
This initializes the cache with the minimum memory needed. =#
20+
#= taking the second derivative of claytonsample! with respect to τ with manual chunk_sizes.
21+
In setting up the dualcache, we are setting chunk_size to [1, 1], because we differentiate
22+
only with respect to τ. This initializes the cache with the minimum memory needed. =#
2323
stod = dualcache(sto, [1, 1])
2424
df3 = ForwardDiff.derivative(τ -> ForwardDiff.derivative(ξ -> claytonsample!(stod, ξ, 0.0), τ), 0.3)
2525

26-
#= taking the second derivative of claytonsample! with respect to τ, auto-detect. For the given size of sto, ForwardDiff's heuristic
27-
chooses chunk_size = 8. Since this is greater than what's needed (1+1), the auto-allocated cache is big enough to handle the nested
28-
dual numbers. This should in general not be relied on to work, especially if more levels of nesting occurs (as below). =#
26+
#= taking the second derivative of claytonsample! with respect to τ with auto-detected chunk-size.
27+
For the given size of sto, ForwardDiff's heuristic chooses chunk_size = 8. Since this is greater
28+
than what's needed (1+1), the auto-allocated cache is big enough to handle the nested dual numbers, even
29+
if we don't specify the keyword argument levels = 2. This should in general not be relied on to work,
30+
especially if more levels of nesting occur (see optimization example below). =#
2931
stod = dualcache(sto)
3032
df4 = ForwardDiff.derivative(τ -> ForwardDiff.derivative(ξ -> claytonsample!(stod, ξ, 0.0), τ), 0.3)
3133

3234
@test df3 ≈ df4
3335

34-
## Checking nested dual numbers: Checking an optimization problem inspired by the above tests
35-
## (using Optim.jl's Newton() (involving Hessians) and BFGS() (involving gradients))
36+
#= taking the second derivative of claytonsample! with respect to τ with auto-detected chunk-size.
37+
For the given size of sto, ForwardDiff's heuristic chooses chunk_size = 8 and with keyword arg levels = 2,
38+
the created cache size is larger than what's needed (even more so than the last example). =#
39+
stod = dualcache(sto, levels = 2)
40+
df5 = ForwardDiff.derivative(τ -> ForwardDiff.derivative(ξ -> claytonsample!(stod, ξ, 0.0), τ), 0.3)
41+
42+
@test df3 ≈ df5
43+
44+
#= Checking nested dual numbers using optimization problem involving Optim.jl's Newton() (involving Hessians);
45+
so, we will need one level of AD for the ODE solver (TRBDF2) and two more to calculate the Hessian =#
3646
function foo(du, u, p, t)
3747
tmp = p[2]
3848
A = reshape(p[1], size(tmp.du))
@@ -42,15 +52,16 @@ function foo(du, u, p, t)
4252
nothing
4353
end
4454

45-
ps = 3 #use to specify problem size; don't go crazy on this, because compilation time...
46-
coeffs = -rand(ps^2)
47-
cache = dualcache(zeros(ps,ps), [9, 9, 9])
55+
ps = 3 #use to specify problem size; don't go crazy on this, because of the compilation time...
56+
coeffs = -rand(ps,ps)
57+
cache = dualcache(zeros(ps,ps), levels = 3)
4858
prob = ODEProblem(foo, ones(ps, ps), (0., 1.0), (coeffs, cache))
49-
realsol = solve(prob, TRBDF2(), saveat = 0.0:0.01:1.0, reltol = 1e-8)
59+
realsol = solve(prob, TRBDF2(), saveat = 0.0:0.1:10.0, reltol = 1e-8)
60+
u0 = rand(length(coeffs))
5061

5162
function objfun(x, prob, realsol, cache)
52-
prob = remake(prob, u0 = eltype(x).(ones(ps, ps)), p = (x, cache))
53-
sol = solve(prob, TRBDF2(), saveat = 0.0:0.01:1.0, reltol = 1e-8)
63+
prob = remake(prob, u0 = eltype(x).(prob.u0), p = (x, cache))
64+
sol = solve(prob, TRBDF2(), saveat = 0.0:0.1:10.0, reltol = 1e-8)
5465

5566
ofv = 0.0
5667
if any((s.retcode != :Success for s in sol))
@@ -60,35 +71,20 @@ function objfun(x, prob, realsol, cache)
6071
end
6172
return ofv
6273
end
63-
6474
fn(x,p) = objfun(x, p[1], p[2], p[3])
65-
6675
optfun = OptimizationFunction(fn, GalacticOptim.AutoForwardDiff())
67-
optprob = OptimizationProblem(optfun, rand(size(coeffs)...), (prob, realsol, cache))
76+
optprob = OptimizationProblem(optfun, -rand(length(coeffs)), (prob, realsol, cache), chunk_size = 2)
6877
newtonsol = solve(optprob, Newton())
69-
bfgssol = solve(optprob, BFGS()) #since only gradients are used here, we could go with a smaller dualcache(zeros(ps,ps), [9,9]) as well.
7078

71-
@test all(abs.(coeffs .- newtonsol.u) .< 1e-3)
72-
@test all(abs.(coeffs .- bfgssol.u) .< 1e-3)
79+
@test all(abs.(coeffs[:] .- newtonsol.u) .< 1e-2)
7380

7481
#an example where chunk_sizes are not the same on all differentiation levels:
75-
function foo(du, u, p, t)
76-
tmp = p[2]
77-
A = ones(size(tmp.du)).*p[1]
78-
tmp = get_tmp(tmp, u)
79-
mul!(tmp, A, u)
80-
@. du = u + tmp
81-
nothing
82-
end
83-
84-
coeffs = rand(1)
85-
cache = dualcache(zeros(ps,ps), [1, 1, 4])
86-
prob = ODEProblem(foo, ones(ps, ps), (0., 1.0), (coeffs, cache))
87-
realsol = solve(prob, TRBDF2(), saveat = 0.0:0.01:1.0, reltol = 1e-8)
82+
cache = dualcache(zeros(ps,ps), [9, 9, 2])
83+
realsol = solve(prob, TRBDF2(chunk_size = 2), saveat = 0.0:0.1:10.0, reltol = 1e-8)
8884

8985
function objfun(x, prob, realsol, cache)
90-
prob = remake(prob, u0 = eltype(x).(ones(ps, ps)), p = (x, cache))
91-
sol = solve(prob, TRBDF2(), saveat = 0.0:0.01:1.0, reltol = 1e-8)
86+
prob = remake(prob, u0 = eltype(x).(prob.u0), p = (x, cache))
87+
sol = solve(prob, TRBDF2(chunk_size = 2), saveat = 0.0:0.1:10.0, reltol = 1e-8)
9288

9389
ofv = 0.0
9490
if any((s.retcode != :Success for s in sol))
@@ -102,7 +98,7 @@ end
10298
fn(x,p) = objfun(x, p[1], p[2], p[3])
10399

104100
optfun = OptimizationFunction(fn, GalacticOptim.AutoForwardDiff())
105-
optprob = OptimizationProblem(optfun, rand(size(coeffs)...), (prob, realsol, cache))
101+
optprob = OptimizationProblem(optfun, -rand(length(coeffs)), (prob, realsol, cache))
106102
newtonsol2 = solve(optprob, Newton())
107103

108-
@test all(abs.(coeffs .- newtonsol2.u) .< 1e-3)
104+
@test all(abs.(coeffs[:] .- newtonsol2.u) .< 1e-2)

0 commit comments

Comments
 (0)