Commit c5b904f

Fix tests and remaining doc
1 parent 0b0a9c4 commit c5b904f

3 files changed (+38 / -32 lines)


docs/src/examples/normalizing_flows.md

Lines changed: 13 additions & 12 deletions
@@ -7,21 +7,21 @@ Now, we study a single layer neural network that can estimate the density `p_x`
 Before getting to the explanation, here's some code to start with. We will
 follow a full explanation of the definition and training process:
 
-```julia
+```@example cnf
 using Flux, DiffEqFlux, DifferentialEquations, Optimization, OptimizationFlux,
       OptimizationOptimJL, Distributions
 
 nn = Flux.Chain(
     Flux.Dense(1, 3, tanh),
     Flux.Dense(3, 1, tanh),
 ) |> f32
-tspan = (0.0f0, 10.0f0)
+tspan = (0.0f0, 1.0f0)
 
 ffjord_mdl = FFJORD(nn, tspan, Tsit5())
 
 # Training
 data_dist = Normal(6.0f0, 0.7f0)
-train_data = rand(data_dist, 1, 100)
+train_data = Float32.(rand(data_dist, 1, 100))
 
 function loss(θ)
     logpx, λ₁, λ₂ = ffjord_mdl(train_data, θ)
@@ -57,7 +57,7 @@ new_data = rand(ffjord_dist, 100)
 
 We can use DiffEqFlux.jl to define, train and output the densities computed by CNF layers. In the same way as a neural ODE, the layer takes a neural network that defines its derivative function (see [1] for a reference). A possible way to define a CNF layer, would be:
 
-```julia
+```@example cnf2
 using Flux, DiffEqFlux, DifferentialEquations, Optimization, OptimizationFlux,
       OptimizationOptimJL, Distributions
 
@@ -74,16 +74,17 @@ where we also pass as an input the desired timespan for which the differential e
 
 ### Training
 
-First, let's get an array from a normal distribution as the training data
+First, let's get an array from a normal distribution as the training data. Note that we want the data in Float32
+values to match how we have setup the neural network weights and the state space of the ODE.
 
-```julia
+```@example cnf2
 data_dist = Normal(6.0f0, 0.7f0)
-train_data = rand(data_dist, 1, 100)
+train_data = Float32.(rand(data_dist, 1, 100))
 ```
 
 Now we define a loss function that we wish to minimize
 
-```julia
+```@example cnf2
 function loss(θ)
     logpx, λ₁, λ₂ = ffjord_mdl(train_data, θ)
     -mean(logpx)
@@ -96,7 +97,7 @@ We then train the neural network to learn the distribution of `x`.
 
 Here we showcase starting the optimization with `ADAM` to more quickly find a minimum, and then honing in on the minimum by using `LBFGS`.
 
-```julia
+```@example cnf2
 adtype = Optimization.AutoZygote()
 optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, ffjord_mdl.p)
@@ -108,7 +109,7 @@ res1 = Optimization.solve(optprob,
 
 We then complete the training using a different optimizer starting from where `ADAM` stopped.
 
-```julia
+```@example cnf2
 optprob2 = Optimization.OptimizationProblem(optf, res1.u)
 res2 = Optimization.solve(optprob2,
                           Optim.LBFGS(),
@@ -119,7 +120,7 @@ res2 = Optimization.solve(optprob2,
 
 For evaluating the result, we can use `totalvariation` function from `Distances.jl`. First, we compute densities using actual distribution and FFJORD model. then we use a distance function.
 
-```julia
+```@example cnf2
 using Distances
 
 actual_pdf = pdf.(data_dist, train_data)
@@ -131,7 +132,7 @@ train_dis = totalvariation(learned_pdf, actual_pdf) / size(train_data, 2)
 
 What's more, we can generate new data by using FFJORD as a distribution in `rand`.
 
-```julia
+```@example cnf2
 ffjord_dist = FFJORDDistribution(FFJORD(nn, tspan, Tsit5(); p=res2.u))
 new_data = rand(ffjord_dist, 100)
 ```
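
Note on the `Float32.(rand(...))` change above. The following is a hedged sketch, not part of the commit: at the time, `rand` on a univariate `Normal` with `Float32` parameters in Distributions.jl generally still returned `Float64` samples, while the `|> f32` chain carries `Float32` weights, so the broadcast conversion keeps the FFJORD solve from promoting to `Float64`.

```julia
# Sketch of the type mismatch the doc change works around (assumption:
# Distributions.jl returns Float64 samples here, as it typically did).
using Distributions

data_dist  = Normal(6.0f0, 0.7f0)      # Float32 parameters...
raw        = rand(data_dist, 1, 100)   # ...but samples usually come back as Float64
train_data = Float32.(raw)             # broadcast conversion to match the f32 network

eltype(raw), eltype(train_data)        # typically (Float64, Float32)
```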

docs/src/examples/tensor_layer.md

Lines changed: 15 additions & 10 deletions
@@ -13,7 +13,7 @@ To obtain the training data, we solve the equation of motion using one of the
 solvers in `DifferentialEquations`:
 
 ```@example tensor
-using DiffEqFlux, Optimization, OptimizationOptimJL, DifferentialEquations, LinearAlgebra
+using DiffEqFlux, Optimization, OptimizationFlux, DifferentialEquations, LinearAlgebra
 k, α, β, γ = 1, 0.1, 0.2, 0.3
 tspan = (0.0,10.0)
 
@@ -24,7 +24,7 @@ end
 
 u0 = [1.0,0.0]
 ts = collect(0.0:0.1:tspan[2])
-prob_train = ODEProblem{true}(dxdt_train,u0,tspan,p=nothing)
+prob_train = ODEProblem{true}(dxdt_train,u0,tspan,p)
 data_train = Array(solve(prob_train,Tsit5(),saveat=ts))
 ```
 
@@ -49,7 +49,7 @@ end
 
 α = zeros(102)
 
-prob_pred = ODEProblem{true}(dxdt_pred,u0,tspan,p=nothing)
+prob_pred = ODEProblem{true}(dxdt_pred,u0,tspan)
 ```
 
 Note that we introduced a "cap" in the neural network term to avoid instabilities
@@ -59,9 +59,9 @@ in order to obtain a faster convergence for this particular example.
 Finally, we introduce the corresponding loss function:
 
 ```@example tensor
-
 function predict_adjoint(θ)
-  x = Array(solve(prob_pred,Tsit5(),p=θ,saveat=ts))
+  x = Array(solve(prob_pred,Tsit5(),p=θ,saveat=ts,
+            sensealg=InterpolatingAdjoint(autojacvec=ReverseDiffVJP(true))))
 end
 
 function loss_adjoint(θ)
@@ -70,8 +70,13 @@ function loss_adjoint(θ)
   return loss
 end
 
-function cb(θ,l)
-  @show θ, l
+iter = 0
+function callback(θ,l)
+  global iter
+  iter += 1
+  if iter%10 == 0
+    println(l)
+  end
   return false
 end
 ```
@@ -82,18 +87,18 @@ and we train the network using two rounds of `ADAM`:
 adtype = Optimization.AutoZygote()
 optf = Optimization.OptimizationFunction((x,p) -> loss_adjoint(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, α)
-res1 = Optimization.solve(optprob, ADAM(0.05), cb = cb, maxiters = 150)
+res1 = Optimization.solve(optprob, ADAM(0.05), callback = callback, maxiters = 150)
 
 optprob2 = Optimization.OptimizationProblem(optf, res1.u)
-res2 = Optimization.solve(optprob2, ADAM(0.001), cb = cb,maxiters = 150)
+res2 = Optimization.solve(optprob2, ADAM(0.001), callback = callback,maxiters = 150)
 opt = res2.u
 ```
 
 We plot the results and we obtain a fairly accurate learned model:
 
 ```@example tensor
 using Plots
-data_pred = predict_adjoint(opt)
+data_pred = predict_adjoint(res1.u)
 plot(ts, data_train[1,:], label = "X (ODE)")
 plot!(ts, data_train[2,:], label = "V (ODE)")
 plot!(ts, data_pred[1,:], label = "X (NN)")
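
Note on the `cb` → `callback` rename above. This is a sketch under the assumption of Optimization.jl's usual callback contract (keyword `callback`, signature `(θ, l)`, return value controlling early termination), not code taken from the commit:

```julia
# Minimal callback sketch: receives current parameters and loss;
# returning `true` would halt the solve early, `false` keeps iterating.
iter = 0
function callback(θ, l)
    global iter += 1
    iter % 10 == 0 && println("iteration $iter: loss = $l")
    return false
end

# Passed by keyword, e.g.:
# res = Optimization.solve(optprob, ADAM(0.05), callback = callback, maxiters = 150)
```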

test/cnf_test.jl

Lines changed: 10 additions & 10 deletions
@@ -17,7 +17,7 @@ end
 ffjord_mdl = FFJORD(nn, tspan, Tsit5())
 
 data_dist = Beta(2.0f0, 2.0f0)
-train_data = rand(data_dist, 1, 100)
+train_data = Float32.(rand(data_dist, 1, 100))
 
 function loss(θ; regularize, monte_carlo)
     logpx, λ₁, λ₂ = ffjord_mdl(train_data, θ; regularize, monte_carlo)
@@ -167,7 +167,7 @@ end
 ffjord_mdl = FFJORD(nn, tspan, Tsit5())
 
 data_dist = Beta(2.0f0, 2.0f0)
-train_data = rand(data_dist, 1, 100)
+train_data = Float32.(rand(data_dist, 1, 100))
 
 function loss(θ; regularize, monte_carlo)
     logpx, λ₁, λ₂ = ffjord_mdl(train_data, θ; regularize, monte_carlo)
@@ -196,8 +196,8 @@ end
 monte_carlo = false
 
 data_dist = Beta(7.0f0, 7.0f0)
-train_data = rand(data_dist, 1, 100)
-test_data = rand(data_dist, 1, 100)
+train_data = Float32.(rand(data_dist, 1, 100))
+test_data = Float32.(rand(data_dist, 1, 100))
 
 function loss(θ)
     logpx, λ₁, λ₂ = ffjord_mdl(train_data, θ; regularize, monte_carlo)
@@ -224,8 +224,8 @@ end
 monte_carlo = false
 
 data_dist = Normal(6.0f0, 0.7f0)
-train_data = rand(data_dist, 1, 100)
-test_data = rand(data_dist, 1, 100)
+train_data = Float32.(rand(data_dist, 1, 100))
+test_data = Float32.(rand(data_dist, 1, 100))
 
 function loss(θ)
     logpx, λ₁, λ₂ = ffjord_mdl(train_data, θ; regularize, monte_carlo)
@@ -253,8 +253,8 @@ end
 μ = ones(Float32, 2)
 Σ = Diagonal([7.0f0, 7.0f0])
 data_dist = MvNormal(μ, Σ)
-train_data = rand(data_dist, 100)
-test_data = rand(data_dist, 100)
+train_data = Float32.(rand(data_dist, 100))
+test_data = Float32.(rand(data_dist, 100))
 
 function loss(θ)
     logpx, λ₁, λ₂ = ffjord_mdl(train_data, θ; regularize, monte_carlo)
@@ -282,8 +282,8 @@ end
 μ = ones(Float32, 2)
 Σ = Diagonal([7.0f0, 7.0f0])
 data_dist = MvNormal(μ, Σ)
-train_data = rand(data_dist, 100)
-test_data = rand(data_dist, 100)
+train_data = Float32.(rand(data_dist, 100))
+test_data = Float32.(rand(data_dist, 100))
 
 function loss(θ)
     logpx, λ₁, λ₂ = ffjord_mdl(train_data, θ; regularize, monte_carlo)
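
The test edits mirror the documentation changes: every sampled dataset is converted to `Float32` so its element type matches the `f32` network the FFJORD tests build. A small illustrative sketch of the invariant being maintained (the `nn` definition here is an assumption for illustration, not copied from the test file):

```julia
# Sketch: the data eltype should match the Float32 weights of the f32 chain.
using Flux, Distributions

nn = Flux.Chain(Flux.Dense(1, 3, tanh), Flux.Dense(3, 1, tanh)) |> f32
train_data = Float32.(rand(Beta(2.0f0, 2.0f0), 1, 100))

eltype(train_data) == eltype(first(Flux.params(nn)))    # true: both Float32
```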
