# Neural Stochastic Differential Equations With Method of Moments

With neural stochastic differential equations, there is once again a helper form
`neural_dmsde` which can be used for the multiplicative noise case (consult the
layers API documentation, or [this full example using the layer
function](https://github.com/MikeInnes/zygote-paper/blob/master/neural_sde/neural_sde.jl)).

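For orientation, the linked script constructs its multiplicative-noise model roughly
as in the sketch below. The `neural_dmsde` call signature shown here is assumed from
that example and may differ across DiffEqFlux versions, so treat it as a sketch
rather than a definitive API reference:

```julia
using Flux, DiffEqFlux, StochasticDiffEq

drift = Flux.Chain(x -> x.^3, Flux.Dense(2, 50, tanh), Flux.Dense(50, 2))
mp = Float32[0.2, 0.2]                  # multiplicative noise magnitudes
tspan = (0.0f0, 1.0f0)
tsteps = range(tspan[1], tspan[2], length = 30)

# Sketch of the helper call from the linked example (signature assumed, not guaranteed)
n_sde = neural_dmsde(drift, Float32[2.0; 0.0], mp, tspan, SOSRI(),
                     saveat = tsteps, reltol = 1e-1, abstol = 1e-1)
```
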
However, since there are far too many possible combinations for the API to
support, in many cases you will want to define neural differential equations
for non-ODE systems from scratch in a performant way. For these systems, it is
generally best to use `TrackerAdjoint` with non-mutating (out-of-place) forms.
For example, the following defines a neural SDE with neural networks for both
the drift and diffusion terms:

```julia
dudt(u, p, t) = model(u)
g(u, p, t) = model2(u)
prob = SDEProblem(dudt, g, x, tspan, nothing)
```
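
The `TrackerAdjoint` choice itself is passed through the `sensealg` keyword of
`solve` and takes effect when the solve is differentiated. A minimal sketch,
assuming SciMLSensitivity (DiffEqSensitivity on older versions) is available and
reusing `prob` from above:

```julia
using StochasticDiffEq, SciMLSensitivity  # DiffEqSensitivity on older versions

# Select tracker-based reverse-mode AD for differentiating through the SDE solve
sol = solve(prob, SOSRI(); sensealg = TrackerAdjoint(), saveat = 0.1)
```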

where `model` and `model2` are different neural networks. The same applies to a
neural delay differential equation, whose out-of-place formulation is
`f(u, h, p, t)`. Thus, for example, if we want to define a neural delay
differential equation which uses the history value at `p.tau` in the past, we
can define:

```julia
dudt_(u, h, p, t) = model([u; h(p, t - p.tau)])
prob = DDEProblem(dudt_, u0, h, tspan, p)
```
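
To make that snippet self-contained, a history function `h` and a parameter object
carrying the delay are also needed. A purely illustrative choice (both the constant
history and the delay value below are hypothetical, not from the original example):

```julia
h(p, t) = zeros(Float32, 2)  # constant history used for times before the initial time
p = (tau = 0.1f0,)           # named tuple carrying the (hypothetical) delay
```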

First let's build training data from the same example as the neural ODE:

```@example nsde
using Plots, Statistics
using Flux, Optimization, OptimizationFlux, DiffEqFlux, StochasticDiffEq, SciMLBase.EnsembleAnalysis

u0 = Float32[2.; 0.]      # initial condition
datasize = 30             # number of saved time points
tspan = (0.0f0, 1.0f0)
tsteps = range(tspan[1], tspan[2], length = datasize)
```

```@example nsde
# True drift: cubic dynamics through the matrix `true_A` (same as the neural ODE example)
function trueSDEfunc(du, u, p, t)
    true_A = [-0.1 2.0; -2.0 -0.1]
    du .= ((u.^3)'true_A)'
end

# True diffusion: diagonal multiplicative noise with magnitudes `mp`
mp = Float32[0.2, 0.2]
function true_noise_func(du, u, p, t)
    du .= mp.*u
end

prob_truesde = SDEProblem(trueSDEfunc, true_noise_func, u0, tspan)
```

For our dataset, we will use DifferentialEquations.jl's [parallel ensemble
interface](http://docs.juliadiffeq.org/dev/features/ensemble.html) to generate
data from the mean and variance of 10,000 runs of the SDE:

```@example nsde
# Simulate an ensemble of trajectories and extract the mean and variance at each time point
ensemble_prob = EnsembleProblem(prob_truesde)
ensemble_sol = solve(ensemble_prob, SOSRI(), trajectories = 10000)
ensemble_sum = EnsembleSummary(ensemble_sol)

sde_data, sde_data_vars = Array.(timeseries_point_meanvar(ensemble_sol, tsteps))
```

Now we build a neural SDE. For simplicity, we will use the `NeuralDSDE` layer
function, which defines a neural SDE with diagonal noise:

```@example nsde
# Drift network
drift_dudt = Flux.Chain(x -> x.^3,
                        Flux.Dense(2, 50, tanh),
                        Flux.Dense(50, 2))
p1, re1 = Flux.destructure(drift_dudt)

# Diffusion (diagonal noise) network
diffusion_dudt = Flux.Chain(Flux.Dense(2, 2))
p2, re2 = Flux.destructure(diffusion_dudt)

neuralsde = NeuralDSDE(drift_dudt, diffusion_dudt, tspan, SOSRI(),
                       saveat = tsteps, reltol = 1e-1, abstol = 1e-1)
```

Let's see what that looks like:

```@example nsde
# Get the prediction using the correct initial condition
prediction0 = neuralsde(u0)

# Rebuild the drift and diffusion from the flat parameter vector:
# the first `neuralsde.len` entries parameterize the drift, the rest the diffusion
drift_(u, p, t) = re1(p[1:neuralsde.len])(u)
diffusion_(u, p, t) = re2(p[neuralsde.len+1:end])(u)

prob_neuralsde = SDEProblem(drift_, diffusion_, u0, (0.0f0, 1.2f0), neuralsde.p)

ensemble_nprob = EnsembleProblem(prob_neuralsde)
ensemble_nsol = solve(ensemble_nprob, SOSRI(), trajectories = 100,
                      saveat = tsteps)
ensemble_nsum = EnsembleSummary(ensemble_nsol)

plt1 = plot(ensemble_nsum, title = "Neural SDE: Before Training")
scatter!(plt1, tsteps, sde_data', lw = 3)

scatter(tsteps, sde_data[1,:], label = "data")
scatter!(tsteps, prediction0[1,:], label = "prediction")
```

Now, just as with the neural ODE, we define a loss function that calculates the
mean and variance from `n` runs at each time point and uses the distance from
the data values:

```@example nsde
function predict_neuralsde(p, u = u0)
    return Array(neuralsde(u, p))
end

function loss_neuralsde(p; n = 100)
    # Simulate `n` trajectories from the same initial condition in one batched call
    u = repeat(reshape(u0, :, 1), 1, n)
    samples = predict_neuralsde(p, u)
    # Match the empirical mean and variance at each time point against the data
    means = mean(samples, dims = 2)
    vars = var(samples, dims = 2, mean = means)[:, 1, :]
    means = means[:, 1, :]
    loss = sum(abs2, sde_data - means) + sum(abs2, sde_data_vars - vars)
    return loss, means, vars
end
```

```@example nsde
list_plots = []
iter = 0

# Callback function to observe training
callback = function (p, loss, means, vars; doplot = false)
  global list_plots, iter

  if iter == 0
    list_plots = []
  end
  iter += 1

  # loss against current data
  display(loss)

  # plot current prediction against data
  plt = Plots.scatter(tsteps, sde_data[1,:], yerror = sde_data_vars[1,:],
                      ylim = (-4.0, 8.0), label = "data")
  Plots.scatter!(plt, tsteps, means[1,:], ribbon = vars[1,:], label = "prediction")
  push!(list_plots, plt)

  if doplot
    display(plt)
  end
  return false
end
```

Now we train using this loss function. We can pre-train a little bit using a
smaller `n` and then increase it after the model has had some time to adjust
towards the right mean behavior:

```@example nsde
opt = ADAM(0.025)

# First round of training with n = 10
adtype = Optimization.AutoZygote()
optf = Optimization.OptimizationFunction((x, p) -> loss_neuralsde(x, n = 10), adtype)
optprob = Optimization.OptimizationProblem(optf, neuralsde.p)
result1 = Optimization.solve(optprob, opt,
                             callback = callback, maxiters = 100)
```
| 173 | + |
| 174 | +We resume the training with a larger `n`. (WARNING - this step is a couple of |
| 175 | +orders of magnitude longer than the previous one). |
| 176 | + |
```@example nsde
# Second round of training with n = 100
optf2 = Optimization.OptimizationFunction((x, p) -> loss_neuralsde(x, n = 100), adtype)
optprob2 = Optimization.OptimizationProblem(optf2, result1.u)
result2 = Optimization.solve(optprob2, opt,
                             callback = callback, maxiters = 100)
```
| 183 | + |
| 184 | +And now we plot the solution to an ensemble of the trained neural SDE: |
| 185 | + |
```@example nsde
_, means, vars = loss_neuralsde(result2.u, n = 1000)

plt2 = Plots.scatter(tsteps, sde_data', yerror = sde_data_vars',
                     label = "data", title = "Neural SDE: After Training",
                     xlabel = "Time")
plot!(plt2, tsteps, means', lw = 8, ribbon = vars', label = "prediction")

plt = plot(plt1, plt2, layout = (2, 1))
savefig(plt, "NN_sde_combined.png"); nothing # hide
```

Try this with GPUs as well!
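
As a rough sketch of what a GPU variant could look like (this assumes CUDA.jl and a
working GPU; the `Flux.gpu` calls simply move the networks and state to the device,
and everything else mirrors the CPU code above):

```julia
using CUDA, Flux, DiffEqFlux, StochasticDiffEq

# Move the initial condition and both networks onto the GPU (assumes a CUDA-capable device)
u0_gpu = Float32[2.0; 0.0] |> Flux.gpu
drift_gpu = Flux.Chain(x -> x.^3, Flux.Dense(2, 50, tanh), Flux.Dense(50, 2)) |> Flux.gpu
diffusion_gpu = Flux.Chain(Flux.Dense(2, 2)) |> Flux.gpu

neuralsde_gpu = NeuralDSDE(drift_gpu, diffusion_gpu, tspan, SOSRI(),
                           saveat = tsteps, reltol = 1e-1, abstol = 1e-1)

prediction_gpu = neuralsde_gpu(u0_gpu)   # forward simulation on the GPU
```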