patrick-kidger
diff --git a/‎benchmarks/scan_stages.py‎
Lines changed: 67 additions & 0 deletions b/‎benchmarks/scan_stages.py‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎benchmarks/scan_stages_cnf.py‎
Lines changed: 92 additions & 0 deletions b/‎benchmarks/scan_stages_cnf.py‎
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,67 @@
+"""Benchmarks the effect of `diffrax.AbstractRungeKutta(scan_stages=...)`.
+
+On my CPU-only machine:
+```
+bash> python scan_stages.py False
+Compile+run time 24.38062646985054
+Run time 0.0018830380868166685
+
+bash> python scan_stages.py True
+Compile+run time 11.418417416978627
+Run time 0.0014536201488226652
+```
+"""
+
+import functools as ft
+import timeit
+
+import diffrax as dfx
+import equinox as eqx
+import fire
+import jax.numpy as jnp
+import jax.random as jr
+
+
+def _weight(in_, out, key):
+    """Sample a normal `(out, in_)` matrix and unpack it into nested lists.
+
+    Returns a list of `out` rows, each itself a list of `in_` scalar entries,
+    produced by iterating over the sampled array row by row.
+    """
+    matrix = jr.normal(key, (out, in_))
+    return [list(row) for row in matrix]
+
+
+class VectorField(eqx.Module):
+    """Toy stack of bias-free linear layers stored as nested lists of scalars.
+
+    Each layer is kept as a list of rows of scalar weights (see `_weight`), so
+    applying it is written out as one multiply per weight entry.
+    """
+
+    weights: list
+
+    def __init__(self, in_, out, width, depth, *, key):
+        """Create `depth + 1` layers: `in_ -> width`, then `width -> width`
+        for each `i` in `range(1, depth)`, then `width -> out`.
+
+        `key` is split into `depth + 1` subkeys, one per layer.
+        """
+        subkeys = jr.split(key, depth + 1)
+        layers = [_weight(in_, width, subkeys[0])]
+        layers.extend(
+            _weight(width, width, subkeys[idx]) for idx in range(1, depth)
+        )
+        layers.append(_weight(width, out, subkeys[depth]))
+        self.weights = layers
+
+    def __call__(self, t, y, args):
+        """Apply every layer in turn to `y` and return the stacked result.
+
+        `t` and `args` are accepted but not used.
+        """
+        # Deliberately wasteful: each dot product is spelled out scalar by
+        # scalar so that this toy example is more expensive.
+        activations = list(y)
+        for layer in self.weights:
+            outputs = []
+            for row in layer:
+                total = 0
+                for weight, value in zip(row, activations):
+                    total = total + weight * value
+                outputs.append(total)
+            activations = outputs
+        return jnp.stack(activations)
+
+
+def main(scan_stages):
+    """Time a compiling solve and a cached solve of a toy ODE.
+
+    Builds a small `VectorField`, solves it from `t0=0` to `t1=1` with
+    `dfx.Dopri8` under an adaptive `dfx.PIDController`, and prints the
+    wall-clock time of the first call (which includes compilation) followed
+    by the time of a second call.
+
+    `scan_stages` is forwarded unchanged to `dfx.Dopri8(scan_stages=...)`.
+    """
+    vf = VectorField(1, 1, 16, 2, key=jr.PRNGKey(0))
+    term = dfx.ODETerm(vf)
+    solver = dfx.Dopri8(scan_stages=scan_stages)
+    stepsize_controller = dfx.PIDController(rtol=1e-3, atol=1e-6)
+    t0 = 0
+    t1 = 1
+    dt0 = None
+
+    @eqx.filter_jit
+    def solve(y0):
+        return dfx.diffeqsolve(
+            term, solver, t0, t1, dt0, y0, stepsize_controller=stepsize_controller
+        )
+
+    def solve_and_wait(y0):
+        # JAX dispatches computations asynchronously, so a jitted call can
+        # return before its result exists. Block on the solution values so
+        # that the timer covers the actual computation, not just dispatch.
+        solve(y0).ys.block_until_ready()
+
+    solve_ = ft.partial(solve_and_wait, jnp.array([1.0]))
+    print("Compile+run time", timeit.timeit(solve_, number=1))
+    print("Run time", timeit.timeit(solve_, number=1))
+
+
+# Hand `main` to python-fire so that `scan_stages` is taken from the command
+# line, as in the invocations shown in the module docstring.
+fire.Fire(main)
@@ -0,0 +1,92 @@
+"""Benchmarks the effect of `diffrax.AbstractRungeKutta(scan_stages=...)`.
+
+On my CPU-only machine:
+```
+bash> python scan_stages_cnf.py --scan_stages=False --backsolve=False
+Compile+run time 79.18114789901301
+Run time 0.16631506383419037
+
+bash> python scan_stages_cnf.py --scan_stages=False --backsolve=True
+Compile+run time 28.233896102989092
+Run time 0.021237157052382827
+
+bash> python scan_stages_cnf.py --scan_stages=True --backsolve=False
+Compile+run time 37.9795492868870
+Run time 0.16300765215419233
+
+bash> python scan_stages_cnf.py --scan_stages=True --backsolve=True
+Compile+run time 12.199542510090396
+Run time 0.024600893026217818
+```
+
+(Not forgetting that --backsolve=True produces only approximate gradients, so the fact
+that it obtains better compile time and run time doesn't mean it's always the best
+choice.)
+"""
+
+# This benchmark is adapted from
+# https://github.com/patrick-kidger/diffrax/issues/94#issuecomment-1140527134
+
+import functools as ft
+import timeit
+
+import diffrax
+import equinox as eqx
+import fire
+import jax
+import jax.nn as jnn
+import jax.numpy as jnp
+import jax.random as jr
+import jax.scipy as jsp
+
+
+def vector_field_prob(t, input, model):
+    """Evaluate `model` at the state and the trace of its Jacobian there.
+
+    `input` is a pair whose first entry is the one-dimensional state `y`; the
+    second entry is ignored, as is `t`. Returns `(model(y), trace)`, where the
+    Jacobian is assembled by vmapping the VJP of `model` over the rows of an
+    identity matrix.
+    """
+    state, _ = input
+    value, pullback = jax.vjp(model, state)
+    (dim,) = state.shape
+    # One VJP per basis cotangent gives the full Jacobian, row by row.
+    (jacobian,) = jax.vmap(pullback)(jnp.eye(dim))
+    return value, jnp.trace(jacobian)
+
+
+@eqx.filter_vmap(args=(None, 0, None, None))
+def log_prob(model, y0, scan_stages, backsolve):
+    """Integrate `vector_field_prob` from `y0` and score the final state.
+
+    The augmented state `(y0, 0.0)` is solved from `t0=0.0` to `t1=0.5` with
+    `diffrax.Dopri5(scan_stages=scan_stages)` and an adaptive PID step size
+    controller, passing `model` as the solver `args`. `backsolve` selects
+    `diffrax.BacksolveAdjoint` when truthy and
+    `diffrax.RecursiveCheckpointAdjoint` otherwise.
+
+    Returns the integrated second component plus the summed
+    `jsp.stats.norm.logpdf` of the final first component.
+    """
+    if backsolve:
+        adjoint_cls = diffrax.BacksolveAdjoint
+    else:
+        adjoint_cls = diffrax.RecursiveCheckpointAdjoint
+    solution = diffrax.diffeqsolve(
+        diffrax.ODETerm(vector_field_prob),
+        diffrax.Dopri5(scan_stages=scan_stages),
+        t0=0.0,
+        t1=0.5,
+        dt0=0.05,
+        y0=(y0, 0.0),
+        args=model,
+        stepsize_controller=diffrax.PIDController(rtol=1.4e-8, atol=1.4e-8),
+        adjoint=adjoint_cls(),
+    )
+    # Each saved quantity is unpacked as a length-one sequence along its
+    # leading axis.
+    final_states, final_logps = solution.ys
+    (y1,) = final_states
+    (delta_logp,) = final_logps
+    return delta_logp + jsp.stats.norm.logpdf(y1).sum(0)
+
+
+@eqx.filter_jit
+@eqx.filter_grad
+def solve(model, inputs, scan_stages, backsolve):
+    """Negated mean of `log_prob` over `inputs`.
+
+    Wrapped in `eqx.filter_grad` and then `eqx.filter_jit`, so calling it
+    yields the jitted gradient of this loss.
+    """
+    per_sample = log_prob(model, inputs, scan_stages, backsolve)
+    mean_log_prob = per_sample.mean()
+    return -mean_log_prob
+
+
+def main(scan_stages, backsolve):
+    """Time a compiling call and a cached call of the gradient in `solve`.
+
+    Builds an `eqx.nn.MLP` and a `(256, 2)` batch of normal samples from a
+    fixed PRNG key, then prints the wall-clock time of the first call to
+    `solve` (which includes compilation) followed by the time of a second
+    call.
+
+    `scan_stages` and `backsolve` are forwarded unchanged to `solve`.
+    """
+    mkey, dkey = jr.split(jr.PRNGKey(0), 2)
+    model = eqx.nn.MLP(2, 2, 10, 2, activation=jnn.gelu, key=mkey)
+    x = jr.normal(dkey, (256, 2))
+
+    def solve_and_wait():
+        grads = solve(model, x, scan_stages, backsolve)
+        # JAX dispatches computations asynchronously, so a jitted call can
+        # return before its result exists. Block on every array in the
+        # gradient pytree so that the timer covers the actual computation,
+        # not just dispatch.
+        for leaf in jax.tree_util.tree_leaves(grads):
+            if hasattr(leaf, "block_until_ready"):
+                leaf.block_until_ready()
+
+    print("Compile+run time", timeit.timeit(solve_and_wait, number=1))
+    print("Run time", timeit.timeit(solve_and_wait, number=1))
+
+
+# Hand `main` to python-fire so that `--scan_stages` and `--backsolve` are
+# taken from the command line, as in the invocations shown in the module
+# docstring.
+fire.Fire(main)