Commit a661204

Merge branch 'new_samplers' into compositional_sampling_diffusion
# Conflicts:
#   bayesflow/networks/diffusion_model/diffusion_model.py
#   bayesflow/utils/integrate.py
2 parents 67f1175 + f9823f8

File tree

14 files changed: +1616 -294 lines


bayesflow/diagnostics/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
     calibration_error,
     calibration_log_gamma,
     posterior_contraction,
+    posterior_z_score,
     summary_space_comparison,
 )

bayesflow/diagnostics/metrics/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,4 @@
 from .classifier_two_sample_test import classifier_two_sample_test
 from .model_misspecification import bootstrap_comparison, summary_space_comparison
 from .calibration_log_gamma import calibration_log_gamma, gamma_null_distribution, gamma_discrepancy
+from .posterior_z_score import posterior_z_score
bayesflow/diagnostics/metrics/posterior_z_score.py

Lines changed: 108 additions & 0 deletions

@@ -0,0 +1,108 @@
+from collections.abc import Sequence, Mapping, Callable
+
+import numpy as np
+
+from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities
+
+
+def posterior_z_score(
+    estimates: Mapping[str, np.ndarray] | np.ndarray,
+    targets: Mapping[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] = None,
+    variable_names: Sequence[str] = None,
+    test_quantities: dict[str, Callable] = None,
+    aggregation: Callable | None = np.median,
+) -> dict[str, any]:
+    """
+    Computes the posterior z-score for the given samples according to [1]:
+
+        post_z_score = (posterior_mean - true_parameters) / posterior_std
+
+    The score is adequate if it centers around zero and spreads roughly
+    within the interval [-3, 3].
+
+    [1] Schad, D. J., Betancourt, M., & Vasishth, S. (2021).
+        Toward a principled Bayesian workflow in cognitive science.
+        Psychological Methods, 26(1), 103.
+        Paper also available at https://arxiv.org/abs/1904.12765
+
+    Parameters
+    ----------
+    estimates : np.ndarray of shape (num_datasets, num_draws_post, num_variables)
+        Posterior samples, comprising `num_draws_post` random draws from the posterior
+        distribution for each of the `num_datasets` data sets.
+    targets : np.ndarray of shape (num_datasets, num_variables)
+        Prior samples, comprising `num_datasets` ground truths.
+    variable_keys : Sequence[str], optional (default = None)
+        Select keys from the dictionaries provided in estimates and targets.
+        By default, select all keys.
+    variable_names : Sequence[str], optional (default = None)
+        Optional variable names to show in the output.
+    test_quantities : dict or None, optional, default: None
+        A dict that maps plot titles to functions that compute test quantities
+        based on estimate/target draws.
+
+        The dict keys are automatically added to ``variable_keys`` and
+        ``variable_names``. Test quantity functions are expected to accept a
+        dict of draws with shape ``(batch_size, ...)`` as the first (typically
+        only) positional argument and return a NumPy array of shape
+        ``(batch_size,)``. The functions do not have to deal with an additional
+        sample dimension, as appropriate reshaping is done internally.
+    aggregation : callable or None, optional (default = np.median)
+        Function to aggregate the z-scores across data sets. Typically
+        `np.mean` or `np.median`. If None is provided, the individual values
+        are returned.
+
+    Returns
+    -------
+    result : dict
+        Dictionary containing:
+
+        - "values" : float or np.ndarray
+            The (optionally aggregated) posterior z-score per variable.
+        - "metric_name" : str
+            The name of the metric ("Posterior z-score").
+        - "variable_names" : str
+            The (inferred) variable names.
+
+    Notes
+    -----
+    The posterior z-score quantifies how far the posterior mean lies from the
+    true generating parameter, in units of the posterior standard deviation.
+    Values near 0 (in absolute terms) mean the posterior mean is close to the
+    truth; large absolute values indicate substantial deviation. The sign shows
+    the direction of the bias.
+    """
+
+    # Optionally, compute and prepend test quantities from draws
+    if test_quantities is not None:
+        updated_data = compute_test_quantities(
+            targets=targets,
+            estimates=estimates,
+            variable_keys=variable_keys,
+            variable_names=variable_names,
+            test_quantities=test_quantities,
+        )
+        variable_names = updated_data["variable_names"]
+        variable_keys = updated_data["variable_keys"]
+        estimates = updated_data["estimates"]
+        targets = updated_data["targets"]
+
+    samples = dicts_to_arrays(
+        estimates=estimates,
+        targets=targets,
+        variable_keys=variable_keys,
+        variable_names=variable_names,
+    )
+
+    post_vars = samples["estimates"].var(axis=1, ddof=1)
+    post_means = samples["estimates"].mean(axis=1)
+    post_stds = np.sqrt(post_vars)
+    z_score = (post_means - samples["targets"]) / post_stds
+    if aggregation is not None:
+        z_score = aggregation(z_score, axis=0)
+    variable_names = samples["estimates"].variable_names
+    return {"values": z_score, "metric_name": "Posterior z-score", "variable_names": variable_names}

bayesflow/diagnostics/plots/plot_quantity.py

Lines changed: 6 additions & 1 deletion
@@ -213,7 +213,12 @@ def _prepare_values(

     if estimates is not None:
         if is_values_callable:
-            values = values(estimates=estimates, targets=targets, **filter_kwargs({"aggregation": None}, values))
+            values = values(
+                estimates=estimates,
+                targets=targets,
+                variable_keys=variable_keys,
+                **filter_kwargs({"aggregation": None}, values),
+            )

     data = dicts_to_arrays(
         estimates=estimates,
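
For context, `filter_kwargs({"aggregation": None}, values)` forwards `aggregation=None` only when the metric callable accepts such a keyword. A minimal stand-in with the same observable behavior (a sketch, not the library's actual implementation):

import inspect

def filter_kwargs_sketch(kwargs, f):
    # Keep only the keyword arguments that appear in f's signature.
    accepted = inspect.signature(f).parameters
    return {k: v for k, v in kwargs.items() if k in accepted}

def metric_without_aggregation(estimates, targets):
    return estimates

filter_kwargs_sketch({"aggregation": None}, metric_without_aggregation)  # -> {}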

bayesflow/experimental/stable_consistency_model/stable_consistency_model.py

Lines changed: 2 additions & 2 deletions
@@ -222,9 +222,9 @@ def _inverse(self, z: Tensor, conditions: Tensor = None, **kwargs) -> Tensor:
         z : Tensor
             Samples from a standard normal distribution
         conditions : Tensor, optional, default: None
-            Conditions for a approximate conditional distribution
+            Conditions for an approximate conditional distribution
         **kwargs : dict, optional, default: {}
-            Additional keyword arguments. Include `steps` (default: 30) to
+            Additional keyword arguments. Include `steps` (default: 15) to
             adjust the number of sampling steps.

         Returns

bayesflow/networks/diffusion_model/diffusion_model.py

Lines changed: 18 additions & 20 deletions
@@ -16,6 +16,7 @@
     integrate_stochastic,
     logging,
     tensor_utils,
+    STOCHASTIC_METHODS,
 )
 from bayesflow.utils.serialization import serialize, deserialize, serializable

@@ -39,13 +40,13 @@ class DiffusionModel(InferenceNetwork):
         "activation": "mish",
         "kernel_initializer": "he_normal",
         "residual": True,
-        "dropout": 0.0,
+        "dropout": 0.05,
         "spectral_normalization": False,
     }

     INTEGRATE_DEFAULT_CONFIG = {
-        "method": "rk45",
-        "steps": 100,
+        "method": "two_step_adaptive",
+        "steps": "adaptive",
     }

     def __init__(

@@ -402,14 +403,13 @@ def _forward(
         conditions: Tensor = None,
         density: bool = False,
         training: bool = False,
-        compositional: bool = False,
         **kwargs,
     ) -> Tensor | tuple[Tensor, Tensor]:
         integrate_kwargs = {"start_time": 0.0, "stop_time": 1.0}
         integrate_kwargs = integrate_kwargs | self.integrate_kwargs
         integrate_kwargs = integrate_kwargs | kwargs

-        if integrate_kwargs["method"] == "euler_maruyama":
+        if integrate_kwargs["method"] in STOCHASTIC_METHODS:
             raise ValueError("Stochastic methods are not supported for forward integration.")

         if density:

@@ -453,14 +453,13 @@ def _inverse(
         conditions: Tensor = None,
         density: bool = False,
         training: bool = False,
-        compositional: bool = False,
         **kwargs,
     ) -> Tensor | tuple[Tensor, Tensor]:
         integrate_kwargs = {"start_time": 1.0, "stop_time": 0.0}
         integrate_kwargs = integrate_kwargs | self.integrate_kwargs
         integrate_kwargs = integrate_kwargs | kwargs
         if density:
-            if integrate_kwargs["method"] == "euler_maruyama":
+            if integrate_kwargs["method"] in STOCHASTIC_METHODS:
                 raise ValueError("Stochastic methods are not supported for density computation.")

             def deltas(time, xz):

@@ -479,7 +478,7 @@ def deltas(time, xz):
             return x, log_density

         state = {"xz": z}
-        if integrate_kwargs["method"] == "euler_maruyama":
+        if integrate_kwargs["method"] in STOCHASTIC_METHODS:

             def deltas(time, xz):
                 return {

@@ -490,18 +489,17 @@ def diffusion(time, xz):
                 return {"xz": self.diffusion_term(xz, time=time, training=training)}

             score_fn = None
-            if "corrector_steps" in integrate_kwargs:
-                if integrate_kwargs["corrector_steps"] > 0:
-
-                    def score_fn(time, xz):
-                        return {
-                            "xz": self.score(
-                                xz,
-                                time=time,
-                                conditions=conditions,
-                                training=training,
-                            )
-                        }
+            if "corrector_steps" in integrate_kwargs or integrate_kwargs.get("method") == "langevin":
+
+                def score_fn(time, xz):
+                    return {
+                        "xz": self.score(
+                            xz,
+                            time=time,
+                            conditions=conditions,
+                            training=training,
+                        )
+                    }

             state = integrate_stochastic(
                 drift_fn=deltas,
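
With the guard generalized from `euler_maruyama` to every method registered in `STOCHASTIC_METHODS`, stochastic samplers stay available for inverse (sampling) passes while remaining rejected for forward integration and density computation. A hedged sketch of how the new defaults and a stochastic override might be used (method names are taken from this diff; the `sample` call assumes the usual `InferenceNetwork` interface):

diffusion = DiffusionModel()
# conditions: a tensor of conditioning variables prepared elsewhere

# Default: deterministic "two_step_adaptive" integration with adaptive step count.
samples = diffusion.sample((128,), conditions=conditions)

# Stochastic override: "langevin" now triggers the score_fn construction above
# even without an explicit corrector_steps entry.
samples = diffusion.sample((128,), conditions=conditions, method="langevin", steps=250)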

bayesflow/networks/flow_matching/flow_matching.py

Lines changed: 8 additions & 6 deletions
@@ -53,8 +53,8 @@ class FlowMatching(InferenceNetwork):
     }

     INTEGRATE_DEFAULT_CONFIG = {
-        "method": "rk45",
-        "steps": 100,
+        "method": "tsit5",
+        "steps": "adaptive",
     }

     def __init__(

@@ -236,14 +236,15 @@ def f(x):
     def _forward(
         self, x: Tensor, conditions: Tensor = None, density: bool = False, training: bool = False, **kwargs
     ) -> Tensor | tuple[Tensor, Tensor]:
+        integrate_kwargs = self.integrate_kwargs | kwargs
         if density:

             def deltas(time, xz):
                 v, trace = self._velocity_trace(xz, time=time, conditions=conditions, training=training)
                 return {"xz": v, "trace": trace}

             state = {"xz": x, "trace": keras.ops.zeros(keras.ops.shape(x)[:-1] + (1,), dtype=keras.ops.dtype(x))}
-            state = integrate(deltas, state, start_time=1.0, stop_time=0.0, **(self.integrate_kwargs | kwargs))
+            state = integrate(deltas, state, start_time=1.0, stop_time=0.0, **integrate_kwargs)

             z = state["xz"]
             log_density = self.base_distribution.log_prob(z) + keras.ops.squeeze(state["trace"], axis=-1)

@@ -254,7 +255,7 @@ def deltas(time, xz):
             return {"xz": self.velocity(xz, time=time, conditions=conditions, training=training)}

         state = {"xz": x}
-        state = integrate(deltas, state, start_time=1.0, stop_time=0.0, **(self.integrate_kwargs | kwargs))
+        state = integrate(deltas, state, start_time=1.0, stop_time=0.0, **integrate_kwargs)

         z = state["xz"]

@@ -263,14 +264,15 @@ def deltas(time, xz):
     def _inverse(
         self, z: Tensor, conditions: Tensor = None, density: bool = False, training: bool = False, **kwargs
     ) -> Tensor | tuple[Tensor, Tensor]:
+        integrate_kwargs = self.integrate_kwargs | kwargs
         if density:

             def deltas(time, xz):
                 v, trace = self._velocity_trace(xz, time=time, conditions=conditions, training=training)
                 return {"xz": v, "trace": trace}

             state = {"xz": z, "trace": keras.ops.zeros(keras.ops.shape(z)[:-1] + (1,), dtype=keras.ops.dtype(z))}
-            state = integrate(deltas, state, start_time=0.0, stop_time=1.0, **(self.integrate_kwargs | kwargs))
+            state = integrate(deltas, state, start_time=0.0, stop_time=1.0, **integrate_kwargs)

             x = state["xz"]
             log_density = self.base_distribution.log_prob(z) - keras.ops.squeeze(state["trace"], axis=-1)

@@ -281,7 +283,7 @@ def deltas(time, xz):
             return {"xz": self.velocity(xz, time=time, conditions=conditions, training=training)}

         state = {"xz": z}
-        state = integrate(deltas, state, start_time=0.0, stop_time=1.0, **(self.integrate_kwargs | kwargs))
+        state = integrate(deltas, state, start_time=0.0, stop_time=1.0, **integrate_kwargs)

         x = state["xz"]
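
The refactor computes `integrate_kwargs = self.integrate_kwargs | kwargs` once instead of re-merging at every `integrate` call. The precedence is plain dict merging, where call-site values win; a minimal sketch with hypothetical values:

defaults = {"method": "tsit5", "steps": "adaptive"}  # self.integrate_kwargs
overrides = {"method": "rk45", "steps": 64}          # **kwargs passed by the caller
merged = defaults | overrides                        # right-hand operand wins on conflicts
assert merged == {"method": "rk45", "steps": 64}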

bayesflow/simulators/benchmark_simulators/lotka_volterra.py

Lines changed: 14 additions & 9 deletions
@@ -10,10 +10,10 @@ def __init__(
         X0: int = 30,
         Y0: int = 1,
         T: int | None = 20,
-        subsample: int = 10,
+        subsample: int | str = "original",
         flatten: bool = True,
         obs_noise: float = 0.1,
-        dt: float = None,
+        dt: float = 0.1,
         rng: np.random.Generator = None,
     ):
         """Lotka Volterra simulated benchmark.

@@ -27,14 +27,17 @@ def __init__(
             Initial number of predator species.
         T: int, optional, default: 20
             The duration (time horizon) of the simulation.
-        subsample: int or None, optional, default: 10
+        subsample: int, str or None, optional, default: 'original'
             The number of evenly spaced time points to return.
             If None, no subsampling will be performed and all T timepoints will be returned.
+            If 'original', the original benchmark task subsampling of 20 points is used.
         flatten: bool, optional, default: True
             A flag to indicate whether a 1D (`flatten=True`) or 2D (`flatten=False`)
             representation of the simulated data is returned.
         obs_noise: float, optional, default: 0.1
             The standard deviation of the log-normal likelihood.
+        dt: float, optional, default: 0.1
+            The time step size for the ODE solver.
         rng: np.random.Generator or None, optional, default: None
             An optional random number generator to use.
         """

@@ -95,21 +98,23 @@ def observation_model(self, params: np.ndarray) -> np.ndarray:
         # Unpack parameter vector into scalars
         alpha, beta, gamma, delta = params

-        # Prepate time vector between 0 and T of length T
-        t_vec = np.linspace(0, self.T, int(1 / self.dt))
+        # Prepare time vector between 0 and T with step size dt
+        t_vec = np.arange(0, self.T + self.dt, self.dt)

         # Integrate using scipy and retain only infected (2-nd dimension)
         pp = odeint(self._deriv, x0, t_vec, args=(alpha, beta, gamma, delta))

         # Subsample evenly the specified number of points, if specified
-        if self.subsample is not None:
+        if self.subsample == "original":
+            pp = pp[::21]
+        elif self.subsample is not None:
             pp = pp[:: (self.T // self.subsample)]

-        # Ensure minimum count is 0, which will later pass by log(0 + 1)
-        pp[pp < 0] = 0.0
+        # Ensure minimum count is 0
+        pp = np.clip(pp, a_min=1e-10, a_max=10000.0)

         # Add noise, decide whether to flatten and return
-        x = self.rng.lognormal(np.log1p(pp), sigma=self.obs_noise)
+        x = self.rng.lognormal(np.log(pp), sigma=self.obs_noise)
         if self.flatten:
             return x.flatten()
         return x
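
The clip floor is what makes the switch from `log1p` to `log` safe: `np.log(0)` is `-inf`, so flooring the ODE solution at a small positive value keeps the log-normal location parameter finite. A minimal sketch with hypothetical numbers:

import numpy as np

rng = np.random.default_rng(0)
pp = np.array([[30.0, 1.0], [0.0, 2.5]])      # raw ODE output; zeros can occur
pp = np.clip(pp, a_min=1e-10, a_max=10000.0)  # floor keeps np.log(pp) finite
x = rng.lognormal(np.log(pp), sigma=0.1)      # multiplicative noise around pp
assert np.isfinite(x).all()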
