Implement add_lift_test_measurements for MMMM (#1738)

TeemuSailynoja · williambdean · juanitorduz · web-flow · commit 70c1649f44cd · 2025-06-23T21:37:03.000Z
* Add 'scaling' to MMM class docstring.

* implement add_lift_test_measurements for MMMM.

* Fix time-varying media lift test implementation for MMMM.

* Test MMMM lift tests

* Add runnable documentation example.

* Fix imports in documentation example.

* add type hints

* fix typo

* add tests for different scaling configurations

* Fix lift test scaling using xarray and model scalers.

* tests lift test exceptions

* Fix test for dims=().

* Add mock sampling to lift test tests

* test with different dimensions of saturation parameters

* test for different dimensions of saturation parameters

* add docstring for the private functions

* fix typo

* run pre-commit

* Test transform functions.

---------

Co-authored-by: Will Dean <wd60622@gmail.com>
Co-authored-by: Will Dean <57733339+williambdean@users.noreply.github.com>
Co-authored-by: Juan Orduz <juanitorduz@gmail.com>
diff --git a/pymc_marketing/mmm/lift_test.py b/pymc_marketing/mmm/lift_test.py
@@ -525,6 +525,7 @@ def scale_channel_lift_measurements(
     channel_col: str,
     channel_columns: list[str],
     transform: Callable[[np.ndarray], np.ndarray],
+    dim_cols: list[str] | None = None,
 ) -> pd.DataFrame:
     """Scale the lift measurements for a specific channel.
 
@@ -542,20 +543,25 @@ def scale_channel_lift_measurements(
         a subset of these values.
     transform : Callable[[np.ndarray], np.ndarray]
         Function to scale the lift measurements.
+    dim_cols : list[str], optional
+        Column names for model dimensions.
 
     Returns
     -------
     pd.DataFrame
         DataFrame with the scaled lift measurements.
 
     """
-    # DataFrame with MultiIndex (RangeIndex, channel_col)
+    # Either [*dim_cols, channel_col] or, without dim_cols, just [channel_col].
+    index_cols: list[str] = (dim_cols if dim_cols else []) + [channel_col]
+    # DataFrame with MultiIndex (RangeIndex, *index_cols),
+    # where the dim_cols part of index_cols is optional.
     # columns: x, delta_x
-    df_original = df_lift_test.loc[:, [channel_col, "x", "delta_x"]].set_index(
-        channel_col, append=True
+    df_original = df_lift_test.loc[:, [*index_cols, "x", "delta_x"]].set_index(
+        index_cols, append=True
     )
 
-    # DataFrame with MultiIndex (RangeIndex, (x, delta_x))
+    # DataFrame with MultiIndex (RangeIndex, *dim_cols, (x, delta_x))
     # columns: channel_columns values
     df_to_rescale = (
         df_original.pipe(_swap_columns_and_last_index_level)
@@ -572,7 +578,7 @@ def scale_channel_lift_measurements(
     return (
         df_rescaled.pipe(_swap_columns_and_last_index_level)
         .loc[df_original.index, :]
-        .reset_index(channel_col)
+        .reset_index(index_cols)
     )
 
 
@@ -610,6 +616,7 @@ def scale_lift_measurements(
     channel_columns: list[str | int],
     channel_transform: Callable[[np.ndarray], np.ndarray],
     target_transform: Callable[[np.ndarray], np.ndarray],
+    dim_cols: list[str] | None = None,
 ) -> pd.DataFrame:
     """Scale the DataFrame with lift test results to be used in the model.
 
@@ -629,6 +636,8 @@ def scale_lift_measurements(
         Function to scale the lift measurements.
     target_transform : Callable[[np.ndarray], np.ndarray]
         Function to scale the target.
+    dim_cols : list[str], optional
+        Names of the columns for the model dimensions.
 
     Returns
     -------
@@ -643,6 +652,7 @@ def scale_lift_measurements(
         channel_col=channel_col,
         channel_columns=channel_columns,  # type: ignore
         transform=channel_transform,
+        dim_cols=dim_cols,
     )
     df_target_scaled = scale_target_for_lift_measurements(
         df_lift_test["delta_y"],
diff --git a/pymc_marketing/mmm/multidimensional.py b/pymc_marketing/mmm/multidimensional.py
@@ -17,7 +17,7 @@
 
 import json
 import warnings
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from copy import deepcopy
 from typing import Annotated, Any, Literal
 
@@ -46,6 +46,10 @@
 )
 from pymc_marketing.mmm.events import EventEffect
 from pymc_marketing.mmm.fourier import YearlyFourier
+from pymc_marketing.mmm.lift_test import (
+    add_lift_measurements_to_likelihood_from_saturation,
+    scale_lift_measurements,
+)
 from pymc_marketing.mmm.plot import MMMPlotSuite
 from pymc_marketing.mmm.scaling import Scaling, VariableScaling
 from pymc_marketing.mmm.tvp import infer_time_index
@@ -94,6 +98,9 @@ class MMM(ModelBuilder):
         Whether to use time-varying effects for media channels.
     dims : tuple | None
         Additional dimensions for the model.
+    scaling : Scaling | dict | None
+        Scaling methods to be used for the target variable and the marketing channels.
+        Defaults to max scaling for both.
     model_config : dict | None
         Configuration settings for the model.
     sampler_config : dict | None
@@ -1505,6 +1512,242 @@ def sample_posterior_predictive(
 
         return posterior_predictive_samples
 
+    def _make_channel_transform(
+        self, df_lift_test: pd.DataFrame
+    ) -> Callable[[np.ndarray], np.ndarray]:
+        """Create a function for transforming the channel data into the same scale as in the model.
+
+        Parameters
+        ----------
+        df_lift_test : pd.DataFrame
+            Lift test measurements.
+
+        Returns
+        -------
+        Callable[[np.ndarray], np.ndarray]
+            The function for scaling the channel data.
+        """
+        # The transformer will be passed a np.ndarray of data corresponding to this index.
+        index_cols = [*list(self.dims), "channel"]
+        # We reconstruct the input dataframe following the transformations performed within
+        # `lift_test.scale_channel_lift_measurements()`.
+        input_df = (
+            df_lift_test.loc[:, [*index_cols, "x", "delta_x"]]
+            .set_index(index_cols, append=True)
+            .stack()
+            .unstack(level=-2)
+            .reindex(self.channel_columns, axis=1)  # type: ignore
+            .fillna(0)
+        )
+
+        def channel_transform(input: np.ndarray) -> np.ndarray:
+            """Transform lift test channel data to the same scale as in the model."""
+            # reconstruct the df corresponding to the input np.ndarray.
+            reconstructed = (
+                pd.DataFrame(data=input, index=input_df.index, columns=input_df.columns)
+                .stack()
+                .unstack(level=-2)
+            )
+            return (
+                (
+                    # Scale the data according to the scaler coords.
+                    reconstructed.to_xarray() / self.scalers._channel
+                )
+                .to_dataframe()
+                .fillna(0)
+                .stack()
+                .unstack(level=-2)
+                .loc[input_df.index, :]
+                .values
+            )
+
+        # Return the closure; it yields the scaled data as a np.ndarray in the input index order.
+        return channel_transform
+
+    def _make_target_transform(
+        self, df_lift_test: pd.DataFrame
+    ) -> Callable[[np.ndarray], np.ndarray]:
+        """Create a function for transforming the target measurements into the same scale as in the model.
+
+        Parameters
+        ----------
+        df_lift_test : pd.DataFrame
+            Lift test measurements.
+
+        Returns
+        -------
+        Callable[[np.ndarray], np.ndarray]
+            The function for scaling the target data.
+        """
+        # These are in the same order as in the original lift test measurements.
+        index_cols = [*list(self.dims), "channel"]
+        input_idx = df_lift_test.set_index(index_cols, append=True).index
+
+        def target_transform(input: np.ndarray) -> np.ndarray:
+            """Transform lift test measurements and sigma to the same scale as in the model."""
+            # Reconstruct the input df column with the correct index.
+            reconstructed = pd.DataFrame(
+                data=input, index=input_idx, columns=["target"]
+            )
+            return (
+                (
+                    # Scale the measurements.
+                    reconstructed.to_xarray() / self.scalers._target
+                )
+                .to_dataframe()
+                .loc[input_idx, :]
+                .values
+            )
+
+        # Return the closure; it yields the scaled measurements as a np.ndarray
+        # in the input index order.
+        return target_transform
+
+    def add_lift_test_measurements(
+        self,
+        df_lift_test: pd.DataFrame,
+        dist: type[pm.Distribution] = pm.Gamma,
+        name: str = "lift_measurements",
+    ) -> None:
+        """Add lift tests to the model.
+
+        The model for the difference of a channel's saturation curve is created
+        from `x` and `x + delta_x` for each channel. This random variable is
+        then conditioned using the empirical lift, `delta_y`, and `sigma` of the lift test
+        with the specified distribution `dist`.
+
+        The pseudo-code for the lift test is as follows:
+
+        .. code-block:: python
+
+            model_estimated_lift = saturation_curve(x + delta_x) - saturation_curve(x)
+            empirical_lift = delta_y
+            dist(abs(model_estimated_lift), sigma=sigma, observed=abs(empirical_lift))
+
+
+        The model has to be built before adding the lift tests.
+
+        Parameters
+        ----------
+        df_lift_test : pd.DataFrame
+            DataFrame with lift test results with at least the following columns:
+                * `DIM_NAME`: dimension name. One column per dimension in `mmm.dims`.
+                * `channel`: channel name. Must be present in `channel_columns`.
+                * `x`: x axis value of the lift test.
+                * `delta_x`: change in x axis value of the lift test.
+                * `delta_y`: change in y axis value of the lift test.
+                * `sigma`: standard deviation of the lift test.
+        dist : pm.Distribution, optional
+            The distribution to use for the likelihood, by default pm.Gamma
+        name : str, optional
+            The name of the likelihood of the lift test contribution(s),
+            by default "lift_measurements". Name change required if calling
+            this method multiple times.
+
+        Raises
+        ------
+        RuntimeError
+            If the model has not been built yet.
+        KeyError
+            If the 'channel' column or any of the model dimensions is not present
+            in df_lift_test.
+
+        Examples
+        --------
+        Build the model first then add lift test measurements.
+
+        .. code-block:: python
+
+            import pandas as pd
+            import numpy as np
+
+            from pymc_marketing.mmm import GeometricAdstock, LogisticSaturation
+
+            from pymc_marketing.mmm.multidimensional import MMM
+
+            model = MMM(
+                date_column="date",
+                channel_columns=["x1", "x2"],
+                target_column="target",
+                adstock=GeometricAdstock(l_max=8),
+                saturation=LogisticSaturation(),
+                yearly_seasonality=2,
+                dims=("geo",),
+            )
+
+            X = pd.DataFrame(
+                {
+                    "date": np.tile(
+                        pd.date_range(start="2025-01-01", end="2025-05-01", freq="W"), 2
+                    ),
+                    "x1": np.random.rand(34),
+                    "x2": np.random.rand(34),
+                    "target": np.random.rand(34),
+                    "geo": 17 * ["FIN"] + 17 * ["SWE"],
+                }
+            )
+            y = X["target"]
+
+            model.build_model(X.drop(columns=["target"]), y)
+
+            df_lift_test = pd.DataFrame(
+                {
+                    "channel": ["x1", "x1"],
+                    "geo": ["FIN", "SWE"],
+                    "x": [1, 1],
+                    "delta_x": [0.1, 0.2],
+                    "delta_y": [0.1, 0.1],
+                    "sigma": [0.1, 0.1],
+                }
+            )
+
+            model.add_lift_test_measurements(df_lift_test)
+
+        """
+        if not hasattr(self, "model"):
+            raise RuntimeError(
+                "The model has not been built yet. Please, build the model first."
+            )
+
+        if "channel" not in df_lift_test.columns:
+            raise KeyError(
+                "The 'channel' column is required to map the lift measurements to the model."
+            )
+
+        for dim in self.dims:
+            if dim not in df_lift_test.columns:
+                raise KeyError(
+                    f"The {dim} column is required to map the lift measurements to the model."
+                )
+
+        # Function to scale "delta_y" and "sigma" to the same scale as the target in the model.
+        target_transform = self._make_target_transform(df_lift_test)
+
+        # Function to scale "x" and "delta_x" to the same scale as their respective channels.
+        channel_transform = self._make_channel_transform(df_lift_test)
+
+        df_lift_test_scaled = scale_lift_measurements(
+            df_lift_test=df_lift_test,
+            channel_col="channel",
+            channel_columns=self.channel_columns,  # type: ignore
+            channel_transform=channel_transform,
+            target_transform=target_transform,
+            dim_cols=list(self.dims),
+        )
+        # This is coupled with the name of the
+        # latent process Deterministic
+        time_varying_var_name = (
+            "media_latent_process" if self.time_varying_media else None
+        )
+        add_lift_measurements_to_likelihood_from_saturation(
+            df_lift_test=df_lift_test_scaled,
+            saturation=self.saturation,
+            time_varying_var_name=time_varying_var_name,
+            model=self.model,
+            dist=dist,
+            name=name,
+        )
+
 
 def create_sample_kwargs(
     sampler_config: dict[str, Any] | None,
diff --git a/tests/mmm/test_multidimensional.py b/tests/mmm/test_multidimensional.py