|
17 | 17 |
|
18 | 18 | import json |
19 | 19 | import warnings |
20 | | -from collections.abc import Sequence |
| 20 | +from collections.abc import Callable, Sequence |
21 | 21 | from copy import deepcopy |
22 | 22 | from typing import Annotated, Any, Literal |
23 | 23 |
|
|
46 | 46 | ) |
47 | 47 | from pymc_marketing.mmm.events import EventEffect |
48 | 48 | from pymc_marketing.mmm.fourier import YearlyFourier |
| 49 | +from pymc_marketing.mmm.lift_test import ( |
| 50 | + add_lift_measurements_to_likelihood_from_saturation, |
| 51 | + scale_lift_measurements, |
| 52 | +) |
49 | 53 | from pymc_marketing.mmm.plot import MMMPlotSuite |
50 | 54 | from pymc_marketing.mmm.scaling import Scaling, VariableScaling |
51 | 55 | from pymc_marketing.mmm.tvp import infer_time_index |
@@ -94,6 +98,9 @@ class MMM(ModelBuilder): |
94 | 98 | Whether to use time-varying effects for media channels. |
95 | 99 | dims : tuple | None |
96 | 100 | Additional dimensions for the model. |
| 101 | + scaling : Scaling | dict | None |
| 102 | + Scaling methods to be used for the target variable and the marketing channels. |
| 103 | + Defaults to max scaling for both. |
97 | 104 | model_config : dict | None |
98 | 105 | Configuration settings for the model. |
99 | 106 | sampler_config : dict | None |
@@ -1505,6 +1512,242 @@ def sample_posterior_predictive( |
1505 | 1512 |
|
1506 | 1513 | return posterior_predictive_samples |
1507 | 1514 |
|
def _make_channel_transform(
    self, df_lift_test: pd.DataFrame
) -> Callable[[np.ndarray], np.ndarray]:
    """Create a function for transforming the channel data into the same scale as in the model.

    Parameters
    ----------
    df_lift_test : pd.DataFrame
        Lift test measurements. Must contain one column per entry of
        ``self.dims`` plus the ``channel``, ``x`` and ``delta_x`` columns.

    Returns
    -------
    Callable[[np.ndarray], np.ndarray]
        The function for scaling the channel data. It expects an array laid
        out like the reshaped lift test frame built below (one column per
        entry of ``self.channel_columns``) and returns an array with the
        same row and column order.
    """
    index_cols = [*self.dims, "channel"]
    # The transformer will be passed a np.ndarray of data corresponding to this
    # frame: rows are (original index, *dims, "x"/"delta_x") and columns are the
    # model channels. We reconstruct it following the transformations performed
    # within `lift_test.scale_channel_lift_measurements()`.
    input_df = (
        df_lift_test.loc[:, [*index_cols, "x", "delta_x"]]
        .set_index(index_cols, append=True)
        .stack()
        .unstack(level=-2)
        .reindex(self.channel_columns, axis=1)  # type: ignore
        .fillna(0)
    )

    def channel_transform(data: np.ndarray) -> np.ndarray:
        """Transform lift test channel data to the same scale as in the model."""
        # Reconstruct the labelled frame for the raw array, then pivot so that
        # "channel" becomes an index level and "x"/"delta_x" become columns.
        reconstructed = (
            pd.DataFrame(data=data, index=input_df.index, columns=input_df.columns)
            .stack()
            .unstack(level=-2)
        )
        # Scale the data according to the scaler coords.
        scaled = reconstructed.to_xarray() / self.scalers._channel
        # Return the scaled data as a np.ndarray in the layout of the input.
        # The caller consumes the result positionally, so both the row order
        # and the column order are pinned to those of ``input_df``.
        return (
            scaled.to_dataframe()
            .fillna(0)
            .stack()
            .unstack(level=-2)
            .loc[input_df.index, input_df.columns]
            .to_numpy()
        )

    return channel_transform
| 1566 | + |
def _make_target_transform(
    self, df_lift_test: pd.DataFrame
) -> Callable[[np.ndarray], np.ndarray]:
    """Create a function for transforming the target measurements into the same scale as in the model.

    Parameters
    ----------
    df_lift_test : pd.DataFrame
        Lift test measurements. Must contain one column per entry of
        ``self.dims`` plus the ``channel`` column.

    Returns
    -------
    Callable[[np.ndarray], np.ndarray]
        The function for scaling the target data. It expects one value per
        row of ``df_lift_test`` and returns the scaled values in that same
        row order.
    """
    index_cols = [*self.dims, "channel"]
    # The index rows are in the same order as in the original lift test
    # measurements: (original index, *dims, channel).
    input_idx = df_lift_test.set_index(index_cols, append=True).index

    def target_transform(data: np.ndarray) -> np.ndarray:
        """Transform lift test measurements and sigma to the same scale as in the model."""
        # Reconstruct the input df column with the correct index.
        reconstructed = pd.DataFrame(
            data=data, index=input_idx, columns=["target"]
        )
        # Scale the measurements.
        scaled = reconstructed.to_xarray() / self.scalers._target
        # Return the scaled measurements as a np.ndarray corresponding to
        # the input index order.
        return scaled.to_dataframe().loc[input_idx, :].to_numpy()

    return target_transform
| 1605 | + |
def add_lift_test_measurements(
    self,
    df_lift_test: pd.DataFrame,
    dist: type[pm.Distribution] = pm.Gamma,
    name: str = "lift_measurements",
) -> None:
    """Add lift tests to the model.

    The model for the difference of a channel's saturation curve is created
    from `x` and `x + delta_x` for each channel. This random variable is
    then conditioned using the empirical lift, `delta_y`, and `sigma` of the lift test
    with the specified distribution `dist`.

    The pseudo-code for the lift test is as follows:

    .. code-block:: python

        model_estimated_lift = saturation_curve(x + delta_x) - saturation_curve(x)
        empirical_lift = delta_y
        dist(abs(model_estimated_lift), sigma=sigma, observed=abs(empirical_lift))


    The model has to be built before adding the lift tests.

    Parameters
    ----------
    df_lift_test : pd.DataFrame
        DataFrame with lift test results with at least the following columns:
            * `DIM_NAME`: dimension name. One column per dimension in `mmm.dims`.
            * `channel`: channel name. Must be present in `channel_columns`.
            * `x`: x axis value of the lift test.
            * `delta_x`: change in x axis value of the lift test.
            * `delta_y`: change in y axis value of the lift test.
            * `sigma`: standard deviation of the lift test.
    dist : type[pm.Distribution], optional
        The distribution to use for the likelihood, by default pm.Gamma
    name : str, optional
        The name of the likelihood of the lift test contribution(s),
        by default "lift_measurements". Name change required if calling
        this method multiple times.

    Raises
    ------
    RuntimeError
        If the model has not been built yet.
    KeyError
        If the 'channel' column, any of the model dimensions, or any of the
        `x`, `delta_x`, `delta_y` and `sigma` columns is not present
        in df_lift_test.

    Examples
    --------
    Build the model first then add lift test measurements.

    .. code-block:: python

        import pandas as pd
        import numpy as np

        from pymc_marketing.mmm import GeometricAdstock, LogisticSaturation

        from pymc_marketing.mmm.multidimensional import MMM

        model = MMM(
            date_column="date",
            channel_columns=["x1", "x2"],
            target_column="target",
            adstock=GeometricAdstock(l_max=8),
            saturation=LogisticSaturation(),
            yearly_seasonality=2,
            dims=("geo",),
        )

        X = pd.DataFrame(
            {
                "date": np.tile(
                    pd.date_range(start="2025-01-01", end="2025-05-01", freq="W"), 2
                ),
                "x1": np.random.rand(34),
                "x2": np.random.rand(34),
                "target": np.random.rand(34),
                "geo": 17 * ["FIN"] + 17 * ["SWE"],
            }
        )
        y = X["target"]

        model.build_model(X.drop(columns=["target"]), y)

        df_lift_test = pd.DataFrame(
            {
                "channel": ["x1", "x1"],
                "geo": ["FIN", "SWE"],
                "x": [1, 1],
                "delta_x": [0.1, 0.2],
                "delta_y": [0.1, 0.1],
                "sigma": [0.1, 0.1],
            }
        )

        model.add_lift_test_measurements(df_lift_test)

    """
    if not hasattr(self, "model"):
        raise RuntimeError(
            "The model has not been built yet. Please, build the model first."
        )

    if "channel" not in df_lift_test.columns:
        raise KeyError(
            "The 'channel' column is required to map the lift measurements to the model."
        )

    for dim in self.dims:
        if dim not in df_lift_test.columns:
            raise KeyError(
                f"The {dim} column is required to map the lift measurements to the model."
            )

    # Fail early with a readable message instead of letting a missing
    # measurement column surface as a pandas KeyError deep in the reshaping.
    missing_columns = [
        column
        for column in ("x", "delta_x", "delta_y", "sigma")
        if column not in df_lift_test.columns
    ]
    if missing_columns:
        raise KeyError(
            f"The lift test measurements are missing the required column(s): {missing_columns}."
        )

    # Function to scale "delta_y", and "sigma" to same scale as target in model.
    target_transform = self._make_target_transform(df_lift_test)

    # Function to scale "x" and "delta_x" to the same scale as their respective channels.
    channel_transform = self._make_channel_transform(df_lift_test)

    df_lift_test_scaled = scale_lift_measurements(
        df_lift_test=df_lift_test,
        channel_col="channel",
        channel_columns=self.channel_columns,  # type: ignore
        channel_transform=channel_transform,
        target_transform=target_transform,
        dim_cols=list(self.dims),
    )
    # This is coupled with the name of the
    # latent process Deterministic
    time_varying_var_name = (
        "media_latent_process" if self.time_varying_media else None
    )
    add_lift_measurements_to_likelihood_from_saturation(
        df_lift_test=df_lift_test_scaled,
        saturation=self.saturation,
        time_varying_var_name=time_varying_var_name,
        model=self.model,
        dist=dist,
        name=name,
    )
| 1750 | + |
1508 | 1751 |
|
1509 | 1752 | def create_sample_kwargs( |
1510 | 1753 | sampler_config: dict[str, Any] | None, |
|
0 commit comments