Bug: Sample posterior predictive raise error if dates overlap (#1778)

cetagostini · williambdean · web-flow · commit 900926b28874 · 2025-06-17T17:45:25.000-04:00
* Bug: Sample posterior predictive raise error if dates overlap

* Automatic casting based on target type

* William feedback

---------

Co-authored-by: Will Dean <57733339+williambdean@users.noreply.github.com>
diff --git a/pymc_marketing/mmm/multidimensional.py b/pymc_marketing/mmm/multidimensional.py
@@ -1159,6 +1159,43 @@ def create_deterministic(x: pt.TensorVariable) -> None:
                 observed=target_data_scaled,
             )
 
+    def _validate_date_overlap_with_include_last_observations(
+        self, X: pd.DataFrame, include_last_observations: bool
+    ) -> None:
+        """Raise if include_last_observations is requested for already-seen dates.
+
+        Training dates come from ``self.model_coords["date"]``; input dates from ``X``.
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            The input data for prediction.
+        include_last_observations : bool
+            Flag under validation. When False, the check is skipped entirely.
+
+        Raises
+        ------
+        ValueError
+            If the flag is True and at least one input date is also a training date.
+        """
+        if not include_last_observations:
+            return
+
+        # Timestamp sets on both sides so membership compares like with like.
+        seen_in_training = set(pd.to_datetime(self.model_coords["date"]))
+        requested = set(pd.to_datetime(X[self.date_column].unique()))
+        shared = seen_in_training & requested
+        if not shared:
+            return
+
+        # Sorting the ISO date strings keeps the message deterministic.
+        listing = ", ".join(sorted(str(stamp.date()) for stamp in shared))
+        raise ValueError(
+            f"Cannot use include_last_observations=True when input dates overlap with training dates. "
+            f"Overlapping dates found: {listing}. "
+            f"Either set include_last_observations=False or use input dates that don't overlap with training data."
+        )
+
     def _posterior_predictive_data_transformation(
         self,
         X: pd.DataFrame,
@@ -1181,6 +1218,11 @@ def _posterior_predictive_data_transformation(
         xr.Dataset
             The transformed data in xarray format.
         """
+        # Validate that include_last_observations is not used with overlapping dates
+        self._validate_date_overlap_with_include_last_observations(
+            X, include_last_observations
+        )
+
         dataarrays = []
         if include_last_observations:
             last_obs = self.xarray_dataset.isel(date=slice(-self.adstock.l_max, None))
@@ -1220,13 +1262,15 @@ def _posterior_predictive_data_transformation(
             )
         else:
             # Return empty xarray with same dimensions as the target but full of zeros
+            # Use the same dtype as the existing target data to avoid dtype mismatches
+            target_dtype = self.xarray_dataset._target.dtype
             y_xarray = xr.DataArray(
                 np.zeros(
                     (
                         X[self.date_column].nunique(),
                         *[len(self.xarray_dataset.coords[dim]) for dim in self.dims],
                     ),
-                    dtype=np.int32,
+                    dtype=target_dtype,
                 ),
                 dims=("date", *self.dims),
                 coords={
diff --git a/tests/mmm/test_multidimensional.py b/tests/mmm/test_multidimensional.py
@@ -418,6 +418,132 @@ def test_sample_posterior_predictive_same_data(single_dim_data, mock_pymc_sample
     )
 
 
+def test_sample_posterior_predictive_same_data_with_include_last_observations(
+    single_dim_data, mock_pymc_sample
+):
+    """Predicting on the training frame with include_last_observations=True must fail.
+
+    The input is exactly the frame the model was fitted on, so its dates
+    overlap the training dates and a ValueError with the overlap message
+    is expected.
+    """
+    X, y = single_dim_data
+    X_train = X.iloc[:-5]
+    y_train = y.iloc[:-5]
+
+    # Model under test: geometric adstock (l_max=2) with logistic saturation.
+    model = MMM(
+        date_column="date",
+        target_column="target",
+        channel_columns=["channel_1", "channel_2", "channel_3"],
+        adstock=GeometricAdstock(l_max=2),
+        saturation=LogisticSaturation(),
+    )
+
+    model.build_model(X_train, y_train)
+    model.fit(X_train, y_train, draws=200, tune=100, chains=1, random_seed=123)
+
+    # pytest.raises treats `match` as a regex searched in the message.
+    expected_message = (
+        "Cannot use include_last_observations=True when input dates overlap with training dates"
+    )
+    predict_kwargs = {
+        "include_last_observations": True,  # the flag under test
+        "extend_idata": False,
+        "random_seed": 123,
+    }
+
+    # Feed the training frame straight back in as the prediction input.
+    with pytest.raises(ValueError, match=expected_message):
+        model.sample_posterior_predictive(X_train, **predict_kwargs)
+
+
+def test_sample_posterior_predictive_partial_overlap_with_include_last_observations(
+    single_dim_data, mock_pymc_sample
+):
+    """A prediction window straddling the train/future boundary must be rejected.
+
+    Sharing only some dates with the training data is enough for
+    include_last_observations=True to raise ValueError.
+    """
+    n_holdout = 5
+    X, y = single_dim_data
+    X_train, y_train = X.iloc[:-n_holdout], y.iloc[:-n_holdout]
+
+    # Model under test: geometric adstock (l_max=2) with logistic saturation.
+    model = MMM(
+        date_column="date",
+        target_column="target",
+        channel_columns=["channel_1", "channel_2", "channel_3"],
+        adstock=GeometricAdstock(l_max=2),
+        saturation=LogisticSaturation(),
+    )
+
+    model.build_model(X_train, y_train)
+    model.fit(X_train, y_train, draws=200, tune=100, chains=1, random_seed=123)
+
+    # Rows -8..-6 fall inside the training slice (it ends at row -6);
+    # rows -5..-3 lie beyond it, so the window mixes seen and unseen rows.
+    straddling_window = X.iloc[-8:-2]
+
+    expected_message = (
+        "Cannot use include_last_observations=True when input dates overlap with training dates"
+    )
+    predict_kwargs = {
+        "include_last_observations": True,
+        "extend_idata": False,
+        "random_seed": 123,
+    }
+
+    # Partial overlap is still overlap: the call has to raise.
+    with pytest.raises(ValueError, match=expected_message):
+        model.sample_posterior_predictive(straddling_window, **predict_kwargs)
+
+
+def test_sample_posterior_predictive_no_overlap_with_include_last_observations(
+    single_dim_data, mock_pymc_sample
+):
+    """include_last_observations=True succeeds when the input rows are all new.
+
+    The model is fitted on all but the last five rows and asked to predict
+    on those five rows. The call must not raise, and the returned ``date``
+    coordinate must equal the dates that were passed in.
+    """
+    X, y = single_dim_data
+    X_train = X.iloc[:-5]
+    y_train = y.iloc[:-5]
+    X_new = X.iloc[-5:]  # last five rows, excluded from fitting
+
+    # Build and fit the model
+    adstock = GeometricAdstock(l_max=2)
+    saturation = LogisticSaturation()
+
+    mmm = MMM(
+        date_column="date",
+        target_column="target",
+        channel_columns=["channel_1", "channel_2", "channel_3"],
+        adstock=adstock,
+        saturation=saturation,
+    )
+
+    mmm.build_model(X_train, y_train)
+    mmm.fit(X_train, y_train, draws=200, tune=100, chains=1, random_seed=123)
+
+    # Let any exception propagate: pytest reports it with the full traceback.
+    result = mmm.sample_posterior_predictive(
+        X_new,
+        include_last_observations=True,
+        extend_idata=False,
+        random_seed=123,
+    )
+
+    # Only the requested dates are expected back; the equality below would
+    # fail if prepended training rows were still present in the output.
+    # NOTE(review): presumably the l_max history rows are sliced off internally.
+    expected_dates = X_new["date"].values
+    np.testing.assert_array_equal(result.coords["date"].values, expected_dates)
+
+
 @pytest.fixture
 def df_events() -> pd.DataFrame:
     return pd.DataFrame(