From 4d653a9a8d9c21952d242676ce26633cd4e0282b Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Fri, 23 Feb 2024 23:03:16 +0200 Subject: [PATCH 01/24] add group by variables to base forecast transformer --- .../forecasting/base_forecast_transformers.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/feature_engine/timeseries/forecasting/base_forecast_transformers.py b/feature_engine/timeseries/forecasting/base_forecast_transformers.py index d6e9fa30f..5f3fb3279 100644 --- a/feature_engine/timeseries/forecasting/base_forecast_transformers.py +++ b/feature_engine/timeseries/forecasting/base_forecast_transformers.py @@ -51,6 +51,9 @@ class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOu {drop_original} + group_by_variables: str, list of str, default=None + variable of list of variables to create lag features based on. + Attributes ---------- {feature_names_in_} @@ -64,6 +67,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, missing_values: str = "raise", drop_original: bool = False, + group_by_variables: Optional[Union[str, List[str]]] = None, ) -> None: if missing_values not in ["raise", "ignore"]: @@ -78,9 +82,26 @@ def __init__( f"Got {drop_original} instead." ) + # check validity if group by variables passed + if group_by_variables: + # check group by variables data-types + if not ( + isinstance(group_by_variables, str) + or isinstance(group_by_variables, list) + ): + raise ValueError( + "group_by_variables must be an string or a list of strings. " + f"Got {group_by_variables} instead." + ) + # check if passed list has duplicates. 
+ if isinstance(group_by_variables, list): + if len(set(group_by_variables)) != len(group_by_variables): + raise ValueError(f"group_by_variables contains duplicate values") + self.variables = _check_variables_input_value(variables) self.missing_values = missing_values self.drop_original = drop_original + self.group_by_variables = group_by_variables def _check_index(self, X: pd.DataFrame): """ @@ -165,6 +186,18 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if self.missing_values == "raise": self._check_na_and_inf(X) + if self.group_by_variables: + # check if input group by variables is in input dataframe variables. + # set of differences between input group by variables and dataframe variables + # valid if no differences between both + if isinstance(self.group_by_variables, list): + diff = set(self.group_by_variables).difference(X.columns.tolist()) + if len(diff) != 0: + raise ValueError(f"{list(diff)} not exist in dataframe") + else: + if self.group_by_variables not in X.columns.tolist(): + raise ValueError(f"{list(diff)} not exists in dataframe") + self._get_feature_names_in(X) return self From 4e9d8499bbd632956ebdf4ea0bb70cb84cf95889 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Fri, 23 Feb 2024 23:04:18 +0200 Subject: [PATCH 02/24] add group by variables to lag_features --- .../timeseries/forecasting/lag_features.py | 81 ++++++++++++++----- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 19822ea5f..6a50b621c 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -1,7 +1,7 @@ # Authors: Morgan Sell # License: BSD 3 clause -from typing import List, Union +from typing import List, Union, Optional import pandas as pd @@ -74,6 +74,9 @@ class LagFeatures(BaseForecastTransformer): {drop_original} + group_by_variables: str, list of str, 
default=None + variable of list of variables to create lag features based on. + Attributes ---------- variables_: @@ -117,6 +120,27 @@ class LagFeatures(BaseForecastTransformer): 2 2022-09-20 3 8 2.0 7.0 1.0 6.0 3 2022-09-21 4 9 3.0 8.0 2.0 7.0 4 2022-09-22 5 10 4.0 9.0 3.0 8.0 + + create lags based on other variables. + >>> import pandas as pd + >>> from feature_engine.timeseries.forecasting import LagFeatures + >>> X = pd.DataFrame(dict(date = ["2022-09-18", + >>> "2022-09-19", + >>> "2022-09-20", + >>> "2022-09-21", + >>> "2022-09-22"], + >>> x1 = [1,2,3,4,5], + >>> x2 = [6,7,8,9,10], + >>> x3 = ['a','b','a','b','a'] + >>> )) + >>> lf = LagFeatures(periods=[1,2], group_by_variables='x3') + >>> lf.fit_transform(X) + date x1 x2 x3 x1_lag_1 x2_lag_1 x1_lag_2 x2_lag_2 + 0 2022-09-18 1 6 a NaN NaN NaN NaN + 1 2022-09-19 2 7 b NaN NaN NaN NaN + 2 2022-09-20 3 8 a 1.0 6.0 NaN NaN + 3 2022-09-21 4 9 b 2.0 7.0 NaN NaN + 4 2022-09-22 5 10 a 3.0 8.0 1.0 6.0 """ def __init__( @@ -127,6 +151,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, + group_by_variables: Optional[Union[str, List[str]]] = None, ) -> None: if not ( @@ -151,7 +176,7 @@ def __init__( "sort_index takes values True and False." f"Got {sort_index} instead." 
) - super().__init__(variables, missing_values, drop_original) + super().__init__(variables, missing_values, drop_original, group_by_variables) self.periods = periods self.freq = freq @@ -180,35 +205,55 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if isinstance(self.freq, list): df_ls = [] for fr in self.freq: - tmp = X[self.variables_].shift( - freq=fr, - axis=0, - ) + if self.group_by_variables: + tmp = X.groupby(self.group_by_variables)[self.variables_].shift( + freq=fr, + ) + else: + tmp = X[self.variables_].shift( + freq=fr, + axis=0, + ) df_ls.append(tmp) tmp = pd.concat(df_ls, axis=1) else: - tmp = X[self.variables_].shift( - freq=self.freq, - axis=0, - ) + if self.group_by_variables: + tmp = X.groupby(self.group_by_variables)[self.variables_].shift( + freq=self.freq, + ) + else: + tmp = X[self.variables_].shift( + freq=self.freq, + axis=0, + ) else: if isinstance(self.periods, list): df_ls = [] for pr in self.periods: - tmp = X[self.variables_].shift( - periods=pr, - axis=0, - ) + if self.group_by_variables: + tmp = X.groupby(self.group_by_variables)[self.variables_].shift( + periods=pr, + ) + else: + tmp = X[self.variables_].shift( + periods=pr, + axis=0, + ) df_ls.append(tmp) tmp = pd.concat(df_ls, axis=1) else: - tmp = X[self.variables_].shift( - periods=self.periods, - axis=0, - ) + if self.group_by_variables: + tmp = X.groupby(self.group_by_variables)[self.variables_].shift( + periods=self.periods, + ) + else: + tmp = X[self.variables_].shift( + periods=self.periods, + axis=0, + ) tmp.columns = self._get_new_features_name() From 7f403918215869e0e246d1179ce34ec6ba9980a5 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sun, 25 Feb 2024 19:54:13 +0200 Subject: [PATCH 03/24] add group by window features --- .../timeseries/forecasting/window_features.py | 73 +++++++++++++++---- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/feature_engine/timeseries/forecasting/window_features.py 
b/feature_engine/timeseries/forecasting/window_features.py index 3cb89ccfa..8a8937882 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Union +from typing import Callable, List, Union, Optional import pandas as pd @@ -98,6 +98,9 @@ class WindowFeatures(BaseForecastTransformer): {drop_original} + group_by_variables: str, list of str, default=None + variable of list of variables to create lag features based on. + Attributes ---------- variables_: @@ -156,6 +159,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, + group_by_variables: Optional[Union[str, List[str]]] = None, ) -> None: if isinstance(window, list) and len(window) != len(set(window)): @@ -176,7 +180,7 @@ def __init__( f"periods must be a positive integer. Got {periods} instead." ) - super().__init__(variables, missing_values, drop_original) + super().__init__(variables, missing_values, drop_original, group_by_variables) self.window = window self.min_periods = min_periods @@ -205,22 +209,34 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if isinstance(self.window, list): df_ls = [] for win in self.window: + if self.group_by_variables: + tmp = self._agg_window_features( + grouped_df=X.groupby(self.group_by_variables), + window=win, + ) + else: + tmp = ( + X[self.variables_] + .rolling(window=win) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) + df_ls.append(tmp) + tmp = pd.concat(df_ls, axis=1) + + else: + if self.group_by_variables: + tmp = self._agg_window_features( + grouped_df=X.groupby(self.group_by_variables), + window=self.window, + ) + else: tmp = ( X[self.variables_] - .rolling(window=win) + .rolling(window=self.window) .agg(self.functions) .shift(periods=self.periods, freq=self.freq) ) - df_ls.append(tmp) - tmp = pd.concat(df_ls, axis=1) - - else: - tmp = ( - X[self.variables_] 
- .rolling(window=self.window) - .agg(self.functions) - .shift(periods=self.periods, freq=self.freq) - ) tmp.columns = self._get_new_features_name() @@ -254,3 +270,34 @@ def _get_new_features_name(self) -> List: ] return feature_names + + def _agg_window_features( + self, + grouped_df: pd.core.groupby.generic.DataFrameGroupBy, + window: int + ) -> Union[pd.Series, pd.DataFrame]: + """generate window features based on groups + Parameters + ---------- + grouped_df : pd.core.groupby.generic.DataFrameGroupBy + dataframe of groups + + window: int + Size of the moving window + + Returns + ------- + Union[pd.Series, pd.DataFrame] + returned window features + """ + tmp_data = [] + for _, group in grouped_df: + tmp = ( + group[self.variables_] + .rolling(window=window) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) + tmp_data.append(tmp) + tmp = pd.concat(tmp_data).sort_index() + return tmp From b476748a83881a7cb08a7044afd25a34753a8187 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sun, 25 Feb 2024 20:32:11 +0200 Subject: [PATCH 04/24] add group by expanding window features --- .../forecasting/expanding_window_features.py | 80 +++++++++++++++++-- 1 file changed, 72 insertions(+), 8 deletions(-) diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index 6a2e5037c..8057f47b7 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import List +from typing import List, Union, Optional import pandas as pd @@ -139,6 +139,37 @@ class ExpandingWindowFeatures(BaseForecastTransformer): 2 2022-09-20 3 8 1.5 6.5 3 2022-09-21 4 9 2.0 7.0 4 2022-09-22 5 10 2.5 7.5 + + create expanding window features based on other variables. 
+ >>> import pandas as pd + >>> from feature_engine.timeseries.forecasting import ExpandingWindowFeatures + >>> X = pd.DataFrame(dict(date = ["2022-09-18", + >>> "2022-09-19", + >>> "2022-09-20", + >>> "2022-09-21", + >>> "2022-09-22", + >>> "2022-09-18", + >>> "2022-09-19", + >>> "2022-09-20", + >>> "2022-09-21", + >>> "2022-09-22"], + >>> x1 = [1,2,3,4,5, 3,5,6,8,11], + >>> x2 = [6,7,8,9,10, 2,9,10,15,2], + >>> x3=['a','a','a','a','a', 'b','b','b','b','b'] + >>> )) + >>> ewf = ExpandingWindowFeatures(group_by_variables='x3') + >>> ewf.fit_transform(X) + date x1 x2 x3 x1_expanding_mean x2_expanding_mean + 0 2022-09-18 1 6 a NaN NaN + 1 2022-09-19 2 7 a 1.000000 6.0 + 2 2022-09-20 3 8 a 1.500000 6.5 + 3 2022-09-21 4 9 a 2.000000 7.0 + 4 2022-09-22 5 10 a 2.500000 7.5 + 5 2022-09-18 3 2 b NaN NaN + 6 2022-09-19 5 9 b 3.000000 2.0 + 7 2022-09-20 6 10 b 4.000000 5.5 + 8 2022-09-21 8 15 b 4.666667 7.0 + 9 2022-09-22 11 2 b 5.500000 9.0 """ def __init__( @@ -151,6 +182,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, + group_by_variables: Optional[Union[str, List[str]]] = None, ) -> None: if not isinstance(functions, (str, list)) or not all( @@ -168,7 +200,7 @@ def __init__( f"periods must be a non-negative integer. Got {periods} instead." ) - super().__init__(variables, missing_values, drop_original) + super().__init__(variables, missing_values, drop_original, group_by_variables) self.min_periods = min_periods self.functions = functions @@ -193,12 +225,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # Common dataframe checks and setting up. 
X = self._check_transform_input_and_state(X) - tmp = ( - X[self.variables_] - .expanding(min_periods=self.min_periods) - .agg(self.functions) - .shift(periods=self.periods, freq=self.freq) - ) + if self.group_by_variables: + tmp = self._agg_expanding_window_features( + grouped_df=X.groupby(self.group_by_variables) + ) + else: + tmp = ( + X[self.variables_] + .expanding(min_periods=self.min_periods) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) tmp.columns = self._get_new_features_name() @@ -224,3 +261,30 @@ def _get_new_features_name(self) -> List: ] return feature_names + + def _agg_expanding_window_features( + self, + grouped_df: pd.core.groupby.generic.DataFrameGroupBy, + ) -> Union[pd.Series, pd.DataFrame]: + """generate expanding window features based on groups + Parameters + ---------- + grouped_df : pd.core.groupby.generic.DataFrameGroupBy + dataframe of groups + + Returns + ------- + Union[pd.Series, pd.DataFrame] + returned expanding window features + """ + tmp_data = [] + for _, group in grouped_df: + tmp = ( + group[self.variables_] + .expanding(min_periods=self.min_periods) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) + tmp_data.append(tmp) + tmp = pd.concat(tmp_data).sort_index() + return tmp From 02c59bdf9b1f49dfc9ae63a3192fbe524b0a29d0 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sun, 25 Feb 2024 22:10:47 +0200 Subject: [PATCH 05/24] add test cases of groupby timeseries features --- .../test_expanding_window_features.py | 135 ++++++++++++++++++ .../test_forecasting/test_lag_features.py | 134 +++++++++++++++++ .../test_forecasting/test_window_features.py | 134 +++++++++++++++++ 3 files changed, 403 insertions(+) diff --git a/tests/test_time_series/test_forecasting/test_expanding_window_features.py b/tests/test_time_series/test_forecasting/test_expanding_window_features.py index cb33ea8e1..e5cf99670 100644 --- a/tests/test_time_series/test_forecasting/test_expanding_window_features.py +++ 
b/tests/test_time_series/test_forecasting/test_expanding_window_features.py @@ -1,6 +1,7 @@ import numpy as np import pytest from pandas.testing import assert_frame_equal +import pandas as pd from feature_engine.timeseries.forecasting import ExpandingWindowFeatures @@ -428,3 +429,137 @@ def test_expanding_window_raises_when_periods_negative(): ValueError, match="periods must be a non-negative integer. Got -1 instead." ): ExpandingWindowFeatures(periods=-1) + + +def test_correct_groupby_expanding_window_when_using_periods(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_expanding_mean": [ + np.nan, + 31.31, + 31.41, + 31.656666666666666, + 31.84, + 31.996, + 32.08, + 32.142857142857146, + 32.21, + 32.382222222222225, + np.nan, + 34.08, + 33.89, + 33.89, 
+ 33.9275, + ], + "irradiation_expanding_mean": [ + np.nan, + 0.51, + 0.65, + 0.65, + 0.6775, + 0.626, + 0.6033333333333334, + 0.5985714285714285, + 0.59375, + 0.61, + np.nan, + 0.47, + 0.505, + 0.47000000000000003, + 0.465, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by_variabels to color + transformer = ExpandingWindowFeatures( + variables=["ambient_temp", "irradiation"], group_by_variables="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) diff --git a/tests/test_time_series/test_forecasting/test_lag_features.py b/tests/test_time_series/test_forecasting/test_lag_features.py index 8ea349778..4bfb72848 100644 --- a/tests/test_time_series/test_forecasting/test_lag_features.py +++ b/tests/test_time_series/test_forecasting/test_lag_features.py @@ -233,3 +233,137 @@ def test_sort_index(df_time): A = Xs[transformer.variables_].iloc[0:4].values B = X_tr[transformer._get_new_features_name()].iloc[1:5].values assert (A == B).all() + + +def test_correct_groupby_lag_when_using_periods(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, 
+ 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_lag_3": [ + np.nan, + np.nan, + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + np.nan, + np.nan, + np.nan, + 34.08, + 33.7, + ], + "irradiation_lag_3": [ + np.nan, + np.nan, + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + np.nan, + np.nan, + np.nan, + 0.47, + 0.54, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by_variabels to color + transformer = LagFeatures( + variables=["ambient_temp", "irradiation"], periods=3, group_by_variables="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) diff --git a/tests/test_time_series/test_forecasting/test_window_features.py b/tests/test_time_series/test_forecasting/test_window_features.py index a03259b7e..50b373e72 100644 --- a/tests/test_time_series/test_forecasting/test_window_features.py +++ b/tests/test_time_series/test_forecasting/test_window_features.py @@ -454,3 +454,137 @@ def test_sort_index(df_time): assert_frame_equal( df_tr[transformer.variables_], Xs[transformer.variables_].sort_index() ) + + +def test_correct_groupby_window_when_using_periods(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + 
pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_window_3_mean": [ + np.nan, + np.nan, + np.nan, + 31.656666666666666, + 32.016666666666666, + 32.38666666666666, + 32.50333333333333, + 32.54666666666667, + 32.56666666666667, + 32.98666666666667, + np.nan, + np.nan, + np.nan, + 33.89, + 33.876666666666665, + ], + "irradiation_window_3_mean": [ + np.nan, + np.nan, + np.nan, + 0.65, + 0.7333333333333334, + 0.61, + 0.5566666666666668, + 0.49333333333333335, + 0.54, + 0.6233333333333334, + np.nan, + np.nan, + np.nan, + 0.47000000000000003, + 0.4633333333333334, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by_variabels to color + transformer = WindowFeatures( + variables=["ambient_temp", "irradiation"], window=3, group_by_variables="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) From 0dd92cc6626ff93c2172f89d447afc931026c3fa Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sun, 25 Feb 2024 22:35:56 +0200 Subject: [PATCH 06/24] ensure code style tests --- .../timeseries/forecasting/base_forecast_transformers.py | 4 +--- 
.../timeseries/forecasting/expanding_window_features.py | 2 +- feature_engine/timeseries/forecasting/lag_features.py | 2 +- feature_engine/timeseries/forecasting/window_features.py | 8 +++----- .../test_forecasting/test_expanding_window_features.py | 2 +- 5 files changed, 7 insertions(+), 11 deletions(-) diff --git a/feature_engine/timeseries/forecasting/base_forecast_transformers.py b/feature_engine/timeseries/forecasting/base_forecast_transformers.py index 5f3fb3279..db9b196fc 100644 --- a/feature_engine/timeseries/forecasting/base_forecast_transformers.py +++ b/feature_engine/timeseries/forecasting/base_forecast_transformers.py @@ -96,7 +96,7 @@ def __init__( # check if passed list has duplicates. if isinstance(group_by_variables, list): if len(set(group_by_variables)) != len(group_by_variables): - raise ValueError(f"group_by_variables contains duplicate values") + raise ValueError("group_by_variables contains duplicate values") self.variables = _check_variables_input_value(variables) self.missing_values = missing_values @@ -188,8 +188,6 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if self.group_by_variables: # check if input group by variables is in input dataframe variables. 
- # set of differences between input group by variables and dataframe variables - # valid if no differences between both if isinstance(self.group_by_variables, list): diff = set(self.group_by_variables).difference(X.columns.tolist()) if len(diff) != 0: diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index 8057f47b7..4163a8be1 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import List, Union, Optional +from typing import List, Optional, Union import pandas as pd diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 6a50b621c..399c40a0e 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -1,7 +1,7 @@ # Authors: Morgan Sell # License: BSD 3 clause -from typing import List, Union, Optional +from typing import List, Optional, Union import pandas as pd diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py index 8a8937882..adf8257f7 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Union, Optional +from typing import Callable, List, Optional, Union import pandas as pd @@ -270,11 +270,9 @@ def _get_new_features_name(self) -> List: ] return feature_names - + def _agg_window_features( - self, - grouped_df: pd.core.groupby.generic.DataFrameGroupBy, - window: int + self, grouped_df: pd.core.groupby.generic.DataFrameGroupBy, window: int ) -> Union[pd.Series, pd.DataFrame]: """generate window features based on groups Parameters diff --git 
a/tests/test_time_series/test_forecasting/test_expanding_window_features.py b/tests/test_time_series/test_forecasting/test_expanding_window_features.py index e5cf99670..6fa89eacd 100644 --- a/tests/test_time_series/test_forecasting/test_expanding_window_features.py +++ b/tests/test_time_series/test_forecasting/test_expanding_window_features.py @@ -1,7 +1,7 @@ import numpy as np +import pandas as pd import pytest from pandas.testing import assert_frame_equal -import pandas as pd from feature_engine.timeseries.forecasting import ExpandingWindowFeatures From 47de2d6b9a6821b8a268f68504f3c3f17a42c487 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sun, 25 Feb 2024 23:18:43 +0200 Subject: [PATCH 07/24] fixing typehint errors --- feature_engine/selection/drop_psi_features.py | 4 +-- .../forecasting/base_forecast_transformers.py | 33 ++++++++----------- .../forecasting/expanding_window_features.py | 21 ++++-------- .../timeseries/forecasting/lag_features.py | 21 ++++-------- .../timeseries/forecasting/window_features.py | 33 ++++++++----------- .../test_check_estimator_forecasting.py | 10 +++--- 6 files changed, 47 insertions(+), 75 deletions(-) diff --git a/feature_engine/selection/drop_psi_features.py b/feature_engine/selection/drop_psi_features.py index e425f674e..aed8a4f21 100644 --- a/feature_engine/selection/drop_psi_features.py +++ b/feature_engine/selection/drop_psi_features.py @@ -1,5 +1,5 @@ import datetime -from typing import List, Union +from typing import List, Union, Dict import numpy as np import pandas as pd @@ -475,7 +475,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None): threshold_cat = self.threshold # Compute the PSI by looping over the features - self.psi_values_ = {} + self.psi_values_: Dict = {} self.features_to_drop_ = [] # Compute PSI for numerical features diff --git a/feature_engine/timeseries/forecasting/base_forecast_transformers.py b/feature_engine/timeseries/forecasting/base_forecast_transformers.py index db9b196fc..80a82a520 100644 
--- a/feature_engine/timeseries/forecasting/base_forecast_transformers.py +++ b/feature_engine/timeseries/forecasting/base_forecast_transformers.py @@ -5,30 +5,21 @@ from sklearn.utils.validation import check_is_fitted from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) +from feature_engine._check_init_parameters.check_variables import \ + _check_variables_input_value from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _n_features_in_docstring, -) + _feature_names_in_docstring, _n_features_in_docstring) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _drop_original_docstring, - _missing_values_docstring, -) + _drop_original_docstring, _missing_values_docstring) from feature_engine._docstrings.methods import _fit_not_learn_docstring from feature_engine._docstrings.substitute import Substitution -from feature_engine.dataframe_checks import ( - _check_contains_inf, - _check_contains_na, - _check_X_matches_training_df, - check_X, -) +from feature_engine.dataframe_checks import (_check_contains_inf, + _check_contains_na, + _check_X_matches_training_df, + check_X) from feature_engine.tags import _return_tags -from feature_engine.variable_handling import ( - check_numerical_variables, - find_numerical_variables, -) +from feature_engine.variable_handling import (check_numerical_variables, + find_numerical_variables) @Substitution( @@ -194,7 +185,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): raise ValueError(f"{list(diff)} not exist in dataframe") else: if self.group_by_variables not in X.columns.tolist(): - raise ValueError(f"{list(diff)} not exists in dataframe") + raise ValueError( + f"{self.group_by_variables} not exists in dataframe" + ) self._get_feature_names_in(X) diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py 
b/feature_engine/timeseries/forecasting/expanding_window_features.py index 4163a8be1..a1981c28c 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -8,22 +8,15 @@ import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _n_features_in_docstring, -) + _feature_names_in_docstring, _n_features_in_docstring) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _drop_original_docstring, - _missing_values_docstring, - _variables_numerical_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_not_learn_docstring, - _fit_transform_docstring, -) + _drop_original_docstring, _missing_values_docstring, + _variables_numerical_docstring) +from feature_engine._docstrings.methods import (_fit_not_learn_docstring, + _fit_transform_docstring) from feature_engine._docstrings.substitute import Substitution -from feature_engine.timeseries.forecasting.base_forecast_transformers import ( - BaseForecastTransformer, -) +from feature_engine.timeseries.forecasting.base_forecast_transformers import \ + BaseForecastTransformer @Substitution( diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 399c40a0e..5333491a4 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -6,22 +6,15 @@ import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _n_features_in_docstring, -) + _feature_names_in_docstring, _n_features_in_docstring) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _drop_original_docstring, - _missing_values_docstring, - _variables_numerical_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_not_learn_docstring, - _fit_transform_docstring, -) + 
_drop_original_docstring, _missing_values_docstring, + _variables_numerical_docstring) +from feature_engine._docstrings.methods import (_fit_not_learn_docstring, + _fit_transform_docstring) from feature_engine._docstrings.substitute import Substitution -from feature_engine.timeseries.forecasting.base_forecast_transformers import ( - BaseForecastTransformer, -) +from feature_engine.timeseries.forecasting.base_forecast_transformers import \ + BaseForecastTransformer @Substitution( diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py index adf8257f7..def4c7706 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -3,22 +3,15 @@ import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _n_features_in_docstring, -) + _feature_names_in_docstring, _n_features_in_docstring) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _drop_original_docstring, - _missing_values_docstring, - _variables_numerical_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_not_learn_docstring, - _fit_transform_docstring, -) + _drop_original_docstring, _missing_values_docstring, + _variables_numerical_docstring) +from feature_engine._docstrings.methods import (_fit_not_learn_docstring, + _fit_transform_docstring) from feature_engine._docstrings.substitute import Substitution -from feature_engine.timeseries.forecasting.base_forecast_transformers import ( - BaseForecastTransformer, -) +from feature_engine.timeseries.forecasting.base_forecast_transformers import \ + BaseForecastTransformer @Substitution( @@ -212,7 +205,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if self.group_by_variables: tmp = self._agg_window_features( grouped_df=X.groupby(self.group_by_variables), - window=win, + win=win, ) else: tmp = ( @@ -228,7 +221,7 @@ def 
transform(self, X: pd.DataFrame) -> pd.DataFrame: if self.group_by_variables: tmp = self._agg_window_features( grouped_df=X.groupby(self.group_by_variables), - window=self.window, + win=self.window, ) else: tmp = ( @@ -272,7 +265,9 @@ def _get_new_features_name(self) -> List: return feature_names def _agg_window_features( - self, grouped_df: pd.core.groupby.generic.DataFrameGroupBy, window: int + self, + grouped_df: pd.core.groupby.generic.DataFrameGroupBy, + win: Union[str, int, Callable, List[int], List[str]], ) -> Union[pd.Series, pd.DataFrame]: """generate window features based on groups Parameters @@ -280,7 +275,7 @@ def _agg_window_features( grouped_df : pd.core.groupby.generic.DataFrameGroupBy dataframe of groups - window: int + window: Union[str, int, Callable, List[int], List[str]] Size of the moving window Returns @@ -292,7 +287,7 @@ def _agg_window_features( for _, group in grouped_df: tmp = ( group[self.variables_] - .rolling(window=window) + .rolling(window=win) .agg(self.functions) .shift(periods=self.periods, freq=self.freq) ) diff --git a/tests/test_time_series/test_forecasting/test_check_estimator_forecasting.py b/tests/test_time_series/test_forecasting/test_check_estimator_forecasting.py index 2ac81edad..978ef9ddf 100644 --- a/tests/test_time_series/test_forecasting/test_check_estimator_forecasting.py +++ b/tests/test_time_series/test_forecasting/test_check_estimator_forecasting.py @@ -5,12 +5,10 @@ from sklearn.pipeline import Pipeline from sklearn.utils.estimator_checks import check_estimator -from feature_engine.timeseries.forecasting import ( - ExpandingWindowFeatures, - LagFeatures, - WindowFeatures, -) -from tests.estimator_checks.estimator_checks import check_feature_engine_estimator +from feature_engine.timeseries.forecasting import (ExpandingWindowFeatures, + LagFeatures, WindowFeatures) +from tests.estimator_checks.estimator_checks import \ + check_feature_engine_estimator _estimators = [ LagFeatures(missing_values="ignore"), From 
dd43c27e7c28712d533365b5d9693874d20e2b76 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sun, 25 Feb 2024 23:55:08 +0200 Subject: [PATCH 08/24] fixing docs indentation issue --- .../timeseries/forecasting/expanding_window_features.py | 1 - 1 file changed, 1 deletion(-) diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index a1981c28c..01590f583 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -132,7 +132,6 @@ class ExpandingWindowFeatures(BaseForecastTransformer): 2 2022-09-20 3 8 1.5 6.5 3 2022-09-21 4 9 2.0 7.0 4 2022-09-22 5 10 2.5 7.5 - create expanding window features based on other variables. >>> import pandas as pd >>> from feature_engine.timeseries.forecasting import ExpandingWindowFeatures From 7459811e5d0aaa8fbca178c259884c235217c3fa Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 26 Feb 2024 00:02:30 +0200 Subject: [PATCH 09/24] fixing docs indentation issue in lag_features --- feature_engine/timeseries/forecasting/lag_features.py | 1 - 1 file changed, 1 deletion(-) diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 5333491a4..dbbec5443 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -113,7 +113,6 @@ class LagFeatures(BaseForecastTransformer): 2 2022-09-20 3 8 2.0 7.0 1.0 6.0 3 2022-09-21 4 9 3.0 8.0 2.0 7.0 4 2022-09-22 5 10 4.0 9.0 3.0 8.0 - create lags based on other variables. 
>>> import pandas as pd >>> from feature_engine.timeseries.forecasting import LagFeatures From 12aa825298befb9b3b42e68a9e3f8e7a6f3f6bd9 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Thu, 29 Feb 2024 17:42:58 +0200 Subject: [PATCH 10/24] adjust formatting and code style in tests --- .../test_forecasting/test_expanding_window_features.py | 4 ++-- tests/test_time_series/test_forecasting/test_lag_features.py | 4 ++-- .../test_time_series/test_forecasting/test_window_features.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_time_series/test_forecasting/test_expanding_window_features.py b/tests/test_time_series/test_forecasting/test_expanding_window_features.py index 6fa89eacd..01e9806ba 100644 --- a/tests/test_time_series/test_forecasting/test_expanding_window_features.py +++ b/tests/test_time_series/test_forecasting/test_expanding_window_features.py @@ -557,9 +557,9 @@ def test_correct_groupby_expanding_window_when_using_periods(df_time): data=expected_results, index=date_time, ) - # When setting group_by_variabels to color + # When setting group_by to color transformer = ExpandingWindowFeatures( - variables=["ambient_temp", "irradiation"], group_by_variables="color" + variables=["ambient_temp", "irradiation"], group_by="color" ) df_tr = transformer.fit_transform(df_time) assert df_tr.equals(expected_results_df) diff --git a/tests/test_time_series/test_forecasting/test_lag_features.py b/tests/test_time_series/test_forecasting/test_lag_features.py index 4bfb72848..79d11f292 100644 --- a/tests/test_time_series/test_forecasting/test_lag_features.py +++ b/tests/test_time_series/test_forecasting/test_lag_features.py @@ -361,9 +361,9 @@ def test_correct_groupby_lag_when_using_periods(df_time): data=expected_results, index=date_time, ) - # When setting group_by_variabels to color + # When setting group_by to color transformer = LagFeatures( - variables=["ambient_temp", "irradiation"], periods=3, group_by_variables="color" + 
variables=["ambient_temp", "irradiation"], periods=3, group_by="color" ) df_tr = transformer.fit_transform(df_time) assert df_tr.equals(expected_results_df) diff --git a/tests/test_time_series/test_forecasting/test_window_features.py b/tests/test_time_series/test_forecasting/test_window_features.py index 50b373e72..ab213e240 100644 --- a/tests/test_time_series/test_forecasting/test_window_features.py +++ b/tests/test_time_series/test_forecasting/test_window_features.py @@ -582,9 +582,9 @@ def test_correct_groupby_window_when_using_periods(df_time): data=expected_results, index=date_time, ) - # When setting group_by_variabels to color + # When setting group_by to color transformer = WindowFeatures( - variables=["ambient_temp", "irradiation"], window=3, group_by_variables="color" + variables=["ambient_temp", "irradiation"], window=3, group_by="color" ) df_tr = transformer.fit_transform(df_time) assert df_tr.equals(expected_results_df) From c3bee668916c9567e470487460774d67dc7f0876 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Thu, 29 Feb 2024 17:44:42 +0200 Subject: [PATCH 11/24] refactoring timeseries & reformatting the code --- feature_engine/selection/drop_psi_features.py | 2 +- .../forecasting/base_forecast_transformers.py | 63 +++++++------------ .../forecasting/expanding_window_features.py | 36 +++++++---- .../timeseries/forecasting/lag_features.py | 45 +++++++------ .../timeseries/forecasting/window_features.py | 37 ++++++----- 5 files changed, 94 insertions(+), 89 deletions(-) diff --git a/feature_engine/selection/drop_psi_features.py b/feature_engine/selection/drop_psi_features.py index aed8a4f21..3e87adbdb 100644 --- a/feature_engine/selection/drop_psi_features.py +++ b/feature_engine/selection/drop_psi_features.py @@ -1,5 +1,5 @@ import datetime -from typing import List, Union, Dict +from typing import Dict, List, Union import numpy as np import pandas as pd diff --git a/feature_engine/timeseries/forecasting/base_forecast_transformers.py 
b/feature_engine/timeseries/forecasting/base_forecast_transformers.py index 80a82a520..3a9506c74 100644 --- a/feature_engine/timeseries/forecasting/base_forecast_transformers.py +++ b/feature_engine/timeseries/forecasting/base_forecast_transformers.py @@ -5,21 +5,30 @@ from sklearn.utils.validation import check_is_fitted from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin -from feature_engine._check_init_parameters.check_variables import \ - _check_variables_input_value +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, _n_features_in_docstring) + _feature_names_in_docstring, + _n_features_in_docstring, +) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _drop_original_docstring, _missing_values_docstring) + _drop_original_docstring, + _missing_values_docstring, +) from feature_engine._docstrings.methods import _fit_not_learn_docstring from feature_engine._docstrings.substitute import Substitution -from feature_engine.dataframe_checks import (_check_contains_inf, - _check_contains_na, - _check_X_matches_training_df, - check_X) +from feature_engine.dataframe_checks import ( + _check_contains_inf, + _check_contains_na, + _check_X_matches_training_df, + check_X, +) from feature_engine.tags import _return_tags -from feature_engine.variable_handling import (check_numerical_variables, - find_numerical_variables) +from feature_engine.variable_handling import ( + check_numerical_variables, + find_numerical_variables, +) @Substitution( @@ -42,7 +51,7 @@ class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOu {drop_original} - group_by_variables: str, list of str, default=None + group_by: str, str, int, or list of strings or integers, default=None variable of list of variables to create lag features based on. 
Attributes @@ -58,7 +67,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, missing_values: str = "raise", drop_original: bool = False, - group_by_variables: Optional[Union[str, List[str]]] = None, + group_by: Union[None, int, str, List[Union[str, int]]] = None, ) -> None: if missing_values not in ["raise", "ignore"]: @@ -73,26 +82,10 @@ def __init__( f"Got {drop_original} instead." ) - # check validity if group by variables passed - if group_by_variables: - # check group by variables data-types - if not ( - isinstance(group_by_variables, str) - or isinstance(group_by_variables, list) - ): - raise ValueError( - "group_by_variables must be an string or a list of strings. " - f"Got {group_by_variables} instead." - ) - # check if passed list has duplicates. - if isinstance(group_by_variables, list): - if len(set(group_by_variables)) != len(group_by_variables): - raise ValueError("group_by_variables contains duplicate values") - self.variables = _check_variables_input_value(variables) self.missing_values = missing_values self.drop_original = drop_original - self.group_by_variables = group_by_variables + self.group_by = _check_variables_input_value(group_by) def _check_index(self, X: pd.DataFrame): """ @@ -177,18 +170,6 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if self.missing_values == "raise": self._check_na_and_inf(X) - if self.group_by_variables: - # check if input group by variables is in input dataframe variables. 
- if isinstance(self.group_by_variables, list): - diff = set(self.group_by_variables).difference(X.columns.tolist()) - if len(diff) != 0: - raise ValueError(f"{list(diff)} not exist in dataframe") - else: - if self.group_by_variables not in X.columns.tolist(): - raise ValueError( - f"{self.group_by_variables} not exists in dataframe" - ) - self._get_feature_names_in(X) return self diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index 01590f583..9b8e83476 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -3,20 +3,27 @@ from __future__ import annotations -from typing import List, Optional, Union +from typing import List, Union import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, _n_features_in_docstring) + _feature_names_in_docstring, + _n_features_in_docstring, +) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _drop_original_docstring, _missing_values_docstring, - _variables_numerical_docstring) -from feature_engine._docstrings.methods import (_fit_not_learn_docstring, - _fit_transform_docstring) + _drop_original_docstring, + _missing_values_docstring, + _variables_numerical_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_not_learn_docstring, + _fit_transform_docstring, +) from feature_engine._docstrings.substitute import Substitution -from feature_engine.timeseries.forecasting.base_forecast_transformers import \ - BaseForecastTransformer +from feature_engine.timeseries.forecasting.base_forecast_transformers import ( + BaseForecastTransformer, +) @Substitution( @@ -86,6 +93,9 @@ class ExpandingWindowFeatures(BaseForecastTransformer): {drop_original} + group_by: str, str, int, or list of strings or integers, default=None + variable of list of variables to 
create lag features based on. + Attributes ---------- variables_: @@ -149,7 +159,7 @@ class ExpandingWindowFeatures(BaseForecastTransformer): >>> x2 = [6,7,8,9,10, 2,9,10,15,2], >>> x3=['a','a','a','a','a', 'b','b','b','b','b'] >>> )) - >>> ewf = ExpandingWindowFeatures(group_by_variables='x3') + >>> ewf = ExpandingWindowFeatures(group_by='x3') >>> ewf.fit_transform(X) date x1 x2 x3 x1_expanding_mean x2_expanding_mean 0 2022-09-18 1 6 a NaN NaN @@ -174,7 +184,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, - group_by_variables: Optional[Union[str, List[str]]] = None, + group_by: Union[None, int, str, List[Union[str, int]]] = None, ) -> None: if not isinstance(functions, (str, list)) or not all( @@ -192,7 +202,7 @@ def __init__( f"periods must be a non-negative integer. Got {periods} instead." ) - super().__init__(variables, missing_values, drop_original, group_by_variables) + super().__init__(variables, missing_values, drop_original, group_by) self.min_periods = min_periods self.functions = functions @@ -217,9 +227,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # Common dataframe checks and setting up. 
X = self._check_transform_input_and_state(X) - if self.group_by_variables: + if self.group_by: tmp = self._agg_expanding_window_features( - grouped_df=X.groupby(self.group_by_variables) + grouped_df=X.groupby(self.group_by) ) else: tmp = ( diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index dbbec5443..86f89a733 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -1,20 +1,27 @@ # Authors: Morgan Sell # License: BSD 3 clause -from typing import List, Optional, Union +from typing import List, Union import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, _n_features_in_docstring) + _feature_names_in_docstring, + _n_features_in_docstring, +) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _drop_original_docstring, _missing_values_docstring, - _variables_numerical_docstring) -from feature_engine._docstrings.methods import (_fit_not_learn_docstring, - _fit_transform_docstring) + _drop_original_docstring, + _missing_values_docstring, + _variables_numerical_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_not_learn_docstring, + _fit_transform_docstring, +) from feature_engine._docstrings.substitute import Substitution -from feature_engine.timeseries.forecasting.base_forecast_transformers import \ - BaseForecastTransformer +from feature_engine.timeseries.forecasting.base_forecast_transformers import ( + BaseForecastTransformer, +) @Substitution( @@ -67,7 +74,7 @@ class LagFeatures(BaseForecastTransformer): {drop_original} - group_by_variables: str, list of str, default=None + group_by: str, str, int, or list of strings or integers, default=None variable of list of variables to create lag features based on. 
Attributes @@ -143,7 +150,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, - group_by_variables: Optional[Union[str, List[str]]] = None, + group_by: Union[None, int, str, List[Union[str, int]]] = None, ) -> None: if not ( @@ -168,7 +175,7 @@ def __init__( "sort_index takes values True and False." f"Got {sort_index} instead." ) - super().__init__(variables, missing_values, drop_original, group_by_variables) + super().__init__(variables, missing_values, drop_original, group_by) self.periods = periods self.freq = freq @@ -197,8 +204,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if isinstance(self.freq, list): df_ls = [] for fr in self.freq: - if self.group_by_variables: - tmp = X.groupby(self.group_by_variables)[self.variables_].shift( + if self.group_by: + tmp = X.groupby(self.group_by)[self.variables_].shift( freq=fr, ) else: @@ -210,8 +217,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: tmp = pd.concat(df_ls, axis=1) else: - if self.group_by_variables: - tmp = X.groupby(self.group_by_variables)[self.variables_].shift( + if self.group_by: + tmp = X.groupby(self.group_by)[self.variables_].shift( freq=self.freq, ) else: @@ -224,8 +231,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if isinstance(self.periods, list): df_ls = [] for pr in self.periods: - if self.group_by_variables: - tmp = X.groupby(self.group_by_variables)[self.variables_].shift( + if self.group_by: + tmp = X.groupby(self.group_by)[self.variables_].shift( periods=pr, ) else: @@ -237,8 +244,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: tmp = pd.concat(df_ls, axis=1) else: - if self.group_by_variables: - tmp = X.groupby(self.group_by_variables)[self.variables_].shift( + if self.group_by: + tmp = X.groupby(self.group_by)[self.variables_].shift( periods=self.periods, ) else: diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py 
index def4c7706..df346fb40 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -1,17 +1,24 @@ -from typing import Callable, List, Optional, Union +from typing import Callable, List, Union import pandas as pd from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, _n_features_in_docstring) + _feature_names_in_docstring, + _n_features_in_docstring, +) from feature_engine._docstrings.init_parameters.all_trasnformers import ( - _drop_original_docstring, _missing_values_docstring, - _variables_numerical_docstring) -from feature_engine._docstrings.methods import (_fit_not_learn_docstring, - _fit_transform_docstring) + _drop_original_docstring, + _missing_values_docstring, + _variables_numerical_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_not_learn_docstring, + _fit_transform_docstring, +) from feature_engine._docstrings.substitute import Substitution -from feature_engine.timeseries.forecasting.base_forecast_transformers import \ - BaseForecastTransformer +from feature_engine.timeseries.forecasting.base_forecast_transformers import ( + BaseForecastTransformer, +) @Substitution( @@ -91,7 +98,7 @@ class WindowFeatures(BaseForecastTransformer): {drop_original} - group_by_variables: str, list of str, default=None + group_by: str, str, int, or list of strings or integers, default=None variable of list of variables to create lag features based on. Attributes @@ -152,7 +159,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, - group_by_variables: Optional[Union[str, List[str]]] = None, + group_by: Union[None, int, str, List[Union[str, int]]] = None, ) -> None: if isinstance(window, list) and len(window) != len(set(window)): @@ -173,7 +180,7 @@ def __init__( f"periods must be a positive integer. Got {periods} instead." 
) - super().__init__(variables, missing_values, drop_original, group_by_variables) + super().__init__(variables, missing_values, drop_original, group_by) self.window = window self.min_periods = min_periods @@ -202,9 +209,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if isinstance(self.window, list): df_ls = [] for win in self.window: - if self.group_by_variables: + if self.group_by: tmp = self._agg_window_features( - grouped_df=X.groupby(self.group_by_variables), + grouped_df=X.groupby(self.group_by), win=win, ) else: @@ -218,9 +225,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: tmp = pd.concat(df_ls, axis=1) else: - if self.group_by_variables: + if self.group_by: tmp = self._agg_window_features( - grouped_df=X.groupby(self.group_by_variables), + grouped_df=X.groupby(self.group_by), win=self.window, ) else: From 67725dceddb7bea3b5730347fca0d1b6f25905cb Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sat, 2 Mar 2024 21:33:48 +0200 Subject: [PATCH 12/24] adjust code formatting & style in tests --- .../test_check_estimator_forecasting.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_time_series/test_forecasting/test_check_estimator_forecasting.py b/tests/test_time_series/test_forecasting/test_check_estimator_forecasting.py index 978ef9ddf..2ac81edad 100644 --- a/tests/test_time_series/test_forecasting/test_check_estimator_forecasting.py +++ b/tests/test_time_series/test_forecasting/test_check_estimator_forecasting.py @@ -5,10 +5,12 @@ from sklearn.pipeline import Pipeline from sklearn.utils.estimator_checks import check_estimator -from feature_engine.timeseries.forecasting import (ExpandingWindowFeatures, - LagFeatures, WindowFeatures) -from tests.estimator_checks.estimator_checks import \ - check_feature_engine_estimator +from feature_engine.timeseries.forecasting import ( + ExpandingWindowFeatures, + LagFeatures, + WindowFeatures, +) +from tests.estimator_checks.estimator_checks import 
check_feature_engine_estimator _estimators = [ LagFeatures(missing_values="ignore"), From 9cb01ea852b67ea8a12fa773e0bf03366b545462 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sat, 2 Mar 2024 23:29:25 +0200 Subject: [PATCH 13/24] fix create lag features using groupby & freq parameters --- .../timeseries/forecasting/lag_features.py | 39 ++++++++++++++++++- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 86f89a733..6df173170 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -205,7 +205,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: df_ls = [] for fr in self.freq: if self.group_by: - tmp = X.groupby(self.group_by)[self.variables_].shift( + tmp = self._agg_freq_lags( + grouped_df=X.groupby(self.group_by), freq=fr, ) else: @@ -218,7 +219,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: else: if self.group_by: - tmp = X.groupby(self.group_by)[self.variables_].shift( + tmp = self._agg_freq_lags( + grouped_df=X.groupby(self.group_by), freq=self.freq, ) else: @@ -287,3 +289,36 @@ def _get_new_features_name(self) -> List: ] return feature_names + + def _agg_freq_lags( + self, + grouped_df: pd.core.groupby.generic.DataFrameGroupBy, + freq: Union[str, List[str]], + ) -> Union[pd.Series, pd.DataFrame]: + """_summary_ + + Parameters + ---------- + grouped_df : pd.core.groupby.generic.DataFrameGroupBy + dataframe of groups + freq : Union[str, List[str]] + Offset to use from the tseries module or time rule. See parameter `freq` in + pandas `shift()`. It is the same functionality. If freq is a list, lag features + will be created for each one of the frequency values in the list. 
+ + Returns + ------- + Union[pd.Series, pd.DataFrame] + lag feature or dataframe of lag features + """ + tmp_data = [] + for _, group in grouped_df: + original_idx = group.index + tmp = ( + group[self.variables_] + .shift(freq=freq) + .reindex(original_idx) + ) + tmp_data.append(tmp) + tmp = pd.concat(tmp_data).sort_index() + return tmp \ No newline at end of file From 72ce43cba3be6f902af8c1415c2c57b313c2773b Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sun, 3 Mar 2024 01:06:36 +0200 Subject: [PATCH 14/24] adjust code style --- .../timeseries/forecasting/lag_features.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 6df173170..76f3563b6 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -289,11 +289,11 @@ def _get_new_features_name(self) -> List: ] return feature_names - + def _agg_freq_lags( - self, - grouped_df: pd.core.groupby.generic.DataFrameGroupBy, - freq: Union[str, List[str]], + self, + grouped_df: pd.core.groupby.generic.DataFrameGroupBy, + freq: Union[str, List[str]], ) -> Union[pd.Series, pd.DataFrame]: """_summary_ @@ -302,9 +302,7 @@ def _agg_freq_lags( grouped_df : pd.core.groupby.generic.DataFrameGroupBy dataframe of groups freq : Union[str, List[str]] - Offset to use from the tseries module or time rule. See parameter `freq` in - pandas `shift()`. It is the same functionality. If freq is a list, lag features - will be created for each one of the frequency values in the list. + Offset to use from the tseries module or time rule. 
Returns ------- @@ -314,11 +312,7 @@ def _agg_freq_lags( tmp_data = [] for _, group in grouped_df: original_idx = group.index - tmp = ( - group[self.variables_] - .shift(freq=freq) - .reindex(original_idx) - ) + tmp = group[self.variables_].shift(freq=freq).reindex(original_idx) tmp_data.append(tmp) tmp = pd.concat(tmp_data).sort_index() - return tmp \ No newline at end of file + return tmp From 9d999b070531984205742ad8247332f8afa3fdae Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Sun, 3 Mar 2024 01:09:26 +0200 Subject: [PATCH 15/24] add test cases to ensure code coverage --- .../test_forecasting/test_lag_features.py | 472 ++++++++++++++++++ .../test_forecasting/test_window_features.py | 168 +++++++ 2 files changed, 640 insertions(+) diff --git a/tests/test_time_series/test_forecasting/test_lag_features.py b/tests/test_time_series/test_forecasting/test_lag_features.py index 79d11f292..f55ff168c 100644 --- a/tests/test_time_series/test_forecasting/test_lag_features.py +++ b/tests/test_time_series/test_forecasting/test_lag_features.py @@ -367,3 +367,475 @@ def test_correct_groupby_lag_when_using_periods(df_time): ) df_tr = transformer.fit_transform(df_time) assert df_tr.equals(expected_results_df) + + +def test_multiple_periods_with_groupby(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, 
+ 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_lag_2": [ + np.nan, + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + np.nan, + np.nan, + 34.08, + 33.7, + 33.89, + ], + "irradiation_lag_2": [ + np.nan, + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + np.nan, + np.nan, + 0.47, + 0.54, + 0.4, + ], + "ambient_temp_lag_3": [ + np.nan, + np.nan, + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + np.nan, + np.nan, + np.nan, + 34.08, + 33.7, + ], + "irradiation_lag_3": [ + np.nan, + np.nan, + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + np.nan, + np.nan, + np.nan, + 0.47, + 0.54, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = LagFeatures( + variables=["ambient_temp", "irradiation"], periods=[2, 3], group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) + + +def test_correct_groupby_lag_when_using_freq(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + 
pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "irradiation_lag_15min": [ + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + np.nan, + 0.47, + 0.54, + 0.4, + 0.45, + ], + "ambient_temp_lag_15min": [ + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + np.nan, + 34.08, + 33.7, + 33.89, + 34.04, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = LagFeatures( + variables=["irradiation", "ambient_temp"], freq="15min", group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) + + +def test_multiple_freq_with_groupby(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + 
pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "irradiation_lag_15min": [ + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + np.nan, + 0.47, + 0.54, + 0.4, + 0.45, + ], + "ambient_temp_lag_15min": [ + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + np.nan, + 34.08, + 33.7, + 33.89, + 34.04, + ], + "irradiation_lag_30min": [ + np.nan, + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + np.nan, + np.nan, + 0.47, + 0.54, + 0.4, + ], + "ambient_temp_lag_30min": [ + np.nan, + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + np.nan, + np.nan, + 34.08, + 33.7, + 33.89, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = LagFeatures( + variables=["irradiation", "ambient_temp"], + freq=["15min", "30min"], + group_by="color", + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) diff --git a/tests/test_time_series/test_forecasting/test_window_features.py b/tests/test_time_series/test_forecasting/test_window_features.py index ab213e240..40fdbc6fa 100644 --- 
a/tests/test_time_series/test_forecasting/test_window_features.py +++ b/tests/test_time_series/test_forecasting/test_window_features.py @@ -588,3 +588,171 @@ def test_correct_groupby_window_when_using_periods(df_time): ) df_tr = transformer.fit_transform(df_time) assert df_tr.equals(expected_results_df) + + +def test_multiple_windows_with_groupby(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_window_2_mean": [ + np.nan, + np.nan, + 31.41, + 31.83, + 32.269999999999996, + 32.505, + 32.56, + 32.510000000000005, + 32.60000000000001, + 33.22, + np.nan, + np.nan, + 33.89, + 33.795, + 33.965, + ], + "irradiation_window_2_mean": [ + np.nan, + np.nan, + 0.65, + 0.72, + 0.7050000000000001, + 0.59, + 0.45499999999999996, + 0.53, + 0.5650000000000001, + 
0.6500000000000001, + np.nan, + np.nan, + 0.505, + 0.47000000000000003, + 0.42500000000000004, + ], + "ambient_temp_window_3_mean": [ + np.nan, + np.nan, + np.nan, + 31.656666666666666, + 32.016666666666666, + 32.38666666666666, + 32.50333333333333, + 32.54666666666667, + 32.56666666666667, + 32.98666666666667, + np.nan, + np.nan, + np.nan, + 33.89, + 33.876666666666665, + ], + "irradiation_window_3_mean": [ + np.nan, + np.nan, + np.nan, + 0.65, + 0.7333333333333334, + 0.61, + 0.5566666666666668, + 0.49333333333333335, + 0.54, + 0.6233333333333334, + np.nan, + np.nan, + np.nan, + 0.47000000000000003, + 0.4633333333333334, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = WindowFeatures( + variables=["ambient_temp", "irradiation"], window=[2, 3], group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) From b7b8bc9f4365f10f7c39480b5a039a8af378a55b Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 1 Apr 2024 17:02:39 +0200 Subject: [PATCH 16/24] add group_by docstring to _docstring --- .../_docstrings/init_parameters/all_trasnformers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/feature_engine/_docstrings/init_parameters/all_trasnformers.py b/feature_engine/_docstrings/init_parameters/all_trasnformers.py index 5c699d3de..ad9ff71e5 100644 --- a/feature_engine/_docstrings/init_parameters/all_trasnformers.py +++ b/feature_engine/_docstrings/init_parameters/all_trasnformers.py @@ -22,3 +22,10 @@ contain missing values. If `'ignore'`, missing data will be ignored when learning parameters or performing the transformation. """.rstrip() + +_group_by_docstring = """str, int, or list of strings or integers, default=None. + A group_by operation involves some combination of splitting the object, + applying a function, and combining the results. 
+ This can be used to group large amounts of data and + compute operations on these groups. + """.rstrip() \ No newline at end of file From ba375a420070a78c1dba3f7e26846b5a56aafe2f Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 1 Apr 2024 17:57:55 +0200 Subject: [PATCH 17/24] remove check input of group_by --- .../timeseries/forecasting/base_forecast_transformers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/feature_engine/timeseries/forecasting/base_forecast_transformers.py b/feature_engine/timeseries/forecasting/base_forecast_transformers.py index 3a9506c74..1c3c9289e 100644 --- a/feature_engine/timeseries/forecasting/base_forecast_transformers.py +++ b/feature_engine/timeseries/forecasting/base_forecast_transformers.py @@ -15,6 +15,7 @@ from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, _missing_values_docstring, + _group_by_docstring, ) from feature_engine._docstrings.methods import _fit_not_learn_docstring from feature_engine._docstrings.substitute import Substitution @@ -37,6 +38,7 @@ feature_names_in_=_feature_names_in_docstring, fit=_fit_not_learn_docstring, n_features_in_=_n_features_in_docstring, + group_by=_group_by_docstring, ) class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOutMixin): """ @@ -51,8 +53,7 @@ class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOu {drop_original} - group_by: str, str, int, or list of strings or integers, default=None - variable of list of variables to create lag features based on. 
+ {group_by} Attributes ---------- @@ -85,7 +86,7 @@ def __init__( self.variables = _check_variables_input_value(variables) self.missing_values = missing_values self.drop_original = drop_original - self.group_by = _check_variables_input_value(group_by) + self.group_by = group_by def _check_index(self, X: pd.DataFrame): """ From 90f08f46f5f5e19196a4bfdc373c58bb042e81c4 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 1 Apr 2024 17:59:54 +0200 Subject: [PATCH 18/24] enhance performance of group_by window features operations --- .../timeseries/forecasting/window_features.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py index df346fb40..99c45d4b7 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -10,6 +10,7 @@ _drop_original_docstring, _missing_values_docstring, _variables_numerical_docstring, + _group_by_docstring, ) from feature_engine._docstrings.methods import ( _fit_not_learn_docstring, @@ -29,6 +30,7 @@ n_features_in_=_n_features_in_docstring, fit=_fit_not_learn_docstring, fit_transform=_fit_transform_docstring, + group_by=_group_by_docstring, ) class WindowFeatures(BaseForecastTransformer): """ @@ -98,8 +100,7 @@ class WindowFeatures(BaseForecastTransformer): {drop_original} - group_by: str, str, int, or list of strings or integers, default=None - variable of list of variables to create lag features based on. 
+ {group_by} Attributes ---------- @@ -210,10 +211,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: df_ls = [] for win in self.window: if self.group_by: - tmp = self._agg_window_features( - grouped_df=X.groupby(self.group_by), + tmp = X.groupby(self.group_by, as_index=False).apply( + self._agg_window_features, win=win, + include_groups=False, ) + tmp = tmp.reset_index(drop = True) else: tmp = ( X[self.variables_] @@ -226,10 +229,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: else: if self.group_by: - tmp = self._agg_window_features( - grouped_df=X.groupby(self.group_by), + tmp = X.groupby(self.group_by, as_index=False).apply( + self._agg_window_features, win=self.window, + include_groups=False, ) + tmp = tmp.reset_index(drop = True) else: tmp = ( X[self.variables_] @@ -290,14 +295,9 @@ def _agg_window_features( Union[pd.Series, pd.DataFrame] returned window features """ - tmp_data = [] - for _, group in grouped_df: - tmp = ( - group[self.variables_] - .rolling(window=win) - .agg(self.functions) - .shift(periods=self.periods, freq=self.freq) - ) - tmp_data.append(tmp) - tmp = pd.concat(tmp_data).sort_index() - return tmp + return ( + grouped_df[self.variables_] + .rolling(window=win) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) From 66baa750ae6b874a65d534617c0250343cbb5968 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 1 Apr 2024 18:00:26 +0200 Subject: [PATCH 19/24] enhance performance of group_by expanding window features operations --- .../forecasting/expanding_window_features.py | 58 +++++-------------- 1 file changed, 13 insertions(+), 45 deletions(-) diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index 9b8e83476..561111f9d 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -15,6 +15,7 @@ 
_drop_original_docstring, _missing_values_docstring, _variables_numerical_docstring, + _group_by_docstring, ) from feature_engine._docstrings.methods import ( _fit_not_learn_docstring, @@ -34,6 +35,7 @@ n_features_in_=_n_features_in_docstring, fit=_fit_not_learn_docstring, fit_transform=_fit_transform_docstring, + group_by=_group_by_docstring, ) class ExpandingWindowFeatures(BaseForecastTransformer): """ @@ -93,8 +95,7 @@ class ExpandingWindowFeatures(BaseForecastTransformer): {drop_original} - group_by: str, str, int, or list of strings or integers, default=None - variable of list of variables to create lag features based on. + {group_by} Attributes ---------- @@ -142,36 +143,6 @@ class ExpandingWindowFeatures(BaseForecastTransformer): 2 2022-09-20 3 8 1.5 6.5 3 2022-09-21 4 9 2.0 7.0 4 2022-09-22 5 10 2.5 7.5 - create expanding window features based on other variables. - >>> import pandas as pd - >>> from feature_engine.timeseries.forecasting import ExpandingWindowFeatures - >>> X = pd.DataFrame(dict(date = ["2022-09-18", - >>> "2022-09-19", - >>> "2022-09-20", - >>> "2022-09-21", - >>> "2022-09-22", - >>> "2022-09-18", - >>> "2022-09-19", - >>> "2022-09-20", - >>> "2022-09-21", - >>> "2022-09-22"], - >>> x1 = [1,2,3,4,5, 3,5,6,8,11], - >>> x2 = [6,7,8,9,10, 2,9,10,15,2], - >>> x3=['a','a','a','a','a', 'b','b','b','b','b'] - >>> )) - >>> ewf = ExpandingWindowFeatures(group_by='x3') - >>> ewf.fit_transform(X) - date x1 x2 x3 x1_expanding_mean x2_expanding_mean - 0 2022-09-18 1 6 a NaN NaN - 1 2022-09-19 2 7 a 1.000000 6.0 - 2 2022-09-20 3 8 a 1.500000 6.5 - 3 2022-09-21 4 9 a 2.000000 7.0 - 4 2022-09-22 5 10 a 2.500000 7.5 - 5 2022-09-18 3 2 b NaN NaN - 6 2022-09-19 5 9 b 3.000000 2.0 - 7 2022-09-20 6 10 b 4.000000 5.5 - 8 2022-09-21 8 15 b 4.666667 7.0 - 9 2022-09-22 11 2 b 5.500000 9.0 """ def __init__( @@ -228,9 +199,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = self._check_transform_input_and_state(X) if self.group_by: - tmp = 
self._agg_expanding_window_features( - grouped_df=X.groupby(self.group_by) + tmp = X.groupby(self.group_by, as_index=False).apply( + self._agg_expanding_window_features, + include_groups=False, ) + tmp = tmp.reset_index(drop = True) else: tmp = ( X[self.variables_] @@ -279,14 +252,9 @@ def _agg_expanding_window_features( Union[pd.Series, pd.DataFrame] returned expanding window features """ - tmp_data = [] - for _, group in grouped_df: - tmp = ( - group[self.variables_] - .expanding(min_periods=self.min_periods) - .agg(self.functions) - .shift(periods=self.periods, freq=self.freq) - ) - tmp_data.append(tmp) - tmp = pd.concat(tmp_data).sort_index() - return tmp + return ( + grouped_df[self.variables_] + .expanding(min_periods=self.min_periods) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) From 92f996d3a823eb589ebd7b6b40009d0680f2267a Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 1 Apr 2024 21:08:38 +0200 Subject: [PATCH 20/24] fix reindexing to original index after grouping bug --- feature_engine/timeseries/forecasting/window_features.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py index 99c45d4b7..f12a040a9 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -211,12 +211,14 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: df_ls = [] for win in self.window: if self.group_by: + original_index = X.index tmp = X.groupby(self.group_by, as_index=False).apply( self._agg_window_features, win=win, include_groups=False, ) - tmp = tmp.reset_index(drop = True) + tmp = tmp.set_index(original_index) + tmp = tmp.reindex(original_index) else: tmp = ( X[self.variables_] @@ -229,12 +231,14 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: else: if self.group_by: + original_index = X.index tmp = 
X.groupby(self.group_by, as_index=False).apply( self._agg_window_features, win=self.window, include_groups=False, ) - tmp = tmp.reset_index(drop = True) + tmp = tmp.set_index(original_index) + tmp = tmp.reindex(original_index) else: tmp = ( X[self.variables_] From 152c037f8d60ac93857718b2779a47327df397c3 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 1 Apr 2024 21:13:57 +0200 Subject: [PATCH 21/24] fix reindexing to original index after grouping operation bug --- .../timeseries/forecasting/expanding_window_features.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index 561111f9d..a8fe60aa7 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -199,11 +199,13 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = self._check_transform_input_and_state(X) if self.group_by: + original_index = X.index tmp = X.groupby(self.group_by, as_index=False).apply( self._agg_expanding_window_features, include_groups=False, ) - tmp = tmp.reset_index(drop = True) + tmp = tmp.set_index(original_index) + tmp = tmp.reindex(original_index) else: tmp = ( X[self.variables_] From 5343e50f7d7663fb95789eb6686c26db3aa716af Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 1 Apr 2024 22:01:30 +0200 Subject: [PATCH 22/24] replacing group_by docstring with group_by_docstring --- .../timeseries/forecasting/lag_features.py | 25 +++---------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 76f3563b6..9d4186fdc 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -13,6 +13,7 @@ _drop_original_docstring, 
_missing_values_docstring, _variables_numerical_docstring, + _group_by_docstring, ) from feature_engine._docstrings.methods import ( _fit_not_learn_docstring, @@ -32,6 +33,7 @@ n_features_in_=_n_features_in_docstring, fit=_fit_not_learn_docstring, fit_transform=_fit_transform_docstring, + group_by=_group_by_docstring, ) class LagFeatures(BaseForecastTransformer): """ @@ -74,8 +76,7 @@ class LagFeatures(BaseForecastTransformer): {drop_original} - group_by: str, str, int, or list of strings or integers, default=None - variable of list of variables to create lag features based on. + {group_by} Attributes ---------- @@ -120,26 +121,6 @@ class LagFeatures(BaseForecastTransformer): 2 2022-09-20 3 8 2.0 7.0 1.0 6.0 3 2022-09-21 4 9 3.0 8.0 2.0 7.0 4 2022-09-22 5 10 4.0 9.0 3.0 8.0 - create lags based on other variables. - >>> import pandas as pd - >>> from feature_engine.timeseries.forecasting import LagFeatures - >>> X = pd.DataFrame(dict(date = ["2022-09-18", - >>> "2022-09-19", - >>> "2022-09-20", - >>> "2022-09-21", - >>> "2022-09-22"], - >>> x1 = [1,2,3,4,5], - >>> x2 = [6,7,8,9,10], - >>> x3 = ['a','b','a','b','a'] - >>> )) - >>> lf = LagFeatures(periods=[1,2], group_by_variables='x3') - >>> lf.fit_transform(X) - date x1 x2 x3 x1_lag_1 x2_lag_1 x1_lag_2 x2_lag_2 - 0 2022-09-18 1 6 a NaN NaN NaN NaN - 1 2022-09-19 2 7 b NaN NaN NaN NaN - 2 2022-09-20 3 8 a 1.0 6.0 NaN NaN - 3 2022-09-21 4 9 b 2.0 7.0 NaN NaN - 4 2022-09-22 5 10 a 3.0 8.0 1.0 6.0 """ def __init__( From ef1eaa8dff05c8886fcbe1235bb8da5103a7af40 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Mon, 1 Apr 2024 22:05:43 +0200 Subject: [PATCH 23/24] adjust code-style and formatting --- .../timeseries/forecasting/base_forecast_transformers.py | 2 +- .../timeseries/forecasting/expanding_window_features.py | 2 +- feature_engine/timeseries/forecasting/lag_features.py | 2 +- feature_engine/timeseries/forecasting/window_features.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/feature_engine/timeseries/forecasting/base_forecast_transformers.py b/feature_engine/timeseries/forecasting/base_forecast_transformers.py index 1c3c9289e..aee003833 100644 --- a/feature_engine/timeseries/forecasting/base_forecast_transformers.py +++ b/feature_engine/timeseries/forecasting/base_forecast_transformers.py @@ -14,8 +14,8 @@ ) from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, - _missing_values_docstring, _group_by_docstring, + _missing_values_docstring, ) from feature_engine._docstrings.methods import _fit_not_learn_docstring from feature_engine._docstrings.substitute import Substitution diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index a8fe60aa7..3061f10fc 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -13,9 +13,9 @@ ) from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, + _group_by_docstring, _missing_values_docstring, _variables_numerical_docstring, - _group_by_docstring, ) from feature_engine._docstrings.methods import ( _fit_not_learn_docstring, diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 9d4186fdc..65c2c3d38 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -11,9 +11,9 @@ ) from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, + _group_by_docstring, _missing_values_docstring, _variables_numerical_docstring, - _group_by_docstring, ) from feature_engine._docstrings.methods import ( _fit_not_learn_docstring, diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py 
index f12a040a9..0e7d316cb 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -8,9 +8,9 @@ ) from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, + _group_by_docstring, _missing_values_docstring, _variables_numerical_docstring, - _group_by_docstring, ) from feature_engine._docstrings.methods import ( _fit_not_learn_docstring, From 09db782c0e57bfcbdf010d9e55d21755ef94a2f5 Mon Sep 17 00:00:00 2001 From: Ezzaldin97 Date: Tue, 2 Apr 2024 02:38:48 +0200 Subject: [PATCH 24/24] remove white spaces --- .../_docstrings/init_parameters/all_trasnformers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/feature_engine/_docstrings/init_parameters/all_trasnformers.py b/feature_engine/_docstrings/init_parameters/all_trasnformers.py index ad9ff71e5..510180463 100644 --- a/feature_engine/_docstrings/init_parameters/all_trasnformers.py +++ b/feature_engine/_docstrings/init_parameters/all_trasnformers.py @@ -23,9 +23,9 @@ learning parameters or performing the transformation. """.rstrip() -_group_by_docstring = """str, int, or list of strings or integers, default=None. - A group_by operation involves some combination of splitting the object, - applying a function, and combining the results. - This can be used to group large amounts of data and +_group_by_docstring = """group_by: str, int, or list of strings or integers, default=None + A group_by operation involves some combination of splitting the object, + applying a function, and combining the results. + This can be used to group large amounts of data and compute operations on these groups. - """.rstrip() \ No newline at end of file + """.rstrip()