feature-engine
diff --git a/‎feature_engine/_docstrings/selection/_docstring.py‎
Lines changed: 5 additions & 0 deletions b/‎feature_engine/_docstrings/selection/_docstring.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎feature_engine/selection/base_recursive_selector.py‎
Lines changed: 11 additions & 3 deletions b/‎feature_engine/selection/base_recursive_selector.py‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎feature_engine/selection/base_selection_functions.py‎
Lines changed: 14 additions & 0 deletions b/‎feature_engine/selection/base_selection_functions.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎feature_engine/selection/probe_feature_selection.py‎
Lines changed: 19 additions & 7 deletions b/‎feature_engine/selection/probe_feature_selection.py‎
Lines changed: 19 additions & 7 deletions
diff --git a/‎feature_engine/selection/recursive_feature_addition.py‎
Lines changed: 12 additions & 6 deletions b/‎feature_engine/selection/recursive_feature_addition.py‎
Lines changed: 12 additions & 6 deletions
diff --git a/‎feature_engine/selection/recursive_feature_elimination.py‎
Lines changed: 12 additions & 6 deletions b/‎feature_engine/selection/recursive_feature_elimination.py‎
Lines changed: 12 additions & 6 deletions
diff --git a/‎feature_engine/selection/shuffle_features.py‎
Lines changed: 3 additions & 3 deletions b/‎feature_engine/selection/shuffle_features.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎feature_engine/selection/single_feature_performance.py‎
Lines changed: 13 additions & 1 deletion b/‎feature_engine/selection/single_feature_performance.py‎
Lines changed: 13 additions & 1 deletion
@@ -53,6 +53,11 @@
         documentation.
         """.rstrip()
 
+_groups_docstring = """groups: array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting the dataset into train/test
+        set. Only used in conjunction with a "Group" cv instance (e.g., GroupKFold).
+        """.rstrip()
+
 _initial_model_performance_docstring = """initial_model_performance_:
         The model's performance when trained with the original dataset.
         """.rstrip()
 
@@ -66,6 +66,11 @@ class BaseRecursiveSelector(BaseSelector):
         across calls. For more details check Scikit-learn's `cross_validate`'s
         documentation.
 
+    groups: array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting
+        the dataset into train/test set. Only used in conjunction with a
+        "Group" cv instance (e.g., GroupKFold).
+
     confirm_variables: bool, default=False
         If set to True, variables that are not present in the input dataframe will be
         removed from the list of variables. Only used when passing a variable list to
@@ -105,6 +110,7 @@ def __init__(
         estimator,
         scoring: str = "roc_auc",
         cv=3,
+        groups=None,
         threshold: Union[int, float] = 0.01,
         variables: Variables = None,
         confirm_variables: bool = False,
@@ -119,6 +125,7 @@ def __init__(
         self.scoring = scoring
         self.threshold = threshold
         self.cv = cv
+        self.groups = groups
 
     def fit(self, X: pd.DataFrame, y: pd.Series):
         """
@@ -155,10 +162,11 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
 
         # train model with all features and cross-validation
         model = cross_validate(
-            self.estimator,
-            X[self.variables_],
-            y,
+            estimator=self.estimator,
+            X=X[self.variables_],
+            y=y,
             cv=self._cv,
+            groups=self.groups,
             scoring=self.scoring,
             return_estimator=True,
         )
 
@@ -167,6 +167,7 @@ def single_feature_performance(
     estimator,
     cv,
     scoring,
+    groups=None,
 ):
     """
     Trains one estimator per feature and determines the performance of that estimator.
@@ -191,6 +192,11 @@ def single_feature_performance(
     scoring:
         The performance metric. Any supported by the Scikit-learn estimator.
 
+    groups: array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting
+        the dataset into train/test set. Only used in conjunction with a
+        "Group" cv instance (e.g., GroupKFold).
+
     Returns
     -------
     feature_performance: dict
@@ -213,6 +219,7 @@ def single_feature_performance(
             X[feature].to_frame(),
             y,
             cv=cv,
+            groups=groups,
             return_estimator=False,
             scoring=scoring,
         )
@@ -228,6 +235,7 @@ def find_feature_importance(
     estimator,
     cv,
     scoring,
+    groups=None,
 ):
     """
     Trains an estimator using cross-validation and derives feature importance from it.
@@ -253,6 +261,11 @@ def find_feature_importance(
     scoring:
         The performance metric. Any supported by the Scikit-learn estimator.
 
+    groups: array-like of shape (n_samples,), default=None
+        Group labels for the samples used while splitting
+        the dataset into train/test set. Only used in conjunction with a
+        "Group" cv instance (e.g., GroupKFold).
+
     Returns
     -------
     feature_importance: pd.Series
@@ -271,6 +284,7 @@ def find_feature_importance(
         X,
         y,
         cv=cv,
+        groups=groups,
         scoring=scoring,
         return_estimator=True,
     )
 
@@ -14,6 +14,7 @@
 from feature_engine._docstrings.methods import _fit_transform_docstring
 from feature_engine._docstrings.selection._docstring import (
     _cv_docstring,
+    _groups_docstring,
     _features_to_drop_docstring,
     _fit_docstring,
     _get_support_docstring,
@@ -40,6 +41,7 @@
     estimator=_estimator_docstring,
     scoring=_scoring_docstring,
     cv=_cv_docstring,
+    groups=_groups_docstring,
     confirm_variables=_confirm_variables_docstring,
     variables=_variables_numerical_docstring,
     feature_names_in_=_feature_names_in_docstring,
@@ -104,6 +106,8 @@ class ProbeFeatureSelection(BaseSelector):
 
     {cv}
 
+    {groups}
+
     Attributes
     ----------
     probe_features_:
@@ -173,6 +177,7 @@ def __init__(
         n_probes: int = 1,
         distribution: str = "normal",
         cv=5,
+        groups=None,
         random_state: int = 0,
         confirm_variables: bool = False,
     ):
@@ -203,6 +208,7 @@ def __init__(
         self.scoring = scoring
         self.distribution = distribution
         self.cv = cv
+        self.groups = groups
         self.n_probes = n_probes
         self.random_state = random_state
 
@@ -238,20 +244,26 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         if self.collective is True:
             # train model using entire dataset and derive feature importance
             f_importance_mean, f_importance_std = find_feature_importance(
-                X_new, y, self.estimator, self.cv, self.scoring,
+                X=X_new,
+                y=y,
+                estimator=self.estimator,
+                cv=self.cv,
+                groups=self.groups,
+                scoring=self.scoring,
             )
             self.feature_importances_ = f_importance_mean
             self.feature_importances_std_ = f_importance_std
 
         else:
             # trains a model per feature (single feature models)
             f_importance_mean, f_importance_std = single_feature_performance(
-                X_new,
-                y,
-                X_new.columns,
-                self.estimator,
-                self.cv,
-                self.scoring,
+                X=X_new,
+                y=y,
+                variables=X_new.columns,
+                estimator=self.estimator,
+                cv=self.cv,
+                groups=self.groups,
+                scoring=self.scoring,
             )
             self.feature_importances_ = pd.Series(f_importance_mean)
             self.feature_importances_std_ = pd.Series(f_importance_std)
 
@@ -19,6 +19,7 @@
     _features_to_drop_docstring,
     _fit_docstring,
     _get_support_docstring,
+    _groups_docstring,
     _initial_model_performance_docstring,
     _scoring_docstring,
     _threshold_docstring,
@@ -35,6 +36,7 @@
     scoring=_scoring_docstring,
     threshold=_threshold_docstring,
     cv=_cv_docstring,
+    groups=_groups_docstring,
     variables=_variables_numerical_docstring,
     confirm_variables=_confirm_variables_docstring,
     initial_model_performance_=_initial_model_performance_docstring,
@@ -87,6 +89,8 @@ class RecursiveFeatureAddition(BaseRecursiveSelector):
 
     {cv}
 
+    {groups}
+
     {confirm_variables}
 
     Attributes
@@ -167,10 +171,11 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
 
         # Run baseline model using only the most important feature
         baseline_model = cross_validate(
-            self.estimator,
-            X[first_most_important_feature].to_frame(),
-            y,
+            estimator=self.estimator,
+            X=X[first_most_important_feature].to_frame(),
+            y=y,
             cv=self._cv,
+            groups=self.groups,
             scoring=self.scoring,
             return_estimator=True,
         )
@@ -194,10 +199,11 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
 
             # Add feature and train new model
             model_tmp = cross_validate(
-                self.estimator,
-                X[_selected_features + [feature]],
-                y,
+                estimator=self.estimator,
+                X=X[_selected_features + [feature]],
+                y=y,
                 cv=self._cv,
+                groups=self.groups,
                 scoring=self.scoring,
                 return_estimator=True,
             )
 
@@ -19,6 +19,7 @@
     _features_to_drop_docstring,
     _fit_docstring,
     _get_support_docstring,
+    _groups_docstring,
     _initial_model_performance_docstring,
     _scoring_docstring,
     _threshold_docstring,
@@ -35,6 +36,7 @@
     scoring=_scoring_docstring,
     threshold=_threshold_docstring,
     cv=_cv_docstring,
+    groups=_groups_docstring,
     variables=_variables_numerical_docstring,
     confirm_variables=_confirm_variables_docstring,
     initial_model_performance_=_initial_model_performance_docstring,
@@ -88,6 +90,8 @@ class RecursiveFeatureElimination(BaseRecursiveSelector):
 
     {cv}
 
+    {groups}
+
     {confirm_variables}
 
     Attributes
@@ -186,10 +190,11 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
 
             # remove feature and train new model
             model_tmp = cross_validate(
-                self.estimator,
-                X_tmp.drop(columns=feature),
-                y,
+                estimator=self.estimator,
+                X=X_tmp.drop(columns=feature),
+                y=y,
                 cv=self._cv,
+                groups=self.groups,
                 scoring=self.scoring,
                 return_estimator=False,
             )
@@ -213,10 +218,11 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
                 X_tmp = X_tmp.drop(columns=feature)
 
                 baseline_model = cross_validate(
-                    self.estimator,
-                    X_tmp,
-                    y,
+                    estimator=self.estimator,
+                    X=X_tmp,
+                    y=y,
                     cv=self._cv,
+                    groups=self.groups,
                     return_estimator=False,
                     scoring=self.scoring,
                 )
 
@@ -226,9 +226,9 @@ def fit(
 
         # train model with all features and cross-validation
         model = cross_validate(
-            self.estimator,
-            X[self.variables_],
-            y,
+            estimator=self.estimator,
+            X=X[self.variables_],
+            y=y,
             cv=cv,
             return_estimator=True,
             scoring=self.scoring,
 
@@ -16,6 +16,7 @@
 from feature_engine._docstrings.methods import _fit_transform_docstring
 from feature_engine._docstrings.selection._docstring import (
     _cv_docstring,
+    _groups_docstring,
     _estimator_docstring,
     _features_to_drop_docstring,
     _fit_docstring,
@@ -45,6 +46,7 @@
     scoring=_scoring_docstring,
     threshold=_threshold_docstring,
     cv=_cv_docstring,
+    groups=_groups_docstring,
     variables=_variables_numerical_docstring,
     confirm_variables=_confirm_variables_docstring,
     initial_model_performance_=_initial_model_performance_docstring,
@@ -83,6 +85,8 @@ class SelectBySingleFeaturePerformance(BaseSelector):
 
     {cv}
 
+    {groups}
+
     {confirm_variables}
 
     Attributes
@@ -147,6 +151,7 @@ def __init__(
         estimator,
         scoring: str = "roc_auc",
         cv=3,
+        groups=None,
         threshold: Union[int, float, None] = None,
         variables: Variables = None,
         confirm_variables: bool = False,
@@ -177,6 +182,7 @@ def __init__(
         self.scoring = scoring
         self.threshold = threshold
         self.cv = cv
+        self.groups = groups
 
     def fit(self, X: pd.DataFrame, y: pd.Series):
         """
@@ -209,7 +215,13 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
             )
 
         self.feature_performance_, _ = single_feature_performance(
-            X, y, self.variables_, self.estimator, self.cv, self.scoring
+            X=X,
+            y=y,
+            variables=self.variables_,
+            estimator=self.estimator,
+            cv=self.cv,
+            groups=self.groups,
+            scoring=self.scoring,
         )
 
         # select features