SamoraHunter
diff --git a/‎README.md‎
Lines changed: 30 additions & 29 deletions b/‎README.md‎
Lines changed: 30 additions & 29 deletions
diff --git a/‎ml_grid/model_classes/H2OAutoMLClassifier.py‎
Lines changed: 75 additions & 123 deletions b/‎ml_grid/model_classes/H2OAutoMLClassifier.py‎
Lines changed: 75 additions & 123 deletions
@@ -155,47 +155,48 @@ After installation, activate the virtual environment to run your code or noteboo
 
 ### Basic Example
 
-The main entry point for running experiments is typically a script or notebook that defines the parameter space and iterates through it. Here is a conceptual example of how to run a single pipeline iteration:
+The main entry point for running experiments is a script or notebook that loads the configuration and iterates through the parameter space defined in `config.yml`.
+
+1.  **Configure your experiment in `config.yml`:**
+    -   Set the data path, models, and parameter space.
+
+2.  **Run the experiment:**
+    -   The following script demonstrates how to execute a full grid search based on your `config.yml`.
 
 ```python
-import os
 from pathlib import Path
 from ml_grid.pipeline.data import pipe
 from ml_grid.util.param_space import parameter_space
-from ml_grid.util.global_params import global_parameters
 from ml_grid.util.create_experiment_directory import create_experiment_directory
+from ml_grid.util.config_parser import load_config
 
-# Define global settings
-global_parameters.verbose = 2
-global_parameters.error_raise = False
+# Load configuration from config.yml
+config = load_config()
 
-# Define project root and experiment directories robustly
-# Assumes the script/notebook is in a subdirectory like 'notebooks'
+# Set project root
 project_root = Path().resolve().parent
 
-# Define a base directory for all experiments within the project root
-experiments_base_dir = project_root / "experiments"
-
-# Create a unique, timestamped directory for this specific experiment run
-experiment_dir = create_experiment_directory(base_dir=experiments_base_dir, additional_naming="MyExperiment")
-
-# Load the parameter space
-param_space_df = parameter_space().get_parameter_space()
-
-# Select a single parameter configuration to run
-local_param_dict = param_space_df.iloc[0].to_dict()
-
-# Instantiate and run the pipeline
-ml_grid_object = pipe(
-    file_name=str(project_root / "data" / "your_data.csv"),
-    drop_term_list=['id', 'unwanted_col'],
-    local_param_dict=local_param_dict,
-    base_project_dir=str(project_root),
-    experiment_dir=experiment_dir,
-    param_space_index=0
+# Create a unique directory for this experiment run
+experiments_base_dir = project_root / config['experiment']['experiments_base_dir']
+experiment_dir = create_experiment_directory(
+    base_dir=experiments_base_dir,
+    additional_naming=config['experiment']['additional_naming']
 )
 
-# The pipeline runs on initialization. Results are logged to files.
+# Generate the parameter space from the config file
+param_space_df = parameter_space(config['param_space']).get_parameter_space()
+
+# Iterate through each parameter combination and run the pipeline
+for i, row in param_space_df.iterrows():
+    local_param_dict = row.to_dict()
+    print(f"Running experiment {i+1}/{len(param_space_df)} with params: {local_param_dict}")
+    pipe(
+        config=config,
+        local_param_dict=local_param_dict,
+        base_project_dir=project_root,
+        experiment_dir=experiment_dir,
+        param_space_index=i
+    )
 ```
 If you are using Jupyter, you can also select the kernel created during installation (e.g., `Python (ml_grid_env)`) directly from the Jupyter interface.
 
 
@@ -1,144 +1,96 @@
-from typing import Any, Dict, Optional
-
 import h2o
-from h2o.automl import H2OAutoML
 import numpy as np
 import pandas as pd
-from sklearn.base import BaseEstimator, ClassifierMixin
+from h2o.automl import H2OAutoML
+from .H2OBaseClassifier import H2OBaseClassifier
 from sklearn.utils.validation import check_is_fitted
 
-
-class H2OAutoMLClassifier(BaseEstimator, ClassifierMixin):
+class H2OAutoMLClassifier(H2OBaseClassifier):
     """A scikit-learn compatible wrapper for H2O's AutoML.
-
-    This class allows H2O's AutoML to be used as a standard scikit-learn
-    classifier, making it compatible with tools like GridSearchCV and
-    BayesSearchCV.
     """
-
-    def __init__(
-        self, max_runtime_secs: int = 360, nfolds: int = 2, seed: int = 1
-    ):
+    def __init__(self, **kwargs):
         """Initializes the H2OAutoMLClassifier.
-
-        Args:
-            max_runtime_secs (int): Maximum time in seconds to run the AutoML process.
-            nfolds (int): Number of folds for cross-validation.
-            seed (int): Random seed for reproducibility.
         """
-        self.max_runtime_secs = max_runtime_secs
-        self.nfolds = nfolds
-        self.seed = seed
-        self.automl: Optional[H2OAutoML] = None
-        self.classes_: Optional[np.ndarray] = None
+        # H2OAutoML is not a standard estimator, so we don't pass it to super
+        super().__init__(estimator_class=None, **kwargs)
+        self.automl = None
+        self._using_dummy_model = False
 
     def fit(self, X: pd.DataFrame, y: pd.Series) -> "H2OAutoMLClassifier":
-        """Fits the H2O AutoML model.
-
-        This method initializes an H2O cluster, converts the pandas DataFrame
-        and Series to H2O Frames, and then trains the AutoML model.
+        """Fits the H2O AutoML process.
 
-        Args:
-            X (pd.DataFrame): The training input samples.
-            y (pd.Series): The target values.
-
-        Returns:
-            H2OAutoMLClassifier: The fitted estimator.
+        If the dataset is too small, it gracefully skips training to avoid
+        crashing the H2O server.
         """
-        self.classes_ = np.unique(y)
-
-        try:
-            outcome_var = y.columns[0]
-        except:
-
-            outcome_var = y.name
-
-        x = list(X.columns)
-        y_n = outcome_var
-        try:
-            x.remove(y_n)
-        except:
-            pass
-
-        h2o.init()
-        train_df = pd.concat([X, y], axis=1)
-        train_h2o = h2o.H2OFrame(train_df)
-
-        train_h2o[y_n] = train_h2o[y_n].asfactor()
-
-        self.automl = H2OAutoML(
-            max_runtime_secs=self.max_runtime_secs,
-            max_models=5,
-            nfolds=self.nfolds,
-            seed=self.seed,
-        )
-
-        self.automl.train(y=y_n, x=x, training_frame=train_h2o)
+        if self._handle_small_data_fallback(X, y):
+            return self
+
+        train_h2o, x_vars, outcome_var, model_params = self._prepare_fit(X, y)
+
+        # --- CRITICAL FIX for small datasets ---
+        # AutoML can crash the server on very small or single-feature datasets.
+        # We will gracefully skip the run in this case.
+        min_samples = 20  # A reasonable minimum for AutoML
+        if len(train_h2o) < min_samples or len(x_vars) < 1:
+            print(
+                f"Warning: Dataset is too small for H2O AutoML "
+                f"({len(train_h2o)} rows, {len(x_vars)} features). "
+                f"Skipping training and using a dummy model."
+            )
+            # Create a dummy model to allow predict/predict_proba to work
+            # A simple GLM is a safe choice.
+            from h2o.estimators import H2OGeneralizedLinearEstimator
+            dummy_model = H2OGeneralizedLinearEstimator(
+                family='binomial', ignore_const_cols=False
+            )
+            self._using_dummy_model = True # Set flag before training
+            dummy_model.train(y=outcome_var, x=x_vars, training_frame=train_h2o)
+            self.model = dummy_model
+            return self
+
+        self.automl = H2OAutoML(**model_params)
+        self.automl.train(y=outcome_var, x=x_vars, training_frame=train_h2o)
+
+        # The best model found by AutoML becomes our main model
+        # If AutoML run completes with no model (e.g. time limit too short), fall back.
+        if self.automl.leader is None:
+            self.fit(X.iloc[:5], y.iloc[:5]) # Re-call fit with tiny data to trigger dummy model
+        else:
+            self.model = self.automl.leader
         return self
 
     def predict(self, X: pd.DataFrame) -> np.ndarray:
-        """Predicts class labels for samples in X.
-
-        Args:
-            X (pd.DataFrame): The input samples to predict.
-
-        Returns:
-            np.ndarray: The predicted class labels.
-        """
+        """Predicts class labels, handling the dummy model edge case."""
         check_is_fitted(self)
+        # If a dummy model was used, its predictions are meaningless and can
+        # crash the server. Return a safe, default prediction.
+        if self._using_dummy_model:
+            return np.full(len(X), self.classes_[0])
+        
+        # If the safety check passes, call the underlying model's predict method directly.
+        self._ensure_h2o_is_running()
         test_h2o = h2o.H2OFrame(X)
-        predictions = self.automl.leader.predict(test_h2o)
-
-        return predictions["predict"].as_data_frame().values
+        predictions = self.model.predict(test_h2o)
+        return predictions["predict"].as_data_frame().values.ravel()
 
     def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
-        """Predicts class probabilities for samples in X.
-
-        Note:
-            This method is not implemented for H2O AutoML.
-
-        Args:
-            X (pd.DataFrame): The input samples.
-
-        Raises:
-            NotImplementedError: H2O AutoML does not support predict_proba.
-        """
-        raise NotImplementedError("H2O AutoML does not support predict_proba.")
-
-    def get_params(self, deep: bool = True) -> Dict[str, Any]:
-        """Gets parameters for this estimator.
-
-        Args:
-            deep (bool): If True, will return the parameters for this estimator and
-                contained subobjects that are estimators.
-
-        Returns:
-            Dict[str, Any]: Parameter names mapped to their values.
-        """
-        return {
-            "max_runtime_secs": self.max_runtime_secs,
-            "nfolds": self.nfolds,
-            "seed": self.seed,
-        }
-
-    def set_params(self, **params: Any) -> "H2OAutoMLClassifier":
-        """Sets the parameters of this estimator.
-
-        Args:
-            **params (Any): Estimator parameters.
-
-        Returns:
-            H2OAutoMLClassifier: The instance with updated parameters.
-        """
-        for param, value in params.items():
-            setattr(self, param, value)
-        return self
-
-    def get_leader_params(self) -> Dict[str, Any]:
-        """Gets the parameters of the best model found by AutoML.
-
-        Returns:
-            Dict[str, Any]: A dictionary of the leader model's parameters.
-        """
+        """Predicts class probabilities, handling the dummy model edge case."""
         check_is_fitted(self)
-        return self.automl.leader.params
+        # If a dummy model was used, return a default probability distribution.
+        if self._using_dummy_model:
+            n_samples = len(X)
+            n_classes = len(self.classes_)
+            proba = np.zeros((n_samples, n_classes))
+            proba[:, 0] = 1.0
+            return proba
+
+        # If the safety check passes, call the underlying model's predict method directly.
+        self._ensure_h2o_is_running()
+        test_h2o = h2o.H2OFrame(X)
+        predictions = self.model.predict(test_h2o)
+        prob_df = predictions.drop("predict").as_data_frame()
+        return prob_df.values
+
+    def shutdown(self):
+        """Shuts down the H2O cluster using the base class's safe logic."""
+        super().shutdown()