Skip to content

Commit 52afe7f

Browse files
committed
Initial individual h2o model integration. Associated unit test. Related integrations. Initial consolidation of parameters to utilise config.yml.
1 parent a05143f commit 52afe7f

34 files changed: +1902 / -531 lines changed

README.md

Lines changed: 30 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -155,47 +155,48 @@ After installation, activate the virtual environment to run your code or noteboo
155155

156156
### Basic Example
157157

158-
The main entry point for running experiments is typically a script or notebook that defines the parameter space and iterates through it. Here is a conceptual example of how to run a single pipeline iteration:
158+
The main entry point for running experiments is a script or notebook that loads the configuration and iterates through the parameter space defined in `config.yml`.
159+
160+
1. **Configure your experiment in `config.yml`:**
161+
- Set the data path, models, and parameter space.
162+
163+
2. **Run the experiment:**
164+
- The following script demonstrates how to execute a full grid search based on your `config.yml`.
159165

160166
```python
161-
import os
162167
from pathlib import Path
163168
from ml_grid.pipeline.data import pipe
164169
from ml_grid.util.param_space import parameter_space
165-
from ml_grid.util.global_params import global_parameters
166170
from ml_grid.util.create_experiment_directory import create_experiment_directory
171+
from ml_grid.util.config_parser import load_config
167172
168-
# Define global settings
169-
global_parameters.verbose = 2
170-
global_parameters.error_raise = False
173+
# Load configuration from config.yml
174+
config = load_config()
171175
172-
# Define project root and experiment directories robustly
173-
# Assumes the script/notebook is in a subdirectory like 'notebooks'
176+
# Set project root
174177
project_root = Path().resolve().parent
175178
176-
# Define a base directory for all experiments within the project root
177-
experiments_base_dir = project_root / "experiments"
178-
179-
# Create a unique, timestamped directory for this specific experiment run
180-
experiment_dir = create_experiment_directory(base_dir=experiments_base_dir, additional_naming="MyExperiment")
181-
182-
# Load the parameter space
183-
param_space_df = parameter_space().get_parameter_space()
184-
185-
# Select a single parameter configuration to run
186-
local_param_dict = param_space_df.iloc[0].to_dict()
187-
188-
# Instantiate and run the pipeline
189-
ml_grid_object = pipe(
190-
file_name=str(project_root / "data" / "your_data.csv"),
191-
drop_term_list=['id', 'unwanted_col'],
192-
local_param_dict=local_param_dict,
193-
base_project_dir=str(project_root),
194-
experiment_dir=experiment_dir,
195-
param_space_index=0
179+
# Create a unique directory for this experiment run
180+
experiments_base_dir = project_root / config['experiment']['experiments_base_dir']
181+
experiment_dir = create_experiment_directory(
182+
base_dir=experiments_base_dir,
183+
additional_naming=config['experiment']['additional_naming']
196184
)
197185
198-
# The pipeline runs on initialization. Results are logged to files.
186+
# Generate the parameter space from the config file
187+
param_space_df = parameter_space(config['param_space']).get_parameter_space()
188+
189+
# Iterate through each parameter combination and run the pipeline
190+
for i, row in param_space_df.iterrows():
191+
local_param_dict = row.to_dict()
192+
print(f"Running experiment {i+1}/{len(param_space_df)} with params: {local_param_dict}")
193+
pipe(
194+
config=config,
195+
local_param_dict=local_param_dict,
196+
base_project_dir=project_root,
197+
experiment_dir=experiment_dir,
198+
param_space_index=i
199+
)
199200
```
200201
If you are using Jupyter, you can also select the kernel created during installation (e.g., `Python (ml_grid_env)`) directly from the Jupyter interface.
201202
Lines changed: 75 additions & 123 deletions
Original file line number | Diff line number | Diff line change
@@ -1,144 +1,96 @@
1-
from typing import Any, Dict, Optional
2-
31
import h2o
4-
from h2o.automl import H2OAutoML
52
import numpy as np
63
import pandas as pd
7-
from sklearn.base import BaseEstimator, ClassifierMixin
4+
from h2o.automl import H2OAutoML
5+
from .H2OBaseClassifier import H2OBaseClassifier
86
from sklearn.utils.validation import check_is_fitted
97

10-
11-
class H2OAutoMLClassifier(BaseEstimator, ClassifierMixin):
8+
class H2OAutoMLClassifier(H2OBaseClassifier):
129
"""A scikit-learn compatible wrapper for H2O's AutoML.
13-
14-
This class allows H2O's AutoML to be used as a standard scikit-learn
15-
classifier, making it compatible with tools like GridSearchCV and
16-
BayesSearchCV.
1710
"""
18-
19-
def __init__(
20-
self, max_runtime_secs: int = 360, nfolds: int = 2, seed: int = 1
21-
):
11+
def __init__(self, **kwargs):
2212
"""Initializes the H2OAutoMLClassifier.
23-
24-
Args:
25-
max_runtime_secs (int): Maximum time in seconds to run the AutoML process.
26-
nfolds (int): Number of folds for cross-validation.
27-
seed (int): Random seed for reproducibility.
2813
"""
29-
self.max_runtime_secs = max_runtime_secs
30-
self.nfolds = nfolds
31-
self.seed = seed
32-
self.automl: Optional[H2OAutoML] = None
33-
self.classes_: Optional[np.ndarray] = None
14+
# H2OAutoML is not a standard estimator, so we don't pass it to super
15+
super().__init__(estimator_class=None, **kwargs)
16+
self.automl = None
17+
self._using_dummy_model = False
3418

3519
def fit(self, X: pd.DataFrame, y: pd.Series) -> "H2OAutoMLClassifier":
36-
"""Fits the H2O AutoML model.
37-
38-
This method initializes an H2O cluster, converts the pandas DataFrame
39-
and Series to H2O Frames, and then trains the AutoML model.
20+
"""Fits the H2O AutoML process.
4021
41-
Args:
42-
X (pd.DataFrame): The training input samples.
43-
y (pd.Series): The target values.
44-
45-
Returns:
46-
H2OAutoMLClassifier: The fitted estimator.
22+
If the dataset is too small, it gracefully skips training to avoid
23+
crashing the H2O server.
4724
"""
48-
self.classes_ = np.unique(y)
49-
50-
try:
51-
outcome_var = y.columns[0]
52-
except:
53-
54-
outcome_var = y.name
55-
56-
x = list(X.columns)
57-
y_n = outcome_var
58-
try:
59-
x.remove(y_n)
60-
except:
61-
pass
62-
63-
h2o.init()
64-
train_df = pd.concat([X, y], axis=1)
65-
train_h2o = h2o.H2OFrame(train_df)
66-
67-
train_h2o[y_n] = train_h2o[y_n].asfactor()
68-
69-
self.automl = H2OAutoML(
70-
max_runtime_secs=self.max_runtime_secs,
71-
max_models=5,
72-
nfolds=self.nfolds,
73-
seed=self.seed,
74-
)
75-
76-
self.automl.train(y=y_n, x=x, training_frame=train_h2o)
25+
if self._handle_small_data_fallback(X, y):
26+
return self
27+
28+
train_h2o, x_vars, outcome_var, model_params = self._prepare_fit(X, y)
29+
30+
# --- CRITICAL FIX for small datasets ---
31+
# AutoML can crash the server on very small or single-feature datasets.
32+
# We will gracefully skip the run in this case.
33+
min_samples = 20 # A reasonable minimum for AutoML
34+
if len(train_h2o) < min_samples or len(x_vars) < 1:
35+
print(
36+
f"Warning: Dataset is too small for H2O AutoML "
37+
f"({len(train_h2o)} rows, {len(x_vars)} features). "
38+
f"Skipping training and using a dummy model."
39+
)
40+
# Create a dummy model to allow predict/predict_proba to work
41+
# A simple GLM is a safe choice.
42+
from h2o.estimators import H2OGeneralizedLinearEstimator
43+
dummy_model = H2OGeneralizedLinearEstimator(
44+
family='binomial', ignore_const_cols=False
45+
)
46+
self._using_dummy_model = True # Set flag before training
47+
dummy_model.train(y=outcome_var, x=x_vars, training_frame=train_h2o)
48+
self.model = dummy_model
49+
return self
50+
51+
self.automl = H2OAutoML(**model_params)
52+
self.automl.train(y=outcome_var, x=x_vars, training_frame=train_h2o)
53+
54+
# The best model found by AutoML becomes our main model
55+
# If AutoML run completes with no model (e.g. time limit too short), fall back.
56+
if self.automl.leader is None:
57+
self.fit(X.iloc[:5], y.iloc[:5]) # Re-call fit with tiny data to trigger dummy model
58+
else:
59+
self.model = self.automl.leader
7760
return self
7861

7962
def predict(self, X: pd.DataFrame) -> np.ndarray:
80-
"""Predicts class labels for samples in X.
81-
82-
Args:
83-
X (pd.DataFrame): The input samples to predict.
84-
85-
Returns:
86-
np.ndarray: The predicted class labels.
87-
"""
63+
"""Predicts class labels, handling the dummy model edge case."""
8864
check_is_fitted(self)
65+
# If a dummy model was used, its predictions are meaningless and can
66+
# crash the server. Return a safe, default prediction.
67+
if self._using_dummy_model:
68+
return np.full(len(X), self.classes_[0])
69+
70+
# If the safety check passes, call the underlying model's predict method directly.
71+
self._ensure_h2o_is_running()
8972
test_h2o = h2o.H2OFrame(X)
90-
predictions = self.automl.leader.predict(test_h2o)
91-
92-
return predictions["predict"].as_data_frame().values
73+
predictions = self.model.predict(test_h2o)
74+
return predictions["predict"].as_data_frame().values.ravel()
9375

9476
def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
95-
"""Predicts class probabilities for samples in X.
96-
97-
Note:
98-
This method is not implemented for H2O AutoML.
99-
100-
Args:
101-
X (pd.DataFrame): The input samples.
102-
103-
Raises:
104-
NotImplementedError: H2O AutoML does not support predict_proba.
105-
"""
106-
raise NotImplementedError("H2O AutoML does not support predict_proba.")
107-
108-
def get_params(self, deep: bool = True) -> Dict[str, Any]:
109-
"""Gets parameters for this estimator.
110-
111-
Args:
112-
deep (bool): If True, will return the parameters for this estimator and
113-
contained subobjects that are estimators.
114-
115-
Returns:
116-
Dict[str, Any]: Parameter names mapped to their values.
117-
"""
118-
return {
119-
"max_runtime_secs": self.max_runtime_secs,
120-
"nfolds": self.nfolds,
121-
"seed": self.seed,
122-
}
123-
124-
def set_params(self, **params: Any) -> "H2OAutoMLClassifier":
125-
"""Sets the parameters of this estimator.
126-
127-
Args:
128-
**params (Any): Estimator parameters.
129-
130-
Returns:
131-
H2OAutoMLClassifier: The instance with updated parameters.
132-
"""
133-
for param, value in params.items():
134-
setattr(self, param, value)
135-
return self
136-
137-
def get_leader_params(self) -> Dict[str, Any]:
138-
"""Gets the parameters of the best model found by AutoML.
139-
140-
Returns:
141-
Dict[str, Any]: A dictionary of the leader model's parameters.
142-
"""
77+
"""Predicts class probabilities, handling the dummy model edge case."""
14378
check_is_fitted(self)
144-
return self.automl.leader.params
79+
# If a dummy model was used, return a default probability distribution.
80+
if self._using_dummy_model:
81+
n_samples = len(X)
82+
n_classes = len(self.classes_)
83+
proba = np.zeros((n_samples, n_classes))
84+
proba[:, 0] = 1.0
85+
return proba
86+
87+
# If the safety check passes, call the underlying model's predict method directly.
88+
self._ensure_h2o_is_running()
89+
test_h2o = h2o.H2OFrame(X)
90+
predictions = self.model.predict(test_h2o)
91+
prob_df = predictions.drop("predict").as_data_frame()
92+
return prob_df.values
93+
94+
def shutdown(self):
95+
"""Shuts down the H2O cluster using the base class's safe logic."""
96+
super().shutdown()

0 commit comments

Comments
 (0)