Skip to content

Commit 815edae

Browse files
committed
h2o fixes
1 parent c89883d commit 815edae

File tree

3 files changed

+72
-22
lines changed

3 files changed

+72
-22
lines changed

ml_grid/model_classes/H2OBaseClassifier.py

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -146,7 +146,7 @@ def _validate_input_data(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -
146146
# If X is a numpy array, convert it to a DataFrame and ensure
147147
# its columns are strings to prevent KeyErrors with H2O.
148148
X = pd.DataFrame(X)
149-
X.columns = X.columns.astype(str)
149+
X.columns = [str(c) for c in X.columns]
150150
else:
151151
# If it's already a DataFrame, still ensure columns are strings.
152152
X.columns = X.columns.astype(str)
@@ -428,12 +428,21 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
428428
self._ensure_model_is_loaded()
429429

430430
# Create H2O frame with explicit column names
431+
# --- ROBUSTNESS FIX for java.lang.NullPointerException ---
432+
# Instead of creating the frame directly, upload the data and then assign it.
433+
# This seems to create a more 'stable' frame in the H2O cluster, preventing
434+
# internal errors during prediction with some models like GLM.
431435
try:
432-
# --- CRITICAL FIX: Enforce training-time column types ---
433-
# This prevents H2O from re-inferring types on the test data, which
434-
# can lead to "Operation not allowed on string vector" errors.
435-
test_h2o = h2o.H2OFrame(X, column_names=self.feature_names_,
436-
column_types=self.feature_types_)
436+
# Create a temporary H2OFrame by uploading the pandas DataFrame
437+
tmp_frame = h2o.H2OFrame(X, column_names=self.feature_names_, column_types=self.feature_types_)
438+
439+
# Assign it to a unique key in the H2O cluster. This is more reliable.
440+
frame_id = f"predict_frame_{self.model_id}_{pd.Timestamp.now().strftime('%Y%m%d%H%M%S%f')}"
441+
h2o.assign(tmp_frame, frame_id)
442+
443+
# Get a handle to the newly created frame
444+
test_h2o = h2o.get_frame(frame_id)
445+
437446
except Exception as e:
438447
raise RuntimeError(f"Failed to create H2O frame for prediction: {e}")
439448

ml_grid/model_classes/H2OGAMClassifier.py

Lines changed: 29 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -161,16 +161,36 @@ def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
161161
def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGAMClassifier":
162162
"""Fits the H2O GAM model, falling back to GLM if necessary."""
163163
# The base class fit will call our overridden _prepare_fit
164-
original_estimator_class = self.estimator_class
164+
# We need to explicitly call _prepare_fit here to set _fallback_to_glm
165+
# and get the processed parameters.
166+
167+
# --- CRITICAL FIX: Manually call validation ---
168+
# This ensures that if X is a numpy array, it's converted to a DataFrame
169+
# with string columns before being passed to _prepare_fit.
170+
X, y = self._validate_input_data(X, y)
171+
# Call our overridden _prepare_fit to determine fallback and get processed data/params.
172+
# This method will internally call super()._prepare_fit which handles validation,
173+
# setting classes_, feature_names_, feature_types_, and H2OFrame creation.
174+
train_h2o, x_vars, outcome_var, model_params = self._prepare_fit(X, y)
175+
176+
# Determine the actual H2O estimator class to use
165177
if self._fallback_to_glm:
166-
self.estimator_class = H2OGeneralizedLinearEstimator
167-
168-
try:
169-
super().fit(X, y, **kwargs)
170-
finally:
171-
# CRITICAL: Always restore the original estimator class
172-
self.estimator_class = original_estimator_class
173-
self.logger.debug(f"Restored self.estimator_class to {self.estimator_class.__name__}")
178+
self.logger.warning("H2OGAMClassifier.fit: Fallback to GLM triggered. Using H2OGeneralizedLinearEstimator.")
179+
h2o_estimator_to_use = H2OGeneralizedLinearEstimator
180+
else:
181+
h2o_estimator_to_use = self.estimator_class # This is H2OGeneralizedAdditiveEstimator
182+
183+
# Instantiate the H2O model with all the hyperparameters
184+
self.logger.debug(f"Creating H2O model ({h2o_estimator_to_use.__name__}) with params: {model_params}")
185+
self.model_ = h2o_estimator_to_use(**model_params)
186+
187+
# Call the train() method with ONLY the data-related arguments
188+
self.logger.debug("Calling H2O model.train()...")
189+
self.model_.train(x=x_vars, y=outcome_var, training_frame=train_h2o)
190+
191+
# Store model_id for recovery - THIS IS CRITICAL for predict() to work
192+
self.logger.debug(f"H2O train complete, extracting model_id from {self.model_}")
193+
self.model_id = self.model_.model_id
174194

175195
return self
176196

tests/test_h2o_base_classifier.py

Lines changed: 28 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -116,9 +116,12 @@ def test_fit_successful(mock_h2o_init, mock_h2o_cluster, mock_h2o_frame, classif
116116
assert classifier_instance.feature_types_ == {'feature1': 'real', 'feature2': 'real', 'feature3': 'enum'}
117117

118118
@patch('h2o.get_model')
119+
@patch('h2o.get_frame')
120+
@patch('h2o.assign')
119121
@patch('h2o.H2OFrame')
120122
@patch('h2o.cluster')
121-
def test_predict_successful(mock_h2o_cluster, mock_h2o_frame, mock_h2o_get_model, classifier_instance, sample_data):
123+
def test_predict_successful(mock_h2o_cluster, mock_h2o_frame, mock_h2o_assign, mock_h2o_get_frame, mock_h2o_get_model,
124+
classifier_instance, sample_data):
122125
"""
123126
Tests the `predict` method on a pre-fitted classifier.
124127
"""
@@ -132,12 +135,26 @@ def test_predict_successful(mock_h2o_cluster, mock_h2o_frame, mock_h2o_get_model
132135

133136
# --- Setup Mocks ---
134137
# Mock the H2OFrame that will be created from the input data
135-
mock_frame_instance = MagicMock()
136-
mock_frame_instance.nrows = len(X) # This is the crucial fix
137-
mock_h2o_frame.return_value = mock_frame_instance
138+
mock_tmp_frame = MagicMock(spec=h2o.H2OFrame)
139+
mock_h2o_frame.return_value = mock_tmp_frame
138140

141+
# Mock the final frame that get_frame will return
142+
mock_final_frame = MagicMock(spec=h2o.H2OFrame)
143+
mock_final_frame.nrows = len(X)
144+
mock_h2o_get_frame.return_value = mock_final_frame
139145
# Mock the model object that `h2o.get_model` will return
140-
mock_model = MockH2OEstimator()
146+
# --- FIX: Replace the real predict method with a MagicMock ---
147+
# Instantiate the mock estimator
148+
mock_model = MockH2OEstimator()
149+
# Create a mock for the predict method so we can assert calls
150+
mock_model.predict = MagicMock()
151+
# Configure the mock's return value to simulate H2O's behavior
152+
mock_pred_frame = MagicMock()
153+
mock_pred_frame.as_data_frame.return_value = pd.DataFrame({
154+
'predict': np.random.randint(0, 2, len(X))
155+
})
156+
mock_model.predict.return_value = mock_pred_frame
157+
141158
mock_h2o_get_model.return_value = mock_model
142159

143160
# Mock H2O cluster status
@@ -150,8 +167,12 @@ def test_predict_successful(mock_h2o_cluster, mock_h2o_frame, mock_h2o_get_model
150167
# 1. Check that the model was retrieved from H2O
151168
mock_h2o_get_model.assert_called_with("fitted_model_123")
152169

153-
# 2. Check that an H2OFrame was created for the prediction data with correct types
154-
mock_h2o_frame.assert_called_with(X, column_names=list(X.columns), column_types=classifier_instance.feature_types_)
170+
# 2. Check that the new frame creation logic was called
171+
mock_h2o_frame.assert_called_once_with(X, column_names=list(X.columns), column_types=classifier_instance.feature_types_)
172+
mock_h2o_assign.assert_called_once_with(mock_tmp_frame, ANY)
173+
mock_h2o_get_frame.assert_called_once()
174+
# Verify the model's predict method was called with the final mocked frame
175+
mock_model.predict.assert_called_once_with(mock_final_frame)
155176

156177
# 3. Check the output of the prediction
157178
assert isinstance(predictions, np.ndarray)

0 commit comments

Comments
 (0)