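h2o fixes: stringify array column names, assign predict frame to a unique key, train GAM/GLM fallback directly in fit, update predict test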

SamoraHunter · SamoraHunter · commit 815edaeb3b37 · 2025-10-29T15:41:43.000Z
diff --git a/ml_grid/model_classes/H2OBaseClassifier.py b/ml_grid/model_classes/H2OBaseClassifier.py
@@ -146,7 +146,7 @@ def _validate_input_data(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -
                 # If X is a numpy array, convert it to a DataFrame and ensure
                 # its columns are strings to prevent KeyErrors with H2O.
                 X = pd.DataFrame(X)
-                X.columns = X.columns.astype(str)
+                X.columns = [str(c) for c in X.columns]
         else:
             # If it's already a DataFrame, still ensure columns are strings.
             X.columns = X.columns.astype(str)
@@ -428,12 +428,21 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
         self._ensure_model_is_loaded()
 
         # Create H2O frame with explicit column names
+        # --- ROBUSTNESS FIX for java.lang.NullPointerException ---
+        # Instead of creating the frame directly, upload the data and then assign it.
+        # This seems to create a more 'stable' frame in the H2O cluster, preventing
+        # internal errors during prediction with some models like GLM.
         try:
-            # --- CRITICAL FIX: Enforce training-time column types ---
-            # This prevents H2O from re-inferring types on the test data, which
-            # can lead to "Operation not allowed on string vector" errors.
-            test_h2o = h2o.H2OFrame(X, column_names=self.feature_names_,
-                                    column_types=self.feature_types_)
+            # Create a temporary H2OFrame by uploading the pandas DataFrame
+            tmp_frame = h2o.H2OFrame(X, column_names=self.feature_names_, column_types=self.feature_types_)
+            
+            # Assign it to a unique key in the H2O cluster. This is more reliable.
+            frame_id = f"predict_frame_{self.model_id}_{pd.Timestamp.now().strftime('%Y%m%d%H%M%S%f')}"
+            h2o.assign(tmp_frame, frame_id)
+            
+            # Get a handle to the newly created frame
+            test_h2o = h2o.get_frame(frame_id)
+
         except Exception as e:
             raise RuntimeError(f"Failed to create H2O frame for prediction: {e}")
         
diff --git a/ml_grid/model_classes/H2OGAMClassifier.py b/ml_grid/model_classes/H2OGAMClassifier.py
@@ -161,16 +161,36 @@ def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGAMClassifier":
         """Fits the H2O GAM model, falling back to GLM if necessary."""
         # The base class fit will call our overridden _prepare_fit
-        original_estimator_class = self.estimator_class
+        # We need to explicitly call _prepare_fit here to set _fallback_to_glm
+        # and get the processed parameters.
+        
+        # --- CRITICAL FIX: Manually call validation ---
+        # This ensures that if X is a numpy array, it's converted to a DataFrame
+        # with string columns before being passed to _prepare_fit.
+        X, y = self._validate_input_data(X, y)
+        # Call our overridden _prepare_fit to determine fallback and get processed data/params.
+        # This method will internally call super()._prepare_fit which handles validation,
+        # setting classes_, feature_names_, feature_types_, and H2OFrame creation.
+        train_h2o, x_vars, outcome_var, model_params = self._prepare_fit(X, y)
+
+        # Determine the actual H2O estimator class to use
         if self._fallback_to_glm:
-            self.estimator_class = H2OGeneralizedLinearEstimator
-
-        try:
-            super().fit(X, y, **kwargs)
-        finally:
-            # CRITICAL: Always restore the original estimator class
-            self.estimator_class = original_estimator_class
-            self.logger.debug(f"Restored self.estimator_class to {self.estimator_class.__name__}")
+            self.logger.warning("H2OGAMClassifier.fit: Fallback to GLM triggered. Using H2OGeneralizedLinearEstimator.")
+            h2o_estimator_to_use = H2OGeneralizedLinearEstimator
+        else:
+            h2o_estimator_to_use = self.estimator_class # This is H2OGeneralizedAdditiveEstimator
+
+        # Instantiate the H2O model with all the hyperparameters
+        self.logger.debug(f"Creating H2O model ({h2o_estimator_to_use.__name__}) with params: {model_params}")
+        self.model_ = h2o_estimator_to_use(**model_params)
+        
+        # Call the train() method with ONLY the data-related arguments
+        self.logger.debug("Calling H2O model.train()...")
+        self.model_.train(x=x_vars, y=outcome_var, training_frame=train_h2o)
+
+        # Store model_id for recovery - THIS IS CRITICAL for predict() to work
+        self.logger.debug(f"H2O train complete, extracting model_id from {self.model_}")
+        self.model_id = self.model_.model_id
 
         return self
 
diff --git a/tests/test_h2o_base_classifier.py b/tests/test_h2o_base_classifier.py
@@ -116,9 +116,12 @@ def test_fit_successful(mock_h2o_init, mock_h2o_cluster, mock_h2o_frame, classif
     assert classifier_instance.feature_types_ == {'feature1': 'real', 'feature2': 'real', 'feature3': 'enum'}
 
 @patch('h2o.get_model')
+@patch('h2o.get_frame')
+@patch('h2o.assign')
 @patch('h2o.H2OFrame')
 @patch('h2o.cluster')
-def test_predict_successful(mock_h2o_cluster, mock_h2o_frame, mock_h2o_get_model, classifier_instance, sample_data):
+def test_predict_successful(mock_h2o_cluster, mock_h2o_frame, mock_h2o_assign, mock_h2o_get_frame, mock_h2o_get_model,
+                          classifier_instance, sample_data):
     """
     Tests the `predict` method on a pre-fitted classifier.
     """
@@ -132,12 +135,26 @@ def test_predict_successful(mock_h2o_cluster, mock_h2o_frame, mock_h2o_get_model
     
     # --- Setup Mocks ---
     # Mock the H2OFrame that will be created from the input data
-    mock_frame_instance = MagicMock()
-    mock_frame_instance.nrows = len(X) # This is the crucial fix
-    mock_h2o_frame.return_value = mock_frame_instance
+    mock_tmp_frame = MagicMock(spec=h2o.H2OFrame)
+    mock_h2o_frame.return_value = mock_tmp_frame
 
+    # Mock the final frame that get_frame will return
+    mock_final_frame = MagicMock(spec=h2o.H2OFrame)
+    mock_final_frame.nrows = len(X)
+    mock_h2o_get_frame.return_value = mock_final_frame
     # Mock the model object that `h2o.get_model` will return
-    mock_model = MockH2OEstimator()
+    # --- FIX: Replace the real predict method with a MagicMock ---
+    # Instantiate the mock estimator
+    mock_model = MockH2OEstimator() 
+    # Create a mock for the predict method so we can assert calls
+    mock_model.predict = MagicMock()
+    # Configure the mock's return value to simulate H2O's behavior
+    mock_pred_frame = MagicMock()
+    mock_pred_frame.as_data_frame.return_value = pd.DataFrame({
+        'predict': np.random.randint(0, 2, len(X))
+    })
+    mock_model.predict.return_value = mock_pred_frame
+
     mock_h2o_get_model.return_value = mock_model
     
     # Mock H2O cluster status
@@ -150,8 +167,12 @@ def test_predict_successful(mock_h2o_cluster, mock_h2o_frame, mock_h2o_get_model
     # 1. Check that the model was retrieved from H2O
     mock_h2o_get_model.assert_called_with("fitted_model_123")
     
-    # 2. Check that an H2OFrame was created for the prediction data with correct types
-    mock_h2o_frame.assert_called_with(X, column_names=list(X.columns), column_types=classifier_instance.feature_types_)
+    # 2. Check that the new frame creation logic was called
+    mock_h2o_frame.assert_called_once_with(X, column_names=list(X.columns), column_types=classifier_instance.feature_types_)
+    mock_h2o_assign.assert_called_once_with(mock_tmp_frame, ANY)
+    mock_h2o_get_frame.assert_called_once()
+    # Verify the model's predict method was called with the final mocked frame
+    mock_model.predict.assert_called_once_with(mock_final_frame)
 
     # 3. Check the output of the prediction
     assert isinstance(predictions, np.ndarray)