SamoraHunter
diff --git a/ml_grid/model_classes/H2OBaseClassifier.py b/ml_grid/model_classes/H2OBaseClassifier.py
Lines changed: 35 additions & 22 deletions
diff --git a/ml_grid/model_classes/H2OGAMClassifier.py b/ml_grid/model_classes/H2OGAMClassifier.py
Lines changed: 9 additions & 1 deletion
diff --git a/ml_grid/model_classes/H2OGLMClassifier.py b/ml_grid/model_classes/H2OGLMClassifier.py
Lines changed: 26 additions & 1 deletion
diff --git a/tests/conftest.py b/tests/conftest.py
Lines changed: 66 additions & 20 deletions
@@ -88,8 +88,8 @@ def __init__(self, estimator_class=None, **kwargs):
     def __del__(self):
         """Cleans up the shared checkpoint directory if this is the last instance."""
         # This is a best-effort cleanup. In multi-process scenarios,
-        # the directory might be in use by other processes.
-        if os.path.exists(self._checkpoint_dir) and not os.listdir(self._checkpoint_dir):
+        # the directory might be in use by other processes. __del__ can also run on an instance whose __init__ never set _checkpoint_dir, hence the hasattr guard.
+        if hasattr(self, '_checkpoint_dir') and os.path.exists(self._checkpoint_dir) and not os.listdir(self._checkpoint_dir):
             shutil.rmtree(self._checkpoint_dir, ignore_errors=True)
             logger.debug(f"Cleaned up empty shared checkpoint directory: {self._checkpoint_dir}")
 
@@ -132,7 +132,7 @@ def _validate_input_data(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -
         Raises:
             ValueError: If data is invalid
         """
-        # Convert to DataFrame if needed
+        # Convert to DataFrame if needed and ensure columns are strings
         if not isinstance(X, pd.DataFrame):
             if self.feature_names_ is not None:
                 X = pd.DataFrame(X, columns=self.feature_names_)
@@ -142,8 +142,14 @@ def _validate_input_data(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -
                         f"Input data (X) has {X.shape[1]} columns, but expected {len(self.feature_names_)} "
                         f"based on training features. Please ensure column count matches."
                     ) # This was the syntax error fix
-            else: # This else block should be aligned with the outer 'if self.feature_names_ is not None:'
+            else:
+                # If X is a numpy array, convert it to a DataFrame and ensure
+                # its columns are strings to prevent KeyErrors with H2O.
                 X = pd.DataFrame(X)
+                X.columns = X.columns.astype(str)
+        else:
+            # If it's already a DataFrame, still ensure columns are strings.
+            X.columns = X.columns.astype(str)
 
         # Reset index to avoid sklearn CV indexing issues
         # CRITICAL: If we reset X, we MUST also reset y to maintain alignment.
@@ -272,8 +278,9 @@ def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
                 model_params.setdefault('ignore_const_cols', False)
 
         # --- ROBUSTNESS FIX: Save checkpoints for model recovery ---
-        # Unconditionally add checkpoint directory. All H2O estimators support this.
-        model_params["export_checkpoints_dir"] = self._checkpoint_dir
+        # Conditionally add checkpoint directory, as not all estimators (e.g., RuleFit) support it.
+        if 'export_checkpoints_dir' in estimator_params:
+            model_params["export_checkpoints_dir"] = self._checkpoint_dir
 
         return train_h2o, x_vars, outcome_var, model_params
 
@@ -294,6 +301,15 @@ def _get_model_params(self) -> Dict[str, Any]:
             if key in valid_param_keys
         }
 
+        # --- FIX for H2OTypeError (e.g., max_depth, sample_rate, learn_rate) ---
+        # Scikit-learn's ParameterGrid/RandomizedSearchCV can pass single-element numpy arrays or lists.
+        # H2O expects native Python types (int, float), so we convert them.
+        for key, value in model_params.items():
+            if isinstance(value, np.ndarray) and value.size == 1:
+                model_params[key] = value.item()
+            elif isinstance(value, list) and len(value) == 1:
+                model_params[key] = value[0]
+
         return model_params
 
     def _handle_small_data_fallback(self, X: pd.DataFrame, y: pd.Series) -> bool:
@@ -575,31 +591,28 @@ def __sklearn_clone__(self):
         self.logger.debug(f"__sklearn_clone__ called: original instance {id(self)}, clone instance {id(cloned)}")
         return cloned # Removing dead code
 
-    @classmethod
-    def _get_param_names(cls):
+    def _get_param_names(self):
         """Get parameter names for the estimator.
         
         This override is necessary because we use **kwargs in __init__.
+        Instance method (not a classmethod) so it can read self.__dict__; NOTE(review): confirm nothing calls it on the class.
         
         CRITICAL: This should ONLY return parameter names, NOT fitted attribute names.
         """
-        init_signature = inspect.signature(cls.__init__)
+        init_signature = inspect.signature(self.__class__.__init__)
         init_params = [p.name for p in init_signature.parameters.values() 
                       if p.name not in ('self', 'args', 'kwargs')]
 
-        # For instances, also include kwargs that were set
-        if not isinstance(cls, type):
-            extra_params = [
-                key for key in cls.__dict__
-                if not key.startswith('_')  # Exclude private attributes
-                and not key.endswith('_')    # CRITICAL: Exclude fitted attributes
-                and key not in init_params
-                and key not in ['estimator_class', 'logger']  # Exclude special attributes
-                and key not in ['model', 'model_', 'classes_', 'feature_names_', 'model_id']  # Exclude fitted
-            ]
-            return sorted(init_params + extra_params)
-        
-        return sorted(init_params)
+        extra_params = [
+            key for key in self.__dict__
+            if not key.startswith('_')
+            and not (key.endswith('_') and key != 'lambda_') # drop names ending in '_' (fitted attributes), except the 'lambda_' parameter
+            and key not in init_params
+            and key not in ['estimator_class', 'logger']
+            and key not in ['model', 'model_', 'classes_', 'feature_names_', 'model_id']
+        ]
+        
+        return sorted(init_params + extra_params)
 
     def set_params(self: "H2OBaseClassifier", **kwargs: Any) -> "H2OBaseClassifier":
         """Sets the parameters of this estimator.
 
@@ -37,6 +37,8 @@ def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
         self._fallback_to_glm = False # Reset flag
 
         # --- 1. Parameter Preprocessing for GAM ---
+        self.logger.debug(f"DEBUG: Before GAM column processing, model_params['gam_columns'] type: {type(model_params.get('gam_columns'))}, value: {model_params.get('gam_columns')}")
+
         if 'gam_columns' not in model_params or not model_params['gam_columns']:
              self.logger.warning("H2OGAMClassifier: 'gam_columns' not provided or empty. Defaulting to all numerical features.")
              numeric_cols = [col for col in x_vars if train_h2o[col].types[col] in ['int', 'real']]
@@ -46,6 +48,12 @@ def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
              model_params['gam_columns'] = [model_params['gam_columns']]
         elif isinstance(model_params['gam_columns'], tuple):
              model_params['gam_columns'] = list(model_params['gam_columns'])
+        # --- FIX for TypeError: object of type 'int' has no len() ---
+        elif isinstance(model_params['gam_columns'], int):
+             # If an integer is passed (e.g., from a hyperparameter search),
+             # convert it to a list containing the column name as a string.
+             # H2O expects column names to be strings.
+             model_params['gam_columns'] = [str(model_params['gam_columns'])]
         elif isinstance(model_params['gam_columns'], list) and model_params['gam_columns'] and isinstance(model_params['gam_columns'][0], list):
              model_params['gam_columns'] = [item for sublist in model_params['gam_columns'] for item in sublist]
 
@@ -127,7 +135,7 @@ def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
                 suitable_gam_cols.append(col)
                 suitable_knots.append(required_knots)
                 if i < len(bs_list): suitable_bs.append(bs_list[i])
-                if i < len(scale_list): suitable_scale.append(scale_list[i])
+                if scale_list and i < len(scale_list): suitable_scale.append(scale_list[i])
 
             if not suitable_gam_cols:
                 self.logger.warning("No suitable GAM columns found after checking cardinality. Falling back to GLM.")
 
@@ -1,5 +1,7 @@
 from h2o.estimators import H2OGeneralizedLinearEstimator
 from .H2OBaseClassifier import H2OBaseClassifier
+import pandas as pd
+from typing import Any, Dict
 
 class H2OGLMClassifier(H2OBaseClassifier):
     """A scikit-learn compatible wrapper for H2O's Generalized Linear Models.
@@ -10,7 +12,30 @@ def __init__(self, **kwargs):
         All keyword arguments are passed directly to the H2OGeneralizedLinearEstimator.
         Example args: family='binomial', alpha=0.5
         """
+        # --- FIX for scikit-learn cloning and H2O's 'lambda' parameter ---
+        # scikit-learn's get_params() will return 'lambda_', but the user might
+        # provide 'lambda' in the parameter grid. We must handle both cases.
+        if 'lambda' in kwargs and 'lambda_' not in kwargs:
+            kwargs['lambda_'] = kwargs.pop('lambda')
+            
         # Remove estimator_class from kwargs if present (happens during sklearn clone)
         kwargs.pop('estimator_class', None)
         # Pass the specific estimator class
-        super().__init__(estimator_class=H2OGeneralizedLinearEstimator, **kwargs)
+        super().__init__(estimator_class=H2OGeneralizedLinearEstimator, **kwargs)
+
+    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGLMClassifier":
+        """Fit the GLM via the parent class, then rename 'lambda_' to 'lambda'.
+
+        NOTE(review): h2o-py appears to rebuild ``model.params`` on every access;
+        if so the rename below never reaches the model - confirm against h2o docs.
+        """
+        # Call the parent class's fit method to perform the actual training
+        super().fit(X, y, **kwargs)
+
+        # Read params once (so the check and the rename hit the same dict) and
+        # tolerate a parent fit that finished without setting model_.
+        model = getattr(self, 'model_', None)
+        params = model.params if model else None
+        if params and 'lambda_' in params:
+            params['lambda'] = params.pop('lambda_')
+        return self
@@ -1,30 +1,76 @@
-"""
-Pytest configuration file for shared fixtures.
-
-This file makes fixtures available to all test files in this directory
-and its subdirectories without needing to import them.
-"""
+# tests/conftest.py
 
 import pytest
-import pandas as pd
-import numpy as np
 import h2o
-
-# Add the project root directory to the Python path
-import sys
+import logging
 import os
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+# --- Tame TensorFlow ---
+# Set log level to suppress info/warnings before importing
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+try:
+    import tensorflow as tf
+except ImportError:
+    print("\n--- [Fixture Config] TensorFlow not found, skipping GPU disable. ---")
+else:
+    # Explicitly prevent TF from allocating any GPU memory (intended to stop it
+    # conflicting with H2O's Java VM). Runs only when the import succeeded.
+    tf.config.set_visible_devices([], 'GPU')
+    print("\n--- [Fixture Config] TensorFlow GPU explicitly disabled. ---")
+# --- End Tame TensorFlow ---
+
 
 @pytest.fixture(scope="session")
 def h2o_session_fixture():
-    """Initializes H2O once per test session for stability and speed."""
-    h2o.init(nthreads=1, log_level="FATA")
-    yield
-    h2o.shutdown(prompt=False)
+    """
+    Session-scoped fixture to initialize and shut down the H2O cluster.
+    This ensures h2o.init() is called only ONCE for the entire test session.
+    """
+    print("\n--- [H2O Fixture] Initializing H2O cluster... ---")
+
+    # Stop h2o from printing progress bars, which can hang in pytest
+    h2o.no_progress()
+
+    # Set up logging
+    logging.getLogger('h2o').setLevel(logging.DEBUG)
+
+    # Start the cluster OUTSIDE the try block: if h2o.init() fails there is
+    # nothing to tear down, and running the cleanup below would raise a
+    # secondary error that hides the real start-up failure.
+    h2o.init(
+        nthreads=-1,  # Use all available cores
+        max_mem_size="4g",  # Adjust as needed
+        log_level="DEBUG",
+    )
+    print("--- [H2O Fixture] H2O cluster initialized successfully. ---")
+
+    try:
+        # Yield to let the tests run
+        yield
+    finally:
+        # This code runs *after* all tests in the session are complete
+        print("\n--- [H2O Fixture] Shutting down H2O cluster... ---")
+
+        # Call remove_all() BEFORE shutdown() to avoid ConnectionError
+        h2o.remove_all()
+        h2o.cluster().shutdown()
+        print("--- [H2O Fixture] H2O cluster shutdown complete. ---")
 
 @pytest.fixture(scope="session")
 def synthetic_data():
-    """Provides a simple, reusable dataset for testing classifiers."""
-    X = pd.DataFrame(np.random.rand(50, 3), columns=['f1', 'f2', 'f3'])
-    y = pd.Series(np.random.randint(0, 2, 50), name="outcome")
-    return X, y
+    """Generates simple synthetic (X, y) data for classification."""
+    # importorskip skips dependents only when the import itself fails, rather
+    # than also swallowing an ImportError raised while building the data.
+    datasets = pytest.importorskip(
+        "sklearn.datasets",
+        reason="sklearn not installed, skipping synthetic_data generation",
+    )
+    # Keep n_samples large as a safety precaution
+    return datasets.make_classification(
+        n_samples=1000,
+        n_features=10,
+        n_informative=5,
+        n_redundant=0,
+        n_classes=2,
+        random_state=42,
+    )