@@ -88,8 +88,8 @@ def __init__(self, estimator_class=None, **kwargs):
     def __del__(self):
         """Cleans up the shared checkpoint directory if this is the last instance."""
         # This is a best-effort cleanup. In multi-process scenarios,
-        # the directory might be in use by other processes.
-        if os.path.exists(self._checkpoint_dir) and not os.listdir(self._checkpoint_dir):
+        # the directory might be in use by other processes; hasattr guards against partial init.
+        if hasattr(self, '_checkpoint_dir') and os.path.exists(self._checkpoint_dir) and not os.listdir(self._checkpoint_dir):
             shutil.rmtree(self._checkpoint_dir, ignore_errors=True)
             logger.debug(f"Cleaned up empty shared checkpoint directory: {self._checkpoint_dir}")
 
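For reviewers wondering why the `hasattr` guard is needed: Python still invokes `__del__` on an instance whose `__init__` raised partway through, so attribute access in the destructor can fail with `AttributeError`. A minimal, self-contained sketch of the failure mode (the `Fragile` class is hypothetical, for illustration only):

```python
# If __init__ raises before setting an attribute, Python may still run
# __del__ on the half-built instance when it is garbage collected.
class Fragile:
    def __init__(self, fail=False):
        if fail:
            raise ValueError("init failed before _checkpoint_dir was set")
        self._checkpoint_dir = "/tmp/ckpt"

    def __del__(self):
        # Without this guard, instances whose __init__ failed early would
        # raise AttributeError here (surfacing as an "Exception ignored
        # in: __del__" warning).
        if hasattr(self, "_checkpoint_dir"):
            print(f"would clean up {self._checkpoint_dir}")

try:
    Fragile(fail=True)  # __init__ raises; __del__ still runs on the partial object
except ValueError:
    pass
```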
@@ -132,7 +132,7 @@ def _validate_input_data(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -
         Raises:
             ValueError: If data is invalid
         """
-        # Convert to DataFrame if needed
+        # Convert to DataFrame if needed and ensure columns are strings
         if not isinstance(X, pd.DataFrame):
             if self.feature_names_ is not None:
                 X = pd.DataFrame(X, columns=self.feature_names_)
@@ -142,8 +142,14 @@ def _validate_input_data(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -
                     f"Input data (X) has {X.shape[1]} columns, but expected {len(self.feature_names_)} "
                     f"based on training features. Please ensure column count matches."
                 )  # This was the syntax error fix
-            else:  # This else block should be aligned with the outer 'if self.feature_names_ is not None:'
+            else:
+                # If X is a numpy array, convert it to a DataFrame and ensure
+                # its columns are strings to prevent KeyErrors with H2O.
                 X = pd.DataFrame(X)
+                X.columns = X.columns.astype(str)
+        else:
+            # If it's already a DataFrame, still ensure columns are strings.
+            X.columns = X.columns.astype(str)
 
         # Reset index to avoid sklearn CV indexing issues
         # CRITICAL: If we reset X, we MUST also reset y to maintain alignment.
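The motivation for forcing string column names: H2OFrame column names are always strings, so a numpy-backed DataFrame with pandas' default integer labels can produce `KeyError`s when the wrapper later looks up feature names. A small sketch of the normalization (pandas/numpy only, no H2O cluster required):

```python
import numpy as np
import pandas as pd

X = np.random.rand(5, 3)
df = pd.DataFrame(X)
print(list(df.columns))  # [0, 1, 2] -- integer labels by default

# The same normalization the hunk above applies:
df.columns = df.columns.astype(str)
print(list(df.columns))  # ['0', '1', '2'] -- safe to pass around as H2O column names
```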
@@ -272,8 +278,9 @@ def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
         model_params.setdefault('ignore_const_cols', False)
 
         # --- ROBUSTNESS FIX: Save checkpoints for model recovery ---
-        # Unconditionally add checkpoint directory. All H2O estimators support this.
-        model_params["export_checkpoints_dir"] = self._checkpoint_dir
+        # Conditionally add the checkpoint directory; not all estimators (e.g., RuleFit) support it.
+        if 'export_checkpoints_dir' in estimator_params:
+            model_params["export_checkpoints_dir"] = self._checkpoint_dir
 
         return train_h2o, x_vars, outcome_var, model_params
 
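The guard assumes `estimator_params` holds the set of parameter names the chosen H2O estimator actually accepts. The diff does not show how that set is built; one plausible construction, sketched here purely as an assumption, is to introspect the estimator's constructor signature:

```python
import inspect

def supported_params(estimator_class):
    """Return the set of constructor parameter names for an estimator class."""
    sig = inspect.signature(estimator_class.__init__)
    return {name for name in sig.parameters if name not in ("self", "args", "kwargs")}

# Usage sketch (requires the h2o package; names below are real h2o classes):
# from h2o.estimators import H2ORuleFitEstimator
# estimator_params = supported_params(H2ORuleFitEstimator)
# if 'export_checkpoints_dir' in estimator_params:
#     model_params["export_checkpoints_dir"] = checkpoint_dir
```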
@@ -294,6 +301,15 @@ def _get_model_params(self) -> Dict[str, Any]:
             if key in valid_param_keys
         }
 
+        # --- FIX for H2OTypeError (e.g., max_depth, sample_rate, learn_rate) ---
+        # Scikit-learn's ParameterGrid/RandomizedSearchCV can pass single-element
+        # numpy arrays or lists; H2O expects native Python types (int, float), so convert.
+        for key, value in model_params.items():
+            if isinstance(value, np.ndarray) and value.size == 1:
+                model_params[key] = value.item()
+            elif isinstance(value, list) and len(value) == 1:
+                model_params[key] = value[0]
+
         return model_params
 
     def _handle_small_data_fallback(self, X: pd.DataFrame, y: pd.Series) -> bool:
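Context on the coercion: `RandomizedSearchCV` samples hyperparameters from numpy arrays and distributions, so a value like `max_depth` can arrive as a one-element ndarray or a numpy scalar, which H2O's strict type checks reject with `H2OTypeError`. A standalone sketch of the normalization (the `np.generic` branch is an extra hedge beyond what this diff adds, since numpy scalars are not ndarrays):

```python
import numpy as np

raw_params = {
    "max_depth": np.array([7]),      # 1-element ndarray from a sampled grid
    "learn_rate": np.float64(0.05),  # numpy scalar
    "ntrees": [200],                 # 1-element list
}

clean = {}
for key, value in raw_params.items():
    if isinstance(value, np.ndarray) and value.size == 1:
        clean[key] = value.item()          # ndarray -> native Python scalar
    elif isinstance(value, np.generic):
        clean[key] = value.item()          # numpy scalar -> native Python scalar
    elif isinstance(value, list) and len(value) == 1:
        clean[key] = value[0]              # unwrap single-element list
    else:
        clean[key] = value

print(clean)  # {'max_depth': 7, 'learn_rate': 0.05, 'ntrees': 200}
```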
@@ -575,31 +591,28 @@ def __sklearn_clone__(self):
         self.logger.debug(f"__sklearn_clone__ called: original instance {id(self)}, clone instance {id(cloned)}")
         return cloned  # Removing dead code
 
-    @classmethod
-    def _get_param_names(cls):
+    def _get_param_names(self):
         """Get parameter names for the estimator.
 
         This override is necessary because we use **kwargs in __init__.
+        It is an instance method so it can see parameters stored on self.
 
         CRITICAL: This should ONLY return parameter names, NOT fitted attribute names.
         """
-        init_signature = inspect.signature(cls.__init__)
+        init_signature = inspect.signature(self.__class__.__init__)
         init_params = [p.name for p in init_signature.parameters.values()
                        if p.name not in ('self', 'args', 'kwargs')]
 
-        # For instances, also include kwargs that were set
-        if not isinstance(cls, type):
-            extra_params = [
-                key for key in cls.__dict__
-                if not key.startswith('_')  # Exclude private attributes
-                and not key.endswith('_')  # CRITICAL: Exclude fitted attributes
-                and key not in init_params
-                and key not in ['estimator_class', 'logger']  # Exclude special attributes
-                and key not in ['model', 'model_', 'classes_', 'feature_names_', 'model_id']  # Exclude fitted
-            ]
-            return sorted(init_params + extra_params)
-
-        return sorted(init_params)
+        extra_params = [
+            key for key in self.__dict__
+            if not key.startswith('_')  # Exclude private attributes
+            and not (key.endswith('_') and key != 'lambda_')  # Exclude fitted attributes, but allow the H2O hyperparameter lambda_
+            and key not in init_params
+            and key not in ['estimator_class', 'logger']
+            and key not in ['model', 'model_', 'classes_', 'feature_names_', 'model_id']
+        ]
+
+        return sorted(init_params + extra_params)
 
     def set_params(self: "H2OBaseClassifier", **kwargs: Any) -> "H2OBaseClassifier":
         """Sets the parameters of this estimator.
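Why the override has to be an instance method: scikit-learn's default `_get_param_names` is a classmethod that reads the `__init__` signature, so parameters supplied through `**kwargs` are invisible to it, and `get_params`/`clone` would silently drop them. (The `lambda_` exception exists because H2O's GLM names its regularization strength `lambda_`, as `lambda` is a reserved word, and that trailing underscore collides with sklearn's fitted-attribute naming convention.) A simplified stand-in, not the real wrapper, showing the round-trip:

```python
import inspect

class KwargsEstimator:
    def __init__(self, estimator_class=None, **kwargs):
        self.estimator_class = estimator_class
        for key, value in kwargs.items():
            setattr(self, key, value)  # e.g. ntrees=50 becomes self.ntrees

    def _get_param_names(self):
        # Explicit __init__ parameters...
        sig = inspect.signature(self.__class__.__init__)
        init_params = [p.name for p in sig.parameters.values()
                       if p.name not in ("self", "args", "kwargs")]
        # ...plus kwargs that were set on the instance, excluding
        # private and fitted (trailing-underscore) attributes.
        extra = [k for k in self.__dict__
                 if not k.startswith("_") and not k.endswith("_")
                 and k not in init_params and k != "estimator_class"]
        return sorted(init_params + extra)

    def get_params(self, deep=True):
        return {name: getattr(self, name, None) for name in self._get_param_names()}

est = KwargsEstimator(ntrees=50, max_depth=3)
print(est.get_params())  # {'estimator_class': None, 'max_depth': 3, 'ntrees': 50}
```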