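Minor changes: stricter H2O GAM knot validation; replace eval() model lookup with explicit MODEL_CLASS_MAP; update tests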

SamoraHunter · SamoraHunter · commit 0b47bbf392fa · 2025-11-09T21:25:15.000Z
diff --git a/ml_grid/model_classes/H2OGAMClassifier.py b/ml_grid/model_classes/H2OGAMClassifier.py
@@ -143,46 +143,30 @@ def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
                 model_params["num_knots"] = num_knots_list
 
             for i, col in enumerate(gam_columns):
+                # --- FIX: Ensure column exists before trying to access it ---
                 if col not in X.columns:
                     self.logger.warning(
                         f"GAM column '{col}' not found in input data X. Skipping."
                     )
                     continue
 
+                # --- FIX: Validate knot count against unique values in the data ---
                 n_unique = X[col].nunique()
                 required_knots = num_knots_list[i]
 
-                # H2O's backend requires num_knots < n_unique.
-                if n_unique <= required_knots:
+                # --- ROBUSTNESS FIX for java.lang.AssertionError in H2O quantile calculation ---
+                # The quantile calculation can fail on sparse data or data with low cardinality.
+                # Enforce a stricter requirement: the number of unique values must be at least
+                # double the number of knots. This provides a safer margin for the algorithm.
+                if n_unique < (required_knots * 2):
                     if not self._suppress_low_cardinality_error:
                         raise ValueError(
-                            f"Number of knots ({required_knots}) must be at least one less than the number of unique values ({n_unique}) for feature '{col}'."
+                            f"Feature '{col}' has {n_unique} unique values, which is insufficient "
+                            f"for the requested {required_knots} knots. At least {required_knots * 2} unique values are required."
                         )
                     self.logger.warning(
                         f"Excluding GAM column '{col}': {n_unique} unique values "
-                        f"insufficient for {required_knots} knots (require >= {required_knots + 1})."
-                    )
-                    continue
-
-                # Pre-check for well-defined knots
-                try:
-                    quantiles = np.linspace(0, 1, required_knots)
-                    knot_values = X[col].quantile(quantiles)
-                    # Check for enough unique values AND that they are monotonically increasing
-                    # The diff() will be > 0 for all elements in a strictly increasing series.
-                    are_knots_valid = (knot_values.nunique() >= required_knots) and (
-                        np.all(np.diff(knot_values.to_numpy()) > 0)
-                    )
-
-                    if not are_knots_valid:
-                        self.logger.warning(
-                            f"Excluding GAM column '{col}': Not enough unique values to generate distinct, "
-                            f"monotonically increasing knots."
-                        )
-                        continue
-                except Exception as e:
-                    self.logger.warning(
-                        f"Excluding GAM column '{col}' due to an error during knot pre-check: {e}"
+                        f"is insufficient for {required_knots} knots (requires at least {required_knots * 2}). Skipping."
                     )
                     continue
 
diff --git a/ml_grid/pipeline/model_class_list.py b/ml_grid/pipeline/model_class_list.py
@@ -40,15 +40,60 @@
 from ml_grid.model_classes.light_gbm_class import LightGBMClassifierWrapper
 from ml_grid.model_classes.logistic_regression_class import LogisticRegressionClass
 from ml_grid.model_classes.mlp_classifier_class import MLPClassifierClass as MLPClassifierClass
+from ml_grid.model_classes.NeuralNetworkClassifier_class import (
+    NeuralNetworkClassifier_class,
+)
+
 from ml_grid.model_classes.quadratic_discriminant_class import (
     QuadraticDiscriminantAnalysisClass,
 )
 from ml_grid.model_classes.randomforest_classifier_class import (
     RandomForestClassifierClass,
 )
+from ml_grid.model_classes.svc_class import SVCClass
 from ml_grid.model_classes.xgb_classifier_class import XGBClassifierClass
 
 
+# --- ROBUST MAPPING of config names to class objects ---
+# This dictionary provides a direct, secure, and explicit mapping from the
+# string names used in the YAML config files to the actual imported Python classes.
+# This avoids the use of `eval()` and makes the code easier to maintain.
+MODEL_CLASS_MAP = {
+    # Scikit-learn and similar
+    "LogisticRegression": LogisticRegressionClass,
+    "LogisticRegressionClass": LogisticRegressionClass,
+    "RandomForestClassifier": RandomForestClassifierClass,
+    "RandomForestClassifierClass": RandomForestClassifierClass,
+    "XGB_class": XGBClassifierClass,
+    "XGBClassifierClass": XGBClassifierClass,
+    "AdaBoostClassifierClass": AdaBoostClassifierClass,
+    "CatBoostClassifierClass": CatBoostClassifierClass,
+    "GaussianNBClassifierClass": GaussianNBClassifierClass,
+    "GradientBoostingClassifierClass": GradientBoostingClassifierClass,
+    "KNeighborsClassifierClass": KNeighborsClassifierClass,
+    "LightGBMClassifierWrapper": LightGBMClassifierWrapper,
+    "MLPClassifierClass": MLPClassifierClass,
+    "QuadraticDiscriminantAnalysisClass": QuadraticDiscriminantAnalysisClass,
+    "SVCClass": SVCClass,
+    "NeuralNetworkClassifier_class": NeuralNetworkClassifier_class, # Corrected mapping
+    # GPU specific
+    "KerasClassifierClass": KerasClassifierClass,
+    "KNNGpuWrapperClass": KNNGpuWrapperClass,
+    # H2O Models
+    "H2O_class": H2OAutoMLClass,  # Alias for AutoML
+    "H2OAutoMLClass": H2OAutoMLClass,
+    "H2O_GBM_class": H2O_GBM_class,
+    "H2O_DRF_class": H2O_DRF_class,
+    "H2O_DeepLearning_class": H2O_DeepLearning_class,
+    "H2O_GLM_class": H2O_GLM_class,
+    "H2O_NaiveBayes_class": H2O_NaiveBayes_class,
+    "H2O_RuleFit_class": H2O_RuleFit_class,
+    "H2O_XGBoost_class": H2O_XGBoost_class,
+    "H2O_StackedEnsemble_class": H2O_StackedEnsemble_class,
+    "H2O_GAM_class": H2O_GAM_class,
+}
+
+
 def get_model_class_list(ml_grid_object: pipe) -> List[Any]:
     """Generates a list of instantiated model classes based on the configuration.
 
@@ -153,17 +198,16 @@ def get_model_class_list(ml_grid_object: pipe) -> List[Any]:
                     f"Skipping '{class_name}' because it requires a GPU, but no CUDA-enabled GPU is available."
                 )
                 continue
-            # Try the exact name first, then try with '_class' appended for convenience
-            try:
-                model_class = eval(class_name)
-            except NameError:
-                class_name_with_suffix = f"{class_name}_class"
-                try:
-                    model_class = eval(class_name_with_suffix)
-                except NameError:
-                    raise NameError(
-                        f"Could not find model class '{class_name}' or '{class_name_with_suffix}'. Please check the name and ensure it's imported."
-                    )
+            
+            # Look up the class in our explicit mapping dictionary
+            model_class = MODEL_CLASS_MAP.get(class_name)
+            
+            if model_class is None:
+                raise KeyError(
+                    f"Could not find model class '{class_name}' in MODEL_CLASS_MAP. "
+                    f"Please check the model name in your configuration and ensure it is imported and mapped in model_class_list.py."
+                )
+
             # Pass X and y to constructors that accept them (like H2OStackedEnsemble)
             init_signature = inspect.signature(model_class.__init__)
             init_params = {}
diff --git a/ml_grid/pipeline/test_data_pipeline.py b/ml_grid/pipeline/test_data_pipeline.py
@@ -85,7 +85,7 @@ def setUp(self):
             },
         }
         self.drop_term_list = ["chrom", "hfe", "phlebo"]
-        self.model_class_dict = {"LogisticRegression_class": True}
+        self.model_class_dict = {"LogisticRegressionClass": True}
 
     def tearDown(self):
         """Clean up the temporary directory after each test."""
diff --git a/tests/test_h2o_classifiers.py b/tests/test_h2o_classifiers.py
@@ -195,10 +195,9 @@ def test_h2o_gam_knot_cardinality_error(h2o_session_fixture):
     estimator = H2O_GAM_class(X=X, y=y, parameter_space_size="small").algorithm_implementation
 
     # Set parameters that will cause the error: 5 knots for a feature with 2
-    # unique values. # noqa: E501
+    # unique values.
     # Also, we must disable the wrapper's internal error handling that
-    # suppresses this
-    # specific error, so that cross_val_score can raise it as intended.
+    # suppresses this specific error, so that cross_val_score can raise it as intended.
     estimator.set_params(
         gam_columns=['feature2'],
         num_knots=5,
@@ -210,9 +209,10 @@ def test_h2o_gam_knot_cardinality_error(h2o_session_fixture):
     cv = KFold(n_splits=2, shuffle=True, random_state=42)
 
     # We expect cross_val_score to fail and raise our specific ValueError
+    # Updated regex to match the actual error message from the code
     with pytest.raises(
         ValueError,
-        match=r"Number of knots .* must be at least one less than the number of unique values",
+        match=r"Feature .* has \d+ unique values, which is insufficient for the requested \d+ knots\. At least \d+ unique values are required\.",
     ):
         # The error_score='raise' is crucial for pytest.raises to catch the exception
         cross_val_score(estimator, X, y, cv=cv, error_score='raise', n_jobs=1)

Original file line number	Diff line number	Diff line change
`@@ -85,7 +85,7 @@ def setUp(self):`
`85`	`85`	`},`
`86`	`86`	`}`
`87`	`87`	`self.drop_term_list = ["chrom", "hfe", "phlebo"]`
`88`		`- self.model_class_dict = {"LogisticRegression_class": True}`
	`88`	`+ self.model_class_dict = {"LogisticRegressionClass": True}`
`89`	`89`
`90`	`90`	`def tearDown(self):`
`91`	`91`	`"""Clean up the temporary directory after each test."""`