Skip to content

Commit a3e3b64

Browse files
committed
Fall back for CatBoost n-sample problem; adjust KNN parameters.
1 parent e026d83 commit a3e3b64

File tree

1 file changed

+54
-14
lines changed

ml_grid/pipeline/grid_search_cross_validate.py

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,10 @@ def __init__(
138138

139139
# CRITICAL: Initialize the cross-validation object before it is used.
140140
self.cv = RepeatedKFold(
141-
n_splits=max(2, min(len(self.X_train), 2) + 1),
142-
n_repeats=2,
141+
# Ensure n_splits is at least 2 but not more than the number of samples.
142+
# This prevents errors with very small datasets.
143+
n_splits=min(len(self.X_train), 5),
144+
n_repeats=2,
143145
random_state=1
144146
)
145147

@@ -258,18 +260,31 @@ def __init__(
258260
self.logger.info(f"n_iter_v = {n_iter_v}")
259261

260262
# Dynamically adjust KNN parameter space for small datasets
261-
if "kneighbors" in method_name.lower():
263+
if "kneighbors" in method_name.lower() or "simbsig" in method_name.lower():
262264
self._adjust_knn_parameters(parameter_space)
263265
self.logger.info(
264266
"Adjusted KNN n_neighbors parameter space to prevent errors on small CV folds."
265267
)
268+
269+
# Check if dataset is too small for CatBoost
270+
if "catboost" in method_name.lower():
271+
min_samples_required = 10 # CatBoost needs a reasonable amount of data
272+
if len(self.X_train) < min_samples_required:
273+
self.logger.warning(
274+
f"Dataset too small for CatBoost ({len(self.X_train)} samples < {min_samples_required} required). "
275+
f"Skipping {method_name}."
276+
)
277+
# Return early with default scores
278+
self.grid_search_cross_validate_score_result = 0.5
279+
return
266280

267281
# Dynamically adjust CatBoost subsample parameter for small datasets
268282
if "catboost" in method_name.lower():
269283
self._adjust_catboost_parameters(parameter_space)
270284
self.logger.info(
271285
"Adjusted CatBoost subsample parameter space to prevent errors on small CV folds."
272286
)
287+
273288

274289
# Instantiate and run the hyperparameter grid/random search
275290
search = HyperparameterSearch(
@@ -310,6 +325,7 @@ def __init__(
310325

311326
# Pass reset data to search
312327
current_algorithm = search.run_search(X_train_reset, y_train_reset)
328+
313329

314330
except XGBoostError as e:
315331
if 'cuda' in str(e).lower() or 'memory' in str(e).lower():
@@ -510,26 +526,38 @@ def _adjust_knn_parameters(self, parameter_space: Union[Dict, List[Dict]]):
510526
Dynamically adjusts the 'n_neighbors' parameter for KNN-based models
511527
to prevent errors on small datasets during cross-validation.
512528
"""
513-
# Smallest fold size will be n_samples * (n_splits-1)/n_splits
514-
# With RepeatedKFold, n_splits is at least 2. Smallest fold is 1/2 of data.
515529
n_splits = self.cv.get_n_splits()
516-
n_samples_in_fold = int(len(self.X_train) * (n_splits - 1) / n_splits)
517530

518-
# Ensure n_samples_in_fold is at least 1
519-
n_samples_in_fold = max(1, n_samples_in_fold)
531+
# Calculate BOTH training and test fold sizes
532+
n_samples_train_fold = len(self.X_train) - (len(self.X_train) // n_splits)
533+
n_samples_test_fold = len(self.X_train) // n_splits
534+
535+
# CRITICAL: Use the MINIMUM of both constraints
536+
# During scoring, KNN.predict() requires n_neighbors <= len(test_fold)
537+
# During fitting, KNN.fit() requires n_neighbors <= len(train_fold)
538+
max_n_neighbors = min(n_samples_train_fold, n_samples_test_fold)
539+
max_n_neighbors = max(1, max_n_neighbors)
540+
541+
self.logger.info(
542+
f"KNN constraints - train_fold_size={n_samples_train_fold}, "
543+
f"test_fold_size={n_samples_test_fold}, max_n_neighbors={max_n_neighbors}"
544+
)
520545

521546
def adjust_param(param_value):
522547
if is_skopt_space(param_value):
523548
# For skopt.space objects, adjust the upper bound
524-
new_high = min(param_value.high, n_samples_in_fold)
549+
new_high = min(param_value.high, max_n_neighbors)
525550
new_low = min(param_value.low, new_high)
526551
param_value.high = new_high
527552
param_value.low = new_low
553+
self.logger.debug(f"Adjusted skopt space: low={new_low}, high={new_high}")
528554
elif isinstance(param_value, (list, np.ndarray)):
529555
# For lists, filter the values
530-
new_param_value = [n for n in param_value if n <= n_samples_in_fold]
556+
new_param_value = [n for n in param_value if n <= max_n_neighbors]
531557
if not new_param_value:
532-
return [n_samples_in_fold]
558+
self.logger.warning(f"All n_neighbors values filtered out. Using [{max_n_neighbors}]")
559+
return [max_n_neighbors]
560+
self.logger.debug(f"Filtered n_neighbors list: {new_param_value}")
533561
return new_param_value
534562
return param_value
535563

@@ -546,13 +574,19 @@ def _adjust_catboost_parameters(self, parameter_space: Union[Dict, List[Dict]]):
546574
errors on small datasets during cross-validation.
547575
"""
548576
n_splits = self.cv.get_n_splits()
549-
n_samples_in_fold = int(len(self.X_train) * (n_splits - 1) / n_splits)
577+
# Correctly calculate the size of the smallest training fold.
578+
n_samples_in_fold = len(self.X_train) - (len(self.X_train) // n_splits)
550579

551580
# Ensure n_samples_in_fold is at least 1 to avoid division by zero
552581
n_samples_in_fold = max(1, n_samples_in_fold)
553582

554-
# The minimum subsample value must be > 1/n_samples to ensure at least one sample is chosen
555-
min_subsample = 1.0 / n_samples_in_fold
583+
# If the training fold is extremely small, force subsample to 1.0
584+
# to prevent CatBoost from failing on constant features.
585+
if n_samples_in_fold <= 2:
586+
min_subsample = 1.0
587+
else:
588+
# The minimum subsample value must be > 1/n_samples to ensure at least one sample is chosen
589+
min_subsample = 1.0 / n_samples_in_fold
556590

557591
def adjust_param(param_value):
558592
if is_skopt_space(param_value):
@@ -562,13 +596,19 @@ def adjust_param(param_value):
562596
if new_low > param_value.high:
563597
new_low = param_value.high
564598
param_value.low = new_low
599+
# If the fold is tiny, force the entire space to be 1.0
600+
if n_samples_in_fold <= 2:
601+
param_value.low = param_value.high = 1.0
565602
elif isinstance(param_value, (list, np.ndarray)):
566603
# For lists, filter the values
567604
new_param_value = [s for s in param_value if s >= min_subsample]
568605
if not new_param_value:
569606
# If all values are filtered out, use the smallest valid value
570607
return [min(p for p in param_value if p > 0) if any(p > 0 for p in param_value) else 1.0]
571608
return new_param_value
609+
# If the fold is tiny, force subsample to 1.0
610+
if n_samples_in_fold <= 2:
611+
return [1.0] if isinstance(param_value, list) else 1.0
572612
return param_value
573613

574614
if isinstance(parameter_space, list):

0 commit comments

Comments (0)