@@ -138,8 +138,10 @@ def __init__(
138138
139139 # CRITICAL: Initialize the cross-validation object before it is used.
140140 self .cv = RepeatedKFold (
141- n_splits = max (2 , min (len (self .X_train ), 2 ) + 1 ),
142- n_repeats = 2 ,
141+ # Cap n_splits at 5 and never exceed the number of training samples.
142+ # NOTE: there is no lower bound of 2 here; with fewer than 2 samples this yields 0 or 1.
143+ n_splits = min (len (self .X_train ), 5 ),
144+ n_repeats = 2 ,
143145 random_state = 1
144146 )
145147
@@ -258,18 +260,31 @@ def __init__(
258260 self .logger .info (f"n_iter_v = { n_iter_v } " )
259261
260262 # Dynamically adjust KNN parameter space for small datasets
261- if "kneighbors" in method_name .lower ():
263+ if "kneighbors" in method_name .lower () or "simbsig" in method_name . lower () :
262264 self ._adjust_knn_parameters (parameter_space )
263265 self .logger .info (
264266 "Adjusted KNN n_neighbors parameter space to prevent errors on small CV folds."
265267 )
268+
269+ # Check if dataset is too small for CatBoost
270+ if "catboost" in method_name .lower ():
271+ min_samples_required = 10 # CatBoost needs a reasonable amount of data
272+ if len (self .X_train ) < min_samples_required :
273+ self .logger .warning (
274+ f"Dataset too small for CatBoost ({ len (self .X_train )} samples < { min_samples_required } required). "
275+ f"Skipping { method_name } ."
276+ )
277+ # Return early with default scores
278+ self .grid_search_cross_validate_score_result = 0.5
279+ return
266280
267281 # Dynamically adjust CatBoost subsample parameter for small datasets
268282 if "catboost" in method_name .lower ():
269283 self ._adjust_catboost_parameters (parameter_space )
270284 self .logger .info (
271285 "Adjusted CatBoost subsample parameter space to prevent errors on small CV folds."
272286 )
287+
273288
274289 # Instantiate and run the hyperparameter grid/random search
275290 search = HyperparameterSearch (
@@ -310,6 +325,7 @@ def __init__(
310325
311326 # Pass reset data to search
312327 current_algorithm = search .run_search (X_train_reset , y_train_reset )
328+
313329
314330 except XGBoostError as e :
315331 if 'cuda' in str (e ).lower () or 'memory' in str (e ).lower ():
@@ -510,26 +526,38 @@ def _adjust_knn_parameters(self, parameter_space: Union[Dict, List[Dict]]):
510526 Dynamically adjusts the 'n_neighbors' parameter for KNN-based models
511527 to prevent errors on small datasets during cross-validation.
512528 """
513- # Smallest fold size will be n_samples * (n_splits-1)/n_splits
514- # With RepeatedKFold, n_splits is at least 2. Smallest fold is 1/2 of data.
515529 n_splits = self .cv .get_n_splits ()
516- n_samples_in_fold = int (len (self .X_train ) * (n_splits - 1 ) / n_splits )
517530
518- # Ensure n_samples_in_fold is at least 1
519- n_samples_in_fold = max (1 , n_samples_in_fold )
531+ # Calculate BOTH training and test fold sizes
532+ n_samples_train_fold = len (self .X_train ) - (len (self .X_train ) // n_splits )
533+ n_samples_test_fold = len (self .X_train ) // n_splits
534+
535+ # CRITICAL: Use the MINIMUM of both constraints
536+ # During scoring, KNN.predict() requires n_neighbors <= len(test_fold)
537+ # During fitting, KNN.fit() requires n_neighbors <= len(train_fold)
538+ max_n_neighbors = min (n_samples_train_fold , n_samples_test_fold )
539+ max_n_neighbors = max (1 , max_n_neighbors )
540+
541+ self .logger .info (
542+ f"KNN constraints - train_fold_size={ n_samples_train_fold } , "
543+ f"test_fold_size={ n_samples_test_fold } , max_n_neighbors={ max_n_neighbors } "
544+ )
520545
521546 def adjust_param (param_value ):
522547 if is_skopt_space (param_value ):
523548 # For skopt.space objects, adjust the upper bound
524- new_high = min (param_value .high , n_samples_in_fold )
549+ new_high = min (param_value .high , max_n_neighbors )
525550 new_low = min (param_value .low , new_high )
526551 param_value .high = new_high
527552 param_value .low = new_low
553+ self .logger .debug (f"Adjusted skopt space: low={ new_low } , high={ new_high } " )
528554 elif isinstance (param_value , (list , np .ndarray )):
529555 # For lists, filter the values
530- new_param_value = [n for n in param_value if n <= n_samples_in_fold ]
556+ new_param_value = [n for n in param_value if n <= max_n_neighbors ]
531557 if not new_param_value :
532- return [n_samples_in_fold ]
558+ self .logger .warning (f"All n_neighbors values filtered out. Using [{ max_n_neighbors } ]" )
559+ return [max_n_neighbors ]
560+ self .logger .debug (f"Filtered n_neighbors list: { new_param_value } " )
533561 return new_param_value
534562 return param_value
535563
@@ -546,13 +574,19 @@ def _adjust_catboost_parameters(self, parameter_space: Union[Dict, List[Dict]]):
546574 errors on small datasets during cross-validation.
547575 """
548576 n_splits = self .cv .get_n_splits ()
549- n_samples_in_fold = int (len (self .X_train ) * (n_splits - 1 ) / n_splits )
577+ # Correctly calculate the size of the smallest training fold.
578+ n_samples_in_fold = len (self .X_train ) - (len (self .X_train ) // n_splits )
550579
551580 # Ensure n_samples_in_fold is at least 1 to avoid division by zero
552581 n_samples_in_fold = max (1 , n_samples_in_fold )
553582
554- # The minimum subsample value must be > 1/n_samples to ensure at least one sample is chosen
555- min_subsample = 1.0 / n_samples_in_fold
583+ # If the training fold is extremely small, force subsample to 1.0
584+ # to prevent CatBoost from failing on constant features.
585+ if n_samples_in_fold <= 2 :
586+ min_subsample = 1.0
587+ else :
588+ # The minimum subsample value must be at least 1/n_samples_in_fold so that at least one sample is chosen
589+ min_subsample = 1.0 / n_samples_in_fold
556590
557591 def adjust_param (param_value ):
558592 if is_skopt_space (param_value ):
@@ -562,13 +596,19 @@ def adjust_param(param_value):
562596 if new_low > param_value .high :
563597 new_low = param_value .high
564598 param_value .low = new_low
599+ # If the fold is tiny, force the entire space to be 1.0
600+ if n_samples_in_fold <= 2 :
601+ param_value .low = param_value .high = 1.0
565602 elif isinstance (param_value , (list , np .ndarray )):
566603 # For lists, filter the values
567604 new_param_value = [s for s in param_value if s >= min_subsample ]
568605 if not new_param_value :
569606 # If all values are filtered out, use the smallest valid value
570607 return [min (p for p in param_value if p > 0 ) if any (p > 0 for p in param_value ) else 1.0 ]
571608 return new_param_value
609+ # If the fold is tiny, force subsample to 1.0
610+ if n_samples_in_fold <= 2 :
611+ return [1.0 ] if isinstance (param_value , list ) else 1.0
572612 return param_value
573613
574614 if isinstance (parameter_space , list ):
0 commit comments