66import pandas as pd
77import tensorflow as tf
88from sklearn .model_selection import GridSearchCV , RandomizedSearchCV , ParameterGrid
9+ from pandas .testing import assert_index_equal
910from sklearn .exceptions import ConvergenceWarning
1011from skopt import BayesSearchCV
1112from sklearn .base import is_classifier , BaseEstimator
@@ -56,6 +57,7 @@ def __init__(
5657 sub_sample_pct : int = 100 ,
5758 max_iter : int = 100 ,
5859 ml_grid_object : Any = None ,
60+ cv : Any = None ,
5961 ):
6062 """Initializes the HyperparameterSearch class.
6163
@@ -70,6 +72,8 @@ def __init__(
7072 Bayesian search. Defaults to 100.
7173 ml_grid_object (Any, optional): The main pipeline object containing data and
7274 other parameters. Defaults to None.
75+ cv (Any, optional): Cross-validation splitting strategy. Can be None, int,
76+ or a CV splitter. Defaults to None (the search's default K-fold splitting is used).
7377 """
7478 self .algorithm = algorithm
7579 self .parameter_space = parameter_space
@@ -78,6 +82,7 @@ def __init__(
7882 self .sub_sample_pct = sub_sample_pct
7983 self .max_iter = max_iter
8084 self .ml_grid_object = ml_grid_object
85+ self .cv = cv
8186
8287 if self .ml_grid_object is None :
8388 raise ValueError ("ml_grid_object is required." )
@@ -105,6 +110,7 @@ def __init__(
105110 # Configure warnings
106111 warnings .filterwarnings ("ignore" , category = ConvergenceWarning )
107112 warnings .filterwarnings ("ignore" , category = UserWarning )
113+ warnings .filterwarnings ("ignore" , category = RuntimeWarning ) # Suppress divide by zero warnings from NaiveBayes
108114
109115 # Configure GPUs if applicable
110116 if (
@@ -130,40 +136,82 @@ def run_search(self, X_train: pd.DataFrame, y_train: pd.Series) -> BaseEstimator
130136 on global parameters and runs the search on the provided training data.
131137
132138 Args:
133- X_train (pd.DataFrame): Training features.
134- y_train (pd.Series): Training labels.
139+ X_train (pd.DataFrame): Training features; the index is reset before fitting.
140+ y_train (pd.Series): Training labels; the index is reset before fitting.
135141
136142 Returns:
137143 BaseEstimator: The best estimator found during the search.
138144 """
139145 random_search = self .global_params .random_grid_search
140146 grid_n_jobs = self .global_params .grid_n_jobs
141147 bayessearch = self .global_params .bayessearch
148+ verbose = getattr (self .global_params , 'verbose' , 0 ) # Get verbosity level, default to 0
142149
143- # Limit n_jobs for GPU-heavy methods to avoid memory issues
150+ # Limit n_jobs for GPU-heavy models or Bayesian search to avoid memory/parallelization issues
144151 gpu_heavy_models = (KNNWrapper , kerasClassifier_class )
145- if bayessearch and isinstance (self .algorithm , gpu_heavy_models ):
152+ if bayessearch or isinstance (self .algorithm , gpu_heavy_models ):
153+ if verbose > 0 :
154+ self .ml_grid_object .logger .info (
155+ "Using n_jobs=1 to avoid pandas indexing issues in parallel processing"
156+ )
146157 grid_n_jobs = 1
147158
159+ # Validate parameters - skip for Bayesian search as it uses different parameter format
148160 if not bayessearch :
149- # Validate parameters
161+ # Grid and Random search use standard sklearn parameter format (lists/arrays)
150162 parameters = validate_parameters_helper (
151163 algorithm_implementation = self .algorithm ,
152164 parameters = self .parameter_space ,
153165 ml_grid_object = self .ml_grid_object
154166 )
155167 else :
168+ # Bayesian search uses skopt space objects (Integer, Real, Categorical)
169+ # These cannot go through standard validation
156170 parameters = self .parameter_space
157171
172+ # Reset index to ensure clean integer indexing for CV splits
173+ # Keep as pandas to retain feature names
174+ if hasattr (X_train , 'reset_index' ):
175+ X_train_reset = X_train .reset_index (drop = True )
176+ if verbose > 1 :
177+ self .ml_grid_object .logger .debug (
178+ f"Reset X_train index. Shape: { X_train_reset .shape } "
179+ )
180+ else :
181+ X_train_reset = X_train
182+
183+ if hasattr (y_train , 'reset_index' ):
184+ y_train_reset = y_train .reset_index (drop = True )
185+ if verbose > 1 :
186+ self .ml_grid_object .logger .debug (
187+ f"Reset y_train index. Shape: { y_train_reset .shape } "
188+ )
189+ else :
190+ y_train_reset = y_train
191+
192+ # Verify data integrity
193+ if len (X_train_reset ) != len (y_train_reset ):
194+ raise ValueError (
195+ f"Length mismatch: X={ len (X_train_reset )} , y={ len (y_train_reset )} "
196+ )
197+
198+ if verbose > 1 :
199+ self .ml_grid_object .logger .debug (
200+ f"X_train type: { type (X_train_reset )} , shape: { X_train_reset .shape } "
201+ )
202+ self .ml_grid_object .logger .debug (
203+ f"y_train type: { type (y_train_reset )} , shape: { y_train_reset .shape } "
204+ )
205+
158206 if bayessearch :
159207 # Bayesian Optimization
160208 grid = BayesSearchCV (
161209 estimator = self .algorithm ,
162210 search_spaces = parameters ,
163211 n_iter = self .max_iter ,
164- cv = [( slice ( None ), slice ( None ))] ,
212+ cv = self . cv ,
165213 n_jobs = grid_n_jobs ,
166- verbose = 1 ,
214+ verbose = verbose ,
167215 error_score = "raise" ,
168216 )
169217
@@ -176,23 +224,30 @@ def run_search(self, X_train: pd.DataFrame, y_train: pd.Series) -> BaseEstimator
176224 grid = RandomizedSearchCV (
177225 self .algorithm ,
178226 parameters ,
179- verbose = 1 ,
180- cv = [( slice ( None ), slice ( None ))] ,
227+ verbose = verbose ,
228+ cv = self . cv ,
181229 n_jobs = grid_n_jobs ,
182230 n_iter = n_iter ,
183231 error_score = "raise" ,
184232 )
185233 else :
234+ # Grid Search
186235 grid = GridSearchCV (
187236 self .algorithm ,
188237 parameters ,
189- verbose = 1 ,
190- cv = [( slice ( None ), slice ( None ))] ,
238+ verbose = verbose ,
239+ cv = self . cv ,
191240 n_jobs = grid_n_jobs ,
192- error_score = np . nan ,
241+ error_score = "raise" ,
193242 )
194243
195- grid .fit (X_train , y_train )
244+ if verbose > 0 :
245+ self .ml_grid_object .logger .info (
246+ f"Starting hyperparameter search with { len (X_train_reset )} samples"
247+ )
248+
249+ # Fit the grid search with pandas DataFrames/Series (retains feature names)
250+ grid .fit (X_train_reset , y_train_reset )
196251
197252 best_model = grid .best_estimator_
198- return best_model
253+ return best_model
0 commit comments