Skip to content

Commit 6110a35

Browse files
committed
adjust cv to None, added more logging
1 parent cd7b5e9 commit 6110a35

1 file changed

Lines changed: 69 additions & 14 deletions

File tree

ml_grid/pipeline/hyperparameter_search.py

Lines changed: 69 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77
import tensorflow as tf
88
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid
9+
from pandas.testing import assert_index_equal
910
from sklearn.exceptions import ConvergenceWarning
1011
from skopt import BayesSearchCV
1112
from sklearn.base import is_classifier, BaseEstimator
@@ -56,6 +57,7 @@ def __init__(
5657
sub_sample_pct: int = 100,
5758
max_iter: int = 100,
5859
ml_grid_object: Any = None,
60+
cv: Any = None,
5961
):
6062
"""Initializes the HyperparameterSearch class.
6163
@@ -70,6 +72,8 @@ def __init__(
7072
Bayesian search. Defaults to 100.
7173
ml_grid_object (Any, optional): The main pipeline object containing data and
7274
other parameters. Defaults to None.
75+
cv (Any, optional): Cross-validation splitting strategy. Can be None, int,
76+
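or a CV splitter. Defaults to None, which the scikit-learn search classes treat as their default k-fold cross-validation (5-fold in current scikit-learn), not as "no cross-validation".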
7377
"""
7478
self.algorithm = algorithm
7579
self.parameter_space = parameter_space
@@ -78,6 +82,7 @@ def __init__(
7882
self.sub_sample_pct = sub_sample_pct
7983
self.max_iter = max_iter
8084
self.ml_grid_object = ml_grid_object
85+
self.cv = cv
8186

8287
if self.ml_grid_object is None:
8388
raise ValueError("ml_grid_object is required.")
@@ -105,6 +110,7 @@ def __init__(
105110
# Configure warnings
106111
warnings.filterwarnings("ignore", category=ConvergenceWarning)
107112
warnings.filterwarnings("ignore", category=UserWarning)
113+
warnings.filterwarnings("ignore", category=RuntimeWarning) # Suppress divide by zero warnings from NaiveBayes
108114

109115
# Configure GPUs if applicable
110116
if (
@@ -130,40 +136,82 @@ def run_search(self, X_train: pd.DataFrame, y_train: pd.Series) -> BaseEstimator
130136
on global parameters and runs the search on the provided training data.
131137
132138
Args:
133-
X_train (pd.DataFrame): Training features.
134-
y_train (pd.Series): Training labels.
139+
X_train (pd.DataFrame): Training features. The index is reset internally (drop=True) before fitting, so callers need not reset it.
140+
y_train (pd.Series): Training labels. The index is reset internally (drop=True) before fitting, so callers need not reset it.
135141
136142
Returns:
137143
BaseEstimator: The best estimator found during the search.
138144
"""
139145
random_search = self.global_params.random_grid_search
140146
grid_n_jobs = self.global_params.grid_n_jobs
141147
bayessearch = self.global_params.bayessearch
148+
verbose = getattr(self.global_params, 'verbose', 0) # Get verbosity level, default to 0
142149

143-
# Limit n_jobs for GPU-heavy methods to avoid memory issues
150+
# Limit n_jobs for GPU-heavy models or Bayesian search to avoid memory/parallelization issues
144151
gpu_heavy_models = (KNNWrapper, kerasClassifier_class)
145-
if bayessearch and isinstance(self.algorithm, gpu_heavy_models):
152+
if bayessearch or isinstance(self.algorithm, gpu_heavy_models):
153+
if verbose > 0:
154+
self.ml_grid_object.logger.info(
155+
"Using n_jobs=1 to avoid pandas indexing issues in parallel processing"
156+
)
146157
grid_n_jobs = 1
147158

159+
# Validate parameters - skip for Bayesian search as it uses different parameter format
148160
if not bayessearch:
149-
# Validate parameters
161+
# Grid and Random search use standard sklearn parameter format (lists/arrays)
150162
parameters = validate_parameters_helper(
151163
algorithm_implementation=self.algorithm,
152164
parameters=self.parameter_space,
153165
ml_grid_object=self.ml_grid_object
154166
)
155167
else:
168+
# Bayesian search uses skopt space objects (Integer, Real, Categorical)
169+
# These cannot go through standard validation
156170
parameters = self.parameter_space
157171

172+
# Reset index to ensure clean integer indexing for CV splits
173+
# Keep as pandas to retain feature names
174+
if hasattr(X_train, 'reset_index'):
175+
X_train_reset = X_train.reset_index(drop=True)
176+
if verbose > 1:
177+
self.ml_grid_object.logger.debug(
178+
f"Reset X_train index. Shape: {X_train_reset.shape}"
179+
)
180+
else:
181+
X_train_reset = X_train
182+
183+
if hasattr(y_train, 'reset_index'):
184+
y_train_reset = y_train.reset_index(drop=True)
185+
if verbose > 1:
186+
self.ml_grid_object.logger.debug(
187+
f"Reset y_train index. Shape: {y_train_reset.shape}"
188+
)
189+
else:
190+
y_train_reset = y_train
191+
192+
# Verify data integrity
193+
if len(X_train_reset) != len(y_train_reset):
194+
raise ValueError(
195+
f"Length mismatch: X={len(X_train_reset)}, y={len(y_train_reset)}"
196+
)
197+
198+
if verbose > 1:
199+
self.ml_grid_object.logger.debug(
200+
f"X_train type: {type(X_train_reset)}, shape: {X_train_reset.shape}"
201+
)
202+
self.ml_grid_object.logger.debug(
203+
f"y_train type: {type(y_train_reset)}, shape: {y_train_reset.shape}"
204+
)
205+
158206
if bayessearch:
159207
# Bayesian Optimization
160208
grid = BayesSearchCV(
161209
estimator=self.algorithm,
162210
search_spaces=parameters,
163211
n_iter=self.max_iter,
164-
cv=[(slice(None), slice(None))],
212+
cv=self.cv,
165213
n_jobs=grid_n_jobs,
166-
verbose=1,
214+
verbose=verbose,
167215
error_score="raise",
168216
)
169217

@@ -176,23 +224,30 @@ def run_search(self, X_train: pd.DataFrame, y_train: pd.Series) -> BaseEstimator
176224
grid = RandomizedSearchCV(
177225
self.algorithm,
178226
parameters,
179-
verbose=1,
180-
cv=[(slice(None), slice(None))],
227+
verbose=verbose,
228+
cv=self.cv,
181229
n_jobs=grid_n_jobs,
182230
n_iter=n_iter,
183231
error_score="raise",
184232
)
185233
else:
234+
# Grid Search
186235
grid = GridSearchCV(
187236
self.algorithm,
188237
parameters,
189-
verbose=1,
190-
cv=[(slice(None), slice(None))],
238+
verbose=verbose,
239+
cv=self.cv,
191240
n_jobs=grid_n_jobs,
192-
error_score=np.nan,
241+
error_score="raise",
193242
)
194243

195-
grid.fit(X_train, y_train)
244+
if verbose > 0:
245+
self.ml_grid_object.logger.info(
246+
f"Starting hyperparameter search with {len(X_train_reset)} samples"
247+
)
248+
249+
# Fit the grid search with pandas DataFrames/Series (retains feature names)
250+
grid.fit(X_train_reset, y_train_reset)
196251

197252
best_model = grid.best_estimator_
198-
return best_model
253+
return best_model

0 commit comments

Comments
 (0)