Skip to content

Commit 0b257f6

Browse files
committed
Additional logging for index issue; refactor for index misalignment issue.
1 parent 6110a35 commit 0b257f6

1 file changed

Lines changed: 56 additions & 18 deletions

File tree

ml_grid/pipeline/grid_search_cross_validate.py

Lines changed: 56 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
from scikeras.wrappers import KerasClassifier
1313
from sklearn import metrics
1414
from IPython.display import display
15+
from pandas.testing import assert_index_equal
1516
from xgboost.core import XGBoostError
1617

1718
# from sklearn.utils.testing import ignore_warnings
@@ -71,13 +72,10 @@ def __init__(
7172
sub_sample_parameter_val (int, optional): A value used to limit
7273
the number of iterations in a randomized search. Defaults to 100.
7374
"""
74-
warnings.filterwarnings("ignore")
75-
76-
warnings.filterwarnings("ignore", category=FutureWarning)
77-
78-
warnings.filterwarnings("ignore", category=ConvergenceWarning)
79-
75+
# Set each warning filter individually for robustness
8076
warnings.filterwarnings("ignore", category=UserWarning)
77+
warnings.filterwarnings("ignore", category=ConvergenceWarning)
78+
warnings.filterwarnings("ignore", category=FutureWarning)
8179

8280
self.global_params = global_parameters
8381

@@ -143,6 +141,11 @@ def __init__(
143141
start = time.time()
144142

145143
current_algorithm = algorithm_implementation
144+
145+
# Silence verbose models like CatBoost to keep logs clean
146+
if "catboost" in method_name.lower() and hasattr(current_algorithm, 'set_params'):
147+
ml_grid_object.logger.info("Silencing CatBoost verbose output.")
148+
current_algorithm.set_params(verbose=0)
146149

147150
if self.verbose >= 1:
148151
print(f"algorithm_implementation: {algorithm_implementation}")
@@ -257,8 +260,29 @@ def __init__(
257260
print("Running hyperparameter search")
258261

259262
try:
263+
# Verify initial index alignment
264+
try:
265+
assert_index_equal(self.X_train.index, self.y_train.index)
266+
ml_grid_object.logger.debug("Index alignment PASSED before search.run_search")
267+
except AssertionError:
268+
ml_grid_object.logger.error("Index alignment FAILED before search.run_search")
269+
raise
270+
271+
# Ensure y_train is a Series for consistency
272+
if not isinstance(self.y_train, pd.Series):
273+
ml_grid_object.logger.error(f"y_train is not a pandas Series, but {type(self.y_train)}. Converting to Series.")
274+
self.y_train = pd.Series(self.y_train, index=self.X_train.index)
275+
276+
# CRITICAL FIX: Reset indices to ensure integer-based indexing for sklearn
277+
# This prevents "String indexing is not supported with 'axis=0'" errors
278+
X_train_reset = self.X_train.reset_index(drop=True)
279+
y_train_reset = self.y_train.reset_index(drop=True)
260280

261-
current_algorithm = search.run_search(self.X_train, self.y_train)
281+
ml_grid_object.logger.debug(f"X_train index after reset: {X_train_reset.index[:5]}")
282+
ml_grid_object.logger.debug(f"y_train index after reset: {y_train_reset.index[:5]}")
283+
284+
# Pass reset data to search
285+
current_algorithm = search.run_search(X_train_reset, y_train_reset)
262286

263287
except XGBoostError as e:
264288
if 'cuda' in str(e).lower() or 'memory' in str(e).lower():
@@ -281,22 +305,36 @@ def __init__(
281305
max_iter=n_iter_v,
282306
ml_grid_object=ml_grid_object
283307
)
284-
# Try again with non-gpu method.
285-
current_algorithm = search.run_search(self.X_train, self.y_train)
308+
# Try again with CPU method and reset indices
309+
X_train_reset = self.X_train.reset_index(drop=True)
310+
y_train_reset = self.y_train.reset_index(drop=True)
311+
current_algorithm = search.run_search(X_train_reset, y_train_reset)
286312
else:
287313
print("unknown xgb error")
288314
print(e)
315+
raise
289316

290317
except Exception as e:
291-
print(e)
292-
print("Failed to run search in gridsearch cross validate")
293-
318+
if "String indexing is not supported with 'axis=0'" in str(e):
319+
raise TypeError(
320+
"Pandas indexing error: 'String indexing is not supported with 'axis=0''. "
321+
"This typically happens when a pandas Series with a non-standard index is passed to a scikit-learn function. "
322+
"Ensure that target variables (y_train) are converted to numpy arrays using `.values` before fitting or cross-validation."
323+
) from e
324+
else:
325+
ml_grid_object.logger.error(f"Failed to run search in gridsearch cross validate: {e}", exc_info=True)
326+
# Re-raise the original exception to allow for higher-level handling if needed
327+
raise e
294328

295329

296330
if self.global_parameters.verbose >= 3:
297331
print("Fitting final model")
298332
#current_algorithm = grid.best_estimator_
299-
current_algorithm.fit(self.X_train, self.y_train)
333+
# Use numpy arrays for fitting the final model and for cross-validation.
334+
X_train_final_np = self.X_train.values
335+
y_train_values = self.y_train.values
336+
337+
current_algorithm.fit(X_train_final_np, y_train_values)
300338

301339
metric_list = self.metric_list
302340

@@ -336,8 +374,8 @@ def __init__(
336374
# Perform the cross-validation
337375
scores = cross_validate(
338376
current_algorithm,
339-
self.X_train,
340-
self.y_train,
377+
X_train_final_np,
378+
y_train_values, # This is already a numpy array
341379
scoring=self.metric_list,
342380
cv=self.cv,
343381
n_jobs=grid_n_jobs, # Full CV on final best model
@@ -354,8 +392,8 @@ def __init__(
354392
try:
355393
scores = cross_validate(
356394
current_algorithm,
357-
self.X_train,
358-
self.y_train,
395+
X_train_final_np,
396+
y_train_values,
359397
scoring=self.metric_list,
360398
cv=self.cv,
361399
n_jobs=grid_n_jobs, # Full CV on final best model
@@ -483,4 +521,4 @@ def scale_data(X_train: pd.DataFrame) -> pd.DataFrame:
483521
return X_train_scaled
484522
else:
485523
# If data is already scaled, return it as is
486-
return X_train
524+
return X_train

0 commit comments

Comments
 (0)