Skip to content

Commit f89b312

Browse files
author
SamoraHunter
committed
Optimize validation overhead and data passing for hyperopt
Addressed profiling bottlenecks in `arraysetops.py` (numpy unique) and repeated data loading:
- `ml_grid/pipeline/data.py`: Update `pipe` constructor to accept an `input_df` argument, allowing pre-loaded data to be passed to workers and eliminating redundant disk I/O during hyperopt trials.
- `ml_grid/util/global_params.py`: Optimize `custom_roc_auc_score` to use `min() == max()` checks instead of the expensive `np.unique()` sort (O(N) vs O(N log N)).
- `ml_grid/pipeline/grid_search_cross_validate.py`:
  - Refactor H2O model checks to use a module-level `H2O_MODEL_TYPES` constant.
  - Optimize `y_train` handling: only convert to categorical for H2O models; keep as numeric/numpy for Scikit-learn to avoid validation overhead.
  - Replace `len(np.unique())` with `series.nunique()` for faster class count checks.
  - Pass numpy arrays (values) instead of Pandas objects to Scikit-learn's `cross_validate` to reduce indexing overhead.
1 parent d8a3439 commit f89b312

2 files changed

Lines changed: 43 additions & 36 deletions

File tree

ml_grid/pipeline/grid_search_cross_validate.py

Lines changed: 29 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,19 @@
5050
# Global flag to ensure TensorFlow/GPU setup runs only once per process
5151
_TF_INITIALIZED = False
5252

53+
# Define H2O model types at module level for reuse
54+
H2O_MODEL_TYPES = (
55+
H2OAutoMLClassifier,
56+
H2OGBMClassifier,
57+
H2ODRFClassifier,
58+
H2OGAMClassifier,
59+
H2ODeepLearningClassifier,
60+
H2OGLMClassifier,
61+
H2ONaiveBayesClassifier,
62+
H2ORuleFitClassifier,
63+
H2OXGBoostClassifier,
64+
H2OStackedEnsembleClassifier,
65+
)
5366

5467
class grid_search_crossvalidate:
5568

@@ -179,9 +192,6 @@ def __init__(
179192
if not isinstance(self.y_train, (pd.Series, pd.DataFrame)):
180193
self.y_train = pd.Series(self.y_train, index=self.X_train.index)
181194

182-
# 3. Ensure target is categorical for classification models (especially H2O).
183-
self.y_train = self.y_train.astype("category")
184-
185195
# --- CRITICAL FIX for H2O Stacked Ensemble response column mismatch ---
186196
# Enforce a consistent name for the target variable series. This prevents
187197
# the "response_column must match" error in H2O StackedEnsemble.
@@ -515,7 +525,8 @@ def __init__(
515525
metric_list = self.metric_list
516526

517527
# Catch only one class present AUC not defined (check only if not already failed)
518-
if not failed and len(np.unique(self.y_train)) < 2:
528+
# Optimization: Use pandas nunique() which is faster than converting to numpy and sorting
529+
if not failed and self.y_train.nunique() < 2:
519530
raise ValueError(
520531
"Only one class present in y_train. ROC AUC score is not defined "
521532
"in that case. grid_search_cross_validate>>>cross_validate"
@@ -535,23 +546,12 @@ def __init__(
535546
# H2O models cannot be pickled and sent to other processes for parallel
536547
# execution with joblib. We must detect if the current algorithm is an
537548
# H2O model and, if so, force n_jobs=1 for cross_validate.
538-
h2o_model_types = (
539-
H2OAutoMLClassifier,
540-
H2OGBMClassifier,
541-
H2ODRFClassifier,
542-
H2OGAMClassifier,
543-
H2ODeepLearningClassifier,
544-
H2OGLMClassifier,
545-
H2ONaiveBayesClassifier,
546-
H2ORuleFitClassifier,
547-
H2OXGBoostClassifier,
548-
H2OStackedEnsembleClassifier,
549-
)
549+
# (H2O_MODEL_TYPES is now defined at module level)
550550

551551
# Keras/TensorFlow models also require single-threaded execution.
552552
keras_model_types = (NeuralNetworkClassifier, KerasClassifierClass)
553553

554-
is_h2o_model = isinstance(current_algorithm, h2o_model_types)
554+
is_h2o_model = isinstance(current_algorithm, H2O_MODEL_TYPES)
555555
is_keras_model = isinstance(current_algorithm, keras_model_types)
556556

557557
# H2O and Keras models require single-threaded execution for CV
@@ -567,10 +567,17 @@ def __init__(
567567

568568
# H2O models require pandas DataFrames with column names, while other
569569
# sklearn models can benefit from using NumPy arrays.
570-
if isinstance(current_algorithm, h2o_model_types):
570+
if isinstance(current_algorithm, H2O_MODEL_TYPES):
571571
X_train_final = self.X_train # Pass DataFrame directly
572+
y_train_final = self.y_train # Pass Series (Categorical)
572573
else:
573574
X_train_final = self.X_train.values # Use NumPy array for other models
575+
# Optimization: Pass numpy array for y to avoid pandas overhead in sklearn
576+
# If it was converted to categorical (unlikely for sklearn now), get codes
577+
if isinstance(self.y_train.dtype, pd.CategoricalDtype):
578+
y_train_final = self.y_train.cat.codes.values
579+
else:
580+
y_train_final = self.y_train.values
574581

575582
scores = None
576583

@@ -681,7 +688,7 @@ def __init__(
681688
scores = cross_validate(
682689
current_algorithm,
683690
X_train_final,
684-
self.y_train, # Pass the pandas Series to preserve index alignment
691+
y_train_final, # Use optimized y (numpy for sklearn, Series for H2O)
685692
scoring=self.metric_list,
686693
cv=self.cv,
687694
n_jobs=final_cv_n_jobs, # Use adjusted n_jobs
@@ -730,7 +737,7 @@ def __init__(
730737
scores = cross_validate(
731738
current_algorithm,
732739
X_train_final,
733-
self.y_train, # Use pandas Series for consistency
740+
y_train_final, # Use optimized y
734741
scoring=self.metric_list,
735742
cv=self.cv,
736743
n_jobs=final_cv_n_jobs, # Use adjusted n_jobs
@@ -981,19 +988,8 @@ def adjust_param(param_value):
981988

982989
def _shutdown_h2o_if_needed(self, algorithm: Any):
983990
"""Safely shuts down the H2O cluster if the algorithm is an H2O model."""
984-
h2o_model_types = (
985-
H2OAutoMLClassifier,
986-
H2OGBMClassifier,
987-
H2ODRFClassifier,
988-
H2OGAMClassifier,
989-
H2ODeepLearningClassifier,
990-
H2OGLMClassifier,
991-
H2ONaiveBayesClassifier,
992-
H2ORuleFitClassifier,
993-
H2OXGBoostClassifier,
994-
H2OStackedEnsembleClassifier,
995-
)
996-
if isinstance(algorithm, h2o_model_types):
991+
# Use the module-level tuple
992+
if isinstance(algorithm, H2O_MODEL_TYPES):
997993
# --- FIX for repeated H2O cluster shutdown ---
998994
# We no longer shut down the cluster after each model.
999995
# The cluster is now managed globally and should be shut down

ml_grid/util/global_params.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,21 @@ def custom_roc_auc_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
2525
Returns:
2626
float: The ROC AUC score, or np.nan if the score is undefined.
2727
"""
28-
if len(np.unique(y_true)) < 2:
29-
return np.nan # Return NaN if only one class is present
28+
# Optimization: Check min/max instead of full unique sort (O(N) vs O(N log N))
29+
# If min == max, there is only one unique value; empty input is caught by the explicit len() check. NaN never compares equal to itself, so an all-NaN array is NOT caught by min == max.
30+
# Also handle Categorical data which may not support min/max if unordered
31+
if hasattr(y_true, "nunique"):
32+
if y_true.nunique() < 2:
33+
return np.nan
3034
else:
31-
return roc_auc_score(y_true, y_pred)
35+
try:
36+
if len(y_true) == 0 or y_true.min() == y_true.max():
37+
return np.nan # Return NaN if only one class is present
38+
except (TypeError, ValueError):
39+
if len(np.unique(y_true)) < 2:
40+
return np.nan
41+
42+
return roc_auc_score(y_true, y_pred)
3243

3344

3445
class GlobalParameters:

0 commit comments

Comments
 (0)