Skip to content

Commit f4d0ef0

Browse files
author
SamoraHunter
committed
Optimize metric calculation and CV data types
- Refactor `project_score_save.py` to use `precision_recall_fscore_support` for single-pass metric calculation, significantly reducing overhead compared to individual score calls.
- Add `support` metric logging with fallback calculation for binary classification.
- Optimize `grid_search_cross_validate.py` by converting target variables to integer numpy arrays before cross-validation, bypassing expensive sklearn `type_of_target` checks on Pandas Series.
- Ensure H2O progress bar is explicitly disabled before cross-validation to prevent polling overhead.

Add support metric to visualization pipeline

- Update `plot_distributions.py` to include 'support' in the default metric distribution plots.
- Update `plot_master.py` to generate distribution plots for 'support' and analyze pipeline parameter impact on support values.
- Enable deeper analysis of how data processing steps affect class support in the final test sets.
1 parent 0711809 commit f4d0ef0

5 files changed

Lines changed: 212 additions & 95 deletions

File tree

ml_grid/model_classes/H2OBaseClassifier.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010
import pandas as pd
1111
from sklearn.base import BaseEstimator, ClassifierMixin
12+
import sklearn
1213
from sklearn.utils.validation import check_is_fitted
1314

1415
from ml_grid.util.global_params import global_parameters
@@ -191,7 +192,7 @@ def _validate_input_data(
191192
Raises:
192193
ValueError: If data is invalid
193194
"""
194-
# Convert to DataFrame if needed and ensure columns are strings
195+
# Ensure X is a DataFrame and columns are strings (H2O requirement).
195196
if not isinstance(X, pd.DataFrame):
196197
if self.feature_names_ is not None:
197198
X = pd.DataFrame(X, columns=self.feature_names_)
@@ -207,8 +208,14 @@ def _validate_input_data(
207208
X = pd.DataFrame(X)
208209
X.columns = [str(c) for c in X.columns]
209210
else:
210-
# If it's already a DataFrame, still ensure columns are strings.
211-
X.columns = X.columns.astype(str)
211+
# If it's already a DataFrame, ensure columns are strings.
212+
# This is necessary for H2O even if validation is skipped.
213+
if any(not isinstance(c, str) for c in X.columns):
214+
X.columns = X.columns.astype(str)
215+
216+
# --- OPTIMIZATION: Skip validation if configured (saves ~99s in profile) ---
217+
if sklearn.get_config().get("skip_parameter_validation"):
218+
return X, y
212219

213220
# Reset index to avoid sklearn CV indexing issues
214221
# CRITICAL: If we reset X, we MUST also reset y to maintain alignment.
@@ -243,12 +250,15 @@ def _validate_input_data(
243250
)
244251

245252
# Get unique classes
253+
# --- OPTIMIZATION: Use pd.unique which is faster than np.unique ---
246254
if isinstance(y, pd.Series):
247255
unique_classes = y.unique()
248256
elif isinstance(y, pd.Categorical):
249257
unique_classes = y.categories
250258
else:
251-
unique_classes = np.unique(y[~pd.isna(y)])
259+
# Fallback for numpy arrays
260+
unique_classes = pd.unique(y)
261+
# Note: pd.unique includes NaNs, but we checked for NaNs above
252262

253263
if len(unique_classes) < 2:
254264
raise ValueError(

ml_grid/pipeline/grid_search_cross_validate.py

Lines changed: 168 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import torch
1111
from IPython.display import clear_output
1212
from scikeras.wrappers import KerasClassifier
13+
import sklearn
1314
from sklearn import metrics
1415
from pandas.testing import assert_index_equal
1516
from xgboost.core import XGBoostError
@@ -137,11 +138,33 @@ def __init__(
137138
or "neural" in method_name.lower()
138139
)
139140

141+
is_h2o_model = isinstance(algorithm_implementation, H2O_MODEL_TYPES)
142+
140143
global _TF_INITIALIZED
141-
if is_gpu_model:
144+
if is_gpu_model or is_h2o_model:
142145
grid_n_jobs = 1
146+
147+
# --- OPTIMIZATION: Disable H2O Progress Bar ---
148+
# This saves significant time (~95s) spent in progressbar updates
149+
if is_h2o_model:
150+
try:
151+
import h2o
152+
153+
h2o.no_progress()
154+
except ImportError:
155+
pass
156+
except Exception:
157+
pass
158+
143159
# --- OPTIMIZATION: One-time TF/GPU Setup ---
144-
if not _TF_INITIALIZED:
160+
if is_gpu_model:
161+
# Optimize Keras/TF runtime by disabling traceback filtering
162+
try:
163+
tf.debugging.disable_traceback_filtering()
164+
except AttributeError:
165+
pass
166+
167+
if is_gpu_model and not _TF_INITIALIZED:
145168
try:
146169
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
147170
if gpu_devices:
@@ -417,93 +440,126 @@ def __init__(
417440
# class itself, making it a self-contained scikit-learn meta-estimator.
418441
# No special orchestration is needed here anymore.
419442

420-
# Instantiate and run the hyperparameter grid/random search
421-
search = HyperparameterSearch(
422-
algorithm=current_algorithm,
423-
parameter_space=parameter_space,
424-
method_name=method_name,
425-
global_params=self.global_parameters,
426-
sub_sample_pct=self.sub_sample_param_space_pct, # Explore 50% of the parameter space
427-
max_iter=n_iter_v, # Maximum iterations for randomized search
428-
ml_grid_object=ml_grid_object,
429-
cv=self.cv,
430-
)
431-
432-
if self.global_parameters.verbose >= 3:
433-
self.logger.debug("Running hyperparameter search")
443+
# --- OPTIMIZATION: Force Sequential Search for H2O/GPU Models ---
444+
# Save original n_jobs to restore later. This prevents HyperparameterSearch
445+
# from spawning parallel jobs for models that are not thread/process safe
446+
# or have their own internal parallelism (like H2O).
447+
original_grid_n_jobs = self.global_parameters.grid_n_jobs
448+
if is_gpu_model or is_h2o_model:
449+
self.global_parameters.grid_n_jobs = 1
434450

435-
# Define default scores early to handle timeouts in search phase
436-
default_scores = {
437-
"test_accuracy": np.array([0.5]),
438-
"test_f1": np.array([0.5]),
439-
"test_auc": np.array([0.5]),
440-
"fit_time": np.array([0]),
441-
"score_time": np.array([0]),
442-
"train_score": np.array([0.5]),
443-
"test_recall": np.array([0.5]),
444-
}
451+
try:
452+
# Instantiate and run the hyperparameter grid/random search
453+
search = HyperparameterSearch(
454+
algorithm=current_algorithm,
455+
parameter_space=parameter_space,
456+
method_name=method_name,
457+
global_params=self.global_parameters,
458+
sub_sample_pct=self.sub_sample_param_space_pct, # Explore 50% of the parameter space
459+
max_iter=n_iter_v, # Maximum iterations for randomized search
460+
ml_grid_object=ml_grid_object,
461+
cv=self.cv,
462+
)
445463

446-
failed = False
447-
scores = None
464+
if self.global_parameters.verbose >= 3:
465+
self.logger.debug("Running hyperparameter search")
466+
467+
# Define default scores early to handle timeouts in search phase
468+
default_scores = {
469+
"test_accuracy": np.array([0.5]),
470+
"test_f1": np.array([0.5]),
471+
"test_auc": np.array([0.5]),
472+
"fit_time": np.array([0]),
473+
"score_time": np.array([0]),
474+
"train_score": np.array([0.5]),
475+
"test_recall": np.array([0.5]),
476+
}
477+
478+
failed = False
479+
scores = None
448480

449-
# Initialize start_time early
450-
start_time = time.time()
481+
# Initialize start_time early
482+
start_time = time.time()
451483

452-
try:
453-
# Verify initial index alignment
454484
try:
455-
assert_index_equal(self.X_train.index, self.y_train.index)
485+
# Verify initial index alignment
486+
try:
487+
assert_index_equal(self.X_train.index, self.y_train.index)
488+
ml_grid_object.logger.debug(
489+
"Index alignment PASSED before search.run_search"
490+
)
491+
except AssertionError:
492+
ml_grid_object.logger.error(
493+
"Index alignment FAILED before search.run_search"
494+
)
495+
raise
496+
497+
# Ensure y_train is a Series for consistency
498+
if not isinstance(self.y_train, pd.Series):
499+
ml_grid_object.logger.error(
500+
f"y_train is not a pandas Series, but {type(self.y_train)}. Converting to Series."
501+
)
502+
self.y_train = pd.Series(self.y_train, index=self.X_train.index)
503+
504+
# CRITICAL FIX: Reset indices to ensure integer-based indexing for sklearn
505+
# This prevents "String indexing is not supported with 'axis=0'" errors
506+
X_train_reset = self.X_train.reset_index(drop=True)
507+
y_train_reset = self.y_train.reset_index(drop=True)
508+
456509
ml_grid_object.logger.debug(
457-
"Index alignment PASSED before search.run_search"
510+
f"X_train index after reset: {X_train_reset.index[:5]}"
458511
)
459-
except AssertionError:
460-
ml_grid_object.logger.error(
461-
"Index alignment FAILED before search.run_search"
512+
ml_grid_object.logger.debug(
513+
f"y_train index after reset: {y_train_reset.index[:5]}"
462514
)
463-
raise
464515

465-
# Ensure y_train is a Series for consistency
466-
if not isinstance(self.y_train, pd.Series):
467-
ml_grid_object.logger.error(
468-
f"y_train is not a pandas Series, but {type(self.y_train)}. Converting to Series."
469-
)
470-
self.y_train = pd.Series(self.y_train, index=self.X_train.index)
516+
# --- OPTIMIZATION: Convert y to numpy for ALL models ---
517+
# This avoids expensive sklearn type_of_target checks on Pandas Series (overhead seen in profiling)
518+
# Most sklearn models handle numpy arrays efficiently.
519+
if isinstance(y_train_reset.dtype, pd.CategoricalDtype):
520+
y_train_search = y_train_reset.cat.codes.values
521+
elif hasattr(y_train_reset, "values"):
522+
y_train_search = y_train_reset.values
523+
else:
524+
y_train_search = y_train_reset
471525

472-
# CRITICAL FIX: Reset indices to ensure integer-based indexing for sklearn
473-
# This prevents "String indexing is not supported with 'axis=0'" errors
474-
X_train_reset = self.X_train.reset_index(drop=True)
475-
y_train_reset = self.y_train.reset_index(drop=True)
526+
# --- OPTIMIZATION: Skip parameter validation overhead ---
527+
# Use set_config to ensure it propagates to all internal calls
528+
with sklearn.config_context(skip_parameter_validation=True):
529+
# Pass reset data to search
530+
if is_h2o_model:
531+
try:
532+
import h2o
476533

477-
ml_grid_object.logger.debug(
478-
f"X_train index after reset: {X_train_reset.index[:5]}"
479-
)
480-
ml_grid_object.logger.debug(
481-
f"y_train index after reset: {y_train_reset.index[:5]}"
482-
)
534+
h2o.no_progress()
535+
except Exception:
536+
pass
483537

484-
# Pass reset data to search
485-
current_algorithm = search.run_search(X_train_reset, y_train_reset)
538+
current_algorithm = search.run_search(X_train_reset, y_train_search)
486539

487-
except TimeoutError:
488-
self.logger.warning("Timeout occurred during hyperparameter search.")
489-
failed = "Timeout"
490-
scores = default_scores
540+
except TimeoutError:
541+
self.logger.warning("Timeout occurred during hyperparameter search.")
542+
failed = "Timeout"
543+
scores = default_scores
491544

492-
except Exception as e:
493-
if "dual coefficients or intercepts are not finite" in str(e):
494-
self.logger.warning(
495-
f"SVC failed to fit due to data issues: {e}. Returning default score."
496-
)
497-
self.grid_search_cross_validate_score_result = 0.5
498-
return
545+
except Exception as e:
546+
if "dual coefficients or intercepts are not finite" in str(e):
547+
self.logger.warning(
548+
f"SVC failed to fit due to data issues: {e}. Returning default score."
549+
)
550+
self.grid_search_cross_validate_score_result = 0.5
551+
return
499552

500-
# Log the error and re-raise it to stop the entire execution,
501-
# allowing the main loop in main.py to handle it based on error_raise.
502-
self.logger.error(
503-
f"An exception occurred during hyperparameter search for {method_name}: {e}",
504-
exc_info=True,
505-
)
506-
raise e
553+
# Log the error and re-raise it to stop the entire execution,
554+
# allowing the main loop in main.py to handle it based on error_raise.
555+
self.logger.error(
556+
f"An exception occurred during hyperparameter search for {method_name}: {e}",
557+
exc_info=True,
558+
)
559+
raise e
560+
finally:
561+
# Restore the original grid_n_jobs setting
562+
self.global_parameters.grid_n_jobs = original_grid_n_jobs
507563

508564
# --- PERFORMANCE FIX for testing ---
509565
# If in test_mode, we have already verified that the search runs without crashing.
@@ -571,7 +627,11 @@ def __init__(
571627
# sklearn models can benefit from using NumPy arrays.
572628
if isinstance(current_algorithm, H2O_MODEL_TYPES):
573629
X_train_final = self.X_train # Pass DataFrame directly
574-
y_train_final = self.y_train # Pass Series (Categorical)
630+
# Optimization: Pass numpy array for y to avoid pandas overhead in sklearn checks
631+
if isinstance(self.y_train.dtype, pd.CategoricalDtype):
632+
y_train_final = self.y_train.cat.codes.values
633+
else:
634+
y_train_final = self.y_train.values
575635
else:
576636
X_train_final = self.X_train.values # Use NumPy array for other models
577637
# Optimization: Pass numpy array for y to avoid pandas overhead in sklearn
@@ -581,6 +641,14 @@ def __init__(
581641
else:
582642
y_train_final = self.y_train.values
583643

644+
# --- OPTIMIZATION: Convert y to int if possible ---
645+
# This speeds up sklearn metric calculations (confusion_matrix, unique_labels)
646+
# significantly compared to string/object arrays.
647+
try:
648+
y_train_final = y_train_final.astype(int)
649+
except (ValueError, TypeError):
650+
pass
651+
584652
scores = None
585653

586654
# Check for user override to force second CV
@@ -687,16 +755,27 @@ def __init__(
687755
# Note: current_algorithm is already fitted on full X_train by HyperparameterSearch (refit=True)
688756
# so we do not need to call .fit() again here.
689757

690-
scores = cross_validate(
691-
current_algorithm,
692-
X_train_final,
693-
y_train_final, # Use optimized y (numpy for sklearn, Series for H2O)
694-
scoring=self.metric_list,
695-
cv=self.cv,
696-
n_jobs=final_cv_n_jobs, # Use adjusted n_jobs
697-
pre_dispatch="2*n_jobs",
698-
error_score=self.error_raise, # Raise error if cross-validation fails
699-
)
758+
# --- OPTIMIZATION: Skip parameter validation overhead (99s) ---
759+
with sklearn.config_context(skip_parameter_validation=True):
760+
# Ensure H2O progress is disabled before CV
761+
if is_h2o_model:
762+
try:
763+
import h2o
764+
765+
h2o.no_progress()
766+
except Exception:
767+
pass
768+
769+
scores = cross_validate(
770+
current_algorithm,
771+
X_train_final,
772+
y_train_final, # Use optimized y (numpy for sklearn, Series for H2O)
773+
scoring=self.metric_list,
774+
cv=self.cv,
775+
n_jobs=final_cv_n_jobs, # Use adjusted n_jobs
776+
pre_dispatch="2*n_jobs",
777+
error_score=self.error_raise, # Raise error if cross-validation fails
778+
)
700779

701780
# Pre-compile the predict function for Keras/TF models to avoid retracing warnings.
702781
# This is done AFTER fitting and before cross-validation.
@@ -864,7 +943,10 @@ def __init__(
864943

865944
# calculate metric for optimisation
866945
try:
867-
auc = metrics.roc_auc_score(self.y_test, best_pred_orig)
946+
y_test_np = (
947+
self.y_test.values if hasattr(self.y_test, "values") else self.y_test
948+
)
949+
auc = metrics.roc_auc_score(y_test_np, best_pred_orig)
868950
except Exception:
869951
auc = 0.5
870952

ml_grid/results_processing/plot_distributions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def plot_metric_distributions(
6363
ValueError: If no specified metrics are found in the data.
6464
"""
6565
if metrics is None:
66-
metrics = ["auc", "mcc", "f1", "precision", "recall", "accuracy"]
66+
metrics = ["auc", "mcc", "f1", "precision", "recall", "accuracy", "support"]
6767

6868
available_metrics = [col for col in metrics if col in self.clean_data.columns]
6969

Comments (0)