1010import torch
1111from IPython .display import clear_output
1212from scikeras .wrappers import KerasClassifier
13+ import sklearn
1314from sklearn import metrics
1415from pandas .testing import assert_index_equal
1516from xgboost .core import XGBoostError
@@ -137,11 +138,33 @@ def __init__(
137138 or "neural" in method_name .lower ()
138139 )
139140
141+ is_h2o_model = isinstance (algorithm_implementation , H2O_MODEL_TYPES )
142+
140143 global _TF_INITIALIZED
141- if is_gpu_model :
144+ if is_gpu_model or is_h2o_model :
142145 grid_n_jobs = 1
146+
147+ # --- OPTIMIZATION: Disable H2O Progress Bar ---
148+ # This saves significant time (~95s) spent in progressbar updates
149+ if is_h2o_model :
150+ try :
151+ import h2o
152+
153+ h2o .no_progress ()
154+ except ImportError :
155+ pass
156+ except Exception :
157+ pass
158+
143159 # --- OPTIMIZATION: One-time TF/GPU Setup ---
144- if not _TF_INITIALIZED :
160+ if is_gpu_model :
161+ # Optimize Keras/TF runtime by disabling traceback filtering
162+ try :
163+ tf .debugging .disable_traceback_filtering ()
164+ except AttributeError :
165+ pass
166+
167+ if is_gpu_model and not _TF_INITIALIZED :
145168 try :
146169 gpu_devices = tf .config .experimental .list_physical_devices ("GPU" )
147170 if gpu_devices :
@@ -417,93 +440,126 @@ def __init__(
417440 # class itself, making it a self-contained scikit-learn meta-estimator.
418441 # No special orchestration is needed here anymore.
419442
420- # Instantiate and run the hyperparameter grid/random search
421- search = HyperparameterSearch (
422- algorithm = current_algorithm ,
423- parameter_space = parameter_space ,
424- method_name = method_name ,
425- global_params = self .global_parameters ,
426- sub_sample_pct = self .sub_sample_param_space_pct , # Explore 50% of the parameter space
427- max_iter = n_iter_v , # Maximum iterations for randomized search
428- ml_grid_object = ml_grid_object ,
429- cv = self .cv ,
430- )
431-
432- if self .global_parameters .verbose >= 3 :
433- self .logger .debug ("Running hyperparameter search" )
443+ # --- OPTIMIZATION: Force Sequential Search for H2O/GPU Models ---
444+ # Save original n_jobs to restore later. This prevents HyperparameterSearch
445+ # from spawning parallel jobs for models that are not thread/process safe
446+ # or have their own internal parallelism (like H2O).
447+ original_grid_n_jobs = self .global_parameters .grid_n_jobs
448+ if is_gpu_model or is_h2o_model :
449+ self .global_parameters .grid_n_jobs = 1
434450
435- # Define default scores early to handle timeouts in search phase
436- default_scores = {
437- "test_accuracy" : np .array ([0.5 ]),
438- "test_f1" : np .array ([0.5 ]),
439- "test_auc" : np .array ([0.5 ]),
440- "fit_time" : np .array ([0 ]),
441- "score_time" : np .array ([0 ]),
442- "train_score" : np .array ([0.5 ]),
443- "test_recall" : np .array ([0.5 ]),
444- }
451+ try :
452+ # Instantiate and run the hyperparameter grid/random search
453+ search = HyperparameterSearch (
454+ algorithm = current_algorithm ,
455+ parameter_space = parameter_space ,
456+ method_name = method_name ,
457+ global_params = self .global_parameters ,
458+ sub_sample_pct = self .sub_sample_param_space_pct , # Explore 50% of the parameter space
459+ max_iter = n_iter_v , # Maximum iterations for randomized search
460+ ml_grid_object = ml_grid_object ,
461+ cv = self .cv ,
462+ )
445463
446- failed = False
447- scores = None
464+ if self .global_parameters .verbose >= 3 :
465+ self .logger .debug ("Running hyperparameter search" )
466+
467+ # Define default scores early to handle timeouts in search phase
468+ default_scores = {
469+ "test_accuracy" : np .array ([0.5 ]),
470+ "test_f1" : np .array ([0.5 ]),
471+ "test_auc" : np .array ([0.5 ]),
472+ "fit_time" : np .array ([0 ]),
473+ "score_time" : np .array ([0 ]),
474+ "train_score" : np .array ([0.5 ]),
475+ "test_recall" : np .array ([0.5 ]),
476+ }
477+
478+ failed = False
479+ scores = None
448480
449- # Initialize start_time early
450- start_time = time .time ()
481+ # Initialize start_time early
482+ start_time = time .time ()
451483
452- try :
453- # Verify initial index alignment
454484 try :
455- assert_index_equal (self .X_train .index , self .y_train .index )
485+ # Verify initial index alignment
486+ try :
487+ assert_index_equal (self .X_train .index , self .y_train .index )
488+ ml_grid_object .logger .debug (
489+ "Index alignment PASSED before search.run_search"
490+ )
491+ except AssertionError :
492+ ml_grid_object .logger .error (
493+ "Index alignment FAILED before search.run_search"
494+ )
495+ raise
496+
497+ # Ensure y_train is a Series for consistency
498+ if not isinstance (self .y_train , pd .Series ):
499+ ml_grid_object .logger .error (
500+ f"y_train is not a pandas Series, but { type (self .y_train )} . Converting to Series."
501+ )
502+ self .y_train = pd .Series (self .y_train , index = self .X_train .index )
503+
504+ # CRITICAL FIX: Reset indices to ensure integer-based indexing for sklearn
505+ # This prevents "String indexing is not supported with 'axis=0'" errors
506+ X_train_reset = self .X_train .reset_index (drop = True )
507+ y_train_reset = self .y_train .reset_index (drop = True )
508+
456509 ml_grid_object .logger .debug (
457- "Index alignment PASSED before search.run_search "
510+ f"X_train index after reset: { X_train_reset . index [: 5 ] } "
458511 )
459- except AssertionError :
460- ml_grid_object .logger .error (
461- "Index alignment FAILED before search.run_search"
512+ ml_grid_object .logger .debug (
513+ f"y_train index after reset: { y_train_reset .index [:5 ]} "
462514 )
463- raise
464515
465- # Ensure y_train is a Series for consistency
466- if not isinstance (self .y_train , pd .Series ):
467- ml_grid_object .logger .error (
468- f"y_train is not a pandas Series, but { type (self .y_train )} . Converting to Series."
469- )
470- self .y_train = pd .Series (self .y_train , index = self .X_train .index )
516+ # --- OPTIMIZATION: Convert y to numpy for ALL models ---
517+ # This avoids expensive sklearn type_of_target checks on Pandas Series (overhead seen in profiling)
518+ # Most sklearn models handle numpy arrays efficiently.
519+ if isinstance (y_train_reset .dtype , pd .CategoricalDtype ):
520+ y_train_search = y_train_reset .cat .codes .values
521+ elif hasattr (y_train_reset , "values" ):
522+ y_train_search = y_train_reset .values
523+ else :
524+ y_train_search = y_train_reset
471525
472- # CRITICAL FIX: Reset indices to ensure integer-based indexing for sklearn
473- # This prevents "String indexing is not supported with 'axis=0'" errors
474- X_train_reset = self .X_train .reset_index (drop = True )
475- y_train_reset = self .y_train .reset_index (drop = True )
526+ # --- OPTIMIZATION: Skip parameter validation overhead ---
527+ # Use config_context so the setting is scoped to this block and restored on exit
528+ with sklearn .config_context (skip_parameter_validation = True ):
529+ # Pass reset data to search
530+ if is_h2o_model :
531+ try :
532+ import h2o
476533
477- ml_grid_object .logger .debug (
478- f"X_train index after reset: { X_train_reset .index [:5 ]} "
479- )
480- ml_grid_object .logger .debug (
481- f"y_train index after reset: { y_train_reset .index [:5 ]} "
482- )
534+ h2o .no_progress ()
535+ except Exception :
536+ pass
483537
484- # Pass reset data to search
485- current_algorithm = search .run_search (X_train_reset , y_train_reset )
538+ current_algorithm = search .run_search (X_train_reset , y_train_search )
486539
487- except TimeoutError :
488- self .logger .warning ("Timeout occurred during hyperparameter search." )
489- failed = "Timeout"
490- scores = default_scores
540+ except TimeoutError :
541+ self .logger .warning ("Timeout occurred during hyperparameter search." )
542+ failed = "Timeout"
543+ scores = default_scores
491544
492- except Exception as e :
493- if "dual coefficients or intercepts are not finite" in str (e ):
494- self .logger .warning (
495- f"SVC failed to fit due to data issues: { e } . Returning default score."
496- )
497- self .grid_search_cross_validate_score_result = 0.5
498- return
545+ except Exception as e :
546+ if "dual coefficients or intercepts are not finite" in str (e ):
547+ self .logger .warning (
548+ f"SVC failed to fit due to data issues: { e } . Returning default score."
549+ )
550+ self .grid_search_cross_validate_score_result = 0.5
551+ return
499552
500- # Log the error and re-raise it to stop the entire execution,
501- # allowing the main loop in main.py to handle it based on error_raise.
502- self .logger .error (
503- f"An exception occurred during hyperparameter search for { method_name } : { e } " ,
504- exc_info = True ,
505- )
506- raise e
553+ # Log the error and re-raise it to stop the entire execution,
554+ # allowing the main loop in main.py to handle it based on error_raise.
555+ self .logger .error (
556+ f"An exception occurred during hyperparameter search for { method_name } : { e } " ,
557+ exc_info = True ,
558+ )
559+ raise e
560+ finally :
561+ # Restore the original grid_n_jobs setting
562+ self .global_parameters .grid_n_jobs = original_grid_n_jobs
507563
508564 # --- PERFORMANCE FIX for testing ---
509565 # If in test_mode, we have already verified that the search runs without crashing.
@@ -571,7 +627,11 @@ def __init__(
571627 # sklearn models can benefit from using NumPy arrays.
572628 if isinstance (current_algorithm , H2O_MODEL_TYPES ):
573629 X_train_final = self .X_train # Pass DataFrame directly
574- y_train_final = self .y_train # Pass Series (Categorical)
630+ # Optimization: Pass numpy array for y to avoid pandas overhead in sklearn checks
631+ if isinstance (self .y_train .dtype , pd .CategoricalDtype ):
632+ y_train_final = self .y_train .cat .codes .values
633+ else :
634+ y_train_final = self .y_train .values
575635 else :
576636 X_train_final = self .X_train .values # Use NumPy array for other models
577637 # Optimization: Pass numpy array for y to avoid pandas overhead in sklearn
@@ -581,6 +641,14 @@ def __init__(
581641 else :
582642 y_train_final = self .y_train .values
583643
644+ # --- OPTIMIZATION: Convert y to int if possible ---
645+ # This speeds up sklearn metric calculations (confusion_matrix, unique_labels)
646+ # significantly compared to string/object arrays.
647+ try :
648+ y_train_final = y_train_final .astype (int )
649+ except (ValueError , TypeError ):
650+ pass
651+
584652 scores = None
585653
586654 # Check for user override to force second CV
@@ -687,16 +755,27 @@ def __init__(
687755 # Note: current_algorithm is already fitted on full X_train by HyperparameterSearch (refit=True)
688756 # so we do not need to call .fit() again here.
689757
690- scores = cross_validate (
691- current_algorithm ,
692- X_train_final ,
693- y_train_final , # Use optimized y (numpy for sklearn, Series for H2O)
694- scoring = self .metric_list ,
695- cv = self .cv ,
696- n_jobs = final_cv_n_jobs , # Use adjusted n_jobs
697- pre_dispatch = "2*n_jobs" ,
698- error_score = self .error_raise , # Raise error if cross-validation fails
699- )
758+ # --- OPTIMIZATION: Skip parameter validation overhead (99s) ---
759+ with sklearn .config_context (skip_parameter_validation = True ):
760+ # Ensure H2O progress is disabled before CV
761+ if is_h2o_model :
762+ try :
763+ import h2o
764+
765+ h2o .no_progress ()
766+ except Exception :
767+ pass
768+
769+ scores = cross_validate (
770+ current_algorithm ,
771+ X_train_final ,
772+ y_train_final , # Use optimized y (numpy for sklearn, Series for H2O)
773+ scoring = self .metric_list ,
774+ cv = self .cv ,
775+ n_jobs = final_cv_n_jobs , # Use adjusted n_jobs
776+ pre_dispatch = "2*n_jobs" ,
777+ error_score = self .error_raise , # Raise error if cross-validation fails
778+ )
700779
701780 # Pre-compile the predict function for Keras/TF models to avoid retracing warnings.
702781 # This is done AFTER fitting and before cross-validation.
@@ -864,7 +943,10 @@ def __init__(
864943
865944 # calculate metric for optimisation
866945 try :
867- auc = metrics .roc_auc_score (self .y_test , best_pred_orig )
946+ y_test_np = (
947+ self .y_test .values if hasattr (self .y_test , "values" ) else self .y_test
948+ )
949+ auc = metrics .roc_auc_score (y_test_np , best_pred_orig )
868950 except Exception :
869951 auc = 0.5
870952
0 commit comments