Commit 2d73cf3

SamoraHunter committed
Implemented timeout at model fit method and trial level. Added user config values and defaults. Propagated failing trials to score save. Error and timeout failures propagated.
1 parent 13d8223 commit 2d73cf3

5 files changed

Lines changed: 194 additions & 106 deletions

config_hyperopt.yml

Lines changed: 3 additions & 1 deletion
@@ -11,6 +11,7 @@ global_params:
 n_iter: 2
 max_param_space_iter_value : 10
 force_second_cv: false # If True, forces a second cross-validation run even if cached results are available. Defaults to False.
+model_eval_time_limit: 3600

 # Experiment settings for the hyperopt run
 experiment:
@@ -103,4 +104,5 @@ hyperopt_search_space:

 # Hyperopt-specific settings
 hyperopt_settings:
-max_evals: 2 # Number of iterations per outcome variable
+max_evals: 2 # Number of iterations per outcome variable
+trial_timeout: 1120 # Timeout in seconds for a full trial (data prep + all models)
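
For reference, a minimal sketch of how these two new keys might be consumed, assuming PyYAML and the key layout shown in this diff; load_timeouts is a hypothetical helper, not part of this commit:

import yaml  # assumption: PyYAML is available

def load_timeouts(path="config_hyperopt.yml"):
    # Hypothetical helper: reads the two timeout settings added above.
    with open(path) as f:
        cfg = yaml.safe_load(f)
    # Per-model evaluation limit in seconds (global_params section).
    model_limit = cfg.get("global_params", {}).get("model_eval_time_limit")
    # Whole-trial limit in seconds (hyperopt_settings section).
    trial_limit = cfg.get("hyperopt_settings", {}).get("trial_timeout")
    return model_limit, trial_limit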

ml_grid/pipeline/grid_search_cross_validate.py

Lines changed: 47 additions & 32 deletions
@@ -400,6 +400,23 @@ def __init__(
 if self.global_parameters.verbose >= 3:
 self.logger.debug("Running hyperparameter search")

+# Define default scores early to handle timeouts in search phase
+default_scores = {
+"test_accuracy": np.array([0.5]),
+"test_f1": np.array([0.5]),
+"test_auc": np.array([0.5]),
+"fit_time": np.array([0]),
+"score_time": np.array([0]),
+"train_score": np.array([0.5]),
+"test_recall": np.array([0.5]),
+}
+
+failed = False
+scores = None
+
+# Initialize start_time early
+start_time = time.time()
+
 try:
 # Verify initial index alignment
 try:
@@ -435,6 +452,11 @@ def __init__(
 # Pass reset data to search
 current_algorithm = search.run_search(X_train_reset, y_train_reset)

+except TimeoutError:
+self.logger.warning("Timeout occurred during hyperparameter search.")
+failed = "Timeout"
+scores = default_scores
+
 except Exception as e:
 if "dual coefficients or intercepts are not finite" in str(e):
 self.logger.warning(
@@ -454,7 +476,7 @@ def __init__(
 # --- PERFORMANCE FIX for testing ---
 # If in test_mode, we have already verified that the search runs without crashing.
 # We can skip the final, slow cross-validation and return a dummy score.
-if getattr(self.global_parameters, "test_mode", False):
+if not failed and getattr(self.global_parameters, "test_mode", False):
 self.logger.info(
 "Test mode enabled. Skipping final cross-validation for speed."
 )
@@ -463,7 +485,7 @@ def __init__(
 self._shutdown_h2o_if_needed(current_algorithm)
 return

-if self.global_parameters.verbose >= 3:
+if not failed and self.global_parameters.verbose >= 3:
 self.logger.debug("Fitting final model")

 # In production, we re-fit the best estimator on the full training data before CV.
@@ -472,15 +494,14 @@ def __init__(

 metric_list = self.metric_list

-# Catch only one class present AUC not defined:
-
-if len(np.unique(self.y_train)) < 2:
+# Catch only one class present AUC not defined (check only if not already failed)
+if not failed and len(np.unique(self.y_train)) < 2:
 raise ValueError(
 "Only one class present in y_train. ROC AUC score is not defined "
 "in that case. grid_search_cross_validate>>>cross_validate"
 )

-if self.global_parameters.verbose >= 1:
+if not failed and self.global_parameters.verbose >= 1:
 self.logger.info("Getting cross validation scores")
 self.logger.debug(
 f"X_train shape: {self.X_train.shape}, y_train shape: {self.y_train.shape}"
@@ -490,27 +511,6 @@ def __init__(
 # Set a time threshold in seconds
 time_threshold = 60 # For example, 60 seconds

-start_time = time.time()
-
-# Define default scores (e.g., mean score of 0.5 for binary classification)
-# Default scores if cross-validation fails
-default_scores = {
-"test_accuracy": np.array(
-[0.5]
-), # Default to random classifier performance
-"test_f1": np.array(
-[0.5]
-), # Default F1 score (again, 0.5 for random classification)
-"test_auc": np.array(
-[0.5]
-), # Default ROC AUC score (0.5 for random classifier)
-"fit_time": np.array([0]), # No fitting time if the model fails
-"score_time": np.array([0]), # No scoring time if the model fails
-"train_score": np.array([0.5]), # Default train score
-"test_recall": np.array([0.5]),
-#'test_auc': [0.5] # ?
-}
-
 # --- CRITICAL FIX for H2O multiprocessing error ---
 # H2O models cannot be pickled and sent to other processes for parallel
 # execution with joblib. We must detect if the current algorithm is an
@@ -541,9 +541,10 @@ def __init__(
 "H2O or Keras model detected. Forcing n_jobs=1 for final cross-validation."
 )

-failed = False
-
 try:
+if failed:
+raise TimeoutError
+
 # H2O models require pandas DataFrames with column names, while other
 # sklearn models can benefit from using NumPy arrays.
 if isinstance(current_algorithm, h2o_model_types):
@@ -737,17 +738,19 @@ def __init__(
 )

 # Set default scores if the AdaBoostClassifier fails
+failed = True
 scores = default_scores # Use default scores

 else:
 self.logger.error(
 f"An unexpected ValueError occurred during cross-validation: {e}",
 exc_info=True,
 )
+failed = True
 scores = default_scores # Use default scores for other errors

 except RuntimeError as e:
-raise e # raise h2o errors to aid development
+# raise e # raise h2o errors to aid development
 # --- FIX for UnboundLocalError with H2OStackedEnsemble ---
 # Catch any RuntimeError, which can be raised by H2O models during fit
 # (e.g., base model training failure) or predict.
@@ -759,12 +762,18 @@ def __init__(
 failed = True
 scores = default_scores

+except TimeoutError:
+self.logger.warning("Timeout occurred during cross-validation.")
+failed = "Timeout"
+scores = default_scores
+
 except Exception as e:
 # Catch any other general exceptions and log them
 self.logger.error(
 f"An unexpected error occurred during cross-validation: {e}",
 exc_info=True,
 )
+failed = True
 scores = default_scores # Use default scores if an error occurs

 # End the timer
@@ -801,7 +810,10 @@ def __init__(
 # plot_auc_results(grid.best_estimator_, X_test_orig, self.y_test_orig, cv)

 # this should be x_test...?
-best_pred_orig = current_algorithm.predict(self.X_test) # exp
+try:
+best_pred_orig = current_algorithm.predict(self.X_test) # exp
+except Exception:
+best_pred_orig = np.zeros(len(self.X_test))

 # Call the update_score_log method on the provided instance
 if self.project_score_save_class_instance:
@@ -822,7 +834,10 @@ def __init__(
 )

 # calculate metric for optimisation
-auc = metrics.roc_auc_score(self.y_test, best_pred_orig)
+try:
+auc = metrics.roc_auc_score(self.y_test, best_pred_orig)
+except Exception:
+auc = 0.5

 self.grid_search_cross_validate_score_result = auc
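
The control flow above is hard to follow inside a diff, so here is a standalone sketch of the intended failure-handling pattern (not the actual class; run_search and run_cv are illustrative stand-ins): a timeout in the search or in cross-validation sets failed and substitutes default_scores, so the score log still receives a complete row.

import numpy as np

# Neutral fallback metrics, mirroring the default_scores dict added above.
default_scores = {
    "test_accuracy": np.array([0.5]),
    "test_f1": np.array([0.5]),
    "test_auc": np.array([0.5]),
    "fit_time": np.array([0]),
    "score_time": np.array([0]),
    "train_score": np.array([0.5]),
    "test_recall": np.array([0.5]),
}

def evaluate(run_search, run_cv):
    # Illustrative stand-ins for the search and cross-validation steps
    # performed inside grid_search_crossvalidate.
    failed, scores, model = False, None, None
    try:
        model = run_search()
    except TimeoutError:
        failed, scores = "Timeout", default_scores

    try:
        if failed:
            raise TimeoutError  # skip CV when the search already timed out
        scores = run_cv(model)
    except TimeoutError:
        failed, scores = "Timeout", default_scores
    except Exception:
        failed, scores = True, default_scores
    return failed, scores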

ml_grid/pipeline/main.py

Lines changed: 89 additions & 10 deletions
@@ -1,6 +1,9 @@
 import logging
+import signal
+import time
 import traceback
 from typing import Any, Dict, List, Tuple
+from contextlib import contextmanager

 import numpy as np
 from sklearn.model_selection import ParameterGrid
@@ -12,6 +15,56 @@
 from ml_grid.util.project_score_save import project_score_save_class # Import the class


+@contextmanager
+def time_limit(seconds):
+if seconds is None:
+yield
+return
+
+try:
+seconds_int = int(seconds)
+except (ValueError, TypeError):
+logging.getLogger("ml_grid").warning(f"Invalid timeout value: {seconds}. Timeout disabled.")
+yield
+return
+
+if seconds_int <= 0:
+yield
+return
+
+if not hasattr(signal, "SIGALRM"):
+logging.getLogger("ml_grid").warning("Timeout not supported on this platform (SIGALRM missing).")
+yield
+return
+def signal_handler(signum, frame):
+raise TimeoutError(f"Timeout of {seconds}s reached")
+
+# Check for existing alarm (nesting support)
+previous_remaining = signal.alarm(0)
+start_time = time.time()
+
+# Determine effective timeout (min of new and remaining outer)
+if previous_remaining > 0:
+effective_seconds = min(seconds_int, previous_remaining)
+else:
+effective_seconds = seconds_int
+
+# Save the old handler
+original_handler = signal.signal(signal.SIGALRM, signal_handler)
+signal.alarm(effective_seconds)
+try:
+yield
+finally:
+signal.alarm(0)
+signal.signal(signal.SIGALRM, original_handler)
+
+# Restore previous alarm if it existed, adjusting for elapsed time
+if previous_remaining > 0:
+elapsed = time.time() - start_time
+# Ensure we don't set 0 or negative; if expired, set 1s to trigger immediately
+remaining_outer = max(1, int(previous_remaining - elapsed))
+signal.alarm(remaining_outer)
+
 class run:
 """Orchestrates the hyperparameter search for a list of models."""

@@ -241,11 +294,24 @@ def execute_single_model(self, args: Tuple) -> float:
 """
 try:
 self.logger.info(f"Starting grid search for {args[2]}...")
-gscv_instance = grid_search_cross_validate.grid_search_crossvalidate(*args)
-score = gscv_instance.grid_search_cross_validate_score_result
+
+# Retrieve timeout from local_param_dict via ml_grid_object (args[3])
+timeout = args[3].local_param_dict.get("model_eval_time_limit")
+if timeout is None:
+timeout = args[3].global_params.model_eval_time_limit
+
+with time_limit(timeout):
+gscv_instance = grid_search_cross_validate.grid_search_crossvalidate(*args)
+score = gscv_instance.grid_search_cross_validate_score_result
+
 self.logger.info(f"Score for {args[2]}: {score:.4f}")
 return score

+except TimeoutError as e:
+self.logger.warning(f"Timeout occurred for {args[2]}: {e}")
+self.model_error_list.append([args[0], e, traceback.format_exc()])
+return 0.0
+
 except Exception as e:
 self.logger.error(
 f"An exception occurred during grid search for {args[2]}: {e}",
@@ -298,18 +364,31 @@ def multi_run_wrapper(args: Tuple) -> Any:
 self.logger.info(
 f"Starting grid search for {self.arg_list[k][2]}..."
 )
-gscv_instance = (
-grid_search_cross_validate.grid_search_crossvalidate(
-*self.arg_list[k] # Unpack all arguments
+
+timeout = self.local_param_dict.get("model_eval_time_limit")
+if timeout is None:
+timeout = self.global_params.model_eval_time_limit
+
+with time_limit(timeout):
+gscv_instance = (
+grid_search_cross_validate.grid_search_crossvalidate(
+*self.arg_list[k] # Unpack all arguments
+)
 )
-)

-self.highest_score = max(
-self.highest_score,
-gscv_instance.grid_search_cross_validate_score_result,
-)
+self.highest_score = max(
+self.highest_score,
+gscv_instance.grid_search_cross_validate_score_result,
+)
 self.logger.info(f"Current highest score: {self.highest_score:.4f}")

+except TimeoutError as e:
+self.logger.warning(f"Timeout occurred for {self.arg_list[k][2]}: {e}")
+self.model_error_list.append(
+[self.arg_list[k][0], e, traceback.format_exc()]
+)
+continue
+
 except (
 Exception
 ) as e: # Catches any exception from grid_search_crossvalidate

ml_grid/util/global_params.py

Lines changed: 3 additions & 0 deletions
@@ -96,6 +96,8 @@ class GlobalParameters:
 """Verbosity level for the search object (GridSearchCV, etc.). Defaults to 0."""
 force_second_cv: bool
 """If True, forces a second cross-validation run even if cached results are available. Defaults to False."""
+model_eval_time_limit: int
+"""The time limit in seconds for a single model evaluation. Defaults to None (no limit)."""

 def __new__(cls, *args: Any, **kwargs: Any) -> "GlobalParameters":
 """Creates a new instance if one does not already exist (Singleton pattern)."""
@@ -141,6 +143,7 @@ def __init__(self, debug_level: int = 0, knn_n_jobs: int = -1) -> None:
 self.h2o_show_progress = False
 self.search_verbose = 0
 self.force_second_cv = False
+self.model_eval_time_limit = None

 custom_scorer = make_scorer(custom_roc_auc_score)
 self.metric_list = {
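
A brief configuration sketch (an assumption about intended usage, not code from this commit): the singleton default is None, meaning no limit, and callers can override it before launching a run.

from ml_grid.util.global_params import GlobalParameters

params = GlobalParameters()
params.model_eval_time_limit = 1800  # cap each model evaluation at 30 minutes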
