Skip to content

Commit d8a3439

Browse files
author
SamoraHunter
committed
Optimize TF/GPU init and fix nested parallelism in grid search
- Prevent repeated TensorFlow/GPU initialization overhead by adding a global `_TF_INITIALIZED` flag in `grid_search_cross_validate.py`.
- Detect execution within worker processes (daemon) and force `grid_n_jobs=1` to prevent CPU oversubscription caused by nested parallelism (outer hyperopt loop + inner cross-validation loop).
1 parent 8656cd4 commit d8a3439

1 file changed

Lines changed: 31 additions & 11 deletions

File tree

ml_grid/pipeline/grid_search_cross_validate.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import time
22
import logging
3+
import multiprocessing
34
import warnings
45
from typing import Any, Dict, List, Optional, Union
56

@@ -46,6 +47,9 @@
4647
from ml_grid.util.bayes_utils import is_skopt_space
4748
from skopt.space import Categorical
4849

50+
# Global flag to ensure TensorFlow/GPU setup runs only once per process
51+
_TF_INITIALIZED = False
52+
4953

5054
class grid_search_crossvalidate:
5155

@@ -102,26 +106,42 @@ def __init__(
102106

103107
self.sub_sample_parameter_val = sub_sample_parameter_val
104108

105-
grid_n_jobs = self.global_params.grid_n_jobs
109+
# --- OPTIMIZATION: Detect Nested Parallelism ---
110+
# If running inside a worker process (daemon), force n_jobs=1 to prevent
111+
# oversubscription (outer loop parallel * inner loop parallel).
112+
if multiprocessing.current_process().daemon:
113+
self.global_params.grid_n_jobs = 1
114+
grid_n_jobs = 1
115+
else:
116+
grid_n_jobs = self.global_params.grid_n_jobs
106117

107118
# Configure GPU usage and job limits for specific models
108119
is_gpu_model = (
109120
"keras" in method_name.lower()
110121
or "xgb" in method_name.lower()
111122
or "catboost" in method_name.lower()
112123
)
124+
125+
global _TF_INITIALIZED
113126
if is_gpu_model:
114127
grid_n_jobs = 1
115-
try:
116-
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
117-
if gpu_devices:
118-
for device in gpu_devices:
119-
tf.config.experimental.set_memory_growth(device, True)
120-
else:
121-
# Explicitly set CPU as the visible device for TensorFlow to avoid CUDA init errors
122-
tf.config.set_visible_devices([], "GPU")
123-
except Exception as e:
124-
self.logger.warning(f"Could not configure GPU for TensorFlow: {e}")
128+
# --- OPTIMIZATION: One-time TF/GPU Setup ---
129+
if not _TF_INITIALIZED:
130+
try:
131+
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
132+
if gpu_devices:
133+
for device in gpu_devices:
134+
try:
135+
tf.config.experimental.set_memory_growth(device, True)
136+
except RuntimeError:
137+
pass # Memory growth must be set before GPUs have been initialized
138+
else:
139+
# Explicitly set CPU as the visible device for TensorFlow to avoid CUDA init errors
140+
tf.config.set_visible_devices([], "GPU")
141+
except Exception as e:
142+
self.logger.warning(f"Could not configure GPU for TensorFlow: {e}")
143+
finally:
144+
_TF_INITIALIZED = True
125145

126146
self.metric_list = self.global_params.metric_list
127147

0 commit comments

Comments (0)