Skip to content

Commit a97190f

Browse files
author
SamoraHunter
committed
cluster management fixes
1 parent c64fe9d commit a97190f

2 files changed

Lines changed: 45 additions & 8 deletions

File tree

ml_grid/model_classes/H2OBaseClassifier.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,11 +129,48 @@ def __setstate__(self, state):
129129

130130
def _ensure_h2o_is_running(self):
131131
"""Safely checks for and initializes an H2O cluster if not running."""
132-
cluster = h2o.cluster()
132+
try:
133+
cluster = h2o.cluster()
134+
except Exception:
135+
cluster = None
136+
133137
show_progress = getattr(global_parameters, "h2o_show_progress", False)
134138

135-
if not (cluster and cluster.is_running()):
136-
h2o.init()
139+
is_healthy = False
140+
if cluster and cluster.is_running():
141+
is_healthy = True
142+
try:
143+
# Check if cluster has memory.
144+
# total_mem is in bytes. Below 1 MB the cluster is treated as broken; if no value can be read (None), this check is skipped.
145+
memory = None
146+
try:
147+
memory = cluster.total_mem()
148+
except Exception:
149+
try:
150+
memory = cluster.free_mem()
151+
except Exception:
152+
pass
153+
154+
if memory is not None and isinstance(memory, (int, float)):
155+
if memory < 1024 * 1024: # < 1MB
156+
self.logger.warning(
157+
f"H2O cluster is running but reports {memory} memory. Treating as unhealthy."
158+
)
159+
is_healthy = False
160+
except Exception as e:
161+
self.logger.warning(f"H2O cluster check failed: {e}")
162+
163+
if not is_healthy:
164+
# If it was running but unhealthy, try to shut it down first to clear state
165+
if cluster and cluster.is_running():
166+
try:
167+
self.logger.warning("Shutting down unhealthy H2O cluster...")
168+
cluster.shutdown()
169+
except Exception:
170+
pass
171+
172+
self.logger.info("Initializing H2O cluster...")
173+
h2o.init(strict_version_check=False)
137174
self._is_cluster_owner = True
138175

139176
# Set progress bar visibility based on the global parameter

tests/test_h2o_classifiers.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,13 +236,13 @@ def test_h2o_gam_knot_distribution_error(h2o_session_fixture):
236236

237237
# Ensure enough unique values survive the CV split to pass the cardinality check (>= 10)
238238
# We need > 10 unique values in the training fold.
239-
# With 50/50 split, we need roughly > 20 unique values in total.
240-
# We keep it skewed (mostly 0s) to trigger the quantile error.
241-
skewed_vals = np.array([0] * 70 + list(range(1, 31)))
239+
# We increase sample size to 200 to ensure stability of the split while maintaining skew.
240+
# 140 zeros, 60 unique values (1..60).
241+
skewed_vals = np.array([0] * 140 + list(range(1, 61)))
242242
np.random.shuffle(skewed_vals)
243243

244-
X = pd.DataFrame({"feature1": np.random.rand(100), "feature_skewed": skewed_vals})
245-
y = pd.Series(np.random.randint(0, 2, 100), name="outcome")
244+
X = pd.DataFrame({"feature1": np.random.rand(200), "feature_skewed": skewed_vals})
245+
y = pd.Series(np.random.randint(0, 2, 200), name="outcome")
246246

247247
estimator = H2O_GAM_class(
248248
X=X, y=y, parameter_space_size="small"

0 commit comments

Comments
 (0)