@@ -129,11 +129,48 @@ def __setstate__(self, state):
129129
130130 def _ensure_h2o_is_running (self ):
131131 """Safely checks for and initializes an H2O cluster if not running."""
132- cluster = h2o .cluster ()
132+ try :
133+ cluster = h2o .cluster ()
134+ except Exception :
135+ cluster = None
136+
133137 show_progress = getattr (global_parameters , "h2o_show_progress" , False )
134138
135- if not (cluster and cluster .is_running ()):
136- h2o .init ()
139+ is_healthy = False
140+ if cluster and cluster .is_running ():
141+ is_healthy = True
142+ try :
143+ # Check if cluster has memory.
144+ # total_mem is in bytes. If it's 0 or None, it's broken.
145+ memory = None
146+ try :
147+ memory = cluster .total_mem ()
148+ except Exception :
149+ try :
150+ memory = cluster .free_mem ()
151+ except Exception :
152+ pass
153+
154+ if memory is not None and isinstance (memory , (int , float )):
155+ if memory < 1024 * 1024 : # < 1MB
156+ self .logger .warning (
157+ f"H2O cluster is running but reports { memory } memory. Treating as unhealthy."
158+ )
159+ is_healthy = False
160+ except Exception as e :
161+ self .logger .warning (f"H2O cluster check failed: { e } " )
162+
163+ if not is_healthy :
164+ # If it was running but unhealthy, try to shut it down first to clear state
165+ if cluster and cluster .is_running ():
166+ try :
167+ self .logger .warning ("Shutting down unhealthy H2O cluster..." )
168+ cluster .shutdown ()
169+ except Exception :
170+ pass
171+
172+ self .logger .info ("Initializing H2O cluster..." )
173+ h2o .init (strict_version_check = False )
137174 self ._is_cluster_owner = True
138175
139176 # Set progress bar visibility based on the global parameter
0 commit comments