@@ -360,25 +360,30 @@ def _journal_has_crash_sentinel(ipaddr):
def reboot_machine(config, mc, reservation_id, machine_ids, machine_ips):
    """Restart the primary machine, preferring SSH and falling back to the BMC.

    The first entry of ``machine_ips`` is probed with a trivial SSH command.
    If the probe succeeds, ``reboot`` is issued over SSH and the function
    waits for SSH to answer again, returning as soon as it does.  If the
    probe fails, or the wait raises ``TimeoutError``, the first entry of
    ``machine_ids`` is power cycled through ``mc`` and the wait is repeated.

    Args:
        config: Configuration object; ``getint('hw',
            'max_power_cycle_timeout', fallback=600)`` supplies the timeout
            used for every SSH wait.
        mc: Client providing ``reservation_refresh`` and ``power_cycle``.
        reservation_id: Reservation passed to ``mc.reservation_refresh``
            by the keepalive callback handed to the SSH wait.
        machine_ids: Machine identifiers; only index 0 is used, and only on
            the power-cycle path.
        machine_ips: Machine addresses; only index 0 is used.

    Raises:
        IndexError: If ``machine_ips`` is empty, or ``machine_ids`` is empty
            when the power-cycle path is reached.

    Exceptions raised by the wait that follows the power cycle are not
    caught here and propagate to the caller.
    """
    target_ip = machine_ips[0]
    wait_limit = config.getint('hw', 'max_power_cycle_timeout', fallback=600)

    def _keep_reservation_alive():
        mc.reservation_refresh(reservation_id)

    def _await_ssh():
        # Single place for the wait parameters shared by both recovery paths.
        _wait_for_ssh(
            target_ip,
            timeout=wait_limit,
            keepalive=_keep_reservation_alive,
        )

    # Probe first: there is no point issuing 'reboot' over a dead connection.
    if _ssh_retcode(target_ip, 'true', timeout=10) != 0:
        print(f"reboot_machine: SSH not responsive on {target_ip} ")
    else:
        print(f"reboot_machine: rebooting {target_ip} via SSH")
        _ssh(target_ip, 'reboot', check=False, timeout=5)
        try:
            _await_ssh()
            print(f"reboot_machine: {target_ip} is back")
            return
        except TimeoutError:
            print("reboot_machine: SSH reboot timed out, falling back to BMC")

    # Reached when SSH was unusable or the soft reboot never came back.
    print(f"reboot_machine: power cycling {target_ip} via BMC")
    mc.power_cycle(machine_ids[0])
    _await_ssh()
    print(f"reboot_machine: {target_ip} back after power cycle")
382387
383388
384389def fetch_results (machine_ips , reservation_id , results_path ):
0 commit comments