Skip to content

Commit bb88e15

Browse files
committed
contest: hw: hwksft: probe the SSH before issuing reboot
Not sure why, but the reboot command hangs if the kernel crashes while holding the rtnl lock.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 74a8708 commit bb88e15

1 file changed

Lines changed: 19 additions & 14 deletions

File tree

contest/hw/lib/deployer.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -360,25 +360,30 @@ def _journal_has_crash_sentinel(ipaddr):
360360
def reboot_machine(config, mc, reservation_id, machine_ids, machine_ips):
361361
"""Reboot the machine via SSH, falling back to BMC power cycle."""
362362
primary_ip = machine_ips[0]
363-
boot_timeout = config.getint('hw', 'max_kexec_boot_timeout', fallback=300)
364363
power_cycle_timeout = config.getint('hw', 'max_power_cycle_timeout', fallback=600)
365364

366365
def _refresh():
367366
mc.reservation_refresh(reservation_id)
368367

369-
# Try SSH reboot first
370-
print(f"reboot_machine: rebooting {primary_ip} via SSH")
371-
_ssh(primary_ip, 'reboot', check=False, timeout=5)
372-
373-
try:
374-
_wait_for_ssh(primary_ip, timeout=power_cycle_timeout, keepalive=_refresh)
375-
print(f"reboot_machine: {primary_ip} is back")
376-
except TimeoutError:
377-
# SSH reboot didn't work, hard cycle via BMC
378-
print(f"reboot_machine: SSH reboot timed out, power cycling")
379-
mc.power_cycle(machine_ids[0])
380-
_wait_for_ssh(primary_ip, timeout=power_cycle_timeout, keepalive=_refresh)
381-
print(f"reboot_machine: {primary_ip} back after power cycle")
368+
# Check if SSH is responsive at all before trying reboot
369+
ssh_ok = _ssh_retcode(primary_ip, 'true', timeout=10) == 0
370+
if ssh_ok:
371+
print(f"reboot_machine: rebooting {primary_ip} via SSH")
372+
_ssh(primary_ip, 'reboot', check=False, timeout=5)
373+
try:
374+
_wait_for_ssh(primary_ip, timeout=power_cycle_timeout, keepalive=_refresh)
375+
print(f"reboot_machine: {primary_ip} is back")
376+
return
377+
except TimeoutError:
378+
print(f"reboot_machine: SSH reboot timed out, falling back to BMC")
379+
else:
380+
print(f"reboot_machine: SSH not responsive on {primary_ip}")
381+
382+
# BMC power cycle
383+
print(f"reboot_machine: power cycling {primary_ip} via BMC")
384+
mc.power_cycle(machine_ids[0])
385+
_wait_for_ssh(primary_ip, timeout=power_cycle_timeout, keepalive=_refresh)
386+
print(f"reboot_machine: {primary_ip} back after power cycle")
382387

383388

384389
def fetch_results(machine_ips, reservation_id, results_path):

0 commit comments

Comments
 (0)