Skip to content

Commit d249b38

Browse files
committed
contest: hw: make sure the machine is healthy before we fetch results
If the machine hung, we need to reboot it, wait, and then fetch the results.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 762bddb commit d249b38

2 files changed

Lines changed: 14 additions & 4 deletions

File tree

contest/hw/hwksft.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
kexec_machine, wait_for_results, fetch_results,
2323
parse_results, process_crashes, set_log_file,
2424
WaitResult, grab_hw_worker_journal, grab_sol_logs,
25-
reboot_machine,
25+
reboot_machine, check_healthy_ssh,
2626
CRASH_SENTINEL, _journal_has_crash_sentinel)
2727

2828
# Config:
@@ -263,7 +263,13 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
263263
print(f"Max crash retries ({max_crash_retries}) reached, giving up")
264264
break
265265

266-
# 10. Copy back results
266+
# 10. Ensure machine is reachable before fetching results
267+
if not check_healthy_ssh(machine_ips[0]):
268+
print("Machine unreachable, rebooting before fetching results")
269+
reboot_machine(config, mc, reservation_id,
270+
machine_ids, machine_ips)
271+
272+
# 11. Copy back results
267273
fetch_results(machine_ips, reservation_id, results_path)
268274

269275
# 11. Parse results

contest/hw/lib/deployer.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,11 @@ def _journal_has_crash_sentinel(ipaddr):
358358
return CRASH_SENTINEL in journal
359359

360360

361+
def check_healthy_ssh(ipaddr):
362+
"""Check if a machine is reachable via SSH. Returns True if healthy."""
363+
return _ssh_retcode(ipaddr, 'true', timeout=10) == 0
364+
365+
361366
def reboot_machine(config, mc, reservation_id, machine_ids, machine_ips):
362367
"""Reboot the machine via SSH, falling back to BMC power cycle."""
363368
primary_ip = machine_ips[0]
@@ -367,8 +372,7 @@ def _refresh():
367372
mc.reservation_refresh(reservation_id)
368373

369374
# Check if SSH is responsive at all before trying reboot
370-
ssh_ok = _ssh_retcode(primary_ip, 'true', timeout=10) == 0
371-
if ssh_ok:
375+
if check_healthy_ssh(primary_ip):
372376
print(f"reboot_machine: rebooting {primary_ip} via SSH")
373377
_ssh(primary_ip, 'reboot', check=False, timeout=5)
374378
# Wait for the machine to actually go down before checking

0 commit comments

Comments
 (0)