Skip to content

Commit 44f0cd6

Browse files
committed
contest: hw: rework the power cycling logic
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent ddbca38 commit 44f0cd6

3 files changed

Lines changed: 33 additions & 31 deletions

File tree

contest/hw/hwksft.py

Lines changed: 24 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,8 @@
2222
kexec_machine, wait_for_results, fetch_results,
2323
parse_results, process_crashes, set_log_file,
2424
WaitResult, grab_hw_worker_journal, grab_sol_logs,
25-
reboot_machine, CRASH_SENTINEL,
26-
_journal_has_crash_sentinel)
25+
reboot_machine,
26+
CRASH_SENTINEL, _journal_has_crash_sentinel)
2727

2828
# Config:
2929
#
@@ -207,7 +207,9 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
207207
wait_result = wait_for_results(config, mc, reservation_id,
208208
machine_ids, machine_ips)
209209

210-
# 8. Grab debug artifacts for this attempt
210+
# 8. Grab debug artifacts for this attempt.
211+
# If machine is hung (needs_power_cycle), it may still
212+
# respond to SSH briefly — try to grab what we can.
211213
try:
212214
grab_hw_worker_journal(machine_ips[0],
213215
results_path, suffix=attempt_sfx)
@@ -219,26 +221,32 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
219221
except Exception as e:
220222
print(f"Warning: failed to grab SOL logs: {e}")
221223

222-
# 9. Copy back results
223-
fetch_results(machine_ips, reservation_id, results_path)
224+
# 9. Check if we need to retry
225+
needs_retry = wait_result.needs_power_cycle
226+
if not needs_retry:
227+
try:
228+
needs_retry = _journal_has_crash_sentinel(machine_ips[0])
229+
except Exception:
230+
pass
224231

225-
# 10. Check if hw-worker detected a crash and wants a reboot
226-
needs_reboot = False
227-
try:
228-
needs_reboot = _journal_has_crash_sentinel(machine_ips[0])
229-
except Exception:
230-
pass
231-
232-
if not needs_reboot:
232+
if not needs_retry:
233233
break
234234

235+
if wait_result.needs_power_cycle:
236+
print(f"Machine hung (attempt {attempt+1}), rebooting")
237+
else:
238+
print(f"hw-worker detected crash (attempt {attempt+1}), rebooting")
239+
reboot_machine(config, mc, reservation_id,
240+
machine_ids, machine_ips)
241+
242+
# Do the reboot even if we are about to give up, otherwise
243+
# if machine is hung we won't be able to fetch results
235244
if attempt >= max_crash_retries:
236245
print(f"Max crash retries ({max_crash_retries}) reached, giving up")
237246
break
238247

239-
print(f"hw-worker detected crash (attempt {attempt+1}), rebooting")
240-
reboot_machine(config, mc, reservation_id,
241-
machine_ids, machine_ips)
248+
# 10. Copy back results
249+
fetch_results(machine_ips, reservation_id, results_path)
242250

243251
# 11. Parse results
244252
cases = parse_results(results_path, link)

contest/hw/lib/deployer.py

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class WaitResult:
2828
"""Result of wait_for_results()."""
2929
ok: bool
3030
error: str = ''
31+
needs_power_cycle: bool = False
3132

3233

3334
def set_log_file(fp):
@@ -336,18 +337,10 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips):
336337
# No new SOL output after crash — machine may be hung
337338
crash_age = time.monotonic() - crash_detected_at[mid]
338339
if crash_age >= crash_wait_time:
339-
print(f"wait_for_results: machine {mid} hung, power cycling")
340-
mc.power_cycle(mid)
341-
power_cycle_timeout = config.getint(
342-
'hw', 'max_power_cycle_timeout', fallback=600)
343-
_wait_for_ssh(ipaddr, timeout=power_cycle_timeout,
344-
keepalive=lambda: mc.reservation_refresh(reservation_id))
345-
# Machine rebooted into default kernel, hw-worker
346-
# will see kernel mismatch and exit. The service
347-
# state will flip to inactive/failed, caught on
348-
# the next iteration.
349-
del crash_detected_at[mid]
350-
print(f"wait_for_results: machine {mid} back after power cycle")
340+
print(f"wait_for_results: machine {mid} hung")
341+
return WaitResult(ok=False,
342+
error='machine hung after crash',
343+
needs_power_cycle=True)
351344
# else: SOL still progressing post-crash, hw-worker may
352345
# still be running and will detect the crash via dmesg
353346

@@ -378,7 +371,7 @@ def _refresh():
378371
_ssh(primary_ip, 'reboot', check=False, timeout=5)
379372

380373
try:
381-
_wait_for_ssh(primary_ip, timeout=boot_timeout, keepalive=_refresh)
374+
_wait_for_ssh(primary_ip, timeout=power_cycle_timeout, keepalive=_refresh)
382375
print(f"reboot_machine: {primary_ip} is back")
383376
except TimeoutError:
384377
# SSH reboot didn't work, hard cycle via BMC

contest/hw/tests/test_hwksft.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -498,8 +498,9 @@ def ssh_side_effect(ip, cmd, check=True, timeout=30):
498498
with mock.patch('lib.deployer._wait_for_ssh'):
499499
result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
500500

501-
mc.power_cycle.assert_called_once_with(1)
502-
self.assertTrue(result.ok)
501+
mc.power_cycle.assert_not_called()
502+
self.assertFalse(result.ok)
503+
self.assertTrue(result.needs_power_cycle)
503504

504505
@mock.patch('subprocess.run')
505506
@mock.patch('time.monotonic')

0 commit comments

Comments
 (0)