contest: hw: rework the power cycling logic

kuba-moo · kuba-moo · commit 44f0cd62bc12 · 2026-03-15T10:25:36.000-07:00
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
diff --git a/contest/hw/hwksft.py b/contest/hw/hwksft.py
@@ -22,8 +22,8 @@
                           kexec_machine, wait_for_results, fetch_results,
                           parse_results, process_crashes, set_log_file,
                           WaitResult, grab_hw_worker_journal, grab_sol_logs,
-                          reboot_machine, CRASH_SENTINEL,
-                          _journal_has_crash_sentinel)
+                          reboot_machine,
+                          CRASH_SENTINEL, _journal_has_crash_sentinel)
 
 # Config:
 #
@@ -207,7 +207,9 @@ def test(binfo, rinfo, cbarg):  # pylint: disable=unused-argument
             wait_result = wait_for_results(config, mc, reservation_id,
                                            machine_ids, machine_ips)
 
-            # 8. Grab debug artifacts for this attempt
+            # 8. Grab debug artifacts for this attempt.
+            # If machine is hung (needs_power_cycle), it may still
+            # respond to SSH briefly — try to grab what we can.
             try:
                 grab_hw_worker_journal(machine_ips[0],
                                        results_path, suffix=attempt_sfx)
@@ -219,26 +221,32 @@ def test(binfo, rinfo, cbarg):  # pylint: disable=unused-argument
             except Exception as e:
                 print(f"Warning: failed to grab SOL logs: {e}")
 
-            # 9. Copy back results
-            fetch_results(machine_ips, reservation_id, results_path)
+            # 9. Check if we need to retry
+            needs_retry = wait_result.needs_power_cycle
+            if not needs_retry:
+                try:
+                    needs_retry = _journal_has_crash_sentinel(machine_ips[0])
+                except Exception:
+                    pass
 
-            # 10. Check if hw-worker detected a crash and wants a reboot
-            needs_reboot = False
-            try:
-                needs_reboot = _journal_has_crash_sentinel(machine_ips[0])
-            except Exception:
-                pass
-
-            if not needs_reboot:
+            if not needs_retry:
                 break
 
+            if wait_result.needs_power_cycle:
+                print(f"Machine hung (attempt {attempt+1}), rebooting")
+            else:
+                print(f"hw-worker detected crash (attempt {attempt+1}), rebooting")
+            reboot_machine(config, mc, reservation_id,
+                           machine_ids, machine_ips)
+
+            # Reboot even if we are about to give up; otherwise, if the
+            # machine is hung, we won't be able to fetch the results.
             if attempt >= max_crash_retries:
                 print(f"Max crash retries ({max_crash_retries}) reached, giving up")
                 break
 
-            print(f"hw-worker detected crash (attempt {attempt+1}), rebooting")
-            reboot_machine(config, mc, reservation_id,
-                           machine_ids, machine_ips)
+        # 10. Copy back results
+        fetch_results(machine_ips, reservation_id, results_path)
 
         # 11. Parse results
         cases = parse_results(results_path, link)
diff --git a/contest/hw/lib/deployer.py b/contest/hw/lib/deployer.py
@@ -28,6 +28,7 @@ class WaitResult:
     """Result of wait_for_results()."""
     ok: bool
     error: str = ''
+    needs_power_cycle: bool = False
 
 
 def set_log_file(fp):
@@ -336,18 +337,10 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips):
                     # No new SOL output after crash — machine may be hung
                     crash_age = time.monotonic() - crash_detected_at[mid]
                     if crash_age >= crash_wait_time:
-                        print(f"wait_for_results: machine {mid} hung, power cycling")
-                        mc.power_cycle(mid)
-                        power_cycle_timeout = config.getint(
-                            'hw', 'max_power_cycle_timeout', fallback=600)
-                        _wait_for_ssh(ipaddr, timeout=power_cycle_timeout,
-                                      keepalive=lambda: mc.reservation_refresh(reservation_id))
-                        # Machine rebooted into default kernel, hw-worker
-                        # will see kernel mismatch and exit.  The service
-                        # state will flip to inactive/failed, caught on
-                        # the next iteration.
-                        del crash_detected_at[mid]
-                        print(f"wait_for_results: machine {mid} back after power cycle")
+                        print(f"wait_for_results: machine {mid} hung")
+                        return WaitResult(ok=False,
+                                          error='machine hung after crash',
+                                          needs_power_cycle=True)
                 # else: SOL still progressing post-crash, hw-worker may
                 # still be running and will detect the crash via dmesg
 
@@ -378,7 +371,7 @@ def _refresh():
     _ssh(primary_ip, 'reboot', check=False, timeout=5)
 
     try:
-        _wait_for_ssh(primary_ip, timeout=boot_timeout, keepalive=_refresh)
+        _wait_for_ssh(primary_ip, timeout=power_cycle_timeout, keepalive=_refresh)
         print(f"reboot_machine: {primary_ip} is back")
     except TimeoutError:
         # SSH reboot didn't work, hard cycle via BMC
diff --git a/contest/hw/tests/test_hwksft.py b/contest/hw/tests/test_hwksft.py
@@ -498,8 +498,9 @@ def ssh_side_effect(ip, cmd, check=True, timeout=30):
             with mock.patch('lib.deployer._wait_for_ssh'):
                 result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
 
-        mc.power_cycle.assert_called_once_with(1)
-        self.assertTrue(result.ok)
+        mc.power_cycle.assert_not_called()
+        self.assertFalse(result.ok)
+        self.assertTrue(result.needs_power_cycle)
 
     @mock.patch('subprocess.run')
     @mock.patch('time.monotonic')