contest-hw: add more debug for failing runs

kuba-moo · kuba-moo · commit 2a660c228718 · 2026-03-14T14:12:22.000-07:00
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
diff --git a/contest/hw/hw_worker.py b/contest/hw/hw_worker.py
@@ -144,7 +144,7 @@ def main():
     # the deployed kernel version against the running kernel.
     kver_path = os.path.join(test_dir, '.kernel-version')
     if not os.path.exists(kver_path):
-        print("No kernel version file, skipping")
+        print(test_dir, "No kernel version file, skipping")
         return
     with open(kver_path, encoding='utf-8') as fp:
         expected = fp.read().strip()
@@ -155,11 +155,14 @@ def main():
     # (e.g. "6.1" matching "6.12.0") cannot happen in practice.
     # The '-' separator check is an extra safety measure.
     if actual != expected and not actual.startswith(expected + '-'):
-        print(f"Kernel mismatch: running {actual}, expected {expected}")
+        print(test_dir,
+              f"Kernel mismatch: running {actual}, expected {expected}")
         return
 
     mark_all_seen(tests_dir)
 
+    print(test_dir, "Starting tests")
+
     # Configure test interfaces and write net.config
     setup_test_interfaces(test_dir)
 
diff --git a/contest/hw/hwksft.py b/contest/hw/hwksft.py
@@ -24,7 +24,7 @@
 from lib.mc_client import MCClient, resolve_machines, resolve_nic_id  # noqa: E402
 from lib.deployer import (build_kernel, build_ksft, deploy_artifacts,  # noqa: E402
                           kexec_machine, wait_for_results, fetch_results,
-                          set_log_file)
+                          set_log_file, WaitResult)
 
 # Config:
 #
@@ -167,14 +167,18 @@ def test(binfo, rinfo, cbarg):  # pylint: disable=unused-argument
         kexec_machine(config, machine_ips, reservation_id, mc=mc)
 
         # 7. Wait for hw-worker with crash monitoring
-        has_results = wait_for_results(config, mc, reservation_id,
+        wait_result = wait_for_results(config, mc, reservation_id,
                                        machine_ids, machine_ips,
                                        results_path=results_path)
 
         # 8. Copy back results
-        if has_results:
+        if wait_result.ok:
             cases = fetch_results(config, machine_ips, reservation_id, rinfo)
         else:
+            # Write error to disk so it's visible via the UI result link
+            with open(os.path.join(results_path, 'error'), 'w',
+                      encoding='utf-8') as fp:
+                fp.write(wait_result.error + '\n')
             cases = [{
                 'test': 'hw-worker',
                 'group': grp_name,
diff --git a/contest/hw/lib/deployer.py b/contest/hw/lib/deployer.py
@@ -10,6 +10,7 @@
 import subprocess
 import tempfile
 import time
+from dataclasses import dataclass, field
 
 
 # Log file handle, set by set_log_file() before builds start.
@@ -19,6 +20,13 @@
 _initrd_cache = {}
 
 
+@dataclass
+class WaitResult:
+    """Result of wait_for_results()."""
+    ok: bool
+    error: str = ''
+
+
 def set_log_file(fp):
     """Set the file handle for command output logging."""
     global _log_fp  # pylint: disable=global-statement
@@ -223,31 +231,40 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips,
     while True:
         elapsed = time.monotonic() - start_time
         if elapsed > max_test_time:
-            # Caller (hwksft.test) will still try fetch_results, which
-            # handles missing results gracefully.
-            print("wait_for_results: max test time exceeded")
-            break
+            msg = "max test time exceeded"
+            print(f"wait_for_results: {msg}")
+            return WaitResult(ok=False, error=msg)
 
         # Refresh reservation
         result = mc.reservation_refresh(reservation_id)
         if not result.get('ok') and 'error' in result:
-            print(f"wait_for_results: reservation refresh failed: {result['error']}")
-            break
+            msg = f"reservation refresh failed: {result['error']}"
+            print(f"wait_for_results: {msg}")
+            return WaitResult(ok=False, error=msg)
 
         # Check if hw-worker has produced results on primary machine
         primary_ip = machine_ips[0]
         ret = _ssh_retcode(primary_ip,
                             f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
         if ret == 0:
             print("wait_for_results: hw-worker completed")
-            return True
+            return WaitResult(ok=True)
 
         # Check if hw-worker exited without producing results
         ret = _ssh_retcode(primary_ip, 'systemctl is-active nipa-hw-worker.service')
         if ret != 0:
-            # Service is inactive/failed — no results.json means it failed
-            print("wait_for_results: hw-worker exited without results")
-            return False
+            # Service is inactive/failed — no results.json means it failed.
+            # Grab journalctl output for debugging.
+            journal = _ssh(primary_ip,
+                           'journalctl -u nipa-hw-worker.service -n 100 --no-pager',
+                           check=False)
+            if journal and results_path:
+                journal_file = os.path.join(results_path, 'hw-worker-journal')
+                with open(journal_file, 'w', encoding='utf-8') as fp:
+                    fp.write(journal)
+            msg = "hw-worker exited without results"
+            print(f"wait_for_results: {msg}")
+            return WaitResult(ok=False, error=msg)
 
         # Check SOL logs for crashes on each machine
         for i, mid in enumerate(machine_ids):
diff --git a/contest/hw/lib/runner.py b/contest/hw/lib/runner.py
@@ -156,11 +156,14 @@ def run_tests(test_dir, results_dir):
         print("No tests found")
         return []
 
+    print(f"Found {len(tests)} tests")
+
     previously_attempted = set(load_attempted(test_dir))
     results = []
 
     # Mark previously attempted tests as crashed
     for test_name in previously_attempted:
+        print(f"Skipping previously attempted (crashed): {test_name}")
         results.append({
             'test': _namify(test_name),
             'group': 'selftests-hw',
@@ -176,6 +179,8 @@ def run_tests(test_dir, results_dir):
         if test_name in previously_attempted:
             continue
 
+        print(f"[{test_idx+1}/{len(tests)}] Running {test_name}")
+
         # Mark as attempted before execution
         mark_attempted(test_dir, test_name)
 
@@ -204,6 +209,7 @@ def run_tests(test_dir, results_dir):
             retcode = 1
             stdout = ''
             stderr = 'test timed out'
+            print(f"[{test_idx+1}/{len(tests)}] {test_name}: timed out")
         t2 = time.monotonic()
 
         # Save output
@@ -236,6 +242,8 @@ def run_tests(test_dir, results_dir):
             outcome['crashes'] = crash_lines
             outcome['result'] = 'fail'
 
+        print(f"[{test_idx+1}/{len(tests)}] {test_name}: {outcome['result']} ({outcome['time']}s)")
+
         results.append(outcome)
 
     return results
diff --git a/contest/hw/tests/test_hwksft.py b/contest/hw/tests/test_hwksft.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 import json
+import tempfile
 import unittest
 from unittest import mock
 
@@ -314,6 +315,129 @@ def test_sol_reboot_detected(self):
         normal_output = "[  123.456789] Normal kernel message"
         self.assertFalse(_has_reboot(normal_output))
 
+    @mock.patch('subprocess.run')
+    @mock.patch('time.monotonic')
+    @mock.patch('time.sleep')
+    def test_wait_for_results_timeout(self, _mock_sleep,
+                                      mock_monotonic, mock_run):
+        """max_test_time exceeded returns WaitResult with error string."""
+        from lib.deployer import wait_for_results, WaitResult
+
+        mock_run.return_value = mock.Mock(returncode=0, stdout=b'', stderr=b'')
+        mock_monotonic.side_effect = [
+            0,      # start_time
+            99999,  # elapsed check -> exceeds max_test_time
+        ]
+
+        config = mock.Mock()
+        config.getint.side_effect = lambda section, key, fallback=None: {
+            'max_test_time': 3600,
+            'sol_poll_interval': 15,
+            'crash_wait_time': 120,
+            'max_kexec_boot_timeout': 300,
+        }.get(key, fallback)
+
+        mc = mock.Mock()
+        mc.get_sol_logs.return_value = {'last_id': 0, 'lines': []}
+
+        result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
+
+        self.assertIsInstance(result, WaitResult)
+        self.assertFalse(result.ok)
+        self.assertIn('max test time exceeded', result.error)
+
+    @mock.patch('subprocess.run')
+    @mock.patch('time.monotonic')
+    @mock.patch('time.sleep')
+    def test_wait_for_results_no_results(self, _mock_sleep,
+                                         mock_monotonic, mock_run):
+        """hw-worker exits without results returns WaitResult with error."""
+        from lib.deployer import wait_for_results, WaitResult
+
+        mock_run.return_value = mock.Mock(returncode=0, stdout=b'', stderr=b'')
+        mock_monotonic.side_effect = [
+            0,   # start_time
+            10,  # elapsed check
+        ]
+
+        config = mock.Mock()
+        config.getint.side_effect = lambda section, key, fallback=None: {
+            'max_test_time': 3600,
+            'sol_poll_interval': 15,
+            'crash_wait_time': 120,
+            'max_kexec_boot_timeout': 300,
+        }.get(key, fallback)
+
+        mc = mock.Mock()
+        mc.get_sol_logs.return_value = {'last_id': 0, 'lines': []}
+        mc.reservation_refresh.return_value = {'ok': True}
+
+        def ssh_retcode_side_effect(ip, cmd, timeout=30):
+            if 'test -f' in cmd:
+                return 1  # no results.json
+            if 'is-active' in cmd:
+                return 1  # service exited
+            return 0
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with mock.patch('lib.deployer._ssh_retcode',
+                             side_effect=ssh_retcode_side_effect):
+                with mock.patch('lib.deployer._ssh',
+                                 return_value='Mar 14 hw-worker[123]: some log\n') as mock_ssh:
+                    result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'],
+                                              results_path=tmpdir)
+
+            self.assertIsInstance(result, WaitResult)
+            self.assertFalse(result.ok)
+            self.assertIn('hw-worker exited without results', result.error)
+
+            # Verify journalctl output was fetched and saved
+            mock_ssh.assert_called_once()
+            self.assertIn('journalctl', mock_ssh.call_args[0][1])
+            journal_file = os.path.join(tmpdir, 'hw-worker-journal')
+            self.assertTrue(os.path.exists(journal_file))
+            with open(journal_file) as fp:
+                self.assertIn('some log', fp.read())
+
+    @mock.patch('subprocess.run')
+    @mock.patch('time.monotonic')
+    @mock.patch('time.sleep')
+    def test_wait_for_results_success(self, _mock_sleep,
+                                      mock_monotonic, mock_run):
+        """hw-worker completes with results returns WaitResult(ok=True)."""
+        from lib.deployer import wait_for_results, WaitResult
+
+        mock_run.return_value = mock.Mock(returncode=0, stdout=b'', stderr=b'')
+        mock_monotonic.side_effect = [
+            0,   # start_time
+            10,  # elapsed check
+        ]
+
+        config = mock.Mock()
+        config.getint.side_effect = lambda section, key, fallback=None: {
+            'max_test_time': 3600,
+            'sol_poll_interval': 15,
+            'crash_wait_time': 120,
+            'max_kexec_boot_timeout': 300,
+        }.get(key, fallback)
+
+        mc = mock.Mock()
+        mc.get_sol_logs.return_value = {'last_id': 0, 'lines': []}
+        mc.reservation_refresh.return_value = {'ok': True}
+
+        def ssh_retcode_side_effect(ip, cmd, timeout=30):
+            if 'test -f' in cmd:
+                return 0  # results.json exists
+            return 0
+
+        with mock.patch('lib.deployer._ssh_retcode',
+                         side_effect=ssh_retcode_side_effect):
+            result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
+
+        self.assertIsInstance(result, WaitResult)
+        self.assertTrue(result.ok)
+        self.assertEqual(result.error, '')
+
     @mock.patch('subprocess.run')
     @mock.patch('time.monotonic')
     @mock.patch('time.sleep')