linux-netdev
diff --git a/contest/hw/README.rst b/contest/hw/README.rst
Lines changed: 49 additions & 11 deletions
diff --git a/contest/hw/hw_worker.py b/contest/hw/hw_worker.py
Lines changed: 2 additions & 9 deletions
diff --git a/contest/hw/hwksft.py b/contest/hw/hwksft.py
Lines changed: 24 additions & 11 deletions
diff --git a/contest/hw/lib/deployer.py b/contest/hw/lib/deployer.py
Lines changed: 100 additions & 64 deletions
@@ -430,16 +430,54 @@ Operation
    immediately.
 3. Mark all entries under ``/srv/hw-worker/tests`` as "seen" (create a
    ``.seen`` file in each directory). This prevents loop-testing the same set.
-4. Run the tests. For each test:
+4. Open ``/dev/kmsg`` and drain existing boot messages to
+   ``results_dir/boot-dmesg``.
+5. Run the tests. For each test:
     a. Check if test name is in ``.attempted`` — if so, skip (crash recovery).
     b. Write test name to ``.attempted`` + fsync before execution.
-    c. Start a dmesg monitor thread (reads ``/dev/kmsg``) to detect kernel
-       crashes during the test (``RIP:``, ``Call Trace:``, etc.).
-    d. Run via ``./run_kselftest.sh -t <target>/<test>`` (installed form).
-    e. Capture stdout/stderr, save to ``results_dir/<test_name>/``.
-    f. Stop dmesg monitor, collect any crash lines.
-    g. Determine result: pass/fail/skip based on return code and output.
-5. Results are saved under ``/srv/hw-worker/results/$reservation_id/``
-   as ``results.json``. Previously-attempted tests (from crash recovery)
-   are included as failures with a crash note.
-6. Service exits.
+    c. Run via ``./run_kselftest.sh -t <target>:<test>`` (installed form).
+    d. Capture stdout/stderr, save to ``results_dir/<idx>-<name>/``.
+    e. Drain ``/dev/kmsg`` — if any dmesg output was produced during
+       the test, save it to ``results_dir/<idx>-<name>/dmesg``.
+    f. Save metadata to ``results_dir/<idx>-<name>/info`` (JSON).
+6. Results are saved under ``/srv/hw-worker/results/$reservation_id/``.
+   hw-worker does **not** determine pass/fail — that is done by hwksft
+   when it copies back and parses the output files.
+7. Service exits.
+
+Output artifacts
+----------------
+
+hw-worker produces the following files under
+``/srv/hw-worker/results/$reservation_id/``.  hwksft copies this tree
+back and parses it to build the final result JSON.
+
+::
+
+  $reservation_id/
+  ├── boot-dmesg                    # dmesg from boot until first test
+  ├── 0-test_name/                  # per-test output directory
+  │   ├── stdout                    # test stdout (KTAP/TAP output)
+  │   ├── stderr                    # test stderr
+  │   ├── info                      # JSON: {retcode, time, target, prog}
+  │   └── dmesg                     # dmesg during this test (if any)
+  ├── 1-another_test/
+  │   ├── stdout
+  │   ├── stderr
+  │   ├── info
+  │   └── dmesg
+  └── ...
+
+``info`` JSON fields:
+
+``retcode``
+  Exit code of ``run_kselftest.sh``.  hwksft maps 0 to pass (skip if stdout lacks ``ok``), 4 to skip, other to fail.
+
+``time``
+  Wall-clock seconds the test took (float).
+
+``target``
+  kselftest collection name (e.g. ``drivers/net/hw``).
+
+``prog``
+  Test program name within the collection (e.g. ``rss_drv.py``).
@@ -170,16 +170,9 @@ def main():
     results_dir = os.path.join(results_base, reservation_id)
     os.makedirs(results_dir, exist_ok=True)
 
-    results = run_tests(test_dir, results_dir)
+    run_tests(test_dir, results_dir)
 
-    results_file = os.path.join(results_dir, 'results.json')
-    fd = os.open(results_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
-    with os.fdopen(fd, 'w') as fp:
-        json.dump(results, fp)
-        fp.flush()
-        os.fsync(fp.fileno())
-
-    print(f"Completed {len(results)} tests, results in {results_dir}")
+    print(f"Completed, results in {results_dir}")
 
 
 if __name__ == '__main__':
 
@@ -24,7 +24,8 @@
 from lib.mc_client import MCClient, resolve_machines, resolve_nic_id  # noqa: E402
 from lib.deployer import (build_kernel, build_ksft, deploy_artifacts,  # noqa: E402
                           kexec_machine, wait_for_results, fetch_results,
-                          set_log_file, WaitResult, grab_hw_worker_journal)
+                          parse_results, set_log_file, WaitResult,
+                          grab_hw_worker_journal)
 
 # Config:
 #
@@ -155,6 +156,7 @@ def test(binfo, rinfo, cbarg):  # pylint: disable=unused-argument
     else:
         raise RuntimeError(f"Failed to reserve machines after {max_retries} attempts")
 
+    cases = None
     try:
         # 5. Deploy artifacts via SCP
         with open(os.path.join(results_path, 'deploy'), 'w', encoding='utf-8') as fp:
@@ -170,32 +172,43 @@ def test(binfo, rinfo, cbarg):  # pylint: disable=unused-argument
         wait_result = wait_for_results(config, mc, reservation_id,
                                        machine_ids, machine_ips)
 
-        # 8. Grab hw-worker journal for debugging
-        grab_hw_worker_journal(machine_ips[0], results_path)
+        # 8. Copy back results
+        fetch_results(machine_ips, reservation_id, results_path)
 
-        # 9. Copy back results
-        if wait_result.ok:
-            cases = fetch_results(config, machine_ips, reservation_id, rinfo)
-        else:
+        # 9. Parse results
+        cases = parse_results(reservation_id, results_path, link)
+        if not wait_result.ok:
             # Write error to disk so it's visible via the UI result link
             with open(os.path.join(results_path, 'error'), 'w',
                       encoding='utf-8') as fp:
                 fp.write(wait_result.error + '\n')
-            cases = [{
-                'test': 'hw-worker',
+            cases.insert(0, {
+                'test': 'worker-failed',
                 'group': grp_name,
                 'result': 'fail',
                 'link': link,
-            }]
+            })
     finally:
         set_log_file(None)
-        # 10. Release reservation
+        # 10. Grab hw-worker journal for debugging
+        try:
+            grab_hw_worker_journal(machine_ips[0], results_path)
+        except Exception as e:
+            print(f"Warning: failed to grab hw-worker journal: {e}")
+        # 11. Release reservation
         try:
             mc.reservation_close(reservation_id)
         except Exception as e:
             print(f"Warning: failed to close reservation {reservation_id}: {e}")
 
     print("Done at", datetime.datetime.now())
+    if cases is None:
+        cases = [{
+            'test': 'worker-failed',
+            'group': grp_name,
+            'result': 'fail',
+            'link': link,
+        }]
     return cases
 
 
 
@@ -5,10 +5,10 @@
 import json
 import os
 import random
+import re
 import shutil
 import string
 import subprocess
-import tempfile
 import time
 from dataclasses import dataclass, field
 
@@ -210,7 +210,7 @@ def _refresh():
 def grab_hw_worker_journal(ipaddr, results_path):
     """Fetch hw-worker journal from the test machine and save locally."""
     journal = _ssh(ipaddr,
-                   'journalctl -u nipa-hw-worker.service -n 250 --no-pager',
+                   'journalctl -u nipa-hw-worker.service -b --no-pager',
                    check=False)
     if journal:
         journal_file = os.path.join(results_path, 'hw-worker-journal')
@@ -252,38 +252,26 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips):
             print(f"wait_for_results: {msg}")
             return WaitResult(ok=False, error=msg)
 
-        # Check if hw-worker has produced results on primary machine
-        primary_ip = machine_ips[0]
-        ret = _ssh_retcode(primary_ip,
-                            f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
-        if ret == 0:
-            print("wait_for_results: hw-worker completed")
-            return WaitResult(ok=True)
-
-        # Check if hw-worker exited without producing results.
+        # Check if hw-worker service has exited.
         # For Type=oneshot services, is-active returns "activating" (rc=3)
         # while running, "active" (rc=0) after success with RemainAfterExit=yes,
         # and "failed"/"inactive" after failure/stop.  Use show -p ActiveState
         # to distinguish "still running" from "done".
+        primary_ip = machine_ips[0]
         state = _ssh(primary_ip,
                      'systemctl show -p ActiveState --value nipa-hw-worker.service',
                      check=False).strip()
         if state == 'activating':
             pass  # still running, continue polling
-        elif state in ('inactive', 'failed'):
-            # Service exited, but results.json may have been written
-            # between our test -f check and the state check (race).
-            # Re-check before declaring failure.
-            ret = _ssh_retcode(primary_ip,
-                                f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
-            if ret == 0:
-                print("wait_for_results: hw-worker completed")
-                return WaitResult(ok=True)
-
-            # Service finished and no results.json — something went wrong.
-            msg = f"hw-worker exited without results (state={state})"
+        elif state == 'failed':
+            msg = "hw-worker service failed"
             print(f"wait_for_results: {msg}")
             return WaitResult(ok=False, error=msg)
+        elif state in ('inactive', 'active'):
+            # inactive = exited normally (RemainAfterExit=no)
+            # active = exited normally (RemainAfterExit=yes)
+            print("wait_for_results: hw-worker completed")
+            return WaitResult(ok=True)
 
         # Check SOL logs for crashes on each machine
         for i, mid in enumerate(machine_ids):
@@ -363,61 +351,109 @@ def _refresh():
     kexec_machine(config, [ipaddr], reservation_id, mc=mc)
 
 
-def fetch_results(_config, machine_ips, reservation_id, rinfo):
-    """SCP results from test machines back to build node.
+def fetch_results(machine_ips, reservation_id, results_path):
+    """SCP test output from the test machine back to the build node.
 
-    Parse and format into vmksft-p-style result list.
-    Tests that crashed (in .attempted but not in results) are marked
-    as result='fail' with crash info.
+    Copies the results directory tree and the .attempted file.
     """
     primary_ip = machine_ips[0]
     remote_results = f'/srv/hw-worker/results/{reservation_id}'
+    remote_tests = f'/srv/hw-worker/tests/{reservation_id}'
 
-    with tempfile.TemporaryDirectory() as tmpdir:
-        # Copy results.json
-        _scp_from(primary_ip, f'{remote_results}/results.json',
-                  os.path.join(tmpdir, 'results.json'))
-
-        # Copy .attempted for crash tracking
-        remote_tests = f'/srv/hw-worker/tests/{reservation_id}'
-        _scp_from(primary_ip, f'{remote_tests}/.attempted',
-                  os.path.join(tmpdir, 'attempted.json'),
-                  check=False)
-
-        # Parse results
-        results_path = os.path.join(tmpdir, 'results.json')
-        if os.path.exists(results_path):
-            with open(results_path, encoding='utf-8') as fp:
-                raw_results = json.load(fp)
-        else:
-            raw_results = []
+    # Copy the entire results directory tree
+    local_results = os.path.join(results_path, 'test-outputs')
+    os.makedirs(local_results, exist_ok=True)
+    # Use scp -r to grab all test output directories
+    ret = subprocess.run(
+        ['scp', '-r', '-o', 'StrictHostKeyChecking=no',
+         '-o', 'BatchMode=yes',
+         f'root@{primary_ip}:{remote_results}/', local_results],
+        capture_output=True, timeout=300, check=False
+    )
+    if ret.returncode != 0:
+        print(f"fetch_results: scp failed: {ret.stderr.decode('utf-8', 'ignore')}")
+
+    # Copy .attempted for crash tracking
+    _scp_from(primary_ip, f'{remote_tests}/.attempted',
+              os.path.join(results_path, 'attempted.json'),
+              check=False)
 
-        # Load attempted tests
-        attempted_path = os.path.join(tmpdir, 'attempted.json')
-        attempted = []
-        if os.path.exists(attempted_path):
-            with open(attempted_path, encoding='utf-8') as fp:
-                attempted = json.load(fp)
 
-        # Identify crashed tests: in attempted but not in results
-        result_names = {r['test'] for r in raw_results}
-        link = rinfo.get('link', '')
+def parse_results(reservation_id, results_path, link):
+    """Parse fetched test output into a vmksft-p-style result list.
+
+    Reads info/stdout files from the test-outputs directory and
+    the .attempted file to identify crashed tests.
+    """
+    # Find the actual results subdir (scp -r creates reservation_id/ inside)
+    local_results = os.path.join(results_path, 'test-outputs')
+    output_dir = os.path.join(local_results, str(reservation_id))
+    if not os.path.isdir(output_dir):
+        output_dir = local_results
+
+    # Parse each test output directory
+    cases = []
+    completed_tests = set()
+    if os.path.isdir(output_dir):
+        for entry in sorted(os.listdir(output_dir)):
+            test_dir = os.path.join(output_dir, entry)
+            if not os.path.isdir(test_dir):
+                continue
+
+            info_path = os.path.join(test_dir, 'info')
+            stdout_path = os.path.join(test_dir, 'stdout')
+
+            if not os.path.exists(info_path):
+                continue
+
+            with open(info_path, encoding='utf-8') as fp:
+                info = json.load(fp)
+
+            retcode = info.get('retcode', 1)
+            target = info.get('target', 'unknown')
+            prog = info.get('prog', entry)
+            test_name = f"{target}:{prog}"
+            completed_tests.add(test_name)
+
+            stdout = ''
+            if os.path.exists(stdout_path):
+                with open(stdout_path, encoding='utf-8') as fp:
+                    stdout = fp.read()
+
+            # Determine result
+            result = 'pass'
+            if retcode == 4:
+                result = 'skip'
+            elif retcode != 0:
+                result = 'fail'
+            if 'ok' not in stdout.lower() and result == 'pass':
+                result = 'skip'
+
+            safe_name = re.sub(r'[^0-9a-zA-Z]+', '-', prog)
+            if safe_name and safe_name[-1] == '-':
+                safe_name = safe_name[:-1]
 
-        cases = []
-        for r in raw_results:
             outcome = {
-                'test': r['test'],
-                'group': r.get('group', 'selftests-hw'),
-                'result': r['result'],
+                'test': safe_name or entry,
+                'group': f'selftests-{re.sub(r"[^0-9a-zA-Z]+", "-", target).rstrip("-")}',
+                'result': result,
                 'link': link,
             }
-            for key in ['time', 'crashes']:
-                if key in r:
-                    outcome[key] = r[key]
+            if 'time' in info:
+                outcome['time'] = info['time']
             cases.append(outcome)
 
+    # Check .attempted for crashed tests (attempted but no output)
+    attempted_path = os.path.join(results_path, 'attempted.json')
+    if os.path.exists(attempted_path):
+        with open(attempted_path, encoding='utf-8') as fp:
+            try:
+                attempted = json.load(fp)
+            except (json.JSONDecodeError, ValueError):
+                attempted = []
+
         for test_name in attempted:
-            if test_name not in result_names:
+            if test_name not in completed_tests:
                 cases.append({
                     'test': test_name,
                     'group': 'selftests-hw',