Skip to content

Commit afd2e3b

Browse files
committed
contest-hw: rework responsibility split between ksft and worker
Don't try to parse outcomes in the worker. Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 9c11d44 commit afd2e3b

7 files changed

Lines changed: 385 additions & 337 deletions

File tree

contest/hw/README.rst

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -430,16 +430,54 @@ Operation
430430
immediately.
431431
3. Mark all entries under ``/srv/hw-worker/tests`` as "seen" (create a
432432
``.seen`` file in each directory). This prevents loop-testing the same set.
433-
4. Run the tests. For each test:
433+
4. Open ``/dev/kmsg`` and drain existing boot messages to
434+
``results_dir/boot-dmesg``.
435+
5. Run the tests. For each test:
434436
a. Check if test name is in ``.attempted`` — if so, skip (crash recovery).
435437
b. Write test name to ``.attempted`` + fsync before execution.
436-
c. Start a dmesg monitor thread (reads ``/dev/kmsg``) to detect kernel
437-
crashes during the test (``RIP:``, ``Call Trace:``, etc.).
438-
d. Run via ``./run_kselftest.sh -t <target>/<test>`` (installed form).
439-
e. Capture stdout/stderr, save to ``results_dir/<test_name>/``.
440-
f. Stop dmesg monitor, collect any crash lines.
441-
g. Determine result: pass/fail/skip based on return code and output.
442-
5. Results are saved under ``/srv/hw-worker/results/$reservation_id/``
443-
as ``results.json``. Previously-attempted tests (from crash recovery)
444-
are included as failures with a crash note.
445-
6. Service exits.
438+
c. Run via ``./run_kselftest.sh -t <target>:<test>`` (installed form).
439+
d. Capture stdout/stderr, save to ``results_dir/<idx>-<name>/``.
440+
e. Drain ``/dev/kmsg`` — if any dmesg output was produced during
441+
the test, save it to ``results_dir/<idx>-<name>/dmesg``.
442+
f. Save metadata to ``results_dir/<idx>-<name>/info`` (JSON).
443+
6. Results are saved under ``/srv/hw-worker/results/$reservation_id/``.
444+
hw-worker does **not** determine pass/fail — that is done by hwksft
445+
when it copies back and parses the output files.
446+
7. Service exits.
447+
448+
Output artifacts
449+
----------------
450+
451+
hw-worker produces the following files under
452+
``/srv/hw-worker/results/$reservation_id/``. hwksft copies this tree
453+
back and parses it to build the final result JSON.
454+
455+
::
456+
457+
$reservation_id/
458+
├── boot-dmesg # dmesg from boot until first test
459+
├── 0-test_name/ # per-test output directory
460+
│ ├── stdout # test stdout (KTAP/TAP output)
461+
│ ├── stderr # test stderr
462+
│ ├── info # JSON: {retcode, time, target, prog}
463+
│ └── dmesg # dmesg during this test (if any)
464+
├── 1-another_test/
465+
│ ├── stdout
466+
│ ├── stderr
467+
│ ├── info
468+
│ └── dmesg
469+
└── ...
470+
471+
``info`` JSON fields:
472+
473+
``retcode``
474+
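Exit code of ``run_kselftest.sh``. hwksft maps it to a result: 0 = pass, 4 = skip, anything else = fail (a 0 exit code with no ``ok`` in stdout is also reported as skip).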
475+
476+
``time``
477+
Wall-clock seconds the test took (float).
478+
479+
``target``
480+
kselftest collection name (e.g. ``drivers/net/hw``).
481+
482+
``prog``
483+
Test program name within the collection (e.g. ``rss_drv.py``).

contest/hw/hw_worker.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -170,16 +170,9 @@ def main():
170170
results_dir = os.path.join(results_base, reservation_id)
171171
os.makedirs(results_dir, exist_ok=True)
172172

173-
results = run_tests(test_dir, results_dir)
173+
run_tests(test_dir, results_dir)
174174

175-
results_file = os.path.join(results_dir, 'results.json')
176-
fd = os.open(results_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
177-
with os.fdopen(fd, 'w') as fp:
178-
json.dump(results, fp)
179-
fp.flush()
180-
os.fsync(fp.fileno())
181-
182-
print(f"Completed {len(results)} tests, results in {results_dir}")
175+
print(f"Completed, results in {results_dir}")
183176

184177

185178
if __name__ == '__main__':

contest/hw/hwksft.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
from lib.mc_client import MCClient, resolve_machines, resolve_nic_id # noqa: E402
2525
from lib.deployer import (build_kernel, build_ksft, deploy_artifacts, # noqa: E402
2626
kexec_machine, wait_for_results, fetch_results,
27-
set_log_file, WaitResult, grab_hw_worker_journal)
27+
parse_results, set_log_file, WaitResult,
28+
grab_hw_worker_journal)
2829

2930
# Config:
3031
#
@@ -174,19 +175,21 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
174175
grab_hw_worker_journal(machine_ips[0], results_path)
175176

176177
# 9. Copy back results
177-
if wait_result.ok:
178-
cases = fetch_results(config, machine_ips, reservation_id, rinfo)
179-
else:
178+
fetch_results(machine_ips, reservation_id, results_path)
179+
180+
# 10. Parse results
181+
cases = parse_results(reservation_id, results_path, link)
182+
if not wait_result.ok:
180183
# Write error to disk so it's visible via the UI result link
181184
with open(os.path.join(results_path, 'error'), 'w',
182185
encoding='utf-8') as fp:
183186
fp.write(wait_result.error + '\n')
184-
cases = [{
185-
'test': 'hw-worker',
187+
cases.insert(0, {
188+
'test': 'worker-failed',
186189
'group': grp_name,
187190
'result': 'fail',
188191
'link': link,
189-
}]
192+
})
190193
finally:
191194
set_log_file(None)
192195
# 11. Release reservation

contest/hw/lib/deployer.py

Lines changed: 99 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
import json
66
import os
77
import random
8+
import re
89
import shutil
910
import string
1011
import subprocess
11-
import tempfile
1212
import time
1313
from dataclasses import dataclass, field
1414

@@ -252,38 +252,26 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips):
252252
print(f"wait_for_results: {msg}")
253253
return WaitResult(ok=False, error=msg)
254254

255-
# Check if hw-worker has produced results on primary machine
256-
primary_ip = machine_ips[0]
257-
ret = _ssh_retcode(primary_ip,
258-
f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
259-
if ret == 0:
260-
print("wait_for_results: hw-worker completed")
261-
return WaitResult(ok=True)
262-
263-
# Check if hw-worker exited without producing results.
255+
# Check if hw-worker service has exited.
264256
# For Type=oneshot services, is-active returns "activating" (rc=3)
265257
# while running, "active" (rc=0) after success with RemainAfterExit=yes,
266258
# and "failed"/"inactive" after failure/stop. Use show -p ActiveState
267259
# to distinguish "still running" from "done".
260+
primary_ip = machine_ips[0]
268261
state = _ssh(primary_ip,
269262
'systemctl show -p ActiveState --value nipa-hw-worker.service',
270263
check=False).strip()
271264
if state == 'activating':
272265
pass # still running, continue polling
273-
elif state in ('inactive', 'failed'):
274-
# Service exited, but results.json may have been written
275-
# between our test -f check and the state check (race).
276-
# Re-check before declaring failure.
277-
ret = _ssh_retcode(primary_ip,
278-
f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
279-
if ret == 0:
280-
print("wait_for_results: hw-worker completed")
281-
return WaitResult(ok=True)
282-
283-
# Service finished and no results.json — something went wrong.
284-
msg = f"hw-worker exited without results (state={state})"
266+
elif state == 'failed':
267+
msg = "hw-worker service failed"
285268
print(f"wait_for_results: {msg}")
286269
return WaitResult(ok=False, error=msg)
270+
elif state in ('inactive', 'active'):
271+
# inactive = exited normally (RemainAfterExit=no)
272+
# active = exited normally (RemainAfterExit=yes)
273+
print("wait_for_results: hw-worker completed")
274+
return WaitResult(ok=True)
287275

288276
# Check SOL logs for crashes on each machine
289277
for i, mid in enumerate(machine_ids):
@@ -363,61 +351,109 @@ def _refresh():
363351
kexec_machine(config, [ipaddr], reservation_id, mc=mc)
364352

365353

366-
def fetch_results(_config, machine_ips, reservation_id, rinfo):
367-
"""SCP results from test machines back to build node.
354+
def fetch_results(machine_ips, reservation_id, results_path):
355+
"""SCP test output from the test machine back to the build node.
368356
369-
Parse and format into vmksft-p-style result list.
370-
Tests that crashed (in .attempted but not in results) are marked
371-
as result='fail' with crash info.
357+
Copies the results directory tree and the .attempted file.
372358
"""
373359
primary_ip = machine_ips[0]
374360
remote_results = f'/srv/hw-worker/results/{reservation_id}'
361+
remote_tests = f'/srv/hw-worker/tests/{reservation_id}'
375362

376-
with tempfile.TemporaryDirectory() as tmpdir:
377-
# Copy results.json
378-
_scp_from(primary_ip, f'{remote_results}/results.json',
379-
os.path.join(tmpdir, 'results.json'))
380-
381-
# Copy .attempted for crash tracking
382-
remote_tests = f'/srv/hw-worker/tests/{reservation_id}'
383-
_scp_from(primary_ip, f'{remote_tests}/.attempted',
384-
os.path.join(tmpdir, 'attempted.json'),
385-
check=False)
386-
387-
# Parse results
388-
results_path = os.path.join(tmpdir, 'results.json')
389-
if os.path.exists(results_path):
390-
with open(results_path, encoding='utf-8') as fp:
391-
raw_results = json.load(fp)
392-
else:
393-
raw_results = []
363+
# Copy the entire results directory tree
364+
local_results = os.path.join(results_path, 'test-outputs')
365+
os.makedirs(local_results, exist_ok=True)
366+
# Use scp -r to grab all test output directories
367+
ret = subprocess.run(
368+
['scp', '-r', '-o', 'StrictHostKeyChecking=no',
369+
'-o', 'BatchMode=yes',
370+
f'root@{primary_ip}:{remote_results}/', local_results],
371+
capture_output=True, timeout=300, check=False
372+
)
373+
if ret.returncode != 0:
374+
print(f"fetch_results: scp failed: {ret.stderr.decode('utf-8', 'ignore')}")
375+
376+
# Copy .attempted for crash tracking
377+
_scp_from(primary_ip, f'{remote_tests}/.attempted',
378+
os.path.join(results_path, 'attempted.json'),
379+
check=False)
394380

395-
# Load attempted tests
396-
attempted_path = os.path.join(tmpdir, 'attempted.json')
397-
attempted = []
398-
if os.path.exists(attempted_path):
399-
with open(attempted_path, encoding='utf-8') as fp:
400-
attempted = json.load(fp)
401381

402-
# Identify crashed tests: in attempted but not in results
403-
result_names = {r['test'] for r in raw_results}
404-
link = rinfo.get('link', '')
382+
def parse_results(reservation_id, results_path, link):
383+
"""Parse fetched test output into a vmksft-p-style result list.
384+
385+
Reads info/stdout files from the test-outputs directory and
386+
the fetched copy of .attempted (attempted.json) to identify crashed tests.
387+
"""
388+
# Find the actual results subdir (scp -r creates reservation_id/ inside)
389+
local_results = os.path.join(results_path, 'test-outputs')
390+
output_dir = os.path.join(local_results, str(reservation_id))
391+
if not os.path.isdir(output_dir):
392+
output_dir = local_results
393+
394+
# Parse each test output directory
395+
cases = []
396+
completed_tests = set()
397+
if os.path.isdir(output_dir):
398+
for entry in sorted(os.listdir(output_dir)):
399+
test_dir = os.path.join(output_dir, entry)
400+
if not os.path.isdir(test_dir):
401+
continue
402+
403+
info_path = os.path.join(test_dir, 'info')
404+
stdout_path = os.path.join(test_dir, 'stdout')
405+
406+
if not os.path.exists(info_path):
407+
continue
408+
409+
with open(info_path, encoding='utf-8') as fp:
410+
info = json.load(fp)
411+
412+
retcode = info.get('retcode', 1)
413+
target = info.get('target', 'unknown')
414+
prog = info.get('prog', entry)
415+
test_name = f"{target}:{prog}"
416+
completed_tests.add(test_name)
417+
418+
stdout = ''
419+
if os.path.exists(stdout_path):
420+
with open(stdout_path, encoding='utf-8') as fp:
421+
stdout = fp.read()
422+
423+
# Determine result
424+
result = 'pass'
425+
if retcode == 4:
426+
result = 'skip'
427+
elif retcode != 0:
428+
result = 'fail'
429+
if 'ok' not in stdout.lower() and result == 'pass':
430+
result = 'skip'
431+
432+
safe_name = re.sub(r'[^0-9a-zA-Z]+', '-', prog)
433+
if safe_name and safe_name[-1] == '-':
434+
safe_name = safe_name[:-1]
405435

406-
cases = []
407-
for r in raw_results:
408436
outcome = {
409-
'test': r['test'],
410-
'group': r.get('group', 'selftests-hw'),
411-
'result': r['result'],
437+
'test': safe_name or entry,
438+
'group': f'selftests-{re.sub(r"[^0-9a-zA-Z]+", "-", target).rstrip("-")}',
439+
'result': result,
412440
'link': link,
413441
}
414-
for key in ['time', 'crashes']:
415-
if key in r:
416-
outcome[key] = r[key]
442+
if 'time' in info:
443+
outcome['time'] = info['time']
417444
cases.append(outcome)
418445

446+
# Check .attempted for crashed tests (attempted but no output)
447+
attempted_path = os.path.join(results_path, 'attempted.json')
448+
if os.path.exists(attempted_path):
449+
with open(attempted_path, encoding='utf-8') as fp:
450+
try:
451+
attempted = json.load(fp)
452+
except (json.JSONDecodeError, ValueError):
453+
attempted = []
454+
419455
for test_name in attempted:
420-
if test_name not in result_names:
456+
if test_name not in completed_tests:
421457
cases.append({
422458
'test': test_name,
423459
'group': 'selftests-hw',

0 commit comments

Comments
 (0)