Skip to content

Commit c570a17

Browse files
committed
contest-hw: rework responsibility split between ksft and worker
Don't try to parse outcomes in the worker.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 9c11d44 commit c570a17

7 files changed

Lines changed: 401 additions & 343 deletions

File tree

contest/hw/README.rst

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -430,16 +430,54 @@ Operation
430430
immediately.
431431
3. Mark all entries under ``/srv/hw-worker/tests`` as "seen" (create a
432432
``.seen`` file in each directory). This prevents loop-testing the same set.
433-
4. Run the tests. For each test:
433+
4. Open ``/dev/kmsg`` and drain existing boot messages to
434+
``results_dir/boot-dmesg``.
435+
5. Run the tests. For each test:
434436
a. Check if test name is in ``.attempted`` — if so, skip (crash recovery).
435437
b. Write test name to ``.attempted`` + fsync before execution.
436-
c. Start a dmesg monitor thread (reads ``/dev/kmsg``) to detect kernel
437-
crashes during the test (``RIP:``, ``Call Trace:``, etc.).
438-
d. Run via ``./run_kselftest.sh -t <target>/<test>`` (installed form).
439-
e. Capture stdout/stderr, save to ``results_dir/<test_name>/``.
440-
f. Stop dmesg monitor, collect any crash lines.
441-
g. Determine result: pass/fail/skip based on return code and output.
442-
5. Results are saved under ``/srv/hw-worker/results/$reservation_id/``
443-
as ``results.json``. Previously-attempted tests (from crash recovery)
444-
are included as failures with a crash note.
445-
6. Service exits.
438+
c. Run via ``./run_kselftest.sh -t <target>:<test>`` (installed form).
439+
d. Capture stdout/stderr, save to ``results_dir/<idx>-<name>/``.
440+
e. Drain ``/dev/kmsg`` — if any dmesg output was produced during
441+
the test, save it to ``results_dir/<idx>-<name>/dmesg``.
442+
f. Save metadata to ``results_dir/<idx>-<name>/info`` (JSON).
443+
6. Results are saved under ``/srv/hw-worker/results/$reservation_id/``.
444+
hw-worker does **not** determine pass/fail — that is done by hwksft
445+
when it copies back and parses the output files.
446+
7. Service exits.
447+
448+
Output artifacts
449+
----------------
450+
451+
hw-worker produces the following files under
452+
``/srv/hw-worker/results/$reservation_id/``. hwksft copies this tree
453+
back and parses it to build the final result JSON.
454+
455+
::
456+
457+
$reservation_id/
458+
├── boot-dmesg # dmesg from boot until first test
459+
├── 0-test_name/ # per-test output directory
460+
│ ├── stdout # test stdout (KTAP/TAP output)
461+
│ ├── stderr # test stderr
462+
│ ├── info # JSON: {retcode, time, target, prog}
463+
│ └── dmesg # dmesg during this test (if any)
464+
├── 1-another_test/
465+
│ ├── stdout
466+
│ ├── stderr
467+
│ ├── info
468+
│ └── dmesg
469+
└── ...
470+
471+
``info`` JSON fields:
472+
473+
``retcode``
474+
Exit code of ``run_kselftest.sh``. 0 = pass, 4 = skip, other = fail (hwksft additionally reports a 0 exit as skip when stdout contains no ``ok``).
475+
476+
``time``
477+
Wall-clock seconds the test took (float).
478+
479+
``target``
480+
kselftest collection name (e.g. ``drivers/net/hw``).
481+
482+
``prog``
483+
Test program name within the collection (e.g. ``rss_drv.py``).

contest/hw/hw_worker.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -170,16 +170,9 @@ def main():
170170
results_dir = os.path.join(results_base, reservation_id)
171171
os.makedirs(results_dir, exist_ok=True)
172172

173-
results = run_tests(test_dir, results_dir)
173+
run_tests(test_dir, results_dir)
174174

175-
results_file = os.path.join(results_dir, 'results.json')
176-
fd = os.open(results_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
177-
with os.fdopen(fd, 'w') as fp:
178-
json.dump(results, fp)
179-
fp.flush()
180-
os.fsync(fp.fileno())
181-
182-
print(f"Completed {len(results)} tests, results in {results_dir}")
175+
print(f"Completed, results in {results_dir}")
183176

184177

185178
if __name__ == '__main__':

contest/hw/hwksft.py

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
from lib.mc_client import MCClient, resolve_machines, resolve_nic_id # noqa: E402
2525
from lib.deployer import (build_kernel, build_ksft, deploy_artifacts, # noqa: E402
2626
kexec_machine, wait_for_results, fetch_results,
27-
set_log_file, WaitResult, grab_hw_worker_journal)
27+
parse_results, set_log_file, WaitResult,
28+
grab_hw_worker_journal)
2829

2930
# Config:
3031
#
@@ -155,6 +156,7 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
155156
else:
156157
raise RuntimeError(f"Failed to reserve machines after {max_retries} attempts")
157158

159+
cases = None
158160
try:
159161
# 5. Deploy artifacts via SCP
160162
with open(os.path.join(results_path, 'deploy'), 'w', encoding='utf-8') as fp:
@@ -170,32 +172,43 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
170172
wait_result = wait_for_results(config, mc, reservation_id,
171173
machine_ids, machine_ips)
172174

173-
# 8. Grab hw-worker journal for debugging
174-
grab_hw_worker_journal(machine_ips[0], results_path)
175+
# 8. Copy back results
176+
fetch_results(machine_ips, reservation_id, results_path)
175177

176-
# 9. Copy back results
177-
if wait_result.ok:
178-
cases = fetch_results(config, machine_ips, reservation_id, rinfo)
179-
else:
178+
# 9. Parse results
179+
cases = parse_results(reservation_id, results_path, link)
180+
if not wait_result.ok:
180181
# Write error to disk so it's visible via the UI result link
181182
with open(os.path.join(results_path, 'error'), 'w',
182183
encoding='utf-8') as fp:
183184
fp.write(wait_result.error + '\n')
184-
cases = [{
185-
'test': 'hw-worker',
185+
cases.insert(0, {
186+
'test': 'worker-failed',
186187
'group': grp_name,
187188
'result': 'fail',
188189
'link': link,
189-
}]
190+
})
190191
finally:
191192
set_log_file(None)
192-
# 10. Release reservation
193+
# 10. Grab hw-worker journal for debugging
194+
try:
195+
grab_hw_worker_journal(machine_ips[0], results_path)
196+
except Exception as e:
197+
print(f"Warning: failed to grab hw-worker journal: {e}")
198+
# 11. Release reservation
193199
try:
194200
mc.reservation_close(reservation_id)
195201
except Exception as e:
196202
print(f"Warning: failed to close reservation {reservation_id}: {e}")
197203

198204
print("Done at", datetime.datetime.now())
205+
if cases is None:
206+
cases = [{
207+
'test': 'worker-failed',
208+
'group': grp_name,
209+
'result': 'fail',
210+
'link': link,
211+
}]
199212
return cases
200213

201214

contest/hw/lib/deployer.py

Lines changed: 100 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
import json
66
import os
77
import random
8+
import re
89
import shutil
910
import string
1011
import subprocess
11-
import tempfile
1212
import time
1313
from dataclasses import dataclass, field
1414

@@ -210,7 +210,7 @@ def _refresh():
210210
def grab_hw_worker_journal(ipaddr, results_path):
211211
"""Fetch hw-worker journal from the test machine and save locally."""
212212
journal = _ssh(ipaddr,
213-
'journalctl -u nipa-hw-worker.service -n 250 --no-pager',
213+
'journalctl -u nipa-hw-worker.service -b --no-pager',
214214
check=False)
215215
if journal:
216216
journal_file = os.path.join(results_path, 'hw-worker-journal')
@@ -252,38 +252,26 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips):
252252
print(f"wait_for_results: {msg}")
253253
return WaitResult(ok=False, error=msg)
254254

255-
# Check if hw-worker has produced results on primary machine
256-
primary_ip = machine_ips[0]
257-
ret = _ssh_retcode(primary_ip,
258-
f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
259-
if ret == 0:
260-
print("wait_for_results: hw-worker completed")
261-
return WaitResult(ok=True)
262-
263-
# Check if hw-worker exited without producing results.
255+
# Check if hw-worker service has exited.
264256
# For Type=oneshot services, is-active returns "activating" (rc=3)
265257
# while running, "active" (rc=0) after success with RemainAfterExit=yes,
266258
# and "failed"/"inactive" after failure/stop. Use show -p ActiveState
267259
# to distinguish "still running" from "done".
260+
primary_ip = machine_ips[0]
268261
state = _ssh(primary_ip,
269262
'systemctl show -p ActiveState --value nipa-hw-worker.service',
270263
check=False).strip()
271264
if state == 'activating':
272265
pass # still running, continue polling
273-
elif state in ('inactive', 'failed'):
274-
# Service exited, but results.json may have been written
275-
# between our test -f check and the state check (race).
276-
# Re-check before declaring failure.
277-
ret = _ssh_retcode(primary_ip,
278-
f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
279-
if ret == 0:
280-
print("wait_for_results: hw-worker completed")
281-
return WaitResult(ok=True)
282-
283-
# Service finished and no results.json — something went wrong.
284-
msg = f"hw-worker exited without results (state={state})"
266+
elif state == 'failed':
267+
msg = "hw-worker service failed"
285268
print(f"wait_for_results: {msg}")
286269
return WaitResult(ok=False, error=msg)
270+
elif state in ('inactive', 'active'):
271+
# inactive = exited normally (RemainAfterExit=no)
272+
# active = exited normally (RemainAfterExit=yes)
273+
print("wait_for_results: hw-worker completed")
274+
return WaitResult(ok=True)
287275

288276
# Check SOL logs for crashes on each machine
289277
for i, mid in enumerate(machine_ids):
@@ -363,61 +351,109 @@ def _refresh():
363351
kexec_machine(config, [ipaddr], reservation_id, mc=mc)
364352

365353

366-
def fetch_results(_config, machine_ips, reservation_id, rinfo):
367-
"""SCP results from test machines back to build node.
354+
def fetch_results(machine_ips, reservation_id, results_path):
355+
"""SCP test output from the test machine back to the build node.
368356
369-
Parse and format into vmksft-p-style result list.
370-
Tests that crashed (in .attempted but not in results) are marked
371-
as result='fail' with crash info.
357+
Copies the results directory tree and the .attempted file.
372358
"""
373359
primary_ip = machine_ips[0]
374360
remote_results = f'/srv/hw-worker/results/{reservation_id}'
361+
remote_tests = f'/srv/hw-worker/tests/{reservation_id}'
375362

376-
with tempfile.TemporaryDirectory() as tmpdir:
377-
# Copy results.json
378-
_scp_from(primary_ip, f'{remote_results}/results.json',
379-
os.path.join(tmpdir, 'results.json'))
380-
381-
# Copy .attempted for crash tracking
382-
remote_tests = f'/srv/hw-worker/tests/{reservation_id}'
383-
_scp_from(primary_ip, f'{remote_tests}/.attempted',
384-
os.path.join(tmpdir, 'attempted.json'),
385-
check=False)
386-
387-
# Parse results
388-
results_path = os.path.join(tmpdir, 'results.json')
389-
if os.path.exists(results_path):
390-
with open(results_path, encoding='utf-8') as fp:
391-
raw_results = json.load(fp)
392-
else:
393-
raw_results = []
363+
# Copy the entire results directory tree
364+
local_results = os.path.join(results_path, 'test-outputs')
365+
os.makedirs(local_results, exist_ok=True)
366+
# Use scp -r to grab all test output directories
367+
ret = subprocess.run(
368+
['scp', '-r', '-o', 'StrictHostKeyChecking=no',
369+
'-o', 'BatchMode=yes',
370+
f'root@{primary_ip}:{remote_results}/', local_results],
371+
capture_output=True, timeout=300, check=False
372+
)
373+
if ret.returncode != 0:
374+
print(f"fetch_results: scp failed: {ret.stderr.decode('utf-8', 'ignore')}")
375+
376+
# Copy .attempted for crash tracking
377+
_scp_from(primary_ip, f'{remote_tests}/.attempted',
378+
os.path.join(results_path, 'attempted.json'),
379+
check=False)
394380

395-
# Load attempted tests
396-
attempted_path = os.path.join(tmpdir, 'attempted.json')
397-
attempted = []
398-
if os.path.exists(attempted_path):
399-
with open(attempted_path, encoding='utf-8') as fp:
400-
attempted = json.load(fp)
401381

402-
# Identify crashed tests: in attempted but not in results
403-
result_names = {r['test'] for r in raw_results}
404-
link = rinfo.get('link', '')
382+
def parse_results(reservation_id, results_path, link):
383+
"""Parse fetched test output into a vmksft-p-style result list.
384+
385+
Reads info/stdout files from the test-outputs directory and
386+
the fetched .attempted list (saved locally as attempted.json) to identify crashed tests.
387+
"""
388+
# Find the actual results subdir (scp -r creates reservation_id/ inside)
389+
local_results = os.path.join(results_path, 'test-outputs')
390+
output_dir = os.path.join(local_results, str(reservation_id))
391+
if not os.path.isdir(output_dir):
392+
output_dir = local_results
393+
394+
# Parse each test output directory
395+
cases = []
396+
completed_tests = set()
397+
if os.path.isdir(output_dir):
398+
for entry in sorted(os.listdir(output_dir)):
399+
test_dir = os.path.join(output_dir, entry)
400+
if not os.path.isdir(test_dir):
401+
continue
402+
403+
info_path = os.path.join(test_dir, 'info')
404+
stdout_path = os.path.join(test_dir, 'stdout')
405+
406+
if not os.path.exists(info_path):
407+
continue
408+
409+
with open(info_path, encoding='utf-8') as fp:
410+
info = json.load(fp)
411+
412+
retcode = info.get('retcode', 1)
413+
target = info.get('target', 'unknown')
414+
prog = info.get('prog', entry)
415+
test_name = f"{target}:{prog}"
416+
completed_tests.add(test_name)
417+
418+
stdout = ''
419+
if os.path.exists(stdout_path):
420+
with open(stdout_path, encoding='utf-8') as fp:
421+
stdout = fp.read()
422+
423+
# Determine result
424+
result = 'pass'
425+
if retcode == 4:
426+
result = 'skip'
427+
elif retcode != 0:
428+
result = 'fail'
429+
if 'ok' not in stdout.lower() and result == 'pass':
430+
result = 'skip'
431+
432+
safe_name = re.sub(r'[^0-9a-zA-Z]+', '-', prog)
433+
if safe_name and safe_name[-1] == '-':
434+
safe_name = safe_name[:-1]
405435

406-
cases = []
407-
for r in raw_results:
408436
outcome = {
409-
'test': r['test'],
410-
'group': r.get('group', 'selftests-hw'),
411-
'result': r['result'],
437+
'test': safe_name or entry,
438+
'group': f'selftests-{re.sub(r"[^0-9a-zA-Z]+", "-", target).rstrip("-")}',
439+
'result': result,
412440
'link': link,
413441
}
414-
for key in ['time', 'crashes']:
415-
if key in r:
416-
outcome[key] = r[key]
442+
if 'time' in info:
443+
outcome['time'] = info['time']
417444
cases.append(outcome)
418445

446+
# Check .attempted for crashed tests (attempted but no output)
447+
attempted_path = os.path.join(results_path, 'attempted.json')
448+
if os.path.exists(attempted_path):
449+
with open(attempted_path, encoding='utf-8') as fp:
450+
try:
451+
attempted = json.load(fp)
452+
except (json.JSONDecodeError, ValueError):
453+
attempted = []
454+
419455
for test_name in attempted:
420-
if test_name not in result_names:
456+
if test_name not in completed_tests:
421457
cases.append({
422458
'test': test_name,
423459
'group': 'selftests-hw',

0 commit comments

Comments
 (0)