Skip to content

Commit 0ccb309

Browse files
committed
contest-hw: initial implementation
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 04f4ebe commit 0ccb309

18 files changed

Lines changed: 4229 additions & 0 deletions

contest/hw/hw_worker.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: GPL-2.0
3+
4+
"""NIPA HW worker — one-shot on-boot test runner."""
5+
6+
import json
7+
import os
8+
import subprocess
9+
10+
from lib.runner import find_newest_unseen, mark_all_seen, run_tests
11+
12+
13+
TESTS_DIR = '/srv/hw-worker/tests'
14+
RESULTS_DIR = '/srv/hw-worker/results'
15+
16+
# kselftest net.config keys (see drivers/net/README.rst)
17+
_NET_CONFIG_KEYS = ['NETIF', 'LOCAL_V4', 'LOCAL_V6', 'REMOTE_V4', 'REMOTE_V6',
18+
'LOCAL_PREFIX_V6', 'REMOTE_TYPE', 'REMOTE_ARGS']
19+
20+
21+
def _parse_env_file(path):
    """Read KEY=VALUE pairs from an env-style text file.

    Blank lines and lines whose first non-blank character is '#' are
    ignored, as are lines without an '='.  Only the first '=' splits
    the line, so values may themselves contain '='.  Keys and values
    are stripped of surrounding whitespace; quotes are kept verbatim.

    Returns a dict of the parsed pairs, empty when *path* does not exist.
    """
    parsed = {}
    if os.path.exists(path):
        with open(path, encoding='utf-8') as env_file:
            for raw in env_file:
                entry = raw.strip()
                # Skip blanks and comment lines
                if entry == '' or entry[0] == '#':
                    continue
                name, eq, value = entry.partition('=')
                if not eq:
                    # No '=' at all: not an assignment
                    continue
                parsed[name.strip()] = value.strip()
    return parsed
35+
36+
37+
def _ensure_link_up(ifname):
    """Set network interface *ifname* up via ``ip link set``.

    The command is issued unconditionally.  Raises
    subprocess.CalledProcessError if ``ip`` exits non-zero.
    """
    cmd = ['ip', 'link', 'set', ifname, 'up']
    subprocess.run(cmd, check=True)
40+
41+
42+
def _addr_is_listed(bare_addr, show_output):
    """Return True if *bare_addr* appears as an address in ``ip addr show`` text.

    Only the field following an ``inet`` / ``inet6`` keyword is examined,
    and addresses are compared as parsed IP objects so that differently
    spelled but equal IPv6 addresses match.  Text that does not parse as
    an IP address falls back to plain string equality.
    """
    # Local import keeps this block self-contained (stdlib only).
    import ipaddress

    def _norm(text):
        try:
            return ipaddress.ip_address(text)
        except ValueError:
            return text

    wanted = _norm(bare_addr)
    for line in show_output.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0] in ('inet', 'inet6'):
            # The field is "ADDR/PREFIX" or a bare "ADDR" (peer syntax)
            if _norm(fields[1].split('/')[0]) == wanted:
                return True
    return False


def _ensure_addr(ifname, addr):
    """Add an IP address to an interface if not already present.

    *addr* may carry a prefix length ("10.0.0.1/24"); when it does not,
    /64 is assumed for addresses containing ':' and /24 otherwise.

    The presence check matches whole addresses, not substrings, so
    "10.0.0.1" is not mistaken for an already configured "10.0.0.10".

    Raises subprocess.CalledProcessError if ``ip addr add`` fails.
    """
    bare_addr = addr.split('/')[0]
    # check=False: a failed "show" just means we go on and try to add
    ret = subprocess.run(['ip', 'addr', 'show', 'dev', ifname],
                         capture_output=True, check=False)
    if _addr_is_listed(bare_addr, ret.stdout.decode(errors='replace')):
        return
    if '/' not in addr:
        addr += '/64' if ':' in addr else '/24'
    subprocess.run(['ip', 'addr', 'add', addr, 'dev', ifname], check=True)
52+
53+
54+
def setup_test_interfaces(test_dir):
    """Bring up the test NICs and emit net.config based on nic-test.env.

    nic-test.env (deployed by the hwksft orchestrator) is read from
    *test_dir*.  If it is missing or yields no settings nothing is done.
    Otherwise:
      1. The DUT interface (NETIF) and, when REMOTE_IFNAME is set, the
         peer interface are brought up
      2. Their IPv4 / IPv6 addresses are added if not already configured
      3. The kselftest keys that have a value are written to net.config
         in each of drivers/net and drivers/net/hw that exists
    """
    settings = _parse_env_file(os.path.join(test_dir, 'nic-test.env'))
    if not settings:
        return

    # (interface key, IPv4 key, IPv6 key): DUT first, then the peer
    # interface (for loopback / same-machine peers)
    roles = (
        ('NETIF', 'LOCAL_V4', 'LOCAL_V6'),
        ('REMOTE_IFNAME', 'REMOTE_V4', 'REMOTE_V6'),
    )
    for if_key, v4_key, v6_key in roles:
        ifname = settings.get(if_key)
        if not ifname:
            continue
        _ensure_link_up(ifname)
        for addr_key in (v4_key, v6_key):
            if settings.get(addr_key):
                _ensure_addr(ifname, settings[addr_key])

    # Collect the net.config entries the kselftest framework understands
    entries = [f'{key}={settings[key]}'
               for key in _NET_CONFIG_KEYS if settings.get(key)]
    if not entries:
        return

    config_content = ''.join(f'{entry}\n' for entry in entries)
    for subdir in ('drivers/net', 'drivers/net/hw'):
        config_dir = os.path.join(test_dir, subdir)
        if not os.path.isdir(config_dir):
            continue
        path = os.path.join(config_dir, 'net.config')
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(config_content)
        print(f"Wrote {path}")
100+
101+
102+
def main():
    """Find pending tests, run them, and write results.

    Picks the newest unseen test directory under TESTS_DIR, checks that
    the running kernel is the one the test was deployed for, marks all
    test directories as seen, configures the test interfaces, runs the
    tests and stores the outcome as results.json under
    RESULTS_DIR/<test directory name>.

    Returns without running anything when there is no unseen test, the
    .kernel-version file is missing, or the running kernel does not match.
    """
    tests_dir = TESTS_DIR
    results_base = RESULTS_DIR

    test_dir = find_newest_unseen(tests_dir)
    if test_dir is None:
        print("No outstanding tests found")
        return

    # Verify we booted into the expected test kernel by comparing
    # the deployed kernel version against the running kernel.
    kver_path = os.path.join(test_dir, '.kernel-version')
    if not os.path.exists(kver_path):
        print("No kernel version file, skipping")
        return
    with open(kver_path, encoding='utf-8') as fp:
        expected = fp.read().strip()

    actual = os.uname().release
    # The kernel version includes the git hash and instance name
    # (via CONFIG_LOCALVERSION), so accidental prefix collisions
    # (e.g. "6.1" matching "6.12.0") cannot happen in practice.
    # The '-' separator check is an extra safety measure.
    if actual != expected and not actual.startswith(expected + '-'):
        print(f"Kernel mismatch: running {actual}, expected {expected}")
        return

    mark_all_seen(tests_dir)

    # Configure test interfaces and write net.config
    setup_test_interfaces(test_dir)

    reservation_id = os.path.basename(test_dir)
    results_dir = os.path.join(results_base, reservation_id)
    os.makedirs(results_dir, exist_ok=True)

    results = run_tests(test_dir, results_dir)

    # Write to a temporary name, flush to disk, then rename.  A reader
    # therefore never sees a partially written results.json, and the file
    # gets normal permissions (a bare os.open() with O_CREAT and no mode
    # would request 0o777).
    results_file = os.path.join(results_dir, 'results.json')
    tmp_file = results_file + '.tmp'
    with open(tmp_file, 'w', encoding='utf-8') as fp:
        json.dump(results, fp)
        fp.flush()
        os.fsync(fp.fileno())
    os.replace(tmp_file, results_file)

    print(f"Completed {len(results)} tests, results in {results_dir}")
149+
150+
151+
if __name__ == '__main__':
152+
main()

contest/hw/hwksft.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: GPL-2.0
3+
4+
"""NIPA HW kselftest orchestrator service."""
5+
6+
import datetime
7+
import os
8+
import subprocess
9+
import sys
10+
import time
11+
12+
# Add the project root to path for core imports
13+
_project_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..')
14+
sys.path.insert(0, _project_root)
15+
16+
# Add contest/remote to path and import CbArg/Fetcher BEFORE our own lib
17+
# shadows the 'lib' name. We need to do this before any 'from lib' imports.
18+
_remote_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'remote')
19+
sys.path.insert(0, _remote_dir)
20+
21+
# pylint: disable=wrong-import-position,wrong-import-order
22+
# Imports below require sys.path manipulation for cross-package access.
23+
24+
from core import NipaLifetime # noqa: E402 # pylint: disable=import-error
25+
26+
# Import from the remote lib package under the remote dir
27+
from lib.cbarg import CbArg # noqa: E402 # pylint: disable=import-error,no-name-in-module
28+
from lib.fetcher import Fetcher # noqa: E402 # pylint: disable=import-error,no-name-in-module
29+
30+
# Now remove contest/remote from path so our own lib takes precedence
31+
sys.path.remove(_remote_dir)
32+
33+
from lib.mc_client import MCClient, resolve_machines, resolve_nic_id # noqa: E402
34+
from lib.deployer import (build_kernel, build_ksft, deploy_artifacts, # noqa: E402
35+
kexec_machine, wait_for_results, fetch_results)
36+
37+
# Config:
38+
#
39+
# [executor]
40+
# name=hwksft-nic0
41+
# group=selftests-hw
42+
# init=force / continue / next
43+
# [remote]
44+
# branches=https://url-to-branches-manifest
45+
# [local]
46+
# base_path=/common/path
47+
# json_path=base-relative/path/to/json
48+
# results_path=base-relative/path/to/raw/outputs
49+
# tree_path=/root-path/to/kernel/git
50+
# patches_path=/root-path/to/patches/dir
51+
# [www]
52+
# url=https://url-to-reach-base-path
53+
# [hw]
54+
# nic_vendor=Intel
55+
# nic_model=E810-C
56+
# machine_control_url=http://control-node:5050
57+
# reservation_retry_time=60
58+
# max_kexec_boot_timeout=300
59+
# max_test_time=3600
60+
# crash_wait_time=120
61+
# sol_poll_interval=15
62+
# [build]
63+
# extra_kconfig=/path/to/nic-driver.config
64+
# [ksft]
65+
# target=net
66+
67+
68+
def _find_nic(all_nics, nic_id):
    """Return the entry of *all_nics* whose 'id' equals *nic_id*, or None."""
    return next((n for n in all_nics if n['id'] == nic_id), None)


def _reserve_machines(mc, config, machine_ids):
    """Reserve *machine_ids* via *mc*, retrying with capped backoff.

    Makes up to [hw] max_reservation_retries attempts (default 30).  The
    delay between attempts starts at [hw] reservation_retry_time seconds
    (default 60), grows by 1.5x per attempt and is capped at 300 seconds.

    Returns the reservation id.  Raises RuntimeError once all attempts
    have failed; no delay is taken after the final attempt.
    """
    max_retries = config.getint('hw', 'max_reservation_retries', fallback=30)
    retry_time = config.getint('hw', 'reservation_retry_time', fallback=60)
    for attempt in range(max_retries):
        result = mc.reserve(machine_ids)
        if 'reservation_id' in result:
            return result['reservation_id']
        error = result.get('error', '?')
        if attempt + 1 >= max_retries:
            # Nothing left to retry: do not sleep just to fail afterwards
            print(f"Reserve failed ({error}), giving up")
            break
        # Bound the exponent so a very large retry count cannot overflow
        # the float power; the 300s cap is reached long before that.
        wait = min(retry_time * (1.5 ** min(attempt, 32)), 300)
        print(f"Reserve failed ({error}), "
              f"retry {attempt+1}/{max_retries} in {wait:.0f}s")
        time.sleep(wait)
    raise RuntimeError(f"Failed to reserve machines after {max_retries} attempts")


def test(binfo, rinfo, cbarg):  # pylint: disable=unused-argument
    """Fetcher callback: build, deploy, run, and collect HW test results.

    binfo is unused.  rinfo must contain 'run-cookie'; its 'link' entry
    is set to the URL of the results directory.  cbarg supplies the
    configuration (refreshed on every call).

    Returns a list of test case dicts: a single failed 'build' case when
    the kernel or selftest build fails, otherwise whatever fetch_results()
    returns.

    Raises RuntimeError when the machines cannot be reserved.  Errors from
    the deploy / kexec / wait / fetch steps propagate after the
    reservation has been closed.
    """
    print("Run at", datetime.datetime.now())
    cbarg.refresh_config()
    config = cbarg.config

    results_path = os.path.join(config.get('local', 'base_path'),
                                config.get('local', 'results_path'),
                                rinfo['run-cookie'])
    os.makedirs(results_path, exist_ok=True)

    link = config.get('www', 'url') + '/' + \
        config.get('local', 'results_path') + '/' + \
        rinfo['run-cookie']
    rinfo['link'] = link
    grp_name = config.get('executor', 'group', fallback='selftests-hw')

    tree_path = config.get('local', 'tree_path')
    mc_url = config.get('hw', 'machine_control_url')
    nic_vendor = config.get('hw', 'nic_vendor')
    nic_model = config.get('hw', 'nic_model')
    mc = MCClient(mc_url)

    # 1. Build kernel + ksft
    try:
        kernel_version = build_kernel(config, tree_path)
        build_ksft(config, tree_path)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Build failed: {e}")
        return [{
            'test': 'build',
            'group': grp_name,
            'result': 'fail',
            'link': link,
        }]

    # 2. Resolve machines for NIC
    all_nics = mc.get_nic_info()
    nic_id = resolve_nic_id(all_nics, nic_vendor, nic_model)
    machine_ids, nic = resolve_machines(all_nics, nic_id)

    # Build nic_info dict with peer info for deployment
    nic_deploy_info = {
        'ifname': nic.get('ifname', ''),
        'ip4addr': nic.get('ip4addr', ''),
        'ip6addr': nic.get('ip6addr', ''),
    }
    peer = _find_nic(all_nics, nic['peer_id']) if nic.get('peer_id') else None
    if peer is not None:
        nic_deploy_info['peer'] = {
            'ifname': peer.get('ifname', ''),
            'ip4addr': peer.get('ip4addr', ''),
            'ip6addr': peer.get('ip6addr', ''),
        }

    # 3. Get machine IPs for SSH/SCP
    all_machines = mc.get_machine_info()
    machine_ip_map = {m['id']: m['mgmt_ipaddr'] for m in all_machines}
    machine_ips = [machine_ip_map[mid] for mid in machine_ids]

    # Record peer machine IP so deployer can set REMOTE_ARGS
    if peer is not None:
        peer_machine_id = peer['machine_id']
        if peer_machine_id in machine_ip_map:
            peer_ip = machine_ip_map[peer_machine_id]
        else:
            # Unknown peer machine: fall back to the first test machine
            peer_ip = machine_ips[0]
        nic_deploy_info['peer_machine_ip'] = peer_ip

    # 4. Reserve machines (retry loop with backoff)
    reservation_id = _reserve_machines(mc, config, machine_ids)

    try:
        # 5. Deploy artifacts via SCP
        deploy_artifacts(config, machine_ips, reservation_id, nic_deploy_info,
                         tree_path, kernel_version)

        # 6. kexec into new kernel
        kexec_machine(config, machine_ips, reservation_id)

        # 7. Wait for hw-worker with crash monitoring
        wait_for_results(config, mc, reservation_id, machine_ids, machine_ips)

        # 8. Copy back results
        cases = fetch_results(config, machine_ips, reservation_id, rinfo)
    finally:
        # 9. Release reservation (best effort: never mask the real error)
        try:
            mc.reservation_close(reservation_id)
        except Exception as e:  # pylint: disable=broad-except
            print(f"Warning: failed to close reservation {reservation_id}: {e}")

    print("Done at", datetime.datetime.now())
    return cases
176+
177+
178+
def main():
    """Entry point: load the configuration and run the Fetcher poll loop.

    Configuration is read from hw.config and hwksft.config followed by
    any paths given on the command line.  The Fetcher is set up with
    test() as its callback; once its run() returns, the NipaLifetime
    object is told to exit.
    """
    # sys.argv[1:] is simply empty when no extra config paths are given
    cbarg = CbArg(['hw.config', 'hwksft.config'] + sys.argv[1:])
    config = cbarg.config

    base_dir = config.get('local', 'base_path')

    life = NipaLifetime(config)

    # Gather the Fetcher settings up front, in the order they are passed
    fetcher_args = {
        'name': config.get('executor', 'name'),
        'branches_url': config.get('remote', 'branches'),
    }
    json_rel = config.get('local', 'json_path')
    fetcher_args['results_path'] = os.path.join(base_dir, json_rel)
    fetcher_args['url_path'] = config.get('www', 'url') + '/' + json_rel
    fetcher_args['tree_path'] = config.get('local', 'tree_path')
    fetcher_args['patches_path'] = config.get('local', 'patches_path',
                                              fallback=None)
    fetcher_args['life'] = life
    fetcher_args['first_run'] = config.get('executor', 'init',
                                           fallback="continue")

    fetcher = Fetcher(test, cbarg, **fetcher_args)
    fetcher.run()
    life.exit()
202+
203+
204+
if __name__ == '__main__':
205+
main()

contest/hw/lib/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# SPDX-License-Identifier: GPL-2.0

0 commit comments

Comments
 (0)