Skip to content

Commit 2a660c2

Browse files
committed
contest-hw: add more debug for failing runs
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 44f92dc commit 2a660c2

5 files changed

Lines changed: 171 additions & 15 deletions

File tree

contest/hw/hw_worker.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def main():
144144
# the deployed kernel version against the running kernel.
145145
kver_path = os.path.join(test_dir, '.kernel-version')
146146
if not os.path.exists(kver_path):
147-
print("No kernel version file, skipping")
147+
print(test_dir, "No kernel version file, skipping")
148148
return
149149
with open(kver_path, encoding='utf-8') as fp:
150150
expected = fp.read().strip()
@@ -155,11 +155,14 @@ def main():
155155
# (e.g. "6.1" matching "6.12.0") cannot happen in practice.
156156
# The '-' separator check is an extra safety measure.
157157
if actual != expected and not actual.startswith(expected + '-'):
158-
print(f"Kernel mismatch: running {actual}, expected {expected}")
158+
print(test_dir,
159+
f"Kernel mismatch: running {actual}, expected {expected}")
159160
return
160161

161162
mark_all_seen(tests_dir)
162163

164+
print(test_dir, "Starting tests")
165+
163166
# Configure test interfaces and write net.config
164167
setup_test_interfaces(test_dir)
165168

contest/hw/hwksft.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from lib.mc_client import MCClient, resolve_machines, resolve_nic_id # noqa: E402
2525
from lib.deployer import (build_kernel, build_ksft, deploy_artifacts, # noqa: E402
2626
kexec_machine, wait_for_results, fetch_results,
27-
set_log_file)
27+
set_log_file, WaitResult)
2828

2929
# Config:
3030
#
@@ -167,14 +167,18 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
167167
kexec_machine(config, machine_ips, reservation_id, mc=mc)
168168

169169
# 7. Wait for hw-worker with crash monitoring
170-
has_results = wait_for_results(config, mc, reservation_id,
170+
wait_result = wait_for_results(config, mc, reservation_id,
171171
machine_ids, machine_ips,
172172
results_path=results_path)
173173

174174
# 8. Copy back results
175-
if has_results:
175+
if wait_result.ok:
176176
cases = fetch_results(config, machine_ips, reservation_id, rinfo)
177177
else:
178+
# Write error to disk so it's visible via the UI result link
179+
with open(os.path.join(results_path, 'error'), 'w',
180+
encoding='utf-8') as fp:
181+
fp.write(wait_result.error + '\n')
178182
cases = [{
179183
'test': 'hw-worker',
180184
'group': grp_name,

contest/hw/lib/deployer.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import subprocess
1111
import tempfile
1212
import time
13+
from dataclasses import dataclass, field
1314

1415

1516
# Log file handle, set by set_log_file() before builds start.
@@ -19,6 +20,13 @@
1920
_initrd_cache = {}
2021

2122

23+
@dataclass
24+
class WaitResult:
25+
"""Result of wait_for_results()."""
26+
ok: bool
27+
error: str = ''
28+
29+
2230
def set_log_file(fp):
2331
"""Set the file handle for command output logging."""
2432
global _log_fp # pylint: disable=global-statement
@@ -223,31 +231,40 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips,
223231
while True:
224232
elapsed = time.monotonic() - start_time
225233
if elapsed > max_test_time:
226-
# Caller (hwksft.test) will still try fetch_results, which
227-
# handles missing results gracefully.
228-
print("wait_for_results: max test time exceeded")
229-
break
234+
msg = "max test time exceeded"
235+
print(f"wait_for_results: {msg}")
236+
return WaitResult(ok=False, error=msg)
230237

231238
# Refresh reservation
232239
result = mc.reservation_refresh(reservation_id)
233240
if not result.get('ok') and 'error' in result:
234-
print(f"wait_for_results: reservation refresh failed: {result['error']}")
235-
break
241+
msg = f"reservation refresh failed: {result['error']}"
242+
print(f"wait_for_results: {msg}")
243+
return WaitResult(ok=False, error=msg)
236244

237245
# Check if hw-worker has produced results on primary machine
238246
primary_ip = machine_ips[0]
239247
ret = _ssh_retcode(primary_ip,
240248
f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
241249
if ret == 0:
242250
print("wait_for_results: hw-worker completed")
243-
return True
251+
return WaitResult(ok=True)
244252

245253
# Check if hw-worker exited without producing results
246254
ret = _ssh_retcode(primary_ip, 'systemctl is-active nipa-hw-worker.service')
247255
if ret != 0:
248-
# Service is inactive/failed — no results.json means it failed
249-
print("wait_for_results: hw-worker exited without results")
250-
return False
256+
# Service is inactive/failed — no results.json means it failed.
257+
# Grab journalctl output for debugging.
258+
journal = _ssh(primary_ip,
259+
'journalctl -u nipa-hw-worker.service -n 100 --no-pager',
260+
check=False)
261+
if journal and results_path:
262+
journal_file = os.path.join(results_path, 'hw-worker-journal')
263+
with open(journal_file, 'w', encoding='utf-8') as fp:
264+
fp.write(journal)
265+
msg = "hw-worker exited without results"
266+
print(f"wait_for_results: {msg}")
267+
return WaitResult(ok=False, error=msg)
251268

252269
# Check SOL logs for crashes on each machine
253270
for i, mid in enumerate(machine_ids):

contest/hw/lib/runner.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,11 +156,14 @@ def run_tests(test_dir, results_dir):
156156
print("No tests found")
157157
return []
158158

159+
print(f"Found {len(tests)} tests")
160+
159161
previously_attempted = set(load_attempted(test_dir))
160162
results = []
161163

162164
# Mark previously attempted tests as crashed
163165
for test_name in previously_attempted:
166+
print(f"Skipping previously attempted (crashed): {test_name}")
164167
results.append({
165168
'test': _namify(test_name),
166169
'group': 'selftests-hw',
@@ -176,6 +179,8 @@ def run_tests(test_dir, results_dir):
176179
if test_name in previously_attempted:
177180
continue
178181

182+
print(f"[{test_idx+1}/{len(tests)}] Running {test_name}")
183+
179184
# Mark as attempted before execution
180185
mark_attempted(test_dir, test_name)
181186

@@ -204,6 +209,7 @@ def run_tests(test_dir, results_dir):
204209
retcode = 1
205210
stdout = ''
206211
stderr = 'test timed out'
212+
print(f"[{test_idx+1}/{len(tests)}] {test_name}: timed out")
207213
t2 = time.monotonic()
208214

209215
# Save output
@@ -236,6 +242,8 @@ def run_tests(test_dir, results_dir):
236242
outcome['crashes'] = crash_lines
237243
outcome['result'] = 'fail'
238244

245+
print(f"[{test_idx+1}/{len(tests)}] {test_name}: {outcome['result']} ({outcome['time']}s)")
246+
239247
results.append(outcome)
240248

241249
return results

contest/hw/tests/test_hwksft.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-License-Identifier: GPL-2.0
22

33
import json
4+
import tempfile
45
import unittest
56
from unittest import mock
67

@@ -314,6 +315,129 @@ def test_sol_reboot_detected(self):
314315
normal_output = "[ 123.456789] Normal kernel message"
315316
self.assertFalse(_has_reboot(normal_output))
316317

318+
@mock.patch('subprocess.run')
319+
@mock.patch('time.monotonic')
320+
@mock.patch('time.sleep')
321+
def test_wait_for_results_timeout(self, _mock_sleep,
322+
mock_monotonic, mock_run):
323+
"""max_test_time exceeded returns WaitResult with error string."""
324+
from lib.deployer import wait_for_results, WaitResult
325+
326+
mock_run.return_value = mock.Mock(returncode=0, stdout=b'', stderr=b'')
327+
mock_monotonic.side_effect = [
328+
0, # start_time
329+
99999, # elapsed check -> exceeds max_test_time
330+
]
331+
332+
config = mock.Mock()
333+
config.getint.side_effect = lambda section, key, fallback=None: {
334+
'max_test_time': 3600,
335+
'sol_poll_interval': 15,
336+
'crash_wait_time': 120,
337+
'max_kexec_boot_timeout': 300,
338+
}.get(key, fallback)
339+
340+
mc = mock.Mock()
341+
mc.get_sol_logs.return_value = {'last_id': 0, 'lines': []}
342+
343+
result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
344+
345+
self.assertIsInstance(result, WaitResult)
346+
self.assertFalse(result.ok)
347+
self.assertIn('max test time exceeded', result.error)
348+
349+
@mock.patch('subprocess.run')
350+
@mock.patch('time.monotonic')
351+
@mock.patch('time.sleep')
352+
def test_wait_for_results_no_results(self, _mock_sleep,
353+
mock_monotonic, mock_run):
354+
"""hw-worker exits without results returns WaitResult with error."""
355+
from lib.deployer import wait_for_results, WaitResult
356+
357+
mock_run.return_value = mock.Mock(returncode=0, stdout=b'', stderr=b'')
358+
mock_monotonic.side_effect = [
359+
0, # start_time
360+
10, # elapsed check
361+
]
362+
363+
config = mock.Mock()
364+
config.getint.side_effect = lambda section, key, fallback=None: {
365+
'max_test_time': 3600,
366+
'sol_poll_interval': 15,
367+
'crash_wait_time': 120,
368+
'max_kexec_boot_timeout': 300,
369+
}.get(key, fallback)
370+
371+
mc = mock.Mock()
372+
mc.get_sol_logs.return_value = {'last_id': 0, 'lines': []}
373+
mc.reservation_refresh.return_value = {'ok': True}
374+
375+
def ssh_retcode_side_effect(ip, cmd, timeout=30):
376+
if 'test -f' in cmd:
377+
return 1 # no results.json
378+
if 'is-active' in cmd:
379+
return 1 # service exited
380+
return 0
381+
382+
with tempfile.TemporaryDirectory() as tmpdir:
383+
with mock.patch('lib.deployer._ssh_retcode',
384+
side_effect=ssh_retcode_side_effect):
385+
with mock.patch('lib.deployer._ssh',
386+
return_value='Mar 14 hw-worker[123]: some log\n') as mock_ssh:
387+
result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'],
388+
results_path=tmpdir)
389+
390+
self.assertIsInstance(result, WaitResult)
391+
self.assertFalse(result.ok)
392+
self.assertIn('hw-worker exited without results', result.error)
393+
394+
# Verify journalctl output was fetched and saved
395+
mock_ssh.assert_called_once()
396+
self.assertIn('journalctl', mock_ssh.call_args[0][1])
397+
journal_file = os.path.join(tmpdir, 'hw-worker-journal')
398+
self.assertTrue(os.path.exists(journal_file))
399+
with open(journal_file) as fp:
400+
self.assertIn('some log', fp.read())
401+
402+
@mock.patch('subprocess.run')
403+
@mock.patch('time.monotonic')
404+
@mock.patch('time.sleep')
405+
def test_wait_for_results_success(self, _mock_sleep,
406+
mock_monotonic, mock_run):
407+
"""hw-worker completes with results returns WaitResult(ok=True)."""
408+
from lib.deployer import wait_for_results, WaitResult
409+
410+
mock_run.return_value = mock.Mock(returncode=0, stdout=b'', stderr=b'')
411+
mock_monotonic.side_effect = [
412+
0, # start_time
413+
10, # elapsed check
414+
]
415+
416+
config = mock.Mock()
417+
config.getint.side_effect = lambda section, key, fallback=None: {
418+
'max_test_time': 3600,
419+
'sol_poll_interval': 15,
420+
'crash_wait_time': 120,
421+
'max_kexec_boot_timeout': 300,
422+
}.get(key, fallback)
423+
424+
mc = mock.Mock()
425+
mc.get_sol_logs.return_value = {'last_id': 0, 'lines': []}
426+
mc.reservation_refresh.return_value = {'ok': True}
427+
428+
def ssh_retcode_side_effect(ip, cmd, timeout=30):
429+
if 'test -f' in cmd:
430+
return 0 # results.json exists
431+
return 0
432+
433+
with mock.patch('lib.deployer._ssh_retcode',
434+
side_effect=ssh_retcode_side_effect):
435+
result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
436+
437+
self.assertIsInstance(result, WaitResult)
438+
self.assertTrue(result.ok)
439+
self.assertEqual(result.error, '')
440+
317441
@mock.patch('subprocess.run')
318442
@mock.patch('time.monotonic')
319443
@mock.patch('time.sleep')

0 commit comments

Comments
 (0)