Skip to content

Commit d7d266b

Browse files
committed
contest-hw: hwksft: always grab worker's journal
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 18d3d05 commit d7d266b

3 files changed

Lines changed: 45 additions & 44 deletions

File tree

contest/hw/hwksft.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
from lib.mc_client import MCClient, resolve_machines, resolve_nic_id # noqa: E402
2525
from lib.deployer import (build_kernel, build_ksft, deploy_artifacts, # noqa: E402
2626
kexec_machine, wait_for_results, fetch_results,
27-
set_log_file, WaitResult)
27+
set_log_file, WaitResult, grab_hw_worker_journal)
2828

2929
# Config:
3030
#
@@ -168,10 +168,12 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
168168

169169
# 7. Wait for hw-worker with crash monitoring
170170
wait_result = wait_for_results(config, mc, reservation_id,
171-
machine_ids, machine_ips,
172-
results_path=results_path)
171+
machine_ids, machine_ips)
173172

174-
# 8. Copy back results
173+
# 8. Grab hw-worker journal for debugging
174+
grab_hw_worker_journal(machine_ips[0], results_path)
175+
176+
# 9. Copy back results
175177
if wait_result.ok:
176178
cases = fetch_results(config, machine_ips, reservation_id, rinfo)
177179
else:
@@ -187,7 +189,7 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
187189
}]
188190
finally:
189191
set_log_file(None)
190-
# 9. Release reservation
192+
# 10. Release reservation
191193
try:
192194
mc.reservation_close(reservation_id)
193195
except Exception as e:

contest/hw/lib/deployer.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,18 @@ def _refresh():
207207
print(f"kexec: {ipaddr} is back")
208208

209209

210-
def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips,
211-
results_path=None):
210+
def grab_hw_worker_journal(ipaddr, results_path):
211+
"""Fetch hw-worker journal from the test machine and save locally."""
212+
journal = _ssh(ipaddr,
213+
'journalctl -u nipa-hw-worker.service -n 250 --no-pager',
214+
check=False)
215+
if journal:
216+
journal_file = os.path.join(results_path, 'hw-worker-journal')
217+
with open(journal_file, 'w', encoding='utf-8') as fp:
218+
fp.write(journal)
219+
220+
221+
def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips):
212222
"""Main wait loop with crash monitoring.
213223
214224
Polls SOL logs via mc.get_sol_logs() to detect crashes.
@@ -271,14 +281,6 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips,
271281
return WaitResult(ok=True)
272282

273283
# Service finished and no results.json — something went wrong.
274-
# Grab journalctl output for debugging.
275-
journal = _ssh(primary_ip,
276-
'journalctl -u nipa-hw-worker.service -n 100 --no-pager',
277-
check=False)
278-
if journal and results_path:
279-
journal_file = os.path.join(results_path, 'hw-worker-journal')
280-
with open(journal_file, 'w', encoding='utf-8') as fp:
281-
fp.write(journal)
282284
msg = f"hw-worker exited without results (state={state})"
283285
print(f"wait_for_results: {msg}")
284286
return WaitResult(ok=False, error=msg)
@@ -300,10 +302,6 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips,
300302
'unreferenced object 0x'))]
301303
for cl in crash_lines:
302304
print(f"wait_for_results: crash on machine {mid}: {cl.strip()}")
303-
if results_path:
304-
crash_file = os.path.join(results_path, f'crash-machine-{mid}')
305-
with open(crash_file, 'a', encoding='utf-8') as fp:
306-
fp.write(sol_text + '\n')
307305

308306
if mid in crash_detected_at:
309307
if _has_reboot(sol_text):

contest/hw/tests/test_hwksft.py

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,23 @@ def test_reserve_409_returns_json(self, mock_post):
154154

155155

156156
class TestDeployer(unittest.TestCase):
157+
def test_grab_hw_worker_journal(self):
158+
"""Verify journal is fetched and saved to results dir."""
159+
from lib.deployer import grab_hw_worker_journal
160+
161+
with tempfile.TemporaryDirectory() as tmpdir:
162+
with mock.patch('lib.deployer._ssh',
163+
return_value='Mar 14 hw-worker[1]: test log\n') as mock_ssh:
164+
grab_hw_worker_journal('10.0.0.1', tmpdir)
165+
166+
mock_ssh.assert_called_once()
167+
self.assertIn('journalctl', mock_ssh.call_args[0][1])
168+
self.assertIn('-n 250', mock_ssh.call_args[0][1])
169+
journal_file = os.path.join(tmpdir, 'hw-worker-journal')
170+
self.assertTrue(os.path.exists(journal_file))
171+
with open(journal_file) as fp:
172+
self.assertIn('test log', fp.read())
173+
157174
@mock.patch('subprocess.run')
158175
def test_deploy_artifacts(self, mock_run):
159176
mock_run.return_value = mock.Mock(returncode=0, stdout=b'', stderr=b'')
@@ -378,36 +395,20 @@ def ssh_retcode_side_effect(ip, cmd, timeout=30):
378395
return 1 # no results.json
379396
return 0
380397

381-
ssh_call_count = {'n': 0}
382-
383398
def ssh_side_effect(ip, cmd, check=True, timeout=30):
384-
ssh_call_count['n'] += 1
385399
if 'systemctl show' in cmd:
386400
return 'failed\n'
387-
if 'journalctl' in cmd:
388-
return 'Mar 14 hw-worker[123]: some log\n'
389401
return ''
390402

391-
with tempfile.TemporaryDirectory() as tmpdir:
392-
with mock.patch('lib.deployer._ssh_retcode',
393-
side_effect=ssh_retcode_side_effect):
394-
with mock.patch('lib.deployer._ssh',
395-
side_effect=ssh_side_effect) as mock_ssh:
396-
result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'],
397-
results_path=tmpdir)
398-
399-
self.assertIsInstance(result, WaitResult)
400-
self.assertFalse(result.ok)
401-
self.assertIn('hw-worker exited without results', result.error)
402-
403-
# Verify journalctl output was fetched and saved
404-
journal_calls = [c for c in mock_ssh.call_args_list
405-
if 'journalctl' in c[0][1]]
406-
self.assertEqual(len(journal_calls), 1)
407-
journal_file = os.path.join(tmpdir, 'hw-worker-journal')
408-
self.assertTrue(os.path.exists(journal_file))
409-
with open(journal_file) as fp:
410-
self.assertIn('some log', fp.read())
403+
with mock.patch('lib.deployer._ssh_retcode',
404+
side_effect=ssh_retcode_side_effect):
405+
with mock.patch('lib.deployer._ssh',
406+
side_effect=ssh_side_effect):
407+
result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
408+
409+
self.assertIsInstance(result, WaitResult)
410+
self.assertFalse(result.ok)
411+
self.assertIn('hw-worker exited without results', result.error)
411412

412413
@mock.patch('subprocess.run')
413414
@mock.patch('time.monotonic')

0 commit comments

Comments
 (0)