Skip to content

Commit ae1141c

Browse files
committed
contest-hw: hwksft: correct the check if worker exited
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent b652c0f commit ae1141c

2 files changed

Lines changed: 58 additions & 23 deletions

File tree

contest/hw/lib/deployer.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -250,10 +250,27 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips,
250250
print("wait_for_results: hw-worker completed")
251251
return WaitResult(ok=True)
252252

253-
# Check if hw-worker exited without producing results
254-
ret = _ssh_retcode(primary_ip, 'systemctl is-active nipa-hw-worker.service')
255-
if ret != 0:
256-
# Service is inactive/failed — no results.json means it failed.
253+
# Check if hw-worker exited without producing results.
254+
# For Type=oneshot services, is-active returns "activating" (rc=3)
255+
# while running, "active" (rc=0) after success with RemainAfterExit=yes,
256+
# and "failed"/"inactive" after failure/stop. Use show -p ActiveState
257+
# to distinguish "still running" from "done".
258+
state = _ssh(primary_ip,
259+
'systemctl show -p ActiveState --value nipa-hw-worker.service',
260+
check=False).strip()
261+
if state == 'activating':
262+
pass # still running, continue polling
263+
elif state in ('inactive', 'failed'):
264+
# Service exited, but results.json may have been written
265+
# between our test -f check and the state check (race).
266+
# Re-check before declaring failure.
267+
ret = _ssh_retcode(primary_ip,
268+
f'test -f /srv/hw-worker/results/{reservation_id}/results.json')
269+
if ret == 0:
270+
print("wait_for_results: hw-worker completed")
271+
return WaitResult(ok=True)
272+
273+
# Service finished and no results.json — something went wrong.
257274
# Grab journalctl output for debugging.
258275
journal = _ssh(primary_ip,
259276
'journalctl -u nipa-hw-worker.service -n 100 --no-pager',
@@ -262,7 +279,7 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips,
262279
journal_file = os.path.join(results_path, 'hw-worker-journal')
263280
with open(journal_file, 'w', encoding='utf-8') as fp:
264281
fp.write(journal)
265-
msg = "hw-worker exited without results"
282+
msg = f"hw-worker exited without results (state={state})"
266283
print(f"wait_for_results: {msg}")
267284
return WaitResult(ok=False, error=msg)
268285

contest/hw/tests/test_hwksft.py

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -375,15 +375,23 @@ def test_wait_for_results_no_results(self, _mock_sleep,
375375
def ssh_retcode_side_effect(ip, cmd, timeout=30):
376376
if 'test -f' in cmd:
377377
return 1 # no results.json
378-
if 'is-active' in cmd:
379-
return 1 # service exited
380378
return 0
381379

380+
ssh_call_count = {'n': 0}
381+
382+
def ssh_side_effect(ip, cmd, check=True, timeout=30):
383+
ssh_call_count['n'] += 1
384+
if 'systemctl show' in cmd:
385+
return 'failed\n'
386+
if 'journalctl' in cmd:
387+
return 'Mar 14 hw-worker[123]: some log\n'
388+
return ''
389+
382390
with tempfile.TemporaryDirectory() as tmpdir:
383391
with mock.patch('lib.deployer._ssh_retcode',
384392
side_effect=ssh_retcode_side_effect):
385393
with mock.patch('lib.deployer._ssh',
386-
return_value='Mar 14 hw-worker[123]: some log\n') as mock_ssh:
394+
side_effect=ssh_side_effect) as mock_ssh:
387395
result = wait_for_results(config, mc, 42, [1], ['10.0.0.1'],
388396
results_path=tmpdir)
389397

@@ -392,8 +400,9 @@ def ssh_retcode_side_effect(ip, cmd, timeout=30):
392400
self.assertIn('hw-worker exited without results', result.error)
393401

394402
# Verify journalctl output was fetched and saved
395-
mock_ssh.assert_called_once()
396-
self.assertIn('journalctl', mock_ssh.call_args[0][1])
403+
journal_calls = [c for c in mock_ssh.call_args_list
404+
if 'journalctl' in c[0][1]]
405+
self.assertEqual(len(journal_calls), 1)
397406
journal_file = os.path.join(tmpdir, 'hw-worker-journal')
398407
self.assertTrue(os.path.exists(journal_file))
399408
with open(journal_file) as fp:
@@ -487,20 +496,24 @@ def test_self_reboot_skips_power_cycle(self, _mock_sleep,
487496
poll_num = {'n': 0}
488497

489498
def ssh_retcode_side_effect(ip, cmd, timeout=30):
490-
if 'is-active' in cmd:
491-
poll_num['n'] += 1
492-
if poll_num['n'] <= 2:
493-
return 0 # active
494-
return 1 # exited
495499
if 'test -f' in cmd:
496500
return 0 # results exist
497501
return 0
498-
ssh_retcode_side_effect.result_calls = 0
502+
503+
def ssh_side_effect(ip, cmd, check=True, timeout=30):
504+
if 'systemctl show' in cmd:
505+
poll_num['n'] += 1
506+
if poll_num['n'] <= 2:
507+
return 'activating\n'
508+
return 'active\n'
509+
return ''
499510

500511
with mock.patch('lib.deployer._ssh_retcode',
501512
side_effect=ssh_retcode_side_effect):
502-
with mock.patch('lib.deployer._crash_recover') as mock_recover:
503-
wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
513+
with mock.patch('lib.deployer._ssh',
514+
side_effect=ssh_side_effect):
515+
with mock.patch('lib.deployer._crash_recover') as mock_recover:
516+
wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
504517

505518
# Should have been called with skip_power_cycle=True
506519
mock_recover.assert_called_once()
@@ -546,21 +559,26 @@ def test_no_power_cycle_if_sol_progressing(self, _mock_sleep,
546559
},
547560
]
548561

549-
# hw-worker: not active, no results on first; results on second
562+
# hw-worker: still activating on first poll; results on second
550563
call_count = {'results': 0}
551564

552565
def ssh_retcode_side_effect(ip, cmd, timeout=30):
553-
if 'is-active' in cmd:
554-
return 1
555566
if 'test -f' in cmd:
556567
call_count['results'] += 1
557568
return 0 if call_count['results'] >= 2 else 1
558569
return 0
559570

571+
def ssh_side_effect(ip, cmd, check=True, timeout=30):
572+
if 'systemctl show' in cmd:
573+
return 'activating\n'
574+
return ''
575+
560576
with mock.patch('lib.deployer._ssh_retcode',
561577
side_effect=ssh_retcode_side_effect):
562-
with mock.patch('lib.deployer._crash_recover') as mock_recover:
563-
wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
578+
with mock.patch('lib.deployer._ssh',
579+
side_effect=ssh_side_effect):
580+
with mock.patch('lib.deployer._crash_recover') as mock_recover:
581+
wait_for_results(config, mc, 42, [1], ['10.0.0.1'])
564582

565583
# SOL was progressing — no recovery should have been triggered
566584
mock_recover.assert_not_called()

0 commit comments

Comments
 (0)