|
5 | 5 | import json |
6 | 6 | import os |
7 | 7 | import random |
| 8 | +import re |
8 | 9 | import shutil |
9 | 10 | import string |
10 | 11 | import subprocess |
11 | | -import tempfile |
12 | 12 | import time |
13 | 13 | from dataclasses import dataclass, field |
14 | 14 |
|
@@ -210,7 +210,7 @@ def _refresh(): |
210 | 210 | def grab_hw_worker_journal(ipaddr, results_path): |
211 | 211 | """Fetch hw-worker journal from the test machine and save locally.""" |
212 | 212 | journal = _ssh(ipaddr, |
213 | | - 'journalctl -u nipa-hw-worker.service -n 250 --no-pager', |
| 213 | + 'journalctl -u nipa-hw-worker.service -b --no-pager', |
214 | 214 | check=False) |
215 | 215 | if journal: |
216 | 216 | journal_file = os.path.join(results_path, 'hw-worker-journal') |
@@ -252,38 +252,26 @@ def wait_for_results(config, mc, reservation_id, machine_ids, machine_ips): |
252 | 252 | print(f"wait_for_results: {msg}") |
253 | 253 | return WaitResult(ok=False, error=msg) |
254 | 254 |
|
255 | | - # Check if hw-worker has produced results on primary machine |
256 | | - primary_ip = machine_ips[0] |
257 | | - ret = _ssh_retcode(primary_ip, |
258 | | - f'test -f /srv/hw-worker/results/{reservation_id}/results.json') |
259 | | - if ret == 0: |
260 | | - print("wait_for_results: hw-worker completed") |
261 | | - return WaitResult(ok=True) |
262 | | - |
263 | | - # Check if hw-worker exited without producing results. |
| 255 | + # Check if hw-worker service has exited. |
264 | 256 | # For Type=oneshot services, is-active returns "activating" (rc=3) |
265 | 257 | # while running, "active" (rc=0) after success with RemainAfterExit=yes, |
266 | 258 | # and "failed"/"inactive" after failure/stop. Use show -p ActiveState |
267 | 259 | # to distinguish "still running" from "done". |
| 260 | + primary_ip = machine_ips[0] |
268 | 261 | state = _ssh(primary_ip, |
269 | 262 | 'systemctl show -p ActiveState --value nipa-hw-worker.service', |
270 | 263 | check=False).strip() |
271 | 264 | if state == 'activating': |
272 | 265 | pass # still running, continue polling |
273 | | - elif state in ('inactive', 'failed'): |
274 | | - # Service exited, but results.json may have been written |
275 | | - # between our test -f check and the state check (race). |
276 | | - # Re-check before declaring failure. |
277 | | - ret = _ssh_retcode(primary_ip, |
278 | | - f'test -f /srv/hw-worker/results/{reservation_id}/results.json') |
279 | | - if ret == 0: |
280 | | - print("wait_for_results: hw-worker completed") |
281 | | - return WaitResult(ok=True) |
282 | | - |
283 | | - # Service finished and no results.json — something went wrong. |
284 | | - msg = f"hw-worker exited without results (state={state})" |
| 266 | + elif state == 'failed': |
| 267 | + msg = "hw-worker service failed" |
285 | 268 | print(f"wait_for_results: {msg}") |
286 | 269 | return WaitResult(ok=False, error=msg) |
| 270 | + elif state in ('inactive', 'active'): |
| 271 | + # inactive = exited normally (RemainAfterExit=no) |
| 272 | + # active = exited normally (RemainAfterExit=yes) |
| 273 | + print("wait_for_results: hw-worker completed") |
| 274 | + return WaitResult(ok=True) |
287 | 275 |
|
288 | 276 | # Check SOL logs for crashes on each machine |
289 | 277 | for i, mid in enumerate(machine_ids): |
@@ -363,61 +351,109 @@ def _refresh(): |
363 | 351 | kexec_machine(config, [ipaddr], reservation_id, mc=mc) |
364 | 352 |
|
365 | 353 |
|
366 | | -def fetch_results(_config, machine_ips, reservation_id, rinfo): |
367 | | - """SCP results from test machines back to build node. |
| 354 | +def fetch_results(machine_ips, reservation_id, results_path): |
| 355 | + """SCP test output from the test machine back to the build node. |
368 | 356 |
|
369 | | - Parse and format into vmksft-p-style result list. |
370 | | - Tests that crashed (in .attempted but not in results) are marked |
371 | | - as result='fail' with crash info. |
| 357 | + Copies the results directory tree and the .attempted file. |
372 | 358 | """ |
373 | 359 | primary_ip = machine_ips[0] |
374 | 360 | remote_results = f'/srv/hw-worker/results/{reservation_id}' |
| 361 | + remote_tests = f'/srv/hw-worker/tests/{reservation_id}' |
375 | 362 |
|
376 | | - with tempfile.TemporaryDirectory() as tmpdir: |
377 | | - # Copy results.json |
378 | | - _scp_from(primary_ip, f'{remote_results}/results.json', |
379 | | - os.path.join(tmpdir, 'results.json')) |
380 | | - |
381 | | - # Copy .attempted for crash tracking |
382 | | - remote_tests = f'/srv/hw-worker/tests/{reservation_id}' |
383 | | - _scp_from(primary_ip, f'{remote_tests}/.attempted', |
384 | | - os.path.join(tmpdir, 'attempted.json'), |
385 | | - check=False) |
386 | | - |
387 | | - # Parse results |
388 | | - results_path = os.path.join(tmpdir, 'results.json') |
389 | | - if os.path.exists(results_path): |
390 | | - with open(results_path, encoding='utf-8') as fp: |
391 | | - raw_results = json.load(fp) |
392 | | - else: |
393 | | - raw_results = [] |
| 363 | + # Copy the entire results directory tree |
| 364 | + local_results = os.path.join(results_path, 'test-outputs') |
| 365 | + os.makedirs(local_results, exist_ok=True) |
| 366 | + # Use scp -r to grab all test output directories |
| 367 | + ret = subprocess.run( |
| 368 | + ['scp', '-r', '-o', 'StrictHostKeyChecking=no', |
| 369 | + '-o', 'BatchMode=yes', |
| 370 | + f'root@{primary_ip}:{remote_results}/', local_results], |
| 371 | + capture_output=True, timeout=300, check=False |
| 372 | + ) |
| 373 | + if ret.returncode != 0: |
| 374 | + print(f"fetch_results: scp failed: {ret.stderr.decode('utf-8', 'ignore')}") |
| 375 | + |
| 376 | + # Copy .attempted for crash tracking |
| 377 | + _scp_from(primary_ip, f'{remote_tests}/.attempted', |
| 378 | + os.path.join(results_path, 'attempted.json'), |
| 379 | + check=False) |
394 | 380 |
|
395 | | - # Load attempted tests |
396 | | - attempted_path = os.path.join(tmpdir, 'attempted.json') |
397 | | - attempted = [] |
398 | | - if os.path.exists(attempted_path): |
399 | | - with open(attempted_path, encoding='utf-8') as fp: |
400 | | - attempted = json.load(fp) |
401 | 381 |
|
402 | | - # Identify crashed tests: in attempted but not in results |
403 | | - result_names = {r['test'] for r in raw_results} |
404 | | - link = rinfo.get('link', '') |
| 382 | +def parse_results(reservation_id, results_path, link): |
| 383 | + """Parse fetched test output into a vmksft-p-style result list. |
| 384 | +
|
| 385 | + Reads info/stdout files from the test-outputs directory and |
| 386 | + the .attempted file to identify crashed tests. |
| 387 | + """ |
| 388 | + # Find the actual results subdir (scp -r creates reservation_id/ inside) |
| 389 | + local_results = os.path.join(results_path, 'test-outputs') |
| 390 | + output_dir = os.path.join(local_results, str(reservation_id)) |
| 391 | + if not os.path.isdir(output_dir): |
| 392 | + output_dir = local_results |
| 393 | + |
| 394 | + # Parse each test output directory |
| 395 | + cases = [] |
| 396 | + completed_tests = set() |
| 397 | + if os.path.isdir(output_dir): |
| 398 | + for entry in sorted(os.listdir(output_dir)): |
| 399 | + test_dir = os.path.join(output_dir, entry) |
| 400 | + if not os.path.isdir(test_dir): |
| 401 | + continue |
| 402 | + |
| 403 | + info_path = os.path.join(test_dir, 'info') |
| 404 | + stdout_path = os.path.join(test_dir, 'stdout') |
| 405 | + |
| 406 | + if not os.path.exists(info_path): |
| 407 | + continue |
| 408 | + |
| 409 | + with open(info_path, encoding='utf-8') as fp: |
| 410 | + info = json.load(fp) |
| 411 | + |
| 412 | + retcode = info.get('retcode', 1) |
| 413 | + target = info.get('target', 'unknown') |
| 414 | + prog = info.get('prog', entry) |
| 415 | + test_name = f"{target}:{prog}" |
| 416 | + completed_tests.add(test_name) |
| 417 | + |
| 418 | + stdout = '' |
| 419 | + if os.path.exists(stdout_path): |
| 420 | + with open(stdout_path, encoding='utf-8') as fp: |
| 421 | + stdout = fp.read() |
| 422 | + |
| 423 | + # Determine result |
| 424 | + result = 'pass' |
| 425 | + if retcode == 4: |
| 426 | + result = 'skip' |
| 427 | + elif retcode != 0: |
| 428 | + result = 'fail' |
| 429 | + if 'ok' not in stdout.lower() and result == 'pass': |
| 430 | + result = 'skip' |
| 431 | + |
| 432 | + safe_name = re.sub(r'[^0-9a-zA-Z]+', '-', prog) |
| 433 | + if safe_name and safe_name[-1] == '-': |
| 434 | + safe_name = safe_name[:-1] |
405 | 435 |
|
406 | | - cases = [] |
407 | | - for r in raw_results: |
408 | 436 | outcome = { |
409 | | - 'test': r['test'], |
410 | | - 'group': r.get('group', 'selftests-hw'), |
411 | | - 'result': r['result'], |
| 437 | + 'test': safe_name or entry, |
| 438 | + 'group': f'selftests-{re.sub(r"[^0-9a-zA-Z]+", "-", target).rstrip("-")}', |
| 439 | + 'result': result, |
412 | 440 | 'link': link, |
413 | 441 | } |
414 | | - for key in ['time', 'crashes']: |
415 | | - if key in r: |
416 | | - outcome[key] = r[key] |
| 442 | + if 'time' in info: |
| 443 | + outcome['time'] = info['time'] |
417 | 444 | cases.append(outcome) |
418 | 445 |
|
| 446 | + # Check .attempted for crashed tests (attempted but no output) |
| 447 | + attempted_path = os.path.join(results_path, 'attempted.json') |
| 448 | + if os.path.exists(attempted_path): |
| 449 | + with open(attempted_path, encoding='utf-8') as fp: |
| 450 | + try: |
| 451 | + attempted = json.load(fp) |
| 452 | + except (json.JSONDecodeError, ValueError): |
| 453 | + attempted = [] |
| 454 | + |
419 | 455 | for test_name in attempted: |
420 | | - if test_name not in result_names: |
| 456 | + if test_name not in completed_tests: |
421 | 457 | cases.append({ |
422 | 458 | 'test': test_name, |
423 | 459 | 'group': 'selftests-hw', |
|
0 commit comments