|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# SPDX-License-Identifier: GPL-2.0 |
| 3 | + |
| 4 | +"""NIPA HW kselftest orchestrator service.""" |
| 5 | + |
| 6 | +import datetime |
| 7 | +import os |
| 8 | +import subprocess |
| 9 | +import sys |
| 10 | +import time |
| 11 | + |
| 12 | +# Add the project root to path for cross-package imports |
| 13 | +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..')) |
| 14 | + |
| 15 | +# pylint: disable=wrong-import-position,wrong-import-order |
| 16 | +# Imports below require sys.path manipulation for cross-package access. |
| 17 | + |
| 18 | +from core import NipaLifetime # noqa: E402 # pylint: disable=import-error |
| 19 | + |
| 20 | +from contest.remote.lib.cbarg import CbArg # noqa: E402 |
| 21 | +from contest.remote.lib.fetcher import Fetcher # noqa: E402 |
| 22 | + |
| 23 | +from lib.mc_client import MCClient, resolve_machines, resolve_nic_id # noqa: E402 |
| 24 | +from lib.deployer import (build_kernel, build_ksft, deploy_artifacts, # noqa: E402 |
| 25 | + kexec_machine, wait_for_results, fetch_results) |
| 26 | + |
| 27 | +# Config: |
| 28 | +# |
| 29 | +# [executor] |
| 30 | +# name=hwksft-nic0 |
| 31 | +# group=selftests-hw |
| 32 | +# init=force / continue / next |
| 33 | +# [remote] |
| 34 | +# branches=https://url-to-branches-manifest |
| 35 | +# [local] |
| 36 | +# base_path=/common/path |
| 37 | +# json_path=base-relative/path/to/json |
| 38 | +# results_path=base-relative/path/to/raw/outputs |
| 39 | +# tree_path=/root-path/to/kernel/git |
| 40 | +# patches_path=/root-path/to/patches/dir |
| 41 | +# [www] |
| 42 | +# url=https://url-to-reach-base-path |
| 43 | +# [hw] |
| 44 | +# nic_vendor=Intel |
| 45 | +# nic_model=E810-C |
| 46 | +# machine_control_url=http://control-node:5050 |
| 47 | +# reservation_retry_time=60 |
| 48 | +# max_kexec_boot_timeout=300 |
| 49 | +# max_test_time=3600 |
| 50 | +# crash_wait_time=120 |
| 51 | +# sol_poll_interval=15 |
| 52 | +# [build] |
| 53 | +# extra_kconfig=/path/to/nic-driver.config |
| 54 | +# [ksft] |
| 55 | +# target=net |
| 56 | + |
| 57 | + |
def _build_nic_deploy_info(all_nics, nic, machine_ip_map, fallback_ip):
    """Build the NIC deployment dict for the machine under test.

    Includes the NIC's own addresses and, when the NIC has a link peer,
    the peer's addresses and management IP (single pass over all_nics,
    instead of two separate lookups).
    """
    info = {
        'ifname': nic.get('ifname', ''),
        'ip4addr': nic.get('ip4addr', ''),
        'ip6addr': nic.get('ip6addr', ''),
    }
    peer_id = nic.get('peer_id')
    if peer_id:
        for n in all_nics:
            if n['id'] == peer_id:
                info['peer'] = {
                    'ifname': n.get('ifname', ''),
                    'ip4addr': n.get('ip4addr', ''),
                    'ip6addr': n.get('ip6addr', ''),
                }
                # Record peer machine IP so deployer can set REMOTE_ARGS;
                # fall back to the first test machine if the peer's machine
                # is not in the map.
                info['peer_machine_ip'] = machine_ip_map.get(
                    n['machine_id'], fallback_ip)
                break
    return info


def _reserve_with_retry(mc, machine_ids, config):
    """Reserve machines via machine-control, with capped exponential backoff.

    Returns the reservation id on success; raises RuntimeError after
    max_reservation_retries failed attempts. Does not sleep after the
    final failed attempt (no point waiting just to raise).
    """
    max_retries = config.getint('hw', 'max_reservation_retries', fallback=30)
    retry_time = config.getint('hw', 'reservation_retry_time', fallback=60)
    for attempt in range(max_retries):
        result = mc.reserve(machine_ids)
        if 'reservation_id' in result:
            return result['reservation_id']
        if attempt == max_retries - 1:
            break
        # Exponential backoff, capped at 5 minutes.
        wait = min(retry_time * (1.5 ** attempt), 300)
        print(f"Reserve failed ({result.get('error', '?')}), "
              f"retry {attempt+1}/{max_retries} in {wait:.0f}s")
        time.sleep(wait)
    raise RuntimeError(f"Failed to reserve machines after {max_retries} attempts")


def test(binfo, rinfo, cbarg):  # pylint: disable=unused-argument
    """Fetcher callback: build, deploy, run, and collect HW test results.

    Builds the kernel and kselftests, reserves the machines that host the
    configured NIC, deploys and kexecs into the new kernel, waits for the
    hw-worker, and fetches the results.

    Returns a list of result-case dicts; on build failure, a single
    'build' fail case is returned without touching any machines.
    """
    print("Run at", datetime.datetime.now())
    cbarg.refresh_config()
    config = cbarg.config

    results_path = os.path.join(config.get('local', 'base_path'),
                                config.get('local', 'results_path'),
                                rinfo['run-cookie'])
    os.makedirs(results_path, exist_ok=True)

    link = config.get('www', 'url') + '/' + \
        config.get('local', 'results_path') + '/' + \
        rinfo['run-cookie']
    rinfo['link'] = link
    grp_name = config.get('executor', 'group', fallback='selftests-hw')

    tree_path = config.get('local', 'tree_path')
    mc = MCClient(config.get('hw', 'machine_control_url'))

    # 1. Build kernel + ksft
    try:
        kernel_version = build_kernel(config, tree_path)
        build_ksft(config, tree_path)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Build failed: {e}")
        return [{
            'test': 'build',
            'group': grp_name,
            'result': 'fail',
            'link': link,
        }]

    # 2. Resolve machines for NIC
    all_nics = mc.get_nic_info()
    nic_id = resolve_nic_id(all_nics, config.get('hw', 'nic_vendor'),
                            config.get('hw', 'nic_model'))
    machine_ids, nic = resolve_machines(all_nics, nic_id)

    # 3. Get machine IPs for SSH/SCP
    all_machines = mc.get_machine_info()
    machine_ip_map = {m['id']: m['mgmt_ipaddr'] for m in all_machines}
    machine_ips = [machine_ip_map[mid] for mid in machine_ids]

    nic_deploy_info = _build_nic_deploy_info(all_nics, nic, machine_ip_map,
                                             machine_ips[0])

    # 4. Reserve machines (retry loop with backoff)
    reservation_id = _reserve_with_retry(mc, machine_ids, config)

    try:
        # 5. Deploy artifacts via SCP
        deploy_artifacts(config, machine_ips, reservation_id, nic_deploy_info,
                         tree_path, kernel_version)

        # 6. kexec into new kernel
        kexec_machine(config, machine_ips, reservation_id)

        # 7. Wait for hw-worker with crash monitoring
        wait_for_results(config, mc, reservation_id, machine_ids, machine_ips)

        # 8. Copy back results
        cases = fetch_results(config, machine_ips, reservation_id, rinfo)
    finally:
        # 9. Release reservation — best-effort; never let a close failure
        # mask the real error from the try block.
        try:
            mc.reservation_close(reservation_id)
        except Exception as e:  # pylint: disable=broad-except
            print(f"Warning: failed to close reservation {reservation_id}: {e}")

    print("Done at", datetime.datetime.now())
    return cases
| 166 | + |
| 167 | + |
def main():
    """Service entry point: configure and run the Fetcher poll loop."""
    # Default configs first; any extra paths from the command line override.
    config_files = ['hw.config', 'hwksft.config'] + sys.argv[1:]

    cbarg = CbArg(config_files)
    config = cbarg.config

    base_dir = config.get('local', 'base_path')
    json_path = config.get('local', 'json_path')

    life = NipaLifetime(config)

    fetcher = Fetcher(test, cbarg,
                      name=config.get('executor', 'name'),
                      branches_url=config.get('remote', 'branches'),
                      results_path=os.path.join(base_dir, json_path),
                      url_path=config.get('www', 'url') + '/' + json_path,
                      tree_path=config.get('local', 'tree_path'),
                      patches_path=config.get('local', 'patches_path',
                                              fallback=None),
                      life=life,
                      first_run=config.get('executor', 'init',
                                           fallback="continue"))
    fetcher.run()
    life.exit()
| 192 | + |
| 193 | + |
# Run the service when executed directly (no effect on import).
if __name__ == '__main__':
    main()
0 commit comments