diff --git a/.claude/skills/launch-prep/SKILL.md b/.claude/skills/launch-prep/SKILL.md
index 0753652f2..894191d3c 100644
--- a/.claude/skills/launch-prep/SKILL.md
+++ b/.claude/skills/launch-prep/SKILL.md
@@ -90,10 +90,36 @@ If `ruff format` changed files: `git diff --name-only`, then `git add <those fil
 
 ```bash
 source .env 2>/dev/null || true
-.venv/bin/python -m pytest -m live tests/test_smoke.py -v
+.venv/bin/python -m pytest -m live tests/test_smoke.py -v -ra \
+  --junitxml=/tmp/smoke.xml
+# A skipped live smoke is NOT green — exit 0 on a run that never executed
+# would false-green the e2e gate. pytest puts tests/skipped on the nested
+# <testsuite> elements, so sum over them and fail unless one ran clean.
+.venv/bin/python -c 'import sys,xml.etree.ElementTree as ET; \
+  es=list(ET.parse("/tmp/smoke.xml").getroot().iter("testsuite")); \
+  t=sum(int(e.get("tests",0)) for e in es); \
+  s=sum(int(e.get("skipped",0)) for e in es); \
+  sys.exit(0) if t and not s else (print("LIVE SMOKE SKIPPED OR NOT RUN — gate is RED, not green") or sys.exit(1))'
 ```
 
-If Docker is unavailable, warn and ask to skip or abort — do not skip silently. Expected: `test_hello_world_smoke` passes with reward > 0 and non-empty trajectory.
+The live smoke `skipif`s when Docker is down or the chosen model has no
+credential, and pytest exits `0` on a skip. The JUnit check above turns that
+silent pass into a hard failure so a skipped smoke cannot false-green the gate.
+Expected: `test_hello_world_smoke` passes with reward > 0 and non-empty
+trajectory.
+
+If the only credential on the machine is non-Anthropic, point the smoke at an
+agent/model it can authenticate instead of skipping (proven combo:
+openhands + deepseek):
+
+```bash
+export BENCHFLOW_SMOKE_AGENT=openhands
+export BENCHFLOW_SMOKE_MODEL=deepseek/deepseek-chat
+export DEEPSEEK_API_KEY=...  DEEPSEEK_BASE_URL=https://api.deepseek.com
+```
+
+`BENCHFLOW_SMOKE_AGENT` and `BENCHFLOW_SMOKE_MODEL` must be set together; the
+skip reason names the exact missing credential for whichever model is selected.
 
 ---
 
diff --git a/README.md b/README.md
index 59cf79632..4f06e47b3 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,7 @@ Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/
 | Understand Rollout / Scene / Role / Verifier | [Concepts](./docs/concepts.md) |
 | Author a new task | [Task authoring](./docs/task-authoring.md) |
 | Author a task in the native `task.md` format | [Native task.md authoring](./docs/task-authoring-task-md.md) |
+| Adopt an upstream benchmark into BenchFlow | [Benchmark adoption](./docs/benchmark-adoption.md) |
 | Run a hosted PrimeIntellect / Verifiers environment | [CLI reference](./docs/reference/cli.md) |
 | Multi-agent: coder + reviewer, simulated user, BYOS, stateful envs | [Use cases](./docs/use-cases.md) |
 | Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
diff --git a/docs.json b/docs.json
index 32e9b1157..31bca8cfb 100644
--- a/docs.json
+++ b/docs.json
@@ -21,6 +21,7 @@
         "group": "Guides",
         "pages": [
           "docs/running-benchmarks",
+          "docs/benchmark-adoption",
           "docs/continue-runs",
           "docs/task-authoring",
           "docs/task-authoring-task-md",
diff --git a/docs/benchmark-adoption.md b/docs/benchmark-adoption.md
new file mode 100644
index 000000000..804bce7ec
--- /dev/null
+++ b/docs/benchmark-adoption.md
@@ -0,0 +1,330 @@
+# Benchmark adoption
+
+Adopt an upstream benchmark into a BenchFlow benchmark with `bench agent`.
+
+## What the router is
+
+`bench agent` is the benchmark-adoption router. It *routes* an external
+benchmark into `benchmarks/<name>/` — scaffold, codex-driven conversion, and a
+parity gate — so the result is a first-class BenchFlow benchmark. It sits
+upstream of evaluation: the router *adopts*, while `bench eval create` *runs*
+the resulting tasks. Once `bench agent verify <name>` reports
+`parity-confirmed`, you point `bench eval create` at the converted tasks and run
+them like any other benchmark.
+
+Three subcommands form the adopt → verify loop:
+
+```
+$ bench agent --help
+╭─ Commands ───────────────────────────────────────────────────────────────────╮
+│ create   Scaffold benchmarks/<name>/ for a new benchmark adoption.           │
+│ run      Drive the CONVERT.md workflow by launching the host codex CLI.      │
+│ verify   Run the parity gate for an adopted benchmark; emit a verdict.       │
+╰──────────────────────────────────────────────────────────────────────────────╯
+```
+
+The reference for what a finished adoption looks like is
+[`benchmarks/programbench/`](../benchmarks/programbench/); the conversion
+contract is [`benchmarks/CONVERT.md`](../benchmarks/CONVERT.md). The router
+embeds both into the conversion workflow for you.
+
+## `bench agent create <name>`
+
+`create` writes a deterministic scaffold under `benchmarks/<name>/`, matching
+the reference layout and the CONVERT.md contract. Use `--benchmarks-dir` to
+target a directory other than the repo's `benchmarks/`:
+
+```
+$ bench agent create webarena-lite --benchmarks-dir /tmp/router-docs/benchmarks
+Scaffolded /tmp/router-docs/benchmarks/webarena-lite
+  README.md
+  __init__.py
+  benchflow.py
+  benchmark.yaml
+  main.py
+  parity_experiment.json
+  parity_test.py
+  run_webarena_lite.py
+  webarena-lite.yaml
+```
+
+That produces this tree:
+
+```
+webarena-lite/
+├── __init__.py
+├── benchflow.py            # converter: source instances → Harbor task dirs
+├── main.py                 # converter CLI delegator
+├── parity_test.py          # structural / eval / side-by-side parity checks
+├── parity_experiment.json  # recorded parity results (read by verify)
+├── benchmark.yaml          # standard benchmark descriptor
+├── run_webarena_lite.py    # runner: convert, then evaluate via BenchFlow
+├── webarena-lite.yaml      # BenchFlow job config (how to run)
+└── README.md               # generated workflow notes
+```
+
+What each file is for:
+
+- **`benchflow.py`** — the converter. Its documented `convert()` /
+  `convert_all()` entry points are `NotImplementedError` stubs that point at
+  CONVERT.md step 2; you fill them in to map each source instance to a
+  Harbor-format task directory (`task.toml`, `instruction.md`,
+  `environment/Dockerfile`, `tests/test.sh`).
+- **`parity_test.py`** — the parity harness, with `--mode full | eval-parity |
+  side-by-side` (CONVERT.md steps 3–5). Side-by-side parity records the
+  per-criterion `original_verdict` / `adapted_verdict` pairs that `verify`
+  scores.
+- **`parity_experiment.json`** — the recorded parity results `verify` reads. The
+  scaffold writes a `status: "template"` placeholder with empty
+  `conversion_parity.tasks` and `reward_distribution_parity.samples`; you
+  populate it from a real parity run.
+- **`benchmark.yaml`** — the standard descriptor (name, conversion method,
+  verification method, parity tallies). Fields start as `TODO`/`0`.
+
+`main.py`, `run_webarena_lite.py`, and `webarena-lite.yaml` are the converter
+CLI delegator, the convert-then-evaluate runner, and the BenchFlow job config
+respectively.
+
+### Fail-closed behavior
+
+`create` refuses to overwrite an existing benchmark — re-running it is an error,
+not a silent clobber:
+
+```
+$ bench agent create webarena-lite --benchmarks-dir /tmp/router-docs/benchmarks
+benchmark already exists: /tmp/router-docs/benchmarks/webarena-lite (refusing to
+overwrite)
+```
+
+Names must be lowercase slugs (leading letter, single internal hyphens). The
+slug is also the security floor — it keeps `create`/`verify` from being steered
+outside `benchmarks/`. An uppercase or underscored name is rejected:
+
+```
+$ bench agent create WebArena_Lite --benchmarks-dir /tmp/router-docs/benchmarks
+invalid benchmark name 'WebArena_Lite': use a lowercase slug like 'my-bench'
+(letters/digits, single internal hyphens, leading letter)
+```
+
+Both fail-closed cases exit non-zero.
+
+## `bench agent run <source> [--name]`
+
+`run` drives the conversion. It assembles an adoption prompt — the source, the
+target `benchmarks/<name>/` path, the adoption skills (CONVERT.md, the
+programbench worked example, the parity harness), and the full embedded
+CONVERT.md guide — then launches the host `codex` CLI to do the conversion
+toward a pull request. If you omit `--name`, the slug is derived from the source
+basename (so `.../webarena` becomes `webarena`).
+
+Use `--dry-run` to print the exact command the router would launch without
+running it:
+
+```
+$ bench agent run https://github.com/web-arena-x/webarena --name webarena-lite --dry-run
+codex exec --cd /path/to/benchflow --skip-git-repo-check --sandbox workspace-write '# Benchmark adoption: webarena-lite
+
+Adopt the source benchmark below into a BenchFlow benchmark by
+following the conversion guide. Produce the converter, parity tests,
+metadata, and task directories, then open a pull request.
+
+Source benchmark: https://github.com/web-arena-x/webarena
+Target directory: benchmarks/webarena-lite/
+
+## Adoption skills
+- conversion-guide: benchmarks/CONVERT.md
+- reference-benchmark: benchmarks/programbench/ (worked example)
+- parity-harness: parity_test.py + parity_experiment.json (verify gate)
+
+## Conversion guide (benchmarks/CONVERT.md)
+
+# Benchmark Conversion Guide
+...
+## Definition of done
+- benchmarks/webarena-lite/ has benchflow.py, parity_test.py,
+  parity_experiment.json, benchmark.yaml, run_webarena_lite.py,
+  README.md
+- `bench agent verify webarena-lite` reports parity-confirmed'
+```
+
+The full prompt embeds CONVERT.md verbatim (elided above). The `codex exec`
+argv is constructed deterministically: it runs in the repo root
+(`--cd <repo>`), with `--skip-git-repo-check` and
+`--sandbox workspace-write`. Pass `--model` to set the codex driver model and
+`--codex-bin` to point at a different codex binary.
+
+A live run (drop `--dry-run`) requires codex credentials and fails closed
+without them — set `OPENAI_API_KEY` (or `CODEX_API_KEY`), or run `codex login`
+to create `~/.codex/auth.json`. Without credentials `run` errors before
+assembling any context:
+
+```
+codex needs credentials to launch: set OPENAI_API_KEY (or CODEX_API_KEY), or run
+`codex login` to create ~/.codex/auth.json
+```
+
+The codex run is the manual-validation step — it iterates on the converter and
+parity tests until `bench agent verify` confirms parity.
+
+## `bench agent verify <name>`
+
+`verify` is the gate that closes the loop. It reads the adopted benchmark's
+`parity_experiment.json` and emits a confidence verdict. The gate is *parity
+only*: a faithful conversion must reproduce the original's behavior on identical
+inputs — including any reward-hackability the original has. It never "improves"
+or sanitizes the source.
+
+It scores two layers:
+
+- **Conversion parity (deterministic floor)** — every compared criterion's
+  converted verdict must match the original's verdict on identical inputs.
+- **Reward-distribution parity (statistical layer)** — every
+  legacy-vs-converted reward delta must sit within `--tolerance` (default
+  `0.02`).
+
+A layer with no recorded data does not block the verdict. The three verdicts:
+
+| Verdict | Meaning |
+| --- | --- |
+| `parity-confirmed` | Every recorded layer agrees; high-confidence the conversion is faithful. |
+| `parity-divergent` | A criterion disagrees or a reward delta exceeds tolerance. |
+| `insufficient-evidence` | No recorded comparisons at all — run `parity_test.py` and record results first. |
+
+A freshly scaffolded benchmark has no recorded parity, so it is
+`insufficient-evidence` and exits non-zero:
+
+```
+$ bench agent verify webarena-lite --benchmarks-dir /tmp/router-docs/benchmarks
+Verdict: insufficient-evidence
+  conversion: 0/0 criteria agree (rate 0.0000)
+Insufficient evidence: no recorded parity comparisons. Run parity_test.py and
+record results before trusting the conversion.
+...
+```
+
+### A parity-confirmed run
+
+Populate `parity_experiment.json` from a parity run. `verify` reads
+per-criterion verdicts under `conversion_parity.tasks` and reward samples under
+`reward_distribution_parity.samples`:
+
+```json
+{
+  "experiment": "side-by-side-parity",
+  "benchmark": "webarena-lite",
+  "status": "recorded",
+  "judge_model": "gemini-3.1-flash-lite",
+  "conversion_parity": {
+    "tasks": [
+      {
+        "task_id": "shopping-001",
+        "n_criteria": 2,
+        "criteria_results": [
+          {"criterion_id": "C-001", "original_verdict": "pass", "adapted_verdict": "pass", "agreement": true},
+          {"criterion_id": "C-002", "original_verdict": "fail", "adapted_verdict": "fail", "agreement": true}
+        ]
+      },
+      {
+        "task_id": "reddit-002",
+        "n_criteria": 1,
+        "criteria_results": [
+          {"criterion_id": "C-001", "original_verdict": "pass", "adapted_verdict": "pass", "agreement": true}
+        ]
+      }
+    ]
+  },
+  "reward_distribution_parity": {
+    "samples": [
+      {"task_id": "shopping-001", "legacy_reward": 0.50, "converted_reward": 0.50},
+      {"task_id": "reddit-002", "legacy_reward": 1.00, "converted_reward": 1.00}
+    ]
+  }
+}
+```
+
+With every criterion agreeing and every reward delta at zero, the verdict is
+`parity-confirmed` and `verify` exits zero:
+
+```
+$ bench agent verify webarena-lite --benchmarks-dir /tmp/router-docs/benchmarks
+Verdict: parity-confirmed
+  conversion: 3/3 criteria agree (rate 1.0000)
+  reward: max abs delta 0.0000 (tolerance 0.0200)
+High-confidence: the converted evaluation reproduces the original's verdicts on
+every compared criterion and stays within reward tolerance.
+```
+
+### A parity-divergent run
+
+Flip one criterion so the converted verdict no longer matches the original
+(here `C-002`'s `adapted_verdict` goes from `fail` to `pass`). The deterministic
+floor trips, the verdict becomes `parity-divergent`, and `verify` prints a draft
+GitHub issue body for the support path:
+
+```
+$ bench agent verify webarena-lite --benchmarks-dir /tmp/router-docs/benchmarks
+Verdict: parity-divergent
+  conversion: 2/3 criteria agree (rate 0.6667)
+  reward: max abs delta 0.0000 (tolerance 0.0200)
+Divergence found: the conversion does not yet reproduce the original's behavior
+— iterate, then open an issue for support.
+## Benchmark adoption parity: webarena-lite
+
+**Verdict:** parity-divergent
+
+Divergence found: the conversion does not yet reproduce the original's behavior
+— iterate, then open an issue for support.
+
+### Conversion parity (deterministic floor)
+- criteria compared: 3
+- agreed: 2
+- agreement rate: 0.6667
+  - shopping-001/C-002: original=fail converted=pass
+
+### Reward-distribution parity (statistical layer)
+- samples: 2
+- max abs delta: 0.0000
+- tolerance: 0.0200
+
+### Ask
+Parity could not be closed for this conversion. The translation must
+reproduce the original's behavior on identical inputs (including any
+reward-hackability it has). This draft has NOT been filed — review it,
+iterate on the converter, and open it manually if you need support.
+```
+
+The draft is **never filed automatically** — it is printed for a human to
+review and open if they need support. Pass `--issue-out PATH` to write it to a
+file instead of stdout:
+
+```
+$ bench agent verify webarena-lite --benchmarks-dir /tmp/router-docs/benchmarks --issue-out /tmp/router-docs/divergence.md
+Verdict: parity-divergent
+  ...
+Issue draft written to /tmp/router-docs/divergence.md
+```
+
+### The `--roundtrip-task` structural hook
+
+By default `verify` scores the recorded `parity_experiment.json` at the
+benchmark level. Pass `--roundtrip-task <task-dir>` to also run the structural
+round-trip conformance check on one concrete task tree (it reuses the existing
+Harbor round-trip parity utility). It is opt-in because that harness needs a
+concrete task directory, which the benchmark-level verdict does not require.
+
+`verify` exits non-zero for `parity-divergent` and `insufficient-evidence`, and
+errors if the benchmark was never adopted:
+
+```
+$ bench agent verify nonexistent-bench --benchmarks-dir /tmp/router-docs/benchmarks
+benchmark not adopted: /tmp/router-docs/benchmarks/nonexistent-bench — run
+`bench agent create nonexistent-bench` first
+```
+
+## From adoption to evaluation
+
+Once `verify` reports `parity-confirmed`, the benchmark is a normal BenchFlow
+benchmark: run its tasks with `bench eval create` (see
+[Running benchmarks](./running-benchmarks.md)), using the job config the
+scaffold generated. The router's job ends at `parity-confirmed`; evaluation
+takes it from there.
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
index 0a8283a8a..07049f23d 100644
--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -339,6 +339,11 @@ than 24 hours; use `--dry-run` to preview what would be deleted.
 bench environment cleanup --dry-run --max-age 1440
 ```
 
+Daytona-backed evals also reap orphaned sandboxes automatically at run start
+(failure states such as `BUILD_FAILED` are reaped sooner than healthy ones, and
+concurrent live runs are never touched). Set `BENCHFLOW_DAYTONA_AUTO_REAP=0` to
+disable that automatic pass and rely on the manual command above.
+
 ## bench compat
 
 Third-party framework compatibility checks.
diff --git a/docs/running-benchmarks.md b/docs/running-benchmarks.md
index d679871b4..242466da8 100644
--- a/docs/running-benchmarks.md
+++ b/docs/running-benchmarks.md
@@ -530,17 +530,37 @@ jobs/
     └── ...
 ```
 
-The `result.json` contains:
+The `result.json` contains (abridged):
 ```json
 {
   "rewards": {"reward": 0.48},
   "n_tool_calls": 12,
   "n_skill_invocations": 2,
-  "passed": true,
-  "verifier_output": "..."
+  "agent_result": {"total_tokens": 23993, "cost_usd": 0.07, "usage_source": "provider_response"},
+  "final_metrics": {"total_prompt_tokens": 18000, "total_completion_tokens": 5993},
+  "error": null,
+  "verifier_error": null
 }
 ```
 
+**Canonical fields — read these, not invented top-level ones.** The reward,
+token totals, and outcome each live in exactly one place:
+
+| You want | Read | Notes |
+| --- | --- | --- |
+| reward | `rewards.reward` | scalar 0.0–1.0, or `null` if unscored |
+| token total | `agent_result.total_tokens` | `null` when no provider usage was captured (e.g. hosted runs) |
+| outcome / status | derived from `rewards.reward` + `error`/`verifier_error` | not stored as a field; see below |
+
+There is intentionally **no top-level `reward`, `total_tokens`, or `status`
+key** — those names are absent, not `null`. A naive consumer doing
+`result["reward"]` or `result.get("total_tokens")` gets `None` because the key
+does not exist, never because the value is null. Pass/fail is a *derived*
+classification (only `reward == 1.0` passes); BenchFlow computes it from
+`rewards.reward` plus the error channels rather than persisting a redundant
+`status`. The same nested shape is produced for both native rollouts and
+hosted-env runs, so one reader handles both.
+
 `n_skill_invocations` is derived from structured ACP trajectory events: BenchFlow
 counts only `tool_call` events whose `kind` is `skill`. Job `summary.json`
 also includes `total_skill_invocations` and `avg_skill_invocations` across the
diff --git a/tests/test_agent_router_cli_e2e.py b/tests/test_agent_router_cli_e2e.py
new file mode 100644
index 000000000..36c901fa9
--- /dev/null
+++ b/tests/test_agent_router_cli_e2e.py
@@ -0,0 +1,342 @@
+"""End-to-end CLI integration tests for the ``bench agent`` adoption router.
+
+These tests drive the *real* registered ``bench agent create|run|verify``
+commands through ``typer``'s ``CliRunner`` against an on-disk benchmarks tree in
+a tmp dir — no fake exec/report layer on the create/verify path. They pin
+behavior (CLI exit codes, verdict strings, on-disk file contents) so a mutation
+to the router's logic shows up as a failing assertion rather than passing by
+coincidence.
+
+The unit suite in ``test_agent_router.py`` covers the pure functions; this file
+covers the wired-up CLI surface: the scaffold actually lands on disk and the
+generated Python compiles + YAML parses; create refuses on re-run; bad slugs are
+rejected by the real CLI; a realistic parity record flips the verify verdict and
+exit code; and the live (non-dry-run) ``run`` path fails closed on missing
+credentials without ever spawning ``codex``.
+"""
+
+from __future__ import annotations
+
+import json
+import py_compile
+from pathlib import Path
+
+import click
+import pytest
+import yaml
+from click.testing import Result
+from typer.testing import CliRunner
+
+import benchflow.agent_router as agent_router
+from benchflow.cli.main import app
+
+# Every generated file the scaffold must land for slug ``my-bench``. Runner file
+# and job-yaml names embed the slug, so this set also locks the naming scheme.
+_EXPECTED_SCAFFOLD_FILES = {
+    "__init__.py",
+    "benchflow.py",
+    "main.py",
+    "parity_test.py",
+    "run_my_bench.py",
+    "my-bench.yaml",
+    "benchmark.yaml",
+    "parity_experiment.json",
+    "README.md",
+}
+
+
+def _run(runner: CliRunner, *args: str) -> Result:
+    return runner.invoke(app, list(args))
+
+
+# ── create: real scaffold lands, compiles, and parses ─────────────────
+
+
+def test_cli_create_writes_full_scaffold_that_compiles_and_parses(
+    tmp_path: Path,
+) -> None:
+    runner = CliRunner()
+    result = _run(
+        runner, "agent", "create", "my-bench", "--benchmarks-dir", str(tmp_path)
+    )
+    assert result.exit_code == 0, result.output
+
+    target = tmp_path / "my-bench"
+    on_disk = {p.name for p in target.iterdir()}
+    assert on_disk == _EXPECTED_SCAFFOLD_FILES
+
+    # Every generated Python module must byte-compile (catches template typos).
+    for rel in _EXPECTED_SCAFFOLD_FILES:
+        if rel.endswith(".py"):
+            py_compile.compile(str(target / rel), doraise=True)
+
+    # The converter module exposes the documented entry points with no leftover
+    # template tokens — the slug substitution actually ran.
+    converter = (target / "benchflow.py").read_text()
+    assert "def convert(" in converter
+    assert "def convert_all(" in converter
+    assert "{{NAME}}" not in converter and "{{TITLE}}" not in converter
+
+    # Both YAML files parse and carry the substituted slug in their key fields.
+    descriptor = yaml.safe_load((target / "benchmark.yaml").read_text())
+    assert descriptor["name"] == "my-bench"
+    job = yaml.safe_load((target / "my-bench.yaml").read_text())
+    assert job["tasks_dir"] == "benchmarks/my-bench/tasks"
+
+    # The scaffolded parity record is valid JSON in the template state.
+    parity = json.loads((target / "parity_experiment.json").read_text())
+    assert parity["benchmark"] == "my-bench"
+    assert parity["status"] == "template"
+
+
+def test_cli_create_refuses_existing_benchmark_with_nonzero_exit(
+    tmp_path: Path,
+) -> None:
+    runner = CliRunner()
+    first = _run(
+        runner, "agent", "create", "my-bench", "--benchmarks-dir", str(tmp_path)
+    )
+    assert first.exit_code == 0, first.output
+    sentinel = tmp_path / "my-bench" / "README.md"
+    original = sentinel.read_text()
+
+    again = _run(
+        runner, "agent", "create", "my-bench", "--benchmarks-dir", str(tmp_path)
+    )
+    assert again.exit_code == 1
+    assert "already exists" in click.unstyle(again.output)
+    # Fail-closed refusal: the existing scaffold is left untouched.
+    assert sentinel.read_text() == original
+
+
+@pytest.mark.parametrize(
+    "bad_name",
+    [
+        "MyBench",  # uppercase
+        "1bench",  # leading digit
+        "a/b",  # path separator
+    ],
+)
+def test_cli_create_rejects_invalid_slug_without_writing(
+    tmp_path: Path, bad_name: str
+) -> None:
+    runner = CliRunner()
+    result = _run(
+        runner, "agent", "create", bad_name, "--benchmarks-dir", str(tmp_path)
+    )
+    assert result.exit_code == 1
+    assert "invalid benchmark name" in click.unstyle(result.output)
+    # Nothing was written for the rejected slug.
+    assert list(tmp_path.iterdir()) == []
+
+
+# ── verify: realistic parity records flip verdict and exit code ───────
+
+
+def _confirmed_parity_record() -> dict:
+    """A realistic 'parity-confirmed' record (programbench-shaped).
+
+    Every compared criterion agrees (deterministic floor) and every
+    legacy-vs-converted reward delta is zero (statistical layer), so the gate
+    must return ``parity-confirmed``.
+    """
+    return {
+        "experiment": "side-by-side-parity",
+        "benchmark": "my-bench",
+        "status": "parity-confirmed",
+        "judge_model": "gemini-3.1-flash-lite-preview",
+        "conversion_parity": {
+            "tasks": [
+                {
+                    "task_id": "abishekvashok__cmatrix.5c082c6",
+                    "criteria_results": [
+                        {
+                            "criterion_id": "C-001",
+                            "original_verdict": "pass",
+                            "adapted_verdict": "pass",
+                            "agreement": True,
+                        },
+                        {
+                            "criterion_id": "C-002",
+                            "original_verdict": "fail",
+                            "adapted_verdict": "fail",
+                            "agreement": True,
+                        },
+                    ],
+                },
+            ],
+        },
+        "agent_parity": {
+            "results": [
+                {
+                    "task_id": "ajeetdsouza__zoxide.67ca1bc",
+                    "programbench": {"reward": 1.0},
+                    "benchflow": {"reward": 1.0},
+                },
+                {
+                    "task_id": "anordal__shellharden.6a6ffd4",
+                    "programbench": {"reward": 0.9992},
+                    "benchflow": {"reward": 0.9992},
+                },
+            ],
+        },
+    }
+
+
+def _divergent_parity_record() -> dict:
+    """A record where the converted verdict diverges from the original."""
+    return {
+        "experiment": "side-by-side-parity",
+        "benchmark": "my-bench",
+        "conversion_parity": {
+            "tasks": [
+                {
+                    "task_id": "abishekvashok__cmatrix.5c082c6",
+                    "criteria_results": [
+                        {
+                            "criterion_id": "C-001",
+                            "original_verdict": "pass",
+                            "adapted_verdict": "fail",
+                            "agreement": False,
+                        },
+                    ],
+                },
+            ],
+        },
+    }
+
+
+def test_cli_create_then_verify_confirmed_record_exits_zero(tmp_path: Path) -> None:
+    runner = CliRunner()
+    create = _run(
+        runner, "agent", "create", "my-bench", "--benchmarks-dir", str(tmp_path)
+    )
+    assert create.exit_code == 0, create.output
+
+    parity_file = tmp_path / "my-bench" / "parity_experiment.json"
+    parity_file.write_text(json.dumps(_confirmed_parity_record(), indent=2))
+
+    verify = _run(
+        runner, "agent", "verify", "my-bench", "--benchmarks-dir", str(tmp_path)
+    )
+    assert verify.exit_code == 0, verify.output
+    out = click.unstyle(verify.output)
+    assert "Verdict:" in out
+    assert "parity-confirmed" in out
+    # The confirmed path reports full criterion agreement, not a divergence.
+    assert "2/2 criteria agree" in out
+    assert "parity-divergent" not in out
+
+
+def test_cli_verify_divergent_record_exits_nonzero_and_writes_issue_draft(
+    tmp_path: Path,
+) -> None:
+    runner = CliRunner()
+    create = _run(
+        runner, "agent", "create", "my-bench", "--benchmarks-dir", str(tmp_path)
+    )
+    assert create.exit_code == 0, create.output
+
+    parity_file = tmp_path / "my-bench" / "parity_experiment.json"
+    parity_file.write_text(json.dumps(_divergent_parity_record(), indent=2))
+
+    issue_out = tmp_path / "divergence-issue.md"
+    verify = _run(
+        runner,
+        "agent",
+        "verify",
+        "my-bench",
+        "--benchmarks-dir",
+        str(tmp_path),
+        "--issue-out",
+        str(issue_out),
+    )
+    assert verify.exit_code == 1
+    assert "parity-divergent" in click.unstyle(verify.output)
+
+    # --issue-out writes a non-empty draft naming the failing criterion, and the
+    # draft states it was not auto-filed.
+    assert issue_out.exists()
+    issue = issue_out.read_text()
+    assert issue.strip()
+    assert "parity-divergent" in issue
+    assert "original=pass converted=fail" in issue
+    assert "NOT been filed" in issue
+
+
+def test_cli_verify_fresh_scaffold_is_insufficient_evidence(tmp_path: Path) -> None:
+    runner = CliRunner()
+    create = _run(
+        runner, "agent", "create", "my-bench", "--benchmarks-dir", str(tmp_path)
+    )
+    assert create.exit_code == 0, create.output
+
+    # A fresh scaffold ships a template parity record with no comparisons, so the
+    # gate withholds confidence and exits non-zero down the support path.
+    verify = _run(
+        runner, "agent", "verify", "my-bench", "--benchmarks-dir", str(tmp_path)
+    )
+    assert verify.exit_code == 1
+    out = click.unstyle(verify.output)
+    assert "insufficient-evidence" in out
+    assert "NOT been filed" in out
+
+
+# ── run: dry-run prints the command; live path fails closed ───────────
+
+
+def test_cli_run_dry_run_prints_codex_command_with_context_markers(
+    tmp_path: Path,
+) -> None:
+    runner = CliRunner()
+    result = _run(
+        runner,
+        "agent",
+        "run",
+        "github.com/foo/bar",
+        "--name",
+        "my-bench",
+        "--dry-run",
+    )
+    assert result.exit_code == 0, result.output
+    out = click.unstyle(result.output)
+    # The constructed (but un-launched) codex command carries the source and the
+    # CONVERT.md adoption context — the prompt the live run would have used.
+    assert "codex" in out
+    assert "exec" in out
+    assert "github.com/foo/bar" in out
+    assert "CONVERT.md" in out
+    assert "Benchmark adoption" in out
+    assert "benchmarks/my-bench/" in out
+
+
+def test_cli_run_live_path_fails_closed_without_credentials(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """The non-dry-run path refuses to launch when no codex credentials exist.
+
+    We force a credential-free host (no API-key env vars, auth file pointed at a
+    missing path) and replace the real subprocess exec with a tripwire, so the
+    test proves the fail-closed gate fires *before* anything is spawned —
+    independent of whatever credentials the host machine actually has.
+    """
+    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+    monkeypatch.delenv("CODEX_API_KEY", raising=False)
+    monkeypatch.setattr(
+        agent_router, "_default_codex_auth_file", lambda: tmp_path / "absent-auth.json"
+    )
+
+    def _tripwire(*_args: object, **_kwargs: object) -> int:
+        raise AssertionError("codex was spawned despite missing credentials")
+
+    monkeypatch.setattr(agent_router, "_subprocess_exec", _tripwire)
+
+    result = _run(
+        CliRunner(), "agent", "run", "github.com/foo/bar", "--name", "my-bench"
+    )
+    assert result.exit_code == 1
+    out = click.unstyle(result.output)
+    # The precise key/login guidance, unwrapped across rich's line-wrapping.
+    assert "codex needs credentials to launch" in out
+    assert "OPENAI_API_KEY" in out
+    assert "CODEX_API_KEY" in out
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index 8cd09146c..2e7d795fb 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -7,6 +7,19 @@
 the ``addopts = "-m 'not live'"`` filter in pyproject.toml applies to direct
 file invocation too.
 
+Default agent/model is ``claude-agent-acp`` + Haiku 4.5. A contributor whose
+only credential is non-Anthropic can point the smoke at an agent/model they can
+actually authenticate via two env vars (proven combo: openhands + deepseek):
+
+    export BENCHFLOW_SMOKE_AGENT=openhands
+    export BENCHFLOW_SMOKE_MODEL=deepseek/deepseek-chat
+    export DEEPSEEK_API_KEY=...   DEEPSEEK_BASE_URL=https://api.deepseek.com
+    pytest -m live tests/test_smoke.py
+
+The skip reason is always specific (which credential is missing for the chosen
+model) so a release gate can grep the pytest summary and refuse to call a
+skipped live smoke "green" — see ``docs/release.md`` and the launch-prep skill.
+
 Importing ``benchflow.sdk`` triggers ``_patch_dind()`` at sdk.py:135.
 That patch is gated on ``/.dockerenv`` and runs ``docker info`` with a 5s
 timeout, swallowing all exceptions — safe but worth flagging.
@@ -36,6 +49,73 @@
 HELLO_TASK = Path(__file__).parent / "examples" / "hello-world-task"
 SMOKE_JOBS_BASE = Path(__file__).parent / ".smoke-jobs"
 
+DEFAULT_SMOKE_AGENT = "claude-agent-acp"
+DEFAULT_SMOKE_MODEL = "claude-haiku-4-5-20251001"
+
+SMOKE_AGENT_ENV = "BENCHFLOW_SMOKE_AGENT"
+SMOKE_MODEL_ENV = "BENCHFLOW_SMOKE_MODEL"
+
+
+def resolve_smoke_target() -> tuple[str, str]:
+    """Return the (agent, model) the live smoke will run.
+
+    Defaults to claude-agent-acp + Haiku 4.5. ``BENCHFLOW_SMOKE_AGENT`` and
+    ``BENCHFLOW_SMOKE_MODEL`` override both (the escape hatch for contributors
+    without an Anthropic credential). Both must be set together; setting only
+    one is a configuration error, not a silent fall-back to the Anthropic
+    default the contributor cannot authenticate.
+    """
+    agent = os.environ.get(SMOKE_AGENT_ENV)
+    model = os.environ.get(SMOKE_MODEL_ENV)
+    if (agent is None) != (model is None):
+        missing = SMOKE_MODEL_ENV if agent is not None else SMOKE_AGENT_ENV
+        raise RuntimeError(
+            f"{SMOKE_AGENT_ENV} and {SMOKE_MODEL_ENV} must be set together; "
+            f"{missing} is unset."
+        )
+    if agent and model:
+        return agent, model
+    return DEFAULT_SMOKE_AGENT, DEFAULT_SMOKE_MODEL
+
+
+def _missing_model_credentials(model: str) -> str | None:
+    """Return a reason string if the chosen model has no usable credential.
+
+    The default Anthropic model accepts either ``ANTHROPIC_API_KEY`` or
+    ``~/.claude/.credentials.json`` (subscription OAuth) — the dual path
+    ``resolve_agent_env`` honors for claude-agent-acp. Any other model is
+    validated against the credential env vars its registered provider needs:
+    the inferred API key plus every ``url_params`` env var (e.g. deepseek's
+    ``DEEPSEEK_BASE_URL``), so the skip reason names the missing var.
+    """
+    from benchflow.agents.providers import find_provider
+    from benchflow.agents.registry import infer_env_key_for_model
+
+    provider = find_provider(model)
+    if provider is None:
+        # No registered provider: only the built-in Anthropic dual path is
+        # special-cased; the heuristic key still applies as a fallback.
+        required = infer_env_key_for_model(model)
+        if required == "ANTHROPIC_API_KEY":
+            has_key = bool(os.environ.get("ANTHROPIC_API_KEY"))
+            has_login = Path("~/.claude/.credentials.json").expanduser().is_file()
+            if has_key or has_login:
+                return None
+            return "no ANTHROPIC_API_KEY and no ~/.claude/.credentials.json"
+        if required and not os.environ.get(required):
+            return f"no {required} for model {model!r}"
+        return None
+
+    _, cfg = provider
+    needed: list[str] = []
+    if cfg.auth_type == "api_key" and cfg.auth_env:
+        needed.append(cfg.auth_env)
+    needed.extend(cfg.url_params.values())
+    missing = [var for var in needed if not os.environ.get(var)]
+    if missing:
+        return f"missing {', '.join(missing)} for model {model!r}"
+    return None
+
 
 def _smoke_skip_reason() -> str | None:
     """Return a skip reason or None.
@@ -46,9 +126,9 @@ def _smoke_skip_reason() -> str | None:
     Checks:
     - docker CLI present (cheap, no subprocess)
     - docker daemon reachable (3s timeout to kill hangs on misconfigured DOCKER_HOST)
-    - ANTHROPIC_API_KEY env var OR ~/.claude/.credentials.json (matches what
-      resolve_agent_env at _agent_env.py accepts for claude-agent-acp's
-      subscription_auth path)
+    - credentials for the resolved smoke model (the Anthropic dual key/login
+      path for the default, or the provider's required env vars for an
+      escape-hatch model — see ``_missing_model_credentials``)
 
     Deliberately does NOT call resolve_agent_env — the test exercises that code
     path; skipping when it raises would mask real regressions.
@@ -65,11 +145,8 @@ def _smoke_skip_reason() -> str | None:
         return f"docker daemon unreachable: {e}"
     if r.returncode != 0:
         return "docker daemon unreachable"
-    has_key = bool(os.environ.get("ANTHROPIC_API_KEY"))
-    has_login = Path("~/.claude/.credentials.json").expanduser().is_file()
-    if not (has_key or has_login):
-        return "no ANTHROPIC_API_KEY and no ~/.claude/.credentials.json"
-    return None
+    _, model = resolve_smoke_target()
+    return _missing_model_credentials(model)
 
 
 @pytest.fixture(scope="session")
@@ -80,6 +157,11 @@ def smoke_prereqs() -> bool:
     only when a live test is actually selected. Replaces the naive
     ``@pytest.mark.skipif(_smoke_skip_reason() is not None, ...)`` pattern,
     which evaluates at decorator (collection) time on every pytest invocation.
+
+    A skip here is intentionally loud at the gate level: ``docs/release.md`` and
+    the launch-prep skill run the live smoke with ``-ra`` and fail the gate if
+    the summary reports this test as skipped, so a missing credential cannot
+    false-green the e2e step on a run that never executed.
     """
     reason = _smoke_skip_reason()
     if reason:
@@ -127,7 +209,11 @@ def smoke_jobs_dir(tmp_path: Path) -> Iterator[Path]:
 @pytest.mark.asyncio
 @pytest.mark.usefixtures("smoke_prereqs")
 async def test_hello_world_smoke(smoke_jobs_dir: Path) -> None:
-    """End-to-end: claude-agent-acp + Haiku 4.5 solves hello-world-task.
+    """End-to-end: the resolved agent + model solves hello-world-task.
+
+    Defaults to claude-agent-acp + Haiku 4.5; ``BENCHFLOW_SMOKE_AGENT`` /
+    ``BENCHFLOW_SMOKE_MODEL`` redirect it at any other ACP agent/model the
+    contributor can authenticate (e.g. openhands + deepseek).
 
     Asserts the minimal set that proves the orchestration pipeline ran:
     - Verifier produced reward 1.0 (strict equality on "Hello, world!")
@@ -136,10 +222,11 @@ async def test_hello_world_smoke(smoke_jobs_dir: Path) -> None:
       overwritten by scraped fallback — see sdk.py:83-84,540)
     - Trajectory file exists and is non-empty
     """
+    agent, model = resolve_smoke_target()
     result = await SDK().run(
         task_path=HELLO_TASK,
-        agent="claude-agent-acp",
-        model="claude-haiku-4-5-20251001",
+        agent=agent,
+        model=model,
         jobs_dir=smoke_jobs_dir,
     )
 
diff --git a/tests/test_smoke_wiring.py b/tests/test_smoke_wiring.py
new file mode 100644
index 000000000..e075fdcd9
--- /dev/null
+++ b/tests/test_smoke_wiring.py
@@ -0,0 +1,208 @@
+"""Non-live coverage for the live-smoke skip wiring.
+
+These tests guard the false-green hazard: the live smoke ``skipif``s (exit 0)
+when Docker is down or the chosen model has no credential. They pin down the
+pure helpers so a skip is always specific and attributable to a named missing
+credential, and they assert the JUnit-summary gate used by ``docs/release.md``
+/ launch-prep would treat such a skip as RED, not green.
+
+They never touch Docker or any model — ``_smoke_skip_reason`` (which shells out
+to ``docker version``) is only exercised through ``monkeypatch`` of its docker
+probe, so the suite stays in the ``not live`` default selection.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+import pytest
+
+from tests import test_smoke as smoke
+
+ALL_SMOKE_CRED_VARS = [
+    "ANTHROPIC_API_KEY",
+    "DEEPSEEK_API_KEY",
+    "DEEPSEEK_BASE_URL",
+    smoke.SMOKE_AGENT_ENV,
+    smoke.SMOKE_MODEL_ENV,
+]
+
+
+@pytest.fixture
+def clean_env(monkeypatch):
+    """Start from a known-empty credential environment.
+
+    The autouse ``isolate_local_dotenv`` fixture only redirects the dotenv path;
+    real ``ANTHROPIC_API_KEY`` etc. on the developer machine still leak through
+    ``os.environ``. Clearing them makes the missing-credential reasons
+    deterministic regardless of who runs the suite.
+    """
+    for var in ALL_SMOKE_CRED_VARS:
+        monkeypatch.delenv(var, raising=False)
+    return monkeypatch
+
+
+def test_resolve_smoke_target_defaults_to_claude(clean_env):
+    assert smoke.resolve_smoke_target() == (
+        smoke.DEFAULT_SMOKE_AGENT,
+        smoke.DEFAULT_SMOKE_MODEL,
+    )
+
+
+def test_resolve_smoke_target_env_overrides_both(clean_env):
+    clean_env.setenv(smoke.SMOKE_AGENT_ENV, "openhands")
+    clean_env.setenv(smoke.SMOKE_MODEL_ENV, "deepseek/deepseek-chat")
+    assert smoke.resolve_smoke_target() == ("openhands", "deepseek/deepseek-chat")
+
+
+@pytest.mark.parametrize("present", [smoke.SMOKE_AGENT_ENV, smoke.SMOKE_MODEL_ENV])
+def test_resolve_smoke_target_half_set_is_an_error(clean_env, present):
+    # Setting only one of the pair must NOT silently fall back to the Anthropic
+    # default the contributor cannot authenticate — that would re-introduce a
+    # false-green via an unauthenticatable default.
+    clean_env.setenv(present, "x")
+    with pytest.raises(RuntimeError, match="must be set together"):
+        smoke.resolve_smoke_target()
+
+
+def test_missing_credentials_anthropic_default_no_creds(clean_env):
+    reason = smoke._missing_model_credentials(smoke.DEFAULT_SMOKE_MODEL)
+    assert reason == "no ANTHROPIC_API_KEY and no ~/.claude/.credentials.json"
+
+
+def test_missing_credentials_anthropic_satisfied_by_api_key(clean_env):
+    clean_env.setenv("ANTHROPIC_API_KEY", "sk-ant-test")
+    assert smoke._missing_model_credentials(smoke.DEFAULT_SMOKE_MODEL) is None
+
+
+def test_missing_credentials_anthropic_satisfied_by_login(clean_env, monkeypatch):
+    real_is_file = Path.is_file
+
+    def fake_is_file(self):
+        if self == Path("~/.claude/.credentials.json").expanduser():
+            return True
+        return real_is_file(self)
+
+    monkeypatch.setattr(Path, "is_file", fake_is_file)
+    assert smoke._missing_model_credentials(smoke.DEFAULT_SMOKE_MODEL) is None
+
+
+def test_missing_credentials_deepseek_names_each_missing_var(clean_env):
+    reason = smoke._missing_model_credentials("deepseek/deepseek-chat")
+    assert reason is not None
+    # Both the API key and the provider's url_params var must be named so the
+    # skip message is actionable, not just "missing credential".
+    assert "DEEPSEEK_API_KEY" in reason
+    assert "DEEPSEEK_BASE_URL" in reason
+    assert "deepseek/deepseek-chat" in reason
+
+
+def test_missing_credentials_deepseek_satisfied(clean_env):
+    clean_env.setenv("DEEPSEEK_API_KEY", "sk-deepseek-test")
+    clean_env.setenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
+    assert smoke._missing_model_credentials("deepseek/deepseek-chat") is None
+
+
+def test_missing_credentials_deepseek_partial_still_skips(clean_env):
+    # API key alone is not enough — base_url is also required by the provider.
+    clean_env.setenv("DEEPSEEK_API_KEY", "sk-deepseek-test")
+    reason = smoke._missing_model_credentials("deepseek/deepseek-chat")
+    assert reason is not None
+    assert "DEEPSEEK_BASE_URL" in reason
+    assert "DEEPSEEK_API_KEY" not in reason
+
+
+def _patch_docker_ok(monkeypatch):
+    """Make the docker probe in _smoke_skip_reason report a reachable daemon."""
+    monkeypatch.setattr(smoke.shutil, "which", lambda _name: "/usr/bin/docker")
+
+    def fake_run(*_args, **_kwargs):
+        return subprocess.CompletedProcess(args=[], returncode=0, stdout=b"28.0")
+
+    monkeypatch.setattr(smoke.subprocess, "run", fake_run)
+
+
+def test_skip_reason_credential_gate_after_docker_ok(clean_env):
+    # Docker present and reachable, but no credential for the default model:
+    # the skip reason must be the credential one, not a docker one.
+    _patch_docker_ok(clean_env)
+    assert (
+        smoke._smoke_skip_reason()
+        == "no ANTHROPIC_API_KEY and no ~/.claude/.credentials.json"
+    )
+
+
+def test_skip_reason_none_when_docker_and_creds_present(clean_env):
+    _patch_docker_ok(clean_env)
+    clean_env.setenv("ANTHROPIC_API_KEY", "sk-ant-test")
+    assert smoke._smoke_skip_reason() is None
+
+
+def test_skip_reason_uses_escape_hatch_model(clean_env):
+    # With the escape hatch pointing at deepseek and its creds present, the
+    # smoke is runnable even with zero Anthropic credentials on the box.
+    _patch_docker_ok(clean_env)
+    clean_env.setenv(smoke.SMOKE_AGENT_ENV, "openhands")
+    clean_env.setenv(smoke.SMOKE_MODEL_ENV, "deepseek/deepseek-chat")
+    clean_env.setenv("DEEPSEEK_API_KEY", "sk-deepseek-test")
+    clean_env.setenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
+    assert smoke._smoke_skip_reason() is None
+
+
+def test_skip_reason_docker_missing_short_circuits(clean_env):
+    clean_env.setattr(smoke.shutil, "which", lambda _name: None)
+    assert smoke._smoke_skip_reason() == "docker CLI not installed"
+
+
+# --- The gate's own skip detection (mirrors the launch-prep Step 3 predicate) -
+
+
+def _junit_xml(tests: int, skipped: int) -> str:
+    """Build JUnit XML the way pytest does: a <testsuites> wrapper whose nested
+    <testsuite> carries the tests/skipped counts. A predicate that reads the
+    counts off the <testsuites> root instead of the child would see them as 0
+    and false-RED a passing run — these fixtures pin that down."""
+    return (
+        '<testsuites name="pytest tests">'
+        f'<testsuite name="pytest" tests="{tests}" skipped="{skipped}"'
+        ' errors="0" failures="0"></testsuite>'
+        "</testsuites>"
+    )
+
+
+def _gate_is_green(junit_xml: str) -> bool:
+    """Replicate the launch-prep gate predicate: green iff a test ran clean.
+
+    Sums tests/skipped over every <testsuite> element so it reads the counts at
+    the level pytest actually writes them (not the <testsuites> root)."""
+    suites = list(ET.fromstring(junit_xml).iter("testsuite"))
+    tests = sum(int(e.get("tests", 0)) for e in suites)
+    skipped = sum(int(e.get("skipped", 0)) for e in suites)
+    return bool(tests) and not skipped
+
+
+def test_gate_detects_skipped_live_smoke_as_red():
+    assert _gate_is_green(_junit_xml(tests=1, skipped=1)) is False
+
+
+def test_gate_detects_unrun_live_smoke_as_red():
+    # Deselected / collected-nothing run reports zero tests — also not green.
+    assert _gate_is_green(_junit_xml(tests=0, skipped=0)) is False
+
+
+def test_gate_passes_executed_live_smoke():
+    # The counts live on the nested <testsuite>; a root-only reader would
+    # false-RED this passing run, so this case is the regression guard.
+    assert _gate_is_green(_junit_xml(tests=1, skipped=0)) is True
+
+
+def test_gate_predicate_matches_real_pytest_junit_shape():
+    # Sanity: the shape we assert is the shape pytest emits — counts on the
+    # child <testsuite>, not the <testsuites> root.
+    root = ET.fromstring(_junit_xml(tests=1, skipped=0))
+    assert root.tag == "testsuites"
+    assert "tests" not in root.attrib
+    (child,) = list(root.iter("testsuite"))
+    assert child.get("tests") == "1"
diff --git a/tests/test_verify.py b/tests/test_verify.py
index 34c71bf33..ae0e6c28c 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -4,6 +4,7 @@
 import contextlib
 import json
 import logging
+from datetime import datetime
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
@@ -80,6 +81,61 @@ def test_clean_run_json(self, build_result_json):
         assert data["verifier_error"] is None
         assert data["rewards"] == {"reward": 1.0}
 
+    def test_no_invented_top_level_scalar_keys(self, build_result_json):
+        """Contract: reward/total_tokens/status are nested, never top-level.
+
+        The canonical surface is ``rewards.reward`` and
+        ``agent_result.total_tokens``; outcome is derived, not stored. A naive
+        consumer doing ``result["reward"]`` must hit a *missing* key (KeyError),
+        not a silent ``None`` from a vestigial top-level field. This pins the
+        absence so a future writer that bolts on a top-level ``reward=None``
+        (which would read as ``null`` to trainers/leaderboards) regresses here.
+        """
+        data = build_result_json(rewards={"reward": 0.05})
+        for absent in ("reward", "total_tokens", "status"):
+            assert absent not in data, f"top-level {absent!r} must not exist"
+        assert data["rewards"]["reward"] == 0.05
+        assert "total_tokens" in data["agent_result"]
+
+    def test_token_total_round_trips_into_agent_result(self, tmp_path):
+        """A captured in-process token total must serialize, not drop to None.
+
+        The proof wave saw a run whose in-process result held 23993 tokens but
+        whose result.json wrote ``total_tokens: null``. This pins the canonical
+        nested location so the captured value survives serialization with the
+        exact integer (not a truthy placeholder).
+        """
+        from benchflow.rollout import _build_rollout_result
+
+        rollout_dir = tmp_path / "trial"
+        rollout_dir.mkdir()
+        _build_rollout_result(
+            rollout_dir,
+            task_name="t1",
+            rollout_name="trial-1",
+            agent="test",
+            agent_name="openhands",
+            model="m",
+            n_tool_calls=12,
+            prompts=["x"],
+            error=None,
+            verifier_error=None,
+            trajectory=[],
+            partial_trajectory=False,
+            rewards={"reward": 0.05},
+            started_at=datetime.now(),
+            timing={},
+            n_input_tokens=18000,
+            n_output_tokens=5993,
+            total_tokens=23993,
+            usage_source="provider_response",
+        )
+        data = json.loads((rollout_dir / "result.json").read_text())
+        assert data["agent_result"]["total_tokens"] == 23993
+        assert data["final_metrics"]["total_prompt_tokens"] == 18000
+        assert data["final_metrics"]["total_completion_tokens"] == 5993
+        assert "total_tokens" not in data
+
 
 class TestSdkVerify:
     @pytest.fixture
@@ -638,8 +694,6 @@ def test_summary_includes_verifier_fields(self, sample_metrics):
         assert "verifier_error_breakdown" in s
 
     def test_collect_metrics_reads_verifier_error(self, tmp_path):
-        from datetime import datetime
-
         from benchflow.metrics import collect_metrics
 
         task_dir = tmp_path / "task1" / "trial-1"