diff --git a/src/benchflow/_utils/scoring.py b/src/benchflow/_utils/scoring.py
index 70ecf39b7..d3ed3b7e6 100644
--- a/src/benchflow/_utils/scoring.py
+++ b/src/benchflow/_utils/scoring.py
@@ -12,6 +12,13 @@
 SANDBOX_SETUP = "sandbox_setup"
 PROVIDER_AUTH = "provider_auth"
 TIMED_OUT = "timeout"
+# Provider API failures detected post-rollout (rate limit, quota, rejected
+# request, 5xx). "api_error" is proxy-proven (every captured provider request
+# failed); "suspected_api_error" is the zero-signal heuristic (no proxy
+# evidence, but the agent ended with zero tokens AND zero tool calls). Both
+# null the reward so the slot is excluded from score denominators.
+API_ERROR = "api_error"
+SUSPECTED_API_ERROR = "suspected_api_error"
 
 # Matched case-insensitively against the error string. Covers the
 # human-authored markers plus the sanitized "provider auth failed (HTTP 401)"
@@ -73,6 +80,12 @@ def classify_error(error: str | None) -> str | None:
         return INSTALL_FAILED
     if "closed stdout" in lower:
         return PIPE_CLOSED
+    # Order matters: "suspected provider api error" contains "provider api
+    # error", so the heuristic marker must be checked first.
+    if "suspected provider api error" in lower:
+        return SUSPECTED_API_ERROR
+    if "provider api error" in lower:
+        return API_ERROR
     if "ACP error" in error or "was rejected as invalid" in error:
         if any(m in lower for m in _PROVIDER_AUTH_MARKERS):
             return PROVIDER_AUTH
@@ -88,6 +101,17 @@ def classify_error(error: str | None) -> str | None:
     return "other"
 
 
+def api_error_is_transient(error: str | None) -> bool:
+    """True when an api_error string carries the transient marker.
+
+    Provider-api-error strings are formatted by the rollout classifier as
+    ``provider api error [<subcategory>/transient] ...`` or ``[.../permanent]``
+    — transient (rate limit, 5xx) is retryable, permanent (auth, quota,
+    model-not-found, rejected request) is not.
+    """
+    return bool(error) and "/transient]" in error
+
+
 def _looks_like_infra_error(error: str) -> bool:
     return any(
         marker in error
diff --git a/src/benchflow/diagnostics.py b/src/benchflow/diagnostics.py
index 0ab8454e8..5a9284bda 100644
--- a/src/benchflow/diagnostics.py
+++ b/src/benchflow/diagnostics.py
@@ -215,12 +215,70 @@ def format_issue(self, task_name: str) -> str:
         )
 
 
+@dataclass
+class ProviderApiErrorDiagnostic(Diagnostic):
+    """Every captured provider API request failed — no model response ever
+    reached the agent (rate limit, auth rejection, quota, model-not-found,
+    5xx). Proxy-proven: built from the usage proxy's captured exchange status
+    codes only (#546/#564 — bodies/headers are never read)."""
+
+    subcategory: str = "provider_error"
+    transient: bool = False
+    dominant_status: int | None = None
+    status_counts: dict[str, int] | None = None
+    total_requests: int = 0
+    failed_requests: int = 0
+    fingerprint: str = ""
+
+    field: ClassVar[str] = "api_error_info"
+    category: ClassVar[str | None] = "api_error"
+    summary_description: ClassVar[str] = "failed on provider API errors"
+
+    def format_issue(self, task_name: str) -> str:
+        kind = "transient" if self.transient else "permanent"
+        return (
+            f"{task_name}: provider api error [{self.subcategory}/{kind}] "
+            f"HTTP {self.dominant_status} on "
+            f"{self.failed_requests}/{self.total_requests} requests — "
+            f"measurement invalid (agent never got a model response)"
+        )
+
+
+@dataclass
+class SuspectedApiErrorDiagnostic(Diagnostic):
+    """Zero-signal rollout: the agent ended its turn with zero tokens AND
+    zero tool calls and no error — the signature of a provider API failure
+    swallowed inside the agent (e.g. a model id rejected against the agent's
+    own catalog before any request is issued)."""
+
+    total_tokens: int = 0
+    n_tool_calls: int = 0
+    total_requests: int = 0
+    failed_requests: int = 0
+
+    field: ClassVar[str] = "suspected_api_error_info"
+    category: ClassVar[str | None] = "suspected_api_error"
+    summary_description: ClassVar[str] = (
+        "ended with zero model/tool activity (suspected provider api error)"
+    )
+
+    def format_issue(self, task_name: str) -> str:
+        return (
+            f"{task_name}: suspected provider api error — agent ended with "
+            f"{self.total_tokens} tokens and {self.n_tool_calls} tool calls "
+            f"({self.failed_requests}/{self.total_requests} captured requests "
+            f"failed) — measurement suspect"
+        )
+
+
 # Public registry — every diagnostic kind goes here exactly once.
 DIAGNOSTIC_REGISTRY: tuple[type[Diagnostic], ...] = (
     IdleTimeoutDiagnostic,
     SandboxStartupDiagnostic,
     TransportClosedDiagnostic,
     VerifierTimeoutDiagnostic,
+    ProviderApiErrorDiagnostic,
+    SuspectedApiErrorDiagnostic,
 )
 
 # field_name → Diagnostic class, for check_results lookup.
diff --git a/src/benchflow/evaluation.py b/src/benchflow/evaluation.py
index 26c607ebf..cb257c8b9 100644
--- a/src/benchflow/evaluation.py
+++ b/src/benchflow/evaluation.py
@@ -14,6 +14,7 @@
 import json
 import logging
 import os
+import re
 import subprocess
 import threading
 import time
@@ -43,14 +44,17 @@
 from benchflow._utils.reward_events import memory_summary
 from benchflow._utils.scoring import (
     ACP_ERROR,
+    API_ERROR,
     IDLE_TIMEOUT,
     INFRA_ERROR,
     INSTALL_FAILED,
     PIPE_CLOSED,
     PROVIDER_AUTH,
+    SUSPECTED_API_ERROR,
     VERIFIER_DEP_INSTALL,
     VERIFIER_INFRA,
     VERIFIER_TIMEOUT,
+    api_error_is_transient,
     classify_error,
     classify_verifier_error,
     count_audit_outcomes,
@@ -155,6 +159,10 @@ class RetryConfig:
     retry_on_idle_timeout: bool = True
     retry_on_infra: bool = True
     retry_on_verifier_infra: bool = True
+    # Provider API errors: only TRANSIENT ones (rate limit, 5xx) are
+    # retryable — auth/quota/model-not-found are permanent until a human
+    # fixes the credential or model id, so retrying only burns wall-clock.
+    retry_on_api_error: bool = True
     wait_multiplier: float = 2.0
     min_wait_sec: float = 1.0
     max_wait_sec: float = 30.0
@@ -184,6 +192,9 @@ def from_mapping(cls, raw: dict | None) -> RetryConfig:
                 raw.get("retry_on_idle_timeout", defaults.retry_on_idle_timeout)
             ),
             retry_on_infra=bool(raw.get("retry_on_infra", defaults.retry_on_infra)),
+            retry_on_api_error=bool(
+                raw.get("retry_on_api_error", defaults.retry_on_api_error)
+            ),
             retry_on_verifier_infra=bool(
                 raw.get("retry_on_verifier_infra", defaults.retry_on_verifier_infra)
             ),
@@ -215,6 +226,14 @@ def should_retry(
             return True
         if self.retry_on_infra and category == INFRA_ERROR:
             return True
+        if category == API_ERROR:
+            # Transient-only: rate limit / provider 5xx self-heal on backoff;
+            # permanent (auth, quota, model_not_found, rejected_request) do not.
+            return self.retry_on_api_error and api_error_is_transient(error)
+        if category == SUSPECTED_API_ERROR:
+            # Zero-signal verdicts have an unknown subcategory — never provably
+            # transient, so never auto-retried (rerun is an operator action).
+            return False
         return bool(self.retry_on_acp and category == ACP_ERROR)
 
     def should_retry_verifier_error(self, verifier_error: str | None) -> bool:
@@ -232,6 +251,75 @@ def backoff_delay(self, attempt: int) -> float:
         return min(delay, self.max_wait_sec)
 
 
+class ApiErrorCircuitBreaker:
+    """Trip after N consecutive permanent provider-API failures with the SAME
+    fingerprint (classic dead key / wrong model id), so a doomed batch stops
+    burning sandbox-hours producing all-unhealthy artifacts.
+
+    Isolated api_errors never interrupt the batch — any completion that is not
+    a permanent api_error resets the streak. Threshold comes from
+    ``BENCHFLOW_API_ERROR_BREAKER_THRESHOLD`` (default 5; ``0`` disables).
+    Already-running tasks finish; only not-yet-started tasks are skipped.
+    """
+
+    ENV_VAR = "BENCHFLOW_API_ERROR_BREAKER_THRESHOLD"
+    DEFAULT_THRESHOLD = 5
+
+    def __init__(self, threshold: int | None = None) -> None:
+        if threshold is None:
+            raw = os.environ.get(self.ENV_VAR, "")
+            try:
+                threshold = int(raw) if raw.strip() else self.DEFAULT_THRESHOLD
+            except ValueError:
+                threshold = self.DEFAULT_THRESHOLD
+        self.threshold = max(threshold, 0)
+        self._fingerprint: str | None = None
+        self._streak = 0
+        self.tripped = False
+
+    @staticmethod
+    def _fingerprint_of(result: RunResult) -> str | None:
+        """Permanent-api-error fingerprint, or None when not breaker-relevant."""
+        category = result.error_category or classify_error(result.error)
+        if category == SUSPECTED_API_ERROR:
+            return "suspected:zero_signal"
+        if category == API_ERROR and not api_error_is_transient(result.error):
+            match = re.search(r"\[([a-z_]+)/permanent\] HTTP (\d+)", result.error or "")
+            return (
+                f"{match.group(1)}:{match.group(2)}" if match else "api_error:unknown"
+            )
+        return None
+
+    def record(self, result: RunResult) -> None:
+        """Track one completed task; trip when the same-fingerprint streak hits
+        the threshold."""
+        if self.threshold == 0 or self.tripped:
+            return
+        fingerprint = self._fingerprint_of(result)
+        if fingerprint is None:
+            self._fingerprint = None
+            self._streak = 0
+            return
+        if fingerprint == self._fingerprint:
+            self._streak += 1
+        else:
+            self._fingerprint = fingerprint
+            self._streak = 1
+        if self._streak >= self.threshold:
+            self.tripped = True
+            logger.error(
+                f"API-error circuit breaker OPEN: {self._streak} consecutive "
+                f"permanent provider failures [{fingerprint}] — skipping "
+                f"remaining unstarted tasks (set {self.ENV_VAR}=0 to disable)"
+            )
+
+    def skip_error(self) -> str:
+        return (
+            f"skipped: api-error circuit breaker open "
+            f"([{self._fingerprint}] x{self._streak} consecutive)"
+        )
+
+
 # Defaults: works out-of-the-box with `claude login` (subscription auth, no API key needed)
 DEFAULT_AGENT = "claude-agent-acp"
 DEFAULT_MODEL = "claude-haiku-4-5-20251001"
@@ -1006,8 +1094,14 @@ async def _run_parallel_independent(
         cfg = self._config
         sem = asyncio.Semaphore(cfg.concurrency)
 
+        breaker = ApiErrorCircuitBreaker()
+
         async def bounded(td: Path) -> tuple[str, RunResult]:
             async with sem:
+                if breaker.tripped:
+                    result = RunResult(task_name=td.name, error=breaker.skip_error())
+                    self._log_and_report(td, result)
+                    return td.name, result
                 # Jitter start to avoid SSH/docker-daemon storms at high
                 # concurrency. The window scales linearly with --concurrency so
                 # the average start rate stays around 2 tasks/sec; the previous
@@ -1019,6 +1113,7 @@ async def bounded(td: Path) -> tuple[str, RunResult]:
                     jitter_max = max(cfg.concurrency / 2, 8.0)
                     await asyncio.sleep(random.uniform(0, jitter_max))
                 result = await self._run_task(td)
+                breaker.record(result)
                 self._log_and_report(td, result)
                 return td.name, result
 
diff --git a/src/benchflow/rollout/__init__.py b/src/benchflow/rollout/__init__.py
index 5e92d9c9a..ebe1ec029 100644
--- a/src/benchflow/rollout/__init__.py
+++ b/src/benchflow/rollout/__init__.py
@@ -73,7 +73,11 @@
     default_rollout_planes,
 )
 from benchflow.contracts import RoundResult as RoundResult
-from benchflow.diagnostics import RolloutDiagnostics
+from benchflow.diagnostics import (
+    ProviderApiErrorDiagnostic,
+    RolloutDiagnostics,
+    SuspectedApiErrorDiagnostic,
+)
 from benchflow.models import RolloutResult, TrajectorySource
 from benchflow.rollout._config import GENERATED_SKILLS_ROOT as GENERATED_SKILLS_ROOT
 from benchflow.rollout._config import RolloutConfig as RolloutConfig
@@ -154,12 +158,16 @@
 )
 from benchflow.rollout._usage import _as_nonnegative_int as _as_nonnegative_int
 from benchflow.rollout._usage import _native_acp_usage_delta as _native_acp_usage_delta
+from benchflow.rollout._usage import (
+    _provider_api_failure_summary_from_runtime as _provider_api_failure_summary_from_runtime,
+)
 from benchflow.rollout._usage import (
     _provider_auth_status_from_runtime as _provider_auth_status_from_runtime,
 )
 from benchflow.rollout._usage import (
     _zero_native_acp_usage_metrics as _zero_native_acp_usage_metrics,
 )
+from benchflow.rollout._usage import classify_api_failure as classify_api_failure
 
 # Step / user-loop drivers live in ``_user_loop`` as free functions taking the
 # Rollout; the thin methods below delegate to these engine aliases.
@@ -252,6 +260,10 @@ def __init__(self, config: RolloutConfig) -> None:
         # trajectory on stop()). Read by _provider_auth_status() so ACP-error
         # classification can fail fast on auth failures (#546/#564).
         self._provider_auth_status_cached: int | None = None
+        # Provider API failure summary (all statuses >= 400), snapshotted in
+        # cleanup() alongside the auth status — consumed by the post-rollout
+        # silent-API-failure classifier in _build_result().
+        self._api_failure_summary_cached: dict[str, Any] | None = None
 
         # Populated by start()
         self._sandbox_id: str | None = None
@@ -1245,6 +1257,9 @@ async def cleanup(self) -> None:
             self._provider_auth_status_cached = _provider_auth_status_from_runtime(
                 usage_runtime
             )
+            self._api_failure_summary_cached = (
+                _provider_api_failure_summary_from_runtime(usage_runtime)
+            )
             try:
                 self._write_llm_trajectory(usage_runtime)
             except Exception as e:
@@ -1682,8 +1697,75 @@ def _current_sandbox_id(self) -> str | None:
         env_sandbox_id = getattr(getattr(self, "_env", None), "sandbox_id", None)
         return env_sandbox_id if isinstance(env_sandbox_id, str) else None
 
+    def _maybe_classify_api_error(self) -> None:
+        """Detect a silent provider API failure after the rollout finished.
+
+        Runs only when no other error was recorded. Layer 1 (proxy-proven):
+        every captured provider request failed and the agent produced zero
+        tokens -> error_category "api_error". Layer 2 (zero-signal): no proxy
+        failure evidence, but the agent ended with zero tokens AND zero tool
+        calls -> "suspected_api_error" (e.g. the agent rejected the model id
+        against its own catalog and never issued a request). Both null the
+        reward so the slot is excluded from score denominators instead of
+        polluting them as a fake healthy fail; the slot stays rerun-able and
+        the batch is never interrupted.
+        """
+        if self._error is not None:
+            return
+        # Only judge rollouts where the agent actually ran: when no execute()
+        # recorded a prompt, this is a setup/export failure path that owns its
+        # own error channels (#389) — zero activity there is expected, not a
+        # silent API failure.
+        if not getattr(self, "_executed_prompts", None):
+            return
+        # getattr-defensive: tests construct partial Rollout doubles that
+        # bypass __init__ (same pattern as _task_skill_policy below).
+        usage_metrics = getattr(self, "_usage_metrics", None) or {}
+        total_tokens = _as_nonnegative_int(usage_metrics.get("total_tokens"))
+        verdict, info = classify_api_failure(
+            getattr(self, "_api_failure_summary_cached", None),
+            total_tokens=total_tokens,
+            n_tool_calls=getattr(self, "_n_tool_calls", 0),
+        )
+        if verdict is None:
+            return
+        if verdict == "api_error":
+            subcategory = info.get("subcategory") or "provider_error"
+            kind = "transient" if info.get("transient") else "permanent"
+            diag = ProviderApiErrorDiagnostic(
+                subcategory=subcategory,
+                transient=bool(info.get("transient")),
+                dominant_status=info.get("dominant_status"),
+                status_counts=info.get("status_counts"),
+                total_requests=info.get("total_requests") or 0,
+                failed_requests=info.get("failed_requests") or 0,
+                fingerprint=info.get("fingerprint") or "",
+            )
+            self._diagnostics.set(diag)
+            self._error = (
+                f"provider api error [{subcategory}/{kind}] "
+                f"HTTP {info.get('dominant_status')} on "
+                f"{diag.failed_requests}/{diag.total_requests} requests"
+            )
+        else:
+            diag = SuspectedApiErrorDiagnostic(
+                total_tokens=total_tokens,
+                n_tool_calls=self._n_tool_calls,
+                total_requests=info.get("total_requests") or 0,
+                failed_requests=info.get("failed_requests") or 0,
+            )
+            self._diagnostics.set(diag)
+            self._error = (
+                "suspected provider api error: agent ended with zero tokens "
+                "and zero tool calls (no scoreable model activity)"
+            )
+        # Unhealthy by definition: drop any verifier reward so the slot is
+        # excluded from score denominators (rerun-able, never counted).
+        self._rewards = None
+
     def _build_result(self) -> RolloutResult:
         rollout_dir = self._require_rollout_dir()
+        self._maybe_classify_api_error()
         # For Scene/multi-turn rollouts, each execute() call records the
         # prompt(s) it sent into self._executed_prompts. Use that as the
         # authoritative prompt list so n_prompts and prompts.json reflect
diff --git a/src/benchflow/rollout/_usage.py b/src/benchflow/rollout/_usage.py
index 7ae89c344..4b360eb81 100644
--- a/src/benchflow/rollout/_usage.py
+++ b/src/benchflow/rollout/_usage.py
@@ -32,6 +32,98 @@ def _provider_auth_status_from_runtime(runtime: Any) -> int | None:
     return None
 
 
+def _api_error_subcategory(status: int) -> tuple[str, bool]:
+    """Map a provider HTTP failure status to (subcategory, transient).
+
+    Status-code-only by design — same #546/#564 security posture as the
+    401/403 scan above (never read bodies or headers, so no credential
+    material can leak into ``result.error``).
+    """
+    if status in (401, 403):
+        return "auth", False
+    if status == 402:
+        return "quota", False
+    if status == 404:
+        return "model_not_found", False
+    if status == 429:
+        return "rate_limit", True
+    if status >= 500 or status == 408:
+        return "provider_error", True
+    return "rejected_request", False
+
+
+def _provider_api_failure_summary_from_runtime(runtime: Any) -> dict[str, Any] | None:
+    """Summarize provider HTTP failures from a usage runtime's trajectory.
+
+    Returns ``None`` when there is no runtime/trajectory or no captured
+    exchanges; otherwise a dict with request totals, per-status failure
+    counts, and the dominant failure's (subcategory, transient, fingerprint)
+    classification. Reads only integer status codes (#546/#564).
+    """
+    server = getattr(runtime, "server", None)
+    trajectory = getattr(server, "trajectory", None)
+    exchanges = getattr(trajectory, "exchanges", None) or []
+    total = 0
+    failed: dict[int, int] = {}
+    last_failed_status: int | None = None
+    for exchange in exchanges:
+        status = getattr(getattr(exchange, "response", None), "status_code", None)
+        if not isinstance(status, int):
+            continue
+        total += 1
+        if status >= 400:
+            failed[status] = failed.get(status, 0) + 1
+            last_failed_status = status
+    if total == 0:
+        return None
+    summary: dict[str, Any] = {
+        "total_requests": total,
+        "failed_requests": sum(failed.values()),
+    }
+    if failed:
+        dominant = max(
+            failed.items(), key=lambda kv: (kv[1], kv[0] == last_failed_status)
+        )[0]
+        subcategory, transient = _api_error_subcategory(dominant)
+        summary.update(
+            status_counts={str(k): v for k, v in sorted(failed.items())},
+            dominant_status=dominant,
+            subcategory=subcategory,
+            transient=transient,
+            fingerprint=f"{subcategory}:{dominant}",
+        )
+    return summary
+
+
+def classify_api_failure(
+    summary: dict[str, Any] | None,
+    *,
+    total_tokens: int,
+    n_tool_calls: int,
+) -> tuple[str | None, dict[str, Any]]:
+    """Decide the post-rollout API-error verdict for an error-free rollout.
+
+    Returns ``("api_error", summary)`` when the proxy captured provider
+    requests and every one of them failed while the agent produced zero
+    tokens (proxy-proven); ``("suspected_api_error", {...})`` when there is
+    no proxy failure evidence but the agent ended with zero tokens AND zero
+    tool calls (zero-signal heuristic — e.g. an agent that validates the
+    model id locally and never issues a request); ``(None, {})`` otherwise.
+    A rollout with any real token usage or tool activity is never flagged.
+    """
+    summary = summary or {}
+    failed = summary.get("failed_requests") or 0
+    total = summary.get("total_requests") or 0
+    if failed and failed == total and total_tokens == 0:
+        return "api_error", summary
+    if total_tokens == 0 and n_tool_calls == 0:
+        return "suspected_api_error", {
+            "total_requests": total,
+            "failed_requests": failed,
+        }
+    return None, {}
+
+
 _NATIVE_ACP_USAGE_SNAPSHOT_TO_RESULT = {
     "input_tokens": "n_input_tokens",
     "output_tokens": "n_output_tokens",
diff --git a/tests/test_api_error_capture.py b/tests/test_api_error_capture.py
new file mode 100644
index 000000000..ad5129006
--- /dev/null
+++ b/tests/test_api_error_capture.py
@@ -0,0 +1,254 @@
+"""Silent provider-API-failure capture (api_error / suspected_api_error).
+
+Covers the post-rollout classification pipeline end to end at the unit level:
+status mapping -> proxy failure summary -> verdict -> diagnostics -> retry
+policy -> batch circuit breaker. The motivating fixture: an agent that
+rejects the model id against its own catalog, issues zero requests, ends its
+turn politely, and the verifier scores 0.0 — previously recorded as a healthy
+fail with error=None.
+"""
+
+from types import SimpleNamespace
+
+from benchflow._utils.scoring import (
+    API_ERROR,
+    SUSPECTED_API_ERROR,
+    api_error_is_transient,
+    classify_error,
+)
+from benchflow.diagnostics import (
+    DIAGNOSTIC_BY_FIELD,
+    DIAGNOSTIC_REGISTRY,
+    ProviderApiErrorDiagnostic,
+    SuspectedApiErrorDiagnostic,
+)
+from benchflow.evaluation import ApiErrorCircuitBreaker, RetryConfig
+from benchflow.models import RunResult
+from benchflow.rollout._usage import (
+    _api_error_subcategory,
+    _provider_api_failure_summary_from_runtime,
+    classify_api_failure,
+)
+
+
+def _runtime(statuses: list[int | None]) -> SimpleNamespace:
+    exchanges = [
+        SimpleNamespace(response=SimpleNamespace(status_code=s)) for s in statuses
+    ]
+    return SimpleNamespace(
+        server=SimpleNamespace(trajectory=SimpleNamespace(exchanges=exchanges))
+    )
+
+
+class TestClassifyErrorMarkers:
+    def test_api_error_marker(self):
+        err = "provider api error [rate_limit/transient] HTTP 429 on 3/3 requests"
+        assert classify_error(err) == API_ERROR
+
+    def test_suspected_marker_wins_over_api_marker(self):
+        err = "suspected provider api error: agent ended with zero tokens and zero tool calls"
+        assert classify_error(err) == SUSPECTED_API_ERROR
+
+    def test_auth_api_error_not_misrouted_to_provider_auth(self):
+        # "HTTP 401" is a provider_auth marker, but the api-error branch runs
+        # first for the structured string the classifier emits.
+        err = "provider api error [auth/permanent] HTTP 401 on 2/2 requests"
+        assert classify_error(err) == API_ERROR
+
+    def test_transient_marker_parsing(self):
+        assert api_error_is_transient(
+            "provider api error [rate_limit/transient] HTTP 429"
+        )
+        assert not api_error_is_transient(
+            "provider api error [auth/permanent] HTTP 401"
+        )
+        assert not api_error_is_transient(None)
+
+
+class TestStatusSubcategory:
+    def test_mapping(self):
+        assert _api_error_subcategory(401) == ("auth", False)
+        assert _api_error_subcategory(403) == ("auth", False)
+        assert _api_error_subcategory(402) == ("quota", False)
+        assert _api_error_subcategory(404) == ("model_not_found", False)
+        assert _api_error_subcategory(429) == ("rate_limit", True)
+        assert _api_error_subcategory(500) == ("provider_error", True)
+        assert _api_error_subcategory(503) == ("provider_error", True)
+        assert _api_error_subcategory(408) == ("provider_error", True)
+        assert _api_error_subcategory(400) == ("rejected_request", False)
+        assert _api_error_subcategory(422) == ("rejected_request", False)
+
+
+class TestFailureSummary:
+    def test_none_without_exchanges(self):
+        assert _provider_api_failure_summary_from_runtime(None) is None
+        assert _provider_api_failure_summary_from_runtime(_runtime([])) is None
+
+    def test_all_failed(self):
+        s = _provider_api_failure_summary_from_runtime(_runtime([429, 429, 500]))
+        assert s["total_requests"] == 3
+        assert s["failed_requests"] == 3
+        assert s["dominant_status"] == 429
+        assert s["subcategory"] == "rate_limit"
+        assert s["transient"] is True
+        assert s["fingerprint"] == "rate_limit:429"
+        assert s["status_counts"] == {"429": 2, "500": 1}
+
+    def test_successes_only(self):
+        s = _provider_api_failure_summary_from_runtime(_runtime([200, 200]))
+        assert s == {"total_requests": 2, "failed_requests": 0}
+
+    def test_non_int_statuses_skipped(self):
+        s = _provider_api_failure_summary_from_runtime(_runtime([None, 200, 401]))
+        assert s["total_requests"] == 2
+        assert s["failed_requests"] == 1
+        assert s["subcategory"] == "auth"
+
+
+class TestClassifyApiFailure:
+    def test_proxy_proven(self):
+        summary = _provider_api_failure_summary_from_runtime(_runtime([429, 429]))
+        verdict, info = classify_api_failure(summary, total_tokens=0, n_tool_calls=0)
+        assert verdict == "api_error"
+        assert info["subcategory"] == "rate_limit"
+
+    def test_zero_signal_without_proxy_evidence(self):
+        verdict, info = classify_api_failure(None, total_tokens=0, n_tool_calls=0)
+        assert verdict == "suspected_api_error"
+        assert info == {"total_requests": 0, "failed_requests": 0}
+
+    def test_healthy_rollout_never_flagged(self):
+        verdict, _ = classify_api_failure(None, total_tokens=26278, n_tool_calls=8)
+        assert verdict is None
+
+    def test_partial_failures_with_progress_not_flagged(self):
+        # Agent recovered from a mid-run blip: some failures, but real tokens.
+        summary = _provider_api_failure_summary_from_runtime(_runtime([429, 200, 200]))
+        verdict, _ = classify_api_failure(summary, total_tokens=5000, n_tool_calls=3)
+        assert verdict is None
+
+    def test_zero_tools_with_tokens_not_flagged(self):
+        # Prompt-only answer (no tools) with real usage is a legitimate rollout.
+        verdict, _ = classify_api_failure(None, total_tokens=1200, n_tool_calls=0)
+        assert verdict is None
+
+
+class TestDiagnostics:
+    def test_registered(self):
+        assert ProviderApiErrorDiagnostic in DIAGNOSTIC_REGISTRY
+        assert SuspectedApiErrorDiagnostic in DIAGNOSTIC_REGISTRY
+        assert DIAGNOSTIC_BY_FIELD["api_error_info"] is ProviderApiErrorDiagnostic
+        assert (
+            DIAGNOSTIC_BY_FIELD["suspected_api_error_info"]
+            is SuspectedApiErrorDiagnostic
+        )
+
+    def test_categories_and_channel(self):
+        assert ProviderApiErrorDiagnostic.category == API_ERROR
+        assert SuspectedApiErrorDiagnostic.category == SUSPECTED_API_ERROR
+        assert ProviderApiErrorDiagnostic.channel == "error"
+        assert SuspectedApiErrorDiagnostic.channel == "error"
+
+    def test_format_issue(self):
+        diag = ProviderApiErrorDiagnostic(
+            subcategory="auth",
+            transient=False,
+            dominant_status=401,
+            total_requests=4,
+            failed_requests=4,
+            fingerprint="auth:401",
+        )
+        line = diag.format_issue("some-task")
+        assert "auth/permanent" in line and "HTTP 401" in line and "4/4" in line
+
+
+class TestRetryPolicy:
+    def test_transient_api_error_retries(self):
+        cfg = RetryConfig()
+        assert cfg.should_retry(
+            "provider api error [rate_limit/transient] HTTP 429 on 3/3 requests",
+            category=API_ERROR,
+        )
+
+    def test_permanent_api_error_does_not_retry(self):
+        cfg = RetryConfig()
+        assert not cfg.should_retry(
+            "provider api error [auth/permanent] HTTP 401 on 2/2 requests",
+            category=API_ERROR,
+        )
+
+    def test_suspected_never_retries(self):
+        cfg = RetryConfig()
+        assert not cfg.should_retry(
+            "suspected provider api error: agent ended with zero tokens and zero tool calls",
+            category=SUSPECTED_API_ERROR,
+        )
+
+    def test_api_retry_can_be_disabled(self):
+        cfg = RetryConfig(retry_on_api_error=False)
+        assert not cfg.should_retry(
+            "provider api error [rate_limit/transient] HTTP 429",
+            category=API_ERROR,
+        )
+
+
+def _api_result(name: str, *, sub: str = "auth", status: int = 401) -> RunResult:
+    return RunResult(
+        task_name=name,
+        error=f"provider api error [{sub}/permanent] HTTP {status} on 1/1 requests",
+    )
+
+
+class TestCircuitBreaker:
+    def test_trips_on_same_fingerprint_streak(self):
+        breaker = ApiErrorCircuitBreaker(threshold=3)
+        for i in range(3):
+            assert not breaker.tripped
+            breaker.record(_api_result(f"t{i}"))
+        assert breaker.tripped
+        assert "auth:401" in breaker.skip_error()
+
+    def test_healthy_completion_resets_streak(self):
+        breaker = ApiErrorCircuitBreaker(threshold=3)
+        breaker.record(_api_result("t0"))
+        breaker.record(_api_result("t1"))
+        breaker.record(RunResult(task_name="ok", rewards={"reward": 0.0}))
+        breaker.record(_api_result("t2"))
+        breaker.record(_api_result("t3"))
+        assert not breaker.tripped
+
+    def test_different_fingerprints_do_not_accumulate(self):
+        breaker = ApiErrorCircuitBreaker(threshold=3)
+        breaker.record(_api_result("t0", sub="auth", status=401))
+        breaker.record(_api_result("t1", sub="quota", status=402))
+        breaker.record(_api_result("t2", sub="auth", status=401))
+        assert not breaker.tripped
+
+    def test_transient_api_error_is_not_breaker_relevant(self):
+        breaker = ApiErrorCircuitBreaker(threshold=2)
+        transient = RunResult(
+            task_name="t",
+            error="provider api error [rate_limit/transient] HTTP 429 on 1/1 requests",
+        )
+        breaker.record(transient)
+        breaker.record(transient)
+        assert not breaker.tripped
+
+    def test_suspected_counts_with_own_fingerprint(self):
+        breaker = ApiErrorCircuitBreaker(threshold=2)
+        suspected = RunResult(
+            task_name="t",
+            error=(
+                "suspected provider api error: agent ended with zero tokens "
+                "and zero tool calls (no scoreable model activity)"
+            ),
+        )
+        breaker.record(suspected)
+        breaker.record(suspected)
+        assert breaker.tripped
+
+    def test_zero_threshold_disables(self):
+        breaker = ApiErrorCircuitBreaker(threshold=0)
+        for i in range(10):
+            breaker.record(_api_result(f"t{i}"))
+        assert not breaker.tripped