From a764557eeeb4b5144950d1b24b9e27563321376d Mon Sep 17 00:00:00 2001
From: Jared Lunde <jared.lunde@gmail.com>
Date: Sun, 31 May 2026 11:07:55 -0700
Subject: [PATCH 1/7] =?UTF-8?q?feat(ai):=20Beyond=20AI=20Gateway=20?=
 =?UTF-8?q?=E2=80=94=20data-driven=20providers,=20doc-verified,=20live=20s?=
 =?UTF-8?q?moke?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A centralized internal egress L7 proxy to LLM providers (Pingora + tokio). Apps
point their stock OpenAI/Anthropic SDK at it; the gateway authenticates, swaps in
the real provider key, relays the response untouched, and emits token-usage facts
for billing. Self-contained: no path deps into the beyond repo.

Auth branches on key format: bai_… is a stateless Ed25519-signed virtual key
(verify → deny-set check → swap to the pool key); anything else is BYO — the
user's own provider token, passed through unchanged.

Providers are data: a row in route::KNOWN_PROVIDERS (name, authority, base path,
auth scheme) or a config entry — adding an OpenAI-wire provider is one line, no
new code paths. Ships 10 known providers (openai, anthropic, openrouter,
fireworks, groq, deepseek, together, cerebras, mistral, xai), each with its
connection facts verified against the provider's official docs (cited inline in
route.rs). The client's /v1 prefix is rewritten to each provider's real mount
point (Groq /openai/v1, Fireworks /inference/v1, OpenRouter /api/v1) so a
verbatim passthrough can't 404.

Hardening: per-key rate guardrail (count-min, fixed memory), gap-free deny-set
seeding (resume-from-revision), optional on-disk snapshot for restart-before-NATS
enforcement, chunked-safe body-size cap, redacting/zeroizing Secret newtype,
TTL-cached async DNS, NATS-independent auth (fail-open deny-set).

Verification:
- 45 unit tests; e2e suite (real beyond-ai binary + real nats-server + mock
  upstream) covering managed key-swap, BYO passthrough, both dialects, usage
  metering, deny-set propagation, rate limiting, snapshot restart.
- Live smoke suite (tests/smoke.rs, mise run test:smoke): exercises the full
  managed path — Ed25519 verify → deny-check → key-swap → real TLS — against real
  providers, gated per API key (#[ignore] + key-presence). The Anthropic managed
  path is verified green against production.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .cargo/audit.toml                  |   32 +
 .gitignore                         |   14 +
 ARCHITECTURE.md                    |  158 +
 CLAUDE.md                          |   21 -
 Cargo.lock                         | 4466 ++++++++++++++++++++++++++++
 Cargo.toml                         |   37 +
 README.md                          |   78 +
 config.example.toml                |   55 +
 crates/gateway/Cargo.toml          |   62 +
 crates/gateway/benches/e2e.rs      |  162 +
 crates/gateway/benches/unit.rs     |  140 +
 crates/gateway/src/config.rs       |  211 ++
 crates/gateway/src/deny.rs         |  152 +
 crates/gateway/src/doctor.rs       |   61 +
 crates/gateway/src/error.rs        |   15 +
 crates/gateway/src/key.rs          |  354 +++
 crates/gateway/src/lib.rs          |   25 +
 crates/gateway/src/main.rs         |  132 +
 crates/gateway/src/metrics.rs      |   67 +
 crates/gateway/src/peek.rs         |  250 ++
 crates/gateway/src/proxy.rs        |  507 ++++
 crates/gateway/src/ratelimit.rs    |  111 +
 crates/gateway/src/route.rs        |  308 ++
 crates/gateway/src/secret.rs       |   77 +
 crates/gateway/src/state.rs        |  191 ++
 crates/gateway/src/store_watch.rs  |  310 ++
 crates/gateway/src/usage.rs        |  190 ++
 crates/gateway/tests/common/mod.rs |  517 ++++
 crates/gateway/tests/e2e.rs        |  663 +++++
 crates/gateway/tests/smoke.rs      |  202 ++
 mise.toml                          |   52 +
 31 files changed, 9599 insertions(+), 21 deletions(-)
 create mode 100644 .cargo/audit.toml
 create mode 100644 .gitignore
 create mode 100644 ARCHITECTURE.md
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 README.md
 create mode 100644 config.example.toml
 create mode 100644 crates/gateway/Cargo.toml
 create mode 100644 crates/gateway/benches/e2e.rs
 create mode 100644 crates/gateway/benches/unit.rs
 create mode 100644 crates/gateway/src/config.rs
 create mode 100644 crates/gateway/src/deny.rs
 create mode 100644 crates/gateway/src/doctor.rs
 create mode 100644 crates/gateway/src/error.rs
 create mode 100644 crates/gateway/src/key.rs
 create mode 100644 crates/gateway/src/lib.rs
 create mode 100644 crates/gateway/src/main.rs
 create mode 100644 crates/gateway/src/metrics.rs
 create mode 100644 crates/gateway/src/peek.rs
 create mode 100644 crates/gateway/src/proxy.rs
 create mode 100644 crates/gateway/src/ratelimit.rs
 create mode 100644 crates/gateway/src/route.rs
 create mode 100644 crates/gateway/src/secret.rs
 create mode 100644 crates/gateway/src/state.rs
 create mode 100644 crates/gateway/src/store_watch.rs
 create mode 100644 crates/gateway/src/usage.rs
 create mode 100644 crates/gateway/tests/common/mod.rs
 create mode 100644 crates/gateway/tests/e2e.rs
 create mode 100644 crates/gateway/tests/smoke.rs
 create mode 100644 mise.toml

diff --git a/.cargo/audit.toml b/.cargo/audit.toml
new file mode 100644
index 0000000..d561358
--- /dev/null
+++ b/.cargo/audit.toml
@@ -0,0 +1,32 @@
+# cargo-audit configuration.
+#
+# The advisories ignored below are ALL transitively pinned by dependencies we cannot bump from this
+# repo, and each has been assessed for actual exposure. They are listed individually (not blanket
+# `informational` suppression) so a NEW advisory still fails the audit. Re-evaluate when the upstream
+# pins move — chiefly when `pingora` publishes past 0.8 (0.8 is the latest published release as of
+# this writing) and when `beyond-slipstream` relaxes its `async-nats ^0.46` requirement.
+[advisories]
+ignore = [
+  # rustls-webpki 0.102.8: reachable panic in CRL parsing; name-constraint acceptance bugs (URI /
+  # wildcard); CRL distribution-point matching. Pulled ONLY by async-nats 0.46 (pinned by
+  # beyond-slipstream `^0.46`), used for the NATS/slipstream control-channel TLS — NOT the client-
+  # or provider-facing TLS, which already resolve the patched rustls-webpki 0.103.13. Blast radius
+  # is limited to MITM of the deny-set channel, which is fail-open and carries only deny entries.
+  # Fix path: a beyond-slipstream release on async-nats >= 0.47 (uses rustls-webpki 0.103+).
+  "RUSTSEC-2026-0104",
+  "RUSTSEC-2026-0098",
+  "RUSTSEC-2026-0099",
+  "RUSTSEC-2026-0049",
+
+  # protobuf 2.28.0: DoS via uncontrolled recursion when PARSING protobuf. Pulled by prometheus
+  # 0.13 (both our direct dep — kept at 0.13 to share pingora-core's default registry — and
+  # pingora-core 0.8 itself). We never parse untrusted protobuf: metrics are exposed in the text
+  # exposition format via pingora's prometheus_http_service. Fix path: pingora past 0.8 (drops the
+  # prometheus 0.13 / protobuf 2.x chain).
+  "RUSTSEC-2024-0437",
+
+  # Unmaintained-crate warnings (no known vulnerability), all transitive (pingora 0.8 / async-nats):
+  "RUSTSEC-2025-0134", # rustls-pemfile (via rustls-native-certs <- pingora-rustls / async-nats)
+  "RUSTSEC-2025-0069", # daemonize (via pingora-core)
+  "RUSTSEC-2024-0388", # derivative (via a pingora dependency)
+]
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..18aa6a2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+target/
+dist/
+**/*.rs.bk
+.env
+.env.*
+!.env.example
+.claude/settings.local.json
+.sqlx
+.wiki
+node_modules/
+bench/out/
+.mcp.json
+.claude
+.env
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
new file mode 100644
index 0000000..e307e66
--- /dev/null
+++ b/ARCHITECTURE.md
@@ -0,0 +1,158 @@
+# Beyond AI Gateway — Architecture
+
+A centralized, internal **egress L7 proxy** to LLM providers, built on **Pingora** + tokio. Apps point their stock
+OpenAI/Anthropic SDK at it; the gateway authenticates, swaps in the real provider key, relays the
+response untouched, and emits token-usage facts for billing.
+
+**Self-contained:** no `path` deps into the `beyond` repo. Depends only on crates.io + the published
+`beyond-slipstream` — so it clones/CI-builds/publishes anywhere.
+
+## Request flow (`proxy.rs`)
+
+```
+client (stock SDK, Bearer / x-api-key)
+   │
+   ▼ request_filter
+   ├─ provider = dialect(path) [+ x-beyond-provider override]   (unknown → 400)
+   ├─ extract key
+   ├─ Content-Length abuse guard (declared size; streamed total enforced in body filter too)
+   ├─ key format branch:
+   │    • bai_…  → MANAGED: Ed25519 verify (stateless) → {tenant_id, vpc_id}
+   │              → deny-set check (O(1), default-allow) → require pool key
+   │    • else   → BYO: the user's own provider token, passed through unchanged
+   └─ per-key rate guardrail (tenant / BYO-token hash; over ceiling → 429)
+   ▼ upstream_peer        — TTL-cached DNS resolve → HttpPeer (no blocking getaddrinfo)
+   ▼ upstream_request_filter — managed: swap auth header to pool key; BYO: leave it. Set Host.
+   ▼ request_body_filter  — STREAM BODY THROUGH (never buffered); feed bytes to a structural
+   │                         scanner that extracts the exact root-level `model` (O(1), memchr-fast);
+   │                         enforce the body cap on the running total (chunked-safe)
+   ▼ response_filter      — TTFT; streaming? = response Content-Type is text/event-stream
+   ▼ response_body_filter — relay unbuffered; keep a bounded 64KB tail for the usage tap
+   ▼ logging              — parse usage from tail (by dialect+streaming); emit `ai.usage` fact
+        upstream: a registered provider (openai, anthropic, openrouter, fireworks,
+                  groq, deepseek, together, cerebras, mistral, xai — + config-added)
+```
+
+## What lives where
+
+- **NATS / slipstream:** exactly one thing — the **deny-set** (`blackhole.{tenant}`). Watched,
+  fail-open. Auth and keys do **not** depend on NATS.
+- **Config (boot, SSM/env):** `signing_keys` (Ed25519 **public** keys by kid — multiple for
+  rotation), `pool_keys` (managed pool keys **by provider name**, from `AI_POOL_KEY_<NAME>` env),
+  `provider_authorities` (per-name authority overrides / additions), `rate_limit_rps` (per-key
+  request ceiling; 0 disables), `snapshot_path` (optional on-disk deny-set cache; see below),
+  timeouts. Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret`, so a stray
+  `Debug`/`Serialize` of the config can't leak them. See `config.example.toml`.
+- **The virtual key (`bai_v1.{kid}.{payload}.{sig}`):** Ed25519-signed, payload = `{tenant_id,
+  vpc_id}`, verified with a public key — stateless, no lookup. Minted by the control plane (it holds
+  the private key); a compromised/OSS gateway can verify but not mint.
+
+## Key invariants
+
+- **Managed vs BYO by key format.** `bai_…` → verify + swap to pool key. Anything else → the user's
+  real token, passed through (no swap, no deny-set, no per-tenant attribution).
+- **Request body is never buffered** — it streams through with original framing; a streaming
+  structural scanner (`peek::ModelScanner`, O(1) memory, SIMD `memchr` skip over big values)
+  extracts the exact root-level `model`. (Trade-off: the body is never rewritten, so OpenAI
+  streaming without `stream_options.include_usage` isn't metered — the SDK/platform can set it.)
+- **Response is never buffered** — relayed chunk-by-chunk; a bounded 64KB tail feeds the usage tap.
+- **Deny-set is `O(denied)`, default-allow, fail-open.** Restore = explicit delete or TTL expiry.
+  Seeding is **gap-free**: the seed records the stream revision it reflects, and the watch _resumes
+  from that revision_ (`watch_prefix_from`) rather than starting live — so a deny entry written in
+  the window between seeding and the watch attaching can't be lost (a plain `watch_prefix` uses NATS
+  `DeliverPolicy::New` and would silently drop it). The resume revision is kept across reconnects, so
+  a NATS blip resumes from where it left off instead of re-scanning.
+- **Deny-set seeding has two modes (`snapshot_path`).** Unset (ephemeral/Fargate): scan
+  `blackhole.*` from NATS each cold boot. Set (edge/tunnel, durable disk): load slipstream's on-disk
+  snapshot (entries + saved cursor), enforce immediately on restart **before NATS reconnects**, and
+  append each applied delta back to the file. The snapshot is a pure cache — delete it and the
+  gateway falls back to scanning; a `CursorExpired` (history compacted past the cursor) does the same.
+- **Auth works without NATS** (keys from config); a NATS outage only lets the deny-set go stale.
+- **Per-key rate guardrail, not a spend control.** The deny-set is the spend/fraud authority but
+  reacts on a lag and never sees floods that don't bill (auth failures, 4xx, BYO). A fixed-memory
+  count-min limiter (`ratelimit`, pingora-limits) caps a single tenant's / BYO caller's request
+  velocity — bounding a leaked/runaway key during deny-set lag and a retry-storm flood. Generous by
+  default (a circuit breaker, not a quota); `rate_limit_rps = 0` disables it.
+- **Routing is dialect-based** (model isn't known before peer selection); any non-default provider
+  is reached via the `x-beyond-provider: <name>` header. **Providers are data** — a row in
+  `route::KNOWN_PROVIDERS` (name, authority, **base path**, auth scheme) or a config entry — so
+  adding an OpenAI-wire provider is one line, no new code paths. Each row's connection facts are
+  **verified against the provider's official docs (cited inline in `route.rs`)**; the client's `/v1`
+  prefix is rewritten to the provider's mount point (Groq `/openai/v1`, Fireworks `/inference/v1`,
+  OpenRouter `/api/v1`) so a verbatim passthrough can't 404.
+- **Connect retries only** (`fail_to_connect`); no HTTP-status retry (Pingora-idiomatic, SDKs back off).
+- **Pricing is never here** — emit token _facts_; a closed downstream consumer prices.
+
+## Modules
+
+| Module                    | Role                                                                        | Tested        |
+| ------------------------- | --------------------------------------------------------------------------- | ------------- |
+| `key`                     | `bai_v1` parse + Ed25519 verify + mint; stateless identity                  | unit ✓        |
+| `route`                   | data-driven provider table (name/authority/auth) + dialect default          | unit ✓        |
+| `peek`                    | `ModelScanner` — streaming structural scan for the exact root-level `model` | unit ✓        |
+| `usage`                   | token extraction (OpenAI/Anthropic, body + SSE)                             | unit ✓        |
+| `deny`                    | sparse deny-set, default-allow, reason → status                             | unit ✓        |
+| `ratelimit`               | per-key request guardrail (count-min, fixed memory, no GC)                  | unit ✓        |
+| `secret`                  | redacting, zeroize-on-drop `Secret` newtype                                 | unit ✓        |
+| `config`                  | Figment config; build keyring; pool keys/authorities by provider name       | unit ✓        |
+| `state`                   | keyring + resolved provider registry + watched deny-set + TTL DNS cache     | unit ✓        |
+| `store_watch`             | the single NATS watcher (deny-set), as a Pingora `BackgroundService`        | —             |
+| `proxy`                   | the `ProxyHttp` impl                                                        | e2e ✓         |
+| `metrics`/`doctor`/`main` | Prometheus, diagnostics, bootstrap                                          | e2e/compile ✓ |
+
+## Verification
+
+- **Unit (`cargo test --lib`):** key, route, peek, usage, deny, ratelimit, secret, config, state.
+  `clippy --all-targets -D warnings` clean.
+- **End-to-end (`tests/e2e.rs`, `mise run test:integration:rs`):** real `beyond-ai` binary + real
+  nats-server + mock upstream. Covers managed key-swap + passthrough fidelity + usage metering
+  (OpenAI JSON + SSE, **Anthropic `/v1/messages`** with `x-api-key` swap + metering), **BYO
+  passthrough** (raw token unchanged), the **virtual key in either inbound header** (`Bearer` or
+  `x-api-key`), and deny-set propagation: spend (write `blackhole.{tenant}` → 402, delete → 200) and
+  **fraud** (→ 403). Error/edge paths: **missing key → 401**, **oversized `Content-Length` → 413**,
+  **managed key for an unconfigured provider → 503**, **streaming tail compaction** (>128KB before
+  the usage chunk still meters), **deny-set fail-open** (kill NATS → stale set retained, auth still
+  works), and **on-disk snapshot survival** (blackhole a tenant, restart with NATS down → the hold is
+  still enforced from disk). Managed/BYO/streaming seed **nothing** in NATS (signkey/pool keys from
+  config), demonstrating auth's independence from NATS.
+- **Live smoke (`tests/smoke.rs`, `mise run test:smoke`):** the real `beyond-ai` binary against the
+  **real** provider hosts over TLS, one per provider in `KNOWN_PROVIDERS`. Proves what docs and the
+  mock can't — real TLS/SNI, the `/v1`→base-path rewrite landing on a live mount (200, not 404), and
+  auth passthrough. Traffic is BYO (the env key forwarded as the caller's token). Doubly guarded:
+  every test is `#[ignore]` (a plain `cargo test` skips them) **and** skips unless its provider's API
+  key env var (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GROQ_API_KEY`, …) is set — so CI stays
+  hermetic and you only hit providers you have keys for.
+
+## Benchmarking
+
+Two harnesses, best-tool-per-job, mirroring the unit/e2e split of the tests:
+
+- **Unit micro (`benches/unit.rs`, `mise run bench:unit`) — `divan`.** Times the IO-free hot paths
+  (`key` verify/mint, `peek::ModelScanner` over 0/4KB/256KB bodies with `model` placed _last_ =
+  worst case, `usage` parsers, `route`, `deny`) **and** measures allocations natively: divan's
+  `AllocProfiler` (installed as the global allocator) reports alloc/dealloc/grow **count + bytes**
+  beside ns/iter, no extra plumbing — and stays clear of the crate's `#![deny(unsafe_code)]` (a
+  hand-rolled `GlobalAlloc` would need `unsafe impl`). This makes the design's allocation claims
+  _assertable_: `key/verify` shows **0 allocs** (stack-only decode — divan omits the alloc rows
+  entirely), `peek` a flat **1 alloc** independent of body size (the O(1)-memory claim),
+  `route`/`deny::parse_key` **0 allocs**. A regression surfaces as a non-zero / grown number.
+- **A-1 end-to-end (`benches/e2e.rs`, `mise run bench:e2e`) — `criterion`.** The real `beyond-ai`
+  binary + real nats-server + mock upstream (reuses `tests/common` verbatim), driven over real HTTP
+  — measures the whole request path: single-request latency + concurrent throughput. criterion is
+  chosen here for its saved-baseline comparison (`--save-baseline`), which tracks latency/RPS drift
+  across runs. Allocations are _not_ measured (the gateway is a separate process — its heap is
+  invisible to the bench); that's the unit bench's job. Needs `nats-server` on PATH (mise provides
+  it).
+
+`mise run bench` runs both.
+
+## Out of scope / deferred
+
+- **Go control plane** (mint/inject virtual keys, write deny entries) — separate workstream; the
+  e2e mints keys directly.
+- **OpenAI `stream_options` injection** — dropped to keep the request body a pure passthrough.
+- **HTTP 5xx/429 response retries + `Retry-After`** — non-idiomatic in Pingora 0.8; SDKs back off.
+- **Trickle/cancel e2e** — SSE relay is covered; incremental-timing/cancel assertions are flaky.
+- Cross-dialect IR translation; caching; guardrails; ClickHouse ingestion wiring (table exists).
+- **Anthropic streaming input tokens** can sit in `message_start` (response head) outside the 64KB
+  usage tail on very long streams — a pre-existing tap limitation.
diff --git a/CLAUDE.md b/CLAUDE.md
index b7a270e..67e4bef 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -51,24 +51,3 @@ Apply the **Theory of Constraints**: a system's throughput is limited by its sin
 5. **Repeat.** The bottleneck has shifted. Go back to step 1.
 
 The corollary: if you can't name the current constraint, you aren't ready to optimize.
-
-<!-- wiki-managed:start (managed by `wiki claude install`; edits inside this block will be overwritten) -->
-
-## Wiki
-
-This repo uses [agent-wiki](.wiki/): `.wiki/` indexes repo markdown docs and code symbols into a queryable knowledge graph.
-
-**Read the wiki before grepping the codebase or reading ARCHITECTURE.md.** Pages are pre-indexed — searching them is faster and ~5–10× cheaper than re-deriving from raw files.
-
-Wiki tools — pick based on what you need:
-
-- `wiki_query "<term>"` — first move for any specific question. BM25++ over repo docs and code symbols; returns ranked hits with paths, scores, and inline snippets.
-- `wiki_answer "<question>"` — returns top-ranked pages with query-relevant passage extracts in one round-trip. Best when you expect the answer exists and want it immediately.
-- `wiki_read "path/to/page.md"` (optionally `section: "..."` or `paths: [...]`) — full page, one section, or multiple pages in one call.
-- `wiki_search_code "<query>"` — search exported symbols, signatures, and doc comments when you need to locate a declaration or understand an API.
-- `wiki_usage_examples "<symbol>"` — real call sites with surrounding source code. Use before changing a function (to see every calling convention you must preserve) or when learning how an unfamiliar API is actually used.
-- `wiki_impact "<symbol>"` — blast radius: every symbol that transitively calls this one, ranked by hop distance. Use before refactoring or renaming to know what breaks.
-- `wiki_callees "<symbol>"` — outgoing call hierarchy (rust-analyzer equivalent): every function this symbol transitively calls, ranked by hop distance. Use when you need to understand what a function depends on before touching it — its DB calls, service calls, and abstractions.
-- `wiki_implementors "<symbol>"` — go-to-implementations (rust-analyzer equivalent): every concrete type that implements a trait or interface. Use when you need to know what's behind a trait object, or how many types a trait change will affect.
-
-<!-- wiki-managed:end -->
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..f3476ad
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,4466 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "addr2line"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+
+[[package]]
+name = "ahash"
+version = "0.8.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
+dependencies = [
+ "cfg-if",
+ "getrandom 0.3.4",
+ "once_cell",
+ "version_check",
+ "zerocopy",
+]
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "aliasable"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd"
+
+[[package]]
+name = "alloc-no-stdlib"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
+
+[[package]]
+name = "alloc-stdlib"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
+dependencies = [
+ "alloc-no-stdlib",
+]
+
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
+[[package]]
+name = "anstream"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
+
+[[package]]
+name = "anstyle-parse"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.102"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+
+[[package]]
+name = "arc-swap"
+version = "1.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
+[[package]]
+name = "asn1-rs"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048"
+dependencies = [
+ "asn1-rs-derive",
+ "asn1-rs-impl",
+ "displaydoc",
+ "nom",
+ "num-traits",
+ "rusticata-macros",
+ "thiserror 1.0.69",
+ "time",
+]
+
+[[package]]
+name = "asn1-rs-derive"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "synstructure",
+]
+
+[[package]]
+name = "asn1-rs-impl"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "async-nats"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df5af9ebfb0a14481d3eaf6101e6391261e4f30d25b26a7635ade8a39482ded0"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-util",
+ "memchr",
+ "nkeys",
+ "nuid",
+ "once_cell",
+ "pin-project",
+ "portable-atomic",
+ "rand 0.8.6",
+ "regex",
+ "ring",
+ "rustls-native-certs 0.7.3",
+ "rustls-pki-types",
+ "rustls-webpki 0.102.8",
+ "serde",
+ "serde_json",
+ "serde_nanos",
+ "serde_repr",
+ "thiserror 1.0.69",
+ "time",
+ "tokio",
+ "tokio-rustls",
+ "tokio-stream",
+ "tokio-util",
+ "tokio-websockets",
+ "tracing",
+ "tryhard",
+ "url",
+]
+
+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "atomic"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340"
+dependencies = [
+ "bytemuck",
+]
+
+[[package]]
+name = "atomic-waker"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+
+[[package]]
+name = "autocfg"
+version = "1.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53"
+
+[[package]]
+name = "aws-lc-rs"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ec2f1fc3ec205783a5da9a7e6c1509cc69dedf09a1949e412c1e18469326d00"
+dependencies = [
+ "aws-lc-sys",
+ "zeroize",
+]
+
+[[package]]
+name = "aws-lc-sys"
+version = "0.41.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a2f9779ce85b93ab6170dd940ad0169b5766ff848247aff13bb788b832fe3f4"
+dependencies = [
+ "cc",
+ "cmake",
+ "dunce",
+ "fs_extra",
+]
+
+[[package]]
+name = "backtrace"
+version = "0.3.76"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6"
+dependencies = [
+ "addr2line",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+ "windows-link",
+]
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "base64ct"
+version = "1.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06"
+
+[[package]]
+name = "beyond-ai"
+version = "0.1.0"
+dependencies = [
+ "arc-swap",
+ "async-trait",
+ "base64",
+ "beyond-slipstream",
+ "bytes",
+ "clap",
+ "criterion",
+ "divan",
+ "ed25519-dalek",
+ "figment",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "memchr",
+ "pingora",
+ "pingora-core",
+ "pingora-limits",
+ "pingora-proxy",
+ "prometheus",
+ "reqwest",
+ "rustls",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "tracing-subscriber",
+ "zeroize",
+]
+
+[[package]]
+name = "beyond-slipstream"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b07e54aae9b02cf7d2e9d935bd99cbc4a045f19d00738f069f44ba238a01600"
+dependencies = [
+ "async-nats",
+ "async-trait",
+ "base64",
+ "crc32fast",
+ "futures",
+ "serde_json",
+ "tempfile",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "url",
+]
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bitflags"
+version = "2.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
+
+[[package]]
+name = "blake2"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe"
+dependencies = [
+ "digest",
+]
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "brotli"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d640d25bc63c50fb1f0b545ffd80207d2e10a4c965530809b40ba3386825c391"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+ "brotli-decompressor",
+]
+
+[[package]]
+name = "brotli-decompressor"
+version = "2.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+]
+
+[[package]]
+name = "bstr"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
+dependencies = [
+ "memchr",
+ "regex-automata",
+ "serde",
+]
+
+[[package]]
+name = "bumpalo"
+version = "3.20.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649"
+
+[[package]]
+name = "bytemuck"
+version = "1.25.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "bytes"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
+[[package]]
+name = "cc"
+version = "1.2.63"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f"
+dependencies = [
+ "find-msvc-tools",
+ "jobserver",
+ "libc",
+ "shlex",
+]
+
+[[package]]
+name = "cf-rustracing"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6565523d8145e63e0cf1b397a5f1bd4e90d5652a7dffb2de8cec460ff23ef6b1"
+dependencies = [
+ "backtrace",
+ "rand 0.10.1",
+ "tokio",
+ "trackable",
+]
+
+[[package]]
+name = "cf-rustracing-jaeger"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16c0e4d8cce27f6a6eaff58d2b66f063a18b8ed0d6ef0947ae7a263afa3b7c08"
+dependencies = [
+ "cf-rustracing",
+ "hostname",
+ "local-ip-address",
+ "percent-encoding",
+ "rand 0.10.1",
+ "thrift_codec",
+ "tokio",
+ "trackable",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
+[[package]]
+name = "chacha20"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.3.0",
+ "rand_core 0.10.1",
+]
+
+[[package]]
+name = "chrono"
+version = "0.4.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
+[[package]]
+name = "clap"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+ "terminal_size",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
+dependencies = [
+ "heck 0.5.0",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "clap_lex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
+
+[[package]]
+name = "cmake"
+version = "0.1.58"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "colorchoice"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
+
+[[package]]
+name = "combine"
+version = "4.6.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd"
+dependencies = [
+ "bytes",
+ "memchr",
+]
+
+[[package]]
+name = "condtype"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf0a07a401f374238ab8e2f11a104d2851bf9ce711ec69804834de8af45c7af"
+
+[[package]]
+name = "const-oid"
+version = "0.9.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
+
+[[package]]
+name = "core-foundation"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "core-foundation"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "cpufeatures"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "crc32fast"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "futures",
+ "is-terminal",
+ "itertools",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "tokio",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-queue"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
+[[package]]
+name = "crypto-common"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+
+[[package]]
+name = "curve25519-dalek"
+version = "4.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.2.17",
+ "curve25519-dalek-derive",
+ "digest",
+ "fiat-crypto",
+ "rustc_version",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "curve25519-dalek-derive"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "daemonize"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab8bfdaacb3c887a54d41bdf48d3af8873b3f5566469f8ba21b92057509f116e"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "daggy"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70def8d72740e44d9f676d8dab2c933a236663d86dd24319b57a2bed4d694774"
+dependencies = [
+ "petgraph",
+]
+
+[[package]]
+name = "darling"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee"
+dependencies = [
+ "darling_core",
+ "darling_macro",
+]
+
+[[package]]
+name = "darling_core"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e"
+dependencies = [
+ "fnv",
+ "ident_case",
+ "proc-macro2",
+ "quote",
+ "strsim",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "darling_macro"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
+dependencies = [
+ "darling_core",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "data-encoding"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8"
+
+[[package]]
+name = "der"
+version = "0.7.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
+dependencies = [
+ "const-oid",
+ "pem-rfc7468",
+ "zeroize",
+]
+
+[[package]]
+name = "der-parser"
+version = "9.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553"
+dependencies = [
+ "asn1-rs",
+ "displaydoc",
+ "nom",
+ "num-bigint",
+ "num-traits",
+ "rusticata-macros",
+]
+
+[[package]]
+name = "deranged"
+version = "0.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
+dependencies = [
+ "powerfmt",
+ "serde_core",
+]
+
+[[package]]
+name = "derivative"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "derive_builder"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947"
+dependencies = [
+ "derive_builder_macro",
+]
+
+[[package]]
+name = "derive_builder_core"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8"
+dependencies = [
+ "darling",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "derive_builder_macro"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
+dependencies = [
+ "derive_builder_core",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+ "subtle",
+]
+
+[[package]]
+name = "displaydoc"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "divan"
+version = "0.1.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a405457ec78b8fe08b0e32b4a3570ab5dff6dd16eb9e76a5ee0a9d9cbd898933"
+dependencies = [
+ "cfg-if",
+ "clap",
+ "condtype",
+ "divan-macros",
+ "libc",
+ "regex-lite",
+]
+
+[[package]]
+name = "divan-macros"
+version = "0.1.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9556bc800956545d6420a640173e5ba7dfa82f38d3ea5a167eb555bc69ac3323"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "dunce"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
+
+[[package]]
+name = "ed25519"
+version = "2.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
+dependencies = [
+ "pkcs8",
+ "signature",
+]
+
+[[package]]
+name = "ed25519-dalek"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9"
+dependencies = [
+ "curve25519-dalek",
+ "ed25519",
+ "serde",
+ "sha2",
+ "signature",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "either"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
+name = "errno"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "fastrand"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
+
+[[package]]
+name = "fiat-crypto"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
+
+[[package]]
+name = "figment"
+version = "0.10.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8cb01cd46b0cf372153850f4c6c272d9cbea2da513e07538405148f95bd789f3"
+dependencies = [
+ "atomic",
+ "pear",
+ "serde",
+ "toml",
+ "uncased",
+ "version_check",
+]
+
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
+
+[[package]]
+name = "fixedbitset"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+
+[[package]]
+name = "flate2"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
+dependencies = [
+ "crc32fast",
+ "libz-ng-sys",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
+[[package]]
+name = "form_urlencoded"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
+dependencies = [
+ "percent-encoding",
+]
+
+[[package]]
+name = "fs_extra"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
+
+[[package]]
+name = "futures"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-io"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"
+
+[[package]]
+name = "futures-macro"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
+
+[[package]]
+name = "futures-task"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
+
+[[package]]
+name = "futures-util"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "slab",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "wasi",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "r-efi 5.3.0",
+ "wasip2",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 6.0.0",
+ "rand_core 0.10.1",
+ "wasip2",
+ "wasip3",
+]
+
+[[package]]
+name = "getset"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9cf0fc11e47561d47397154977bc219f4cf809b2974facc3ccb3b89e2436f912"
+dependencies = [
+ "proc-macro-error2",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "gimli"
+version = "0.32.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7"
+
+[[package]]
+name = "h2"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "indexmap 2.14.0",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+
+[[package]]
+name = "half"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "zerocopy",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+dependencies = [
+ "foldhash 0.1.5",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.17.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
+]
+
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
+[[package]]
+name = "hex"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
+
+[[package]]
+name = "hostname"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "windows-link",
+]
+
+[[package]]
+name = "http"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0"
+dependencies = [
+ "bytes",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "httparse"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
+[[package]]
+name = "hyper"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "http",
+ "http-body",
+ "httparse",
+ "httpdate",
+ "itoa",
+ "pin-project-lite",
+ "smallvec",
+ "tokio",
+ "want",
+]
+
+[[package]]
+name = "hyper-rustls"
+version = "0.27.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f"
+dependencies = [
+ "http",
+ "hyper",
+ "hyper-util",
+ "rustls",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-util"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "ipnet",
+ "libc",
+ "percent-encoding",
+ "pin-project-lite",
+ "socket2",
+ "tokio",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "icu_collections"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c"
+dependencies = [
+ "displaydoc",
+ "potential_utf",
+ "utf8_iter",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locale_core"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4"
+dependencies = [
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38"
+
+[[package]]
+name = "icu_properties"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de"
+dependencies = [
+ "icu_collections",
+ "icu_locale_core",
+ "icu_properties_data",
+ "icu_provider",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14"
+
+[[package]]
+name = "icu_provider"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421"
+dependencies = [
+ "displaydoc",
+ "icu_locale_core",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
+[[package]]
+name = "ident_case"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
+
+[[package]]
+name = "idna"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
+dependencies = [
+ "idna_adapter",
+ "smallvec",
+ "utf8_iter",
+]
+
+[[package]]
+name = "idna_adapter"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714"
+dependencies = [
+ "icu_normalizer",
+ "icu_properties",
+]
+
+[[package]]
+name = "indexmap"
+version = "1.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
+dependencies = [
+ "autocfg",
+ "hashbrown 0.12.3",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.17.1",
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "inlinable_string"
+version = "0.1.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8fae54786f62fb2918dcfae3d568594e50eb9b5c25bf04371af6fe7516452fb"
+
+[[package]]
+name = "ipnet"
+version = "2.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
+
+[[package]]
+name = "is-terminal"
+version = "0.4.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+
+[[package]]
+name = "jni"
+version = "0.22.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5efd9a482cf3a427f00d6b35f14332adc7902ce91efb778580e180ff90fa3498"
+dependencies = [
+ "cfg-if",
+ "combine",
+ "jni-macros",
+ "jni-sys",
+ "log",
+ "simd_cesu8",
+ "thiserror 2.0.18",
+ "walkdir",
+ "windows-link",
+]
+
+[[package]]
+name = "jni-macros"
+version = "0.22.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a00109accc170f0bdb141fed3e393c565b6f5e072365c3bd58f5b062591560a3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "rustc_version",
+ "simd_cesu8",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "jni-sys"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2"
+dependencies = [
+ "jni-sys-macros",
+]
+
+[[package]]
+name = "jni-sys-macros"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
+dependencies = [
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom 0.3.4",
+ "libc",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11"
+dependencies = [
+ "cfg-if",
+ "futures-util",
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
+[[package]]
+name = "libc"
+version = "0.2.186"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
+
+[[package]]
+name = "libz-ng-sys"
+version = "1.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be734b33b7bc6a42d92d23e25e69758f866cf564a88d0bf80866fcf5a52c2255"
+dependencies = [
+ "cmake",
+ "libc",
+]
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
+
+[[package]]
+name = "litemap"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
+
+[[package]]
+name = "local-ip-address"
+version = "0.6.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa08fb2b1ec3ea84575e94b489d06d4ce0cbf052d12acd515838f50e3c3d63e3"
+dependencies = [
+ "libc",
+ "neli",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
+[[package]]
+name = "log"
+version = "0.4.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
+
+[[package]]
+name = "lru"
+version = "0.16.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
+dependencies = [
+ "hashbrown 0.16.1",
+]
+
+[[package]]
+name = "lru-slab"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
+
+[[package]]
+name = "matchers"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
+dependencies = [
+ "regex-automata",
+]
+
+[[package]]
+name = "memchr"
+version = "2.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+
+[[package]]
+name = "memoffset"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
+dependencies = [
+ "adler2",
+ "simd-adler32",
+]
+
+[[package]]
+name = "mio"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda"
+dependencies = [
+ "libc",
+ "wasi",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "neli"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22f9786d56d972959e1408b6a93be6af13b9c1392036c5c1fafa08a1b0c6ee87"
+dependencies = [
+ "bitflags 2.11.1",
+ "byteorder",
+ "derive_builder",
+ "getset",
+ "libc",
+ "log",
+ "neli-proc-macros",
+ "parking_lot",
+]
+
+[[package]]
+name = "neli-proc-macros"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05d8d08c6e98f20a62417478ebf7be8e1425ec9acecc6f63e22da633f6b71609"
+dependencies = [
+ "either",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "nix"
+version = "0.24.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa52e972a9a719cecb6864fb88568781eb706bac2cd1d4f04a648542dbf78069"
+dependencies = [
+ "bitflags 1.3.2",
+ "cfg-if",
+ "libc",
+ "memoffset",
+]
+
+[[package]]
+name = "nkeys"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
+dependencies = [
+ "data-encoding",
+ "ed25519",
+ "ed25519-dalek",
+ "getrandom 0.2.17",
+ "log",
+ "rand 0.8.6",
+ "signatory",
+]
+
+[[package]]
+name = "no_debug"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f23a60c850e1144fc1dd9435152e0cfdc7dd18725350b4243584118013a52a4"
+
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
+[[package]]
+name = "nu-ansi-term"
+version = "0.50.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "nuid"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
+dependencies = [
+ "rand 0.8.6",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-conv"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441"
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "object"
+version = "0.37.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "oid-registry"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9"
+dependencies = [
+ "asn1-rs",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+
+[[package]]
+name = "openssl-probe"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
+
+[[package]]
+name = "openssl-probe"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
+
+[[package]]
+name = "ouroboros"
+version = "0.18.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59"
+dependencies = [
+ "aliasable",
+ "ouroboros_macro",
+ "static_assertions",
+]
+
+[[package]]
+name = "ouroboros_macro"
+version = "0.18.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c7028bdd3d43083f6d8d4d5187680d0d3560d54df4cc9d752005268b41e64d0"
+dependencies = [
+ "heck 0.4.1",
+ "proc-macro2",
+ "proc-macro2-diagnostics",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "parking_lot"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
+
+[[package]]
+name = "parking_lot_core"
+version = "0.9.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec",
+ "windows-link",
+]
+
+[[package]]
+name = "pear"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bdeeaa00ce488657faba8ebf44ab9361f9365a97bd39ffb8a60663f57ff4b467"
+dependencies = [
+ "inlinable_string",
+ "pear_codegen",
+ "yansi",
+]
+
+[[package]]
+name = "pear_codegen"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bab5b985dc082b345f812b7df84e1bef27e7207b39e448439ba8bd69c93f147"
+dependencies = [
+ "proc-macro2",
+ "proc-macro2-diagnostics",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "pem-rfc7468"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
+dependencies = [
+ "base64ct",
+]
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
+
+[[package]]
+name = "petgraph"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
+dependencies = [
+ "fixedbitset",
+ "indexmap 2.14.0",
+]
+
+[[package]]
+name = "pin-project"
+version = "1.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
+
+[[package]]
+name = "pingora"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "844a13b16e556293f4ea96dc5ac0923ac6f36855a9dfc13b640d0da183f6b5b7"
+dependencies = [
+ "pingora-cache",
+ "pingora-core",
+ "pingora-http",
+ "pingora-load-balancing",
+ "pingora-proxy",
+ "pingora-timeout",
+]
+
+[[package]]
+name = "pingora-cache"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c59d8c4c939a3a193a3da0e061aa7acf7432431f92ee62a26f5a9e5167a0ade2"
+dependencies = [
+ "ahash",
+ "async-trait",
+ "blake2",
+ "bstr",
+ "bytes",
+ "cf-rustracing",
+ "cf-rustracing-jaeger",
+ "hex",
+ "http",
+ "httparse",
+ "httpdate",
+ "indexmap 1.9.3",
+ "log",
+ "lru",
+ "once_cell",
+ "parking_lot",
+ "pingora-core",
+ "pingora-error",
+ "pingora-header-serde",
+ "pingora-http",
+ "pingora-lru",
+ "pingora-timeout",
+ "rand 0.8.6",
+ "regex",
+ "rmp",
+ "rmp-serde",
+ "serde",
+ "strum",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-core"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08973c4853cef4c682f7a592907e81a32dcad69476c4846e5de079f16448b177"
+dependencies = [
+ "ahash",
+ "async-trait",
+ "brotli",
+ "bstr",
+ "bytes",
+ "chrono",
+ "clap",
+ "daemonize",
+ "daggy",
+ "derivative",
+ "flate2",
+ "futures",
+ "h2",
+ "http",
+ "httparse",
+ "httpdate",
+ "libc",
+ "log",
+ "nix",
+ "once_cell",
+ "openssl-probe 0.1.6",
+ "ouroboros",
+ "parking_lot",
+ "percent-encoding",
+ "pingora-error",
+ "pingora-http",
+ "pingora-pool",
+ "pingora-runtime",
+ "pingora-rustls",
+ "pingora-timeout",
+ "prometheus",
+ "rand 0.8.6",
+ "regex",
+ "serde",
+ "serde_yaml",
+ "sfv",
+ "socket2",
+ "strum",
+ "strum_macros",
+ "tokio",
+ "tokio-stream",
+ "tokio-test",
+ "unicase",
+ "windows-sys 0.59.0",
+ "x509-parser",
+ "zstd",
+]
+
+[[package]]
+name = "pingora-error"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9fa97a500e7e5c27a7b8609b9294c8922c9656322285268bfad9520f12feb38"
+
+[[package]]
+name = "pingora-header-serde"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2705feb8b50d4e734e0c7d3879aa040e655a45656276323ff530e254585dd816"
+dependencies = [
+ "bytes",
+ "http",
+ "httparse",
+ "pingora-error",
+ "pingora-http",
+ "thread_local",
+ "zstd",
+ "zstd-safe",
+]
+
+[[package]]
+name = "pingora-http"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fbb52d4651b687fab6abf669539cfd97b7cd94b301fde8f57c63354f9c9cc5e2"
+dependencies = [
+ "bytes",
+ "http",
+ "pingora-error",
+]
+
+[[package]]
+name = "pingora-ketama"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0286fb5a0359dca1e2e137dfe14ca4d94f676635a5eae4616bb3d8d4ce06d120"
+dependencies = [
+ "crc32fast",
+]
+
+[[package]]
+name = "pingora-limits"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7568624fc0e2f11fa32d27053ac862048b40bad98140b07a11d82f1b4989700"
+dependencies = [
+ "ahash",
+]
+
+[[package]]
+name = "pingora-load-balancing"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2606e9e22e72927a69772cefe56b0d41d251c3ffdfcd548a6020fe157fb79ad"
+dependencies = [
+ "arc-swap",
+ "async-trait",
+ "derivative",
+ "fnv",
+ "futures",
+ "http",
+ "log",
+ "pingora-core",
+ "pingora-error",
+ "pingora-http",
+ "pingora-ketama",
+ "pingora-runtime",
+ "rand 0.8.6",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-lru"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91bb5030596a3d442c0866ac68afe29c14ba558e77c726dcdf7016b0dbb359d9"
+dependencies = [
+ "arrayvec",
+ "hashbrown 0.17.1",
+ "parking_lot",
+ "rand 0.8.6",
+]
+
+[[package]]
+name = "pingora-pool"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67f034be36772f318370d058913db43dbd22c3763ad974c995ba2e4afb2bb52a"
+dependencies = [
+ "crossbeam-queue",
+ "log",
+ "lru",
+ "parking_lot",
+ "pingora-timeout",
+ "thread_local",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-proxy"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e1e070a98a70d0d05f2fdcfb706237e06a043b2fbc9261e8772a3459cc2175e"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "clap",
+ "futures",
+ "h2",
+ "http",
+ "log",
+ "once_cell",
+ "pingora-cache",
+ "pingora-core",
+ "pingora-error",
+ "pingora-http",
+ "rand 0.8.6",
+ "regex",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-runtime"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e371315b1c44c2e5a8788fdc61577527b785e121e6ff49144755f40d86511430"
+dependencies = [
+ "once_cell",
+ "rand 0.8.6",
+ "thread_local",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-rustls"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "239b663618bb822ddeddaf6d8384177a8ab226cb22febc627a72c2fd55e7bb75"
+dependencies = [
+ "log",
+ "no_debug",
+ "pingora-error",
+ "ring",
+ "rustls",
+ "rustls-native-certs 0.7.3",
+ "rustls-pemfile",
+ "rustls-pki-types",
+ "tokio-rustls",
+]
+
+[[package]]
+name = "pingora-timeout"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a853fee5ce510a7f5db2561f99c752724112ed13fc3820e70d462d278d704ea"
+dependencies = [
+ "once_cell",
+ "parking_lot",
+ "pin-project-lite",
+ "thread_local",
+ "tokio",
+]
+
+[[package]]
+name = "pkcs8"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
+dependencies = [
+ "der",
+ "spki",
+]
+
+[[package]]
+name = "pkg-config"
+version = "0.3.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
+
+[[package]]
+name = "plotters"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "plotters-backend"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
+
+[[package]]
+name = "plotters-svg"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
+dependencies = [
+ "plotters-backend",
+]
+
+[[package]]
+name = "portable-atomic"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
+
+[[package]]
+name = "potential_utf"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564"
+dependencies = [
+ "zerovec",
+]
+
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "proc-macro-error-attr2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
+name = "proc-macro-error2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802"
+dependencies = [
+ "proc-macro-error-attr2",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "proc-macro2-diagnostics"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "version_check",
+ "yansi",
+]
+
+[[package]]
+name = "prometheus"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+dependencies = [
+ "cfg-if",
+ "fnv",
+ "lazy_static",
+ "memchr",
+ "parking_lot",
+ "protobuf",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "protobuf"
+version = "2.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
+
+[[package]]
+name = "quinn"
+version = "0.11.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
+dependencies = [
+ "bytes",
+ "cfg_aliases",
+ "pin-project-lite",
+ "quinn-proto",
+ "quinn-udp",
+ "rustc-hash",
+ "rustls",
+ "socket2",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-proto"
+version = "0.11.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
+dependencies = [
+ "aws-lc-rs",
+ "bytes",
+ "getrandom 0.3.4",
+ "lru-slab",
+ "rand 0.9.4",
+ "ring",
+ "rustc-hash",
+ "rustls",
+ "rustls-pki-types",
+ "slab",
+ "thiserror 2.0.18",
+ "tinyvec",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-udp"
+version = "0.5.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
+dependencies = [
+ "cfg_aliases",
+ "libc",
+ "once_cell",
+ "socket2",
+ "tracing",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
+[[package]]
+name = "r-efi"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
+
+[[package]]
+name = "rand"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
+dependencies = [
+ "libc",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
+dependencies = [
+ "chacha20",
+ "getrandom 0.4.2",
+ "rand_core 0.10.1",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom 0.2.17",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom 0.3.4",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69"
+
+[[package]]
+name = "rayon"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
+dependencies = [
+ "bitflags 2.11.1",
+]
+
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-lite"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973"
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+
+[[package]]
+name = "reqwest"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "219c5811de6525e5416c7d5d53bb656d3afdbc6c5af816e0802bcfa42dbdc1c3"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-rustls",
+ "hyper-util",
+ "js-sys",
+ "log",
+ "percent-encoding",
+ "pin-project-lite",
+ "quinn",
+ "rustls",
+ "rustls-pki-types",
+ "rustls-platform-verifier",
+ "serde",
+ "serde_json",
+ "sync_wrapper",
+ "tokio",
+ "tokio-rustls",
+ "tower",
+ "tower-http",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+
+[[package]]
+name = "ring"
+version = "0.17.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom 0.2.17",
+ "libc",
+ "untrusted",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rmp"
+version = "0.8.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ba8be72d372b2c9b35542551678538b562e7cf86c3315773cae48dfbfe7790c"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "rmp-serde"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72f81bee8c8ef9b577d1681a70ebbc962c232461e397b22c208c43c04b67a155"
+dependencies = [
+ "rmp",
+ "serde",
+]
+
+[[package]]
+name = "rust_decimal"
+version = "1.42.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c5108e3d4d903e21aac27f12ba5377b6b34f9f44b325e4894c7924169d06995"
+dependencies = [
+ "arrayvec",
+ "num-traits",
+]
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d"
+
+[[package]]
+name = "rustc-hash"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
+
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
+[[package]]
+name = "rusticata-macros"
+version = "4.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "rustix"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
+dependencies = [
+ "bitflags 2.11.1",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustls"
+version = "0.23.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b"
+dependencies = [
+ "aws-lc-rs",
+ "log",
+ "once_cell",
+ "ring",
+ "rustls-pki-types",
+ "rustls-webpki 0.103.13",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-native-certs"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5"
+dependencies = [
+ "openssl-probe 0.1.6",
+ "rustls-pemfile",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework 2.11.1",
+]
+
+[[package]]
+name = "rustls-native-certs"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
+dependencies = [
+ "openssl-probe 0.2.1",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework 3.7.0",
+]
+
+[[package]]
+name = "rustls-pemfile"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "rustls-pki-types"
+version = "1.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9"
+dependencies = [
+ "web-time",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-platform-verifier"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26d1e2536ce4f35f4846aa13bff16bd0ff40157cdb14cc056c7b14ba41233ba0"
+dependencies = [
+ "core-foundation 0.10.1",
+ "core-foundation-sys",
+ "jni",
+ "log",
+ "once_cell",
+ "rustls",
+ "rustls-native-certs 0.8.3",
+ "rustls-platform-verifier-android",
+ "rustls-webpki 0.103.13",
+ "security-framework 3.7.0",
+ "security-framework-sys",
+ "webpki-root-certs",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustls-platform-verifier-android"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
+
+[[package]]
+name = "rustls-webpki"
+version = "0.102.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
+dependencies = [
+ "rustls-pki-types",
+ "untrusted",
+]
+
+[[package]]
+name = "rustls-webpki"
+version = "0.103.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
+dependencies = [
+ "aws-lc-rs",
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "ryu"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "schannel"
+version = "0.1.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
+[[package]]
+name = "security-framework"
+version = "2.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
+dependencies = [
+ "bitflags 2.11.1",
+ "core-foundation 0.9.4",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework"
+version = "3.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
+dependencies = [
+ "bitflags 2.11.1",
+ "core-foundation 0.10.1",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework-sys"
+version = "2.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.150"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "serde_nanos"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "serde_repr"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "serde_spanned"
+version = "0.6.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "serde_yaml"
+version = "0.9.34+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
+dependencies = [
+ "indexmap 2.14.0",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+
+[[package]]
+name = "sfv"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fa1f336066b758b7c9df34ed049c0e693a426afe2b27ff7d5b14f410ab1a132"
+dependencies = [
+ "base64",
+ "indexmap 2.14.0",
+ "rust_decimal",
+]
+
+[[package]]
+name = "sha2"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.2.17",
+ "digest",
+]
+
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "shlex"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba"
+
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b"
+dependencies = [
+ "errno",
+ "libc",
+]
+
+[[package]]
+name = "signatory"
+version = "0.27.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
+dependencies = [
+ "pkcs8",
+ "rand_core 0.6.4",
+ "signature",
+ "zeroize",
+]
+
+[[package]]
+name = "signature"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
+dependencies = [
+ "digest",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "simd-adler32"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
+
+[[package]]
+name = "simd_cesu8"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94f90157bb87cddf702797c5dadfa0be7d266cdf49e22da2fcaa32eff75b2c33"
+dependencies = [
+ "rustc_version",
+ "simdutf8",
+]
+
+[[package]]
+name = "simdutf8"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
+
+[[package]]
+name = "slab"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
+
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+
+[[package]]
+name = "socket2"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "spki"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d"
+dependencies = [
+ "base64ct",
+ "der",
+]
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "static_assertions"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "strum"
+version = "0.26.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
+dependencies = [
+ "strum_macros",
+]
+
+[[package]]
+name = "strum_macros"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
+dependencies = [
+ "heck 0.5.0",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "sync_wrapper"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+dependencies = [
+ "futures-core",
+]
+
+[[package]]
+name = "synstructure"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tempfile"
+version = "3.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
+dependencies = [
+ "fastrand",
+ "getrandom 0.4.2",
+ "once_cell",
+ "rustix",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "terminal_size"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
+dependencies = [
+ "rustix",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
+dependencies = [
+ "thiserror-impl 2.0.18",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "thread_local"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "thrift_codec"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83d957f535b242b91aa9f47bde08080f9a6fef276477e55b0079979d002759d5"
+dependencies = [
+ "byteorder",
+ "trackable",
+]
+
+[[package]]
+name = "time"
+version = "0.3.47"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
+dependencies = [
+ "deranged",
+ "itoa",
+ "num-conv",
+ "powerfmt",
+ "serde_core",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
+
+[[package]]
+name = "time-macros"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
+dependencies = [
+ "num-conv",
+ "time-core",
+]
+
+[[package]]
+name = "tinystr"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "tinyvec"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
+name = "tokio"
+version = "1.52.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe"
+dependencies = [
+ "bytes",
+ "libc",
+ "mio",
+ "parking_lot",
+ "pin-project-lite",
+ "signal-hook-registry",
+ "socket2",
+ "tokio-macros",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "tokio-macros"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tokio-rustls"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
+dependencies = [
+ "rustls",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-test"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f6d24790a10a7af737693a3e8f1d03faef7e6ca0cc99aae5066f533766de545"
+dependencies = [
+ "futures-core",
+ "tokio",
+ "tokio-stream",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-websockets"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "httparse",
+ "rand 0.8.6",
+ "ring",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tokio-util",
+ "webpki-roots 0.26.11",
+]
+
+[[package]]
+name = "toml"
+version = "0.8.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit",
+]
+
+[[package]]
+name = "toml_datetime"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "toml_edit"
+version = "0.22.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
+dependencies = [
+ "indexmap 2.14.0",
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_write",
+ "winnow",
+]
+
+[[package]]
+name = "toml_write"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
+
+[[package]]
+name = "tower"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-http"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840"
+dependencies = [
+ "bitflags 2.11.1",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "pin-project-lite",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "url",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
+dependencies = [
+ "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-serde"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1"
+dependencies = [
+ "serde",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319"
+dependencies = [
+ "matchers",
+ "nu-ansi-term",
+ "once_cell",
+ "regex-automata",
+ "serde",
+ "serde_json",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
+ "tracing-serde",
+]
+
+[[package]]
+name = "trackable"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15bd114abb99ef8cee977e517c8f37aee63f184f2d08e3e6ceca092373369ae"
+dependencies = [
+ "trackable_derive",
+]
+
+[[package]]
+name = "trackable_derive"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebeb235c5847e2f82cfe0f07eb971d1e5f6804b18dac2ae16349cc604380f82f"
+dependencies = [
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "try-lock"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+
+[[package]]
+name = "tryhard"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
+dependencies = [
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "typenum"
+version = "1.20.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20"
+
+[[package]]
+name = "uncased"
+version = "0.9.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697"
+dependencies = [
+ "version_check",
+]
+
+[[package]]
+name = "unicase"
+version = "2.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
+[[package]]
+name = "unsafe-libyaml"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
+
+[[package]]
+name = "untrusted"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+
+[[package]]
+name = "url"
+version = "2.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+]
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "valuable"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
+
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "want"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
+dependencies = [
+ "try-lock",
+]
+
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+
+[[package]]
+name = "wasip2"
+version = "1.0.3+wasi-0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
+dependencies = [
+ "wit-bindgen 0.57.1",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+dependencies = [
+ "wit-bindgen 0.51.0",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.122"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.72"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9473dbd2991ae90b6291c3c32c30c6187ac49aa32f9905d1cce280ec1e110b0f"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.122"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.122"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.122"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap 2.14.0",
+ "wasm-encoder",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags 2.11.1",
+ "hashbrown 0.15.5",
+ "indexmap 2.14.0",
+ "semver",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d621441cfc37b84979402712047321980c178f299193a3589d05b99e8763436"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "webpki-root-certs"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "0.26.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
+dependencies = [
+ "webpki-roots 1.0.7",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "winapi-util"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "winnow"
+version = "0.7.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "wit-bindgen"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+dependencies = [
+ "wit-bindgen-rust-macro",
+]
+
+[[package]]
+name = "wit-bindgen"
+version = "0.57.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
+
+[[package]]
+name = "wit-bindgen-core"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck 0.5.0",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck 0.5.0",
+ "indexmap 2.14.0",
+ "prettyplease",
+ "syn 2.0.117",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags 2.11.1",
+ "indexmap 2.14.0",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
+dependencies = [
+ "anyhow",
+ "id-arena",
+ "indexmap 2.14.0",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
+]
+
+[[package]]
+name = "writeable"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
+
+[[package]]
+name = "x509-parser"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69"
+dependencies = [
+ "asn1-rs",
+ "data-encoding",
+ "der-parser",
+ "lazy_static",
+ "nom",
+ "oid-registry",
+ "rusticata-macros",
+ "thiserror 1.0.69",
+ "time",
+]
+
+[[package]]
+name = "yansi"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
+
+[[package]]
+name = "yoke"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca"
+dependencies = [
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "synstructure",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "zerofrom"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "synstructure",
+]
+
+[[package]]
+name = "zeroize"
+version = "1.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+
+[[package]]
+name = "zerotrie"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+]
+
+[[package]]
+name = "zerovec"
+version = "0.11.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239"
+dependencies = [
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+
+[[package]]
+name = "zerovec-derive"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
+
+[[package]]
+name = "zstd"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "7.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
+dependencies = [
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.16+zstd.1.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
+dependencies = [
+ "cc",
+ "pkg-config",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..5b5467c
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,37 @@
+[workspace]
+resolver = "2"
+members = ["crates/gateway"]
+
+[workspace.package]
+edition = "2024"
+license = "MIT"
+rust-version = "1.85"
+
+[workspace.dependencies]
+# slipstream is published — consume it from crates.io, aliased to `store` so the code's
+# `use store::...` is unchanged. No path deps into the `beyond` repo: this crate builds standalone.
+store = { package = "beyond-slipstream", version = "0.1.0" }
+zeroize = "1"
+
+pingora = { version = "0.8", default-features = false, features = ["rustls"] }
+pingora-core = "0.8"
+pingora-limits = "0.8"
+pingora-proxy = "0.8"
+
+arc-swap = "1"
+async-nats = "0.46"
+async-trait = "0.1"
+base64 = "0.22"
+bytes = "1"
+clap = { version = "4", features = ["derive", "env"] }
+ed25519-dalek = "2.2"
+figment = { version = "0.10", features = ["toml", "env"] }
+memchr = "2"
+prometheus = "0.13"
+rustls = { version = "0.23", default-features = false, features = ["ring"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+thiserror = "2"
+tokio = { version = "1", features = ["full"] }
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7c9309f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,78 @@
+# beyond/ai
+
+Route LLM traffic through one internal proxy. Apps use their stock OpenAI or Anthropic SDK unchanged — the gateway authenticates, swaps in the real provider key, and meters every token.
+
+## Quick Start
+
+```sh
+cp config.example.toml config.toml
+# Set at minimum: signing_keys and one pool key
+AI_POOL_KEY_OPENAI=sk-... cargo run --release
+```
+
+Point any OpenAI-wire SDK at `http://ai.internal` with a virtual key:
+
+```python
+from openai import OpenAI
+client = OpenAI(base_url="http://ai.internal/v1", api_key="bai_v1.1.<payload>.<sig>")
+```
+
+Or pass your own provider key directly (BYO — forwarded unchanged, no swap):
+
+```python
+client = OpenAI(base_url="http://ai.internal/v1", api_key="sk-your-openai-key")
+```
+
+## What It Does
+
+- **Managed keys** (`bai_v1…`) — Ed25519-verified, stateless. Swaps to the pool key. Attributes usage to tenant + VPC. Deny-set checked (spend/fraud).
+- **BYO keys** — any other token passes through to the provider untouched. No attribution, no deny-set, no metering.
+- **10 providers, zero config** — openai, anthropic, openrouter, fireworks, groq, deepseek, together, cerebras, mistral, xai. Add more in `config.toml` under `[provider_authorities]`.
+- **Never buffers** — request and response stream through; a SIMD scanner extracts `model` in O(1) memory, and a 64 KB tail buffer taps usage without holding the whole body.
+- **Token facts, not pricing** — emits `ai.usage` events to slipstream. A downstream consumer prices.
+- **Rate guardrail** — a per-key request ceiling (`rate_limit_rps`) that acts as a circuit breaker against runaway keys. Spend control belongs to the deny-set, not to this limit.
+- **Fail-open NATS** — auth works without NATS. During a NATS outage the deny-set goes stale: keys that were allowed stay allowed until the watch reconnects.
+
+## Providers
+
+Select a non-default provider with `x-beyond-provider: <name>`:
+
+```python
+client = OpenAI(
+    base_url="http://ai.internal/v1",
+    api_key="bai_v1...",
+    default_headers={"x-beyond-provider": "groq"},
+)
+```
+
+## Config
+
+All config keys are overridable by `AI_`-prefixed env vars (`AI_NATS_URL`, `AI_POOL_KEY_OPENAI`, …). See `config.example.toml` for the full reference.
+
+Required to serve managed traffic:
+
+| Key                  | Source        | Purpose                                                 |
+| -------------------- | ------------- | ------------------------------------------------------- |
+| `signing_keys`       | `config.toml` | Ed25519 public keys by `kid` — verifies `bai_v1` tokens |
+| `AI_POOL_KEY_<NAME>` | env (SSM)     | Provider key swapped in for managed requests            |
+
+Optional:
+
+| Key                      | Default   | Purpose                                                                  |
+| ------------------------ | --------- | ------------------------------------------------------------------------ |
+| `snapshot_path`          | unset     | On-disk deny-set snapshot — set on durable nodes, leave unset on Fargate |
+| `rate_limit_rps`         | `100`     | Per-key request ceiling; `0` disables                                    |
+| `[provider_authorities]` | built-ins | Override or add upstream hosts                                           |
+
+## Running Tests
+
+```sh
+mise run test:unit:rs        # pure-logic unit tests (no network)
+mise run test:integration:rs # gateway + mock upstream + NATS
+mise run test:smoke          # live providers — needs API keys in env, bills real (tiny) requests
+mise run bench               # unit micro-benchmarks + end-to-end throughput
+```
+
+## Architecture
+
+[ARCHITECTURE.md](ARCHITECTURE.md) — request flow, module map, key invariants.
diff --git a/config.example.toml b/config.example.toml
new file mode 100644
index 0000000..6fd258c
--- /dev/null
+++ b/config.example.toml
@@ -0,0 +1,55 @@
+# Beyond AI gateway — example config. Every key is overridable by an `AI_`-prefixed env var
+# (e.g. `AI_NATS_URL`, `AI_POOL_KEY_OPENAI`, `AI_READ_TIMEOUT_SECS`). Values below are defaults.
+
+listen = "0.0.0.0:8080" # client (app) traffic; internal-only, fronted as ai.internal
+metrics_listen = "0.0.0.0:9090" # Prometheus /metrics
+
+# NATS / slipstream — carries ONLY the deny-set (`blackhole.*`). Auth + keys come from this file,
+# so the gateway authenticates + serves managed traffic even if NATS is down.
+nats_url = "nats://localhost:4222"
+# nats_creds = "<base64 .creds>"        # ECS via SOPS
+# nats_creds_file = "/path/to/nats.creds"
+config_bucket = "ai-gateway"
+
+# Optional on-disk deny-set snapshot (slipstream append-log + resume cursor). Set this ONLY on
+# durable storage (edge/tunnel nodes): a restart then seeds the deny-set from disk and resumes the
+# NATS watch from the saved revision — enforcing immediately, even before NATS reconnects, and
+# skipping the boot scan. Leave unset on ephemeral hosts (e.g. Fargate); the gateway seeds from a
+# NATS scan each boot. It's a pure cache: deleting the file just forces a rescan.
+# snapshot_path = "/var/lib/beyond-ai/denyset.snap"
+
+# Upstream timeouts. read/idle are generous: LLM streams can run for minutes.
+connect_timeout_secs = 10
+read_timeout_secs = 600
+write_timeout_secs = 60
+idle_timeout_secs = 90
+
+# upstream_tls = true   # set false only for a plaintext mock (tests)
+
+# Per-key request-rate ceiling (requests/sec) — a blast-radius circuit breaker, not a spend control
+# (the deny-set owns spend). Caps how fast one tenant (managed) or BYO caller can drive the gateway.
+# That bounds two things: a leaked/runaway key during the deny-set's reaction lag, and a flood of
+# failing requests that never bills. Generous by default so legitimate traffic never trips it; set
+# 0 to disable. Tune from the `ai_rejections_total{reason="rate_limit"}` metric.
+rate_limit_rps = 100
+
+# Optional per-provider upstream authority (host:port), BY PROVIDER NAME. For a known provider this
+# overrides its built-in default; for an unknown name it ADDS a new OpenAI-wire provider, reachable
+# via `x-beyond-provider: <name>`. Known providers (zero-config defaults): openai, anthropic,
+# openrouter, fireworks, groq, deepseek, together, cerebras, mistral, xai.
+# [provider_authorities]
+# openai = "api.openai.com:443"
+# my-self-hosted = "llm.internal:8443"
+
+# Managed Beyond pool keys, BY PROVIDER NAME. Inject via SSM-backed env in production
+# (AI_POOL_KEY_OPENAI, AI_POOL_KEY_GROQ, …) rather than this file; env wins over any value here.
+# A provider with no pool key can't serve managed traffic (→ 503); BYO is unaffected.
+# [pool_keys]
+# openai = "sk-..."
+# anthropic = "sk-ant-..."
+# fireworks = "fw-..."
+
+# Trusted Ed25519 signing PUBLIC keys (kid -> base64). Multiple for zero-downtime rotation.
+# Managed virtual keys (bai_…) are verified against these; BYO raw tokens skip verification.
+[signing_keys]
+# 1 = "<base64-ed25519-public-key>"
diff --git a/crates/gateway/Cargo.toml b/crates/gateway/Cargo.toml
new file mode 100644
index 0000000..75a0b52
--- /dev/null
+++ b/crates/gateway/Cargo.toml
@@ -0,0 +1,62 @@
+[package]
+name = "beyond-ai"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+rust-version.workspace = true
+description = "Beyond AI gateway — egress L7 proxy to LLM providers"
+
+[lib]
+name = "beyond_ai"
+path = "src/lib.rs"
+
+[[bin]]
+name = "beyond-ai"
+path = "src/main.rs"
+
+[dependencies]
+store = { workspace = true }
+
+pingora = { workspace = true }
+pingora-core = { workspace = true }
+pingora-limits = { workspace = true }
+pingora-proxy = { workspace = true }
+
+arc-swap = { workspace = true }
+async-trait = { workspace = true }
+base64 = { workspace = true }
+bytes = { workspace = true }
+clap = { workspace = true }
+ed25519-dalek = { workspace = true }
+figment = { workspace = true }
+memchr = { workspace = true }
+prometheus = { workspace = true }
+rustls = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+thiserror = { workspace = true }
+tokio = { workspace = true }
+tracing = { workspace = true }
+tracing-subscriber = { workspace = true }
+zeroize = { workspace = true }
+
+[dev-dependencies]
+# e2e harness: real gateway subprocess + real nats-server + a mock HTTP upstream.
+base64 = { workspace = true }
+# Bench harnesses. Best tool per job: `divan` for the unit micro-bench (it measures allocations
+# natively via AllocProfiler, alongside timing); `criterion` for the e2e macro-bench (`async_tokio`
+# drives the round-trips, and its saved-baseline comparison tracks latency/RPS over time).
+criterion = { version = "0.5", features = ["async_tokio"] }
+divan = "0.1"
+http-body-util = "0.1"
+hyper = { version = "1", features = ["server", "http1"] }
+hyper-util = { version = "0.1", features = ["tokio"] }
+reqwest = { version = "0.13", default-features = false, features = ["json", "rustls"] }
+
+[[bench]]
+name = "unit"
+harness = false
+
+[[bench]]
+name = "e2e"
+harness = false
diff --git a/crates/gateway/benches/e2e.rs b/crates/gateway/benches/e2e.rs
new file mode 100644
index 0000000..6f23561
--- /dev/null
+++ b/crates/gateway/benches/e2e.rs
@@ -0,0 +1,162 @@
+//! A-1 end-to-end bench: the real `beyond-ai` binary + real `nats-server` + a mock upstream,
+//! driven over real HTTP. Run with `mise run bench:e2e` (needs `nats-server` on PATH — mise
+//! provides it). This is the macro counterpart to `unit.rs`: it measures the *whole* request path
+//! (TCP accept → Pingora filters → key verify → key swap → body stream → upstream → usage tap),
+//! not a single function.
+//!
+//! Reuses the e2e test harness (`tests/common`) verbatim so the bench and the integration tests
+//! exercise the same stack. Allocations are deliberately *not* measured here — the gateway is a
+//! separate process, so its heap is invisible to this binary; allocation regressions belong to the
+//! in-process `unit` bench.
+//!
+//! The stack starts **once** and stays warm for the whole run; each iteration is one (or, for the
+//! throughput group, N concurrent) HTTP round-trip(s) against that live gateway.
+
+#[path = "../tests/common/mod.rs"]
+mod common;
+
+use std::time::Duration;
+
+use criterion::{Criterion, Throughput, criterion_group, criterion_main};
+use tokio::runtime::Runtime;
+use tokio::task::JoinSet;
+
+use beyond_ai::key::{VirtualKey, mint};
+use common::*;
+
+const MANAGED_BODY: &str = r#"{"model":"gpt-4o","messages":[{"role":"user","content":"hi"}]}"#;
+
+/// Concurrency level for the throughput group — enough in-flight requests to expose per-request
+/// overhead and connection-pool behavior without saturating a laptop.
+const CONCURRENCY: u64 = 32;
+
+/// A live, warmed-up stack, held whole so nothing is torn down mid-bench. Rust drops fields in
+/// declaration order, so teardown runs gateway → mock upstream → NATS (children die on drop).
+struct Stack {
+    // RAII guards: held only so their `Drop` (kill subprocess / abort task / clean tempdir) fires
+    // when the bench ends. Never read directly — the requests go through `url`/`client`.
+    #[allow(dead_code)]
+    gw: Gateway,
+    #[allow(dead_code)]
+    mock: MockUpstream,
+    #[allow(dead_code)]
+    nats: Nats,
+    client: reqwest::Client,
+    vkey: String,
+    url: String,
+}
+
+/// Boots NATS, the mock upstream and the gateway (in that order), mints one managed key, and
+/// waits until a managed request comes back 200 so the timed loops only ever see a warm stack.
+async fn start_stack() -> Stack {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(1);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let identity = VirtualKey {
+        tenant_id: 42,
+        vpc_id: 7,
+    };
+    let vkey = mint(&identity, 1, &sk);
+    let client = reqwest::Client::new();
+    let url = gw.url();
+
+    // Warm-up: the watcher connects to NATS and the DNS cache fills on the first call; poll until
+    // the gateway answers 200 so neither cost lands inside the timed loop.
+    let (probe, base, token) = (client.clone(), url.clone(), vkey.clone());
+    wait_for_status(200, move || {
+        let (probe, base, token) = (probe.clone(), base.clone(), token.clone());
+        async move {
+            let sent = probe
+                .post(format!("{base}/v1/chat/completions"))
+                .header("authorization", format!("Bearer {token}"))
+                .header("content-type", "application/json")
+                .body(MANAGED_BODY)
+                .send()
+                .await;
+            match sent {
+                Ok(resp) => resp.status().as_u16(),
+                Err(_) => 0,
+            }
+        }
+    })
+    .await;
+
+    Stack {
+        gw,
+        mock,
+        nats,
+        client,
+        vkey,
+        url,
+    }
+}
+
+/// One full managed round-trip: key swap + body relay + non-streaming usage tap. Panics unless
+/// the gateway answers 200 (`assert_eq!`: `debug_assert!` is compiled out of bench builds), then
+/// drains the body so the connection is pooled — otherwise we'd bench `connect`, not the gateway.
+async fn managed_roundtrip(s: &Stack) {
+    let resp = s
+        .client
+        .post(format!("{}/v1/chat/completions", s.url))
+        .header("authorization", format!("Bearer {}", s.vkey))
+        .header("content-type", "application/json")
+        .body(MANAGED_BODY)
+        .send()
+        .await
+        .expect("request");
+    assert_eq!(resp.status().as_u16(), 200);
+    let _ = resp.bytes().await.expect("body");
+}
+
+/// Boots the stack once, then benches single-request latency and CONCURRENCY-way throughput.
+fn bench_e2e(c: &mut Criterion) {
+    let rt = Runtime::new().expect("tokio runtime");
+    let stack = rt.block_on(start_stack());
+
+    let mut group = c.benchmark_group("e2e");
+    // Real round-trips are sub-millisecond on loopback but still ~100× a micro-bench; trim the
+    // sample count so the suite stays in the seconds, not minutes.
+    group.sample_size(50);
+    group.measurement_time(Duration::from_secs(10));
+
+    // Single-request latency through the full proxy.
+    group.bench_function("managed_json_latency", |b| {
+        b.to_async(&rt).iter(|| managed_roundtrip(&stack));
+    });
+
+    // Throughput: CONCURRENCY requests in flight per iteration (`Throughput::Elements` → req/s).
+    // Each must return 200, so an error path (e.g. a 429) is never counted as throughput.
+    group.throughput(Throughput::Elements(CONCURRENCY));
+    group.bench_function("managed_json_throughput", |b| {
+        b.to_async(&rt).iter(|| async {
+            let mut set = JoinSet::new();
+            for _ in 0..CONCURRENCY {
+                let (client, url) = (stack.client.clone(), stack.url.clone());
+                let vkey = stack.vkey.clone();
+                set.spawn(async move {
+                    let resp = client
+                        .post(format!("{url}/v1/chat/completions"))
+                        .header("authorization", format!("Bearer {vkey}"))
+                        .header("content-type", "application/json")
+                        .body(MANAGED_BODY)
+                        .send()
+                        .await
+                        .expect("request");
+                    assert_eq!(resp.status().as_u16(), 200);
+                    let _ = resp.bytes().await.expect("body");
+                });
+            }
+            while let Some(r) = set.join_next().await {
+                r.expect("task");
+            }
+        });
+    });
+
+    group.finish();
+    // Keep the stack alive until every bench has run, then tear it down explicitly.
+    drop(stack);
+}
+
+criterion_group!(benches, bench_e2e);
+criterion_main!(benches);
diff --git a/crates/gateway/benches/unit.rs b/crates/gateway/benches/unit.rs
new file mode 100644
index 0000000..75010dc
--- /dev/null
+++ b/crates/gateway/benches/unit.rs
@@ -0,0 +1,140 @@
+//! Unit bench: the pure, IO-free hot paths. Timing **and** allocations come from `divan` — its
+//! `AllocProfiler` (installed as the global allocator below) reports alloc count + bytes per
+//! sample right beside ns/iter, so the design's allocation claims are visible in one table.
+//! Run with `mise run bench:unit` (or `cargo bench --bench unit`).
+//!
+//! The headline invariant to watch: managed-key **verify** is 0 allocs — it decodes onto the
+//! stack (see `key.rs`). `peek` should hold a flat, tiny alloc count independent of body size
+//! (the O(1)-memory claim). A regression shows up as a non-zero / grown number in the alloc
+//! columns the moment this runs.
+//!
+//! Fixtures are built *outside* the closure handed to `Bencher::bench` (or in `args`), so only the
+//! measured call is timed and counted — setup allocations don't pollute the numbers.
+
+use std::hint::black_box;
+
+use divan::Bencher;
+use divan::counter::BytesCount;
+
+#[global_allocator]
+static ALLOC: divan::AllocProfiler = divan::AllocProfiler::system();
+// Entry point: hands control to divan, which runs the `#[divan::bench]` functions below.
+fn main() {
+    divan::main();
+}
+
+mod key {
+    use super::*;
+    use beyond_ai::key::{Keyring, VirtualKey, mint};
+    use ed25519_dalek::SigningKey;
+
+    const IDENTITY: VirtualKey = VirtualKey {
+        tenant_id: 42,
+        vpc_id: 7,
+    };
+
+    /// Stateless verify of a pre-minted token — expected to stay off the heap (see `key.rs`).
+    #[divan::bench]
+    fn verify(bencher: Bencher) {
+        let signer = SigningKey::from_bytes(&[1u8; 32]);
+        let token = mint(&IDENTITY, 1, &signer);
+        let mut keyring = Keyring::new();
+        keyring.insert(1, signer.verifying_key());
+        bencher.bench(|| keyring.verify(black_box(&token)));
+    }
+
+    /// Reference mint path (it allocates: output string + base64 segments). Tracked so the Go
+    /// control-plane parity implementation has a baseline to compare against.
+    #[divan::bench]
+    fn mint_key(bencher: Bencher) {
+        let signer = SigningKey::from_bytes(&[1u8; 32]);
+        bencher.bench(|| mint(black_box(&IDENTITY), 1, &signer));
+    }
+}
+
+mod route {
+    use super::*;
+    use beyond_ai::route::{Dialect, dialect_default};
+
+    /// Per-request routing decision (sans override): dialect → default provider name. 0-alloc.
+    #[divan::bench(args = [Dialect::OpenAI, Dialect::Anthropic])]
+    fn dialect_default_name(bencher: Bencher, dialect: Dialect) {
+        bencher.bench(|| dialect_default(black_box(dialect)));
+    }
+}
+
+mod deny {
+    use super::*;
+    use beyond_ai::deny::{self, DenyReason};
+
+    #[divan::bench]
+    fn parse_key() -> Option<u64> {
+        deny::parse_key(black_box("blackhole.123456789"))
+    }
+
+    #[divan::bench]
+    fn parse_reason_bare() -> DenyReason {
+        deny::parse_reason(black_box(b"spend"))
+    }
+
+    #[divan::bench]
+    fn parse_reason_json() -> DenyReason {
+        deny::parse_reason(black_box(br#"{"reason":"fraud","exp":123}"#))
+    }
+}
+
+mod usage {
+    use super::*;
+    use beyond_ai::usage::{self, Usage};
+
+    const OPENAI_JSON: &[u8] = br#"{"usage":{"prompt_tokens":12,"completion_tokens":34,"prompt_tokens_details":{"cached_tokens":4}}}"#;
+    const ANTHROPIC_JSON: &[u8] = br#"{"usage":{"input_tokens":100,"output_tokens":50,"cache_read_input_tokens":10,"cache_creation_input_tokens":7}}"#;
+    const OPENAI_SSE: &[u8] = b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\ndata: {\"choices\":[],\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":9}}\n\ndata: [DONE]\n\n";
+    const ANTHROPIC_SSE: &[u8] = b"event: message_start\ndata: {\"type\":\"message_start\",\"message\":{\"usage\":{\"input_tokens\":20,\"output_tokens\":0}}}\n\nevent: message_delta\ndata: {\"type\":\"message_delta\",\"usage\":{\"output_tokens\":15}}\n\n";
+
+    #[divan::bench]
+    fn openai_body() -> Option<Usage> {
+        usage::openai_body(black_box(OPENAI_JSON))
+    }
+
+    #[divan::bench]
+    fn anthropic_body() -> Option<Usage> {
+        usage::anthropic_body(black_box(ANTHROPIC_JSON))
+    }
+
+    #[divan::bench]
+    fn openai_stream() -> Option<Usage> {
+        usage::openai_stream(black_box(OPENAI_SSE))
+    }
+
+    #[divan::bench]
+    fn anthropic_stream() -> Option<Usage> {
+        usage::anthropic_stream(black_box(ANTHROPIC_SSE))
+    }
+}
+
+mod peek {
+    use super::*;
+    use beyond_ai::peek::ModelScanner;
+
+    /// Builds a chat body carrying `padding` bytes of message content, with the root `model` key
+    /// placed **last** so the scanner has to walk the entire body (worst case for the scan).
+    fn body_with_model_last(padding: usize) -> Vec<u8> {
+        let filler = "x".repeat(padding);
+        let json = format!(r#"{{"messages":[{{"role":"user","content":"{filler}"}}],"stream":true,"model":"claude-opus-4-8"}}"#);
+        json.into_bytes()
+    }
+
+    /// Three sizes: a tiny request, a typical prompt, and a large one (e.g. a pasted document /
+    /// base64 image) that exercises the SIMD fast-skip over uninteresting string content. With
+    /// `BytesCount` divan reports bytes/sec; the alloc columns should stay flat across sizes.
+    #[divan::bench(args = [0, 4 * 1024, 256 * 1024])]
+    fn scan_model_last(bencher: Bencher, padding: usize) {
+        let payload = body_with_model_last(padding);
+        bencher.counter(BytesCount::of_slice(&payload)).bench(|| {
+            let mut scan = ModelScanner::new();
+            scan.feed(black_box(&payload));
+            scan.take_model()
+        });
+    }
+}
diff --git a/crates/gateway/src/config.rs b/crates/gateway/src/config.rs
new file mode 100644
index 0000000..227a024
--- /dev/null
+++ b/crates/gateway/src/config.rs
@@ -0,0 +1,211 @@
+//! Layered configuration (PATTERNS.md: Figment defaults → TOML → `AI_`-prefixed env).
+//!
+//! Auth + key material come from config (signing public keys, managed pool keys), so the gateway
+//! is fully functional from boot config alone — NATS is only needed for the deny-set.
+
+use crate::error::{GatewayError, Result};
+use crate::key::{Keyring, Kid};
+use crate::secret::Secret;
+use figment::Figment;
+use figment::providers::{Env, Format, Toml};
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::path::Path;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+// `default` makes every field optional. `deny_unknown_fields` is deliberately NOT set: config is
+// merged from `Env::prefixed("AI_")`, a namespace shared with foreign variables the platform
+// injects (e.g. `AI_AGENT`, `AI_LOG`), so rejecting unknown keys would fail a valid environment.
+#[serde(default)]
+pub struct AiConfig {
+    /// Downstream listener for client (app) traffic. Internal-only in production (Service Connect
+    /// fronts it as `ai.internal`); no public ingress, so plain HTTP here is fine.
+    pub listen: String,
+    /// Prometheus metrics listener.
+    pub metrics_listen: String,
+
+    /// NATS / slipstream connection (cf. `_envcommon/ecs-service.hcl`: `tls://connect.ngs.global`).
+    /// Used only for the watched deny-set (`blackhole.*`).
+    pub nats_url: String,
+    /// Base64 `.creds` (ECS via SOPS) — takes priority over `nats_creds_file`. Held in `Secret` so
+    /// it can't leak through the `Debug`/`Serialize` this struct derives (a stray `?config` log).
+    pub nats_creds: Option<Secret>,
+    /// `.creds` source used when `nats_creds` is unset (presumably a file path — confirm).
+    pub nats_creds_file: Option<String>,
+    /// slipstream bucket holding `blackhole.*` (the deny-set — the only thing in NATS).
+    pub config_bucket: String,
+
+    /// Optional path to an on-disk deny-set snapshot (slipstream's append-log + resume cursor). When
+    /// set **and on durable storage** (the edge/tunnel deployment model), a restart seeds the
+    /// deny-set from this file and *resumes the NATS watch from the saved revision* — skipping the
+    /// boot scan and surviving a restart with enforcement intact even before NATS reconnects. Unset
+    /// (the default, e.g. ephemeral/Fargate) ⇒ seed from a NATS scan each boot, unchanged. The file
+    /// is a pure cache: delete it (or point at scratch) and the gateway falls back to scanning.
+    pub snapshot_path: Option<String>,
+
+    /// Trusted Ed25519 signing **public** keys: `kid` (as string — TOML/JSON map keys are strings)
+    /// → base64 public key. Multiple allowed for zero-downtime rotation. Config, not NATS.
+    pub signing_keys: HashMap<String, String>,
+
+    /// Managed Beyond pool keys, **by provider name** (`openai`, `anthropic`, `fireworks`, …).
+    /// From the `[pool_keys]` TOML table or SSM-injected `AI_POOL_KEY_<NAME>` env (the env form is
+    /// the production path — see `load_with_path`). A provider with no pool key here can't serve
+    /// managed traffic (→ 503); BYO is unaffected. Values are `Secret` so a key can't leak through
+    /// the `Debug`/`Serialize` this struct derives; read the plaintext via `expose` at the use site.
+    pub pool_keys: HashMap<String, Secret>,
+
+    /// Per-provider upstream authority (`host:port`), **by provider name**. For a known provider
+    /// (see `route::KNOWN_PROVIDERS`) this *overrides* its default; for an unknown name it *adds* a
+    /// new OpenAI-wire provider reachable via `x-beyond-provider`. Empty = every known provider uses
+    /// its built-in default. (The e2e harness points providers at a mock here.)
+    pub provider_authorities: HashMap<String, String>,
+
+    /// Upstream timeouts (seconds). Streaming responses are long, so read/idle are generous.
+    pub connect_timeout_secs: u64,
+    pub read_timeout_secs: u64,
+    pub write_timeout_secs: u64,
+    pub idle_timeout_secs: u64,
+
+    /// TLS to the upstream provider. Real providers are HTTPS (true); the e2e harness sets false
+    /// to talk to a plaintext mock.
+    pub upstream_tls: bool,
+
+    /// Per-key request-rate ceiling (requests/sec). A blast-radius guardrail (see `ratelimit`), not
+    /// a spend control: it caps how fast a single tenant (managed) or BYO caller can drive the
+    /// gateway, bounding a leaked/runaway key during the deny-set's reaction lag and a failure flood
+    /// that never bills. `0` disables it. The default is generous — a circuit breaker, not a quota;
+    /// tune from `ai_rejections_total{reason="rate_limit"}`.
+    pub rate_limit_rps: u32,
+}
+
+impl Default for AiConfig {
+    fn default() -> Self {
+        Self {
+            listen: String::from("0.0.0.0:8080"),
+            metrics_listen: String::from("0.0.0.0:9090"),
+            nats_url: String::from("nats://localhost:4222"),
+            nats_creds: None,
+            nats_creds_file: None,
+            config_bucket: String::from("ai-gateway"),
+            snapshot_path: None,
+            signing_keys: HashMap::default(),
+            pool_keys: HashMap::default(),
+            provider_authorities: HashMap::default(),
+            connect_timeout_secs: 10,
+            // LLM streams can run for minutes, so the read timeout is deliberately generous.
+            read_timeout_secs: 600,
+            write_timeout_secs: 60,
+            idle_timeout_secs: 90,
+            upstream_tls: true,
+            // Per-key circuit breaker, on by default and generous: legitimate steady-state traffic
+            // stays under it; a runaway/leaked key or a retry-storm flood is capped. 0 disables it.
+            rate_limit_rps: 100,
+        }
+    }
+}
+
+impl AiConfig {
+    /// Load layered config: defaults → TOML (`path`, else `./config.toml`) → `AI_`-prefixed env.
+    pub fn load_with_path(path: Option<&Path>) -> Result<Self> {
+        // Flat mapping (`AI_READ_TIMEOUT_SECS` → `read_timeout_secs`, no `.split('_')`). Unknown
+        // `AI_*` vars are tolerated (see the note on `AiConfig`), so pool keys are merged below.
+        let fig = Figment::from(figment::providers::Serialized::defaults(AiConfig::default()))
+            .merge(Toml::file(path.unwrap_or_else(|| Path::new("config.toml"))))
+            .merge(Env::prefixed("AI_"));
+        let mut cfg: AiConfig = fig
+            .extract()
+            .map_err(|e| GatewayError::Config(e.to_string()))?;
+        // `vars_os`, not `vars`: `vars` panics on any non-Unicode entry; such entries are skipped.
+        cfg.merge_pool_key_env(std::env::vars_os().filter_map(|(k, v)| {
+            Some((k.into_string().ok()?, v.into_string().ok()?))
+        }));
+        Ok(cfg)
+    }
+
+    /// Fold `AI_POOL_KEY_<NAME>` environment variables into `pool_keys` (provider name lowercased).
+    /// This is the production secret path (SSM-injected env); a flat figment merge can't target a
+    /// map field, and env must win over any `[pool_keys]` value baked into a config file.
+    fn merge_pool_key_env(&mut self, vars: impl Iterator<Item = (String, String)>) {
+        self.pool_keys.extend(vars.filter_map(|(k, v)| {
+            let name = k.strip_prefix("AI_POOL_KEY_")?;
+            Some((name.to_ascii_lowercase(), Secret::new(v)))
+        }));
+    }
+
+    /// Build the trusted keyring from the configured signing public keys.
+    pub fn build_keyring(&self) -> Result<Keyring> {
+        let mut ring = Keyring::new();
+        for (kid_str, b64) in &self.signing_keys {
+            let kid: Kid = kid_str
+                .parse()
+                .map_err(|_| GatewayError::Config(format!("invalid signing key id {kid_str}")))?;
+            let vk = crate::key::verifying_key_from_value(b64.as_bytes()).ok_or_else(|| {
+                GatewayError::Config(format!("invalid signing public key for kid {kid}"))
+            })?;
+            ring.insert(kid, vk);
+        }
+        Ok(ring)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn defaults_are_sane() {
+        let c = AiConfig::default();
+        // Read timeout must comfortably exceed a long stream.
+        assert!(c.read_timeout_secs >= 300);
+        assert_eq!(c.config_bucket, "ai-gateway");
+    }
+
+    #[test]
+    fn loads_without_a_file() {
+        let c = AiConfig::load_with_path(None).unwrap();
+        assert_eq!(c.listen, "0.0.0.0:8080");
+    }
+
+    #[test]
+    fn build_keyring_rejects_non_numeric_kid() {
+        // `kid` is parsed as `u32`; a non-numeric map key must fail boot (loud) rather than
+        // silently drop a trusted signing key (which would 401 every token under it).
+        let c = AiConfig {
+            signing_keys: HashMap::from([("not-a-number".to_string(), "AAAA".to_string())]),
+            ..Default::default()
+        };
+        assert!(c.build_keyring().is_err());
+    }
+
+    #[test]
+    fn build_keyring_rejects_invalid_public_key() {
+        // A value that is neither raw 32 bytes nor base64 of 32 bytes must fail boot, not install a
+        // bogus key that can never verify anything.
+        let c = AiConfig {
+            signing_keys: HashMap::from([("1".to_string(), "!!! not base64 !!!".to_string())]),
+            ..Default::default()
+        };
+        assert!(c.build_keyring().is_err());
+    }
+
+    #[test]
+    fn pool_key_env_merges_and_overrides() {
+        // `AI_POOL_KEY_<NAME>` → `pool_keys[name]` (lowercased), and env wins over a config-file
+        // value (the production secret path). A non-pool `AI_*` var is ignored.
+        let mut c = AiConfig {
+            pool_keys: HashMap::from([("openai".to_string(), Secret::new("from-file"))]),
+            ..Default::default()
+        };
+        c.merge_pool_key_env(
+            [
+                ("AI_POOL_KEY_OPENAI".to_string(), "from-env".to_string()),
+                ("AI_POOL_KEY_GROQ".to_string(), "gsk-x".to_string()),
+                ("AI_LOG".to_string(), "debug".to_string()),
+            ]
+            .into_iter(),
+        );
+        assert_eq!(c.pool_keys.get("openai").unwrap().expose(), "from-env");
+        assert_eq!(c.pool_keys.get("groq").unwrap().expose(), "gsk-x");
+        assert!(!c.pool_keys.contains_key("log"));
+    }
+}
diff --git a/crates/gateway/src/deny.rs b/crates/gateway/src/deny.rs
new file mode 100644
index 0000000..503b39b
--- /dev/null
+++ b/crates/gateway/src/deny.rs
@@ -0,0 +1,152 @@
+//! Sparse per-tenant deny-set — the gateway's *entire* spend/fraud surface.
+//!
+//! Design (deliberate, see plan): the gateway only ever asks "is this tenant cut off?" and
+//! default-**allows** on a miss. We hold **only the exceptions** (the cut-off tenants), so memory
+//! is `O(denied)`, not `O(tenants)` — this scales to millions of tenants because `denied` stays a
+//! tiny slice (a few MB even at 1M entries; a tenant id is 8 bytes). The gateway never decides
+//! *why* a tenant is denied — the control plane writes/removes entries; we just enforce + log.
+//!
+//! TTL/auto-restore is handled by slipstream, not here: spend holds are written with a TTL to the
+//! next budget reset, so they expire into a `Del` event that removes them; fraud holds have no TTL
+//! (sticky). This struct only reflects current membership.
+
+use std::collections::HashMap;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DenyReason {
+    /// Over budget. Typically written with a TTL to the next reset → auto-restores.
+    Spend,
+    /// Abuse / fraud. Sticky (no TTL) until a human clears it.
+    Fraud,
+    /// Reason not recognized in the entry value — still denied (fail safe on the enforce side).
+    Unknown,
+}
+
+impl DenyReason {
+    /// Status code sent to the client for this denial: `402 Payment Required` when over budget,
+    /// `403 Forbidden` for fraud or an unrecognized reason — a usable signal that leaks no detail.
+    pub fn http_status(self) -> u16 {
+        match self {
+            Self::Fraud | Self::Unknown => 403,
+            Self::Spend => 402,
+        }
+    }
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct DenySet {
+    denied: HashMap<u64, DenyReason>,
+}
+
+impl DenySet {
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Whether `tenant_id` is currently cut off. Default-allow: a tenant absent from the set —
+    /// including one we've never heard of — is served, the safe-for-availability choice.
+    pub fn is_denied(&self, tenant_id: u64) -> bool {
+        self.reason(tenant_id).is_some()
+    }
+
+    pub fn reason(&self, tenant_id: u64) -> Option<DenyReason> {
+        self.denied.get(&tenant_id).copied()
+    }
+
+    pub fn insert(&mut self, tenant_id: u64, reason: DenyReason) {
+        let _ = self.denied.insert(tenant_id, reason);
+    }
+
+    pub fn remove(&mut self, tenant_id: u64) {
+        let _ = self.denied.remove(&tenant_id);
+    }
+
+    pub fn len(&self) -> usize {
+        self.denied.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.denied.is_empty()
+    }
+}
+
+impl FromIterator<(u64, DenyReason)> for DenySet {
+    fn from_iter<I: IntoIterator<Item = (u64, DenyReason)>>(iter: I) -> Self {
+        let mut set = Self::new();
+        set.denied.extend(iter);
+        set
+    }
+}
+
+/// Extract the tenant id from a slipstream deny key of the form `blackhole.{tenant_id}`. Any key
+/// that doesn't match yields `None`, so an unrelated watched key can never corrupt the set.
+pub fn parse_key(key: &str) -> Option<u64> {
+    key.strip_prefix("blackhole.").and_then(|id| id.parse().ok())
+}
+
+/// Map a deny entry's value to its reason. The value is either a bare token (`spend`/`fraud`) or
+/// a JSON object `{"reason":"spend", ...}`; anything else → `Unknown` (still denied — fail safe).
+pub fn parse_reason(value: &[u8]) -> DenyReason {
+    let text = std::str::from_utf8(value).unwrap_or("").trim();
+    let json = match text.strip_prefix('{') {
+        Some(_) => serde_json::from_slice::<serde_json::Value>(value).ok(),
+        None => None,
+    };
+    let token = match &json {
+        Some(obj) => obj.get("reason").and_then(|r| r.as_str()).unwrap_or(""),
+        None => text,
+    };
+    match token {
+        "fraud" => DenyReason::Fraud,
+        "spend" => DenyReason::Spend,
+        _ => DenyReason::Unknown,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn default_allows_unknown_tenants() {
+        let set = DenySet::new();
+        assert!(!set.is_denied(12345));
+    }
+
+    #[test]
+    fn insert_remove_and_reason() {
+        let mut set = DenySet::new();
+        set.insert(1, DenyReason::Spend);
+        set.insert(2, DenyReason::Fraud);
+        assert!(set.is_denied(1));
+        assert_eq!(set.reason(1), Some(DenyReason::Spend));
+        assert_eq!(set.reason(2).unwrap().http_status(), 403);
+        set.remove(1);
+        assert!(!set.is_denied(1)); // restored
+        assert_eq!(set.len(), 1);
+    }
+
+    #[test]
+    fn key_parsing() {
+        assert_eq!(parse_key("blackhole.42"), Some(42));
+        assert_eq!(parse_key("blackhole.notanumber"), None);
+        assert_eq!(parse_key("signkey.1"), None);
+    }
+
+    #[test]
+    fn reason_parsing_bare_and_json() {
+        assert_eq!(parse_reason(b"spend"), DenyReason::Spend);
+        assert_eq!(parse_reason(b" fraud "), DenyReason::Fraud);
+        assert_eq!(
+            parse_reason(br#"{"reason":"spend","exp":123}"#),
+            DenyReason::Spend
+        );
+        assert_eq!(parse_reason(b"weird"), DenyReason::Unknown);
+    }
+
+    #[test]
+    fn spend_is_402_fraud_is_403() {
+        assert_eq!(DenyReason::Spend.http_status(), 402);
+        assert_eq!(DenyReason::Fraud.http_status(), 403);
+    }
+}
diff --git a/crates/gateway/src/doctor.rs b/crates/gateway/src/doctor.rs
new file mode 100644
index 0000000..1db5852
--- /dev/null
+++ b/crates/gateway/src/doctor.rs
@@ -0,0 +1,61 @@
+//! Diagnostics (PATTERNS.md `doctor` pattern): fast prerequisite checks, exit 0/1.
+
+use crate::config::AiConfig;
+
+pub struct CheckResult {
+    pub name: &'static str,
+    pub passed: bool,
+    pub message: String,
+    pub hint: Option<String>,
+}
+
+fn pass(name: &'static str, message: impl Into<String>) -> CheckResult {
+    CheckResult {
+        name,
+        passed: true,
+        message: message.into(),
+        hint: None,
+    }
+}
+
+fn fail(name: &'static str, message: impl Into<String>, hint: &str) -> CheckResult {
+    CheckResult {
+        name,
+        passed: false,
+        message: message.into(),
+        hint: Some(hint.to_string()),
+    }
+}
+
+pub async fn run_checks(config: &AiConfig) -> Vec<CheckResult> {
+    let mut out = Vec::new();
+
+    // NATS reachability — only the watched deny-set needs it; signing keys come from config.
+    match store::nats_connect(
+        &config.nats_url,
+        config.nats_creds.as_ref().map(|s| s.expose()),
+        config.nats_creds_file.as_deref(),
+    )
+    .await
+    {
+        Ok(_) => out.push(pass("nats", format!("connected to {}", config.nats_url))),
+        Err(e) => out.push(fail(
+            "nats",
+            e.to_string(),
+            "check AI_NATS_URL and credentials",
+        )),
+    }
+
+    out
+}
+
+pub fn print_results(title: &str, results: &[CheckResult]) {
+    println!("== {title} ==");
+    for check in results {
+        let mark = if check.passed { "ok" } else { "FAIL" };
+        println!("[{mark}] {}: {}", check.name, check.message);
+        if let Some(hint) = check.hint.as_ref().filter(|_| !check.passed) {
+            println!("       hint: {hint}");
+        }
+    }
+}
diff --git a/crates/gateway/src/error.rs b/crates/gateway/src/error.rs
new file mode 100644
index 0000000..9f841ea
--- /dev/null
+++ b/crates/gateway/src/error.rs
@@ -0,0 +1,15 @@
+//! Structured error type (PATTERNS.md convention: `thiserror` enum, `From` for foreign errors).
+
+#[derive(Debug, thiserror::Error)]
+pub enum GatewayError {
+    #[error("configuration error: {0}")]
+    Config(String),
+
+    #[error("store error: {0}")]
+    Store(#[from] store::KvError),
+
+    #[error("dns resolution error: {0}")]
+    Dns(String),
+}
+
+pub type Result<T> = std::result::Result<T, GatewayError>;
diff --git a/crates/gateway/src/key.rs b/crates/gateway/src/key.rs
new file mode 100644
index 0000000..923aa1e
--- /dev/null
+++ b/crates/gateway/src/key.rs
@@ -0,0 +1,354 @@
+//! Stateless virtual API key: `bai_v1.{kid}.{payload}.{sig}`.
+//!
+//! The gateway authenticates every request from a `{payload}` it can verify **without a
+//! lookup**: tenant/app identity lives *inside* the token, signed with Ed25519. We hold only
+//! the *public* keys (by `kid`), so a compromised — or third-party / OSS — gateway can verify
+//! but **cannot mint** new tenant keys; the private signing key lives only in the control plane.
+//!
+//! Why signed-token instead of opaque-token + registry lookup: at millions of tenants we don't
+//! want a per-request lookup (latency + a state dependency) just to learn *who* is calling.
+//! Identity is stateless here; the only per-request state is the sparse deny-set (see `deny`),
+//! which is a membership check, not an identity lookup.
+//!
+//! Why deterministic (no nonce/timestamp in the payload): `mint(tenant, app)` is reproducible,
+//! so the control plane can re-derive a tenant's key on demand and store nothing. Revocation is
+//! handled out-of-band by the deny-set, not by per-key expiry.
+
+use base64::Engine;
+use base64::engine::general_purpose::URL_SAFE_NO_PAD;
+use ed25519_dalek::{Signature, Signer, SigningKey, Verifier, VerifyingKey};
+use std::collections::HashMap;
+
+/// Wire prefix + version. Bumping the version is a breaking change to the token format;
+/// the version is inside the signed bytes so it cannot be downgraded by an attacker.
+pub const PREFIX: &str = "bai_v1";
+
+/// Signing-key identifier. Lets the control plane rotate signing keys: new tokens are minted
+/// under a new `kid` while the gateway still trusts the public keys of older, un-retired `kid`s.
+pub type Kid = u32;
+
+/// The identity carried by (and the entire contents of) a virtual key.
+///
+/// `tenant_id`/`vpc_id` are `u64` to match the platform's id width (cf. ClickHouse
+/// `tenant_id UInt64` / `vpc_id UInt64`) and to keep the payload a fixed 16 bytes.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct VirtualKey {
+    pub tenant_id: u64,
+    pub vpc_id: u64,
+}
+
+impl VirtualKey {
+    /// Serialize to the fixed 16-byte payload, `tenant_id ++ vpc_id`, each little-endian. A fixed
+    /// layout (not JSON) keeps the bytes deterministic, which `mint` needs to be reproducible.
+    fn encode_payload(&self) -> [u8; 16] {
+        let mut out = [0u8; 16];
+        let (tenant, vpc) = out.split_at_mut(8);
+        tenant.copy_from_slice(&self.tenant_id.to_le_bytes());
+        vpc.copy_from_slice(&self.vpc_id.to_le_bytes());
+        out
+    }
+
+    /// Inverse of `encode_payload`; `None` unless `bytes` is exactly 16 bytes long.
+    fn decode_payload(bytes: &[u8]) -> Option<Self> {
+        let (tenant, vpc) = <&[u8; 16]>::try_from(bytes).ok()?.split_at(8);
+        Some(Self {
+            tenant_id: u64::from_le_bytes(tenant.try_into().ok()?),
+            vpc_id: u64::from_le_bytes(vpc.try_into().ok()?),
+        })
+    }
+}
+
+#[derive(Debug, thiserror::Error, PartialEq, Eq)]
+pub enum KeyError {
+    #[error("malformed virtual key")]
+    Malformed,
+    #[error("unsupported key version")]
+    BadVersion,
+    #[error("unknown signing key id {0}")]
+    UnknownKid(Kid),
+    #[error("signature verification failed")]
+    BadSignature,
+}
+
+/// The set of trusted Ed25519 public keys, indexed by `kid`. Built once at boot from config
+/// (`signing_keys`); multiple kids may be trusted at once for zero-downtime rotation via redeploy.
+#[derive(Debug, Default, Clone)]
+pub struct Keyring {
+    keys: HashMap<Kid, VerifyingKey>,
+}
+
+impl Keyring {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn insert(&mut self, kid: Kid, key: VerifyingKey) {
+        self.keys.insert(kid, key);
+    }
+
+    pub fn get(&self, kid: Kid) -> Option<&VerifyingKey> {
+        self.keys.get(&kid)
+    }
+
+    pub fn remove(&mut self, kid: Kid) {
+        self.keys.remove(&kid);
+    }
+
+    pub fn len(&self) -> usize {
+        self.keys.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.keys.is_empty()
+    }
+
+    /// Verify a virtual key string and extract its identity. Stateless: the only input besides
+    /// the token is the public keyring. A non-canonical kid (`+7`, `007`) is `Malformed`.
+    pub fn verify(&self, token: &str) -> Result<VirtualKey, KeyError> {
+        // Split into exactly 4 parts: `bai_v1`, kid, payload, sig. `splitn(4, '.')` rejects any
+        // token with fewer separators; a payload/sig never contains '.' (base64url has none).
+        let mut parts = token.splitn(4, '.');
+        let prefix = parts.next().ok_or(KeyError::Malformed)?;
+        let kid_str = parts.next().ok_or(KeyError::Malformed)?;
+        let payload_b64 = parts.next().ok_or(KeyError::Malformed)?;
+        let sig_b64 = parts.next().ok_or(KeyError::Malformed)?;
+
+        if prefix != PREFIX {
+            // Unauthenticated either way; a `bai_vN` with N != 1 reports BadVersion for clarity.
+            return if prefix.starts_with("bai_v") {
+                Err(KeyError::BadVersion)
+            } else {
+                Err(KeyError::Malformed)
+            };
+        }
+
+        // Require the canonical decimal kid that `mint` emits: `parse` also accepts `+7` / `007`,
+        // which would let one signed token verify under many different spellings.
+        let kid: Kid = kid_str.parse().map_err(|_| KeyError::Malformed)?;
+        if kid_str.starts_with('+') || (kid_str.len() > 1 && kid_str.starts_with('0')) {
+            return Err(KeyError::Malformed);
+        }
+
+        // Decode the fixed-size fields (16-byte payload, 64-byte signature) onto the stack — no
+        // heap allocation on the verify hot path. `decode_slice` bounds-checks against a (ceil)
+        // estimate, so the buffers are a bit oversized; any wrong-size field → Malformed.
+        let mut payload_buf = [0u8; 24]; // ≥ estimate for a 22-char (16-byte) payload
+        let plen = URL_SAFE_NO_PAD
+            .decode_slice(payload_b64, &mut payload_buf)
+            .map_err(|_| KeyError::Malformed)?;
+        let payload = &payload_buf[..plen];
+
+        let mut sig_buf = [0u8; 72]; // ≥ estimate for an 86-char (64-byte) signature
+        let slen = URL_SAFE_NO_PAD
+            .decode_slice(sig_b64, &mut sig_buf)
+            .map_err(|_| KeyError::Malformed)?;
+        let sig_arr: [u8; 64] = sig_buf[..slen]
+            .try_into()
+            .map_err(|_| KeyError::Malformed)?;
+        let signature = Signature::from_bytes(&sig_arr);
+
+        // Resolve the key first: an unknown kid is a distinct, cheap rejection (no crypto run).
+        let vk = self.get(kid).ok_or(KeyError::UnknownKid(kid))?;
+
+        // Signed bytes bind version + kid + payload; stack buffer (≤ 40 bytes), no allocation.
+        let mut signed_buf = [0u8; SIGNED_BYTES_CAP];
+        let signed = write_signed_bytes(&mut signed_buf, kid, payload_b64);
+        vk.verify(signed, &signature)
+            .map_err(|_| KeyError::BadSignature)?;
+
+        VirtualKey::decode_payload(payload).ok_or(KeyError::Malformed)
+    }
+}
+
+/// Upper bound on `bai_v1.{kid}.{payload}`: `PREFIX` (6) + `.` + a `u32` kid (≤ 10 digits) + `.`
+/// + a 16-byte base64url payload (22 chars) = 40 bytes. 64 leaves headroom.
+const SIGNED_BYTES_CAP: usize = 64;
+
+/// Render the signature-covered bytes `bai_v1.{kid}.{payload}` into `buf` and return the filled
+/// prefix. Covering kid + payload is what prevents a valid signature from being re-pointed at
+/// another kid or a tampered payload. Callers keep the length within `SIGNED_BYTES_CAP` (this
+/// panics otherwise), so the fixed buffer keeps both verify and mint allocation-free.
+fn write_signed_bytes<'a>(
+    buf: &'a mut [u8; SIGNED_BYTES_CAP],
+    kid: Kid,
+    payload_b64: &str,
+) -> &'a [u8] {
+    use std::io::Write;
+    let mut rest: &mut [u8] = &mut buf[..];
+    write!(rest, "{PREFIX}.{kid}.{payload_b64}").expect("signed bytes fit in SIGNED_BYTES_CAP");
+    let written = SIGNED_BYTES_CAP - rest.len();
+    &buf[..written]
+}
+
+/// Parse an Ed25519 public key from a stored value, e.g. a `signing_keys` config entry.
+///
+/// Accepted forms: the raw 32 key bytes, or text that decodes to 32 bytes with the base64
+/// `STANDARD` engine or, failing that, `URL_SAFE_NO_PAD` (surrounding whitespace is trimmed), so
+/// the control plane can store whichever form. Returns `None` for anything else, or when
+/// `VerifyingKey::from_bytes` rejects the 32 bytes.
+/// NOTE(review): an input of exactly 32 bytes is always taken as the raw form, never as text.
+pub fn verifying_key_from_value(bytes: &[u8]) -> Option<VerifyingKey> {
+    let raw: [u8; 32] = match <[u8; 32]>::try_from(bytes) {
+        Ok(raw) => raw,
+        Err(_) => {
+            let text = std::str::from_utf8(bytes).ok()?.trim();
+            [&base64::engine::general_purpose::STANDARD, &URL_SAFE_NO_PAD]
+                .into_iter()
+                .filter_map(|engine| engine.decode(text).ok())
+                .find_map(|decoded| <[u8; 32]>::try_from(decoded.as_slice()).ok())?
+        }
+    };
+    VerifyingKey::from_bytes(&raw).ok()
+}
+
+/// Mint a virtual key. Lives here for tests + determinism checks and as the reference
+/// implementation; production minting is the Go control plane (`crypto/ed25519`), which must
+/// produce byte-identical output for the same inputs.
+pub fn mint(vk: &VirtualKey, kid: Kid, signing_key: &SigningKey) -> String {
+    let payload_b64 = URL_SAFE_NO_PAD.encode(vk.encode_payload());
+    let mut signed_buf = [0u8; SIGNED_BYTES_CAP];
+    let signed = write_signed_bytes(&mut signed_buf, kid, &payload_b64);
+    let sig: Signature = signing_key.sign(signed);
+    let sig_b64 = URL_SAFE_NO_PAD.encode(sig.to_bytes());
+    format!("{PREFIX}.{kid}.{payload_b64}.{sig_b64}")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Deterministic test keypair from a fixed seed — avoids an RNG dep and keeps tests reproducible.
+    fn test_keypair(seed: u8) -> (SigningKey, VerifyingKey) {
+        let sk = SigningKey::from_bytes(&[seed; 32]);
+        let vk = sk.verifying_key();
+        (sk, vk)
+    }
+
+    fn ring_with(kid: Kid, vk: VerifyingKey) -> Keyring {
+        let mut r = Keyring::new();
+        r.insert(kid, vk);
+        r
+    }
+
+    #[test]
+    fn mint_then_verify_roundtrips_identity() {
+        let (sk, vk) = test_keypair(1);
+        let ring = ring_with(7, vk);
+        let id = VirtualKey {
+            tenant_id: 42,
+            vpc_id: 99,
+        };
+
+        let token = mint(&id, 7, &sk);
+        assert_eq!(ring.verify(&token).unwrap(), id);
+    }
+
+    #[test]
+    fn mint_is_deterministic() {
+        let (sk, _) = test_keypair(2);
+        let id = VirtualKey {
+            tenant_id: 1,
+            vpc_id: 2,
+        };
+        // Ed25519 is deterministic (RFC 8032) and the payload has no nonce, so two mints match.
+        assert_eq!(mint(&id, 1, &sk), mint(&id, 1, &sk));
+    }
+
+    #[test]
+    fn tampered_payload_is_rejected() {
+        let (sk, vk) = test_keypair(3);
+        let ring = ring_with(1, vk);
+        let token = mint(
+            &VirtualKey {
+                tenant_id: 10,
+                vpc_id: 20,
+            },
+            1,
+            &sk,
+        );
+
+        // Flip a byte in the payload segment; the signature no longer covers it.
+        let mut parts: Vec<&str> = token.split('.').collect();
+        let mut payload = URL_SAFE_NO_PAD.decode(parts[2]).unwrap();
+        payload[0] ^= 0xff;
+        let tampered_payload = URL_SAFE_NO_PAD.encode(&payload);
+        parts[2] = &tampered_payload;
+        let tampered = parts.join(".");
+
+        assert_eq!(ring.verify(&tampered), Err(KeyError::BadSignature));
+    }
+
+    #[test]
+    fn tampered_signature_is_rejected() {
+        let (sk, vk) = test_keypair(4);
+        let ring = ring_with(1, vk);
+        let token = mint(
+            &VirtualKey {
+                tenant_id: 5,
+                vpc_id: 6,
+            },
+            1,
+            &sk,
+        );
+
+        let mut sig = URL_SAFE_NO_PAD
+            .decode(token.rsplit('.').next().unwrap())
+            .unwrap();
+        sig[0] ^= 0xff;
+        let bad_sig = URL_SAFE_NO_PAD.encode(&sig);
+        let base = &token[..token.rfind('.').unwrap()];
+        let tampered = format!("{base}.{bad_sig}");
+
+        assert_eq!(ring.verify(&tampered), Err(KeyError::BadSignature));
+    }
+
+    #[test]
+    fn unknown_kid_is_rejected_without_crypto() {
+        let (sk, vk) = test_keypair(5);
+        let ring = ring_with(1, vk); // trusts kid=1 only
+        let token = mint(
+            &VirtualKey {
+                tenant_id: 1,
+                vpc_id: 1,
+            },
+            2,
+            &sk,
+        ); // minted under kid=2
+        assert_eq!(ring.verify(&token), Err(KeyError::UnknownKid(2)));
+    }
+
+    #[test]
+    fn signature_from_a_different_kid_is_rejected() {
+        // A valid signature minted under kid=2 must not verify when presented as kid=1, even if
+        // the gateway trusts both — because kid is part of the signed bytes.
+        let (sk1, vk1) = test_keypair(6);
+        let (sk2, vk2) = test_keypair(7);
+        let mut ring = Keyring::new();
+        ring.insert(1, vk1);
+        ring.insert(2, vk2);
+
+        let id = VirtualKey {
+            tenant_id: 3,
+            vpc_id: 4,
+        };
+        let token2 = mint(&id, 2, &sk2);
+        // Re-label the kid segment as 1 while keeping kid=2's signature.
+        let parts: Vec<&str> = token2.split('.').collect();
+        let relabeled = format!("{}.1.{}.{}", parts[0], parts[2], parts[3]);
+        assert_eq!(ring.verify(&relabeled), Err(KeyError::BadSignature));
+        let _ = sk1;
+    }
+
+    #[test]
+    fn malformed_and_version_errors() {
+        let (_, vk) = test_keypair(8);
+        let ring = ring_with(1, vk);
+        assert_eq!(ring.verify("garbage"), Err(KeyError::Malformed));
+        assert_eq!(ring.verify("bai_v1.1.only-three"), Err(KeyError::Malformed));
+        assert_eq!(ring.verify("bai_v2.1.aaaa.bbbb"), Err(KeyError::BadVersion));
+        assert_eq!(
+            ring.verify("sk-openai.1.aaaa.bbbb"),
+            Err(KeyError::Malformed)
+        );
+    }
+}
diff --git a/crates/gateway/src/lib.rs b/crates/gateway/src/lib.rs
new file mode 100644
index 0000000..0c529d3
--- /dev/null
+++ b/crates/gateway/src/lib.rs
@@ -0,0 +1,25 @@
+//! Beyond AI gateway library.
+//!
+//! `src/main.rs` wires these modules into a Pingora `ProxyHttp` service. The load-bearing logic
+//! (virtual-key verification, deny-set, usage parsing, routing, request peek) lives in modules
+//! free of Pingora/IO so it is unit-tested without a running proxy or live providers.
+
+// Application crate: no `unsafe` is needed, so forbid it outright. `unused_must_use` is denied so
+// a dropped `Result` (e.g. an unchecked `write_response_*`) is a hard error, not a silent swallow.
+#![deny(unsafe_code)]
+#![deny(unused_must_use)]
+
+pub mod config;
+pub mod deny;
+pub mod doctor;
+pub mod error;
+pub mod key;
+pub mod metrics;
+pub mod peek;
+pub mod proxy;
+pub mod ratelimit;
+pub mod route;
+pub mod secret;
+pub mod state;
+pub mod store_watch;
+pub mod usage;
diff --git a/crates/gateway/src/main.rs b/crates/gateway/src/main.rs
new file mode 100644
index 0000000..01e662b
--- /dev/null
+++ b/crates/gateway/src/main.rs
@@ -0,0 +1,132 @@
+//! Beyond AI gateway binary: clap `Run`/`Doctor`, Pingora server bootstrap, services.
+
+use beyond_ai::config::AiConfig;
+use beyond_ai::doctor;
+use beyond_ai::metrics::Metrics;
+use beyond_ai::proxy::AiProxy;
+use beyond_ai::state::GatewayState;
+use beyond_ai::store_watch::WatcherService;
+use clap::{Parser, Subcommand};
+use pingora_core::server::Server;
+use pingora_core::services::background::background_service;
+use pingora_proxy::http_proxy_service;
+use std::path::Path;
+use std::process::exit;
+use tracing_subscriber::EnvFilter;
+use tracing_subscriber::layer::SubscriberExt;
+use tracing_subscriber::util::SubscriberInitExt;
+
+#[derive(Parser)]
+#[command(
+    name = "beyond-ai",
+    about = "Beyond AI gateway — egress proxy to LLM providers"
+)]
+struct Cli {
+    /// Path to config file (defaults to ./config.toml).
+    #[arg(short, long, env = "AI_CONFIG_PATH", global = true)]
+    config: Option<std::path::PathBuf>,
+
+    #[command(subcommand)]
+    command: Option<Commands>,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    /// Run prerequisite diagnostics and exit.
+    Doctor,
+    /// Start the gateway (default).
+    Run,
+}
+
+/// Load the gateway config from `path` (`None` lets `AiConfig::load_with_path` pick its default).
+/// Startup cannot proceed without one, so a load error is printed to stderr and the process exits
+/// with status 1 instead of returning.
+fn load_config(path: Option<&Path>) -> AiConfig {
+    AiConfig::load_with_path(path).unwrap_or_else(|e| {
+        eprintln!("failed to load config: {e}");
+        exit(1)
+    })
+}
+
+fn init_tracing() {
+    // JSON to stdout; the `ai.usage` target carries billing facts that logfwd/OTLP ships to
+    // ClickHouse. `AI_LOG` overrides the level filter.
+    let filter = EnvFilter::try_from_env("AI_LOG").unwrap_or_else(|_| EnvFilter::new("info"));
+    tracing_subscriber::registry()
+        .with(tracing_subscriber::fmt::layer().json())
+        .with(filter)
+        .init();
+}
+
+fn main() {
+    // rustls 0.23 requires a process-wide crypto provider for the TLS connections to providers.
+    rustls::crypto::ring::default_provider()
+        .install_default()
+        .ok();
+
+    let cli = Cli::parse();
+
+    // Doctor runs before any server setup (minimal current-thread runtime), exits 0/1.
+    if matches!(cli.command, Some(Commands::Doctor)) {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("runtime");
+        let config = load_config(cli.config.as_deref());
+        let results = rt.block_on(doctor::run_checks(&config));
+        doctor::print_results("Beyond AI Gateway Doctor", &results);
+        exit(if results.iter().all(|r| r.passed) {
+            0
+        } else {
+            1
+        });
+    }
+
+    init_tracing();
+    let config = load_config(cli.config.as_deref());
+    let listen = config.listen.clone();
+    let metrics_listen = config.metrics_listen.clone();
+    let metrics = match Metrics::new() {
+        Ok(m) => m,
+        Err(e) => {
+            eprintln!("failed to register metrics: {e}");
+            exit(1);
+        }
+    };
+    let state = match GatewayState::new(config, metrics) {
+        Ok(s) => s,
+        Err(e) => {
+            eprintln!("failed to build gateway state: {e}");
+            exit(1);
+        }
+    };
+
+    let mut server = Server::new(None).expect("failed to init pingora server");
+    server.bootstrap();
+
+    // Client (app) traffic.
+    let mut proxy_svc = http_proxy_service(
+        &server.configuration,
+        AiProxy {
+            state: state.clone(),
+        },
+    );
+    proxy_svc.add_tcp(&listen);
+    server.add_service(proxy_svc);
+
+    // slipstream watchers + NATS connectivity (connects on Pingora's runtime; see WatcherService).
+    server.add_service(background_service(
+        "ai-watchers",
+        WatcherService {
+            state: state.clone(),
+        },
+    ));
+
+    // Prometheus /metrics (serves the default registry that `Metrics` registered on).
+    let mut prom = pingora::services::listening::Service::prometheus_http_service();
+    prom.add_tcp(&metrics_listen);
+    server.add_service(prom);
+
+    tracing::info!(%listen, %metrics_listen, "starting beyond-ai");
+    server.run_forever();
+}
diff --git a/crates/gateway/src/metrics.rs b/crates/gateway/src/metrics.rs
new file mode 100644
index 0000000..81ee864
--- /dev/null
+++ b/crates/gateway/src/metrics.rs
@@ -0,0 +1,67 @@
+//! Prometheus metrics (PATTERNS.md: `Arc<Metrics>`).
+//!
+//! Registered on the **default** registry so Pingora's built-in `prometheus_http_service`
+//! exposes them with no extra wiring. `Metrics::new` is called exactly once (in `main`).
+
+use prometheus::{
+    Histogram, HistogramOpts, IntCounter, IntCounterVec, IntGauge, Opts, default_registry,
+};
+use std::sync::Arc;
+
+pub struct Metrics {
+    pub requests_total: IntCounter,
+    /// Labeled by reason ("auth", "deny_spend", "deny_fraud", "rate_limit") so we can see *why*.
+    pub rejections_total: IntCounterVec,
+    /// Labeled by kind: input|output.
+    pub tokens_total: IntCounterVec,
+    pub ttft_seconds: Histogram,
+    pub upstream_latency_seconds: Histogram,
+    pub active_streams: IntGauge,
+}
+
+impl Metrics {
+    /// Build and register every metric on the default registry. Fallible: registering a name that
+    /// already exists (a second `Metrics::new()` against the process-wide default registry) returns
+    /// `AlreadyRegisteredError` rather than panicking, so a double-init surfaces as an error the
+    /// caller can report instead of crashing the process.
+    pub fn new() -> prometheus::Result<Arc<Self>> {
+        let r = default_registry();
+
+        let requests_total =
+            IntCounter::with_opts(Opts::new("ai_requests_total", "Total requests handled"))?;
+        let rejections_total = IntCounterVec::new(
+            Opts::new("ai_rejections_total", "Requests rejected before upstream"),
+            &["reason"],
+        )?;
+        let tokens_total =
+            IntCounterVec::new(Opts::new("ai_tokens_total", "Tokens metered"), &["kind"])?;
+        let ttft_seconds = Histogram::with_opts(HistogramOpts::new(
+            "ai_ttft_seconds",
+            "Time to first byte from upstream",
+        ))?;
+        let upstream_latency_seconds = Histogram::with_opts(HistogramOpts::new(
+            "ai_upstream_latency_seconds",
+            "Full upstream request duration",
+        ))?;
+        let active_streams = IntGauge::with_opts(Opts::new(
+            "ai_active_streams",
+            "In-flight streaming responses",
+        ))?;
+
+        r.register(Box::new(requests_total.clone()))?;
+        r.register(Box::new(rejections_total.clone()))?;
+        r.register(Box::new(tokens_total.clone()))?;
+        r.register(Box::new(ttft_seconds.clone()))?;
+        r.register(Box::new(upstream_latency_seconds.clone()))?;
+        r.register(Box::new(active_streams.clone()))?;
+
+        Ok(Arc::new(Self {
+            requests_total,
+            rejections_total,
+            tokens_total,
+            ttft_seconds,
+            upstream_latency_seconds,
+            active_streams,
+        }))
+    }
+}
diff --git a/crates/gateway/src/peek.rs b/crates/gateway/src/peek.rs
new file mode 100644
index 0000000..0b4a924
--- /dev/null
+++ b/crates/gateway/src/peek.rs
@@ -0,0 +1,250 @@
+//! Streaming, 100%-accurate extraction of the **root-level `model`** from a JSON request body.
+//!
+//! Both OpenAI and Anthropic require `model` as a top-level field of the request object. We extract
+//! it with a structural state machine fed the body chunks *as they stream through* — the body is
+//! never buffered or reordered. This is exact (not a byte-heuristic): it tracks nesting depth and
+//! string/escape state, so a `"model"` appearing inside a nested object (e.g. a message) or inside
+//! a string value is correctly ignored, and field order is irrelevant. Memory is O(1): only short
+//! root-level *keys* and the `model` value are accumulated. Large uninteresting string content
+//! (system prompts, base64 images) is skipped with a SIMD-accelerated `memchr2` search to the next
+//! `"`/`\`, not inspected byte-by-byte — so even a multi-MB request is walked cheaply.
+
+#[derive(Clone, Copy, PartialEq, Default)]
+enum Cap {
+    #[default]
+    No,
+    Key,
+    ModelValue,
+}
+
+#[derive(Default)]
+pub struct ModelScanner {
+    model: Option<String>,
+    done: bool,
+    /// Nesting depth: number of currently-open `{`/`[`. Root object contents are at depth 1.
+    depth: u32,
+    root_is_object: bool,
+    in_string: bool,
+    escaped: bool,
+    /// Whether the next root-level string is a key (`{`/`,` → key; `:` → value).
+    expect_key: bool,
+    /// The most recent root-level key was exactly `model`.
+    last_key_is_model: bool,
+    /// What (if anything) we're accumulating into `cur` for the current string.
+    cap: Cap,
+    cur: Vec<u8>,
+}
+
+impl ModelScanner {
+    /// Create a scanner positioned before the first byte of the body.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Take the extracted model, if found. (Available as soon as the value is seen.)
+    pub fn take_model(&mut self) -> Option<String> {
+        self.model.take()
+    }
+
+    /// True while the cursor sits directly inside a root-level JSON object.
+    #[inline]
+    fn at_root_object(&self) -> bool {
+        self.depth == 1 && self.root_is_object
+    }
+
+    /// Feed the next body chunk. Chunks may split anywhere (mid-key, mid-escape); once the model
+    /// is found every later call is a no-op. Captured strings are bounded by `MAX_CAPTURE`, so a
+    /// hostile multi-MB root key or `model` value cannot make the scanner buffer the body.
+    pub fn feed(&mut self, bytes: &[u8]) {
+        // Longest key/model value kept. Storage stops one byte past this, which is enough to tell
+        // "too long" apart from "exactly at the limit"; real model ids are far shorter.
+        const MAX_CAPTURE: usize = 256;
+        if self.done {
+            return;
+        }
+        let mut i = 0;
+        let n = bytes.len();
+        while i < n {
+            if self.in_string {
+                // Fast path: content we don't accumulate (a base64 image, a long prompt, anything
+                // nested) — jump straight to the next `"` or `\` with a SIMD-accelerated search.
+                if self.cap == Cap::No && !self.escaped {
+                    match memchr::memchr2(b'"', b'\\', &bytes[i..]) {
+                        Some(rel) => i += rel,
+                        None => return, // rest of this chunk is skippable string content
+                    }
+                }
+                let b = bytes[i];
+                i += 1;
+                if self.escaped {
+                    self.escaped = false;
+                    if self.cap != Cap::No && self.cur.len() <= MAX_CAPTURE {
+                        self.cur.push(b);
+                    }
+                } else if b == b'\\' {
+                    self.escaped = true;
+                } else if b == b'"' {
+                    self.in_string = false;
+                    match self.cap {
+                        Cap::Key => self.last_key_is_model = self.cur == b"model",
+                        Cap::ModelValue => {
+                            // A valid model id is short UTF-8. An oversized value, or non-UTF-8
+                            // bytes smuggled in by a malformed/adversarial body, is recorded as
+                            // "unknown" rather than billed under a truncated/corrupted name.
+                            let raw = std::mem::take(&mut self.cur);
+                            self.model = Some(match String::from_utf8(raw) {
+                                Ok(m) if m.len() <= MAX_CAPTURE => m,
+                                _ => "unknown".to_string(),
+                            });
+                            self.done = true;
+                            return;
+                        }
+                        Cap::No => {}
+                    }
+                    self.cap = Cap::No;
+                    self.cur.clear();
+                } else if self.cap != Cap::No && self.cur.len() <= MAX_CAPTURE {
+                    self.cur.push(b);
+                }
+                continue;
+            }
+
+            let b = bytes[i];
+            i += 1;
+            match b {
+                b'"' => {
+                    self.in_string = true;
+                    self.cur.clear();
+                    // Only root-object keys and the `model` value are worth accumulating.
+                    self.cap = if !self.at_root_object() {
+                        Cap::No
+                    } else if self.expect_key {
+                        Cap::Key
+                    } else if self.last_key_is_model {
+                        Cap::ModelValue
+                    } else {
+                        Cap::No
+                    };
+                }
+                b'{' | b'[' => {
+                    if self.depth == 0 {
+                        // The root container's kind decides whether any key can ever match.
+                        self.root_is_object = b == b'{';
+                        self.expect_key |= self.root_is_object;
+                    }
+                    self.depth += 1;
+                }
+                b'}' | b']' => self.depth = self.depth.saturating_sub(1),
+                b':' if self.depth == 1 => self.expect_key = false,
+                b',' if self.depth == 1 => {
+                    self.expect_key = true;
+                    self.last_key_is_model = false;
+                }
+                _ => {}
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn scan(body: &[u8]) -> Option<String> {
+        let mut s = ModelScanner::new();
+        s.feed(body);
+        s.take_model()
+    }
+
+    #[test]
+    fn simple() {
+        assert_eq!(
+            scan(br#"{"model":"gpt-4o","messages":[]}"#).as_deref(),
+            Some("gpt-4o")
+        );
+    }
+
+    #[test]
+    fn model_last_after_huge_array() {
+        let body = br#"{"messages":[{"role":"user","content":"...lots of text..."}],"stream":true,"model":"claude-opus-4-8"}"#;
+        assert_eq!(scan(body).as_deref(), Some("claude-opus-4-8"));
+    }
+
+    #[test]
+    fn nested_model_is_ignored() {
+        // `"model"` inside a message object must NOT win over the real root-level one.
+        let body = br#"{"messages":[{"role":"x","model":"NESTED"}],"model":"real"}"#;
+        assert_eq!(scan(body).as_deref(), Some("real"));
+    }
+
+    #[test]
+    fn model_word_inside_a_string_value_is_ignored() {
+        let body = br#"{"system":"use the model called \"gpt\" please","model":"real"}"#;
+        assert_eq!(scan(body).as_deref(), Some("real"));
+    }
+
+    #[test]
+    fn whitespace_tolerant() {
+        assert_eq!(
+            scan(br#"{  "model" :  "m1" , "x":1 }"#).as_deref(),
+            Some("m1")
+        );
+    }
+
+    #[test]
+    fn vendor_prefixed_value() {
+        assert_eq!(
+            scan(br#"{"model":"openrouter/meta-llama/llama-3.1"}"#).as_deref(),
+            Some("openrouter/meta-llama/llama-3.1")
+        );
+    }
+
+    #[test]
+    fn split_across_feeds() {
+        let mut s = ModelScanner::new();
+        for part in [
+            &b"{\"messages\":[],\"mod"[..],
+            &b"el\":\"gp"[..],
+            &b"t-4o\"}"[..],
+        ] {
+            s.feed(part);
+        }
+        assert_eq!(s.take_model().as_deref(), Some("gpt-4o"));
+    }
+
+    #[test]
+    fn absent_is_none() {
+        assert_eq!(scan(br#"{"messages":[]}"#), None);
+        assert_eq!(scan(b"not json"), None);
+    }
+
+    #[test]
+    fn large_skipped_value_then_model() {
+        // Exercises the SIMD fast-skip: a ~256KB content string (with an escaped quote) then the
+        // real model. Must skip the bulk and still find the root model exactly.
+        let big = "x".repeat(256 * 1024);
+        let body =
+            format!(r#"{{"messages":[{{"content":"{big}\"still in string"}}],"model":"gpt-4o"}}"#);
+        assert_eq!(scan(body.as_bytes()).as_deref(), Some("gpt-4o"));
+    }
+
+    #[test]
+    fn nested_object_value_then_root_model() {
+        // A root key whose value is an object, followed by the real model.
+        let body = br#"{"response_format":{"type":"json_object"},"model":"gpt-4o"}"#;
+        assert_eq!(scan(body).as_deref(), Some("gpt-4o"));
+    }
+
+    #[test]
+    fn escaped_quote_inside_model_value_does_not_terminate_it() {
+        // An escaped `"` *inside the model value itself* exercises the `Cap::ModelValue` escape
+        // path in `feed`: the backslash-escaped quote must be kept in the accumulated value rather
+        // than ending the string early. (Model ids never really contain quotes, but a structural
+        // regression here would truncate the model — and thus mislabel usage — for any value that
+        // happens to contain an escape.)
+        assert_eq!(
+            scan(br#"{"model":"gpt-4\"o"}"#).as_deref(),
+            Some("gpt-4\"o")
+        );
+    }
+}
diff --git a/crates/gateway/src/proxy.rs b/crates/gateway/src/proxy.rs
new file mode 100644
index 0000000..d846f4e
--- /dev/null
+++ b/crates/gateway/src/proxy.rs
@@ -0,0 +1,507 @@
+//! The Pingora `ProxyHttp` passthrough service.
+//!
+//! Flow: verify the virtual key (stateless) → deny-set check (O(1), default-allow) → pick the
+//! provider from the ingress dialect (+ optional `x-beyond-provider` override) → swap the auth
+//! header to the pool key (managed only) → **stream the request body straight through** (never
+//! buffered; original framing preserved) while feeding it to a structural scanner that extracts the
+//! exact root-level `model` → relay the response **without buffering** → tap usage from a bounded
+//! tail → emit a usage fact. Whether the call is streaming is derived from the *response*
+//! Content-Type.
+//!
+//! Verified end-to-end (`tests/e2e.rs`): a real `beyond-ai` binary against real nats-server + a
+//! mock upstream — passthrough fidelity, key swap, usage metering (non-streaming + SSE), BYO
+//! passthrough, and deny-set propagation all pass.
+//!
+//! We never read the request body in `request_filter`: Pingora's body-forward phase reads the
+//! downstream body itself, so draining it earlier would make Pingora send `Content-Length` bytes
+//! with no body and the upstream would hang. We let the body flow through `request_body_filter`
+//! (the supported hook), feeding each chunk to a streaming structural scanner (`peek::ModelScanner`,
+//! O(1) memory) — never withholding or buffering it.
+//!
+//! Not done by design: OpenAI `stream_options.include_usage` injection — a streaming OpenAI client
+//! that omits it has no usage chunk to meter (the SDK/platform can set it). Worth it to keep the
+//! request body a pure passthrough rather than buffering+rewriting every request.
+//!
+//! Auth branches on key format: `bai_…` is a managed virtual key (verify → deny-check → swap to
+//! the pool key); anything else is a **BYO** request — the user's own provider token, passed
+//! through unchanged (no swap, no Beyond identity, no deny-set).
+//!
+//! Consequence: routing is by **dialect**, not model — the body (hence model) isn't known when
+//! `upstream_peer` runs. Any non-default provider is reached via the `x-beyond-provider` header
+//! (providers are data — see `route`). Model is still captured (from the streamed body) for usage.
+
+use crate::ratelimit::RlKey;
+use crate::route::{self, Dialect, Provider};
+use crate::state::GatewayState;
+use crate::{peek, usage};
+use async_trait::async_trait;
+use bytes::Bytes;
+use pingora::http::ResponseHeader;
+use pingora_core::Result;
+use pingora_core::upstreams::peer::HttpPeer;
+use pingora_proxy::{ProxyHttp, Session};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+use tracing::info;
+
+/// Reject requests whose declared Content-Length exceeds this. The body itself is **not** buffered
+/// (it streams straight through); this is purely an abuse guard checked up front via the header.
+const MAX_REQUEST_BODY: usize = 100 * 1024 * 1024;
+
+/// Bounded tail of the response kept for usage extraction. The usage event is the final SSE chunk
+/// / the whole non-streaming body; keeping a tail means we never buffer a long stream.
+const USAGE_TAIL_CAP: usize = 64 * 1024;
+
+/// Max upstream **connect** retries before surfacing the failure to the client.
+///
+/// We retry connect failures only (the idiomatic Pingora pattern, same as edge). Retrying on a
+/// received **5xx/429 response** is deliberately *not* done: Pingora 0.8 has no clean
+/// post-response retry hook for a streaming passthrough (edge doesn't do it either), the upstream
+/// may have started streaming, and the provider SDKs already back off on 429/5xx + `Retry-After`.
+const MAX_CONNECT_RETRIES: u8 = 2;
+
+pub struct AiProxy {
+    pub state: Arc<GatewayState>,
+}
+
+/// Per-request context. `None` until `request_filter` admits the request; short-circuited
+/// requests (auth/deny failures) leave it `None`, so later filters no-op.
+pub struct RequestCtx {
+    tenant_id: u64,
+    vpc_id: u64,
+    dialect: Dialect,
+    /// The resolved upstream provider (authority/host + precomputed managed auth value), shared from
+    /// the boot-time registry — a cheap `Arc` clone, nothing re-allocated per request.
+    provider: Arc<Provider>,
+    /// Whether this is a **managed** request (`bai_…` key → swap to the pool key). `false` for
+    /// **BYO** — we leave the user's own auth header untouched (passthrough).
+    managed: bool,
+    /// Requested model, extracted exactly from the request body via a streaming structural scan.
+    model: String,
+    model_scanner: peek::ModelScanner,
+    /// Whether the upstream response is an SSE stream — set in `response_filter` from the response
+    /// Content-Type (we don't read the request to learn this).
+    streaming: bool,
+    /// Bounded tail of the response, for the usage tap.
+    resp_tail: Vec<u8>,
+    /// Running total of request-body bytes seen, to enforce `MAX_REQUEST_BODY` even when the client
+    /// uses chunked transfer encoding (no `Content-Length` to check up front).
+    body_bytes_fed: usize,
+    start: Instant,
+    /// Connect-retry counter (see `fail_to_connect`).
+    attempt: u8,
+}
+
+impl AiProxy {
+    /// Write a small JSON error and signal `request_filter` to short-circuit.
+    async fn reject(session: &mut Session, status: u16, typ: &str, msg: &str) -> Result<bool> {
+        let body = Bytes::from(format!(
+            r#"{{"error":{{"type":"{typ}","message":"{msg}"}}}}"#
+        ));
+        let mut resp = ResponseHeader::build(status, None)?;
+        resp.insert_header("content-type", "application/json")?;
+        resp.insert_header("content-length", body.len().to_string())?;
+        session.write_response_header(Box::new(resp), false).await?;
+        session.write_response_body(Some(body), true).await?;
+        Ok(true)
+    }
+}
+
+fn extract_virtual_key(session: &Session) -> Option<&str> {
+    let h = session.req_header();
+    // Anthropic SDK sends `x-api-key`; OpenAI SDK sends `Authorization: Bearer`. One neutral
+    // virtual key works in either, so check both. Borrowed from the header — no per-request copy.
+    if let Some(v) = h.headers.get("x-api-key").and_then(|v| v.to_str().ok()) {
+        return Some(v);
+    }
+    // The auth scheme is case-insensitive (RFC 9110 §11.1), so accept `bearer`/`BEARER` too.
+    let v = h.headers.get("authorization")?.to_str().ok()?;
+    let (scheme, token) = v.split_once(' ')?;
+    scheme.eq_ignore_ascii_case("bearer").then_some(token)
+}
+
+/// Pick the wire dialect from the request path alone, by prefix match.
+fn dialect_for_path(path: &str) -> Dialect {
+    // Anthropic Messages vs OpenAI Chat Completions/Embeddings. Embeddings are OpenAI-dialect only.
+    match path.strip_prefix("/v1/messages") {
+        Some(_) => Dialect::Anthropic,
+        None => Dialect::OpenAI,
+    }
+}
+
+/// The `x-beyond-provider` override value, if present — a provider *name* resolved against the
+/// registry in `request_filter`. (An unknown name is rejected there, not silently ignored.)
+///
+/// Returns `None` both when the header is absent and when `HeaderValue::to_str` rejects its
+/// value; in either case `request_filter` falls back to the dialect's default provider.
+fn provider_override(session: &Session) -> Option<&str> {
+    let headers = &session.req_header().headers;
+    let raw = headers.get("x-beyond-provider")?;
+    raw.to_str().ok()
+}
+
+#[async_trait]
+impl ProxyHttp for AiProxy {
+    type CTX = Option<RequestCtx>;
+
+    fn new_ctx(&self) -> Self::CTX {
+        None
+    }
+
+    async fn request_filter(&self, session: &mut Session, ctx: &mut Self::CTX) -> Result<bool> {
+        self.state.metrics.requests_total.inc();
+        let start = Instant::now();
+
+        // 1. Resolve the upstream provider first — from the ingress dialect (the body/model isn't
+        // available pre-connect), with an explicit `x-beyond-provider` override. Resolving up front
+        // means an unknown provider is a clean 400 before any auth work, and (since it borrows
+        // nothing) keeps the borrow checker happy when the key is extracted next. An `Arc` clone of
+        // the boot-time registry entry — nothing re-allocated per request.
+        let dialect = dialect_for_path(session.req_header().uri.path());
+        let provider = match provider_override(session) {
+            Some(name) => self.state.provider(name).cloned(),
+            None => self
+                .state
+                .provider(route::dialect_default(dialect))
+                .cloned(),
+        };
+        let Some(provider) = provider else {
+            return Self::reject(session, 400, "invalid_request_error", "unknown provider").await;
+        };
+
+        // 2. Extract the presented key — a managed virtual key (`bai_…`) or a raw BYO provider token.
+        let Some(raw_key) = extract_virtual_key(session) else {
+            return Self::reject(session, 401, "authentication_error", "missing API key").await;
+        };
+
+        // 3. Reject oversized bodies up front (Content-Length) so we never buffer a huge upload.
+        if let Some(len) = session
+            .req_header()
+            .headers
+            .get("content-length")
+            .and_then(|v| v.to_str().ok())
+            .and_then(|v| v.parse::<usize>().ok())
+        {
+            if len > MAX_REQUEST_BODY {
+                return Self::reject(
+                    session,
+                    413,
+                    "invalid_request_error",
+                    "request body too large",
+                )
+                .await;
+            }
+        }
+
+        // 4. Identity + key handling. `bai_…` → managed (stateless verify → deny-check → swap to the
+        // pool key). Anything else → BYO: the user's own provider token, passed through unchanged
+        // (no Beyond identity, so no deny-set and no per-tenant attribution).
+        let (tenant_id, vpc_id, managed) = if raw_key.starts_with("bai_") {
+            let Ok(identity) = self.state.keyring.verify(raw_key) else {
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&["auth"])
+                    .inc();
+                return Self::reject(session, 401, "authentication_error", "invalid API key").await;
+            };
+            // Deny-set: O(1), default-allow. The gateway never learns *why*, only the reason code.
+            if let Some(reason) = self.state.deny.load().reason(identity.tenant_id) {
+                let label = match reason {
+                    crate::deny::DenyReason::Spend => "deny_spend",
+                    _ => "deny_fraud",
+                };
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&[label])
+                    .inc();
+                return Self::reject(
+                    session,
+                    reason.http_status(),
+                    "access_denied",
+                    "tenant is over limit or suspended",
+                )
+                .await;
+            }
+            // The actual `Bearer …`/`x-api-key` value is precomputed in the provider registry and
+            // applied in `upstream_request_filter`; here we only confirm a pool key exists.
+            if provider.pool_auth_value.is_none() {
+                return Self::reject(session, 503, "api_error", "no provider key available").await;
+            }
+            (identity.tenant_id, identity.vpc_id, true)
+        } else {
+            (0, 0, false)
+        };
+
+        // 5. Per-key rate guardrail (see `ratelimit`): caps a single key's request velocity. Keyed by
+        // tenant for managed traffic, by a hash of the BYO token otherwise. Computed into an owned
+        // key so the `raw_key` borrow of `session` ends before the `&mut session` reject below.
+        if let Some(rl) = &self.state.rate_limit {
+            let key = if managed {
+                RlKey::Tenant(tenant_id)
+            } else {
+                RlKey::byo(raw_key)
+            };
+            if !rl.check(&key) {
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&["rate_limit"])
+                    .inc();
+                return Self::reject(session, 429, "rate_limit_error", "rate limit exceeded").await;
+            }
+        }
+
+        *ctx = Some(RequestCtx {
+            tenant_id,
+            vpc_id,
+            dialect,
+            provider,
+            managed,
+            model: String::new(),
+            model_scanner: peek::ModelScanner::new(),
+            streaming: false,
+            resp_tail: Vec::with_capacity(USAGE_TAIL_CAP),
+            body_bytes_fed: 0,
+            start,
+            attempt: 0,
+        });
+        Ok(false)
+    }
+
+    async fn upstream_peer(
+        &self,
+        _session: &mut Session,
+        ctx: &mut Self::CTX,
+    ) -> Result<Box<HttpPeer>> {
+        // `request_filter` sets the context for every admitted request, so a missing context means
+        // an unadmitted request reached this phase (a Pingora ordering change, or a future
+        // refactor). Report that as an error on this one request; do not panic and take the whole
+        // worker down.
+        let rc = ctx.as_ref().ok_or_else(|| {
+            pingora_core::Error::new_str("upstream_peer reached without request context")
+        })?;
+
+        // The address comes from the TTL cache (async, non-blocking) instead of `HttpPeer::new`'s
+        // eager blocking `getaddrinfo`. The resolver's own error is discarded; only this fixed
+        // message is surfaced.
+        let addr = self
+            .state
+            .resolve(&rc.provider.authority)
+            .await
+            .map_err(|_| pingora_core::Error::new_str("upstream dns resolution failed"))?;
+
+        // SNI/Host = the configured host. TLS is on for real providers; the e2e harness flips
+        // `upstream_tls=false` to talk to a plaintext mock.
+        let cfg = &self.state.config;
+        let sni = rc.provider.host.clone();
+        let mut peer = HttpPeer::new(addr, cfg.upstream_tls, sni);
+
+        // The connect/read/write/idle timeouts are all configured in whole seconds; this helper
+        // wraps one of them in the `Option<Duration>` the peer options take.
+        let secs = |n| Some(Duration::from_secs(n));
+        peer.options.connection_timeout = secs(cfg.connect_timeout_secs);
+        peer.options.read_timeout = secs(cfg.read_timeout_secs);
+        peer.options.write_timeout = secs(cfg.write_timeout_secs);
+        peer.options.idle_timeout = secs(cfg.idle_timeout_secs);
+
+        Ok(Box::new(peer))
+    }
+
+    async fn upstream_request_filter(
+        &self,
+        _session: &mut Session,
+        upstream_request: &mut pingora::http::RequestHeader,
+        ctx: &mut Self::CTX,
+    ) -> Result<()> {
+        let Some(rc) = ctx.as_ref() else {
+            return Ok(());
+        };
+
+        // Managed: drop *both* inbound auth headers unconditionally — even if this provider has no
+        // pool key — so the virtual key can never leak upstream; then set the real pool key
+        // (precomputed at boot) in the upstream's scheme. BYO (`!managed`): auth left as presented.
+        if rc.managed {
+            upstream_request.remove_header("authorization");
+            upstream_request.remove_header("x-api-key");
+            if let Some(av) = &rc.provider.pool_auth_value {
+                upstream_request.insert_header(rc.provider.auth.header(), av.expose())?;
+            }
+        }
+
+        // Point Host at the upstream. The body passes through untouched, so the client's original
+        // framing (Content-Length / chunked) is preserved — true passthrough.
+        upstream_request.insert_header("host", rc.provider.host.as_str())?;
+
+        // Rewrite the path to the provider's mount point when it isn't `/v1` (e.g. Groq serves under
+        // `/openai/v1`); `upstream_path` returns `None` when no rewrite is needed. The query string
+        // is kept. An unparseable rewrite fails the request instead of being skipped silently.
+        if let Some(new_path) = rc.provider.upstream_path(upstream_request.uri.path()) {
+            let pq = match upstream_request.uri.query() {
+                Some(q) => format!("{new_path}?{q}"),
+                None => new_path,
+            };
+            let Ok(uri) = pq.parse() else {
+                return Err(pingora_core::Error::new_str("upstream path rewrite failed"));
+            };
+            upstream_request.set_uri(uri);
+        }
+        Ok(())
+    }
+
+    async fn request_body_filter(
+        &self,
+        _session: &mut Session,
+        body: &mut Option<Bytes>,
+        end_of_stream: bool,
+        ctx: &mut Self::CTX,
+    ) -> Result<()> {
+        let rc = match ctx.as_mut() {
+            Some(rc) => rc,
+            None => return Ok(()),
+        };
+        // Tap each chunk as it passes (never withheld, never buffered); body framing is untouched.
+        if let Some(chunk) = body.as_ref() {
+            // The up-front `Content-Length` check in `request_filter` can't see a chunked body (no
+            // declared length), so the cap is enforced on the *streamed* size too: keep a running
+            // byte count — nothing is buffered — and abort the proxied request once it crosses the
+            // cap. Aborting (vs. a clean 413) is acceptable here: headers are already away to the
+            // upstream, and this is an abuse guard, not a normal client path.
+            let fed = rc.body_bytes_fed.saturating_add(chunk.len());
+            rc.body_bytes_fed = fed;
+            if fed > MAX_REQUEST_BODY {
+                let rejections = &self.state.metrics.rejections_total;
+                rejections.with_label_values(&["body_too_large"]).inc();
+                return Err(pingora_core::Error::new_str("request body exceeds limit"));
+            }
+            // The structural scanner extracts the exact root-level `model` from what it is fed.
+            rc.model_scanner.feed(chunk);
+        }
+        // The scanner's result is collected once, at end of stream, if no model is set yet.
+        if end_of_stream && rc.model.is_empty() {
+            if let Some(found) = rc.model_scanner.take_model() {
+                rc.model = found;
+            }
+        }
+        Ok(())
+    }
+
+    async fn response_filter(
+        &self,
+        _session: &mut Session,
+        upstream_response: &mut ResponseHeader,
+        ctx: &mut Self::CTX,
+    ) -> Result<()> {
+        if let Some(rc) = ctx.as_mut() {
+            // Headers arrived ≈ time-to-first-byte.
+            let ttft = rc.start.elapsed().as_secs_f64();
+            self.state.metrics.ttft_seconds.observe(ttft);
+            // Derive streaming from the response, not the request: SSE ⇒ use the streaming usage
+            // parser; otherwise the body is a single JSON object. Media types are
+            // case-insensitive (RFC 9110 §8.3.1), so compare in lowercase — a `Text/Event-Stream`
+            // response must not be metered as a plain JSON body.
+            rc.streaming = upstream_response
+                .headers
+                .get("content-type")
+                .and_then(|v| v.to_str().ok())
+                .is_some_and(|ct| ct.to_ascii_lowercase().contains("event-stream"));
+        }
+        Ok(())
+    }
+
+    fn response_body_filter(
+        &self,
+        _session: &mut Session,
+        body: &mut Option<Bytes>,
+        _end_of_stream: bool,
+        ctx: &mut Self::CTX,
+    ) -> Result<Option<Duration>>
+    where
+        Self::CTX: Send + Sync,
+    {
+        // Passive tap: every chunk is copied into a bounded tail for usage parsing and then passes
+        // straight through — nothing is withheld, so the relay adds no buffering.
+        let (Some(rc), Some(chunk)) = (ctx.as_mut(), body.as_ref()) else {
+            return Ok(None);
+        };
+        let tail = &mut rc.resp_tail;
+        tail.extend_from_slice(chunk);
+        // Let the tail reach 2× the cap, then compact with one `copy_within` that keeps the last
+        // cap bytes. Memory stays bounded, yet bytes are moved once per ~cap of stream instead of
+        // once per chunk — which matters for a long stream of small chunks.
+        if tail.len() > 2 * USAGE_TAIL_CAP {
+            let cut = tail.len() - USAGE_TAIL_CAP;
+            tail.copy_within(cut.., 0);
+            tail.truncate(USAGE_TAIL_CAP);
+        }
+        Ok(None)
+    }
+
+    fn fail_to_connect(
+        &self,
+        _session: &mut Session,
+        _peer: &HttpPeer,
+        ctx: &mut Self::CTX,
+        mut e: Box<pingora_core::Error>,
+    ) -> Box<pingora_core::Error> {
+        // Transient connect failures are retried a bounded number of times: marking the error
+        // retryable makes Pingora re-invoke `upstream_peer`. Past the budget it is returned as is.
+        let retryable = ctx.as_mut().filter(|rc| rc.attempt < MAX_CONNECT_RETRIES);
+        if let Some(rc) = retryable {
+            rc.attempt += 1;
+            e.set_retry(true);
+        }
+        e
+    }
+
+    async fn logging(
+        &self,
+        _session: &mut Session,
+        _e: Option<&pingora_core::Error>,
+        ctx: &mut Self::CTX,
+    ) {
+        let rc = match ctx.as_ref() {
+            Some(rc) => rc,
+            None => return,
+        };
+
+        // `resp_tail` can sit at up to 2× the cap between compactions; the usage event is always
+        // within the last cap bytes, so only that bounded suffix is handed to the parser.
+        let skip = rc.resp_tail.len().saturating_sub(USAGE_TAIL_CAP);
+        let tail = &rc.resp_tail[skip..];
+
+        // The parser depends on dialect + streaming; no parseable usage ⇒ `Default` counts.
+        let usage = match (rc.dialect, rc.streaming) {
+            (Dialect::OpenAI, false) => usage::openai_body(tail),
+            (Dialect::OpenAI, true) => usage::openai_stream(tail),
+            (Dialect::Anthropic, false) => usage::anthropic_body(tail),
+            (Dialect::Anthropic, true) => usage::anthropic_stream(tail),
+        }
+        .unwrap_or_default();
+
+        let metrics = &self.state.metrics;
+        for (kind, n) in [("input", usage.input_tokens), ("output", usage.output_tokens)] {
+            metrics.tokens_total.with_label_values(&[kind]).inc_by(n);
+        }
+        let latency = rc.start.elapsed().as_secs_f64();
+        metrics.upstream_latency_seconds.observe(latency);
+
+        // The usage *fact* goes out on its own target: logfwd/OTLP ships `ai.usage` → ClickHouse.
+        // Only token counts are emitted; pricing is a closed downstream consumer.
+        info!(
+            target: "ai.usage",
+            tenant_id = rc.tenant_id,
+            vpc_id = rc.vpc_id,
+            provider = rc.provider.name.as_str(),
+            model = %rc.model,
+            stream = rc.streaming,
+            input_tokens = usage.input_tokens,
+            output_tokens = usage.output_tokens,
+            cache_read_tokens = usage.cache_read_tokens,
+            cache_write_tokens = usage.cache_write_tokens,
+            latency_ms = rc.start.elapsed().as_millis() as u64,
+            "usage"
+        );
+    }
+}
diff --git a/crates/gateway/src/ratelimit.rs b/crates/gateway/src/ratelimit.rs
new file mode 100644
index 0000000..4365cbe
--- /dev/null
+++ b/crates/gateway/src/ratelimit.rs
@@ -0,0 +1,111 @@
+//! Per-key request-rate guardrail — a blast-radius circuit breaker, **not** a spend control.
+//!
+//! The deny-set (see `deny`) is the spend/fraud authority, but it's *cumulative* and reacts on a
+//! lag: it only learns of spend after usage facts round-trip through the control plane, and it's
+//! structurally blind to request floods that never bill — auth failures (rejected here, never reach
+//! upstream), provider 4xx, and BYO traffic (on the caller's own key, no Beyond identity). This caps
+//! the *velocity* a single key can drive, which bounds two things the deny-set can't: (1) spend from
+//! a leaked/runaway managed key during the deny-set's reaction lag, and (2) the gateway-resource cost
+//! (verifies, sockets, upstream connections) of a failure flood — the classic internal-service
+//! incident: a buggy client in a retry storm.
+//!
+//! It is deliberately generous: a ceiling well above any legitimate single-tenant steady state, so
+//! it never trips in normal operation. Tune it from `ai_rejections_total{reason="rate_limit"}`.
+//!
+//! Backed by pingora-limits' `Rate`: a count-min-sketch estimator with **fixed memory regardless of
+//! key cardinality** (no per-tenant entry, no background GC), matching the deny-set's O(denied)
+//! ethos. The sketch can *over*estimate a key's rate on hash collision but never under, so the cap
+//! is always enforced; `SLOTS` is sized wide enough that overestimation stays negligible at our
+//! active-key counts.
+
+use pingora_limits::rate::Rate;
+use std::hash::Hash;
+use std::time::Duration;
+
+/// Count-min sketch dimensions. Wider than `Rate::new`'s 1024-slot default because our key
+/// cardinality (active tenants + BYO callers within a 1s window) is high; more slots keeps
+/// collision-driven overestimation negligible. ~8192·4·2 atomic counters — a few hundred KB, fixed.
+const SLOTS: usize = 8192;
+const HASHES: usize = 4;
+
+/// The rate window. The ceiling is expressed per this interval, i.e. requests/second.
+const WINDOW: Duration = Duration::from_secs(1);
+
+/// The identity a request is charged against. Managed traffic is bucketed per tenant, so a runaway
+/// tenant cannot throttle its neighbours; BYO traffic carries no Beyond identity, so it is bucketed
+/// by a hash of the caller's own token. Both live in one key space — the enum discriminant is part
+/// of the hash, so a `tenant_id` never collides with a BYO digest that happens to share its value.
+#[derive(Hash)]
+pub enum RlKey {
+    Tenant(u64),
+    Byo(u64),
+}
+
+impl RlKey {
+    /// Build the key for a BYO request from its raw token (BYO has no tenant identity). Only the
+    /// 64-bit digest is kept, never the token; the sketch hashes that digest again into its slots.
+    pub fn byo(raw_token: &str) -> Self {
+        use std::{collections::hash_map::DefaultHasher, hash::Hasher};
+        let mut digest = DefaultHasher::new();
+        Hash::hash(raw_token, &mut digest);
+        Self::Byo(digest.finish())
+    }
+}
+
+pub struct RateLimit {
+    rate: Rate,
+    /// Max requests per `WINDOW` for a single key before we start rejecting.
+    max_per_window: isize,
+}
+
+impl RateLimit {
+    /// `rps` is the per-key requests/second ceiling. `rps == 0` disables the limiter (`None`).
+    /// A ceiling too large for `isize` saturates at `isize::MAX` instead of wrapping.
+    pub fn new(rps: u32) -> Option<Self> {
+        // `u32` → `isize` via `as` can wrap negative on 16/32-bit targets; saturate instead.
+        let max_per_window = isize::try_from(rps).unwrap_or(isize::MAX);
+        (rps != 0).then(|| Self {
+            rate: Rate::new_with_estimator_config(WINDOW, HASHES, SLOTS),
+            max_per_window,
+        })
+    }
+
+    /// Charge one request to `key`. Returns `true` while the key is within budget and `false` once
+    /// it has exceeded its ceiling in the current window. `observe` counts the event and returns
+    /// the running total for the window, so the request that crosses the line is itself rejected.
+    pub fn check(&self, key: &RlKey) -> bool {
+        self.rate.observe(key, 1) <= self.max_per_window
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn zero_rps_disables() {
+        assert!(RateLimit::new(0).is_none());
+    }
+
+    #[test]
+    fn allows_up_to_ceiling_then_rejects() {
+        let rl = RateLimit::new(5).unwrap();
+        let k = RlKey::Tenant(1);
+        for _ in 0..5 {
+            assert!(rl.check(&k));
+        }
+        // 6th request crosses the ceiling (assumes all six land in the same 1s window).
+        assert!(!rl.check(&k));
+    }
+
+    #[test]
+    fn keys_have_independent_budgets() {
+        let rl = RateLimit::new(2).unwrap();
+        assert!(rl.check(&RlKey::Tenant(1)));
+        assert!(rl.check(&RlKey::Tenant(1)));
+        assert!(!rl.check(&RlKey::Tenant(1))); // tenant 1 exhausted
+        assert!(rl.check(&RlKey::Tenant(2))); // a different tenant is unaffected
+        // Same numeric value, different variant ⇒ different key (discriminant disambiguates).
+        assert!(rl.check(&RlKey::Byo(1)));
+    }
+}
diff --git a/crates/gateway/src/route.rs b/crates/gateway/src/route.rs
new file mode 100644
index 0000000..9f69e07
--- /dev/null
+++ b/crates/gateway/src/route.rs
@@ -0,0 +1,308 @@
+//! Provider routing and per-provider wire details — **data-driven**.
+//!
+//! Passthrough-first: the ingress *dialect* (which API surface the client called) picks the default
+//! provider; an `x-beyond-provider: <name>` header selects any registered provider by name. A
+//! provider is a *row* in [`KNOWN_PROVIDERS`] (name, upstream authority, base path, auth scheme) —
+//! adding an OpenAI-wire provider (Groq, DeepSeek, Together, …) is one line there, no new code
+//! paths, no enum, no match arms. Operators can also add/override providers from config (see
+//! `state`/`config`). We do not translate between dialects — that's deliberately out of scope.
+
+use crate::secret::Secret;
+
+/// The path prefix client SDKs use (OpenAI + Anthropic both mount their API under `/v1`). A provider
+/// whose `base_path` differs has this leading segment rewritten to its prefix (see
+/// [`Provider::upstream_path`]).
+pub const CLIENT_PREFIX: &str = "/v1";
+
+/// Which API surface the client called. Drives usage parsing and the default provider.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Dialect {
+    OpenAI,
+    Anthropic,
+}
+
+/// The header shape an upstream wants its API key in: `Authorization: Bearer <key>` for
+/// OpenAI-wire providers, a bare `x-api-key` for Anthropic. For managed traffic the gateway puts
+/// the real pool key into whichever of the two the upstream expects (see `proxy`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AuthScheme {
+    Bearer,
+    XApiKey,
+}
+
+impl AuthScheme {
+    /// Name of the request header that carries the key for this scheme.
+    pub fn header(self) -> &'static str {
+        match self {
+            Self::XApiKey => "x-api-key",
+            Self::Bearer => "authorization",
+        }
+    }
+
+    /// Render `key` as the value of [`Self::header`] for this scheme.
+    pub fn format(self, key: &str) -> String {
+        match self {
+            Self::XApiKey => key.to_owned(),
+            Self::Bearer => ["Bearer ", key].concat(),
+        }
+    }
+}
+
+/// Static wire facts for a known provider. Adding a provider = one row in [`KNOWN_PROVIDERS`].
+pub struct ProviderSpec {
+    pub name: &'static str,
+    /// Default upstream `host:port` (TLS:443). Overridable per-provider via config.
+    pub authority: &'static str,
+    /// Where the provider mounts the OpenAI-wire surface. The client always calls `/v1/…` (its
+    /// SDK's fixed prefix); for a provider whose `base_path != "/v1"` the gateway rewrites that
+    /// leading segment (e.g. Groq serves under `/openai/v1`, so `/v1/chat/completions` →
+    /// `/openai/v1/chat/completions`). Most providers mount at `/v1`, so this is usually `"/v1"`.
+    pub base_path: &'static str,
+    pub auth: AuthScheme,
+}
+
+/// The providers the gateway knows out of the box. All but Anthropic speak the OpenAI wire format
+/// (Bearer auth, chat/completions + embeddings); they differ by authority and where they mount that
+/// surface (`base_path`) — a new one is a single row here, then reachable via `x-beyond-provider:
+/// <name>`. (Config can add further OpenAI-wire providers or override any authority — see
+/// `state::build_providers`.)
+///
+/// `base_path` values are deliberate, not cosmetic: Groq/OpenRouter/Fireworks do **not** mount at
+/// `/v1`, so a verbatim path passthrough would 404 against the real provider.
+///
+/// Every row's `authority`/`base_path`/`auth` was verified against the provider's **official** docs
+/// (cited inline) as of 2026-05; the citation is the source of truth if a provider later moves an
+/// endpoint. These are static facts, so doc-verification (not a live call) is the right proof.
+pub const KNOWN_PROVIDERS: &[ProviderSpec] = &[
+    // docs: https://platform.openai.com/docs/api-reference/authentication — base https://api.openai.com/v1, Bearer.
+    ProviderSpec {
+        name: "openai",
+        authority: "api.openai.com:443",
+        base_path: "/v1",
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.claude.com/en/api/messages — base https://api.anthropic.com, Messages at /v1/messages,
+    // auth is `x-api-key` (NOT Bearer). The required `anthropic-version` header is the client's; we pass it through.
+    ProviderSpec {
+        name: "anthropic",
+        authority: "api.anthropic.com:443",
+        base_path: "/v1",
+        auth: AuthScheme::XApiKey,
+    },
+    // docs: https://openrouter.ai/docs/quickstart — base https://openrouter.ai/api/v1 (note `/api/v1`, not `/v1`), Bearer.
+    ProviderSpec {
+        name: "openrouter",
+        authority: "openrouter.ai:443",
+        base_path: "/api/v1",
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.fireworks.ai/tools-sdks/openai-compatibility — base https://api.fireworks.ai/inference/v1, Bearer.
+    ProviderSpec {
+        name: "fireworks",
+        authority: "api.fireworks.ai:443",
+        base_path: "/inference/v1",
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://console.groq.com/docs/openai — base https://api.groq.com/openai/v1 (note `/openai/v1`), Bearer.
+    ProviderSpec {
+        name: "groq",
+        authority: "api.groq.com:443",
+        base_path: "/openai/v1",
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://api-docs.deepseek.com/ — base https://api.deepseek.com/v1 (the `/v1` is an OpenAI-compat alias,
+    // not API versioning); /v1/chat/completions is officially supported. Bearer.
+    ProviderSpec {
+        name: "deepseek",
+        authority: "api.deepseek.com:443",
+        base_path: "/v1",
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.together.ai/docs/openai-api-compatibility — base https://api.together.ai/v1, Bearer.
+    // Canonical host is `api.together.ai`; the legacy `api.together.xyz` is still live but no longer documented.
+    ProviderSpec {
+        name: "together",
+        authority: "api.together.ai:443",
+        base_path: "/v1",
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://inference-docs.cerebras.ai/resources/openai — base https://api.cerebras.ai/v1, Bearer.
+    ProviderSpec {
+        name: "cerebras",
+        authority: "api.cerebras.ai:443",
+        base_path: "/v1",
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.mistral.ai/api/ — base https://api.mistral.ai/v1, Bearer.
+    ProviderSpec {
+        name: "mistral",
+        authority: "api.mistral.ai:443",
+        base_path: "/v1",
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.x.ai/docs/api-reference — base https://api.x.ai/v1, Bearer. Reasoning models are slow:
+    // the generous read/idle timeouts (see `config`) matter here.
+    ProviderSpec {
+        name: "xai",
+        authority: "api.x.ai:443",
+        base_path: "/v1",
+        auth: AuthScheme::Bearer,
+    },
+];
+
+/// The default provider name for a dialect, used when no `x-beyond-provider` override is given.
+/// (Model-based auto-routing isn't possible — the body isn't read before peer selection — so the
+/// long tail is reached explicitly via the header.)
+pub fn dialect_default(d: Dialect) -> &'static str {
+    match d {
+        Dialect::OpenAI => "openai",
+        Dialect::Anthropic => "anthropic",
+    }
+}
+
+/// A *resolved* provider: static wire facts + the boot-resolved upstream authority/host + (for
+/// managed traffic) the precomputed pool auth header value. Built once at boot (see
+/// `state::build_providers`); the request hot path holds an `Arc<Provider>` (cheap clone) and
+/// borrows these fields, so nothing is re-allocated or re-formatted per request.
+pub struct Provider {
+    pub name: String,
+    /// Upstream `host:port`.
+    pub authority: String,
+    /// Bare upstream host (SNI / `Host` header) = authority without the port.
+    pub host: String,
+    /// Where the provider mounts the OpenAI-wire surface (see [`ProviderSpec::base_path`]).
+    pub base_path: String,
+    pub auth: AuthScheme,
+    /// Precomputed managed auth header value (`Bearer <key>` / bare key). `None` ⇒ no pool key is
+    /// configured for this provider ⇒ managed requests to it are rejected (503). Kept in `Secret`
+    /// for the redacting-`Debug` + zeroize-on-drop hygiene of the underlying key.
+    pub pool_auth_value: Option<Secret>,
+}
+
+impl Provider {
+    /// Resolve a provider from its name, upstream authority, base path, auth scheme, and (optional)
+    /// pool key. Derives the bare host and precomputes the managed auth header value once.
+    pub fn resolve(
+        name: &str,
+        authority: String,
+        base_path: &str,
+        auth: AuthScheme,
+        pool_key: Option<&str>,
+    ) -> Self {
+        // A bracketed IPv6 literal (`[::1]:8443`) has `:` inside the host, so it is taken whole,
+        // brackets included, up to its `]`; any other authority is cut at its first `:`.
+        let host = match authority.strip_prefix('[').and_then(|rest| rest.find(']')) {
+            Some(end) => &authority[..end + 2],
+            None => authority.split(':').next().unwrap_or(&authority),
+        }
+        .to_string();
+        let pool_auth_value = pool_key.map(|k| Secret::new(auth.format(k)));
+        Provider {
+            name: name.to_string(),
+            authority,
+            host,
+            base_path: base_path.to_string(),
+            auth,
+            pool_auth_value,
+        }
+    }
+
+    /// Map a client request path to this provider's upstream path: the SDK's fixed `/v1` prefix
+    /// is replaced by `base_path`. Returns `None` when no rewrite is needed (`base_path` is `/v1`,
+    /// or the path has no leading `/v1` segment), so the hot path skips reallocating the URI.
+    pub fn upstream_path(&self, client_path: &str) -> Option<String> {
+        if self.base_path == CLIENT_PREFIX {
+            return None;
+        }
+        // Remap only an exact `/v1` segment (then `/` or end) — never a prefix like `/v1beta`.
+        let rest = client_path.strip_prefix(CLIENT_PREFIX)?;
+        if !rest.is_empty() && !rest.starts_with('/') {
+            return None;
+        }
+        Some(format!("{}{}", self.base_path, rest))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn dialect_defaults() {
+        assert_eq!(dialect_default(Dialect::OpenAI), "openai");
+        assert_eq!(dialect_default(Dialect::Anthropic), "anthropic");
+    }
+
+    #[test]
+    fn known_provider_names_are_unique() {
+        let mut names: Vec<_> = KNOWN_PROVIDERS.iter().map(|p| p.name).collect();
+        names.sort_unstable();
+        let before = names.len();
+        names.dedup();
+        assert_eq!(
+            before,
+            names.len(),
+            "duplicate provider name in KNOWN_PROVIDERS"
+        );
+    }
+
+    #[test]
+    fn auth_scheme_formats_and_headers() {
+        assert_eq!(AuthScheme::Bearer.header(), "authorization");
+        assert_eq!(AuthScheme::XApiKey.header(), "x-api-key");
+        assert_eq!(AuthScheme::Bearer.format("k"), "Bearer k");
+        // Anthropic wants the bare key (no `Bearer`). Getting this wrong → upstream 401.
+        assert_eq!(AuthScheme::XApiKey.format("k"), "k");
+    }
+
+    #[test]
+    fn resolve_derives_host_and_pool_auth() {
+        let p = Provider::resolve(
+            "openai",
+            "api.openai.com:443".to_string(),
+            "/v1",
+            AuthScheme::Bearer,
+            Some("sk-x"),
+        );
+        assert_eq!(p.host, "api.openai.com");
+        assert_eq!(p.pool_auth_value.as_ref().unwrap().expose(), "Bearer sk-x");
+
+        // No pool key ⇒ no managed auth value (managed requests to it would 503).
+        let a = Provider::resolve(
+            "anthropic",
+            "api.anthropic.com:443".to_string(),
+            "/v1",
+            AuthScheme::XApiKey,
+            None,
+        );
+        assert!(a.pool_auth_value.is_none());
+    }
+
+    #[test]
+    fn upstream_path_rewrites_only_non_v1_bases() {
+        let v1 = Provider::resolve("openai", "h:443".into(), "/v1", AuthScheme::Bearer, None);
+        // `/v1` provider: no rewrite (None) — the hot path passes the client path through verbatim.
+        assert_eq!(v1.upstream_path("/v1/chat/completions"), None);
+
+        let groq = Provider::resolve(
+            "groq",
+            "h:443".into(),
+            "/openai/v1",
+            AuthScheme::Bearer,
+            None,
+        );
+        assert_eq!(
+            groq.upstream_path("/v1/chat/completions").as_deref(),
+            Some("/openai/v1/chat/completions")
+        );
+        // A second endpoint (embeddings) under the remapped base, and the bare prefix on its own.
+        assert_eq!(
+            groq.upstream_path("/v1/embeddings").as_deref(),
+            Some("/openai/v1/embeddings")
+        );
+        assert_eq!(groq.upstream_path("/v1").as_deref(), Some("/openai/v1"));
+        // A non-`/v1` path (e.g. a health probe) is left alone, as is a `/v1beta`-style false match.
+        assert_eq!(groq.upstream_path("/healthz"), None);
+        assert_eq!(groq.upstream_path("/v1beta/models"), None);
+    }
+}
diff --git a/crates/gateway/src/secret.rs b/crates/gateway/src/secret.rs
new file mode 100644
index 0000000..5fcd8d8
--- /dev/null
+++ b/crates/gateway/src/secret.rs
@@ -0,0 +1,77 @@
+//! A string secret that won't leak into logs and is scrubbed on drop.
+//!
+//! Hygiene, not a hard control: provider keys are necessarily long-lived in RAM (held for the
+//! process life, copied into Pingora's request headers we don't own), so zeroize-on-drop only
+//! helps at rotation/shutdown. The real protections are SSM-at-rest + never logging + rotation.
+//! What this newtype buys: a redacting `Debug` (so a stray `{:?}` or `tracing` field can't print a
+//! key) and a best-effort scrub when the value is dropped.
+
+use std::fmt;
+use zeroize::Zeroize;
+
+/// Owned secret string: redacted by `Debug`/`Serialize`, zeroized when dropped.
+#[derive(Clone)]
+pub struct Secret(String);
+
+impl Secret {
+    pub fn new(s: impl Into<String>) -> Self {
+        Secret(s.into())
+    }
+
+    /// The plaintext, borrowed. Keep the exposure at each call site as narrow as possible.
+    pub fn expose(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl From<String> for Secret {
+    fn from(plain: String) -> Self {
+        Secret::new(plain)
+    }
+}
+
+impl fmt::Debug for Secret {
+    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(out, "Secret(***)")
+    }
+}
+
+// Transparent deserialize from a plain string, so config (`AI_POOL_KEY_*`, `nats_creds`) can load
+// a secret straight into `Option<Secret>`.
+impl<'de> serde::Deserialize<'de> for Secret {
+    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+        String::deserialize(d).map(Secret)
+    }
+}
+
+// Serialize **redacting** — same threat model as `Debug`: a stray `serde_json::to_string(&config)`
+// must not leak the key. Sound for our only serialize path (figment's `Serialized::defaults`,
+// where every secret field defaults to `None` and is skipped); read the plaintext via `expose`.
+impl serde::Serialize for Secret {
+    fn serialize<S: serde::Serializer>(&self, ser: S) -> Result<S::Ok, S::Error> {
+        ser.serialize_str("***")
+    }
+}
+
+impl Drop for Secret {
+    fn drop(&mut self) {
+        Zeroize::zeroize(&mut self.0);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn debug_redacts() {
+        let s = Secret::new("sk-supersecret");
+        assert_eq!(format!("{s:?}"), "Secret(***)");
+        assert!(!format!("{s:?}").contains("supersecret"));
+    }
+
+    #[test]
+    fn expose_returns_plaintext() {
+        assert_eq!(Secret::new("abc").expose(), "abc");
+    }
+}
diff --git a/crates/gateway/src/state.rs b/crates/gateway/src/state.rs
new file mode 100644
index 0000000..19d49ee
--- /dev/null
+++ b/crates/gateway/src/state.rs
@@ -0,0 +1,191 @@
+//! Shared gateway state.
+//!
+//! Only the **deny-set** is dynamic (watched from NATS, behind `ArcSwap` for lock-free reads).
+//! Everything else — the signing keyring and the resolved provider registry (upstreams + pool auth
+//! values) — is built once at boot from config (SSM/env), so the auth + key paths have **no runtime
+//! dependency on NATS**.
+
+use crate::config::AiConfig;
+use crate::deny::DenySet;
+use crate::error::{GatewayError, Result};
+use crate::key::Keyring;
+use crate::metrics::Metrics;
+use crate::ratelimit::RateLimit;
+use crate::route::{self, AuthScheme, Provider};
+use arc_swap::ArcSwap;
+use std::collections::HashMap;
+use std::net::SocketAddr;
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+/// Maximum age of a cached upstream address; `resolve` looks it up again once an entry is older.
+const DNS_TTL: Duration = Duration::from_secs(60);
+
+/// Assemble the boot-time provider registry from the static known set plus config.
+///
+/// Each entry of `route::KNOWN_PROVIDERS` is registered, its authority replaced by the same-named
+/// `provider_authorities` entry when one exists; every other `provider_authorities` name is added
+/// as a config-only provider. A provider's pool key (if any) is found by name in `pool_keys`.
+fn build_providers(config: &AiConfig) -> HashMap<String, Arc<Provider>> {
+    let mut registry = HashMap::new();
+    // Pass 1: the built-in providers, each with its own base path and auth scheme.
+    for spec in route::KNOWN_PROVIDERS {
+        // An explicit authority override wins over the built-in default.
+        let authority = match config.provider_authorities.get(spec.name) {
+            Some(overridden) => overridden.clone(),
+            None => spec.authority.to_string(),
+        };
+        let pool_key = config.pool_keys.get(spec.name).map(|key| key.expose());
+        let provider = Arc::new(Provider::resolve(
+            spec.name,
+            authority,
+            spec.base_path,
+            spec.auth,
+            pool_key,
+        ));
+        registry.insert(spec.name.to_string(), provider);
+    }
+    // Pass 2: names outside the known set. They are taken to speak the OpenAI wire format (Bearer
+    // auth under the client prefix); any other wire format needs real code, not a config entry.
+    for (name, authority) in &config.provider_authorities {
+        // Known names were already registered (with this override applied) in pass 1.
+        if registry.contains_key(name) {
+            continue;
+        }
+        let pool_key = config.pool_keys.get(name).map(|key| key.expose());
+        let provider = Arc::new(Provider::resolve(
+            name,
+            authority.clone(),
+            route::CLIENT_PREFIX,
+            AuthScheme::Bearer,
+            pool_key,
+        ));
+        registry.insert(name.clone(), provider);
+    }
+    registry
+}
+
+pub struct GatewayState {
+    pub config: AiConfig, // the full gateway config this state was built from
+    pub metrics: Arc<Metrics>, // metrics handle supplied by the caller of `new`
+
+    /// Trusted Ed25519 public keys by kid — from config (rotate via redeploy). Static for life.
+    pub keyring: Keyring,
+    /// Resolved providers by name (upstream authority/host + precomputed managed auth value). Built
+    /// once at boot from `route::KNOWN_PROVIDERS` + config; the request path clones the `Arc`.
+    providers: HashMap<String, Arc<Provider>>,
+
+    /// Sparse deny-set — the ONE thing watched from NATS. Default-allow on miss; fail-open.
+    pub deny: ArcSwap<DenySet>,
+
+    /// Per-key request-rate guardrail (see `ratelimit`). `None` when `rate_limit_rps == 0`. Fixed
+    /// memory regardless of tenant count, so it lives in the static state with no GC.
+    pub rate_limit: Option<RateLimit>,
+
+    /// TTL cache of resolved upstream addresses, keyed by authority, so `upstream_peer` neither
+    /// blocks on a synchronous `getaddrinfo` nor re-resolves the same host on every request.
+    dns_cache: Mutex<HashMap<String, (SocketAddr, Instant)>>,
+}
+
+impl GatewayState {
+    /// Build the shared state once at boot. The only fallible step is constructing the keyring.
+    pub fn new(config: AiConfig, metrics: Arc<Metrics>) -> Result<Arc<Self>> {
+        let state = Self {
+            keyring: config.build_keyring()?,
+            providers: build_providers(&config),
+            rate_limit: RateLimit::new(config.rate_limit_rps),
+            deny: ArcSwap::from_pointee(DenySet::new()),
+            dns_cache: Mutex::new(HashMap::new()),
+            metrics,
+            config,
+        };
+        Ok(Arc::new(state))
+    }
+
+    /// Look up a registered provider by `name` (the `x-beyond-provider` value or the dialect
+    /// default). Returns `None` when nothing is registered under that name.
+    pub fn provider(&self, name: &str) -> Option<&Arc<Provider>> {
+        self.providers.get(name)
+    }
+
+    /// Resolve a `host:port` authority to a `SocketAddr`, reusing a cached answer younger than
+    /// `DNS_TTL`. Lookups go through `tokio::net::lookup_host` (runs `getaddrinfo` on the blocking
+    /// pool — async-safe) rather than `HttpPeer::new`'s eager blocking resolve.
+    pub async fn resolve(&self, authority: &str) -> Result<SocketAddr> {
+        // The cache read sits in its own block so the `std::sync::Mutex` guard is dropped before
+        // the `.await` below — a `std` guard is not `Send` and must never be held across an await.
+        // Two concurrent misses may both resolve; that is harmless (same answer, last writer
+        // wins) and not worth holding a lock across DNS.
+        let cached = {
+            // A poisoned lock (a prior holder panicked) is recovered rather than propagated: the
+            // map only holds transient `SocketAddr` entries, so it stays safe to use, and one
+            // panic must not wedge every later DNS lookup.
+            let cache = self.dns_cache.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
+            cache
+                .get(authority)
+                .filter(|(_, resolved_at)| resolved_at.elapsed() < DNS_TTL)
+                .map(|(addr, _)| *addr)
+        };
+        if let Some(addr) = cached {
+            return Ok(addr);
+        }
+        let mut candidates = tokio::net::lookup_host(authority)
+            .await
+            .map_err(|e| GatewayError::Dns(format!("{authority}: {e}")))?;
+        // Only the first address the resolver returns is used and cached.
+        let addr = candidates
+            .next()
+            .ok_or_else(|| GatewayError::Dns(format!("{authority}: no addresses")))?;
+        // Re-lock only now, after the await, to record the fresh answer and its timestamp.
+        let mut cache = self.dns_cache.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
+        cache.insert(authority.to_string(), (addr, Instant::now()));
+        Ok(addr)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::route::AuthScheme;
+    use crate::secret::Secret;
+
+    #[test]
+    fn registry_resolves_known_overrides_and_additions() {
+        // `openai`: known, authority overridden, pool key. `custom`: config-only, pool key.
+        let config = AiConfig {
+            pool_keys: HashMap::from([
+                ("openai".to_string(), Secret::new("sk-openai")),
+                ("custom".to_string(), Secret::new("sk-custom")),
+            ]),
+            provider_authorities: HashMap::from([
+                ("openai".to_string(), "127.0.0.1:9".to_string()),
+                ("custom".to_string(), "llm.internal:8443".to_string()),
+            ]),
+            ..Default::default()
+        };
+        let registry = build_providers(&config);
+
+        // Known + overridden: the config authority wins; the pool key is rendered as Bearer.
+        let openai = &registry["openai"];
+        assert_eq!(openai.authority, "127.0.0.1:9");
+        assert_eq!(openai.auth, AuthScheme::Bearer);
+        assert_eq!(
+            openai.pool_auth_value.as_ref().map(|value| value.expose()),
+            Some("Bearer sk-openai")
+        );
+
+        // Known, untouched: built-in authority and scheme; no pool key ⇒ no managed auth value.
+        let anthropic = &registry["anthropic"];
+        assert_eq!(anthropic.authority, "api.anthropic.com:443");
+        assert_eq!(anthropic.auth, AuthScheme::XApiKey);
+        assert!(anthropic.pool_auth_value.is_none());
+
+        // Config-only: reachable by name, Bearer pool auth, host taken from the authority.
+        let custom = &registry["custom"];
+        assert_eq!(custom.host, "llm.internal");
+        assert_eq!(
+            custom.pool_auth_value.as_ref().map(|value| value.expose()),
+            Some("Bearer sk-custom")
+        );
+    }
+}
diff --git a/crates/gateway/src/store_watch.rs b/crates/gateway/src/store_watch.rs
new file mode 100644
index 0000000..7c98707
--- /dev/null
+++ b/crates/gateway/src/store_watch.rs
@@ -0,0 +1,310 @@
+//! slipstream deny-set watcher — the gateway's **only** use of NATS.
+//!
+//! Seeds the deny-set at boot, then streams deltas. **Fail-open**: a NATS blip keeps the last-known
+//! set (we never clear), so an outage degrades to a stale deny-set, not "reject everything". Auth
+//! and pool/signing keys come from config, so they're unaffected by NATS being down — only
+//! spend/fraud enforcement goes stale.
+//!
+//! Seeding has two modes, chosen by `config.snapshot_path`:
+//!
+//! - **Unset (ephemeral, e.g. Fargate):** scan `blackhole.*` from NATS on first connect. The resume
+//!   revision is kept *in memory* across reconnects, so a NATS blip resumes the watch from where it
+//!   left off (gap-free) rather than re-scanning.
+//! - **Set (edge/tunnel, durable disk):** load slipstream's on-disk snapshot (entries + a saved
+//!   watch cursor), seed from it, and resume the watch from that cursor — a restart skips the scan
+//!   and enforces immediately, even before NATS reconnects. Every applied delta is appended back to
+//!   the snapshot so the file tracks the live set.
+//!
+//! Either way the watch resumes from a **revision** (`watch_prefix_from`), never a bare
+//! `watch_prefix`: the latter uses NATS `DeliverPolicy::New` (no replay), so a deny entry written in
+//! the window between seeding and the subscription attaching would be silently lost. Resuming from
+//! the seeded revision closes that window with no gap and no double-apply (it starts strictly after
+//! the seeded revision). If the backend compacted past the cursor (`CursorExpired`), we drop back to
+//! a fresh scan, which re-establishes a valid baseline.
+//!
+//! Runs as a Pingora `BackgroundService` so the NATS client is created on the serving runtime
+//! (async-nats ties its tasks to the runtime it's built on; connecting earlier would break it).
+
+use crate::deny::{self, DenySet};
+use crate::state::GatewayState;
+use async_trait::async_trait;
+use pingora_core::server::ShutdownWatch;
+use pingora_core::services::background::BackgroundService;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+use store::snapshot::SnapshotWriter;
+use store::{
+    Connection, KvEntry, KvError, KvStore, KvUpdate, NatsConnection, NatsConnectionConfig,
+    StoreConfig, WatchCursor,
+};
+use tracing::{error, info, warn};
+
+const BLACKHOLE_PREFIX: &str = "blackhole."; // KV key prefix of deny entries (`blackhole.{tenant}`)
+
+/// Compact the on-disk snapshot once it grows past this many bytes of appended deltas. The deny-set
+/// is low-churn, so this is rarely hit; it just bounds the log if a tenant flaps.
+const SNAPSHOT_COMPACT_THRESHOLD: u64 = 1024 * 1024;
+
+pub struct WatcherService {
+    pub state: Arc<GatewayState>, // shared gateway state; this service updates its `deny` set
+}
+
+#[async_trait]
+impl BackgroundService for WatcherService {
+    async fn start(&self, _shutdown: ShutdownWatch) {
+        // `cursor`, `writer` and `seeded` outlive each connection, so a NATS blip resumes the watch
+        // instead of re-scanning. NOTE(review): `_shutdown` is never polled; the loop never exits.
+        let mut cursor = WatchCursor::none();
+        let mut writer: Option<SnapshotWriter> = None;
+        let mut seeded = false;
+
+        if let Some(path) = self.state.config.snapshot_path.clone() {
+            let path = PathBuf::from(path);
+            // Snapshot I/O is synchronous (whole-file read/rewrite) — offload it so we never stall
+            // the serving runtime this BackgroundService shares with the proxy.
+            let load_path = path.clone();
+            match tokio::task::spawn_blocking(move || store::snapshot::load(&load_path)).await {
+                Ok(Ok(Some(snap))) => {
+                    let set = denyset_from_entries(snap.entries.values());
+                    info!(count = set.len(), "seeded deny-set from on-disk snapshot");
+                    self.state.deny.store(Arc::new(set));
+                    // A snapshot without a saved cursor can't safely resume (a bare watch would
+                    // race), so only treat it as seeded when it carries a resume point; otherwise
+                    // fall through to a NATS scan on connect.
+                    if !snap.cursor.is_none() {
+                        cursor = snap.cursor;
+                        seeded = true;
+                    }
+                }
+                Ok(Ok(None)) => info!("no on-disk snapshot yet; will seed from a NATS scan"),
+                Ok(Err(e)) => warn!(error = %e, "snapshot load failed; will seed from a NATS scan"),
+                Err(e) => warn!(error = %e, "snapshot load task panicked; seeding from NATS"),
+            }
+            let open_path = path.clone();
+            match tokio::task::spawn_blocking(move || {
+                SnapshotWriter::open(&open_path, SNAPSHOT_COMPACT_THRESHOLD)
+            })
+            .await
+            {
+                Ok(Ok(w)) => writer = Some(w),
+                Ok(Err(e)) => warn!(error = %e, "snapshot writer open failed; running without it"),
+                Err(e) => warn!(error = %e, "snapshot writer open task panicked"),
+            }
+        }
+
+        loop {
+            match connect(&self.state).await {
+                Ok(store) => {
+                    info!("slipstream connected; watching deny-set");
+                    watch_deny(&self.state, store, &mut cursor, &mut writer, &mut seeded).await;
+                    warn!("deny-set watch exited; reconnecting");
+                }
+                Err(e) => error!(error = %e, "slipstream connect failed; retrying"),
+            }
+            tokio::time::sleep(Duration::from_secs(2)).await; // fixed pause, then retry
+        }
+    }
+}
+
+/// Collect KV entries into a `DenySet`, skipping any key `deny::parse_key` does not accept.
+fn denyset_from_entries<'a>(entries: impl Iterator<Item = &'a KvEntry>) -> DenySet {
+    entries
+        .filter_map(|e| deny::parse_key(&e.key).map(|t| (t, deny::parse_reason(&e.value))))
+        .collect()
+}
+
+/// Replace the on-disk snapshot with the result of a fresh scan: drop the old file, write one `Put`
+/// per live entry, then checkpoint `cursor`. Yields the reopened writer, or `None` when the
+/// rewrite fails (the gateway then runs snapshot-less; the in-memory deny-set is unaffected). All
+/// of it is synchronous file I/O, so it runs on a blocking thread off the serving runtime.
+async fn rebuild_snapshot(
+    path: PathBuf,
+    entries: Vec<KvEntry>,
+    cursor: WatchCursor,
+) -> Option<SnapshotWriter> {
+    let rewrite = move || -> Result<SnapshotWriter, store::snapshot::SnapshotError> {
+        // Start from an empty file so a later load can't replay a deleted-but-uncompacted key.
+        // The removal result is ignored on purpose (e.g. there may be no file yet).
+        let _ = std::fs::remove_file(&path);
+        let mut fresh = SnapshotWriter::open(&path, SNAPSHOT_COMPACT_THRESHOLD)?;
+        for entry in &entries {
+            fresh.write_update(&KvUpdate::Put(entry.clone()))?;
+        }
+        // The cursor goes in last, after every entry it covers has been written.
+        fresh.checkpoint(&cursor)?;
+        Ok(fresh)
+    };
+    match tokio::task::spawn_blocking(rewrite).await {
+        Ok(Ok(writer)) => Some(writer),
+        Ok(Err(e)) => {
+            warn!(error = %e, "snapshot rebuild failed; running without on-disk snapshot");
+            None
+        }
+        // The blocking task itself did not complete (logged as a panic, like the other call sites).
+        Err(e) => {
+            warn!(error = %e, "snapshot rebuild task panicked");
+            None
+        }
+    }
+}
+
+/// Connect to NATS with the configured URL/credentials and open the `config_bucket` KV store.
+async fn connect(state: &GatewayState) -> crate::error::Result<Arc<dyn KvStore>> {
+    let cfg = &state.config;
+    let conn = NatsConnection::new(NatsConnectionConfig {
+        url: cfg.nats_url.clone(),
+        creds: cfg.nats_creds.as_ref().map(|s| s.expose().to_string()),
+        creds_file: cfg.nats_creds_file.clone(),
+    });
+    conn.connect().await?;
+    let bucket = StoreConfig {
+        name: cfg.config_bucket.clone(),
+        ..Default::default()
+    };
+    let store = conn.store_with_config(bucket).await?;
+    Ok(store)
+}
+
+/// Seed the deny-set when no baseline exists yet, then apply streamed deltas until the watch ends.
+async fn watch_deny(
+    state: &Arc<GatewayState>,
+    store: Arc<dyn KvStore>,
+    cursor: &mut WatchCursor,
+    writer: &mut Option<SnapshotWriter>,
+    seeded: &mut bool,
+) {
+    // Seed once, on the first connect without a usable resume point (cold boot with no snapshot,
+    // or after a `CursorExpired` reset). The scan reads the live set; the highest revision among
+    // its entries is the baseline the watch resumes strictly after. An empty set ⇒ revision 0 ⇒
+    // resume from the start of history (the deny bucket is low-churn, so a full replay is cheap).
+    if !*seeded {
+        match store.reader().scan(BLACKHOLE_PREFIX).await {
+            Ok(entries) => {
+                let baseline_rev = entries
+                    .iter()
+                    .filter_map(|e| e.version.as_u64())
+                    .max()
+                    .unwrap_or(0);
+                let set = denyset_from_entries(entries.iter());
+                info!(
+                    count = set.len(),
+                    revision = baseline_rev,
+                    "seeded deny-set from scan"
+                );
+                state.deny.store(Arc::new(set));
+                *cursor = WatchCursor::from_u64(baseline_rev);
+                // Persist the freshly-scanned baseline so a later restart can skip the scan. We
+                // *rebuild* the file (not append): this path runs on a cold boot or after a
+                // `CursorExpired` reset, and a stale prior log could otherwise contain a `Put` for a
+                // tenant deleted while we were offline — whose `Delete` was compacted away — which a
+                // later `load()` would replay and resurrect (wrongly re-denying a tenant). A clean
+                // rewrite from the live scan makes the on-disk state exactly match NATS.
+                if writer.is_some() {
+                    if let Some(path) = state.config.snapshot_path.clone() {
+                        *writer =
+                            rebuild_snapshot(PathBuf::from(path), entries, cursor.clone()).await;
+                    }
+                }
+                *seeded = true;
+            }
+            Err(e) => {
+                // No baseline yet — serve whatever's already in memory (fail-open) and let the
+                // reconnect loop retry the scan.
+                warn!(error = %e, "deny-set scan failed; serving current set, will retry");
+                return;
+            }
+        }
+    }
+
+    // Stream deltas, resuming from the seeded revision. Never a bare `watch_prefix` (DeliverPolicy::
+    // New) — that would drop anything written in the seed→subscribe window.
+    let Some(watcher) = store.watcher() else {
+        warn!("store has no watcher; deny-set will not update");
+        return;
+    };
+    let (tx, mut rx) = tokio::sync::mpsc::channel::<KvUpdate>(256);
+    let w = watcher.clone();
+    let start_cursor = cursor.clone();
+    let watch = tokio::spawn(async move {
+        w.watch_prefix_from(BLACKHOLE_PREFIX, &start_cursor, tx)
+            .await
+    });
+
+    // Updates are rcu (clone-on-write); the set is tiny (O(denied)). Each applied delta also
+    // advances the in-memory cursor (so a reconnect resumes from here) and is appended to the
+    // on-disk snapshot if one is configured.
+    while let Some(update) = rx.recv().await {
+        state.deny.rcu(|cur| {
+            let mut set = (**cur).clone();
+            match &update {
+                KvUpdate::Put(e) => {
+                    if let Some(t) = deny::parse_key(&e.key) {
+                        set.insert(t, deny::parse_reason(&e.value));
+                    }
+                }
+                // Delete/Purge = restore (explicit delete or TTL expiry).
+                KvUpdate::Delete { key, .. } | KvUpdate::Purge { key, .. } => {
+                    if let Some(t) = deny::parse_key(key) {
+                        set.remove(t);
+                    }
+                }
+            }
+            Arc::new(set)
+        });
+        *cursor = WatchCursor::from_version(update.version().clone());
+        persist_update(writer, &update, cursor).await;
+    }
+
+    // The watch ended (NATS dropped, or the cursor was compacted away). Inspect why so a compacted
+    // cursor forces a fresh scan on the next connect instead of resuming from a dead revision.
+    match watch.await {
+        Ok(Ok(())) => {} // ended without error: keep `cursor`/`seeded` so the next connect resumes
+        Ok(Err(KvError::CursorExpired)) => {
+            warn!("deny-set resume cursor expired (history compacted past it); will rescan");
+            *seeded = false;
+            *cursor = WatchCursor::none();
+        }
+        Ok(Err(e)) => warn!(error = %e, "deny-set watch ended"),
+        Err(e) => warn!(error = %e, "deny-set watch task panicked"),
+    }
+}
+
+/// Append one applied delta to the on-disk snapshot (if configured) and checkpoint the cursor.
+///
+/// The cursor is checkpointed only after the delta itself was written. If the append fails, the
+/// writer is dropped instead: checkpointing (now or on a later delta) would save a cursor past an
+/// update that never reached disk, so a restart would resume after it and never see it again.
+/// With the writer gone, the saved cursor stays at the last checkpoint until the next restart.
+async fn persist_update(
+    writer: &mut Option<SnapshotWriter>,
+    update: &KvUpdate,
+    cursor: &WatchCursor,
+) {
+    let Some(w) = writer.as_mut() else { return };
+    if let Err(e) = w.write_update(update) {
+        warn!(error = %e, "snapshot write failed; disabling snapshot writer");
+        *writer = None;
+        return;
+    }
+    // A failed checkpoint only leaves an older cursor on disk, which replays this delta later.
+    let needs_compact = w.checkpoint(cursor).unwrap_or_else(|e| {
+        warn!(error = %e, "snapshot checkpoint failed");
+        false
+    });
+    if !needs_compact {
+        return;
+    }
+    // `compact` reads and rewrites the whole file, so move the writer into a blocking task and
+    // take it back afterwards. On failure the writer stays `None` and we run snapshot-less until
+    // the next restart — the deny-set itself is unaffected (it lives in the ArcSwap, fed by NATS).
+    if let Some(mut w) = writer.take() {
+        match tokio::task::spawn_blocking(move || w.compact().map(|()| w)).await {
+            Ok(Ok(w)) => *writer = Some(w),
+            Ok(Err(e)) => {
+                warn!(error = %e, "snapshot compaction failed; disabling snapshot writer")
+            }
+            Err(e) => warn!(error = %e, "snapshot compaction task panicked"),
+        }
+    }
+}
diff --git a/crates/gateway/src/usage.rs b/crates/gateway/src/usage.rs
new file mode 100644
index 0000000..f5281fb
--- /dev/null
+++ b/crates/gateway/src/usage.rs
@@ -0,0 +1,190 @@
+//! Token-usage extraction — the "passive tap" the gateway emits as billing *facts*.
+//!
+//! We never compute price here (pricing is a closed downstream consumer); we only extract raw
+//! token counts. Two shapes per provider: the non-streaming JSON body, and the terminal event of
+//! an SSE stream. For streaming we scan the relayed bytes for the usage event but never block the
+//! relay on it (see `proxy`).
+
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct Usage {
+    pub input_tokens: u64, // OpenAI `prompt_tokens` / Anthropic `input_tokens`
+    pub output_tokens: u64, // OpenAI `completion_tokens` / Anthropic `output_tokens`
+    pub cache_read_tokens: u64, // OpenAI `cached_tokens` / Anthropic `cache_read_input_tokens`
+    pub cache_write_tokens: u64, // Anthropic `cache_creation_input_tokens`; always 0 for OpenAI
+}
+
+fn u64_at(v: &serde_json::Value, ptr: &str) -> u64 {
+    v.pointer(ptr).and_then(serde_json::Value::as_u64).unwrap_or_default() // absent/non-u64 ⇒ 0
+}
+
+/// OpenAI non-streaming body: reads `usage.{prompt_tokens, completion_tokens}` plus the cached
+/// prompt-token detail. `None` when the body is not JSON or has no top-level `usage`.
+pub fn openai_body(body: &[u8]) -> Option<Usage> {
+    let parsed = serde_json::from_slice::<serde_json::Value>(body).ok()?;
+    parsed.get("usage").map(|usage| Usage {
+        input_tokens: u64_at(usage, "/prompt_tokens"),
+        output_tokens: u64_at(usage, "/completion_tokens"),
+        cache_read_tokens: u64_at(usage, "/prompt_tokens_details/cached_tokens"),
+        cache_write_tokens: 0,
+    })
+}
+
+/// Anthropic non-streaming body: reads `usage.{input_tokens, output_tokens}` and both cache
+/// counters. `None` when the body is not JSON or has no top-level `usage`.
+pub fn anthropic_body(body: &[u8]) -> Option<Usage> {
+    let parsed = serde_json::from_slice::<serde_json::Value>(body).ok()?;
+    parsed.get("usage").map(|usage| Usage {
+        input_tokens: u64_at(usage, "/input_tokens"),
+        output_tokens: u64_at(usage, "/output_tokens"),
+        cache_read_tokens: u64_at(usage, "/cache_read_input_tokens"),
+        cache_write_tokens: u64_at(usage, "/cache_creation_input_tokens"),
+    })
+}
+
+/// Yield every JSON value found on a `data:` line of an SSE byte stream, in stream order. Lines
+/// without the `data:` prefix, the `[DONE]` sentinel and payloads that fail to parse are skipped.
+/// Shared by both stream parsers below.
+fn sse_data_objects(sse: &[u8]) -> impl Iterator<Item = serde_json::Value> + '_ {
+    sse.split(|&b| b == b'\n')
+        .filter_map(|line| line.strip_prefix(b"data:"))
+        // At most one space after the colon is dropped; the rest is the payload.
+        .map(|payload| payload.strip_prefix(b" ").unwrap_or(payload))
+        // The end-of-stream sentinel is not JSON; drop it explicitly.
+        .filter(|payload| *payload != b"[DONE]")
+        .filter_map(|payload| serde_json::from_slice::<serde_json::Value>(payload).ok())
+}
+
+/// OpenAI streaming (requires `stream_options.include_usage`): scan the `data:` events for a
+/// non-null top-level `usage`; when several events carry one, the last wins.
+pub fn openai_stream(sse: &[u8]) -> Option<Usage> {
+    sse_data_objects(sse)
+        .filter_map(|event| {
+            // An absent key and an explicit `null` both mean "no usage on this event".
+            let usage = event.get("usage").filter(|u| !u.is_null())?;
+            Some(Usage {
+                input_tokens: u64_at(usage, "/prompt_tokens"),
+                output_tokens: u64_at(usage, "/completion_tokens"),
+                cache_read_tokens: u64_at(usage, "/prompt_tokens_details/cached_tokens"),
+                cache_write_tokens: 0,
+            })
+        })
+        .last()
+}
+
+/// Anthropic streaming: input + cache counters are read from `message.usage` (`message_start`);
+/// the output count comes from a top-level `usage.output_tokens` (`message_delta`), last non-zero
+/// value wins. `None` when no event carried either usage object.
+pub fn anthropic_stream(sse: &[u8]) -> Option<Usage> {
+    let mut total = Usage::default();
+    let mut seen = false;
+    for event in sse_data_objects(sse) {
+        if let Some(start) = event.pointer("/message/usage") {
+            total.input_tokens = u64_at(start, "/input_tokens");
+            total.cache_read_tokens = u64_at(start, "/cache_read_input_tokens");
+            total.cache_write_tokens = u64_at(start, "/cache_creation_input_tokens");
+            seen = true;
+        }
+        if let Some(delta) = event.get("usage") {
+            match u64_at(delta, "/output_tokens") {
+                0 => {}
+                running => total.output_tokens = running,
+            }
+            seen = true;
+        }
+    }
+    seen.then_some(total)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn openai_nonstreaming() {
+        let body = br#"{"usage":{"prompt_tokens":12,"completion_tokens":34,
+            "prompt_tokens_details":{"cached_tokens":4}}}"#;
+        assert_eq!(
+            openai_body(body).unwrap(),
+            Usage {
+                input_tokens: 12,
+                output_tokens: 34,
+                cache_read_tokens: 4,
+                cache_write_tokens: 0
+            }
+        );
+    }
+
+    #[test]
+    fn anthropic_nonstreaming() {
+        let body = br#"{"usage":{"input_tokens":100,"output_tokens":50,
+            "cache_read_input_tokens":10,"cache_creation_input_tokens":7}}"#;
+        assert_eq!(
+            anthropic_body(body).unwrap(),
+            Usage {
+                input_tokens: 100,
+                output_tokens: 50,
+                cache_read_tokens: 10,
+                cache_write_tokens: 7
+            }
+        );
+    }
+
+    #[test]
+    fn openai_streaming_terminal_usage() {
+        let sse = b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n\
+                    data: {\"choices\":[],\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":9}}\n\n\
+                    data: [DONE]\n\n";
+        assert_eq!(
+            openai_stream(sse).unwrap(),
+            Usage {
+                input_tokens: 5,
+                output_tokens: 9,
+                cache_read_tokens: 0,
+                cache_write_tokens: 0
+            }
+        );
+    }
+
+    #[test]
+    fn anthropic_streaming_accumulates() {
+        let sse = b"event: message_start\n\
+                    data: {\"type\":\"message_start\",\"message\":{\"usage\":{\"input_tokens\":20,\"output_tokens\":0}}}\n\n\
+                    event: message_delta\n\
+                    data: {\"type\":\"message_delta\",\"usage\":{\"output_tokens\":15}}\n\n";
+        assert_eq!(
+            anthropic_stream(sse).unwrap(),
+            Usage {
+                input_tokens: 20,
+                output_tokens: 15,
+                cache_read_tokens: 0,
+                cache_write_tokens: 0
+            }
+        );
+    }
+
+    #[test]
+    fn anthropic_streaming_includes_cache_tokens() {
+        // Cache tokens ride in `message_start.message.usage` alongside input_tokens. The earlier
+        // accumulation test omits them; this guards the `cache_read`/`cache_creation` pointers so a
+        // regression can't silently zero cache billing.
+        let sse = b"event: message_start\n\
+                    data: {\"type\":\"message_start\",\"message\":{\"usage\":{\"input_tokens\":20,\"output_tokens\":0,\"cache_read_input_tokens\":12,\"cache_creation_input_tokens\":8}}}\n\n\
+                    event: message_delta\n\
+                    data: {\"type\":\"message_delta\",\"usage\":{\"output_tokens\":15}}\n\n";
+        assert_eq!(
+            anthropic_stream(sse).unwrap(),
+            Usage {
+                input_tokens: 20,
+                output_tokens: 15,
+                cache_read_tokens: 12,
+                cache_write_tokens: 8
+            }
+        );
+    }
+
+    #[test]
+    fn no_usage_returns_none() {
+        assert!(openai_stream(b"data: {\"choices\":[]}\n\n").is_none());
+        assert!(anthropic_body(b"{}").is_none()); // no `usage` key ⇒ `None`, not a zeroed `Usage`
+    }
+}
diff --git a/crates/gateway/tests/common/mod.rs b/crates/gateway/tests/common/mod.rs
new file mode 100644
index 0000000..45b86fb
--- /dev/null
+++ b/crates/gateway/tests/common/mod.rs
@@ -0,0 +1,517 @@
+//! e2e harness: a real `beyond-ai` binary, a real `nats-server` (JetStream KV backing the deny-set),
+//! and a mock HTTP upstream that records what the gateway forwarded.
+//!
+//! Requires `nats-server` on PATH — run via `mise run test:integration:rs`.
+//! Signing keys + pool keys are passed via the gateway's *config* (not NATS); NATS carries only the
+//! deny-set. Every component picks a free port and cleans up on drop, so tests run in parallel.
+
+#![allow(dead_code)]
+
+use std::io::Write;
+use std::net::TcpListener as StdTcpListener;
+use std::process::{Child, Command};
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+
+use base64::Engine;
+use bytes::Bytes;
+use http_body_util::{BodyExt, Full};
+use hyper::service::service_fn;
+use hyper::{HeaderMap, Request, Response};
+use hyper_util::rt::TokioIo;
+use store::Connection;
+use tokio::net::TcpListener;
+use tokio::time::{sleep, timeout};
+
+/// Reserve a TCP port that no earlier `free_port()` call in this test binary has handed out.
+///
+/// All tests share **one** process and run on concurrent threads. Once a `bind(:0)` listener is
+/// closed the OS is free to give that same ephemeral port to the very next `bind(:0)`, so two
+/// calls (a gateway's `listen` + `metrics` ports, or two tests racing) could receive the same
+/// number; whichever component binds second then fails and a *different* test flakes. A
+/// process-global set of handed-out ports guarantees distinct results within the run, and on a
+/// collision the colliding listener stays bound so the OS has to pick another port next time.
+///
+/// A TOCTOU window still exists between returning a port and a *subprocess* (nats/gateway)
+/// binding it, with respect to other OS processes — inherent when the bind happens elsewhere.
+/// In-process servers should bind `:0` and read the port back instead (see `MockUpstream`).
+pub fn free_port() -> u16 {
+    use std::collections::HashSet;
+    use std::sync::OnceLock;
+    static HANDED_OUT: OnceLock<Mutex<HashSet<u16>>> = OnceLock::new();
+    let registry = HANDED_OUT.get_or_init(Mutex::default);
+
+    // Listeners whose port was already taken: parked (still bound) until this call returns.
+    let mut parked = Vec::new();
+    for _attempt in 0..1000 {
+        let probe = StdTcpListener::bind("127.0.0.1:0").unwrap();
+        let candidate = probe.local_addr().unwrap().port();
+        let mut taken = registry.lock().unwrap_or_else(|p| p.into_inner());
+        if taken.insert(candidate) {
+            return candidate; // `probe` closes on return, leaving the port for the (sub)process.
+        }
+        parked.push(probe);
+    }
+    panic!("could not find an unused free port after 1000 attempts");
+}
+
+/// Standard base64 encoding, used to write an Ed25519 public key into the `signing_keys` config.
+pub fn b64(bytes: &[u8]) -> String {
+    Engine::encode(&base64::engine::general_purpose::STANDARD, bytes)
+}
+
+/// Reproducible Ed25519 keypair derived from `[seed; 32]`: (raw 32-byte public key, signing key).
+pub fn test_keypair(seed: u8) -> (Vec<u8>, ed25519_dalek::SigningKey) {
+    let signing = ed25519_dalek::SigningKey::from_bytes(&[seed; 32]);
+    (Vec::from(signing.verifying_key().to_bytes()), signing)
+}
+
+/// Block until something accepts TCP connections on `127.0.0.1:{port}`.
+///
+/// Retries a connect every 50 ms; panics naming `what` if nothing is listening within 20 s.
+async fn wait_for_port(port: u16, what: &str) {
+    let addr = ("127.0.0.1", port);
+    let until_listening = async {
+        while tokio::net::TcpStream::connect(addr).await.is_err() {
+            sleep(Duration::from_millis(50)).await;
+        }
+    };
+    let deadline = Duration::from_secs(20);
+    if timeout(deadline, until_listening).await.is_err() {
+        panic!("{what} did not come up on port {port}");
+    }
+}
+
+// --- nats-server (JetStream) ------------------------------------------------
+
+pub struct Nats {
+    child: Child, // the spawned `nats-server` process
+    pub port: u16, // client port the server listens on (127.0.0.1)
+    store_dir: std::path::PathBuf, // JetStream storage dir (`-sd`); removed on drop
+}
+
+impl Nats {
+    /// Spawn `nats-server` with JetStream on a fresh localhost port, then wait until it accepts
+    /// connections. Panics if the binary cannot be spawned or never starts listening.
+    pub async fn start() -> Self {
+        let port = free_port();
+        let store_dir = std::env::temp_dir().join(format!("beyond-ai-nats-{port}"));
+        // A run that died without running `Drop` may have left this directory behind; wipe it
+        // so a reused port never resurrects an earlier run's JetStream state (stale deny-set).
+        let _ = std::fs::remove_dir_all(&store_dir);
+        let _ = std::fs::create_dir_all(&store_dir);
+        let child = Command::new("nats-server")
+            .args(["-js", "-a", "127.0.0.1", "-p"])
+            .arg(port.to_string())
+            .arg("-sd")
+            .arg(&store_dir) // passed as an `OsStr`: no panic on a non-UTF-8 temp dir
+            .stdout(std::process::Stdio::null())
+            .stderr(std::process::Stdio::null())
+            .spawn()
+            .expect("spawn nats-server (on PATH? run via mise)");
+        let nats = Nats {
+            child,
+            port,
+            store_dir,
+        };
+        wait_for_port(port, "nats-server").await;
+        nats
+    }
+}
+
+impl Nats {
+    /// Kill the server mid-test (for fail-open coverage). Idempotent with `Drop`.
+    pub fn stop(&mut self) {
+        let _ = self.child.kill(); // best-effort: the error is deliberately ignored
+        let _ = self.child.wait(); // block until the process has actually exited
+    }
+}
+
+impl Drop for Nats {
+    fn drop(&mut self) {
+        self.stop(); // kill *and* reap: no zombie, and the server is gone before its dir is removed
+        let _ = std::fs::remove_dir_all(&self.store_dir);
+    }
+}
+
+pub async fn put_kv(nats_port: u16, key: &str, value: &[u8]) {
+    open_writer(nats_port).await.put(key, value).await.unwrap(); // new connection per call
+}
+
+pub async fn del_kv(nats_port: u16, key: &str) {
+    open_writer(nats_port).await.delete(key).await.unwrap(); // new connection per call
+}
+
+/// Connect to the NATS server on `nats_port` and return a writer for the `ai-gateway` KV bucket.
+async fn open_writer(nats_port: u16) -> std::sync::Arc<dyn store::KvWriter> {
+    let nats_cfg = store::NatsConnectionConfig {
+        url: format!("nats://127.0.0.1:{nats_port}"),
+        creds: None,
+        creds_file: None,
+    };
+    let conn = store::NatsConnection::new(nats_cfg);
+    conn.connect().await.unwrap();
+    let bucket_cfg = store::StoreConfig {
+        name: "ai-gateway".into(),
+        ..Default::default()
+    };
+    let bucket = conn.store_with_config(bucket_cfg).await.unwrap();
+    bucket.writer().expect("bucket is writable")
+}
+
+// --- mock upstream provider -------------------------------------------------
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Mode {
+    /// OpenAI-shaped non-streaming JSON body.
+    Json,
+    /// OpenAI-shaped SSE stream with a terminal usage chunk.
+    Sse,
+    /// Anthropic-shaped non-streaming JSON body (`usage.input_tokens`).
+    AnthropicJson,
+    /// OpenAI-shaped SSE stream with >128 KiB of content *before* the usage chunk — forces the
+    /// proxy's response-tail compaction path.
+    SseLarge,
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct Captured {
+    pub path: String, // request path exactly as the mock received it
+    pub authorization: Option<String>, // `authorization` header, if present and readable as text
+    pub x_api_key: Option<String>, // `x-api-key` header, same rules
+    pub host: Option<String>, // `host` header, same rules
+    pub body: Vec<u8>, // full request body; empty when it could not be read
+}
+
+pub struct MockUpstream {
+    pub port: u16, // OS-assigned localhost port the mock listens on
+    captured: Arc<Mutex<Option<Captured>>>, // most recent request seen, if any
+    task: tokio::task::JoinHandle<()>, // the accept loop; aborted on drop
+}
+
+const CANNED_JSON: &str = r#"{"id":"chatcmpl-mock","object":"chat.completion","model":"gpt-4o-2024-08-06","choices":[{"index":0,"message":{"role":"assistant","content":"hi"},"finish_reason":"stop"}],"usage":{"prompt_tokens":11,"completion_tokens":7,"total_tokens":18}}"#; // usage: 11 prompt / 7 completion tokens
+
+const CANNED_SSE: &str = "data: {\"id\":\"chatcmpl-mock\",\"object\":\"chat.completion.chunk\",\"model\":\"gpt-4o-2024-08-06\",\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\ndata: {\"choices\":[],\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":9}}\n\ndata: [DONE]\n\n"; // usage chunk: 5 prompt / 9 completion tokens
+
+const CANNED_ANTHROPIC_JSON: &str = r#"{"id":"msg_mock","type":"message","model":"claude-opus-4-8","content":[{"type":"text","text":"hi"}],"usage":{"input_tokens":13,"output_tokens":7}}"#; // usage: 13 input / 7 output tokens
+
+/// An OpenAI SSE stream whose first chunk carries ~130 KiB of content, pushing the proxy's
+/// response tail past `2 × USAGE_TAIL_CAP` (128 KiB) so it compacts at least once before the
+/// trailing usage chunk arrives. The usage event must survive in the retained 64 KiB tail.
+fn large_sse() -> String {
+    // Everything up to the opening quote of the first chunk's `content` string.
+    let head = r#"data: {"id":"chatcmpl-mock","object":"chat.completion.chunk","model":"gpt-4o-2024-08-06","choices":[{"delta":{"content":""#;
+    let filler = "x".repeat(130 * 1024);
+    // Closes the first chunk, then the usage chunk and the `[DONE]` terminator.
+    let tail = "\"}}]}\n\ndata: {\"choices\":[],\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":9}}\n\ndata: [DONE]\n\n";
+    [head, filler.as_str(), tail].concat()
+}
+
+impl MockUpstream {
+    pub async fn start(mode: Mode) -> Self {
+        // Bind `:0` and read the port back, keeping the listener open the whole time — no
+        // free_port()→rebind window for another test to slip into (this is an in-process server).
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let port = listener.local_addr().unwrap().port();
+        let captured: Arc<Mutex<Option<Captured>>> = Arc::new(Mutex::new(None));
+        let cap = captured.clone();
+        let task = tokio::spawn(async move {
+            loop {
+                let Ok((stream, _)) = listener.accept().await else {
+                    break; // accept failed: stop serving
+                };
+                let io = TokioIo::new(stream);
+                let cap = cap.clone();
+                tokio::spawn(async move {
+                    let svc = service_fn(move |req: Request<hyper::body::Incoming>| {
+                        let cap = cap.clone();
+                        async move {
+                            let path = req.uri().path().to_string();
+                            let h: &HeaderMap = req.headers();
+                            let get =
+                                |k: &str| h.get(k).and_then(|v| v.to_str().ok()).map(String::from);
+                            let c = Captured {
+                                path,
+                                authorization: get("authorization"),
+                                x_api_key: get("x-api-key"),
+                                host: get("host"),
+                                body: req
+                                    .into_body()
+                                    .collect()
+                                    .await
+                                    .map(|b| b.to_bytes().to_vec())
+                                    .unwrap_or_default(), // body read error → recorded as empty
+                            };
+                            *cap.lock().unwrap() = Some(c); // overwrite: only the latest request is kept
+                            let (ct, payload): (&str, Bytes) = match mode {
+                                Mode::Json => (
+                                    "application/json",
+                                    Bytes::from_static(CANNED_JSON.as_bytes()),
+                                ),
+                                Mode::Sse => (
+                                    "text/event-stream",
+                                    Bytes::from_static(CANNED_SSE.as_bytes()),
+                                ),
+                                Mode::AnthropicJson => (
+                                    "application/json",
+                                    Bytes::from_static(CANNED_ANTHROPIC_JSON.as_bytes()),
+                                ),
+                                Mode::SseLarge => ("text/event-stream", Bytes::from(large_sse())),
+                            };
+                            Ok::<_, std::convert::Infallible>(
+                                Response::builder()
+                                    .status(200) // always 200, whatever the request was
+                                    .header("content-type", ct)
+                                    .body(Full::new(payload))
+                                    .unwrap(),
+                            )
+                        }
+                    });
+                    let _ = hyper::server::conn::http1::Builder::new()
+                        .serve_connection(io, svc)
+                        .await;
+                });
+            }
+        });
+        MockUpstream {
+            port,
+            captured,
+            task,
+        }
+    }
+
+    pub fn authority(&self) -> String {
+        format!("127.0.0.1:{}", self.port) // `host:port` form, no scheme
+    }
+
+    pub fn captured(&self) -> Option<Captured> {
+        self.captured.lock().unwrap().clone() // snapshot copy; `None` until a request arrives
+    }
+}
+
+impl Drop for MockUpstream {
+    fn drop(&mut self) {
+        self.task.abort(); // cancel the accept-loop task spawned in `start`
+    }
+}
+
+// --- the real beyond-ai binary ----------------------------------------------
+
+pub struct Gateway {
+    child: Child, // the spawned `beyond-ai` process; killed on drop
+    pub port: u16, // proxy port (`listen` in the generated config)
+    pub metrics_port: u16, // `metrics_listen` port, scraped by `metrics()`
+    config_path: std::path::PathBuf, // generated TOML config; deleted on drop
+}
+
+/// Pool key configured for `provider` — distinct per provider so key-swap assertions can differ.
+fn pool_key(provider: &str) -> &'static str {
+    const POOL_KEYS: [(&str, &str); 4] = [
+        ("openai", "sk-pool-secret"),
+        ("anthropic", "sk-anthropic-pool"),
+        ("fireworks", "sk-fireworks-pool"),
+        ("openrouter", "sk-openrouter-pool"),
+    ];
+    let hit = POOL_KEYS.iter().find(|(name, _)| *name == provider);
+    hit.map_or("sk-unknown-pool", |&(_, key)| key)
+}
+
+/// Builds a gateway config, choosing which providers are *configured* (authority → the mock + a
+/// pool key). A managed request to a provider absent from this list has no pool key → 503.
+pub struct GatewayBuilder {
+    nats_port: u16, // written into `nats_url`
+    authority: String, // mock upstream `host:port` every configured provider points at
+    signkey_b64: String, // base64 public key written as signing key id `1`
+    providers: Vec<&'static str>, // providers given an authority + pool key (mock mode only)
+    snapshot_path: Option<String>, // optional on-disk deny-set snapshot location
+    real_upstreams: bool, // true → `upstream_tls = true` and no authority overrides
+    pool_key_overrides: Vec<(String, String)>, // (provider, key); only written in real-upstream mode
+}
+
+impl GatewayBuilder {
+    /// Set which providers are configured. Defaults to `["openai", "fireworks"]`.
+    pub fn providers(mut self, providers: &[&'static str]) -> Self {
+        self.providers = providers.to_vec();
+        self
+    }
+
+    /// Point the gateway at the **real** provider hosts over TLS (the `route::KNOWN_PROVIDERS`
+    /// defaults), instead of the plaintext mock. Used by the live smoke tests (`tests/smoke.rs`):
+    /// no authority overrides, no pool keys, no signing keys — smoke traffic is BYO (the caller's
+    /// real provider token, passed through), so none of that is needed.
+    pub fn real_upstreams(mut self) -> Self {
+        self.real_upstreams = true;
+        self
+    }
+
+    /// Set the managed pool key for a provider by name — in `real_upstreams` mode this is the *real*
+    /// provider key the gateway swaps in for a managed (`bai_…`) request. Combine with a signing key
+    /// (the `signkey_b64` passed to `builder`) to smoke-test the full managed path against the real
+    /// provider.
+    pub fn pool_key(mut self, provider: &str, key: &str) -> Self {
+        self.pool_key_overrides
+            .push((provider.to_string(), key.to_string()));
+        self
+    }
+
+    /// Point the gateway at an on-disk deny-set snapshot. Pass the same path to two `start()` calls
+    /// to model a restart that reloads from disk.
+    pub fn snapshot_path(mut self, path: impl Into<String>) -> Self {
+        self.snapshot_path = Some(path.into());
+        self
+    }
+
+    pub async fn start(self) -> Gateway {
+        let port = free_port();
+        let metrics_port = free_port();
+        let config_path = std::env::temp_dir().join(format!("beyond-ai-config-{port}.toml"));
+        let nats_port = self.nats_port;
+        // Scalars first, `[…]` tables last (TOML ordering).
+        let tls = self.real_upstreams;
+        let mut cfg = format!(
+            "listen = \"127.0.0.1:{port}\"\n\
+             metrics_listen = \"127.0.0.1:{metrics_port}\"\n\
+             nats_url = \"nats://127.0.0.1:{nats_port}\"\n\
+             config_bucket = \"ai-gateway\"\n\
+             upstream_tls = {tls}\n"
+        );
+        if let Some(path) = &self.snapshot_path {
+            cfg.push_str(&format!("snapshot_path = \"{path}\"\n"));
+        }
+        if self.real_upstreams {
+            // Real-host smoke mode: built-in provider defaults (no authority overrides). For a
+            // *managed* smoke we still write the caller-supplied pool key(s) — the real provider key
+            // the gateway swaps in — and the signing key the minted virtual key verifies against.
+            // With neither set, this is a BYO smoke (the caller's token passes through).
+            if !self.pool_key_overrides.is_empty() {
+                cfg.push_str("\n[pool_keys]\n");
+                for (p, k) in &self.pool_key_overrides {
+                    cfg.push_str(&format!("{p} = \"{k}\"\n"));
+                }
+            }
+            if !self.signkey_b64.is_empty() {
+                cfg.push_str(&format!("\n[signing_keys]\n1 = \"{}\"\n", self.signkey_b64));
+            }
+        } else {
+            // Every configured provider points at the one mock upstream...
+            cfg.push_str("\n[provider_authorities]\n");
+            for p in &self.providers {
+                cfg.push_str(&format!("{p} = \"{}\"\n", self.authority));
+            }
+            // ...with a distinct pool key per provider so key-swap assertions can tell them apart.
+            cfg.push_str("\n[pool_keys]\n");
+            for p in &self.providers {
+                cfg.push_str(&format!("{p} = \"{}\"\n", pool_key(p)));
+            }
+            cfg.push_str(&format!("\n[signing_keys]\n1 = \"{}\"\n", self.signkey_b64));
+        }
+        std::fs::File::create(&config_path)
+            .unwrap()
+            .write_all(cfg.as_bytes())
+            .unwrap();
+
+        let child = Command::new(env!("CARGO_BIN_EXE_beyond-ai"))
+            .arg("run")
+            .arg("-c")
+            .arg(&config_path)
+            .env(
+                "AI_LOG", // forward the caller's AI_LOG, falling back to "warn"
+                std::env::var("AI_LOG").unwrap_or_else(|_| "warn".into()),
+            )
+            .spawn()
+            .expect("spawn beyond-ai");
+        let gw = Gateway {
+            child,
+            port,
+            metrics_port,
+            config_path,
+        };
+        wait_for_port(port, "beyond-ai").await; // only the proxy port is awaited, not metrics_port
+        gw
+    }
+}
+
+impl Gateway {
+    /// Start the gateway pointed at `nats` (deny-set) + the mock upstream, configuring the OpenAI
+    /// and Fireworks providers. Signing key + pool key come from config (mirrors production: NATS
+    /// holds only the deny-set). For other provider sets use [`Gateway::builder`].
+    pub async fn start(nats_port: u16, openai_authority: &str, signkey_b64: &str) -> Self {
+        Gateway::builder(nats_port, openai_authority, signkey_b64)
+            .start()
+            .await
+    }
+
+    /// A configurable gateway (which providers exist, etc.). Defaults match [`Gateway::start`].
+    pub fn builder(nats_port: u16, authority: &str, signkey_b64: &str) -> GatewayBuilder {
+        GatewayBuilder {
+            nats_port,
+            authority: authority.to_string(),
+            signkey_b64: signkey_b64.to_string(),
+            providers: vec!["openai", "fireworks"],
+            snapshot_path: None,
+            real_upstreams: false, // default: plaintext mock upstream
+            pool_key_overrides: Vec::new(),
+        }
+    }
+
+    pub fn url(&self) -> String {
+        format!("http://127.0.0.1:{}", self.port) // base URL, no trailing slash
+    }
+
+    pub async fn metrics(&self) -> String {
+        reqwest::get(format!("http://127.0.0.1:{}/metrics", self.metrics_port))
+            .await
+            .unwrap() // panics if the metrics endpoint cannot be reached
+            .text()
+            .await
+            .unwrap()
+    }
+}
+
+impl Drop for Gateway {
+    fn drop(&mut self) {
+        let _ = self.child.kill().and_then(|()| self.child.wait()); // kill, then reap (no zombie)
+        let _ = std::fs::remove_file(&self.config_path);
+    }
+}
+
+// --- assertions -------------------------------------------------------------
+
+/// Value (last space-separated token) of the first line that starts with `name` and contains
+/// `label_value`; `0.0` when no line matches or the token is not a number.
+pub fn parse_metric(metrics: &str, name: &str, label_value: &str) -> f64 {
+    let is_series = |line: &&str| line.starts_with(name) && line.contains(label_value);
+    let sample = metrics.lines().find(is_series);
+    let raw = sample.and_then(|line| line.rsplit(' ').next());
+    raw.and_then(|token| token.parse().ok()).unwrap_or(0.0)
+}
+
+pub async fn wait_for_metric(gw: &Gateway, name: &str, label: &str, min: f64) {
+    let r = timeout(Duration::from_secs(5), async {
+        loop {
+            if parse_metric(&gw.metrics().await, name, label) >= min {
+                return; // the metric has reached `min`
+            }
+            sleep(Duration::from_millis(150)).await; // re-scrape every 150 ms
+        }
+    })
+    .await;
+    assert!(r.is_ok(), "metric {name}{{{label}}} never reached {min}"); // 5 s deadline elapsed
+}
+
+/// Poll `f` until it yields the status `want`.
+///
+/// `f` is invoked every 150 ms; the test fails if `want` has not been seen within 10 s.
+pub async fn wait_for_status<F, Fut>(want: u16, mut f: F)
+where
+    F: FnMut() -> Fut,
+    Fut: std::future::Future<Output = u16>,
+{
+    let polling = async {
+        while f().await != want {
+            sleep(Duration::from_millis(150)).await;
+        }
+    };
+    let outcome = timeout(Duration::from_secs(10), polling).await;
+    assert!(outcome.is_ok(), "status never became {want}");
+}
diff --git a/crates/gateway/tests/e2e.rs b/crates/gateway/tests/e2e.rs
new file mode 100644
index 0000000..ae6f648
--- /dev/null
+++ b/crates/gateway/tests/e2e.rs
@@ -0,0 +1,663 @@
+//! End-to-end: real `beyond-ai` binary + real nats-server + mock upstream.
+//! Run via `mise run test:integration:rs` (needs `nats-server` on PATH).
+//!
+//! Signing key + pool key come from the gateway's *config*; NATS carries only the deny-set.
+
+mod common;
+
+use beyond_ai::key::{VirtualKey, mint};
+use common::*;
+
+fn body_for(model: &str) -> String {
+    format!(r#"{{"model":"{model}","messages":[{{"role":"user","content":"hi"}}]}}"#) // `model` is spliced in unescaped
+}
+
+async fn post_status(client: &reqwest::Client, url: &str, key: &str, body: String) -> u16 {
+    let request = client
+        .post(format!("{url}/v1/chat/completions"))
+        .header("authorization", format!("Bearer {key}"))
+        .header("content-type", "application/json")
+        .body(body);
+    match request.send().await {
+        Ok(response) => response.status().as_u16(),
+        Err(_) => 0, // transport failure: report status 0 so pollers simply retry
+    }
+}
+
+async fn post_status_provider(
+    client: &reqwest::Client,
+    url: &str,
+    key: &str,
+    provider: &str,
+    body: String,
+) -> u16 {
+    let request = client
+        .post(format!("{url}/v1/chat/completions"))
+        .header("authorization", format!("Bearer {key}"))
+        .header("x-beyond-provider", provider)
+        .header("content-type", "application/json")
+        .body(body);
+    match request.send().await {
+        Ok(response) => response.status().as_u16(),
+        Err(_) => 0, // transport failure: report status 0 so pollers simply retry
+    }
+}
+
+/// POST to `path` carrying the key in `x-api-key` (Anthropic-SDK style) rather than as `Bearer`.
+async fn post_status_xapikey(
+    client: &reqwest::Client,
+    url: &str,
+    path: &str,
+    key: &str,
+    body: String,
+) -> u16 {
+    let request = client
+        .post(format!("{url}{path}"))
+        .header("x-api-key", key)
+        .header("content-type", "application/json")
+        .body(body);
+    match request.send().await {
+        Ok(response) => response.status().as_u16(),
+        Err(_) => 0, // transport failure: report status 0 so pollers simply retry
+    }
+}
+
+/// Write `request` verbatim to the gateway over a raw TCP socket and return the response status
+/// (0 if none can be parsed). Lets a test declare a Content-Length the body guard must reject
+/// *without* transferring that many bytes — the guard fires on the header, before any body read.
+async fn raw_status(port: u16, request: &str) -> u16 {
+    use tokio::io::{AsyncReadExt, AsyncWriteExt};
+    let mut conn = tokio::net::TcpStream::connect(("127.0.0.1", port))
+        .await
+        .unwrap();
+    conn.write_all(request.as_bytes()).await.unwrap();
+    conn.flush().await.unwrap();
+    // A single read suffices: only the status line at the very start of the reply is inspected.
+    let mut head = [0u8; 256];
+    let len = conn.read(&mut head).await.unwrap();
+    // Status line is `HTTP/1.1 <code> <reason>`: the code is the second whitespace-split token.
+    let text = String::from_utf8_lossy(&head[..len]);
+    let code = text.split_whitespace().nth(1);
+    code.and_then(|c| c.parse().ok()).unwrap_or(0)
+}
+
+#[tokio::test]
+async fn managed_swaps_key_relays_body_and_meters_usage() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(1);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 42,
+            vpc_id: 7,
+        },
+        1, // must match the signing-key id `1` written into the gateway config
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+        })
+        .await; // poll until the managed request is accepted
+    }
+
+    let resp = client
+        .post(format!("{}/v1/chat/completions", gw.url()))
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(body_for("gpt-4o"))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    assert!(resp.text().await.unwrap().contains("\"chatcmpl-mock\"")); // mock body relayed back
+
+    // Managed: the mock saw the real pool key, never the virtual key.
+    let cap = mock.captured().expect("mock received a request");
+    assert_eq!(cap.path, "/v1/chat/completions");
+    assert_eq!(cap.authorization.as_deref(), Some("Bearer sk-pool-secret"));
+    assert!(!cap.body.is_empty());
+
+    wait_for_metric(&gw, "ai_tokens_total", "input", 11.0).await; // 11 = canned `prompt_tokens`
+
+    // Bad managed key → 401.
+    assert_eq!(
+        post_status(
+            &client,
+            &gw.url(),
+            "bai_v1.1.bogus.bogus",
+            body_for("gpt-4o")
+        )
+        .await,
+        401
+    );
+}
+
+#[tokio::test]
+async fn byo_passes_user_token_through_unchanged() {
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(1); // gateway still needs a signing key in config to boot
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let client = reqwest::Client::new();
+    // A raw provider token (not `bai_`) → BYO → forwarded verbatim.
+    {
+        let (c, u) = (client.clone(), gw.url());
+        wait_for_status(200, move || {
+            let (c, u) = (c.clone(), u.clone());
+            async move { post_status(&c, &u, "sk-user-byo", body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+    let cap = mock.captured().expect("mock received a request"); // latest request the mock saw
+    assert_eq!(
+        cap.authorization.as_deref(),
+        Some("Bearer sk-user-byo"),
+        "BYO token must pass through unchanged (no swap)"
+    );
+}
+
+#[tokio::test]
+async fn fireworks_provider_header_routes_and_swaps_pool_key() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(4);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 5,
+            vpc_id: 6,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    // Fireworks model ids contain `/`, so it's reached via the `x-beyond-provider` header, not
+    // model inference. A managed key must swap to the Fireworks-specific pool key.
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                post_status_provider(
+                    &c,
+                    &u,
+                    &k,
+                    "fireworks",
+                    body_for("accounts/fireworks/models/llama-v3p1-70b-instruct"),
+                )
+                .await
+            }
+        })
+        .await;
+    }
+
+    let cap = mock.captured().expect("mock received a request"); // latest request the mock saw
+    assert_eq!(
+        cap.authorization.as_deref(),
+        Some("Bearer sk-fireworks-pool"),
+        "managed Fireworks request must swap to the Fireworks pool key"
+    );
+    // Fireworks mounts the OpenAI surface under `/inference/v1`, not `/v1`: the client's
+    // `/v1/chat/completions` must be rewritten or the real upstream would 404.
+    assert_eq!(
+        cap.path, "/inference/v1/chat/completions",
+        "client `/v1` path must be remapped to the provider's base path"
+    );
+}
+
+#[tokio::test]
+async fn streaming_relays_sse_and_meters_usage() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(3);
+    let mock = MockUpstream::start(Mode::Sse).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 7,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    let body = r#"{"model":"gpt-4o","stream":true,"messages":[{"role":"user","content":"hi"}]}"#
+        .to_string();
+
+    {
+        let (c, u, k, b) = (client.clone(), gw.url(), vkey.clone(), body.clone());
+        wait_for_status(200, move || {
+            let (c, u, k, b) = (c.clone(), u.clone(), k.clone(), b.clone());
+            async move { post_status(&c, &u, &k, b).await }
+        })
+        .await;
+    }
+
+    let resp = client
+        .post(format!("{}/v1/chat/completions", gw.url()))
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(body)
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    assert!(resp.text().await.unwrap().contains("[DONE]")); // the SSE terminator reached the client
+    wait_for_metric(&gw, "ai_tokens_total", "input", 5.0).await; // 5 = `prompt_tokens` in the usage chunk
+}
+
+#[tokio::test]
+async fn blackhole_denies_then_restores() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(2);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 99, // same id as in the `blackhole.99` key below
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    let probe = |want: u16| {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        async move {
+            wait_for_status(want, move || {
+                let (c, u, k) = (c.clone(), u.clone(), k.clone());
+                async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+            })
+            .await
+        }
+    };
+
+    probe(200).await; // ready + allowed
+    put_kv(nats.port, "blackhole.99", b"spend").await;
+    probe(402).await; // denied once the watch delta lands
+    del_kv(nats.port, "blackhole.99").await;
+    probe(200).await; // restored
+}
+
+#[tokio::test]
+async fn blackhole_fraud_returns_403() {
+    // The spend path (402) is covered above; fraud takes the separate `DenyReason::Fraud` branch
+    // and must surface as 403 (not 402, not 200) end-to-end.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(20);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 1234, // same id as in the `blackhole.1234` key below
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    let probe = |want: u16| {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        async move {
+            wait_for_status(want, move || {
+                let (c, u, k) = (c.clone(), u.clone(), k.clone());
+                async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+            })
+            .await
+        }
+    };
+
+    probe(200).await; // ready + allowed
+    put_kv(nats.port, "blackhole.1234", b"fraud").await;
+    probe(403).await; // fraud → forbidden
+}
+
+#[tokio::test]
+async fn oversized_content_length_is_rejected_413() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(21);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 1,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    // Wait until the gateway is serving.
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+
+    // Declare a body of 200 MiB + 1 (> the 100 MiB guard) but send no body — the guard rejects on
+    // the Content-Length header in request_filter before any body is read.
+    let req = format!(
+        "POST /v1/chat/completions HTTP/1.1\r\n\
+         Host: x\r\n\
+         Authorization: Bearer {vkey}\r\n\
+         Content-Type: application/json\r\n\
+         Content-Length: 209715201\r\n\
+         Connection: close\r\n\r\n"
+    );
+    assert_eq!(raw_status(gw.port, &req).await, 413); // answered from the headers alone
+}
+
+#[tokio::test]
+async fn managed_key_via_x_api_key_header_is_accepted() {
+    // Anthropic SDKs present the key in `x-api-key`, not `Authorization: Bearer`. A managed virtual
+    // key must be extracted from either header; here it arrives via x-api-key on the OpenAI path and
+    // must still swap to the OpenAI pool key in the Bearer scheme the upstream wants.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(22);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 8,
+            vpc_id: 8,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                post_status_xapikey(&c, &u, "/v1/chat/completions", &k, body_for("gpt-4o")).await
+            }
+        })
+        .await;
+    }
+    let cap = mock.captured().expect("mock received a request");
+    assert_eq!(cap.authorization.as_deref(), Some("Bearer sk-pool-secret"));
+}
+
+#[tokio::test]
+async fn managed_key_for_unconfigured_provider_returns_503() {
+    // The default gateway configures OpenAI + Fireworks pool keys, but NOT Anthropic. A managed key
+    // routed to Anthropic (via the override header) has no pool key → 503, surfaced in
+    // request_filter before any upstream connect.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(23);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 11,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+    wait_for_status(503, move || {
+        let (c, u, k) = (c.clone(), u.clone(), k.clone());
+        async move {
+            post_status_provider(&c, &u, &k, "anthropic", body_for("claude-opus-4-8")).await
+        }
+    })
+    .await;
+}
+
+#[tokio::test]
+async fn anthropic_dialect_swaps_key_relays_and_meters() {
+    // The Anthropic path (`/v1/messages`) drives a different dialect, a different auth scheme
+    // (x-api-key, not Bearer), and a different usage parser than the OpenAI tests above.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(24);
+    let mock = MockUpstream::start(Mode::AnthropicJson).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .providers(&["anthropic"])
+        .start()
+        .await;
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 77,
+            vpc_id: 2,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                post_status_xapikey(&c, &u, "/v1/messages", &k, body_for("claude-opus-4-8")).await
+            }
+        })
+        .await;
+    }
+
+    let resp = client
+        .post(format!("{}/v1/messages", gw.url()))
+        .header("x-api-key", &vkey)
+        .header("content-type", "application/json")
+        .body(body_for("claude-opus-4-8"))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+
+    let cap = mock.captured().expect("mock received a request");
+    assert_eq!(cap.path, "/v1/messages");
+    // Anthropic wants the key in x-api-key, and the inbound virtual key must not leak upstream.
+    assert_eq!(cap.x_api_key.as_deref(), Some("sk-anthropic-pool"));
+    assert_eq!(cap.authorization, None);
+
+    wait_for_metric(&gw, "ai_tokens_total", "input", 13.0).await;
+}
+
+#[tokio::test]
+async fn missing_api_key_returns_401() {
+    // A request with neither Authorization nor x-api-key takes the "missing API key" branch — a
+    // different path than the malformed-key (invalid) branch the managed test exercises.
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(25);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let client = reqwest::Client::new();
+
+    let (c, u) = (client.clone(), gw.url());
+    wait_for_status(401, move || {
+        let (c, u) = (c.clone(), u.clone());
+        async move {
+            c.post(format!("{u}/v1/chat/completions"))
+                .header("content-type", "application/json")
+                .body(body_for("gpt-4o"))
+                .send()
+                .await
+                .map(|r| r.status().as_u16())
+                .unwrap_or(0)
+        }
+    })
+    .await;
+}
+
+#[tokio::test]
+async fn deny_set_is_fail_open_when_nats_drops() {
+    // After NATS goes away the last-known deny-set must be *retained* (fail-open), and auth/keys —
+    // which come from config, not NATS — must keep working.
+    let mut nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(26);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let denied = mint(
+        &VirtualKey {
+            tenant_id: 555,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let allowed = mint(
+        &VirtualKey {
+            tenant_id: 556,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    let probe = |key: String, want: u16| {
+        let (c, u) = (client.clone(), gw.url());
+        async move {
+            wait_for_status(want, move || {
+                let (c, u, k) = (c.clone(), u.clone(), key.clone());
+                async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+            })
+            .await
+        }
+    };
+
+    probe(denied.clone(), 200).await; // ready + allowed
+    put_kv(nats.port, "blackhole.555", b"spend").await;
+    probe(denied.clone(), 402).await; // deny delta landed
+
+    nats.stop(); // NATS disappears
+
+    probe(denied.clone(), 402).await; // stale deny retained, not cleared
+    probe(allowed.clone(), 200).await; // un-denied tenant still served without NATS
+}
+
+#[tokio::test]
+async fn streaming_tail_compaction_preserves_usage_event() {
+    // The usage chunk trails 130 KiB of content, forcing the proxy's response-tail compaction
+    // (resp_tail grows past 2× the 64 KiB cap). The usage event must survive in the retained tail.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(27);
+    let mock = MockUpstream::start(Mode::SseLarge).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 21,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    let body = r#"{"model":"gpt-4o","stream":true,"messages":[{"role":"user","content":"hi"}]}"#
+        .to_string();
+
+    {
+        let (c, u, k, b) = (client.clone(), gw.url(), vkey.clone(), body.clone());
+        wait_for_status(200, move || {
+            let (c, u, k, b) = (c.clone(), u.clone(), k.clone(), b.clone());
+            async move { post_status(&c, &u, &k, b).await }
+        })
+        .await;
+    }
+
+    let resp = client
+        .post(format!("{}/v1/chat/completions", gw.url()))
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(body)
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    let _ = resp.bytes().await.unwrap(); // drain the >128 KiB stream
+
+    wait_for_metric(&gw, "ai_tokens_total", "input", 5.0).await;
+}
+
+#[tokio::test]
+async fn on_disk_snapshot_enforces_across_restart_without_nats() {
+    // With a configured snapshot path, the deny-set is persisted to disk as deltas arrive. A restart
+    // must seed from that file and enforce immediately — even with NATS unreachable — proving the
+    // gateway reads the snapshot rather than re-scanning NATS on every boot.
+    let mut nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(28);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let snap = std::env::temp_dir().join(format!("beyond-ai-snap-{}.log", nats.port));
+    let _ = std::fs::remove_file(&snap);
+    let snap_str = snap.to_str().unwrap().to_string();
+
+    let denied = mint(
+        &VirtualKey {
+            tenant_id: 8800,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let allowed = mint(
+        &VirtualKey {
+            tenant_id: 8801,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    let probe = |gw_url: String, key: String, want: u16| {
+        let c = client.clone();
+        async move {
+            wait_for_status(want, move || {
+                let (c, u, k) = (c.clone(), gw_url.clone(), key.clone());
+                async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+            })
+            .await
+        }
+    };
+
+    // --- First run: blackhole the tenant; the delta is persisted to the snapshot. ---
+    {
+        let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+            .snapshot_path(&snap_str)
+            .start()
+            .await;
+        probe(gw.url(), denied.clone(), 200).await; // ready + allowed
+        put_kv(nats.port, "blackhole.8800", b"fraud").await;
+        probe(gw.url(), denied.clone(), 403).await; // applied in-memory AND appended to the snapshot
+        // Let the watcher's apply→persist step flush the checkpoint to disk before we kill it.
+        tokio::time::sleep(std::time::Duration::from_millis(400)).await;
+        // gw drops here → process killed.
+    }
+
+    // NATS goes away entirely: a restart has nothing to scan and must rely on the snapshot.
+    nats.stop();
+
+    // --- Restart against the same snapshot path, NATS down. ---
+    let gw2 = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .snapshot_path(&snap_str)
+        .start()
+        .await;
+    // Seeded from disk: the fraud hold is enforced even though NATS is unreachable.
+    probe(gw2.url(), denied, 403).await;
+    // And an un-denied tenant is still served (auth/keys are from config, not NATS).
+    probe(gw2.url(), allowed, 200).await;
+
+    let _ = std::fs::remove_file(&snap);
+}
diff --git a/crates/gateway/tests/smoke.rs b/crates/gateway/tests/smoke.rs
new file mode 100644
index 0000000..318b887
--- /dev/null
+++ b/crates/gateway/tests/smoke.rs
@@ -0,0 +1,202 @@
+//! Live smoke tests against **real** providers — the proof docs and the mock can't give:
+//! a real TLS/SNI handshake to the provider host, the base-path rewrite landing on a real mount
+//! (200, not 404), the **managed** path (verify → deny-check → pool-key swap), and a real
+//! (non-canned) response body.
+//!
+//! These exercise the **production** path, not BYO: the test generates an Ed25519 keypair, configures
+//! the *real* provider key (from the env var) as the gateway's pool key, mints a `bai_…` virtual key,
+//! and sends that. So the gateway verifies the virtual key, runs the deny-set check, and swaps in the
+//! real provider key before forwarding — the same flow a real managed tenant takes. The real key only
+//! ever lives in the gateway's config; the client presents the minted virtual key.
+//!
+//! Two safety layers so this never runs — or bills — by accident:
+//!   1. Every test is `#[ignore]`, so a plain `cargo test` skips the whole file.
+//!   2. When explicitly run, each test still **skips** (early-returns) unless its provider's API
+//!      key env var is set — so you only ever hit the providers you have keys for.
+//!
+//! Run them:
+//!   ANTHROPIC_API_KEY=sk-ant-… mise run test:smoke
+//!   # or directly:
+//!   ANTHROPIC_API_KEY=sk-ant-… cargo test -p beyond-ai --test smoke -- --ignored --nocapture
+//!
+//! Model ids are the cheapest small model per provider as of 2026-05; adjust if a provider retires
+//! one (a model-not-found is a stale id here, not a gateway bug).
+
+mod common;
+
+use beyond_ai::key::{VirtualKey, mint};
+use common::*;
+
+/// The provider's API key from `var`; `None` if unset or blank (→ test logs a skip and returns).
+fn env_key(var: &str) -> Option<String> {
+    std::env::var(var).ok().filter(|v| !v.trim().is_empty())
+}
+
+/// A gateway wired to the **real** provider hosts over TLS, with `provider`'s pool key set to the
+/// caller's real key and a signing key installed — so a minted virtual key for `provider` verifies
+/// and swaps to the real key. Returns the gateway plus the minted `bai_…` key to present as a client.
+/// (The caller-supplied `nats` server backs the deny-set, empty here — this tenant isn't denied.)
+async fn managed_gateway(nats: &Nats, provider: &str, real_key: &str) -> (Gateway, String) {
+    let (pubkey, sk) = test_keypair(7);
+    let gw = Gateway::builder(nats.port, "unused", &b64(&pubkey))
+        .real_upstreams()
+        .pool_key(provider, real_key)
+        .start()
+        .await;
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 1,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    (gw, vkey)
+}
+
+/// Drive one OpenAI-wire provider through the gateway as a managed request. `openai` is the dialect
+/// default (no header); everything else is selected by `x-beyond-provider`.
+async fn smoke_openai_wire(provider: &str, key_env: &str, model: &str) {
+    let Some(key) = env_key(key_env) else {
+        eprintln!("smoke[{provider}]: {key_env} unset — skipping");
+        return;
+    };
+    let nats = Nats::start().await;
+    let (gw, vkey) = managed_gateway(&nats, provider, &key).await;
+    let client = reqwest::Client::new();
+
+    let body = format!(
+        r#"{{"model":"{model}","max_tokens":16,"messages":[{{"role":"user","content":"Reply with the single word: ping"}}]}}"#
+    );
+    let mut req = client
+        .post(format!("{}/v1/chat/completions", gw.url()))
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json");
+    if provider != "openai" {
+        req = req.header("x-beyond-provider", provider);
+    }
+
+    let resp = req.body(body).send().await.expect("request to gateway");
+    let status = resp.status();
+    let text = resp.text().await.unwrap_or_default();
+    assert!(
+        status.is_success(),
+        "smoke[{provider}] model={model}: expected 2xx, got {status}.\n\
+         404 ⇒ base-path rewrite wrong; 401 ⇒ pool-key swap/verify; 403 ⇒ deny-set; \
+         a model error ⇒ stale model id. body: {text}"
+    );
+    assert!(
+        text.contains("\"choices\""),
+        "smoke[{provider}]: {status} but no `choices` in body: {text}"
+    );
+    eprintln!("smoke[{provider}]: OK ({status}) — verified, swapped, real 2xx");
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_anthropic() {
+    let Some(key) = env_key("ANTHROPIC_API_KEY") else {
+        eprintln!("smoke[anthropic]: ANTHROPIC_API_KEY unset — skipping");
+        return;
+    };
+    let nats = Nats::start().await;
+    let (gw, vkey) = managed_gateway(&nats, "anthropic", &key).await;
+    let client = reqwest::Client::new();
+
+    // `/v1/messages` → Anthropic dialect → provider `anthropic`. The minted virtual key is presented
+    // in `x-api-key` (the Anthropic SDK's header); the gateway verifies it and swaps in the real key
+    // — again in `x-api-key` (not Bearer). The required `anthropic-version` header passes through.
+    // This is the *only* test covering the x-api-key auth scheme + a real TLS handshake to
+    // api.anthropic.com via the full managed path.
+    let body = r#"{"model":"claude-haiku-4-5","max_tokens":16,"messages":[{"role":"user","content":"Reply with the single word: ping"}]}"#;
+    let resp = client
+        .post(format!("{}/v1/messages", gw.url()))
+        .header("x-api-key", &vkey)
+        .header("anthropic-version", "2023-06-01")
+        .header("content-type", "application/json")
+        .body(body)
+        .send()
+        .await
+        .expect("request to gateway");
+    let status = resp.status();
+    let text = resp.text().await.unwrap_or_default();
+    assert!(
+        status.is_success(),
+        "smoke[anthropic]: expected 2xx, got {status}. body: {text}"
+    );
+    assert!(
+        text.contains("\"content\""),
+        "smoke[anthropic]: {status} but no `content` in body: {text}"
+    );
+    eprintln!("smoke[anthropic]: OK ({status}) — verified, swapped to x-api-key, real 2xx");
+}
+
+// --- OpenAI-wire providers. Same code path; testing more than one confirms each host/base-path/auth
+// row in `route::KNOWN_PROVIDERS` against the real endpoint. ---
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_openai() {
+    smoke_openai_wire("openai", "OPENAI_API_KEY", "gpt-4o-mini").await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_groq() {
+    // Proves the `/v1` → `/openai/v1` rewrite against a real mount (the highest-value rewrite case).
+    smoke_openai_wire("groq", "GROQ_API_KEY", "llama-3.1-8b-instant").await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_fireworks() {
+    // Proves the `/v1` → `/inference/v1` rewrite against a real mount.
+    smoke_openai_wire(
+        "fireworks",
+        "FIREWORKS_API_KEY",
+        "accounts/fireworks/models/llama-v3p1-8b-instruct",
+    )
+    .await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_openrouter() {
+    // Proves the `/v1` → `/api/v1` rewrite against a real mount.
+    smoke_openai_wire("openrouter", "OPENROUTER_API_KEY", "openai/gpt-4o-mini").await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_deepseek() {
+    smoke_openai_wire("deepseek", "DEEPSEEK_API_KEY", "deepseek-chat").await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_together() {
+    smoke_openai_wire(
+        "together",
+        "TOGETHER_API_KEY",
+        "meta-llama/Llama-3.1-8B-Instruct-Turbo",
+    )
+    .await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_cerebras() {
+    smoke_openai_wire("cerebras", "CEREBRAS_API_KEY", "llama3.1-8b").await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_mistral() {
+    smoke_openai_wire("mistral", "MISTRAL_API_KEY", "mistral-small-latest").await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_xai() {
+    smoke_openai_wire("xai", "XAI_API_KEY", "grok-3-mini").await;
+}
diff --git a/mise.toml b/mise.toml
new file mode 100644
index 0000000..55c7edc
--- /dev/null
+++ b/mise.toml
@@ -0,0 +1,52 @@
+[tools]
+dprint = "latest"
+rust = { version = "1.92", components = "rustfmt,clippy", targets = "aarch64-unknown-linux-gnu,x86_64-unknown-linux-gnu" }
+yamlfmt = "latest"
+cargo-binstall = "latest"
+"cargo:cross" = "latest"
+# nats-server for the e2e harness (real JetStream KV backing slipstream).
+"ubi:nats-io/nats-server" = { version = "latest", exe = "nats-server" }
+
+[tasks."build:rs"]
+run = "cargo build"
+
+[tasks."build:rs:release"]
+run = "cargo build --release"
+
+[tasks."check:rs"]
+run = "cargo clippy --workspace --all-targets -- -D warnings"
+
+[tasks."check:fmt"]
+run = "dprint check"
+
+[tasks."format"]
+run = "dprint fmt && cargo fmt"
+
+[tasks."test:unit:rs"]
+description = "Unit tests for the pure-logic modules (key/route/peek/usage/deny/config/resolver)."
+run = "cargo test --lib"
+
+# Integration tests (real gateway binary driven against a mock upstream + NATS); see ARCHITECTURE.md.
+[tasks."test:integration:rs"]
+description = "End-to-end gateway tests against a mock provider + a real nats-server (live smoke tests stay #[ignore]d)."
+run = "cargo test --test '*'"
+
+[tasks."test:smoke"]
+description = "Live smoke tests against REAL providers. Auto-loads .env if present; set the API keys you have (ANTHROPIC_API_KEY, OPENAI_API_KEY, GROQ_API_KEY, …) there or in the environment. Each test skips if its key is unset. Bills real (tiny, max_tokens-capped) requests."
+run = """
+# Auto-load .env if it exists (export every assignment), so the keys reach the test process.
+if [ -f .env ]; then set -a; . ./.env; set +a; fi
+cargo test -p beyond-ai --test smoke -- --ignored --nocapture
+"""
+
+[tasks."bench:unit"]
+description = "divan micro-benchmarks of the IO-free hot paths (key/peek/usage/route/deny): timing + native allocation counts."
+run = "cargo bench --bench unit"
+
+[tasks."bench:e2e"]
+description = "A-1 end-to-end bench: real beyond-ai binary + nats-server + mock upstream over HTTP."
+run = "cargo bench --bench e2e"
+
+[tasks."bench"]
+description = "Run both bench harnesses (unit micro + e2e macro)."
+depends = ["bench:unit", "bench:e2e"]

From 246e88d5dc2abd0133727fe21893d3d6dd361a68 Mon Sep 17 00:00:00 2001
From: Jared Lunde <jared.lunde@gmail.com>
Date: Sun, 31 May 2026 12:20:21 -0700
Subject: [PATCH 2/7] fixes

---
 ARCHITECTURE.md                   | 245 +++++++++++++++++++++------
 README.md                         |   4 +-
 config.example.toml               |  19 ++-
 crates/gateway/benches/e2e.rs     |  62 ++++++-
 crates/gateway/benches/unit.rs    |  91 +++++++++-
 crates/gateway/src/config.rs      |  32 +++-
 crates/gateway/src/deny.rs        |  15 +-
 crates/gateway/src/key.rs         |  26 ++-
 crates/gateway/src/peek.rs        | 259 +++++++++++++++++++++++++++++
 crates/gateway/src/proxy.rs       | 266 ++++++++++++++++++++++++------
 crates/gateway/src/ratelimit.rs   | 246 +++++++++++++++++++--------
 crates/gateway/src/state.rs       |  44 +++--
 crates/gateway/src/store_watch.rs |  74 +++++++--
 13 files changed, 1149 insertions(+), 234 deletions(-)

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index e307e66..f9a3b7f 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -7,6 +7,23 @@ response untouched, and emits token-usage facts for billing.
 **Self-contained:** no `path` deps into the `beyond` repo. Depends only on crates.io + the published
 `beyond-slipstream` — so it clones/CI-builds/publishes anywhere.
 
+## Concepts & Terminology
+
+| Term                                             | What It Controls / Gates                                                               | NOT                                                                          |
+| ------------------------------------------------ | -------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
+| **Managed key** (`bai_v1.…`)                     | Ed25519-verified identity; enables key swap, deny-set check, and `ai.usage` billing    | A session token or capability grant — just tenant attribution                |
+| **BYO key** (anything else)                      | Forwarded as-is to the provider; no swap, no billing, no deny-set                      | A lesser tier — same proxy, minus attribution and billing                    |
+| **Pool key**                                     | Real provider API key held by the gateway; swapped in for managed traffic              | Per-tenant — one key per provider, shared by all managed callers             |
+| **Tenant**                                       | The billing entity from the virtual key payload (`tenant_id: u32`)                     | An org, user, or namespace — an opaque integer the gateway doesn't interpret |
+| **Dialect**                                      | Wire protocol implied by the request path (OpenAI `/v1/…` vs Anthropic `/v1/messages`) | The provider — dialect determines auth scheme and usage parsing format       |
+| **Provider**                                     | Named row in the routing table: authority, base path, auth scheme                      | A vendor relationship — just connection facts and auth wiring                |
+| **Deny-set**                                     | Sparse set of denied `tenant_id`s; gates managed traffic; default-allow                | An allowlist or ACL — misses are allowed, not blocked                        |
+| **Tail tap**                                     | Bounded 64KB window kept from the end of the response for usage extraction             | A buffer or copy — the response is relayed unbuffered; only the tail is kept |
+| **Snapshot**                                     | On-disk deny-set cache (entries + NATS cursor) for edge/tunnel deployments             | Persistent store — a pure cache; delete it and the gateway re-scans NATS     |
+| **Virtual key** (`bai_v1.{kid}.{payload}.{sig}`) | Ed25519-signed token encoding `tenant_id` + `vpc_id`                                   | A session or auth token — stateless, no server-side lookup, no revocation    |
+
+---
+
 ## Request flow (`proxy.rs`)
 
 ```
@@ -14,13 +31,14 @@ client (stock SDK, Bearer/ x-api-key)
    │
    ▼ request_filter
    ├─ provider = dialect(path) [+ x-beyond-provider override]   (unknown → 400)
-   ├─ extract key
+   ├─ extract key                                               (missing → 401)
+   ├─ rate guardrails ← BEFORE verify/connect: per-credential (seeded raw-key hash) +
+   │                    global BYO aggregate (managed exempt; protects egress IPs); over → 429
    ├─ Content-Length abuse guard (declared size; streamed total enforced in body filter too)
    ├─ key format branch:
    │    • bai_…  → MANAGED: Ed25519 verify (stateless) → {tenant_id, vpc_id}
    │              → deny-set check (O(1), default-allow) → require pool key
    │    • else   → BYO: the user's own provider token, passed through unchanged
-   └─ per-key rate guardrail (tenant / BYO-token hash; over ceiling → 429)
    ▼ upstream_peer        — TTL-cached DNS resolve → HttpPeer (no blocking getaddrinfo)
    ▼ upstream_request_filter — managed: swap auth header to pool key; BYO: leave it. Set Host.
    ▼ request_body_filter  — STREAM BODY THROUGH (never buffered); feed bytes to a structural
@@ -29,6 +47,7 @@ client (stock SDK, Bearer/ x-api-key)
    ▼ response_filter      — TTFT; streaming? = response Content-Type is text/event-stream
    ▼ response_body_filter — relay unbuffered; keep a bounded 64KB tail for the usage tap
    ▼ logging              — parse usage from tail (by dialect+streaming); emit `ai.usage` fact
+   │                         (managed only — BYO has no tenant to bill); metrics count all traffic
         upstream: a registered provider (openai, anthropic, openrouter, fireworks,
                   groq, deepseek, together, cerebras, mistral, xai — + config-added)
 ```
@@ -39,8 +58,9 @@ client (stock SDK, Bearer/ x-api-key)
   fail-open. Auth and keys do **not** depend on NATS.
 - **Config (boot, SSM/env):** `signing_keys` (Ed25519 **public** keys by kid — multiple for
   rotation), `pool_keys` (managed pool keys **by provider name**, from `AI_POOL_KEY_<NAME>` env),
-  `provider_authorities` (per-name authority overrides / additions), `rate_limit_rps` (per-key
-  request ceiling; 0 disables), `snapshot_path` (optional on-disk deny-set cache; see below),
+  `provider_authorities` (per-name authority overrides / additions), `rate_limit_rps` (per-credential
+  request ceiling; 0 disables), `byo_rate_limit_rps` (aggregate ceiling for _all_ BYO traffic — the
+  egress-IP guard; 0 disables), `snapshot_path` (optional on-disk deny-set cache; see below),
   timeouts. Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret`, so a stray
   `Debug`/`Serialize` of the config can't leak them. See `config.example.toml`.
 - **The virtual key (`bai_v1.{kid}.{payload}.{sig}`):** Ed25519-signed, payload = `{tenant_id,
@@ -50,11 +70,15 @@ client (stock SDK, Bearer/ x-api-key)
 ## Key invariants
 
 - **Managed vs BYO by key format.** `bai_…` → verify + swap to pool key. Anything else → the user's
-  real token, passed through (no swap, no deny-set, no per-tenant attribution).
+  real token, passed through (no swap, no deny-set, no per-tenant attribution, and **no `ai.usage`
+  billing event** — it would be an unbillable `tenant_id=0` row; aggregate metrics still count it).
 - **Request body is never buffered** — it streams through with original framing; a streaming
   structural scanner (`peek::ModelScanner`, O(1), SIMD `memchr` skip over big values) extracts the
-  exact root-level `model`. (Trade-off: OpenAI streaming without `stream_options.include_usage`
-  isn't metered — the SDK/platform can set it.)
+  exact root-level `model`. **One exception:** a _managed_ OpenAI chat/responses request is buffered
+  so the gateway can inject `stream_options.include_usage` when the client streams without it —
+  otherwise OpenAI emits no usage chunk and the request is unmeterable. Works out of the box (no
+  client/SDK cooperation), framed upstream as chunked, bounded by `MAX_REQUEST_BODY`, scoped to that
+  one path — BYO and everything else stay pure passthrough.
 - **Response is never buffered** — relayed chunk-by-chunk; a bounded 64KB tail feeds the usage tap.
 - **Deny-set is `O(denied)`, default-allow, fail-open.** Restore = explicit delete or TTL expiry.
   Seeding is **gap-free**: the seed records the stream revision it reflects, and the watch _resumes
@@ -68,11 +92,32 @@ client (stock SDK, Bearer/ x-api-key)
   append each applied delta back to the file. The snapshot is a pure cache — delete it and the
   gateway falls back to scanning; a `CursorExpired` (history compacted past the cursor) does the same.
 - **Auth works without NATS** (keys from config); a NATS outage only staleens the deny-set.
-- **Per-key rate guardrail, not a spend control.** The deny-set is the spend/fraud authority but
-  reacts on a lag and never sees floods that don't bill (auth failures, 4xx, BYO). A fixed-memory
-  count-min limiter (`ratelimit`, pingora-limits) caps a single tenant's / BYO caller's request
-  velocity — bounding a leaked/runaway key during deny-set lag and a retry-storm flood. Generous by
-  default (a circuit breaker, not a quota); `rate_limit_rps = 0` disables it.
+- **Two-tier rate guardrail, checked _before_ verify/connect, not a spend control.** The deny-set is
+  the spend/fraud authority but reacts on a lag and never sees floods that don't bill (auth failures,
+  4xx, BYO). Two fixed-memory count-min tiers (`ratelimit`, pingora-limits) cap velocity:
+  - **Per-credential** — keyed by a seeded hash of the raw presented key (so collisions can't be
+    precomputed to false-throttle another caller). Bounds a leaked/runaway key during deny-set lag, a
+    retry-storm flood, **and the Ed25519-verify cost of a forged-key flood**: keying on the raw
+    credential (not the verified tenant) is what lets the guard sit _ahead of_ the verify (the
+    gateway's one ~28µs/req CPU cost; see Benchmarking), so a single bad token can't drive unbounded
+    crypto work. Granularity is per-credential ≈ per-(tenant, app), since virtual keys are
+    deterministic per that pair — not a per-tenant aggregate.
+  - **Global BYO aggregate** — one shared bucket for _all_ BYO traffic. BYO connects outward to
+    providers _from our egress IPs_ carrying the caller's token, so a flood of distinct **junk** BYO
+    tokens (which slip past per-credential keying — each is its own bucket) would get those IPs
+    rate-limited or banned by the provider, hurting _everyone_. This bounds that aggregate regardless
+    of token variation. **Managed traffic is exempt** — it's verified before any upstream connect and
+    can't be forged, so a random `bai_…` flood fails verify and never reaches a provider; exempting it
+    keeps this shared bucket from ever shedding core tenant load. **Per-source-IP was considered and
+    rejected** as the primary control: it depends on the calling task's real IP surviving ECS Service
+    Connect (unconfirmed), and is worse than nothing if the peer is a collapsed mesh hop — so we chose
+    the topology-independent aggregate. The blunt cap's residual (it sheds legit BYO under a flood; the
+    default is an untuned guess; the real selective fix is a provider-feedback circuit breaker on
+    upstream 401s) is recorded in full in the `ratelimit` **module-doc decision block** — read it
+    before changing the knob or reaching for per-IP.
+
+  Both tiers are generous circuit breakers, not quotas; `rate_limit_rps = 0` / `byo_rate_limit_rps = 0`
+  disable them independently.
 - **Routing is dialect-based** (model isn't known before peer selection); any non-default provider
   is reached via the `x-beyond-provider: <name>` header. **Providers are data** — a row in
   `route::KNOWN_PROVIDERS` (name, authority, **base path**, auth scheme) or a config entry — so
@@ -81,24 +126,100 @@ client (stock SDK, Bearer/ x-api-key)
   prefix is rewritten to the provider's mount point (Groq `/openai/v1`, Fireworks `/inference/v1`,
   OpenRouter `/api/v1`) so a verbatim passthrough can't 404.
 - **Connect retries only** (`fail_to_connect`); no HTTP-status retry (Pingora-idiomatic, SDKs back off).
+- **`ai.usage` carries _both_ models: `model` (resolved) + `requested_model` (alias).** `model` is
+  the id the provider resolved + billed, taken from the _response_ (a second `ModelScanner` over the
+  response head; works for SSE — it skips the `data:` prefix and reads the first chunk's root
+  `model`). It's the key for pricing **and** for reconciling against the provider's invoice, which
+  itemizes by the pinned snapshot (`gpt-4o-2024-08-06`), not the alias. `requested_model` is what the
+  client sent (`gpt-4o`) — product analytics, and a fallback rate when a snapshot is newer than the
+  downstream price table. The two are equal when the response carried no model (error body), where
+  `model` falls back to the alias. Emitting both is additive: a consumer that keyed on the alias
+  doesn't break, and reconciliation still gets the exact id.
 - **Pricing is never here** — emit token _facts_; a closed downstream consumer prices.
 
+## Trust Boundaries
+
+**What the gateway verifies (rejects if invalid):**
+
+- Virtual key signature (Ed25519, stateless — no DB lookup)
+- Virtual key format (`bai_v1.{kid}.{payload}.{sig}`, fixed 16-byte payload)
+- Tenant not in deny-set (managed traffic only)
+- Pool key configured for the requested provider (managed traffic only)
+- Request body size ≤ `MAX_REQUEST_BODY` (declared Content-Length + streaming running total)
+- Per-credential request rate within ceiling; aggregate BYO rate within ceiling
+
+**What passes through unchecked:**
+
+- Request body content and schema — no validation at the gateway layer
+- Model name in the request — extracted for billing facts, never validated against an allowlist
+- Provider response content — relayed byte-for-byte
+- BYO token validity — forwarded as-is; the provider rejects it if invalid
+- `vpc_id` in the virtual key — decoded and emitted in billing facts, not used for access control
+
+**Why these boundaries are where they are:**
+
+- Body schema validation belongs to the provider — duplicate validation adds latency without a
+  security benefit at the gateway layer
+- Model validation would require a per-provider allowlist coupled to model release cadence
+- BYO token validation requires a provider round-trip — the provider does it anyway
+
+---
+
+## Configuration
+
+All fields configurable via `config.example.toml` and environment (`AI_` prefix, flat merge).
+Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret` — stray `Debug`/`Serialize`
+output redacts values.
+
+| Field                         | Default                           | Runtime Effect                                                                                                                                                   |
+| ----------------------------- | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `signing_keys`                | _(required)_                      | Map of kid → base64 Ed25519 public key. Multiple kids enable rotation. Missing → all traffic falls through to BYO treatment.                                     |
+| `pool_keys.<name>`            | _(from `AI_POOL_KEY_<NAME>` env)_ | Real provider API key. Missing for a provider → managed requests to that provider return 503.                                                                    |
+| `provider_authorities.<name>` | _(none)_                          | Override or add a provider's `authority` (host:port). Enables config-added providers beyond `KNOWN_PROVIDERS` with zero code change.                             |
+| `snapshot_path`               | _(unset)_                         | Path for the on-disk deny-set cache. Unset → re-scan NATS on every cold boot. Set → load from disk and enforce before NATS reconnects (edge/tunnel deployments). |
+| `rate_limit_rps`              | `100`                             | Per-credential request ceiling (count-min, keyed on raw key hash). `0` disables. Exceeded → 429. Checked before Ed25519 verify.                                  |
+| `byo_rate_limit_rps`          | `1000`                            | Aggregate ceiling for all BYO traffic (single shared bucket). `0` disables. Managed traffic exempt.                                                              |
+| `connect_timeout_secs`        | `10`                              | TCP connect timeout to the upstream provider. Exceeded → retry up to 2×, then 502.                                                                               |
+| `read_timeout_secs`           | `600`                             | Response read timeout. 10 minutes accommodates long-running LLM streams.                                                                                         |
+| `nats_url`                    | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (stale or empty set).                                                                              |
+| `nats_creds`                  | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                 |
+| `listen_addr`                 | `0.0.0.0:8080`                    | Proxy listener address.                                                                                                                                          |
+| `prometheus_addr`             | `0.0.0.0:9090`                    | Prometheus `/metrics` scrape endpoint.                                                                                                                           |
+
+---
+
+## Failure Modes
+
+| Failure                                     | What Actually Happens                                                                                       | Recovery                                                                                    |
+| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                     | Watcher reconnects; seeds from NATS or disk snapshot on connect.                            |
+| NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                             | Watcher reconnects and resumes from saved revision — no re-scan needed.                     |
+| NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                     | After re-scan, new cursor set; delta watch resumes normally.                                |
+| Virtual key tampered or forged              | Ed25519 verify fails → request rejected before any upstream connect. No billing event.                      | Client must present a valid key; the forged token never reaches a provider.                 |
+| Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                 | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                  |
+| Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                              | TTL-cached DNS (60s) serves stale; poisoned-lock guard re-resolves on next request.         |
+| Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502.                                                       | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                |
+| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail. | No action — O(1) tail tap is designed for this; SSE usage is always in the final data line. |
+| Gateway crash mid-request                   | In-flight request drops; client receives TCP close, not a structured error. No partial state written.       | Client SDK retries. No DB writes in the request path — no cleanup needed.                   |
+
+---
+
 ## Modules
 
-| Module                    | Role                                                                        | Tested        |
-| ------------------------- | --------------------------------------------------------------------------- | ------------- |
-| `key`                     | `bai_v1` parse + Ed25519 verify + mint; stateless identity                  | unit ✓        |
-| `route`                   | data-driven provider table (name/authority/auth) + dialect default          | unit ✓        |
-| `peek`                    | `ModelScanner` — streaming structural scan for the exact root-level `model` | unit ✓        |
-| `usage`                   | token extraction (OpenAI/Anthropic, body + SSE)                             | unit ✓        |
-| `deny`                    | sparse deny-set, default-allow, reason → status                             | unit ✓        |
-| `ratelimit`               | per-key request guardrail (count-min, fixed memory, no GC)                  | unit ✓        |
-| `secret`                  | redacting, zeroize-on-drop `Secret` newtype                                 | unit ✓        |
-| `config`                  | Figment config; build keyring; pool keys/authorities by provider name       | unit ✓        |
-| `state`                   | keyring + resolved provider registry + watched deny-set + TTL DNS cache     | unit ✓        |
-| `store_watch`             | the single NATS watcher (deny-set), as a Pingora `BackgroundService`        | —             |
-| `proxy`                   | the `ProxyHttp` impl                                                        | e2e ✓         |
-| `metrics`/`doctor`/`main` | Prometheus, diagnostics, bootstrap                                          | e2e/compile ✓ |
+| Module                    | Role                                                                          | Tested        |
+| ------------------------- | ----------------------------------------------------------------------------- | ------------- |
+| `key`                     | `bai_v1` parse + Ed25519 verify + mint; stateless identity                    | unit ✓        |
+| `route`                   | data-driven provider table (name/authority/auth) + dialect default            | unit ✓        |
+| `peek`                    | `ModelScanner` — streaming structural scan for the exact root-level `model`   | unit ✓        |
+| `usage`                   | token extraction (OpenAI/Anthropic, body + SSE)                               | unit ✓        |
+| `deny`                    | sparse deny-set, default-allow, reason → status                               | unit ✓        |
+| `ratelimit`               | two-tier guardrail: per-credential + global BYO (count-min, fixed mem, no GC) | unit ✓        |
+| `secret`                  | redacting, zeroize-on-drop `Secret` newtype                                   | unit ✓        |
+| `config`                  | Figment config; build keyring; pool keys/authorities by provider name         | unit ✓        |
+| `state`                   | keyring + resolved provider registry + watched deny-set + TTL DNS cache       | unit ✓        |
+| `store_watch`             | the single NATS watcher (deny-set), as a Pingora `BackgroundService`          | —             |
+| `proxy`                   | the `ProxyHttp` impl                                                          | e2e ✓         |
+| `metrics`/`doctor`/`main` | Prometheus, diagnostics, bootstrap                                            | e2e/compile ✓ |
 
 ## Verification
 
@@ -125,34 +246,54 @@ client (stock SDK, Bearer/ x-api-key)
 
 ## Benchmarking
 
-Two harnesses, best-tool-per-job, mirroring the unit/e2e split of the tests:
+Two harnesses, best-tool-per-job, mirroring the unit/e2e split of the tests. The framing is
+**Theory of Constraints**: a proxy's steady-state constraint is upstream I/O, not gateway CPU — the
+whole design exists to _stay off the critical path_. So the benches don't chase micro-optimizations;
+they **prove the gateway's added cost is negligible and bounded**, i.e. that we never become the
+constraint. Every bench maps to a function that runs on the per-request hot path (`proxy.rs`).
 
 - **Unit micro (`benches/unit.rs`, `mise run bench:unit`) — `divan`.** Times the IO-free hot paths
-  (`key` verify/mint, `peek::ModelScanner` over 0/4KB/256KB bodies with `model` placed _last_ =
-  worst case, `usage` parsers, `route`, `deny`) **and** measures allocations natively: divan's
-  `AllocProfiler` (installed as the global allocator) reports alloc/dealloc/grow **count + bytes**
-  beside ns/iter, no extra plumbing — and stays clear of the crate's `#![deny(unsafe_code)]` (a
-  hand-rolled `GlobalAlloc` would need `unsafe impl`). This makes the design's allocation claims
-  _assertable_: `key/verify` shows **0 allocs** (stack-only decode — divan omits the alloc rows
-  entirely), `peek` a flat **1 alloc** independent of body size (the O(1)-memory claim),
-  `route`/`deny::parse_key` **0 allocs**. A regression surfaces as a non-zero / grown number.
+  **and** measures allocations natively: divan's `AllocProfiler` (installed as the global allocator)
+  reports alloc/dealloc/grow **count + bytes** beside ns/iter, no extra plumbing — and stays clear of
+  the crate's `#![deny(unsafe_code)]` (a hand-rolled `GlobalAlloc` would need `unsafe impl`). Coverage
+  follows the hot path: `key` verify/mint; `peek::ModelScanner` over 0/4KB/256KB bodies with `model`
+  placed _last_ = worst case; `usage` parsers; `route`; `deny` (both the off-path ingest parse,
+  `parse_key`/`parse_reason`, **and** the on-path `reason()` lookup run on every managed request); and
+  `ratelimit::check` (both tiers — `check_managed` runs the per-credential tier only; `check_byo` runs
+  the per-credential tier **plus** the global BYO aggregate bucket). This makes the design's
+  allocation/complexity claims _assertable_: `key/verify` shows **0 allocs** (stack-only
+  decode — divan omits the alloc rows entirely), `peek` a flat **1 alloc** independent of body size
+  (the O(1)-memory claim), `route`/`deny::parse_key` **0 allocs**, **`deny::reason` is 0-alloc and flat
+  across 0→1M denied tenants** (the O(1)-lookup, `O(denied)`-memory claim — ~1ns/8ns), and
+  **`ratelimit::check` is 0-alloc** (~43ns managed / ~83ns BYO — the delta is the second tier's bucket
+  `observe` plus hashing a longer token; fixed-memory count-min, no per-credential entry). A regression
+  surfaces as a non-zero / grown / size-scaling number. **The headline this bench exists to assert:
+  `key/verify` ≈ 28µs is at least ~350× every other per-request op** (deny lookup, ratelimit, route all in
+  the **nanoseconds**), so verify is the gateway's one real per-request CPU cost — the constraint that
+  motivates checking the rate guardrails _before_ it (`proxy::request_filter`), so a forged-key flood
+  is rejected for tens of ns instead of ~28µs each. Everything else is allocation-free and invisible
+  against a network round trip.
 - **A-1 end-to-end (`benches/e2e.rs`, `mise run bench:e2e`) — `criterion`.** The real `beyond-ai`
-  binary + real nats-server + mock upstream (reuses `tests/common` verbatim), driven over real HTTP
-  — measures the whole request path: single-request latency + concurrent throughput. criterion is
-  chosen here for its saved-baseline comparison (`--save-baseline`), which tracks latency/RPS drift
-  across runs. Allocations are _not_ measured (the gateway is a separate process — its heap is
-  invisible to the bench); that's the unit bench's job. Needs `nats-server` on PATH (mise provides
-  it).
+  binary + real nats-server + mock upstream (reuses `tests/common` verbatim), driven over real HTTP —
+  measures the whole request path across four cases that **decompose** where time goes:
+  `reject_missing_key_latency` (401, short-circuited before any upstream connection — the bare
+  transport floor), `byo_json_latency` (pure passthrough), `managed_json_latency` (verify + deny +
+  key swap), and `managed_sse_latency` (exercises the streaming response tap: tail buffer + bounded
+  compaction). Plus a concurrent-throughput group. criterion is chosen for its saved-baseline
+  comparison (`--save-baseline`), which tracks latency/RPS drift across runs. Allocations are _not_
+  measured (the gateway is a separate process — its heap is invisible to the bench); that's the unit
+  bench's job. Needs `nats-server` on PATH (mise provides it).
+  - **What the decomposition shows (loopback laptop) — and its limit:** all four cases land in a
+    ~110–120µs band, and run-to-run variance is **±15–20µs** (loopback sub-150µs round-trips are
+    dominated by OS scheduling jitter). That noise floor is _larger_ than the gateway's own per-request
+    CPU (verify ≈28µs, everything else ns) — so this harness **cannot resolve** the verify cost, and the
+    reject/BYO/managed cases are statistically indistinguishable here. Two honest conclusions follow:
+    (1) the right tool for the gateway's CPU cost is the in-process `unit` bench, not this one; (2) for
+    _legitimate_ managed traffic the e2e latency is **expected to be flat** across the verify reorder —
+    moving the rate guard before verify doesn't change the legit path (verify still runs); its win is on
+    the _throttled_ path (verify skipped, proven at the unit level: ~43ns vs ~28µs) and in per-request
+    allocator pressure (the lazy `resp_tail`, below this harness's resolution). What this harness _is_
+    good for: catching gross regressions (a buffering mistake, a dropped connection-pool, an O(n) added
+    to the path would move the band by far more than 20µs) and the saved-baseline RPS trend over time.
 
 `mise run bench` runs both.
-
-## Out of scope / deferred
-
-- **Go control plane** (mint/inject virtual keys, write deny entries) — separate workstream; the
-  e2e mints keys directly.
-- **OpenAI `stream_options` injection** — dropped to keep the request body a pure passthrough.
-- **HTTP 5xx/429 response retries + `Retry-After`** — non-idiomatic in Pingora 0.8; SDKs back off.
-- **Trickle/cancel e2e** — SSE relay is covered; incremental-timing/cancel assertions are flaky.
-- Cross-dialect IR translation; caching; guardrails; ClickHouse ingestion wiring (table exists).
-- **Anthropic streaming input tokens** can sit in `message_start` (response head) outside the 64KB
-  usage tail on very long streams — a pre-existing tap limitation.
diff --git a/README.md b/README.md
index 7c9309f..38a0931 100644
--- a/README.md
+++ b/README.md
@@ -26,10 +26,10 @@ client = OpenAI(base_url="http://ai.internal/v1", api_key="sk-your-openai-key")
 ## What It Does
 
 - **Managed keys** (`bai_v1…`) — Ed25519-verified, stateless. Swaps to the pool key. Attributes usage to tenant + VPC. Deny-set checked (spend/fraud).
-- **BYO keys** — any other token passes through to the provider untouched. No attribution, no deny-set, no metering.
+- **BYO keys** — any other token passes through to the provider untouched. No key-swap, no deny-set, no attribution, no `ai.usage` billing event (aggregate throughput metrics still count it).
 - **10 providers, zero config** — openai, anthropic, openrouter, fireworks, groq, deepseek, together, cerebras, mistral, xai. Add more in `config.toml` under `[provider_authorities]`.
 - **Never buffers** — request and response stream through; a SIMD scanner extracts `model` in O(1) memory. 64KB tail taps usage without holding the body.
-- **Token facts, not pricing** — emits `ai.usage` events to slipstream. A downstream consumer prices.
+- **Token facts, not pricing** — emits `ai.usage` token-count events as structured logs (stdout → logfwd/OTLP → ClickHouse). A closed downstream consumer prices; slipstream carries only the deny-set.
 - **Rate guardrail** — per-key request ceiling (`rate_limit_rps`). Circuit breaker against runaway keys. Deny-set owns spend control.
 - **Fail-open NATS** — auth works without NATS. A NATS outage stales the deny-set; existing allows stay allowed.
 
diff --git a/config.example.toml b/config.example.toml
index 6fd258c..9063339 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -26,13 +26,22 @@ idle_timeout_secs = 90
 
 # upstream_tls = true   # set false only for a plaintext mock (tests)
 
-# Per-key request-rate ceiling (requests/sec) — a blast-radius circuit breaker, not a spend control
-# (the deny-set owns spend). Caps how fast one tenant (managed) or BYO caller can drive the gateway,
-# bounding a leaked/runaway key during the deny-set's reaction lag and a failure flood that never
-# bills. Generous by default so legitimate traffic never trips it; set 0 to disable. Tune from the
-# `ai_rejections_total{reason="rate_limit"}` metric.
+# Per-credential request-rate ceiling (requests/sec) — a blast-radius circuit breaker, not a spend
+# control (the deny-set owns spend). Caps how fast one credential (managed virtual key ≈ a tenant+app,
+# or a BYO token) can drive the gateway, bounding a leaked/runaway key during the deny-set's reaction
+# lag and a failure flood that never bills. Generous by default so legitimate traffic never trips it;
+# set 0 to disable. Tune from the `ai_rejections_total{reason="rate_limit"}` metric.
 rate_limit_rps = 100
 
+# Aggregate request-rate ceiling (requests/sec) for ALL BYO traffic combined — one shared bucket.
+# BYO is unverified and upstream-bound: a flood of *distinct* random BYO tokens slips past the
+# per-credential ceiling and would open junk-auth connections to providers from our egress IPs,
+# getting them rate-limited or banned. This bounds that aggregate regardless of token variation.
+# Managed traffic is EXEMPT (verified before any upstream connect, can't be forged), so this bucket
+# never sheds core tenant load. Generous by default; set 0 to disable. Tune from the
+# `ai_rejections_total{reason="rate_limit_byo_global"}` metric.
+byo_rate_limit_rps = 1000
+
 # Optional per-provider upstream authority (host:port), BY PROVIDER NAME. For a known provider this
 # overrides its built-in default; for an unknown name it ADDS a new OpenAI-wire provider, reachable
 # via `x-beyond-provider: <name>`. Known providers (zero-config defaults): openai, anthropic,
diff --git a/crates/gateway/benches/e2e.rs b/crates/gateway/benches/e2e.rs
index 6f23561..68ab389 100644
--- a/crates/gateway/benches/e2e.rs
+++ b/crates/gateway/benches/e2e.rs
@@ -26,6 +26,10 @@ use common::*;
 
 const MANAGED_BODY: &str = r#"{"model":"gpt-4o","messages":[{"role":"user","content":"hi"}]}"#;
 
+/// A plausible BYO provider token (anything not starting with `bai_` is BYO — passed through
+/// unchanged, no verify/deny/swap). The mock upstream accepts any token.
+const BYO_KEY: &str = "sk-byo-provider-token-1234567890";
+
 /// Concurrency level for the throughput group — enough in-flight requests to expose per-request
 /// overhead and connection-pool behavior without saturating a laptop.
 const CONCURRENCY: u64 = 32;
@@ -47,9 +51,13 @@ struct Stack {
 }
 
 async fn start_stack() -> Stack {
+    start_stack_with(Mode::Json).await
+}
+
+async fn start_stack_with(mode: Mode) -> Stack {
     let nats = Nats::start().await;
     let (pubkey, sk) = test_keypair(1);
-    let mock = MockUpstream::start(Mode::Json).await;
+    let mock = MockUpstream::start(mode).await;
     let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
     let vkey = mint(
         &VirtualKey {
@@ -109,9 +117,45 @@ async fn managed_roundtrip(s: &Stack) {
     let _ = resp.bytes().await.expect("body");
 }
 
+/// One **BYO** round-trip: a non-`bai_` token, passed straight through — no key verify, no deny-set
+/// check, no key swap. Isolates the passthrough path's overhead from the managed path's auth work.
+async fn byo_roundtrip(s: &Stack) {
+    let resp = s
+        .client
+        .post(format!("{}/v1/chat/completions", s.url))
+        .header("authorization", format!("Bearer {BYO_KEY}"))
+        .header("content-type", "application/json")
+        .body(MANAGED_BODY)
+        .send()
+        .await
+        .expect("request");
+    debug_assert_eq!(resp.status().as_u16(), 200);
+    let _ = resp.bytes().await.expect("body");
+}
+
+/// One **rejected** request: no API key ⇒ 401, short-circuited in `request_filter` **before** any
+/// upstream connection. Benched to prove a flood of rejects costs far less than a proxied request —
+/// the rate-guardrail/flood rationale (a reject must not consume the upstream-connection
+/// constraint). The gap between this and `managed_json_latency` is the gateway's reject headroom.
+async fn reject_roundtrip(s: &Stack) {
+    let resp = s
+        .client
+        .post(format!("{}/v1/chat/completions", s.url))
+        .header("content-type", "application/json")
+        .body(MANAGED_BODY)
+        .send()
+        .await
+        .expect("request");
+    debug_assert_eq!(resp.status().as_u16(), 401);
+    let _ = resp.bytes().await.expect("body");
+}
+
 fn bench_e2e(c: &mut Criterion) {
     let rt = Runtime::new().expect("tokio runtime");
     let stack = rt.block_on(start_stack());
+    // A second stack whose mock streams SSE, so the response-tap (tail buffer + compaction) hot path
+    // is actually exercised — it's a near no-op for the single-shot JSON body.
+    let sse_stack = rt.block_on(start_stack_with(Mode::Sse));
 
     let mut group = c.benchmark_group("e2e");
     // Real round-trips are sub-millisecond on loopback but still ~100× a micro-bench; trim the
@@ -119,10 +163,21 @@ fn bench_e2e(c: &mut Criterion) {
     group.sample_size(50);
     group.measurement_time(Duration::from_secs(10));
 
-    // Single-request latency through the full proxy.
+    // Single-request latency through the full proxy: managed (verify + deny + key swap), BYO
+    // (pure passthrough), SSE relay (exercises the streaming response tap), and the reject
+    // fast-path (401, no upstream). Compared against each other these isolate where time goes.
     group.bench_function("managed_json_latency", |b| {
         b.to_async(&rt).iter(|| managed_roundtrip(&stack));
     });
+    group.bench_function("byo_json_latency", |b| {
+        b.to_async(&rt).iter(|| byo_roundtrip(&stack));
+    });
+    group.bench_function("managed_sse_latency", |b| {
+        b.to_async(&rt).iter(|| managed_roundtrip(&sse_stack));
+    });
+    group.bench_function("reject_missing_key_latency", |b| {
+        b.to_async(&rt).iter(|| reject_roundtrip(&stack));
+    });
 
     // Throughput: CONCURRENCY requests in flight per iteration. `Throughput::Elements` makes
     // criterion report requests/sec.
@@ -154,8 +209,9 @@ fn bench_e2e(c: &mut Criterion) {
 
     group.finish();
 
-    // Keep the stack alive until every bench has run, then tear it down explicitly.
+    // Keep the stacks alive until every bench has run, then tear them down explicitly.
     drop(stack);
+    drop(sse_stack);
 }
 
 criterion_group!(benches, bench_e2e);
diff --git a/crates/gateway/benches/unit.rs b/crates/gateway/benches/unit.rs
index 75010dc..c733529 100644
--- a/crates/gateway/benches/unit.rs
+++ b/crates/gateway/benches/unit.rs
@@ -65,7 +65,9 @@ mod route {
 
 mod deny {
     use super::*;
-    use beyond_ai::deny;
+    use beyond_ai::deny::{self, DenyReason, DenySet};
+
+    // --- ingest path: parse a watched NATS key/value into the set (off the request hot path) ---
 
     #[divan::bench]
     fn parse_key() -> Option<u64> {
@@ -81,6 +83,56 @@ mod deny {
     fn parse_reason_json() -> beyond_ai::deny::DenyReason {
         deny::parse_reason(black_box(br#"{"reason":"fraud","exp":123}"#))
     }
+
+    // --- request hot path: the lookup run on EVERY managed request (`proxy::request_filter`) ---
+
+    /// Build a deny-set holding `n` cut-off tenants (ids `0..n`). Built outside the timed closure.
+    fn populated(n: u64) -> DenySet {
+        (0..n).map(|t| (t, DenyReason::Spend)).collect()
+    }
+
+    /// The common case: tenant **absent** from the set (default-allow). The headline invariant is
+    /// that this is O(1) and **0-alloc regardless of set size** — so the args span an empty set and
+    /// a large one (1M cut-off tenants); the ns/iter and the (absent) alloc columns must stay flat.
+    /// A regression to anything size-dependent shows up as the big-`n` row diverging from the small.
+    #[divan::bench(args = [0, 1_000_000])]
+    fn reason_miss(bencher: Bencher, n: u64) {
+        let set = populated(n);
+        // A tenant id past the populated range → guaranteed miss (the allow path).
+        bencher.bench(|| set.reason(black_box(n + 1)));
+    }
+
+    /// The deny case: tenant present. Same O(1) hash lookup, returning the reason — proves the
+    /// enforce path costs the same as the allow path (no surprise on the rejection branch).
+    #[divan::bench(args = [1, 1_000_000])]
+    fn reason_hit(bencher: Bencher, n: u64) {
+        let set = populated(n);
+        bencher.bench(|| set.reason(black_box(n / 2)));
+    }
+}
+
+mod ratelimit {
+    use super::*;
+    use beyond_ai::ratelimit::RateLimit;
+
+    /// Guardrail charged on **every request before verify** (`proxy::request_filter`). Managed: a
+    /// seeded hash of the raw credential + the per-credential sketch `observe` (the BYO global tier is
+    /// skipped). Fixed memory regardless of key cardinality, so this must be flat and low-alloc.
+    #[divan::bench]
+    fn check_managed(bencher: Bencher) {
+        let rl = RateLimit::new(1_000_000, 1_000_000).expect("enabled");
+        let cred = "bai_v1.1.AAAAAAAAAAAAAAAAAAAAAA.signature-base64url-payload-here";
+        bencher.bench(|| rl.check(black_box(cred), black_box(true)));
+    }
+
+    /// A longer BYO provider token — exercises both tiers (global BYO bucket + per-credential sketch)
+    /// against a realistic raw token length: the full per-request BYO cost.
+    #[divan::bench]
+    fn check_byo(bencher: Bencher) {
+        let rl = RateLimit::new(1_000_000, 1_000_000).expect("enabled");
+        let token = "sk-some-byo-provider-token-of-realistic-length-abcdef0123456789";
+        bencher.bench(|| rl.check(black_box(token), black_box(false)));
+    }
 }
 
 mod usage {
@@ -137,4 +189,41 @@ mod peek {
             scanner.take_model()
         });
     }
+
+    use beyond_ai::peek::plan_stream_usage_injection;
+
+    /// A streaming body whose large `content` value precedes the root `stream` field — the worst
+    /// case for the injection planner: it must walk past `padding` bytes of uninteresting string
+    /// content (the SIMD fast-skip target) before it can decide.
+    fn streaming_body(padding: usize) -> Vec<u8> {
+        let content = "x".repeat(padding);
+        format!(r#"{{"messages":[{{"role":"user","content":"{content}"}}],"model":"gpt-4o","stream":true}}"#)
+            .into_bytes()
+    }
+
+    /// The common case: a non-streaming body (no `stream` field anywhere). Proving absence would
+    /// otherwise take a full structural walk; the planner's `memmem` pre-filter short-circuits it.
+    fn non_streaming_body(padding: usize) -> Vec<u8> {
+        let content = "x".repeat(padding);
+        format!(r#"{{"messages":[{{"role":"user","content":"{content}"}}],"model":"gpt-4o"}}"#)
+            .into_bytes()
+    }
+
+    /// Plan injection on a **streaming** body (must walk past the big content value to find `stream`).
+    #[divan::bench(args = [0, 4 * 1024, 256 * 1024])]
+    fn plan_inject_streaming(bencher: Bencher, padding: usize) {
+        let body = streaming_body(padding);
+        bencher
+            .counter(BytesCount::of_slice(&body))
+            .bench(|| plan_stream_usage_injection(black_box(&body)));
+    }
+
+    /// Plan injection on a **non-streaming** body (no `stream` key — the majority case).
+    #[divan::bench(args = [0, 4 * 1024, 256 * 1024])]
+    fn plan_inject_non_streaming(bencher: Bencher, padding: usize) {
+        let body = non_streaming_body(padding);
+        bencher
+            .counter(BytesCount::of_slice(&body))
+            .bench(|| plan_stream_usage_injection(black_box(&body)));
+    }
 }
diff --git a/crates/gateway/src/config.rs b/crates/gateway/src/config.rs
index 227a024..b287528 100644
--- a/crates/gateway/src/config.rs
+++ b/crates/gateway/src/config.rs
@@ -70,12 +70,26 @@ pub struct AiConfig {
     /// to talk to a plaintext mock.
     pub upstream_tls: bool,
 
-    /// Per-key request-rate ceiling (requests/sec). A blast-radius guardrail (see `ratelimit`), not
-    /// a spend control: it caps how fast a single tenant (managed) or BYO caller can drive the
-    /// gateway, bounding a leaked/runaway key during the deny-set's reaction lag and a failure flood
-    /// that never bills. `0` disables it. The default is generous — a circuit breaker, not a quota;
-    /// tune from `ai_rejections_total{reason="rate_limit"}`.
+    /// Per-credential request-rate ceiling (requests/sec). A blast-radius guardrail (see `ratelimit`),
+    /// not a spend control: it caps how fast a single credential (managed virtual key ≈ a `(tenant,
+    /// app)`, or a BYO token) can drive the gateway, bounding a leaked/runaway key during the
+    /// deny-set's reaction lag and a failure flood that never bills. `0` disables it. The default is
+    /// generous — a circuit breaker, not a quota; tune from `ai_rejections_total{reason="rate_limit"}`.
     pub rate_limit_rps: u32,
+
+    /// Aggregate request-rate ceiling (requests/sec) for **all BYO traffic combined** — a single
+    /// shared bucket. BYO is unverified and upstream-bound, so a flood of *distinct* random BYO tokens
+    /// slips past the per-credential ceiling and would open junk-auth connections to providers from
+    /// our egress IPs (getting them rate-limited or banned). This bounds that aggregate regardless of
+    /// token variation. Managed traffic is **exempt** (it's Ed25519-verified before any upstream
+    /// connect and can't be forged), so this shared bucket never sheds core tenant load. `0` disables
+    /// it. Generous by default; tune from `ai_rejections_total{reason="rate_limit_byo_global"}`.
+    ///
+    /// Before changing this (or reaching for per-IP limiting), read the **design-decision** block in
+    /// the `ratelimit` module docs: it records why this is a global cap and not per-source-IP, what it
+    /// deliberately doesn't cover, and why the real fix for egress-reputation pain is a
+    /// provider-feedback circuit breaker rather than a bigger number here.
+    pub byo_rate_limit_rps: u32,
 }
 
 impl Default for AiConfig {
@@ -97,9 +111,13 @@ impl Default for AiConfig {
             write_timeout_secs: 60,
             idle_timeout_secs: 90,
             upstream_tls: true,
-            // Generous per-key circuit breaker, on by default. Won't touch legitimate steady-state
-            // traffic; caps a runaway/leaked key or a retry-storm flood. Set 0 to disable.
+            // Generous per-credential circuit breaker, on by default. Won't touch legitimate
+            // steady-state traffic; caps a runaway/leaked key or a retry-storm flood. Set 0 to disable.
             rate_limit_rps: 100,
+            // Generous aggregate BYO ceiling, on by default — well above any expected legitimate BYO
+            // throughput, low enough that a junk-auth flood can't get our egress IPs flagged by the
+            // providers. Tune from the metric; set 0 to disable. (Managed traffic is exempt.)
+            byo_rate_limit_rps: 1_000,
         }
     }
 }
diff --git a/crates/gateway/src/deny.rs b/crates/gateway/src/deny.rs
index 503b39b..af9964f 100644
--- a/crates/gateway/src/deny.rs
+++ b/crates/gateway/src/deny.rs
@@ -88,15 +88,18 @@ pub fn parse_key(key: &str) -> Option<u64> {
 /// object `{"reason":"spend", ...}`. Anything else → `Unknown` (still denied — fail safe).
 pub fn parse_reason(value: &[u8]) -> DenyReason {
     let s = std::str::from_utf8(value).unwrap_or("").trim();
-    let token = if s.starts_with('{') {
-        serde_json::from_slice::<serde_json::Value>(value)
+    // The JSON branch must own its extracted reason (it's borrowed from a temporary `Value`); the
+    // bare-token branch matches the borrowed `&str` directly — no allocation on the common path.
+    let json_reason: Option<String>;
+    let token: &str = if s.starts_with('{') {
+        json_reason = serde_json::from_slice::<serde_json::Value>(value)
             .ok()
-            .and_then(|v| v.get("reason").and_then(|r| r.as_str()).map(str::to_owned))
-            .unwrap_or_default()
+            .and_then(|v| v.get("reason").and_then(|r| r.as_str()).map(str::to_owned));
+        json_reason.as_deref().unwrap_or("")
     } else {
-        s.to_owned()
+        s
     };
-    match token.as_str() {
+    match token {
         "spend" => DenyReason::Spend,
         "fraud" => DenyReason::Fraud,
         _ => DenyReason::Unknown,
diff --git a/crates/gateway/src/key.rs b/crates/gateway/src/key.rs
index 923aa1e..c1ce33f 100644
--- a/crates/gateway/src/key.rs
+++ b/crates/gateway/src/key.rs
@@ -150,9 +150,12 @@ impl Keyring {
         let vk = self.get(kid).ok_or(KeyError::UnknownKid(kid))?;
 
         // The signed message binds version + kid + payload, so none can be swapped independently.
-        // Build it into a stack buffer (≤ 40 bytes) — no allocation per verify.
+        // Build it into a stack buffer (≤ 40 bytes) — no allocation per verify. A payload longer
+        // than the buffer can hold can't be a valid 16-byte payload anyway, so it's `Malformed`
+        // rather than a panic on this per-request hot path.
         let mut signed_buf = [0u8; SIGNED_BYTES_CAP];
-        let signed = write_signed_bytes(&mut signed_buf, kid, payload_b64);
+        let signed =
+            write_signed_bytes(&mut signed_buf, kid, payload_b64).ok_or(KeyError::Malformed)?;
         vk.verify(signed, &signature)
             .map_err(|_| KeyError::BadSignature)?;
 
@@ -165,19 +168,21 @@ impl Keyring {
 const SIGNED_BYTES_CAP: usize = 64;
 
 /// Write the signature-covered bytes `bai_v1.{kid}.{payload}` into `buf`, returning the written
-/// slice. Binding kid + payload here is what stops an attacker from re-pointing a valid signature
-/// at a different kid or a tampered payload. Bounded length (see `SIGNED_BYTES_CAP`) so the fixed
-/// buffer never overflows — keeps both verify and mint allocation-free.
+/// slice — or `None` if they don't fit in `SIGNED_BYTES_CAP`. Binding kid + payload here is what
+/// stops an attacker from re-pointing a valid signature at a different kid or a tampered payload.
+/// For a well-formed key the length is bounded (≤ 40 bytes; see `SIGNED_BYTES_CAP`), so `None`
+/// means the input was malformed — `write!` returns `WriteZero` rather than panicking or
+/// truncating, keeping the verify hot path allocation- *and* panic-free.
 fn write_signed_bytes<'a>(
     buf: &'a mut [u8; SIGNED_BYTES_CAP],
     kid: Kid,
     payload_b64: &str,
-) -> &'a [u8] {
+) -> Option<&'a [u8]> {
     use std::io::Write;
     let mut cur = std::io::Cursor::new(&mut buf[..]);
-    write!(cur, "{PREFIX}.{kid}.{payload_b64}").expect("signed bytes fit in SIGNED_BYTES_CAP");
+    write!(cur, "{PREFIX}.{kid}.{payload_b64}").ok()?;
     let n = cur.position() as usize;
-    &buf[..n]
+    Some(&buf[..n])
 }
 
 /// Parse an Ed25519 public key from a slipstream `signkey.*` value: accept raw 32 bytes or
@@ -207,7 +212,10 @@ pub fn verifying_key_from_value(bytes: &[u8]) -> Option<VerifyingKey> {
 pub fn mint(vk: &VirtualKey, kid: Kid, signing_key: &SigningKey) -> String {
     let payload_b64 = URL_SAFE_NO_PAD.encode(vk.encode_payload());
     let mut signed_buf = [0u8; SIGNED_BYTES_CAP];
-    let signed = write_signed_bytes(&mut signed_buf, kid, &payload_b64);
+    // mint builds the payload itself (a fixed 22-char base64 of 16 bytes) from controlled inputs,
+    // so it always fits; this `expect` is a true invariant assertion, not a fallible runtime path.
+    let signed = write_signed_bytes(&mut signed_buf, kid, &payload_b64)
+        .expect("minted signed bytes fit in SIGNED_BYTES_CAP");
     let sig: Signature = signing_key.sign(signed);
     let sig_b64 = URL_SAFE_NO_PAD.encode(sig.to_bytes());
     format!("{PREFIX}.{kid}.{payload_b64}.{sig_b64}")
diff --git a/crates/gateway/src/peek.rs b/crates/gateway/src/peek.rs
index 0b4a924..c8ac81b 100644
--- a/crates/gateway/src/peek.rs
+++ b/crates/gateway/src/peek.rs
@@ -146,6 +146,133 @@ impl ModelScanner {
     }
 }
 
+/// Decide whether an OpenAI **chat** request body needs `stream_options.include_usage` injected,
+/// and where. Returns `Some(offset)` — the byte index just after the root object's opening `{`, where
+/// the caller splices `"stream_options":{"include_usage":true},` — **only** when the body is a JSON
+/// object with a root-level `"stream": true` and **no** root-level `"stream_options"` key. Otherwise
+/// `None` (not a stream, options already set, or not an object) → forward unchanged.
+///
+/// Why this exists: OpenAI only emits a usage chunk on a stream when the request carries
+/// `stream_options.include_usage = true`. A stock client that omits it would stream with no usage,
+/// so managed traffic couldn't be metered. We can't ask for it via a header and can't set it in a
+/// client SDK we don't control, so the gateway injects it — for every OpenAI streaming chat request,
+/// out of the box.
+///
+/// Structural (depth + string + escape aware), so a `"stream"` inside a message object or inside a
+/// string value never triggers injection — only the genuine root-level field. The returned offset is
+/// always inside a non-empty object (a root `"stream"` is present), so the caller always follows the
+/// fragment with a comma.
+///
+/// Only the root object is inspected (the walk stops at its closing `}`), and when `stream` is
+/// repeated the last occurrence decides — matching the last-wins rule of common JSON parsers.
+pub fn plan_stream_usage_injection(body: &[u8]) -> Option<usize> {
+    let n = body.len();
+    // Cheap pre-filter: injection is only ever needed when a root-level `"stream"` key is present.
+    // If the substring `"stream"` doesn't occur *anywhere*, the structural answer is unconditionally
+    // `None`, so a single SIMD `memmem` pass lets us skip the whole walk — the common case, since
+    // most requests aren't streaming. (The needle is a substring of `"stream_options"` too, so a
+    // body carrying only stream_options still passes the filter and is correctly resolved to `None`
+    // by the walk below.)
+    memchr::memmem::find(body, b"\"stream\"")?;
+    let mut i = 0;
+    while i < n && body[i].is_ascii_whitespace() {
+        i += 1;
+    }
+    // Must be a JSON object; anything else (array, scalar, garbage) we never rewrite.
+    if i >= n || body[i] != b'{' {
+        return None;
+    }
+    let insert_at = i + 1;
+
+    // The root `{` is consumed here, so the walk starts at depth 1 and ends when depth hits 0.
+    let mut depth = 1u32;
+    let mut in_string = false;
+    let mut escaped = false;
+    let mut expect_key = true;
+    let mut capturing_key = false;
+    // Start index (just past the opening `"`) of the root-level key currently being scanned. The
+    // body is fully in hand, so we slice the key out of it at the closing quote — no accumulation
+    // buffer, zero-copy. (Escaped keys are sliced raw; since neither `stream` nor `stream_options`
+    // contains an escape, an escaped key simply doesn't match either needle — the correct answer.)
+    let mut key_start = 0usize;
+    // The current root-level key is exactly `stream` (so the next literal is its value).
+    let mut last_key_is_stream = false;
+    let mut stream_true = false;
+
+    let mut j = insert_at;
+    while j < n && depth > 0 {
+        if in_string {
+            // Fast path: inside a string we're not capturing (any non-root-key string — message
+            // content, system prompts, base64 images), jump straight to the next `"`/`\` with a
+            // SIMD search instead of inspecting every byte. Mirrors the skip in `ModelScanner::feed`.
+            if !capturing_key && !escaped {
+                match memchr::memchr2(b'"', b'\\', &body[j..]) {
+                    Some(rel) => j += rel,
+                    None => break, // rest of the body is skippable string content
+                }
+            }
+            let b = body[j];
+            if escaped {
+                escaped = false;
+            } else if b == b'\\' {
+                escaped = true;
+            } else if b == b'"' {
+                in_string = false;
+                if capturing_key {
+                    capturing_key = false;
+                    // Only root-level (`depth == 1`) keys matter.
+                    if depth == 1 {
+                        let key = &body[key_start..j];
+                        // A root `stream_options` means the client already controls usage — the
+                        // answer is `None` regardless of anything else in the body, so stop now
+                        // rather than walking the remainder for a result we already know.
+                        if key == b"stream_options" {
+                            return None;
+                        }
+                        last_key_is_stream = key == b"stream";
+                        // A repeated root `stream` supersedes the earlier one (last key wins).
+                        stream_true &= !last_key_is_stream;
+                    }
+                }
+            }
+            j += 1;
+            continue;
+        }
+        let b = body[j];
+        match b {
+            b'"' => {
+                // A root-level key starts only where one is expected (just after `{` or `,`).
+                if depth == 1 && expect_key {
+                    capturing_key = true;
+                    key_start = j + 1; // first key byte is just past this opening quote
+                } else {
+                    capturing_key = false;
+                }
+                in_string = true;
+            }
+            b'{' | b'[' => depth += 1,
+            b'}' | b']' => depth = depth.saturating_sub(1),
+            b':' if depth == 1 => expect_key = false,
+            b',' if depth == 1 => {
+                expect_key = true;
+                last_key_is_stream = false;
+            }
+            // The value of a root-level `stream` key: a bare `true` literal.
+            b't' if depth == 1 && last_key_is_stream => {
+                if body[j..].starts_with(b"true") {
+                    stream_true = true;
+                }
+                last_key_is_stream = false;
+            }
+            _ => {}
+        }
+        j += 1;
+    }
+
+    // `stream_options` would have already returned `None` above, so reaching here means it's absent.
+    if stream_true { Some(insert_at) } else { None }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -156,6 +283,138 @@ mod tests {
         s.take_model()
     }
 
+    #[test]
+    fn extracts_model_from_sse_first_chunk() {
+        // The response-side model tap feeds SSE through this same scanner. `data: ` is non-structural
+        // noise at depth 0, so the scanner reads the first chunk's root `model` — the provider's
+        // resolved/billed id — and stops. This is what makes the billing model authoritative.
+        let sse = b"data: {\"id\":\"chatcmpl-x\",\"object\":\"chat.completion.chunk\",\"model\":\"gpt-4o-2024-08-06\",\"choices\":[]}\n\n";
+        assert_eq!(scan(sse).as_deref(), Some("gpt-4o-2024-08-06"));
+    }
+
+    /// Apply `plan_stream_usage_injection` and return the rewritten body (or unchanged if no plan),
+    /// so tests assert the *resulting* JSON — the thing the upstream actually receives.
+    fn inject(body: &str) -> String {
+        match plan_stream_usage_injection(body.as_bytes()) {
+            Some(at) => {
+                let frag = br#""stream_options":{"include_usage":true},"#;
+                let mut out = Vec::with_capacity(body.len() + frag.len());
+                out.extend_from_slice(&body.as_bytes()[..at]);
+                out.extend_from_slice(frag);
+                out.extend_from_slice(&body.as_bytes()[at..]);
+                String::from_utf8(out).unwrap()
+            }
+            None => body.to_string(),
+        }
+    }
+
+    #[test]
+    fn injects_when_streaming_and_absent() {
+        let out = inject(r#"{"model":"gpt-4o","stream":true,"messages":[]}"#);
+        assert_eq!(
+            out,
+            r#"{"stream_options":{"include_usage":true},"model":"gpt-4o","stream":true,"messages":[]}"#
+        );
+        // The result must be valid JSON with the option set.
+        let v: serde_json::Value = serde_json::from_str(&out).unwrap();
+        assert_eq!(
+            v["stream_options"]["include_usage"],
+            serde_json::json!(true)
+        );
+    }
+
+    #[test]
+    fn stream_can_be_the_only_or_last_key() {
+        assert!(plan_stream_usage_injection(br#"{"stream":true}"#).is_some());
+        let v: serde_json::Value =
+            serde_json::from_str(&inject(r#"{"model":"x","stream":true}"#)).unwrap();
+        assert_eq!(
+            v["stream_options"]["include_usage"],
+            serde_json::json!(true)
+        );
+    }
+
+    #[test]
+    fn skips_when_options_already_present() {
+        // Client already set `stream_options` (whatever its contents) — never touch it.
+        assert_eq!(
+            plan_stream_usage_injection(
+                br#"{"stream":true,"stream_options":{"include_usage":false}}"#
+            ),
+            None
+        );
+        // Order-independent: options before stream.
+        assert_eq!(
+            plan_stream_usage_injection(br#"{"stream_options":{},"stream":true}"#),
+            None
+        );
+    }
+
+    #[test]
+    fn skips_when_not_streaming() {
+        assert_eq!(
+            plan_stream_usage_injection(br#"{"model":"x","stream":false}"#),
+            None
+        );
+        assert_eq!(plan_stream_usage_injection(br#"{"model":"x"}"#), None);
+    }
+
+    #[test]
+    fn ignores_nested_or_in_string_stream() {
+        // `stream` inside a message object is not the root field.
+        assert_eq!(
+            plan_stream_usage_injection(
+                br#"{"messages":[{"role":"u","stream":true}],"model":"x"}"#
+            ),
+            None
+        );
+        // `stream` mentioned inside a string value must not trigger.
+        assert_eq!(
+            plan_stream_usage_injection(br#"{"system":"set stream:true please","model":"x"}"#),
+            None
+        );
+    }
+
+    #[test]
+    fn injects_with_large_content_before_stream() {
+        // Exercises the SIMD fast-skip in the planner: a large content value must be skipped, and
+        // the genuine root `stream` after it still triggers injection.
+        let big = "x".repeat(64 * 1024);
+        let body = format!(r#"{{"messages":[{{"content":"{big}"}}],"stream":true}}"#);
+        let v: serde_json::Value = serde_json::from_str(&inject(&body)).unwrap();
+        assert_eq!(
+            v["stream_options"]["include_usage"],
+            serde_json::json!(true)
+        );
+    }
+
+    #[test]
+    fn skips_word_stream_inside_large_value() {
+        // The word `stream` (even `"stream"`) buried in a big string value must not trigger — the
+        // memmem pre-filter passes, but the structural walk correctly skips over the string content.
+        let big = "x".repeat(64 * 1024);
+        let body = format!(r#"{{"system":"{big} \"stream\":true","model":"x"}}"#);
+        assert_eq!(plan_stream_usage_injection(body.as_bytes()), None);
+    }
+
+    #[test]
+    fn stream_options_after_large_content_suppresses() {
+        // The early-return-on-stream_options path: stream_options appearing (in any order, after a
+        // big value) must suppress injection even though `stream:true` is also present.
+        let big = "x".repeat(64 * 1024);
+        let body = format!(
+            r#"{{"content":"{big}","stream":true,"stream_options":{{"include_usage":false}}}}"#
+        );
+        assert_eq!(plan_stream_usage_injection(body.as_bytes()), None);
+    }
+
+    #[test]
+    fn tolerates_whitespace_and_non_objects() {
+        assert!(plan_stream_usage_injection(b"  {  \"stream\" : true }").is_some());
+        assert_eq!(plan_stream_usage_injection(b"[1,2,3]"), None);
+        assert_eq!(plan_stream_usage_injection(b"not json"), None);
+    }
+
     #[test]
     fn simple() {
         assert_eq!(
diff --git a/crates/gateway/src/proxy.rs b/crates/gateway/src/proxy.rs
index d846f4e..fdd49f7 100644
--- a/crates/gateway/src/proxy.rs
+++ b/crates/gateway/src/proxy.rs
@@ -18,9 +18,11 @@
 //! (the supported hook), feeding each chunk to a streaming structural scanner (`peek::ModelScanner`,
 //! O(1) memory) — never withholding or buffering it.
 //!
-//! Not done by design: OpenAI `stream_options.include_usage` injection — a streaming OpenAI client
-//! that omits it has no usage chunk to meter (the SDK/platform can set it). Worth it to keep the
-//! request body a pure passthrough rather than buffering+rewriting every request.
+//! One deliberate exception to the no-buffer rule: a **managed** OpenAI chat/responses request is
+//! buffered and gets `stream_options.include_usage` injected when it streams without it — otherwise
+//! OpenAI emits no usage chunk and the request couldn't be metered. We can't set that option in a
+//! client SDK we don't control, so the gateway guarantees it, out of the box. Scoped to exactly that
+//! path (managed + OpenAI dialect + streaming-capable); BYO and everything else stay pure passthrough.
 //!
 //! Auth branches on key format: `bai_…` is a managed virtual key (verify → deny-check → swap to
 //! the pool key); anything else is a **BYO** request — the user's own provider token, passed
@@ -30,7 +32,6 @@
 //! `upstream_peer` runs. Any non-default provider is reached via the `x-beyond-provider` header
 //! (providers are data — see `route`). Model is still captured (from the streamed body) for usage.
 
-use crate::ratelimit::RlKey;
 use crate::route::{self, Dialect, Provider};
 use crate::state::GatewayState;
 use crate::{peek, usage};
@@ -76,9 +77,18 @@ pub struct RequestCtx {
     /// Whether this is a **managed** request (`bai_…` key → swap to the pool key). `false` for
     /// **BYO** — we leave the user's own auth header untouched (passthrough).
     managed: bool,
-    /// Requested model, extracted exactly from the request body via a streaming structural scan.
+    /// Model the client *requested*, extracted from the request body. This is the billing-log
+    /// **fallback** — the authoritative value is the model the provider echoes in its response (see
+    /// `resp_model_scanner`), because a client may send an alias (`gpt-4o`) that the provider resolves
+    /// to and bills under a pinned id (`gpt-4o-2024-08-06`).
     model: String,
     model_scanner: peek::ModelScanner,
+    /// Extracts the model the **provider** reports in its response (the resolved/billed id), fed the
+    /// response stream in `response_body_filter`. Preferred over `model` in the `ai.usage` event so
+    /// the billed model is authoritative, not the requested alias. Works for SSE too: the scanner
+    /// skips the `data: ` prefix and reads the first chunk's root `model`. Falls back to `model` when
+    /// the response carries none (e.g. an error body).
+    resp_model_scanner: peek::ModelScanner,
     /// Whether the upstream response is an SSE stream — set in `response_filter` from the response
     /// Content-Type (we don't read the request to learn this).
     streaming: bool,
@@ -87,17 +97,28 @@ pub struct RequestCtx {
     /// Running total of request-body bytes seen, to enforce `MAX_REQUEST_BODY` even when the client
     /// uses chunked transfer encoding (no `Content-Length` to check up front).
     body_bytes_fed: usize,
+    /// Managed OpenAI chat/responses request: buffer the body and inject
+    /// `stream_options.include_usage` if it streams without it, so the usage chunk (hence the
+    /// billable token count) is guaranteed. The single, deliberate exception to "never buffer the
+    /// request body" — scoped to the managed OpenAI streaming-capable path and bounded by
+    /// `MAX_REQUEST_BODY`. BYO and every other request still stream straight through.
+    inject_eligible: bool,
+    /// Accumulated request body — populated only when `inject_eligible`; otherwise stays empty and
+    /// the body is never buffered.
+    req_buf: Vec<u8>,
     start: Instant,
     /// Connect-retry counter (see `fail_to_connect`).
     attempt: u8,
 }
 
 impl AiProxy {
-    /// Write a small JSON error and signal `request_filter` to short-circuit.
+    /// Write a small JSON error and signal `request_filter` to short-circuit. The body is built with
+    /// `serde_json` (not `format!`) so a `typ`/`msg` containing `"` or `\` can never break out of the
+    /// JSON structure — keeps this safe if a future caller passes a non-literal message.
     async fn reject(session: &mut Session, status: u16, typ: &str, msg: &str) -> Result<bool> {
-        let body = Bytes::from(format!(
-            r#"{{"error":{{"type":"{typ}","message":"{msg}"}}}}"#
-        ));
+        let body = Bytes::from(
+            serde_json::json!({ "error": { "type": typ, "message": msg } }).to_string(),
+        );
         let mut resp = ResponseHeader::build(status, None)?;
         resp.insert_header("content-type", "application/json")?;
         resp.insert_header("content-length", body.len().to_string())?;
@@ -120,6 +141,24 @@ fn extract_virtual_key(session: &Session) -> Option<&str> {
         .and_then(|v| v.strip_prefix("Bearer "))
 }
 
+/// Upper bound on a model id we'll record. Real ids are short (`claude-opus-4-8`,
+/// `accounts/fireworks/models/…`); anything longer is junk or an attempt to bloat the billing log.
+const MAX_MODEL_LEN: usize = 128;
+
+/// Gate a model id before it is written to the `ai.usage` billing log. The id is taken from the
+/// (client-controlled) request body, and although `tracing`'s JSON layer escapes the value, a
+/// downstream consumer (logfwd/OTLP → ClickHouse) may re-handle it — so anything able to break out
+/// of a JSON string or a line-oriented log (control bytes, `"`, `\`, `DEL`) is refused, as is any id
+/// longer than `MAX_MODEL_LEN`. A refused id is recorded as `"unknown"` (matching `peek`'s
+/// non-UTF-8 fallback): a mislabeled-but-safe usage row beats a corrupted or injected one.
+fn sanitize_model(model: String) -> String {
+    let breaks_out = |b: u8| matches!(b, 0x00..=0x1f | b'"' | b'\\' | 0x7f);
+    if model.len() > MAX_MODEL_LEN || model.bytes().any(breaks_out) {
+        return "unknown".to_string();
+    }
+    model
+}
+
 fn dialect_for_path(path: &str) -> Dialect {
     // Anthropic Messages vs OpenAI Chat Completions/Embeddings. Embeddings are OpenAI-dialect only.
     if path.starts_with("/v1/messages") {
@@ -129,6 +168,30 @@ fn dialect_for_path(path: &str) -> Dialect {
     }
 }
 
+/// OpenAI endpoints that can stream (chat completions, Responses API); only these are buffered for
+/// `stream_options.include_usage` injection. Matched by prefix, so sub-resource paths also qualify.
+/// NOTE(review): confirm `/v1/responses` accepts `stream_options.include_usage` (TODO verify).
+fn openai_streamable_path(path: &str) -> bool {
+    ["/v1/chat/completions", "/v1/responses"].iter().any(|&p| path.starts_with(p))
+}
+
+/// If `peek::plan_stream_usage_injection` yields an offset for this buffered OpenAI body (it streams
+/// but never set the option), insert `stream_options.include_usage` there; otherwise hand the body
+/// back untouched. This is what makes a stock client's stream end in a usage chunk we can bill from.
+fn maybe_inject_stream_usage(body: Vec<u8>) -> Vec<u8> {
+    const FRAG: &[u8] = br#""stream_options":{"include_usage":true},"#;
+    let Some(at) = peek::plan_stream_usage_injection(&body) else {
+        return body;
+    };
+    // Rebuild as head + FRAG + tail; `split_at` panics if `at` exceeds the body length.
+    let (head, tail) = body.split_at(at);
+    let mut spliced = Vec::with_capacity(body.len() + FRAG.len());
+    spliced.extend_from_slice(head);
+    spliced.extend_from_slice(FRAG);
+    spliced.extend_from_slice(tail);
+    spliced
+}
+
 /// The `x-beyond-provider` override value, if present — a provider *name* resolved against the
 /// registry in `request_filter`. (An unknown name is rejected there, not silently ignored.)
 fn provider_override(session: &Session) -> Option<&str> {
@@ -174,7 +237,26 @@ impl ProxyHttp for AiProxy {
             return Self::reject(session, 401, "authentication_error", "missing API key").await;
         };
 
-        // 3. Reject oversized bodies up front (Content-Length) so we never buffer a huge upload.
+        // 3. Rate guardrails (see `ratelimit`), charged on the *raw presented key* **before** any
+        // verification or upstream connect. Keying on the credential we already hold (rather than the
+        // verified tenant id) is what lets this sit ahead of the Ed25519 verify: a single leaked,
+        // runaway, or forged key can't drive unbounded crypto work (per-credential tier), and a flood
+        // of distinct random BYO tokens can't drive junk-auth connects to providers from our egress
+        // IPs (global BYO tier — managed traffic is exempt, see `ratelimit`). The `check` borrow of
+        // `raw_key` ends as the call returns, so the `&mut session` reject is free to run on the
+        // over-limit path (where `raw_key` is unused afterward).
+        if let Some(rl) = &self.state.rate_limit {
+            if let Some(reason) = rl.check(raw_key, raw_key.starts_with("bai_")) {
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&[reason.label()])
+                    .inc();
+                return Self::reject(session, 429, "rate_limit_error", "rate limit exceeded").await;
+            }
+        }
+
+        // 4. Reject oversized bodies up front (Content-Length) so we never buffer a huge upload.
         if let Some(len) = session
             .req_header()
             .headers
@@ -193,7 +275,7 @@ impl ProxyHttp for AiProxy {
             }
         }
 
-        // 4. Identity + key handling. `bai_…` → managed (stateless verify → deny-check → swap to the
+        // 5. Identity + key handling. `bai_…` → managed (stateless verify → deny-check → swap to the
         // pool key). Anything else → BYO: the user's own provider token, passed through unchanged
         // (no Beyond identity, so no deny-set and no per-tenant attribution).
         let (tenant_id, vpc_id, managed) = if raw_key.starts_with("bai_") {
@@ -234,24 +316,13 @@ impl ProxyHttp for AiProxy {
             (0, 0, false)
         };
 
-        // 5. Per-key rate guardrail (see `ratelimit`): caps a single key's request velocity. Keyed by
-        // tenant for managed traffic, by a hash of the BYO token otherwise. Computed into an owned
-        // key so the `raw_key` borrow of `session` ends before the `&mut session` reject below.
-        if let Some(rl) = &self.state.rate_limit {
-            let key = if managed {
-                RlKey::Tenant(tenant_id)
-            } else {
-                RlKey::byo(raw_key)
-            };
-            if !rl.check(&key) {
-                self.state
-                    .metrics
-                    .rejections_total
-                    .with_label_values(&["rate_limit"])
-                    .inc();
-                return Self::reject(session, 429, "rate_limit_error", "rate limit exceeded").await;
-            }
-        }
+        // Mark OpenAI managed chat/responses streams for body buffering + `stream_options` injection
+        // (handled in `request_body_filter`). Scoped tight: managed only (BYO stays pure
+        // passthrough), OpenAI dialect only, streaming-capable paths only — so everything else still
+        // streams through untouched.
+        let inject_eligible = managed
+            && dialect == Dialect::OpenAI
+            && openai_streamable_path(session.req_header().uri.path());
 
         *ctx = Some(RequestCtx {
             tenant_id,
@@ -261,8 +332,16 @@ impl ProxyHttp for AiProxy {
             managed,
             model: String::new(),
             model_scanner: peek::ModelScanner::new(),
+            resp_model_scanner: peek::ModelScanner::new(),
             streaming: false,
-            resp_tail: Vec::with_capacity(USAGE_TAIL_CAP),
+            inject_eligible,
+            req_buf: Vec::new(),
+            // Grown lazily by the response tap (`response_body_filter`), not pre-reserved: a
+            // non-streaming response — the common case — is a few hundred bytes, so reserving the
+            // full 64KB cap up front would waste an allocation on every request to hold ~200B. A
+            // long stream grows it geometrically to the bounded 2×cap and compacts; that handful of
+            // reallocs is lost in the network noise of a stream we're already relaying chunk by chunk.
+            resp_tail: Vec::new(),
             body_bytes_fed: 0,
             start,
             attempt: 0,
@@ -347,6 +426,14 @@ impl ProxyHttp for AiProxy {
                 upstream_request.set_uri(uri);
             }
         }
+
+        // Injection-eligible (OpenAI managed stream): the body is rewritten in `request_body_filter`,
+        // changing its length, and we can't know the new length here (headers go out before the body
+        // filter runs). So drop the client's `Content-Length` and frame the buffered body as chunked.
+        if rc.inject_eligible {
+            upstream_request.remove_header("content-length");
+            upstream_request.insert_header("transfer-encoding", "chunked")?;
+        }
         Ok(())
     }
 
@@ -378,10 +465,28 @@ impl ProxyHttp for AiProxy {
                 return Err(pingora_core::Error::new_str("request body exceeds limit"));
             }
             rc.model_scanner.feed(chunk);
+            // Eligible requests are buffered so we can splice the root object before any byte reaches
+            // the upstream (injection inserts near the front, so we can't have forwarded it already).
+            if rc.inject_eligible {
+                rc.req_buf.extend_from_slice(chunk);
+            }
         }
+
+        if rc.inject_eligible {
+            if end_of_stream {
+                // Emit the whole (possibly rewritten) body in one shot; `transfer-encoding: chunked`
+                // (set in `upstream_request_filter`) makes the changed length fine.
+                let buf = std::mem::take(&mut rc.req_buf);
+                *body = Some(Bytes::from(maybe_inject_stream_usage(buf)));
+            } else {
+                // Withhold — the bytes are buffered above; nothing goes upstream until end-of-stream.
+                *body = None;
+            }
+        }
+
         if end_of_stream && rc.model.is_empty() {
             if let Some(m) = rc.model_scanner.take_model() {
-                rc.model = m;
+                rc.model = sanitize_model(m);
             }
         }
         Ok(())
@@ -428,6 +533,12 @@ impl ProxyHttp for AiProxy {
         // but moves bytes O(stream_len / cap) times instead of once per chunk — for a long stream of
         // small chunks that's the difference between one memmove per 64 KB and one per chunk.
         if let (Some(rc), Some(chunk)) = (ctx.as_mut(), body.as_ref()) {
+            // Tap the provider-reported (resolved/billed) model from the response *head* — the
+            // scanner stops at the first root `model`, so this is O(1) and cheap (it finds the model
+            // in the first chunk and ignores the rest). Kept separate from the tail because the model
+            // is at the start of the response while the usage event is at the end.
+            rc.resp_model_scanner.feed(chunk);
+
             rc.resp_tail.extend_from_slice(chunk);
             if rc.resp_tail.len() > 2 * USAGE_TAIL_CAP {
                 let keep_from = rc.resp_tail.len() - USAGE_TAIL_CAP;
@@ -461,7 +572,7 @@ impl ProxyHttp for AiProxy {
         _e: Option<&pingora_core::Error>,
         ctx: &mut Self::CTX,
     ) {
-        let Some(rc) = ctx.as_ref() else { return };
+        let Some(rc) = ctx.as_mut() else { return };
 
         // The buffer may transiently hold up to 2× the cap before compaction; the usage event is
         // always in the last cap bytes, so slice to that bounded tail before parsing.
@@ -487,21 +598,80 @@ impl ProxyHttp for AiProxy {
         m.upstream_latency_seconds
             .observe(rc.start.elapsed().as_secs_f64());
 
-        // Emit the usage *fact* on a dedicated target. logfwd/OTLP ships `ai.usage` → ClickHouse;
-        // pricing is a closed downstream consumer (we emit token counts only).
-        info!(
-            target: "ai.usage",
-            tenant_id = rc.tenant_id,
-            vpc_id = rc.vpc_id,
-            provider = rc.provider.name.as_str(),
-            model = %rc.model,
-            stream = rc.streaming,
-            input_tokens = usage.input_tokens,
-            output_tokens = usage.output_tokens,
-            cache_read_tokens = usage.cache_read_tokens,
-            cache_write_tokens = usage.cache_write_tokens,
-            latency_ms = rc.start.elapsed().as_millis() as u64,
-            "usage"
-        );
+        // Emit the usage *fact* on a dedicated target — **managed only**. The event is an
+        // identity-keyed billing record (logfwd/OTLP ships `ai.usage` → ClickHouse → a closed
+        // pricing consumer); BYO carries no Beyond identity, so a BYO event would be a billing row
+        // with `tenant_id=0` — unbillable, unattributable, and a footgun for any consumer that sums
+        // without filtering it out. Aggregate gateway throughput (incl. BYO) is already covered by
+        // the Prometheus metrics above, which is the right tool for non-billing observability.
+        if rc.managed {
+            // Emit BOTH models. `model` is the one the *provider* resolved + billed (echoed in its
+            // response) — the key for pricing AND for reconciling against the provider's invoice,
+            // which itemizes by the pinned snapshot. `requested_model` is the alias the client sent —
+            // product analytics ("what they asked for") and a fallback rate when a snapshot is newer
+            // than the downstream price table. They're equal when the response carried no model (e.g.
+            // an error body), where `model` falls back to the request alias. Both sanitized.
+            let billed_model = rc
+                .resp_model_scanner
+                .take_model()
+                .map(sanitize_model)
+                .unwrap_or_else(|| rc.model.clone());
+            info!(
+                target: "ai.usage",
+                tenant_id = rc.tenant_id,
+                vpc_id = rc.vpc_id,
+                provider = rc.provider.name.as_str(),
+                model = %billed_model,
+                requested_model = %rc.model,
+                stream = rc.streaming,
+                input_tokens = usage.input_tokens,
+                output_tokens = usage.output_tokens,
+                cache_read_tokens = usage.cache_read_tokens,
+                cache_write_tokens = usage.cache_write_tokens,
+                latency_ms = rc.start.elapsed().as_millis() as u64,
+                "usage"
+            );
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanitize_model_passes_real_ids() {
+        // Ids in the shapes real providers use (plain, dated, slash-separated) come back verbatim.
+        let real_ids = [
+            "gpt-4o",
+            "claude-opus-4-8",
+            "openrouter/meta-llama/llama-3.1",
+            "accounts/fireworks/models/llama-v3p1-70b-instruct",
+            "gpt-4o-mini-2024-07-18",
+        ];
+        real_ids.iter().for_each(|id| assert_eq!(sanitize_model(id.to_string()), *id));
+    }
+
+    #[test]
+    fn sanitize_model_rejects_json_and_log_injection() {
+        // Each payload carries one forbidden byte: a quote (would end the JSON string), a backslash
+        // (could start an escape), a newline (splits a line-oriented log record), or a NUL control
+        // byte. All must be recorded as "unknown" rather than reach the billing log.
+        let hostile = [
+            r#"real","injected":"x"#,
+            r#"a\b"#,
+            "line1\nline2",
+            "ctrl\u{0}byte",
+        ];
+        hostile.iter().for_each(|s| assert_eq!(sanitize_model(s.to_string()), "unknown"));
+    }
+
+    #[test]
+    fn sanitize_model_rejects_overlong() {
+        // One byte past the cap is rejected...
+        assert_eq!(sanitize_model("a".repeat(MAX_MODEL_LEN + 1)), "unknown");
+        // ...while a value of exactly the cap length passes through unchanged.
+        let at_cap = "a".repeat(MAX_MODEL_LEN);
+        assert_eq!(sanitize_model(at_cap.clone()), at_cap);
     }
 }
diff --git a/crates/gateway/src/ratelimit.rs b/crates/gateway/src/ratelimit.rs
index 4365cbe..064f880 100644
--- a/crates/gateway/src/ratelimit.rs
+++ b/crates/gateway/src/ratelimit.rs
@@ -1,80 +1,172 @@
-//! Per-key request-rate guardrail — a blast-radius circuit breaker, **not** a spend control.
+//! Request-rate guardrails — blast-radius circuit breakers, **not** a spend control.
 //!
 //! The deny-set (see `deny`) is the spend/fraud authority, but it's *cumulative* and reacts on a
 //! lag: it only learns of spend after usage facts round-trip through the control plane, and it's
 //! structurally blind to request floods that never bill — auth failures (rejected here, never reach
-//! upstream), provider 4xx, and BYO traffic (on the caller's own key, no Beyond identity). This caps
-//! the *velocity* a single key can drive, which bounds two things the deny-set can't: (1) spend from
-//! a leaked/runaway managed key during the deny-set's reaction lag, and (2) the gateway-resource cost
-//! (verifies, sockets, upstream connections) of a failure flood — the classic internal-service
-//! incident: a buggy client in a retry storm.
+//! upstream), provider 4xx, and BYO traffic (on the caller's own key, no Beyond identity). Two tiers
+//! cap velocity, both charged in `proxy::request_filter` *before* the Ed25519 verify and the upstream
+//! connect, so a flood can't drive unbounded crypto/socket work:
 //!
-//! It is deliberately generous: a ceiling well above any legitimate single-tenant steady state, so
-//! it never trips in normal operation. Tune it from `ai_rejections_total{reason="rate_limit"}`.
+//! 1. **Per-credential** — keyed by the raw presented credential (the whole `bai_…` virtual key or
+//!    BYO token). Catches a single leaked/runaway key. Granularity is per-credential: managed virtual
+//!    keys are deterministic per `(tenant, app)`, so this is effectively a per-(tenant, app) ceiling —
+//!    one credential's runaway can't throttle another. A flood of *distinct* credentials slips past
+//!    it (every random string is its own bucket), which is what tier 2 exists for.
 //!
-//! Backed by pingora-limits' `Rate`: a count-min-sketch estimator with **fixed memory regardless of
-//! key cardinality** (no per-tenant entry, no background GC), matching the deny-set's O(denied)
-//! ethos. The sketch can *over*estimate a key's rate on hash collision but never under, so the cap
-//! is always enforced; `SLOTS` is sized wide enough that overestimation stays negligible at our
-//! active-key counts.
+//! 2. **Global BYO aggregate** — a single bucket for *all* BYO traffic combined. BYO is unverified
+//!    and upstream-bound: a flood of distinct random BYO tokens would otherwise open junk-auth
+//!    connections to providers from our egress IPs, getting them rate-limited or banned (we put
+//!    ourselves in the firing line). This bounds that aggregate regardless of how the tokens vary.
+//!    **Managed traffic is exempt** — it's Ed25519-verified before any upstream connect and can't be
+//!    forged (the signing key lives only in the control plane), so a random `bai_` flood fails verify
+//!    and never reaches a provider (CPU only, no egress impact). Exempting it means this shared bucket
+//!    only ever sheds BYO load under a flood, never the core managed tenants.
+//!
+//! Both tiers are deliberately generous: ceilings well above legitimate steady state, so they never
+//! trip in normal operation. Tune from `ai_rejections_total{reason="rate_limit"}` (per-credential)
+//! and `{reason="rate_limit_byo_global"}` (BYO aggregate).
+//!
+//! ## Design decision: why a global BYO cap and not per-source-IP (READ BEFORE CHANGING)
+//!
+//! The threat that shaped tier 2 is **egress-IP reputation**, not gateway CPU. We are an egress proxy:
+//! BYO requests connect outward to OpenAI/Anthropic/OpenRouter/… *from our IPs* carrying the caller's
+//! token. A flood of distinct **junk** BYO tokens makes those providers see a torrent of failed-auth
+//! connections from us and rate-limit or ban our egress IPs — taking down BYO for *everyone*, and
+//! degrading managed traffic that shares the same egress. That blast radius is why this lives here and
+//! is on by default, rather than being pushed entirely to the mesh/ingress.
+//!
+//! **Per-source-IP limiting was considered and rejected** as the primary control. It's the surgical
+//! answer in principle (throttle only the noisy source), but it depends on the calling task's real IP
+//! being visible here — and in production we front this with ECS Service Connect, where it is unclear
+//! whether the peer address is the client task or a collapsed mesh/proxy hop. If it's collapsed,
+//! per-IP keying is worse than nothing: it either does nothing (all sources share one IP, so no single
+//! key trips) or throttles every tenant at once. We refused to hinge an egress-protection control on
+//! an unverified topology assumption. The global BYO cap is **topology-independent** — it bounds the
+//! aggregate no matter how source identity is mangled. (If/when we confirm real per-task IPs reach us,
+//! a per-IP tier is a reasonable *addition* in front of this — not a replacement.)
+//!
+//! ## What this deliberately does NOT cover (the residual — don't assume it's solved)
+//!
+//! - **The BYO cap is a shared bucket.** A flood large enough to hit `byo_rate_limit_rps` *does* shed
+//!   legitimate BYO callers along with the attacker — they're indistinguishable at admit time (we
+//!   reject before we know a token is junk). The trust segmentation (managed exempt) bounds the blast
+//!   radius to BYO only; it does not make the BYO shedding selective.
+//! - **The default ceiling is a guess.** `byo_rate_limit_rps = 1000` was picked without real BYO
+//!   traffic numbers — high enough to clear plausible legitimate use, low enough that a junk flood
+//!   can't realistically get us banned. It is meant to be tuned from the metric, not trusted as-is.
+//! - **A more selective control is the next step, not this.** The surgical fix for egress reputation
+//!   is a **provider-feedback circuit breaker**: watch upstream responses and back BYO off a provider
+//!   when we see a burst of `401`s (junk auth) from it, instead of capping all BYO blindly. That reacts
+//!   to the actual signal (providers rejecting us) and spares legitimate BYO. It's a real feature, not
+//!   a guardrail, so it's intentionally out of scope here. If you're here because the blunt cap hurt,
+//!   build that — don't just raise the number.
+//!
+//! Backed by pingora-limits' `Rate`: count-min-sketch estimators with **fixed memory regardless of
+//! key cardinality** (no per-credential entry, no background GC), matching the deny-set's O(denied)
+//! ethos. A sketch can *over*estimate a key's rate on hash collision but never under, so a cap is
+//! always enforced; `SLOTS` is sized wide enough that overestimation stays negligible.
 
 use pingora_limits::rate::Rate;
-use std::hash::Hash;
+use std::hash::{BuildHasher, RandomState};
 use std::time::Duration;
 
-/// Count-min sketch dimensions. Wider than `Rate::new`'s 1024-slot default because our key
-/// cardinality (active tenants + BYO callers within a 1s window) is high; more slots keeps
-/// collision-driven overestimation negligible. ~8192·4·2 atomic counters — a few hundred KB, fixed.
-const SLOTS: usize = 8192;
-const HASHES: usize = 4;
+/// Count-min sketch dimensions for the per-credential tier. The estimator can only *over*estimate a
+/// key's rate (never under — so the cap always holds); the additive error is bounded by
+/// `(e / SLOTS) × N`, where `N` is total req/s across *all* credentials on the node. Sized for a
+/// single high-volume node: at `SLOTS = 65536` that error stays ≤ ~5 even at ~100k req/s aggregate —
+/// far under the per-credential ceiling, so a legitimate caller near its limit isn't false-throttled.
+/// `HASHES = 5` sets the tail confidence (≈ `e^-5` ≈ 0.7% of checks may exceed that bound; the
+/// estimate is the min over the 5 rows). Memory is `2 × HASHES × SLOTS × 8 B` ≈ **5 MB, fixed**
+/// regardless of credential cardinality (no per-key entry, no GC). To resize: `SLOTS ≈ e × peak_N /
+/// tolerable_error`.
+const SLOTS: usize = 65536;
+const HASHES: usize = 5;
 
-/// The rate window. The ceiling is expressed per this interval, i.e. requests/second.
+/// The rate window. Every ceiling is expressed per this interval, i.e. requests/second.
 const WINDOW: Duration = Duration::from_secs(1);
 
-/// What a single request is charged against. Managed traffic is keyed by tenant, so one tenant's
-/// runaway can't throttle another; BYO has no Beyond identity, so it's keyed by a hash of the
-/// caller's own token. One key space — the enum discriminant keeps a `tenant_id` from colliding with
-/// a BYO token hash that happens to share its value.
-#[derive(Hash)]
-pub enum RlKey {
-    Tenant(u64),
-    Byo(u64),
+/// The single sketch key the global BYO tier counts everything under (one shared bucket).
+const BYO_GLOBAL_KEY: u8 = 0;
+
+/// The reason `RateLimit::check` refused a request. Returned to the caller so it can label the
+/// rejection metric (see `label`) and an operator can tell *which* ceiling tripped, hence which knob.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Throttled {
+    /// One credential went over the per-credential ceiling (metric label `"rate_limit"`).
+    PerCredential,
+    /// Combined BYO traffic went over the global BYO ceiling (`"rate_limit_byo_global"`).
+    ByoGlobal,
 }
 
-impl RlKey {
-    /// Key a BYO request by a hash of its raw token (we have no tenant identity for BYO). The token
-    /// itself is never stored — only this digest, which the sketch hashes again into its slots.
-    pub fn byo(raw_token: &str) -> Self {
-        use std::hash::Hasher;
-        let mut h = std::collections::hash_map::DefaultHasher::new();
-        raw_token.hash(&mut h);
-        RlKey::Byo(h.finish())
+impl Throttled {
+    /// Value for the `reason` label of `ai_rejections_total`. The per-credential variant reuses the
+    /// original `"rate_limit"` string so dashboards and alerts built on it keep working.
+    pub fn label(self) -> &'static str {
+        match self {
+            Self::ByoGlobal => "rate_limit_byo_global",
+            Self::PerCredential => "rate_limit",
+        }
     }
 }
 
 pub struct RateLimit {
-    rate: Rate,
-    /// Max requests per `WINDOW` for a single key before we start rejecting.
-    max_per_window: isize,
+    /// `(sketch, max_per_window)` for the per-credential tier. `None` disables it.
+    per_cred: Option<(Rate, isize)>,
+    /// `(sketch, max_per_window)` for the global BYO aggregate tier. `None` disables it.
+    byo_global: Option<(Rate, isize)>,
+    /// Process-random hash state. The raw credential is reduced to the per-credential sketch key
+    /// through this, so the SipHash key is per-process and secret. Without it the digest would be
+    /// precomputable (`DefaultHasher` keys on zeros), letting an attacker craft two tokens that
+    /// collide into the same slots and inflate another caller's counter — false throttling. Random
+    /// seeding makes that collision search infeasible.
+    hasher: RandomState,
 }
 
 impl RateLimit {
-    /// `rps` is the per-key requests/second ceiling. `rps == 0` disables the limiter (`None`).
-    pub fn new(rps: u32) -> Option<Self> {
-        if rps == 0 {
+    /// `per_cred_rps` is the per-credential ceiling; `byo_global_rps` is the aggregate BYO ceiling.
+    /// Either tier is disabled by passing `0`. Returns `None` (no limiter at all) only when both are
+    /// `0`, so the hot path can skip it entirely.
+    pub fn new(per_cred_rps: u32, byo_global_rps: u32) -> Option<Self> {
+        if per_cred_rps == 0 && byo_global_rps == 0 {
             return None;
         }
         Some(Self {
-            rate: Rate::new_with_estimator_config(WINDOW, HASHES, SLOTS),
-            max_per_window: rps as isize,
+            per_cred: (per_cred_rps != 0).then(|| {
+                (
+                    Rate::new_with_estimator_config(WINDOW, HASHES, SLOTS),
+                    per_cred_rps as isize,
+                )
+            }),
+            // One bucket, so the default estimator is plenty — no need for the wide sketch.
+            byo_global: (byo_global_rps != 0).then(|| (Rate::new(WINDOW), byo_global_rps as isize)),
+            // `RandomState::new()` draws a fresh SipHash key from the OS RNG per process.
+            hasher: RandomState::new(),
         })
     }
 
-    /// Charge one request to `key`. Returns `true` when it's within budget, `false` once the key has
-    /// exceeded its ceiling in the current window. `observe` counts the event and returns the running
-    /// total for the window, so the very request that crosses the line is the first one rejected.
-    pub fn check(&self, key: &RlKey) -> bool {
-        self.rate.observe(key, 1) <= self.max_per_window
+    /// Charge one request. `managed` is `true` for a `bai_…` (managed-path) credential, `false` for
+    /// BYO. Returns `None` when within budget, or `Some(reason)` once a ceiling is crossed — the very
+    /// request that crosses the line is the first one rejected (`observe` returns the running total).
+    /// The credential itself is never stored; only its seeded digest feeds the per-credential sketch.
+    pub fn check(&self, raw_credential: &str, managed: bool) -> Option<Throttled> {
+        // Per-credential ceiling first: a single leaked/runaway key (managed or BYO) is rejected
+        // here *without* charging the shared BYO bucket below — otherwise one looping BYO client
+        // would burn the aggregate budget and shed every other BYO caller along with it.
+        if let Some((rate, max)) = &self.per_cred {
+            let key = self.hasher.hash_one(raw_credential);
+            if rate.observe(&key, 1) > *max {
+                return Some(Throttled::PerCredential);
+            }
+        }
+        // Global BYO backstop, charged only for BYO requests admitted above; managed skips it.
+        if !managed {
+            if let Some((rate, max)) = &self.byo_global {
+                if rate.observe(&BYO_GLOBAL_KEY, 1) > *max {
+                    return Some(Throttled::ByoGlobal);
+                }
+            }
+        }
+        None
     }
 }
 
@@ -83,29 +175,53 @@ mod tests {
     use super::*;
 
     #[test]
-    fn zero_rps_disables() {
-        assert!(RateLimit::new(0).is_none());
+    fn both_zero_disables() {
+        assert!(RateLimit::new(0, 0).is_none());
     }
 
     #[test]
-    fn allows_up_to_ceiling_then_rejects() {
-        let rl = RateLimit::new(5).unwrap();
-        let k = RlKey::Tenant(1);
+    fn per_credential_allows_up_to_ceiling_then_rejects() {
+        let rl = RateLimit::new(5, 0).unwrap();
+        let cred = "bai_v1.1.payload.sig";
         for _ in 0..5 {
-            assert!(rl.check(&k));
+            assert_eq!(rl.check(cred, true), None);
+        }
+        // 6th request in the same 1s window crosses the per-credential ceiling.
+        assert_eq!(rl.check(cred, true), Some(Throttled::PerCredential));
+    }
+
+    #[test]
+    fn credentials_have_independent_budgets() {
+        let rl = RateLimit::new(2, 0).unwrap();
+        assert_eq!(rl.check("token-1", false), None);
+        assert_eq!(rl.check("token-1", false), None);
+        assert_eq!(rl.check("token-1", false), Some(Throttled::PerCredential)); // token-1 exhausted
+        assert_eq!(rl.check("token-2", false), None); // a different credential is unaffected
+    }
+
+    #[test]
+    fn byo_global_caps_distinct_tokens_but_exempts_managed() {
+        // Per-credential disabled, global BYO ceiling = 3. A flood of *distinct* BYO tokens (which
+        // would each slip past per-credential keying) is still bounded by the shared bucket.
+        let rl = RateLimit::new(0, 3).unwrap();
+        assert_eq!(rl.check("byo-aaaa", false), None);
+        assert_eq!(rl.check("byo-bbbb", false), None);
+        assert_eq!(rl.check("byo-cccc", false), None);
+        assert_eq!(rl.check("byo-dddd", false), Some(Throttled::ByoGlobal)); // 4th distinct token
+
+        // Managed traffic is exempt from the BYO bucket — a distinct `bai_…` flood is never throttled
+        // here (it's bounded by verify failing, not by this ceiling).
+        for i in 0..10 {
+            assert_eq!(rl.check(&format!("bai_v1.1.p{i}.s{i}"), true), None);
         }
-        // 6th request in the same 1s window crosses the ceiling.
-        assert!(!rl.check(&k));
     }
 
     #[test]
-    fn keys_have_independent_budgets() {
-        let rl = RateLimit::new(2).unwrap();
-        assert!(rl.check(&RlKey::Tenant(1)));
-        assert!(rl.check(&RlKey::Tenant(1)));
-        assert!(!rl.check(&RlKey::Tenant(1))); // tenant 1 exhausted
-        assert!(rl.check(&RlKey::Tenant(2))); // a different tenant is unaffected
-        // Same numeric value, different variant ⇒ different key (discriminant disambiguates).
-        assert!(rl.check(&RlKey::Byo(1)));
+    fn byo_global_does_not_touch_managed_budget() {
+        // With only the global BYO tier on, managed requests pass freely while BYO is being capped.
+        let rl = RateLimit::new(0, 1).unwrap();
+        assert_eq!(rl.check("byo-1", false), None);
+        assert_eq!(rl.check("byo-2", false), Some(Throttled::ByoGlobal)); // BYO bucket exhausted
+        assert_eq!(rl.check("bai_v1.1.p.s", true), None); // managed unaffected
     }
 }
diff --git a/crates/gateway/src/state.rs b/crates/gateway/src/state.rs
index 19d49ee..89a89f5 100644
--- a/crates/gateway/src/state.rs
+++ b/crates/gateway/src/state.rs
@@ -15,7 +15,7 @@ use crate::route::{self, AuthScheme, Provider};
 use arc_swap::ArcSwap;
 use std::collections::HashMap;
 use std::net::SocketAddr;
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
 use std::time::{Duration, Instant};
 
 /// How long a resolved upstream address is reused before re-resolving.
@@ -83,15 +83,17 @@ pub struct GatewayState {
     pub rate_limit: Option<RateLimit>,
 
     /// TTL cache of resolved upstream addresses, so `upstream_peer` neither blocks on a synchronous
-    /// `getaddrinfo` nor re-resolves the same provider host every request.
-    dns_cache: Mutex<HashMap<String, (SocketAddr, Instant)>>,
+    /// `getaddrinfo` nor re-resolves the same provider host every request. `ArcSwap` so the common
+    /// case — a cache hit, on every admitted request after warmup — is a lock-free atomic load; the
+    /// only writes are the ~10 providers' entries refreshed once per `DNS_TTL`, applied via `rcu`.
+    dns_cache: ArcSwap<HashMap<String, (SocketAddr, Instant)>>,
 }
 
 impl GatewayState {
     pub fn new(config: AiConfig, metrics: Arc<Metrics>) -> Result<Arc<Self>> {
         let keyring = config.build_keyring()?;
         let providers = build_providers(&config);
-        let rate_limit = RateLimit::new(config.rate_limit_rps);
+        let rate_limit = RateLimit::new(config.rate_limit_rps, config.byo_rate_limit_rps);
 
         Ok(Arc::new(Self {
             metrics,
@@ -99,7 +101,7 @@ impl GatewayState {
             providers,
             deny: ArcSwap::from_pointee(DenySet::new()),
             rate_limit,
-            dns_cache: Mutex::new(HashMap::new()),
+            dns_cache: ArcSwap::from_pointee(HashMap::new()),
             config,
         }))
     }
@@ -114,20 +116,11 @@ impl GatewayState {
     /// `tokio::net::lookup_host` (runs `getaddrinfo` on the blocking pool — async-safe) instead of
     /// `HttpPeer::new`'s eager blocking resolve.
     pub async fn resolve(&self, authority: &str) -> Result<SocketAddr> {
-        // Scope the guard so the `std::sync::Mutex` is provably released before the `.await` below —
-        // a `std` guard is not `Send` and must never be held across an await. The explicit block
-        // makes that invariant local and obvious (a stray log/borrow added before the await would
-        // otherwise either deadlock or fail to compile). A miss may let two concurrent callers both
-        // resolve; that's harmless (same answer, last writer wins) and not worth a lock across DNS.
-        {
-            // Recover from a poisoned lock (a prior holder panicked) rather than propagating the
-            // panic: the cache holds only transient `SocketAddr` entries, so a poisoned-but-readable
-            // map is safe to use. Without this, one panic would wedge every later DNS lookup.
-            let cache = self.dns_cache.lock().unwrap_or_else(|p| p.into_inner());
-            if let Some((addr, at)) = cache.get(authority) {
-                if at.elapsed() < DNS_TTL {
-                    return Ok(*addr);
-                }
+        // Cache hit (the common case after warmup): a lock-free `ArcSwap` load — no mutex, no
+        // syscall — so concurrent workers never serialize on a DNS lookup that's already resolved.
+        if let Some((addr, at)) = self.dns_cache.load().get(authority) {
+            if at.elapsed() < DNS_TTL {
+                return Ok(*addr);
             }
         }
         let addr = tokio::net::lookup_host(authority)
@@ -135,10 +128,15 @@ impl GatewayState {
             .map_err(|e| GatewayError::Dns(format!("{authority}: {e}")))?
             .next()
             .ok_or_else(|| GatewayError::Dns(format!("{authority}: no addresses")))?;
-        self.dns_cache
-            .lock()
-            .unwrap_or_else(|p| p.into_inner())
-            .insert(authority.to_string(), (addr, Instant::now()));
+        // rcu the new/refreshed entry in. Two concurrent misses for the same host may both resolve
+        // and both rcu; that's harmless (same answer, last writer wins) and far cheaper than holding
+        // a lock across `getaddrinfo`. The clone-on-write copies a ~10-entry map — trivial, and only
+        // on the rare miss/refresh path, never on a hit.
+        self.dns_cache.rcu(|cur| {
+            let mut next = HashMap::clone(cur);
+            next.insert(authority.to_string(), (addr, Instant::now()));
+            next
+        });
         Ok(addr)
     }
 }
diff --git a/crates/gateway/src/store_watch.rs b/crates/gateway/src/store_watch.rs
index 7c98707..1d44742 100644
--- a/crates/gateway/src/store_watch.rs
+++ b/crates/gateway/src/store_watch.rs
@@ -52,7 +52,7 @@ pub struct WatcherService {
 
 #[async_trait]
 impl BackgroundService for WatcherService {
-    async fn start(&self, _shutdown: ShutdownWatch) {
+    async fn start(&self, mut shutdown: ShutdownWatch) {
         // Resume position + on-disk snapshot writer persist across reconnects: a NATS blip resumes
         // the watch from `cursor` instead of re-scanning, and `seeded` stays true so we don't reseed.
         let mut cursor = WatchCursor::none();
@@ -94,15 +94,47 @@ impl BackgroundService for WatcherService {
         }
 
         loop {
-            match connect(&self.state).await {
-                Ok(store) => {
-                    info!("slipstream connected; watching deny-set");
-                    watch_deny(&self.state, store, &mut cursor, &mut writer, &mut seeded).await;
-                    warn!("deny-set watch exited; reconnecting");
+            // Connect, but bail immediately if Pingora signals shutdown mid-connect (e.g. NATS is
+            // down and `connect` is retrying its own backoff) rather than blocking teardown.
+            let store = tokio::select! {
+                _ = shutdown.changed() => {
+                    info!("shutdown signaled; deny-set watcher exiting");
+                    return;
                 }
-                Err(e) => error!(error = %e, "slipstream connect failed; retrying"),
+                outcome = connect(&self.state) => match outcome {
+                    Ok(store) => store,
+                    Err(e) => {
+                        error!(error = %e, "slipstream connect failed; retrying");
+                        // Reconnect backoff, also interruptible by shutdown.
+                        tokio::select! {
+                            _ = shutdown.changed() => return,
+                            _ = tokio::time::sleep(Duration::from_secs(2)) => continue,
+                        }
+                    }
+                },
+            };
+
+            info!("slipstream connected; watching deny-set");
+            // `watch_deny` returns `true` when it exited because shutdown was signaled — stop the
+            // reconnect loop cleanly instead of trying to reconnect a shutting-down process.
+            if watch_deny(
+                &self.state,
+                store,
+                &mut cursor,
+                &mut writer,
+                &mut seeded,
+                &mut shutdown,
+            )
+            .await
+            {
+                info!("shutdown signaled; deny-set watcher exiting");
+                return;
+            }
+            warn!("deny-set watch exited; reconnecting");
+            tokio::select! {
+                _ = shutdown.changed() => return,
+                _ = tokio::time::sleep(Duration::from_secs(2)) => {}
             }
-            tokio::time::sleep(Duration::from_secs(2)).await;
         }
     }
 }
@@ -166,13 +198,16 @@ async fn connect(state: &GatewayState) -> crate::error::Result<Arc<dyn KvStore>>
     Ok(store)
 }
 
+/// Seed (if needed) and stream deny-set deltas until the watch ends or shutdown is signaled.
+/// Returns `true` iff it exited because `shutdown` fired — the caller then stops reconnecting.
 async fn watch_deny(
     state: &Arc<GatewayState>,
     store: Arc<dyn KvStore>,
     cursor: &mut WatchCursor,
     writer: &mut Option<SnapshotWriter>,
     seeded: &mut bool,
-) {
+    shutdown: &mut ShutdownWatch,
+) -> bool {
     // Seed once, on the first connect that lacks a usable resume point (cold boot with no snapshot,
     // or after a `CursorExpired` reset). A NATS scan is a point-in-time read of the live set; the
     // highest revision among its entries is the baseline the watch resumes strictly after. An empty
@@ -212,7 +247,7 @@ async fn watch_deny(
                 // No baseline yet — serve whatever's already in memory (fail-open) and let the
                 // reconnect loop retry the scan.
                 warn!(error = %e, "deny-set scan failed; serving current set, will retry");
-                return;
+                return false;
             }
         }
     }
@@ -221,7 +256,7 @@ async fn watch_deny(
     // New) — that would drop anything written in the seed→subscribe window.
     let Some(watcher) = store.watcher() else {
         warn!("store has no watcher; deny-set will not update");
-        return;
+        return false;
     };
     let (tx, mut rx) = tokio::sync::mpsc::channel::<KvUpdate>(256);
     let w = watcher.clone();
@@ -233,8 +268,20 @@ async fn watch_deny(
 
     // Updates are rcu (clone-on-write); the set is tiny (O(denied)). Each applied delta also
     // advances the in-memory cursor (so a reconnect resumes from here) and is appended to the
-    // on-disk snapshot if one is configured.
-    while let Some(update) = rx.recv().await {
+    // on-disk snapshot if one is configured. We `select!` on shutdown so a quiet stream (no deltas
+    // arriving) doesn't pin the task open through teardown; `select!` can only switch at an await
+    // point — between updates — so we never abort mid-`persist_update`, leaving the snapshot intact.
+    loop {
+        let update = tokio::select! {
+            _ = shutdown.changed() => {
+                watch.abort();
+                return true;
+            }
+            update = rx.recv() => match update {
+                Some(u) => u,
+                None => break,
+            },
+        };
         state.deny.rcu(|cur| {
             let mut set = (**cur).clone();
             match &update {
@@ -268,6 +315,7 @@ async fn watch_deny(
         Ok(Err(e)) => warn!(error = %e, "deny-set watch ended"),
         Err(e) => warn!(error = %e, "deny-set watch task panicked"),
     }
+    false
 }
 
 /// Append one applied delta to the on-disk snapshot (if configured) and checkpoint the cursor.

From 072061e273c6bb8fdc36701dd9b21c8b2ef94a3b Mon Sep 17 00:00:00 2001
From: Jared Lunde <jared.lunde@gmail.com>
Date: Sun, 31 May 2026 13:03:09 -0700
Subject: [PATCH 3/7] fixes

---
 ARCHITECTURE.md                    |  31 ++--
 Cargo.lock                         |  37 +++++
 Cargo.toml                         |  16 ++
 config.example.toml                |   2 +-
 crates/gateway/Cargo.toml          |  14 +-
 crates/gateway/benches/e2e.rs      | 143 ++++++++++++++++-
 crates/gateway/src/admin.rs        |  71 +++++++++
 crates/gateway/src/config.rs       |  20 +++
 crates/gateway/src/doctor.rs       | 166 ++++++++++++++++++++
 crates/gateway/src/lib.rs          |   9 +-
 crates/gateway/src/main.rs         |  18 ++-
 crates/gateway/src/metrics.rs      | 106 +++++++++++--
 crates/gateway/src/proxy.rs        | 219 ++++++++++++++++++++++++--
 crates/gateway/src/ratelimit.rs    |   6 +
 crates/gateway/src/state.rs        |  95 +++++++++++-
 crates/gateway/src/store_watch.rs  | 106 ++++++++++++-
 crates/gateway/tests/common/mod.rs | 239 +++++++++++++++++++++++------
 crates/gateway/tests/e2e.rs        | 100 ++++++++++++
 18 files changed, 1291 insertions(+), 107 deletions(-)
 create mode 100644 crates/gateway/src/admin.rs

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index f9a3b7f..692f11c 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -44,10 +44,12 @@ client (stock SDK, Bearer/ x-api-key)
    ▼ request_body_filter  — STREAM BODY THROUGH (never buffered); feed bytes to a structural
    │                         scanner that extracts the exact root-level `model` (O(1), memchr-fast);
    │                         enforce the body cap on the running total (chunked-safe)
-   ▼ response_filter      — TTFT; streaming? = response Content-Type is text/event-stream
+   ▼ response_filter      — TTFT; streaming? = response Content-Type is text/event-stream; count
+   │                         upstream response by provider+status class; set x-beyond-request-id
    ▼ response_body_filter — relay unbuffered; keep a bounded 64KB tail for the usage tap
    ▼ logging              — parse usage from tail (by dialect+streaming); emit `ai.usage` fact
-   │                         (managed only — BYO has no tenant to bill); metrics count all traffic
+   │                         (managed only — BYO has no tenant to bill); metrics count all traffic.
+   │                         Every terminal path (reject + usage) logs the request_id for correlation
         upstream: a registered provider (openai, anthropic, openrouter, fireworks,
                   groq, deepseek, together, cerebras, mistral, xai — + config-added)
 ```
@@ -184,23 +186,23 @@ output redacts values.
 | `nats_url`                    | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (stale or empty set).                                                                              |
 | `nats_creds`                  | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                 |
 | `listen_addr`                 | `0.0.0.0:8080`                    | Proxy listener address.                                                                                                                                          |
-| `prometheus_addr`             | `0.0.0.0:9090`                    | Prometheus `/metrics` scrape endpoint.                                                                                                                           |
+| `metrics_listen`              | `0.0.0.0:9090`                    | Internal admin/observability listener: `/metrics` (Prometheus scrape), `/livez`, `/readyz`. Separate from the client listener — not externally reachable.        |
 
 ---
 
 ## Failure Modes
 
-| Failure                                     | What Actually Happens                                                                                       | Recovery                                                                                    |
-| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
-| NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                     | Watcher reconnects; seeds from NATS or disk snapshot on connect.                            |
-| NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                             | Watcher reconnects and resumes from saved revision — no re-scan needed.                     |
-| NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                     | After re-scan, new cursor set; delta watch resumes normally.                                |
-| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event.                                    | Billing miss detectable downstream; no security boundary breach.                            |
-| Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                 | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                  |
-| Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                              | TTL-cached DNS (60s) serves stale; poisoned-lock guard re-resolves on next request.         |
-| Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502.                                                       | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                |
-| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail. | No action — O(1) tail tap is designed for this; SSE usage is always in the final data line. |
-| Gateway crash mid-request                   | In-flight request drops; client receives TCP close, not a structured error. No partial state written.       | Client SDK retries. No DB writes in the request path — no cleanup needed.                   |
+| Failure                                     | What Actually Happens                                                                                       | Recovery                                                                                                        |
+| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- |
+| NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                     | Watcher reconnects; seeds from NATS or disk snapshot on connect.                                                |
+| NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                             | Watcher reconnects (1s→30s exponential backoff, reset on success) and resumes from saved revision — no re-scan. |
+| NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                     | After re-scan, new cursor set; delta watch resumes normally.                                                    |
+| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event.                                    | Billing miss detectable downstream; no security boundary breach.                                                |
+| Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                 | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                                      |
+| Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                              | TTL-cached DNS (60s) serves stale; the lock-free (`ArcSwap`) cache re-resolves on next request.                 |
+| Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502.                                                       | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                                    |
+| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail. | No action — O(1) tail tap is designed for this; SSE usage is always in the final data line.                     |
+| Gateway crash mid-request                   | In-flight request drops; client receives TCP close, not a structured error. No partial state written.       | Client SDK retries. No DB writes in the request path — no cleanup needed.                                       |
 
 ---
 
@@ -219,6 +221,7 @@ output redacts values.
 | `state`                   | keyring + resolved provider registry + watched deny-set + TTL DNS cache       | unit ✓        |
 | `store_watch`             | the single NATS watcher (deny-set), as a Pingora `BackgroundService`          | —             |
 | `proxy`                   | the `ProxyHttp` impl                                                          | e2e ✓         |
+| `admin`                   | `ServeHttp` on the metrics listener: `/livez`, `/readyz`, `/metrics`          | e2e ✓         |
 | `metrics`/`doctor`/`main` | Prometheus, diagnostics, bootstrap                                            | e2e/compile ✓ |
 
 ## Verification
diff --git a/Cargo.lock b/Cargo.lock
index f3476ad..ec8bd74 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -314,6 +314,8 @@ dependencies = [
  "divan",
  "ed25519-dalek",
  "figment",
+ "getrandom 0.3.4",
+ "http",
  "http-body-util",
  "hyper",
  "hyper-util",
@@ -323,12 +325,14 @@ dependencies = [
  "pingora-limits",
  "pingora-proxy",
  "prometheus",
+ "rcgen",
  "reqwest",
  "rustls",
  "serde",
  "serde_json",
  "thiserror 2.0.18",
  "tokio",
+ "tokio-rustls",
  "tracing",
  "tracing-subscriber",
  "zeroize",
@@ -1425,6 +1429,7 @@ dependencies = [
  "bytes",
  "futures-channel",
  "futures-core",
+ "h2",
  "http",
  "http-body",
  "httparse",
@@ -2100,6 +2105,16 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "pem"
+version = "3.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be"
+dependencies = [
+ "base64",
+ "serde_core",
+]
+
 [[package]]
 name = "pem-rfc7468"
 version = "0.7.0"
@@ -2743,6 +2758,19 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "rcgen"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75e669e5202259b5314d1ea5397316ad400819437857b90861765f24c4cf80a2"
+dependencies = [
+ "pem",
+ "ring",
+ "rustls-pki-types",
+ "time",
+ "yasna",
+]
+
 [[package]]
 name = "redox_syscall"
 version = "0.5.18"
@@ -4328,6 +4356,15 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
 
+[[package]]
+name = "yasna"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd"
+dependencies = [
+ "time",
+]
+
 [[package]]
 name = "yoke"
 version = "0.8.2"
diff --git a/Cargo.toml b/Cargo.toml
index 5b5467c..8f994ce 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,18 @@ edition = "2024"
 license = "MIT"
 rust-version = "1.85"
 
+# Applied to every crate root in the workspace (lib *and* bin) via `[lints] workspace = true`.
+# Crate-level `#![deny(...)]` attributes only cover the unit they're written in, so a binary root
+# (`main.rs`) would otherwise escape the library's denies — this closes that gap structurally.
+[workspace.lints.rust]
+unsafe_code = "forbid"
+unused_must_use = "deny"
+
+# Release builds wrap arithmetic silently by default; turn that into a panic so an overflow on a
+# size/count never goes unnoticed. Negligible cost for a proxy (arithmetic isn't the bottleneck).
+[profile.release]
+overflow-checks = true
+
 [workspace.dependencies]
 # slipstream is published — consume it from crates.io, aliased to `store` so the code's
 # `use store::...` is unchanged. No path deps into the `beyond` repo: this crate builds standalone.
@@ -26,6 +38,10 @@ bytes = "1"
 clap = { version = "4", features = ["derive", "env"] }
 ed25519-dalek = "2.2"
 figment = { version = "0.10", features = ["toml", "env"] }
+getrandom = "0.3"
+# The types Pingora's `ServeHttp` trait speaks (`Response<Vec<u8>>`); pin to the same 1.x already in
+# the tree via Pingora so the admin app can name them directly.
+http = "1"
 memchr = "2"
 prometheus = "0.13"
 rustls = { version = "0.23", default-features = false, features = ["ring"] }
diff --git a/config.example.toml b/config.example.toml
index 9063339..d31943a 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -2,7 +2,7 @@
 # (e.g. `AI_NATS_URL`, `AI_POOL_KEY_OPENAI`, `AI_READ_TIMEOUT_SECS`). Values below are defaults.
 
 listen = "0.0.0.0:8080" # client (app) traffic; internal-only, fronted as ai.internal
-metrics_listen = "0.0.0.0:9090" # Prometheus /metrics
+metrics_listen = "0.0.0.0:9090" # internal admin: /metrics (Prometheus), /livez, /readyz
 
 # NATS / slipstream — carries ONLY the deny-set (`blackhole.*`). Auth + keys come from this file,
 # so the gateway authenticates + serves managed traffic even if NATS is down.
diff --git a/crates/gateway/Cargo.toml b/crates/gateway/Cargo.toml
index 75a0b52..f04fd8e 100644
--- a/crates/gateway/Cargo.toml
+++ b/crates/gateway/Cargo.toml
@@ -6,6 +6,9 @@ license.workspace = true
 rust-version.workspace = true
 description = "Beyond AI gateway — egress L7 proxy to LLM providers"
 
+[lints]
+workspace = true
+
 [lib]
 name = "beyond_ai"
 path = "src/lib.rs"
@@ -29,6 +32,8 @@ bytes = { workspace = true }
 clap = { workspace = true }
 ed25519-dalek = { workspace = true }
 figment = { workspace = true }
+getrandom = { workspace = true }
+http = { workspace = true }
 memchr = { workspace = true }
 prometheus = { workspace = true }
 rustls = { workspace = true }
@@ -49,9 +54,14 @@ base64 = { workspace = true }
 criterion = { version = "0.5", features = ["async_tokio"] }
 divan = "0.1"
 http-body-util = "0.1"
-hyper = { version = "1", features = ["server", "http1"] }
-hyper-util = { version = "0.1", features = ["tokio"] }
+# `http2` + hyper-util's `server-auto` let the mock upstream serve H1 *and* H2 on one TLS listener
+# (protocol chosen by ALPN), so the concurrency bench can drive the gateway's H2 path. `rcgen` mints a
+# throwaway self-signed cert for that listener; `tokio-rustls` terminates TLS in front of hyper.
+hyper = { version = "1", features = ["server", "http1", "http2"] }
+hyper-util = { version = "0.1", features = ["tokio", "server-auto"] }
+rcgen = "0.13"
 reqwest = { version = "0.13", default-features = false, features = ["json", "rustls"] }
+tokio-rustls = "0.26"
 
 [[bench]]
 name = "unit"
diff --git a/crates/gateway/benches/e2e.rs b/crates/gateway/benches/e2e.rs
index 68ab389..5338c4a 100644
--- a/crates/gateway/benches/e2e.rs
+++ b/crates/gateway/benches/e2e.rs
@@ -17,7 +17,7 @@ mod common;
 
 use std::time::Duration;
 
-use criterion::{Criterion, Throughput, criterion_group, criterion_main};
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
 use tokio::runtime::Runtime;
 use tokio::task::JoinSet;
 
@@ -214,5 +214,144 @@ fn bench_e2e(c: &mut Criterion) {
     drop(sse_stack);
 }
 
-criterion_group!(benches, bench_e2e);
+/// Concurrency levels swept by `bench_concurrency`. Spans below and above hyper's default
+/// `SETTINGS_MAX_CONCURRENT_STREAMS` (200) so an H2 stream-concurrency cliff (if any) shows up against
+/// H1's connection pool.
+const SWEEP: &[u64] = &[1, 8, 32, 128, 512];
+
+/// Fire `conc` managed requests at `url` concurrently and drain each body (returns the connection to
+/// the pool). This is one bench iteration; `Throughput::Elements(conc)` makes criterion report req/s.
+async fn drive(client: &reqwest::Client, url: &str, vkey: &str, conc: u64) {
+    let mut set = JoinSet::new();
+    for _ in 0..conc {
+        let (c, u, k) = (client.clone(), url.to_string(), vkey.to_string());
+        set.spawn(async move {
+            let resp = c
+                .post(format!("{u}/v1/chat/completions"))
+                .header("authorization", format!("Bearer {k}"))
+                .header("content-type", "application/json")
+                .body(MANAGED_BODY)
+                .send()
+                .await
+                .expect("request");
+            let _ = resp.bytes().await.expect("body");
+        });
+    }
+    while let Some(r) = set.join_next().await {
+        r.expect("task");
+    }
+}
+
+/// Warm a gateway until it answers 200, then return the protocol it used to reach the upstream — read
+/// from the `x-mock-proto` header the TLS mock stamps and the gateway relays. This is the proof the
+/// "h2"/"h1" bench labels reflect what actually negotiated, not just what we configured.
+async fn warm_and_proto(client: &reqwest::Client, url: &str, vkey: &str) -> String {
+    {
+        let (c, u, k) = (client.clone(), url.to_string(), vkey.to_string());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                c.post(format!("{u}/v1/chat/completions"))
+                    .header("authorization", format!("Bearer {k}"))
+                    .header("content-type", "application/json")
+                    .body(MANAGED_BODY)
+                    .send()
+                    .await
+                    .map(|r| r.status().as_u16())
+                    .unwrap_or(0)
+            }
+        })
+        .await;
+    }
+    let resp = client
+        .post(format!("{url}/v1/chat/completions"))
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(MANAGED_BODY)
+        .send()
+        .await
+        .expect("warm request");
+    resp.headers()
+        .get("x-mock-proto")
+        .and_then(|v| v.to_str().ok())
+        .unwrap_or("unknown")
+        .to_string()
+}
+
+/// H2-vs-H1 to the upstream, under concurrency. One TLS+H2 mock; two gateways against it — one with
+/// `upstream_http2 = true` (ALPN H2H1 → h2), one `false` (ALPN H1). Same client→gateway transport
+/// (plain H1) for both, so the only variable is the gateway→upstream protocol. The sweep exposes
+/// whether H2 multiplexing wins or hits its stream-concurrency cap vs H1's connection pool.
+fn bench_concurrency(c: &mut Criterion) {
+    let rt = Runtime::new().expect("tokio runtime");
+    let nats = rt.block_on(Nats::start());
+    let mock = rt.block_on(MockUpstream::start_tls(Mode::Json));
+    let (pubkey, sk) = test_keypair(1);
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 42,
+            vpc_id: 7,
+        },
+        1,
+        &sk,
+    );
+
+    // Two gateways at the same self-signed TLS mock; ALPN is the only difference. Rate limits OFF
+    // (both tiers): the sweep drives one credential well past the 100 rps default, and a rate-limited
+    // 429 short-circuits *before* the upstream — it would measure the reject path, not H2-vs-H1.
+    let gw_h2 = rt.block_on(
+        Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+            .tls_upstream()
+            .upstream_http2(true)
+            .rate_limit_rps(0)
+            .byo_rate_limit_rps(0)
+            .start(),
+    );
+    let gw_h1 = rt.block_on(
+        Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+            .tls_upstream()
+            .upstream_http2(false)
+            .rate_limit_rps(0)
+            .byo_rate_limit_rps(0)
+            .start(),
+    );
+    let client = reqwest::Client::new();
+    let (url_h2, url_h1) = (gw_h2.url(), gw_h1.url());
+
+    // Prove the gateways actually negotiated what we asked for before trusting the labels.
+    let proto_h2 = rt.block_on(warm_and_proto(&client, &url_h2, &vkey));
+    let proto_h1 = rt.block_on(warm_and_proto(&client, &url_h1, &vkey));
+    assert_eq!(
+        proto_h2, "h2",
+        "upstream_http2=true should negotiate h2 to the mock"
+    );
+    assert_eq!(
+        proto_h1, "http/1.1",
+        "upstream_http2=false should stay http/1.1 to the mock"
+    );
+    eprintln!("e2e_concurrency: confirmed gw_h2→upstream=h2, gw_h1→upstream=http/1.1");
+
+    let mut group = c.benchmark_group("e2e_concurrency");
+    group.sample_size(10);
+    group.measurement_time(Duration::from_secs(6));
+    for &conc in SWEEP {
+        group.throughput(Throughput::Elements(conc));
+        group.bench_with_input(BenchmarkId::new("h2", conc), &conc, |b, &conc| {
+            b.to_async(&rt)
+                .iter(|| drive(&client, &url_h2, &vkey, conc));
+        });
+        group.bench_with_input(BenchmarkId::new("h1", conc), &conc, |b, &conc| {
+            b.to_async(&rt)
+                .iter(|| drive(&client, &url_h1, &vkey, conc));
+        });
+    }
+    group.finish();
+
+    drop(gw_h2);
+    drop(gw_h1);
+    drop(mock);
+    drop(nats);
+}
+
+criterion_group!(benches, bench_e2e, bench_concurrency);
 criterion_main!(benches);
diff --git a/crates/gateway/src/admin.rs b/crates/gateway/src/admin.rs
new file mode 100644
index 0000000..ecdf542
--- /dev/null
+++ b/crates/gateway/src/admin.rs
@@ -0,0 +1,71 @@
+//! Admin / observability HTTP surface served on the metrics listener: `/livez`, `/readyz`,
+//! `/metrics`.
+//!
+//! Matches the Beyond service convention (cf. `auth`, `objects`): the body is `{"status",
+//! "version"}` and there are two probes. Both return 200 once the process is answering, because
+//! the gateway is **fail-open by design** — auth + key swap come from boot config, and a NATS
+//! outage degrades only the (stale) deny-set, never the ability to serve. So readiness must *not*
+//! gate on NATS: a cold boot with NATS down can still serve correctly, and reporting not-ready
+//! would pull a healthy gateway out of the load balancer for no reason. Readiness here therefore
+//! means "listeners up + boot config loaded" — which is true the instant we can answer this
+//! request (state is built before the server starts; a build failure `exit`s in `main`).
+//! `readyz` is kept distinct from `livez` only to honor the orchestrator's two-probe convention.
+//!
+//! Implemented as a Pingora `ServeHttp` app so all three paths share the one (internal) metrics
+//! port — Pingora's built-in prometheus service only serves `/metrics`, so we hand-route all three.
+
+use async_trait::async_trait;
+use http::Response;
+use pingora_core::apps::http_app::ServeHttp;
+use pingora_core::protocols::http::ServerSession;
+use prometheus::{Encoder, TextEncoder};
+
+/// Compile-time service version, surfaced in every health body (matches the sibling services).
+const VERSION: &str = env!("CARGO_PKG_VERSION");
+
+pub struct AdminApp;
+
+impl AdminApp {
+    /// Build a `{"status","version"}` JSON response. `status` is the HTTP code; `health` is the
+    /// body's `"status"` string (`"ok"`, or `"not_found"` for unrouted paths). Header values are
+    /// all static or integer, so the builder can't fail — `expect` documents that invariant.
+    fn health(status: u16, health: &str) -> Response<Vec<u8>> {
+        let body = serde_json::json!({ "status": health, "version": VERSION })
+            .to_string()
+            .into_bytes();
+        Response::builder()
+            .status(status)
+            .header(http::header::CONTENT_TYPE, "application/json")
+            .header(http::header::CONTENT_LENGTH, body.len())
+            .body(body)
+            .expect("static health response is always valid")
+    }
+
+    /// Encode the default Prometheus registry as text (same output as Pingora's built-in service).
+    fn metrics() -> Response<Vec<u8>> {
+        let encoder = TextEncoder::new();
+        let mut buffer = Vec::new();
+        // `encode` errors on a writer failure (never for a `Vec`) or a malformed metric family,
+        // which `gather()` is not expected to yield — discard it (crate denies `unused_must_use`).
+        let _ = encoder.encode(&prometheus::gather(), &mut buffer);
+        Response::builder()
+            .status(200)
+            .header(http::header::CONTENT_TYPE, encoder.format_type())
+            .header(http::header::CONTENT_LENGTH, buffer.len())
+            .body(buffer)
+            .expect("metrics response is always valid")
+    }
+}
+
+#[async_trait]
+impl ServeHttp for AdminApp {
+    async fn response(&self, session: &mut ServerSession) -> Response<Vec<u8>> {
+        match session.req_header().uri.path() {
+            // Liveness + readiness are the same signal here (see module docs): the gateway is
+            // fail-open, so "can answer" ⇒ "can serve". Both 200 once the process is up.
+            "/livez" | "/readyz" => Self::health(200, "ok"),
+            "/metrics" => Self::metrics(),
+            _ => Self::health(404, "not_found"),
+        }
+    }
+}
diff --git a/crates/gateway/src/config.rs b/crates/gateway/src/config.rs
index b287528..2a40bae 100644
--- a/crates/gateway/src/config.rs
+++ b/crates/gateway/src/config.rs
@@ -70,6 +70,21 @@ pub struct AiConfig {
     /// to talk to a plaintext mock.
     pub upstream_tls: bool,
 
+    /// Prefer HTTP/2 (with HTTP/1.1 fallback) to the upstream. `true` ⇒ peer ALPN `H2H1`: every
+    /// provider that offers `h2` over TLS is reached over a multiplexed H2 connection (fewer sockets
+    /// and TLS handshakes from our egress IPs), and any host that doesn't offer it negotiates down to
+    /// H1. `false` ⇒ ALPN `H1` (one connection per in-flight request, pooled). The knob exists so an
+    /// operator can fall back to H1 without a code redeploy if a provider's h2 stack misbehaves, and
+    /// so the e2e concurrency bench can compare the two. Only consulted over TLS — a plaintext upstream
+    /// (the mock) has no ALPN and is always H1 regardless.
+    pub upstream_http2: bool,
+
+    /// Verify the upstream's TLS certificate (and that it matches the SNI). `true` everywhere in
+    /// production. The **only** intended `false` is the e2e concurrency bench, whose TLS mock presents
+    /// a self-signed cert — turning verification off there lets us exercise the real TLS+ALPN+H2 path
+    /// against a local mock without a CA. Never set this `false` against a real provider.
+    pub upstream_verify_cert: bool,
+
     /// Per-credential request-rate ceiling (requests/sec). A blast-radius guardrail (see `ratelimit`),
     /// not a spend control: it caps how fast a single credential (managed virtual key ≈ a `(tenant,
     /// app)`, or a BYO token) can drive the gateway, bounding a leaked/runaway key during the
@@ -111,6 +126,11 @@ impl Default for AiConfig {
             write_timeout_secs: 60,
             idle_timeout_secs: 90,
             upstream_tls: true,
+            // Prefer H2 to providers by default (all of `KNOWN_PROVIDERS` offer it; H1 fallback is
+            // automatic). Flip to false for an all-H1 upstream without recompiling.
+            upstream_http2: true,
+            // Verify upstream certs by default; only the bench's self-signed TLS mock turns this off.
+            upstream_verify_cert: true,
             // Generous per-credential circuit breaker, on by default. Won't touch legitimate
             // steady-state traffic; caps a runaway/leaked key or a retry-storm flood. Set 0 to disable.
             rate_limit_rps: 100,
diff --git a/crates/gateway/src/doctor.rs b/crates/gateway/src/doctor.rs
index 1db5852..3effe4e 100644
--- a/crates/gateway/src/doctor.rs
+++ b/crates/gateway/src/doctor.rs
@@ -1,6 +1,14 @@
 //! Diagnostics (PATTERNS.md `doctor` pattern): fast prerequisite checks, exit 0/1.
+//!
+//! The point is to catch a misconfiguration *before* traffic lands on the instance, where it would
+//! otherwise surface as a first-request failure (a 401 from an empty keyring, a 503 from a missing
+//! pool key, a 502 from an unresolvable provider). We check the things boot does lazily or never:
+//! NATS reachability, the signing keyring, managed pool keys, and provider DNS.
 
 use crate::config::AiConfig;
+use crate::route;
+use std::collections::BTreeMap;
+use std::time::Duration;
 
 pub struct CheckResult {
     pub name: &'static str,
@@ -46,9 +54,112 @@ pub async fn run_checks(config: &AiConfig) -> Vec<CheckResult> {
         )),
     }
 
+    out.push(check_signing_keys(config));
+    out.push(check_pool_keys(config));
+    out.extend(check_provider_dns(config).await);
+
     out
 }
 
+/// The signing keyring is what authenticates managed traffic. An empty or invalid keyring isn't a
+/// hard boot failure (the gateway still serves BYO), but it silently turns *every* `bai_…` key into a
+/// 401 — a footgun worth surfacing loudly here. `build_keyring` already rejects a non-numeric kid or
+/// an unparseable public key, so a success means every configured key installed.
+fn check_signing_keys(config: &AiConfig) -> CheckResult {
+    match config.build_keyring() {
+        Ok(ring) if ring.is_empty() => fail(
+            "signing_keys",
+            "no signing keys configured — all managed (bai_…) traffic will 401, only BYO works",
+            "set [signing_keys] (kid → base64 Ed25519 public key) in config or AI_ env",
+        ),
+        Ok(ring) => pass(
+            "signing_keys",
+            format!("{} signing key(s) loaded", ring.len()),
+        ),
+        Err(e) => fail(
+            "signing_keys",
+            e.to_string(),
+            "every kid must be numeric and every value a base64 (or raw 32-byte) Ed25519 public key",
+        ),
+    }
+}
+
+/// Pool keys back managed traffic (swapped in per provider). Cross-check against the keyring: if
+/// signing keys are present the operator *intends* to serve managed traffic, so zero pool keys means
+/// every managed request 503s — a real misconfiguration. A pure-BYO deployment (no signing keys) with
+/// no pool keys is legitimate, so that case passes with a note instead of failing.
+fn check_pool_keys(config: &AiConfig) -> CheckResult {
+    let mut names: Vec<&str> = config.pool_keys.keys().map(String::as_str).collect();
+    names.sort_unstable();
+    let managed_intended = !config.signing_keys.is_empty();
+    match (names.is_empty(), managed_intended) {
+        (true, true) => fail(
+            "pool_keys",
+            "signing keys are configured (managed traffic expected) but no pool keys are set — \
+             every managed request will 503",
+            "set AI_POOL_KEY_<PROVIDER> (e.g. AI_POOL_KEY_OPENAI) for each provider you serve",
+        ),
+        (true, false) => pass(
+            "pool_keys",
+            "none configured (BYO-only deployment — no signing keys either)",
+        ),
+        (false, _) => pass("pool_keys", format!("pool keys for: {}", names.join(", "))),
+    }
+}
+
+/// Resolve every provider authority the gateway might dial (known providers + config overrides/adds),
+/// so a DNS or typo'd-authority problem shows up here rather than as a 502 on the first request. Each
+/// lookup is bounded so one black-holed host can't hang the doctor. We don't connect (no auth, no TLS
+/// handshake) — reachability of the *name* is the prerequisite; live auth is proven by the smoke test.
+async fn check_provider_dns(config: &AiConfig) -> Vec<CheckResult> {
+    // Effective authority per provider name: the known default unless config overrides it, plus any
+    // config-only provider. A BTreeMap dedups and keeps the output stable/ordered.
+    let mut authorities: BTreeMap<&str, String> = BTreeMap::new();
+    for spec in route::KNOWN_PROVIDERS {
+        authorities.insert(spec.name, spec.authority.to_string());
+    }
+    for (name, authority) in &config.provider_authorities {
+        authorities.insert(name.as_str(), authority.clone());
+    }
+
+    let mut results = Vec::with_capacity(authorities.len());
+    for (name, authority) in authorities {
+        // `CheckResult.name` is `&'static str`: a known provider lends its static name; a config-only
+        // provider (non-'static) reports under a generic label, with the real name in the message.
+        let check_name: &'static str = route::KNOWN_PROVIDERS
+            .iter()
+            .find(|s| s.name == name)
+            .map_or("provider_dns", |s| s.name);
+        let lookup = tokio::time::timeout(
+            Duration::from_secs(3),
+            tokio::net::lookup_host(authority.clone()),
+        )
+        .await;
+        let res = match lookup {
+            Ok(Ok(mut addrs)) => match addrs.next() {
+                Some(addr) => pass(check_name, format!("{name} → {authority} ({addr})")),
+                None => fail(
+                    check_name,
+                    format!("{name}: {authority} resolved to no addresses"),
+                    "check the provider authority (host:port) in provider_authorities",
+                ),
+            },
+            Ok(Err(e)) => fail(
+                check_name,
+                format!("{name}: {authority}: {e}"),
+                "check the provider authority (host:port) and DNS",
+            ),
+            Err(_) => fail(
+                check_name,
+                format!("{name}: {authority}: DNS lookup timed out (>3s)"),
+                "the upstream host may be unreachable or DNS is slow",
+            ),
+        };
+        results.push(res);
+    }
+    results
+}
+
 pub fn print_results(title: &str, results: &[CheckResult]) {
     println!("== {title} ==");
     for r in results {
@@ -59,3 +170,58 @@ pub fn print_results(title: &str, results: &[CheckResult]) {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::secret::Secret;
+    use std::collections::HashMap;
+
+    #[test]
+    fn signing_keys_empty_fails() {
+        // No keys ⇒ every managed token 401s; doctor must flag it, not pass silently.
+        let c = AiConfig::default();
+        assert!(!check_signing_keys(&c).passed);
+    }
+
+    #[test]
+    fn signing_keys_valid_passes() {
+        let c = AiConfig {
+            // 32 zero bytes, base64 — a structurally valid Ed25519 public key.
+            signing_keys: HashMap::from([(
+                "1".to_string(),
+                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(),
+            )]),
+            ..Default::default()
+        };
+        assert!(check_signing_keys(&c).passed);
+    }
+
+    #[test]
+    fn pool_keys_missing_with_signing_keys_fails() {
+        // Signing keys present (managed intended) but no pool keys ⇒ every managed request 503s.
+        let c = AiConfig {
+            signing_keys: HashMap::from([(
+                "1".to_string(),
+                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(),
+            )]),
+            ..Default::default()
+        };
+        assert!(!check_pool_keys(&c).passed);
+    }
+
+    #[test]
+    fn pool_keys_absent_byo_only_passes() {
+        // No signing keys and no pool keys is a legitimate BYO-only deployment — must not fail.
+        assert!(check_pool_keys(&AiConfig::default()).passed);
+    }
+
+    #[test]
+    fn pool_keys_present_passes() {
+        let c = AiConfig {
+            pool_keys: HashMap::from([("openai".to_string(), Secret::new("sk-x"))]),
+            ..Default::default()
+        };
+        assert!(check_pool_keys(&c).passed);
+    }
+}
diff --git a/crates/gateway/src/lib.rs b/crates/gateway/src/lib.rs
index 0c529d3..7f4ed49 100644
--- a/crates/gateway/src/lib.rs
+++ b/crates/gateway/src/lib.rs
@@ -4,11 +4,12 @@
 //! (virtual-key verification, deny-set, usage parsing, routing, request peek) lives in modules
 //! free of Pingora/IO so it is unit-tested without a running proxy or live providers.
 
-// Application crate: no `unsafe` is needed, so forbid it outright. `unused_must_use` is denied so
-// a dropped `Result` (e.g. an unchecked `write_response_*`) is a hard error, not a silent swallow.
-#![deny(unsafe_code)]
-#![deny(unused_must_use)]
+// Lint gates (`unsafe_code = "forbid"`, `unused_must_use = "deny"`) live in `[workspace.lints]` so
+// they apply to *both* crate roots — this lib and the `main.rs` binary — not just whichever unit
+// carries a crate-level `#![deny]`. A dropped `Result` (e.g. an unchecked `write_response_*`) is
+// therefore a hard error, and `unsafe` is forbidden, everywhere in the crate.
 
+pub mod admin;
 pub mod config;
 pub mod deny;
 pub mod doctor;
diff --git a/crates/gateway/src/main.rs b/crates/gateway/src/main.rs
index 01e662b..43de521 100644
--- a/crates/gateway/src/main.rs
+++ b/crates/gateway/src/main.rs
@@ -1,5 +1,6 @@
 //! Beyond AI gateway binary: clap `Run`/`Doctor`, Pingora server bootstrap, services.
 
+use beyond_ai::admin::AdminApp;
 use beyond_ai::config::AiConfig;
 use beyond_ai::doctor;
 use beyond_ai::metrics::Metrics;
@@ -7,8 +8,10 @@ use beyond_ai::proxy::AiProxy;
 use beyond_ai::state::GatewayState;
 use beyond_ai::store_watch::WatcherService;
 use clap::{Parser, Subcommand};
+use pingora_core::apps::http_app::HttpServer;
 use pingora_core::server::Server;
 use pingora_core::services::background::background_service;
+use pingora_core::services::listening::Service as ListeningService;
 use pingora_proxy::http_proxy_service;
 use std::path::Path;
 use std::process::exit;
@@ -60,9 +63,9 @@ fn init_tracing() {
 
 fn main() {
     // rustls 0.23 requires a process-wide crypto provider for the TLS connections to providers.
-    rustls::crypto::ring::default_provider()
-        .install_default()
-        .ok();
+    // Idempotent: an `Err` means a provider is already installed (e.g. a second init in tests),
+    // which is fine to ignore — the provider we want is in place either way.
+    let _ = rustls::crypto::ring::default_provider().install_default();
 
     let cli = Cli::parse();
 
@@ -122,10 +125,11 @@ fn main() {
         },
     ));
 
-    // Prometheus /metrics (serves the default registry that `Metrics` registered on).
-    let mut prom = pingora::services::listening::Service::prometheus_http_service();
-    prom.add_tcp(&metrics_listen);
-    server.add_service(prom);
+    // Metrics listener now also serves /livez + /readyz for the ECS/k8s probes. Pingora's built-in
+    // prometheus service only does /metrics, so we hand-route all three in one small ServeHttp.
+    let mut admin = ListeningService::new("ai-admin".to_string(), HttpServer::new_app(AdminApp));
+    admin.add_tcp(&metrics_listen);
+    server.add_service(admin);
 
     tracing::info!(%listen, %metrics_listen, "starting beyond-ai");
     server.run_forever();
diff --git a/crates/gateway/src/metrics.rs b/crates/gateway/src/metrics.rs
index 81ee864..9fe8e71 100644
--- a/crates/gateway/src/metrics.rs
+++ b/crates/gateway/src/metrics.rs
@@ -4,7 +4,7 @@
 //! exposes them with no extra wiring. `Metrics::new` is called exactly once (in `main`).
 
 use prometheus::{
-    Histogram, HistogramOpts, IntCounter, IntCounterVec, IntGauge, Opts, default_registry,
+    HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, Opts, default_registry,
 };
 use std::sync::Arc;
 
@@ -12,13 +12,56 @@ pub struct Metrics {
     pub requests_total: IntCounter,
     /// Labeled by reason ("auth", "deny_spend", "deny_fraud") so we can see *why* we rejected.
     pub rejections_total: IntCounterVec,
-    /// Labeled by kind: input|output.
+    /// Upstream responses by provider + status class ("2xx"/"4xx"/"5xx"). A provider degrading
+    /// (429/5xx) is otherwise invisible until it surfaces as latency or missing usage events —
+    /// this is the per-provider error-rate signal an oncall pages on.
+    pub upstream_responses_total: IntCounterVec,
+    /// Upstream **connect** retries by provider (see `proxy::fail_to_connect`). A partially-down
+    /// provider TCP layer (or an egress-IP ban) silently retries up to `MAX_CONNECT_RETRIES` times
+    /// per request; without this, the extra latency looks like a slow provider, not a connect
+    /// problem. Pairs with a `warn!` on the same path so the dashboard spike has a log to grep.
+    pub connect_retries_total: IntCounterVec,
+    /// Labeled by kind: input|output|cache_read|cache_write. Cache tokens are also in the `ai.usage`
+    /// billing log, but that ships with lag — the Prometheus counter is the alerting surface for
+    /// "cache hit rate fell off a cliff after a deploy" (Anthropic bills cache writes at ≈ 1.25–2×
+    /// input and cache reads at ≈ 0.1×, so a regression is a real cost event, not just latency).
     pub tokens_total: IntCounterVec,
-    pub ttft_seconds: Histogram,
-    pub upstream_latency_seconds: Histogram,
+    /// Labeled by provider: TTFT varies by an order of magnitude across providers (Groq/Cerebras
+    /// <100ms vs. a large Anthropic/xAI model at seconds), so an unlabeled histogram can't tell you
+    /// *which* provider's first-token time regressed.
+    pub ttft_seconds: HistogramVec,
+    /// Labeled by provider, same rationale as `ttft_seconds`: full-request duration is dominated by
+    /// the model's generation time, which is per-provider.
+    pub upstream_latency_seconds: HistogramVec,
     pub active_streams: IntGauge,
+    /// Total in-flight requests (streaming + non-streaming), incremented once a request is admitted
+    /// in `request_filter` and decremented in `logging`. `active_streams` only covers SSE; under a
+    /// burst or a stalled upstream this is what distinguishes "high rps, fast upstreams" from
+    /// "connections piling up" — the difference between a perf blip and a connection-exhaustion
+    /// incident.
+    pub requests_in_flight: IntGauge,
+    /// Current deny-set cardinality (denied tenants). The set is `O(denied)` and fed from NATS; a
+    /// fraud event or a control-plane bug that mass-denies tenants would otherwise grow it invisibly
+    /// until it shows up as memory pressure. Updated on every seed and every applied delta.
+    pub deny_set_size: IntGauge,
+    /// NATS connectivity for the deny-set watcher (1 = connected, 0 = disconnected). The gateway is
+    /// fail-open — it serves on the last-known set when NATS is down — so staleness is otherwise
+    /// silent; this is the metric to alert "deny-set has been stale for >N minutes" on.
+    pub nats_connected: IntGauge,
 }
 
+/// TTFT buckets (seconds). Tuned for LLM latency: sub-second prompts up through the multi-second
+/// first-token times of large models. The default prometheus buckets top out at 10s, but TTFT for a
+/// busy model can exceed that, so the tail goes to 30s.
+const TTFT_BUCKETS: &[f64] = &[0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0];
+
+/// Full-request duration buckets (seconds). A streaming completion runs far longer than the
+/// default 10s ceiling (`read_timeout_secs` defaults to 600), so the tail reaches 600s — without
+/// these, every long stream lands in `+Inf` and the p99/p999 tail is unrecoverable.
+const LATENCY_BUCKETS: &[f64] = &[
+    0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0, 600.0,
+];
+
 impl Metrics {
     /// Build and register every metric on the default registry. Fallible: registering a name that
     /// already exists (a second `Metrics::new()` against the process-wide default registry) returns
@@ -33,35 +76,74 @@ impl Metrics {
             Opts::new("ai_rejections_total", "Requests rejected before upstream"),
             &["reason"],
         )?;
+        let upstream_responses_total = IntCounterVec::new(
+            Opts::new(
+                "ai_upstream_responses_total",
+                "Upstream responses by provider and status class",
+            ),
+            &["provider", "status"],
+        )?;
+        let connect_retries_total = IntCounterVec::new(
+            Opts::new(
+                "ai_connect_retries_total",
+                "Upstream connect retries by provider",
+            ),
+            &["provider"],
+        )?;
         let tokens_total =
             IntCounterVec::new(Opts::new("ai_tokens_total", "Tokens metered"), &["kind"])?;
-        let ttft_seconds = Histogram::with_opts(HistogramOpts::new(
-            "ai_ttft_seconds",
-            "Time to first byte from upstream",
-        ))?;
-        let upstream_latency_seconds = Histogram::with_opts(HistogramOpts::new(
-            "ai_upstream_latency_seconds",
-            "Full upstream request duration",
-        ))?;
+        let ttft_seconds = HistogramVec::new(
+            HistogramOpts::new("ai_ttft_seconds", "Time to first byte from upstream")
+                .buckets(TTFT_BUCKETS.to_vec()),
+            &["provider"],
+        )?;
+        let upstream_latency_seconds = HistogramVec::new(
+            HistogramOpts::new(
+                "ai_upstream_latency_seconds",
+                "Full upstream request duration",
+            )
+            .buckets(LATENCY_BUCKETS.to_vec()),
+            &["provider"],
+        )?;
         let active_streams = IntGauge::with_opts(Opts::new(
             "ai_active_streams",
             "In-flight streaming responses",
         ))?;
+        let requests_in_flight = IntGauge::with_opts(Opts::new(
+            "ai_requests_in_flight",
+            "In-flight requests (streaming + non-streaming)",
+        ))?;
+        let deny_set_size =
+            IntGauge::with_opts(Opts::new("ai_deny_set_size", "Currently denied tenants"))?;
+        let nats_connected = IntGauge::with_opts(Opts::new(
+            "ai_nats_connected",
+            "Deny-set watcher NATS connectivity (1=connected, 0=disconnected)",
+        ))?;
 
         r.register(Box::new(requests_total.clone()))?;
         r.register(Box::new(rejections_total.clone()))?;
+        r.register(Box::new(upstream_responses_total.clone()))?;
+        r.register(Box::new(connect_retries_total.clone()))?;
         r.register(Box::new(tokens_total.clone()))?;
         r.register(Box::new(ttft_seconds.clone()))?;
         r.register(Box::new(upstream_latency_seconds.clone()))?;
         r.register(Box::new(active_streams.clone()))?;
+        r.register(Box::new(requests_in_flight.clone()))?;
+        r.register(Box::new(deny_set_size.clone()))?;
+        r.register(Box::new(nats_connected.clone()))?;
 
         Ok(Arc::new(Self {
             requests_total,
             rejections_total,
+            upstream_responses_total,
+            connect_retries_total,
             tokens_total,
             ttft_seconds,
             upstream_latency_seconds,
             active_streams,
+            requests_in_flight,
+            deny_set_size,
+            nats_connected,
         }))
     }
 }
diff --git a/crates/gateway/src/proxy.rs b/crates/gateway/src/proxy.rs
index fdd49f7..ee48efb 100644
--- a/crates/gateway/src/proxy.rs
+++ b/crates/gateway/src/proxy.rs
@@ -39,11 +39,16 @@ use async_trait::async_trait;
 use bytes::Bytes;
 use pingora::http::ResponseHeader;
 use pingora_core::Result;
+use pingora_core::protocols::ALPN;
 use pingora_core::upstreams::peer::HttpPeer;
 use pingora_proxy::{ProxyHttp, Session};
 use std::sync::Arc;
 use std::time::{Duration, Instant};
-use tracing::info;
+use tracing::{info, warn};
+
+/// Response header carrying the per-request id (`{instance}-{seq}`). Set on both the proxied
+/// response and every reject response so a client can quote it and an oncall can grep for it.
+const REQUEST_ID_HEADER: &str = "x-beyond-request-id";
 
 /// Reject requests whose declared Content-Length exceeds this. The body itself is **not** buffered
 /// (it streams straight through); this is purely an abuse guard checked up front via the header.
@@ -109,19 +114,35 @@ pub struct RequestCtx {
     start: Instant,
     /// Connect-retry counter (see `fail_to_connect`).
     attempt: u8,
+    /// Process-unique id for this request (`{instance}-{seq}`), echoed in the `x-beyond-request-id`
+    /// response header and the `ai.usage` event so a client report ties back to a log line.
+    request_id: String,
 }
 
 impl AiProxy {
     /// Write a small JSON error and signal `request_filter` to short-circuit. The body is built with
     /// `serde_json` (not `format!`) so a `typ`/`msg` containing `"` or `\` can never break out of the
     /// JSON structure — keeps this safe if a future caller passes a non-literal message.
-    async fn reject(session: &mut Session, status: u16, typ: &str, msg: &str) -> Result<bool> {
+    ///
+    /// Every rejection logs one structured `warn` line (the rejection counter only says *how many*,
+    /// not *which request* — this is what an oncall greps when a `deny_fraud`/`rate_limit` spike
+    /// shows on the dashboard) and echoes the `request_id` in a response header so a client report
+    /// quoting that id lands on this line.
+    async fn reject(
+        session: &mut Session,
+        request_id: &str,
+        status: u16,
+        typ: &str,
+        msg: &str,
+    ) -> Result<bool> {
+        warn!(request_id, status, error_type = typ, "request rejected");
         let body = Bytes::from(
             serde_json::json!({ "error": { "type": typ, "message": msg } }).to_string(),
         );
         let mut resp = ResponseHeader::build(status, None)?;
         resp.insert_header("content-type", "application/json")?;
         resp.insert_header("content-length", body.len().to_string())?;
+        resp.insert_header(REQUEST_ID_HEADER, request_id)?;
         session.write_response_header(Box::new(resp), false).await?;
         session.write_response_body(Some(body), true).await?;
         Ok(true)
@@ -214,6 +235,10 @@ impl ProxyHttp for AiProxy {
     async fn request_filter(&self, session: &mut Session, ctx: &mut Self::CTX) -> Result<bool> {
         self.state.metrics.requests_total.inc();
         let start = Instant::now();
+        // One id per request, generated before any reject path so even a 400/401 carries it (in the
+        // log line and the `x-beyond-request-id` header). Moved into `ctx` at the end for the
+        // admitted path. Cheap: a counter bump + a short `format!` (see `next_request_id`).
+        let request_id = self.state.next_request_id();
 
         // 1. Resolve the upstream provider first — from the ingress dialect (the body/model isn't
         // available pre-connect), with an explicit `x-beyond-provider` override. Resolving up front
@@ -229,12 +254,26 @@ impl ProxyHttp for AiProxy {
                 .cloned(),
         };
         let Some(provider) = provider else {
-            return Self::reject(session, 400, "invalid_request_error", "unknown provider").await;
+            return Self::reject(
+                session,
+                &request_id,
+                400,
+                "invalid_request_error",
+                "unknown provider",
+            )
+            .await;
         };
 
         // 2. Extract the presented key — a managed virtual key (`bai_…`) or a raw BYO provider token.
         let Some(raw_key) = extract_virtual_key(session) else {
-            return Self::reject(session, 401, "authentication_error", "missing API key").await;
+            return Self::reject(
+                session,
+                &request_id,
+                401,
+                "authentication_error",
+                "missing API key",
+            )
+            .await;
         };
 
         // 3. Rate guardrails (see `ratelimit`), charged on the *raw presented key* **before** any
@@ -252,7 +291,14 @@ impl ProxyHttp for AiProxy {
                     .rejections_total
                     .with_label_values(&[reason.label()])
                     .inc();
-                return Self::reject(session, 429, "rate_limit_error", "rate limit exceeded").await;
+                return Self::reject(
+                    session,
+                    &request_id,
+                    429,
+                    "rate_limit_error",
+                    "rate limit exceeded",
+                )
+                .await;
             }
         }
 
@@ -267,6 +313,7 @@ impl ProxyHttp for AiProxy {
             if len > MAX_REQUEST_BODY {
                 return Self::reject(
                     session,
+                    &request_id,
                     413,
                     "invalid_request_error",
                     "request body too large",
@@ -285,7 +332,14 @@ impl ProxyHttp for AiProxy {
                     .rejections_total
                     .with_label_values(&["auth"])
                     .inc();
-                return Self::reject(session, 401, "authentication_error", "invalid API key").await;
+                return Self::reject(
+                    session,
+                    &request_id,
+                    401,
+                    "authentication_error",
+                    "invalid API key",
+                )
+                .await;
             };
             // Deny-set: O(1), default-allow. The gateway never learns *why*, only the reason code.
             if let Some(reason) = self.state.deny.load().reason(identity.tenant_id) {
@@ -300,6 +354,7 @@ impl ProxyHttp for AiProxy {
                     .inc();
                 return Self::reject(
                     session,
+                    &request_id,
                     reason.http_status(),
                     "access_denied",
                     "tenant is over limit or suspended",
@@ -309,7 +364,14 @@ impl ProxyHttp for AiProxy {
             // The actual `Bearer …`/`x-api-key` value is precomputed in the provider registry and
             // applied in `upstream_request_filter`; here we only confirm a pool key exists.
             if provider.pool_auth_value.is_none() {
-                return Self::reject(session, 503, "api_error", "no provider key available").await;
+                return Self::reject(
+                    session,
+                    &request_id,
+                    503,
+                    "api_error",
+                    "no provider key available",
+                )
+                .await;
             }
             (identity.tenant_id, identity.vpc_id, true)
         } else {
@@ -345,7 +407,12 @@ impl ProxyHttp for AiProxy {
             body_bytes_fed: 0,
             start,
             attempt: 0,
+            request_id,
         });
+        // Admitted: count it in-flight. Balanced by the decrement in `logging`, which runs exactly
+        // once per admitted request (rejected requests leave `ctx` None and never reach that path,
+        // so the gauge can't leak). `active_streams` only covers SSE; this covers every request.
+        self.state.metrics.requests_in_flight.inc();
         Ok(false)
     }
 
@@ -368,9 +435,22 @@ impl ProxyHttp for AiProxy {
         // e2e harness flips `upstream_tls=false` for a plaintext mock).
         let addr = match self.state.resolve(&rc.provider.authority).await {
             Ok(a) => a,
-            Err(_) => {
-                return Err(pingora_core::Error::new_str(
+            Err(e) => {
+                // DNS failures are rare and usually mean a misconfigured `provider_authorities`
+                // override — so keep the diagnostic (provider name + authority + the resolver error,
+                // already formatted into `e`) instead of discarding it behind an opaque static string.
+                // `Error::because` chains `e` as the cause so it shows in the Pingora error log.
+                warn!(
+                    request_id = rc.request_id,
+                    provider = rc.provider.name.as_str(),
+                    authority = rc.provider.authority.as_str(),
+                    error = %e,
                     "upstream dns resolution failed",
+                );
+                return Err(pingora_core::Error::because(
+                    pingora_core::ErrorType::ConnectError,
+                    "upstream dns resolution failed",
+                    e,
                 ));
             }
         };
@@ -379,6 +459,26 @@ impl ProxyHttp for AiProxy {
             self.state.config.upstream_tls,
             rc.provider.host.clone(),
         );
+        // Prefer HTTP/2 to the provider (config `upstream_http2`, default on), fall back to HTTP/1.1.
+        // Every provider in `KNOWN_PROVIDERS` negotiates `h2` over TLS (verified by handshake), and H2
+        // multiplexes many concurrent requests/streams over one connection — fewer sockets and TLS
+        // handshakes from our egress IPs (which also eases the egress-reputation pressure `ratelimit`
+        // guards). `H2H1` is strictly ≥ `H1` on compatibility: ALPN negotiates down to H1 for any host
+        // that doesn't offer h2, and a plaintext upstream (the mock, `upstream_tls=false`) has no ALPN
+        // at all and stays H1. The negotiated protocol is then visible per-request as
+        // `upstream_request.version` (see `upstream_request_filter`), which is what lets the
+        // body-injection path frame correctly. The knob lets an operator force all-H1 without a code
+        // redeploy, and lets the e2e bench compare the two head-to-head.
+        peer.options.alpn = if self.state.config.upstream_http2 {
+            ALPN::H2H1
+        } else {
+            ALPN::H1
+        };
+        // Cert verification is on everywhere except the bench's self-signed TLS mock (see config).
+        if !self.state.config.upstream_verify_cert {
+            peer.options.verify_cert = false;
+            peer.options.verify_hostname = false;
+        }
         peer.options.connection_timeout =
             Some(Duration::from_secs(self.state.config.connect_timeout_secs));
         peer.options.read_timeout = Some(Duration::from_secs(self.state.config.read_timeout_secs));
@@ -429,10 +529,23 @@ impl ProxyHttp for AiProxy {
 
         // Injection-eligible (OpenAI managed stream): the body is rewritten in `request_body_filter`,
         // changing its length, and we can't know the new length here (headers go out before the body
-        // filter runs). So drop the client's `Content-Length` and frame the buffered body as chunked.
+        // filter runs). So drop the client's `Content-Length`; how the now-unknown length is framed
+        // depends on the **negotiated upstream protocol**, which is reliably readable here as
+        // `upstream_request.version`: pingora-proxy sets it to HTTP/2 before this filter on the H2 path
+        // (`proxy_h2.rs`) and to HTTP/1.1 on the H1 path (`proxy_h1.rs`).
+        //
+        //   - **H1**: a body with neither `content-length` nor `transfer-encoding` is framed as
+        //     *zero-length* by pingora's H1 client (RFC 9112 §6.3) — the injected body would be
+        //     silently dropped. So we must set `transfer-encoding: chunked`.
+        //   - **H2**: bodies are delimited by `END_STREAM`, and `transfer-encoding` is a forbidden
+        //     connection-specific header — the `h2` crate *rejects the whole request*
+        //     (`UserError::MalformedHeaders`) if it's present. So we must NOT set it; removing
+        //     `content-length` is sufficient and correct.
         if rc.inject_eligible {
             upstream_request.remove_header("content-length");
-            upstream_request.insert_header("transfer-encoding", "chunked")?;
+            if upstream_request.version != http::Version::HTTP_2 {
+                upstream_request.insert_header("transfer-encoding", "chunked")?;
+            }
         }
         Ok(())
     }
@@ -499,11 +612,28 @@ impl ProxyHttp for AiProxy {
         ctx: &mut Self::CTX,
     ) -> Result<()> {
         if let Some(rc) = ctx.as_mut() {
-            // Headers arrived ≈ time-to-first-byte.
+            // Headers arrived ≈ time-to-first-byte. Labeled by provider — first-token latency is
+            // per-provider, so an unlabeled histogram can't tell you which one regressed.
             self.state
                 .metrics
                 .ttft_seconds
+                .with_label_values(&[rc.provider.name.as_str()])
                 .observe(rc.start.elapsed().as_secs_f64());
+
+            // Per-provider responses by status class: shows a provider degrading (429/5xx) early.
+            let status_class = match upstream_response.status.as_u16() {
+                100..=199 => "1xx", // interim responses are not a provider failure
+                200..=299 => "2xx",
+                300..=399 => "3xx",
+                400..=499 => "4xx",
+                _ => "5xx",
+            };
+            self.state
+                .metrics
+                .upstream_responses_total
+                .with_label_values(&[rc.provider.name.as_str(), status_class])
+                .inc();
+
             // Derive streaming from the response, not the request: SSE ⇒ use the streaming usage
             // parser; otherwise the body is a single JSON object.
             rc.streaming = upstream_response
@@ -511,6 +641,17 @@ impl ProxyHttp for AiProxy {
                 .get("content-type")
                 .and_then(|v| v.to_str().ok())
                 .is_some_and(|ct| ct.contains("event-stream"));
+            // Track concurrent SSE streams. Incremented here (response head is in), decremented in
+            // `logging` once the stream completes — so the gauge reflects in-flight streams, not a
+            // counter that only ever climbs. Non-streaming responses don't touch it.
+            if rc.streaming {
+                self.state.metrics.active_streams.inc();
+            }
+
+            // Echo the request id so a client (or an oncall reading a captured response) can quote it
+            // and land on this request's log line. `insert_header` only fails on an invalid value;
+            // our id is `[0-9a-f-]`, always valid — but surface a failure rather than silently drop.
+            upstream_response.insert_header(REQUEST_ID_HEADER, &rc.request_id)?;
         }
         Ok(())
     }
@@ -560,6 +701,22 @@ impl ProxyHttp for AiProxy {
             // Retry transient connect failures a couple of times (Pingora re-invokes upstream_peer).
             if rc.attempt < MAX_CONNECT_RETRIES {
                 rc.attempt += 1;
+                // Surface the retry. Without this, a partially-down provider TCP layer (or an
+                // egress-IP ban — connect is where that first bites) shows up only as extra latency
+                // on `upstream_latency_seconds`, indistinguishable from a slow model. The counter is
+                // the dashboard signal; the `warn!` carries the request_id to grep.
+                self.state
+                    .metrics
+                    .connect_retries_total
+                    .with_label_values(&[rc.provider.name.as_str()])
+                    .inc();
+                warn!(
+                    request_id = rc.request_id,
+                    provider = rc.provider.name.as_str(),
+                    attempt = rc.attempt,
+                    error = %e,
+                    "upstream connect failed; retrying",
+                );
                 e.set_retry(true);
             }
         }
@@ -569,11 +726,31 @@ impl ProxyHttp for AiProxy {
     async fn logging(
         &self,
         _session: &mut Session,
-        _e: Option<&pingora_core::Error>,
+        e: Option<&pingora_core::Error>,
         ctx: &mut Self::CTX,
     ) {
         let Some(rc) = ctx.as_mut() else { return };
 
+        // Balance the in-flight gauge incremented at admission. `logging` runs exactly once per
+        // admitted request — including on upstream errors and client disconnects — so the gauge
+        // always returns to baseline and can't drift upward.
+        self.state.metrics.requests_in_flight.dec();
+
+        // An upstream error (DNS/connect timeout, read timeout, abort) lands here with `Some(e)` but
+        // no `ai.usage` row (no parseable body) — and the earlier `warn!` in `upstream_peer` only
+        // fires for DNS, not connect/read failures. Log it with the full identity so "why did tenant
+        // 42 get 502s for 5 minutes" is one grep on the request_id, not a reconstruction.
+        if let Some(e) = e {
+            warn!(
+                request_id = rc.request_id,
+                tenant_id = rc.tenant_id,
+                vpc_id = rc.vpc_id,
+                provider = rc.provider.name.as_str(),
+                error = %e,
+                "upstream request errored",
+            );
+        }
+
         // The buffer may transiently hold up to 2× the cap before compaction; the usage event is
         // always in the last cap bytes, so slice to that bounded tail before parsing.
         let tail_start = rc.resp_tail.len().saturating_sub(USAGE_TAIL_CAP);
@@ -595,8 +772,23 @@ impl ProxyHttp for AiProxy {
         m.tokens_total
             .with_label_values(&["output"])
             .inc_by(usage.output_tokens);
+        // Cache tokens, too — these are in the `ai.usage` billing log below, but that ships with lag;
+        // the counter is the alerting surface for a cache-hit-rate cliff after a deploy.
+        m.tokens_total
+            .with_label_values(&["cache_read"])
+            .inc_by(usage.cache_read_tokens);
+        m.tokens_total
+            .with_label_values(&["cache_write"])
+            .inc_by(usage.cache_write_tokens);
         m.upstream_latency_seconds
+            .with_label_values(&[rc.provider.name.as_str()])
             .observe(rc.start.elapsed().as_secs_f64());
+        // Balance the `active_streams` increment from `response_filter`. `logging` runs exactly once
+        // per request (including on upstream errors / client disconnects), so a stream that opened is
+        // always accounted closed here — the gauge can't leak upward.
+        if rc.streaming {
+            m.active_streams.dec();
+        }
 
         // Emit the usage *fact* on a dedicated target — **managed only**. The event is an
         // identity-keyed billing record (logfwd/OTLP ships `ai.usage` → ClickHouse → a closed
@@ -618,6 +810,7 @@ impl ProxyHttp for AiProxy {
                 .unwrap_or_else(|| rc.model.clone());
             info!(
                 target: "ai.usage",
+                request_id = rc.request_id,
                 tenant_id = rc.tenant_id,
                 vpc_id = rc.vpc_id,
                 provider = rc.provider.name.as_str(),
diff --git a/crates/gateway/src/ratelimit.rs b/crates/gateway/src/ratelimit.rs
index 064f880..19caa9d 100644
--- a/crates/gateway/src/ratelimit.rs
+++ b/crates/gateway/src/ratelimit.rs
@@ -148,6 +148,12 @@ impl RateLimit {
     /// BYO. Returns `None` when within budget, or `Some(reason)` once a ceiling is crossed — the very
     /// request that crosses the line is the first one rejected (`observe` returns the running total).
     /// The credential itself is never stored; only its seeded digest feeds the per-credential sketch.
+    ///
+    /// `#[must_use]`: `observe` has already incremented the counters by the time this returns, so a
+    /// caller that drops the result has *charged* the request but skipped enforcement — the limiter is
+    /// silently bypassed. The crate's `#![deny(unused_must_use)]` only bites with this attribute
+    /// present, so it's load-bearing, not decorative.
+    #[must_use = "the throttle decision must be enforced — dropping it charges the request but lets it through"]
     pub fn check(&self, raw_credential: &str, managed: bool) -> Option<Throttled> {
         // Global BYO backstop first: BYO is unverified and upstream-bound, so this is the ceiling that
         // protects our egress IPs from a distinct-token flood. Managed traffic skips it (verified,
diff --git a/crates/gateway/src/state.rs b/crates/gateway/src/state.rs
index 89a89f5..c2324c5 100644
--- a/crates/gateway/src/state.rs
+++ b/crates/gateway/src/state.rs
@@ -16,7 +16,9 @@ use arc_swap::ArcSwap;
 use std::collections::HashMap;
 use std::net::SocketAddr;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+use tracing::warn;
 
 /// How long a resolved upstream address is reused before re-resolving.
 const DNS_TTL: Duration = Duration::from_secs(60);
@@ -87,14 +89,50 @@ pub struct GatewayState {
     /// case — a cache hit, on every admitted request after warmup — is a lock-free atomic load; the
     /// only writes are the ~10 providers' entries refreshed once per `DNS_TTL`, applied via `rcu`.
     dns_cache: ArcSwap<HashMap<String, (SocketAddr, Instant)>>,
+
+    /// Per-process instance token (hex of 8 OS-random bytes), the high half of every `request_id`.
+    /// Random rather than a uuid dep, so log lines from two gateways don't collide when aggregated —
+    /// and random rather than the boot wall-clock, which collides when a rapid scale-up boots several
+    /// instances within the same nanosecond.
+    instance_id: String,
+    /// Monotonic per-request counter, the low half of `request_id`. A relaxed `fetch_add` — the only
+    /// requirement is uniqueness within the process, not cross-request ordering.
+    request_seq: AtomicU64,
 }
 
 impl GatewayState {
     pub fn new(config: AiConfig, metrics: Arc<Metrics>) -> Result<Arc<Self>> {
         let keyring = config.build_keyring()?;
+        // No signing keys ⇒ every `bai_…` fails verify and falls through to BYO treatment: no key
+        // swap, no deny-set, no `ai.usage` billing. That's a *valid* mode (a BYO-only deployment),
+        // but a far more common cause is a missing/typo'd `signing_keys` (SSM param, env) — which
+        // looks healthy while silently dropping all billing. Warn loudly so the boot logs flag it;
+        // don't `exit` (BYO-only is legitimate and tests rely on it).
+        if config.signing_keys.is_empty() {
+            warn!(
+                "no signing_keys configured — all managed (bai_) traffic will be treated as BYO \
+                 (no key swap, no deny-set, no billing). Expected only for a BYO-only deployment."
+            );
+        }
         let providers = build_providers(&config);
         let rate_limit = RateLimit::new(config.rate_limit_rps, config.byo_rate_limit_rps);
 
+        // 8 OS-random bytes as the instance token, so two gateways' request_ids never collide when
+        // aggregated — including when a rapid scale-up boots several instances within the same
+        // nanosecond (which a wall-clock token can't distinguish). If the OS RNG is somehow
+        // unavailable, fall back to the boot wall-clock rather than panicking — a degraded-uniqueness
+        // id beats failing to start.
+        let instance = {
+            let mut buf = [0u8; 8];
+            match getrandom::fill(&mut buf) {
+                Ok(()) => u64::from_le_bytes(buf),
+                Err(_) => SystemTime::now()
+                    .duration_since(UNIX_EPOCH)
+                    .map(|d| d.as_nanos() as u64)
+                    .unwrap_or(0),
+            }
+        };
+
         Ok(Arc::new(Self {
             metrics,
             keyring,
@@ -102,10 +140,21 @@ impl GatewayState {
             deny: ArcSwap::from_pointee(DenySet::new()),
             rate_limit,
             dns_cache: ArcSwap::from_pointee(HashMap::new()),
+            instance_id: format!("{instance:x}"),
+            request_seq: AtomicU64::new(0),
             config,
         }))
     }
 
+    /// A request id (`{instance}-{seq}`, both lowercase hex) for log correlation and the
+    /// `x-beyond-request-id` response header. Deliberately *not* a uuid: the instance token is
+    /// fixed at boot, so each id costs one relaxed `fetch_add` + one small `format!` and needs no
+    /// per-request randomness. `seq` is unique per process; the token tells processes apart.
+    pub fn next_request_id(&self) -> String {
+        let seq = self.request_seq.fetch_add(1, Ordering::Relaxed);
+        format!("{}-{seq:x}", self.instance_id)
+    }
+
     /// The resolved provider for `name` (`x-beyond-provider` value or dialect default), or `None`
     /// if no such provider is registered.
     pub fn provider(&self, name: &str) -> Option<&Arc<Provider>> {
@@ -147,13 +196,25 @@ mod tests {
     use crate::route::AuthScheme;
     use crate::secret::Secret;
 
+    /// Hands out the single process-wide `Metrics`. It registers on the default Prometheus
+    /// registry, which rejects a second registration, so all `GatewayState` tests share this one.
+    fn test_metrics() -> Arc<Metrics> {
+        use std::sync::OnceLock;
+        static SHARED: OnceLock<Arc<Metrics>> = OnceLock::new();
+        let metrics = SHARED.get_or_init(|| Metrics::new().expect("register metrics once"));
+        Arc::clone(metrics)
+    }
+
     #[test]
     fn registry_resolves_known_overrides_and_additions() {
         let config = AiConfig {
             // Override a known provider's authority + give it a pool key; add a config-only one.
+            // `custom2` is a config-only provider with **no** pool key — the condition that makes a
+            // managed request to it 503 (no managed auth value to swap in).
             provider_authorities: HashMap::from([
                 ("openai".to_string(), "127.0.0.1:9".to_string()),
                 ("custom".to_string(), "llm.internal:8443".to_string()),
+                ("custom2".to_string(), "other.internal:8443".to_string()),
             ]),
             pool_keys: HashMap::from([
                 ("openai".to_string(), Secret::new("sk-openai")),
@@ -185,5 +246,37 @@ mod tests {
             custom.pool_auth_value.as_ref().unwrap().expose(),
             "Bearer sk-custom"
         );
+
+        // Config-only provider with no pool key: registered (reachable by name) but with no managed
+        // auth value — this `None` is exactly what `request_filter` turns into a 503 for a managed
+        // request. (BYO to it still works; it just can't serve the pooled path.)
+        let custom2 = providers.get("custom2").unwrap();
+        assert!(
+            custom2.pool_auth_value.is_none(),
+            "a provider with no configured pool key must have no managed auth value (→ 503)"
+        );
+    }
+
+    #[tokio::test]
+    async fn resolve_caches_hit_and_errors_on_bad_host() {
+        // `resolve` is on the request hot path (every admitted request hits `upstream_peer`). Cover
+        // the three outcomes: a successful resolve, a cache hit returning the same address without a
+        // fresh lookup, and a lookup failure surfacing as `GatewayError::Dns` (not a panic/hang).
+        let config = AiConfig::default();
+        let state = GatewayState::new(config, test_metrics()).unwrap();
+
+        // An IP literal resolves through `lookup_host` without real DNS — deterministic, offline-safe.
+        let addr = state.resolve("127.0.0.1:9").await.unwrap();
+        assert_eq!(addr, "127.0.0.1:9".parse().unwrap());
+
+        // Second call is served from the TTL cache: same answer, and the entry is now present.
+        assert_eq!(state.resolve("127.0.0.1:9").await.unwrap(), addr);
+        assert!(state.dns_cache.load().contains_key("127.0.0.1:9"));
+
+        // A guaranteed-NXDOMAIN host (RFC 6761 reserves `.invalid`) → a Dns error, never a panic.
+        assert!(matches!(
+            state.resolve("nonexistent.invalid:80").await,
+            Err(GatewayError::Dns(_))
+        ));
     }
 }
diff --git a/crates/gateway/src/store_watch.rs b/crates/gateway/src/store_watch.rs
index 1d44742..8dd29e0 100644
--- a/crates/gateway/src/store_watch.rs
+++ b/crates/gateway/src/store_watch.rs
@@ -46,6 +46,11 @@ const BLACKHOLE_PREFIX: &str = "blackhole.";
 /// is low-churn, so this is rarely hit; it just bounds the log if a tenant flaps.
 const SNAPSHOT_COMPACT_THRESHOLD: u64 = 1024 * 1024;
 
+/// Reconnect backoff bounds: start at 1s, double to a 30s ceiling. Generous enough to stop log
+/// spam during a long NATS outage, tight enough that reconnecting lags NATS's return by ~30s at most.
+const RECONNECT_BACKOFF_BASE: Duration = Duration::from_secs(1);
+const RECONNECT_BACKOFF_MAX: Duration = Duration::from_secs(30);
+
 pub struct WatcherService {
     pub state: Arc<GatewayState>,
 }
@@ -68,6 +73,7 @@ impl BackgroundService for WatcherService {
                 Ok(Ok(Some(snap))) => {
                     let set = denyset_from_entries(snap.entries.values());
                     info!(count = set.len(), "seeded deny-set from on-disk snapshot");
+                    self.state.metrics.deny_set_size.set(set.len() as i64);
                     self.state.deny.store(Arc::new(set));
                     // A snapshot without a saved cursor can't safely resume (a bare watch would
                     // race), so only treat it as seeded when it carries a resume point; otherwise
@@ -93,6 +99,12 @@ impl BackgroundService for WatcherService {
             }
         }
 
+        // Reconnect backoff: 1s doubling to a 30s cap, reset on every successful connect. A fixed
+        // 2s retry hammered the log at a constant rate through a long outage (minutes to hours),
+        // burying other signals during the very incident an oncall is reading these logs for. The
+        // gateway serves correctly on the stale set throughout — this is purely about log volume
+        // and not pointlessly spinning on a down NATS.
+        let mut backoff = RECONNECT_BACKOFF_BASE;
         loop {
             // Connect, but bail immediately if Pingora signals shutdown mid-connect (e.g. NATS is
             // down and `connect` is retrying its own backoff) rather than blocking teardown.
@@ -104,16 +116,22 @@ impl BackgroundService for WatcherService {
                 outcome = connect(&self.state) => match outcome {
                     Ok(store) => store,
                     Err(e) => {
-                        error!(error = %e, "slipstream connect failed; retrying");
+                        self.state.metrics.nats_connected.set(0);
+                        error!(error = %e, backoff_secs = backoff.as_secs(), "slipstream connect failed; retrying");
                         // Reconnect backoff, also interruptible by shutdown.
                         tokio::select! {
                             _ = shutdown.changed() => return,
-                            _ = tokio::time::sleep(Duration::from_secs(2)) => continue,
+                            _ = tokio::time::sleep(backoff) => {
+                                backoff = (backoff * 2).min(RECONNECT_BACKOFF_MAX);
+                                continue;
+                            }
                         }
                     }
                 },
             };
 
+            backoff = RECONNECT_BACKOFF_BASE;
+            self.state.metrics.nats_connected.set(1);
             info!("slipstream connected; watching deny-set");
             // `watch_deny` returns `true` when it exited because shutdown was signaled — stop the
             // reconnect loop cleanly instead of trying to reconnect a shutting-down process.
@@ -130,10 +148,13 @@ impl BackgroundService for WatcherService {
                 info!("shutdown signaled; deny-set watcher exiting");
                 return;
             }
+            self.state.metrics.nats_connected.set(0);
             warn!("deny-set watch exited; reconnecting");
             tokio::select! {
                 _ = shutdown.changed() => return,
-                _ = tokio::time::sleep(Duration::from_secs(2)) => {}
+                _ = tokio::time::sleep(backoff) => {
+                    backoff = (backoff * 2).min(RECONNECT_BACKOFF_MAX);
+                }
             }
         }
     }
@@ -158,7 +179,17 @@ async fn rebuild_snapshot(
     let res = tokio::task::spawn_blocking(
         move || -> Result<SnapshotWriter, store::snapshot::SnapshotError> {
             // Remove the old log so we don't replay a deleted-but-uncompacted key on a later load.
-            let _ = std::fs::remove_file(&path);
+            // A failed removal is *not* ignorable: if `SnapshotWriter::open` then appends to the
+            // surviving file, a compacted-away `Delete` can't undo its stale `Put`, and a later
+            // `load()` resurrects a tenant we no longer deny — the exact corruption this rebuild
+            // exists to prevent. `NotFound` is the expected, benign case (first boot, or scratch
+            // storage); any other error aborts the rebuild so we run snapshot-less rather than on
+            // poisoned state.
+            match std::fs::remove_file(&path) {
+                Ok(()) => {}
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+                Err(e) => return Err(e.into()),
+            }
             let mut w = SnapshotWriter::open(&path, SNAPSHOT_COMPACT_THRESHOLD)?;
             for e in &entries {
                 w.write_update(&KvUpdate::Put(e.clone()))?;
@@ -183,6 +214,13 @@ async fn rebuild_snapshot(
 
 async fn connect(state: &GatewayState) -> crate::error::Result<Arc<dyn KvStore>> {
     let cfg = &state.config;
+    // `expose().to_string()` lifts the creds out of our `Secret` into the plain `String` the store's
+    // config requires. This doesn't widen the leak surface: `NatsConnectionConfig` has a hand-written
+    // redacting `Debug` (prints `creds: [redacted]`), so a stray `{:?}` on it — in a span, an error
+    // context, a reconnect log — can't print the credential. The plaintext copy is necessarily
+    // un-zeroized for the connection's life (we hand ownership to the store); same trade-off the pool
+    // keys make once they reach Pingora's headers (see `secret`). Redaction, not zeroization, is the
+    // control here.
     let conn = NatsConnection::new(NatsConnectionConfig {
         url: cfg.nats_url.clone(),
         creds: cfg.nats_creds.as_ref().map(|s| s.expose().to_string()),
@@ -227,6 +265,7 @@ async fn watch_deny(
                     revision = baseline_rev,
                     "seeded deny-set from scan"
                 );
+                state.metrics.deny_set_size.set(set.len() as i64);
                 state.deny.store(Arc::new(set));
                 *cursor = WatchCursor::from_u64(baseline_rev);
                 // Persist the freshly-scanned baseline so a later restart can skip the scan. We
@@ -299,6 +338,12 @@ async fn watch_deny(
             }
             Arc::new(set)
         });
+        // Reflect the new cardinality. A lock-free load of the set we just swapped in — cheap, and
+        // the deltas are low-churn, so this is far off any hot path.
+        state
+            .metrics
+            .deny_set_size
+            .set(state.deny.load().len() as i64);
         *cursor = WatchCursor::from_version(update.version().clone());
         persist_update(writer, &update, cursor).await;
     }
@@ -356,3 +401,56 @@ async fn persist_update(
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::deny::DenyReason;
+    use store::VersionToken;
+
+    fn entry(key: &str, value: &[u8]) -> KvEntry {
+        KvEntry {
+            key: key.to_string(),
+            value: value.to_vec(),
+            version: VersionToken::from_u64(1),
+        }
+    }
+
+    #[test]
+    fn denyset_from_entries_seeds_and_skips_malformed() {
+        // This is the seeding core: every boot turns raw KV entries into the live deny-set. A bug
+        // here (or a foreign key bleeding through the `filter_map`) means the deny-set is silently
+        // wrong at boot — denied tenants served, or unrelated keys denying real tenants.
+        let entries = [
+            entry("blackhole.42", b"spend"),
+            entry("blackhole.99", b"fraud"),
+            // Not a `blackhole.{tenant}` key — must be dropped, never inserted as tenant 0 or junk.
+            entry("signkey.1", b"spend"),
+            // `blackhole.` with a non-numeric tail — `parse_key` rejects it, so it's dropped too.
+            entry("blackhole.notanumber", b"spend"),
+            // Unrecognized reason value still denies (fail-safe) under `DenyReason::Unknown`.
+            entry("blackhole.7", b"mystery"),
+        ];
+
+        let set = denyset_from_entries(entries.iter());
+
+        assert_eq!(
+            set.len(),
+            3,
+            "only the three valid blackhole keys are seeded"
+        );
+        assert_eq!(set.reason(42), Some(DenyReason::Spend));
+        assert_eq!(set.reason(99), Some(DenyReason::Fraud));
+        assert_eq!(set.reason(7), Some(DenyReason::Unknown));
+        // The malformed keys produced no entries (and crucially no spurious tenant 0).
+        assert!(!set.is_denied(0));
+        assert!(!set.is_denied(1));
+    }
+
+    #[test]
+    fn denyset_from_entries_empty_is_allow_all() {
+        let set = denyset_from_entries([].iter());
+        assert!(set.is_empty());
+        assert!(!set.is_denied(42)); // default-allow on a cold/empty scan
+    }
+}
diff --git a/crates/gateway/tests/common/mod.rs b/crates/gateway/tests/common/mod.rs
index 45b86fb..2b59fff 100644
--- a/crates/gateway/tests/common/mod.rs
+++ b/crates/gateway/tests/common/mod.rs
@@ -17,11 +17,13 @@ use base64::Engine;
 use bytes::Bytes;
 use http_body_util::{BodyExt, Full};
 use hyper::service::service_fn;
-use hyper::{HeaderMap, Request, Response};
-use hyper_util::rt::TokioIo;
+use hyper::{Request, Response};
+use hyper_util::rt::{TokioExecutor, TokioIo};
+use hyper_util::server::conn::auto;
 use store::Connection;
 use tokio::net::TcpListener;
 use tokio::time::{sleep, timeout};
+use tokio_rustls::TlsAcceptor;
 
 /// Hand out a TCP port no other `free_port()` call in this test binary has returned.
 ///
@@ -207,6 +209,72 @@ fn large_sse() -> String {
     )
 }
 
+/// The canned `(content-type, body)` for a mode. `SseLarge` allocates; the rest are static.
+fn canned_body(mode: Mode) -> (&'static str, Bytes) {
+    match mode {
+        Mode::Json => (
+            "application/json",
+            Bytes::from_static(CANNED_JSON.as_bytes()),
+        ),
+        Mode::Sse => (
+            "text/event-stream",
+            Bytes::from_static(CANNED_SSE.as_bytes()),
+        ),
+        Mode::AnthropicJson => (
+            "application/json",
+            Bytes::from_static(CANNED_ANTHROPIC_JSON.as_bytes()),
+        ),
+        Mode::SseLarge => ("text/event-stream", Bytes::from(large_sse())),
+    }
+}
+
+/// The protocol the gateway used to *reach the mock* — derived from the version hyper parsed off the
+/// wire. Echoed back in `x-mock-proto`; since the gateway relays response headers untouched, the bench
+/// client reads this to prove which protocol the gateway→upstream hop negotiated (H2 vs H1).
+fn proto_label(version: hyper::Version) -> &'static str {
+    match version {
+        hyper::Version::HTTP_2 => "h2",
+        _ => "http/1.1",
+    }
+}
+
+/// Shared request handler for both the plaintext and TLS listeners: record what the gateway forwarded,
+/// then return the canned body tagged with the negotiated protocol.
+async fn mock_handle(
+    req: Request<hyper::body::Incoming>,
+    cap: Arc<Mutex<Option<Captured>>>,
+    mode: Mode,
+) -> Result<Response<Full<Bytes>>, std::convert::Infallible> {
+    let version = req.version();
+    let path = req.uri().path().to_string();
+    // Pull the headers we record before consuming the body (which moves `req`).
+    let (authorization, x_api_key, host) = {
+        let h = req.headers();
+        let get = |k: &str| h.get(k).and_then(|v| v.to_str().ok()).map(String::from);
+        (get("authorization"), get("x-api-key"), get("host"))
+    };
+    let body = req
+        .into_body()
+        .collect()
+        .await
+        .map(|b| b.to_bytes().to_vec())
+        .unwrap_or_default();
+    *cap.lock().unwrap() = Some(Captured {
+        path,
+        authorization,
+        x_api_key,
+        host,
+        body,
+    });
+    let (ct, payload) = canned_body(mode);
+    Ok(Response::builder()
+        .status(200)
+        .header("content-type", ct)
+        .header("x-mock-proto", proto_label(version))
+        .body(Full::new(payload))
+        .unwrap())
+}
+
 impl MockUpstream {
     pub async fn start(mode: Mode) -> Self {
         // Bind `:0` and read the port back, keeping the listener open the whole time — no
@@ -223,50 +291,7 @@ impl MockUpstream {
                 let io = TokioIo::new(stream);
                 let cap = cap.clone();
                 tokio::spawn(async move {
-                    let svc = service_fn(move |req: Request<hyper::body::Incoming>| {
-                        let cap = cap.clone();
-                        async move {
-                            let path = req.uri().path().to_string();
-                            let h: &HeaderMap = req.headers();
-                            let get =
-                                |k: &str| h.get(k).and_then(|v| v.to_str().ok()).map(String::from);
-                            let c = Captured {
-                                path,
-                                authorization: get("authorization"),
-                                x_api_key: get("x-api-key"),
-                                host: get("host"),
-                                body: req
-                                    .into_body()
-                                    .collect()
-                                    .await
-                                    .map(|b| b.to_bytes().to_vec())
-                                    .unwrap_or_default(),
-                            };
-                            *cap.lock().unwrap() = Some(c);
-                            let (ct, payload): (&str, Bytes) = match mode {
-                                Mode::Json => (
-                                    "application/json",
-                                    Bytes::from_static(CANNED_JSON.as_bytes()),
-                                ),
-                                Mode::Sse => (
-                                    "text/event-stream",
-                                    Bytes::from_static(CANNED_SSE.as_bytes()),
-                                ),
-                                Mode::AnthropicJson => (
-                                    "application/json",
-                                    Bytes::from_static(CANNED_ANTHROPIC_JSON.as_bytes()),
-                                ),
-                                Mode::SseLarge => ("text/event-stream", Bytes::from(large_sse())),
-                            };
-                            Ok::<_, std::convert::Infallible>(
-                                Response::builder()
-                                    .status(200)
-                                    .header("content-type", ct)
-                                    .body(Full::new(payload))
-                                    .unwrap(),
-                            )
-                        }
-                    });
+                    let svc = service_fn(move |req| mock_handle(req, cap.clone(), mode));
                     let _ = hyper::server::conn::http1::Builder::new()
                         .serve_connection(io, svc)
                         .await;
@@ -280,6 +305,63 @@ impl MockUpstream {
         }
     }
 
+    /// Like [`Self::start`], but terminates **TLS** and serves H1 *and* H2 on the one listener (protocol
+    /// chosen by ALPN, via hyper-util's auto builder). Presents a throwaway self-signed cert, so the
+    /// gateway must be pointed at it with `upstream_tls = true` and `upstream_verify_cert = false`.
+    /// This is what lets the concurrency bench drive the gateway's real TLS+ALPN+H2 path against a
+    /// local mock. Returns the mock; reach it at `authority()` (host `127.0.0.1`).
+    pub async fn start_tls(mode: Mode) -> Self {
+        // rustls 0.23 needs a process crypto provider; with both ring and aws-lc compiled in there's no
+        // default, so pick ring to match the gateway. A repeat install returns `Err`, ignored via `let _`.
+        let _ = rustls::crypto::ring::default_provider().install_default();
+
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let port = listener.local_addr().unwrap().port();
+
+        let ck = rcgen::generate_simple_self_signed(vec![
+            "127.0.0.1".to_string(),
+            "localhost".to_string(),
+        ])
+        .expect("self-signed cert");
+        let certs = vec![ck.cert.der().clone()];
+        let key = rustls::pki_types::PrivateKeyDer::Pkcs8(ck.key_pair.serialize_der().into());
+        let mut tls = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_single_cert(certs, key)
+            .expect("server tls config");
+        // Offer both, h2 first; the gateway's ALPN offer then decides: H2H1 → h2, H1-only → http/1.1.
+        tls.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
+        let acceptor = TlsAcceptor::from(Arc::new(tls));
+
+        let captured: Arc<Mutex<Option<Captured>>> = Arc::new(Mutex::new(None));
+        let cap = captured.clone();
+        let task = tokio::spawn(async move {
+            loop {
+                let Ok((stream, _)) = listener.accept().await else {
+                    break;
+                };
+                let acceptor = acceptor.clone();
+                let cap = cap.clone();
+                tokio::spawn(async move {
+                    let Ok(tls_stream) = acceptor.accept(stream).await else {
+                        return;
+                    };
+                    let io = TokioIo::new(tls_stream);
+                    let svc = service_fn(move |req| mock_handle(req, cap.clone(), mode));
+                    // Auto builder: serves H2 or H1 per the negotiated ALPN.
+                    let _ = auto::Builder::new(TokioExecutor::new())
+                        .serve_connection(io, svc)
+                        .await;
+                });
+            }
+        });
+        MockUpstream {
+            port,
+            captured,
+            task,
+        }
+    }
+
     pub fn authority(&self) -> String {
         format!("127.0.0.1:{}", self.port)
     }
@@ -326,6 +408,13 @@ pub struct GatewayBuilder {
     snapshot_path: Option<String>,
     real_upstreams: bool,
     pool_key_overrides: Vec<(String, String)>,
+    rate_limit_rps: Option<u32>,
+    byo_rate_limit_rps: Option<u32>,
+    /// Point at a TLS mock (`MockUpstream::start_tls`): `upstream_tls = true` + skip cert verification
+    /// (the mock is self-signed), while still routing via `provider_authorities`. For the H2 bench.
+    tls_upstream: bool,
+    /// Override the gateway's `upstream_http2` (H2H1 vs H1 ALPN). `None` ⇒ leave the gateway default.
+    upstream_http2: Option<bool>,
 }
 
 impl GatewayBuilder {
@@ -361,13 +450,42 @@ impl GatewayBuilder {
         self
     }
 
+    /// Override the per-credential request-rate ceiling (requests/sec). The harness default leaves
+    /// the gateway's own generous default (100) in place; set a small value to exercise the 429 path.
+    pub fn rate_limit_rps(mut self, rps: u32) -> Self {
+        self.rate_limit_rps = Some(rps);
+        self
+    }
+
+    /// Override the aggregate BYO request-rate ceiling (requests/sec). `0` disables that tier so a
+    /// per-credential 429 test isn't perturbed by the shared BYO bucket.
+    pub fn byo_rate_limit_rps(mut self, rps: u32) -> Self {
+        self.byo_rate_limit_rps = Some(rps);
+        self
+    }
+
+    /// Talk to the upstream over TLS without verifying its cert — for a `MockUpstream::start_tls`
+    /// target (self-signed). The gateway still routes via `provider_authorities` (the mock), but with
+    /// real TLS + ALPN, so the H2 path is exercised. Used by the concurrency bench.
+    pub fn tls_upstream(mut self) -> Self {
+        self.tls_upstream = true;
+        self
+    }
+
+    /// Force the gateway's upstream ALPN: `true` ⇒ H2H1 (prefer H2), `false` ⇒ H1 only. The bench
+    /// starts one gateway each way against the same TLS mock to compare them.
+    pub fn upstream_http2(mut self, on: bool) -> Self {
+        self.upstream_http2 = Some(on);
+        self
+    }
+
     pub async fn start(self) -> Gateway {
         let port = free_port();
         let metrics_port = free_port();
         let config_path = std::env::temp_dir().join(format!("beyond-ai-config-{port}.toml"));
         let nats_port = self.nats_port;
         // Scalars first, `[…]` tables last (TOML ordering).
-        let tls = self.real_upstreams;
+        let tls = self.real_upstreams || self.tls_upstream;
         let mut cfg = format!(
             "listen = \"127.0.0.1:{port}\"\n\
              metrics_listen = \"127.0.0.1:{metrics_port}\"\n\
@@ -375,9 +493,22 @@ impl GatewayBuilder {
              config_bucket = \"ai-gateway\"\n\
              upstream_tls = {tls}\n"
         );
+        // TLS mock is self-signed → don't verify its cert (production always verifies).
+        if self.tls_upstream {
+            cfg.push_str("upstream_verify_cert = false\n");
+        }
+        if let Some(h2) = self.upstream_http2 {
+            cfg.push_str(&format!("upstream_http2 = {h2}\n"));
+        }
         if let Some(path) = &self.snapshot_path {
             cfg.push_str(&format!("snapshot_path = \"{path}\"\n"));
         }
+        if let Some(rps) = self.rate_limit_rps {
+            cfg.push_str(&format!("rate_limit_rps = {rps}\n"));
+        }
+        if let Some(rps) = self.byo_rate_limit_rps {
+            cfg.push_str(&format!("byo_rate_limit_rps = {rps}\n"));
+        }
         if self.real_upstreams {
             // Real-host smoke mode: built-in provider defaults (no authority overrides). For a
             // *managed* smoke we still write the caller-supplied pool key(s) — the real provider key
@@ -451,6 +582,10 @@ impl Gateway {
             snapshot_path: None,
             real_upstreams: false,
             pool_key_overrides: Vec::new(),
+            rate_limit_rps: None,
+            byo_rate_limit_rps: None,
+            tls_upstream: false,
+            upstream_http2: None,
         }
     }
 
@@ -466,6 +601,16 @@ impl Gateway {
             .await
             .unwrap()
     }
+
+    /// GET a path on the admin/metrics listener, returning `(status, body)`. Used to probe
+    /// `/livez` and `/readyz` (which live on `metrics_port`, alongside `/metrics`).
+    pub async fn admin_get(&self, path: &str) -> (u16, String) {
+        let resp = reqwest::get(format!("http://127.0.0.1:{}{path}", self.metrics_port))
+            .await
+            .unwrap();
+        let status = resp.status().as_u16();
+        (status, resp.text().await.unwrap())
+    }
 }
 
 impl Drop for Gateway {
diff --git a/crates/gateway/tests/e2e.rs b/crates/gateway/tests/e2e.rs
index ae6f648..8f8e511 100644
--- a/crates/gateway/tests/e2e.rs
+++ b/crates/gateway/tests/e2e.rs
@@ -253,6 +253,19 @@ async fn streaming_relays_sse_and_meters_usage() {
     assert_eq!(resp.status(), 200);
     assert!(resp.text().await.unwrap().contains("[DONE]"));
     wait_for_metric(&gw, "ai_tokens_total", "input", 5.0).await;
+
+    // The client streamed without `stream_options`, so the managed OpenAI path must have buffered the
+    // body and spliced `stream_options.include_usage` in before forwarding — otherwise OpenAI emits no
+    // usage chunk and the request is unbillable. The metric above can't prove this (the mock returns a
+    // usage chunk unconditionally), so assert the *forwarded body* the mock actually received carries
+    // the injected fragment. This is the only coverage that the splice in `request_body_filter` ran.
+    let cap = mock.captured().expect("mock received a request");
+    let needle = br#""stream_options":{"include_usage":true}"#;
+    assert!(
+        cap.body.windows(needle.len()).any(|w| w == needle),
+        "managed OpenAI streaming body must have stream_options.include_usage injected; got: {}",
+        String::from_utf8_lossy(&cap.body)
+    );
 }
 
 #[tokio::test]
@@ -364,6 +377,56 @@ async fn oversized_content_length_is_rejected_413() {
     assert_eq!(raw_status(gw.port, &req).await, 413);
 }
 
+#[tokio::test]
+async fn per_credential_rate_limit_returns_429() {
+    // Every other rejection code is covered e2e (401/402/403/413/503) — 429 was the gap. A
+    // misconfigured ceiling (e.g. `rate_limit_rps` env typo'd to 0) would silently disable the
+    // guardrail, so prove the full enforcement path: a burst on one credential trips 429, charged on
+    // the raw key in `request_filter` before any verify/upstream connect. BYO (so no key material
+    // needed); the global BYO tier is disabled so this isolates the per-credential ceiling.
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(40);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .rate_limit_rps(5)
+        .byo_rate_limit_rps(0)
+        .start()
+        .await;
+    let client = reqwest::Client::new();
+
+    // Wait until the gateway serves, using a *different* credential so the flood token's budget is
+    // untouched by readiness probing.
+    {
+        let (c, u) = (client.clone(), gw.url());
+        wait_for_status(200, move || {
+            let (c, u) = (c.clone(), u.clone());
+            async move { post_status(&c, &u, "sk-byo-warmup", body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+
+    // Burst one credential well past its 5 rps ceiling within a single window. The first few are
+    // served (200); once the ceiling is crossed the rest are throttled (429).
+    let mut saw_200 = false;
+    let mut saw_429 = false;
+    for _ in 0..50 {
+        match post_status(&client, &gw.url(), "sk-byo-flood", body_for("gpt-4o")).await {
+            200 => saw_200 = true,
+            429 => saw_429 = true,
+            other => panic!("unexpected status under rate limit: {other}"),
+        }
+    }
+    assert!(
+        saw_200,
+        "the first requests under the ceiling must be served"
+    );
+    assert!(
+        saw_429,
+        "a burst past the per-credential ceiling must yield 429"
+    );
+    wait_for_metric(&gw, "ai_rejections_total", "rate_limit", 1.0).await;
+}
+
 #[tokio::test]
 async fn managed_key_via_x_api_key_header_is_accepted() {
     // Anthropic SDKs present the key in `x-api-key`, not `Authorization: Bearer`. A managed virtual
@@ -661,3 +724,40 @@ async fn on_disk_snapshot_enforces_across_restart_without_nats() {
 
     let _ = std::fs::remove_file(&snap);
 }
+
+#[tokio::test]
+async fn health_endpoints_report_ready_on_the_metrics_listener() {
+    // /livez and /readyz live on the metrics listener (alongside /metrics) and must both 200 with a
+    // `{status:"ok"}` body once the process is up. Readiness is intentionally *not* gated on NATS —
+    // the gateway is fail-open, so it can serve from config alone. We stop NATS before probing to
+    // prove readiness doesn't depend on it: a NATS-less gateway is still ready.
+    let mut nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(30);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    nats.stop();
+
+    let (live_status, live_body) = gw.admin_get("/livez").await;
+    assert_eq!(
+        live_status, 200,
+        "livez should be 200 once the process answers"
+    );
+    assert!(
+        live_body.contains("\"status\":\"ok\""),
+        "livez body: {live_body}"
+    );
+
+    let (ready_status, ready_body) = gw.admin_get("/readyz").await;
+    assert_eq!(
+        ready_status, 200,
+        "readyz should be 200 even with NATS down (fail-open): {ready_body}"
+    );
+    assert!(
+        ready_body.contains("\"status\":\"ok\""),
+        "readyz body: {ready_body}"
+    );
+
+    // An unknown admin path is a clean 404, not a hang or a 200.
+    let (nf_status, _) = gw.admin_get("/nope").await;
+    assert_eq!(nf_status, 404);
+}

From 6a3bc22e50d5418482cf6462018cc004d0f7ded5 Mon Sep 17 00:00:00 2001
From: Jared Lunde <jared.lunde@gmail.com>
Date: Sun, 31 May 2026 13:17:39 -0700
Subject: [PATCH 4/7] feat(ai): provider routing by path prefix; drop
 x-beyond-provider header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The provider is now the request's first path segment (`/{provider}/…`); the rest
of the path is forwarded to the upstream verbatim (native passthrough — the
gateway holds no per-provider mount knowledge). A bare path with no provider
prefix that starts with `/v1` is the drop-in default: dialect picks
openai/anthropic, so those two are a host-only swap. An unknown first segment is
a 404.

This makes the gateway a true drop-in for a provider's base URL with any stock
tool (Codex, Cursor, the OpenAI/Anthropic SDKs) — provider is a base-URL concern,
not a per-request header that tools can't set. Removes the `x-beyond-provider`
header, the per-provider `base_path` rewrite table, `CLIENT_PREFIX`, and
`Provider::upstream_path` — a net simplification. `dialect` becomes a provider
attribute (drives usage parsing + stream_options injection eligibility, now a
prefix-agnostic suffix check); `dialect_for_path` survives only for the bare-path
default.

Auth swap / BYO passthrough, deny-set, rate limits, usage metering, model
provenance, and stream_options injection are unchanged.

Swept end to end per the plan: route/proxy/state/config + all inline comments;
e2e (reworked Fireworks → prefix-strip assertion; added `/openai`-prefix ==
bare-default and unknown→404 tests); all smoke tests (per-provider native paths,
Anthropic via `/anthropic/v1/messages`); ARCHITECTURE, README, config.example.

Verified: 68 lib + 18 e2e + 10 smoke pass, clippy clean across all targets, no
remaining x-beyond-provider/base_path/upstream_path/CLIENT_PREFIX references, and
the live Anthropic prefix route returns 200 (`mise run test:smoke`).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml           |  28 ++++
 ARCHITECTURE.md                    |  94 ++++++-------
 Cargo.lock                         |   1 +
 Cargo.toml                         |  14 ++
 README.md                          |  18 ++-
 config.example.toml                |   7 +-
 crates/gateway/Cargo.toml          |   1 +
 crates/gateway/benches/e2e.rs      |   3 +
 crates/gateway/benches/unit.rs     |   3 +
 crates/gateway/src/admin.rs        |   2 +
 crates/gateway/src/config.rs       |  14 +-
 crates/gateway/src/key.rs          |   1 +
 crates/gateway/src/lib.rs          |   5 +
 crates/gateway/src/main.rs         |   6 +
 crates/gateway/src/metrics.rs      |  81 ++++++++++-
 crates/gateway/src/proxy.rs        | 210 +++++++++++++++--------------
 crates/gateway/src/route.rs        | 178 ++++++++++++------------
 crates/gateway/src/state.rs        |  72 +++++++---
 crates/gateway/src/usage.rs        | 143 ++++++++++++++------
 crates/gateway/tests/common/mod.rs |   2 +
 crates/gateway/tests/e2e.rs        | 113 ++++++++++++++--
 crates/gateway/tests/smoke.rs      | 107 +++++++++++----
 22 files changed, 746 insertions(+), 357 deletions(-)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..0e23c17
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,28 @@
+name: CI
+on:
+  pull_request:
+    branches: [main]
+env:
+  CARGO_TERM_COLOR: always
+  # Belt-and-suspenders: the panic-surface + `unused_must_use` denies live in `[workspace.lints]`
+  # (Cargo.toml) so they bind locally too, but escalate *every* warning to an error in CI in case a
+  # lint isn't expressible there (build scripts, future targets).
+  RUSTFLAGS: -D warnings
+jobs:
+  check:
+    name: Check
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - uses: jdx/mise-action@v2
+      - uses: Swatinem/rust-cache@v2
+      # Formatting: dprint (config/json/etc) + rustfmt.
+      - run: mise check:fmt
+      - run: cargo fmt --all --check
+      # Lints: clippy `-D warnings` across all targets. With `[workspace.lints.clippy]` denying the
+      # panic surface (unwrap/expect/panic/todo/unimplemented), a new `.unwrap()` in production code
+      # fails the build here.
+      - run: mise check:rs
+      - run: mise test:unit:rs
+      - run: mise test:integration:rs
+      - run: mise build:rs:release
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 692f11c..564560b 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -9,18 +9,18 @@ response untouched, and emits token-usage facts for billing.
 
 ## Concepts & Terminology
 
-| Term                                             | What It Controls / Gates                                                               | NOT                                                                          |
-| ------------------------------------------------ | -------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
-| **Managed key** (`bai_v1.…`)                     | Ed25519-verified identity; enables key swap, deny-set check, and `ai.usage` billing    | A session token or capability grant — just tenant attribution                |
-| **BYO key** (anything else)                      | Forwarded as-is to the provider; no swap, no billing, no deny-set                      | A lesser tier — same proxy, minus attribution and billing                    |
-| **Pool key**                                     | Real provider API key held by the gateway; swapped in for managed traffic              | Per-tenant — one key per provider, shared by all managed callers             |
-| **Tenant**                                       | The billing entity from the virtual key payload (`tenant_id: u32`)                     | An org, user, or namespace — an opaque integer the gateway doesn't interpret |
-| **Dialect**                                      | Wire protocol implied by the request path (OpenAI `/v1/…` vs Anthropic `/v1/messages`) | The provider — dialect determines auth scheme and usage parsing format       |
-| **Provider**                                     | Named row in the routing table: authority, base path, auth scheme                      | A vendor relationship — just connection facts and auth wiring                |
-| **Deny-set**                                     | Sparse set of denied `tenant_id`s; gates managed traffic; default-allow                | An allowlist or ACL — misses are allowed, not blocked                        |
-| **Tail tap**                                     | Bounded 64KB window kept from the end of the response for usage extraction             | A buffer or copy — the response is relayed unbuffered; only the tail is kept |
-| **Snapshot**                                     | On-disk deny-set cache (entries + NATS cursor) for edge/tunnel deployments             | Persistent store — a pure cache; delete it and the gateway re-scans NATS     |
-| **Virtual key** (`bai_v1.{kid}.{payload}.{sig}`) | Ed25519-signed token encoding `tenant_id` + `vpc_id`                                   | A session or auth token — stateless, no server-side lookup, no revocation    |
+| Term                                             | What It Controls / Gates                                                                                                                                    | NOT                                                                          |
+| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
+| **Managed key** (`bai_v1.…`)                     | Ed25519-verified identity; enables key swap, deny-set check, and `ai.usage` billing                                                                         | A session token or capability grant — just tenant attribution                |
+| **BYO key** (anything else)                      | Forwarded as-is to the provider; no swap, no billing, no deny-set                                                                                           | A lesser tier — same proxy, minus attribution and billing                    |
+| **Pool key**                                     | Real provider API key held by the gateway; swapped in for managed traffic                                                                                   | Per-tenant — one key per provider, shared by all managed callers             |
+| **Tenant**                                       | The billing entity from the virtual key payload (`tenant_id: u32`)                                                                                          | An org, user, or namespace — an opaque integer the gateway doesn't interpret |
+| **Dialect**                                      | A provider attribute (OpenAI-wire vs Anthropic-wire) driving usage parsing; for a bare-path request it's derived from the path to pick the default provider | The provider — a prefixed request uses its provider's dialect, not the path  |
+| **Provider**                                     | The request's **first path segment** (`/{provider}/…`); a named row in the routing table: authority, dialect, auth scheme                                   | A vendor relationship — just connection facts and auth wiring                |
+| **Deny-set**                                     | Sparse set of denied `tenant_id`s; gates managed traffic; default-allow                                                                                     | An allowlist or ACL — misses are allowed, not blocked                        |
+| **Tail tap**                                     | Bounded 64KB window kept from the end of the response for usage extraction                                                                                  | A buffer or copy — the response is relayed unbuffered; only the tail is kept |
+| **Snapshot**                                     | On-disk deny-set cache (entries + NATS cursor) for edge/tunnel deployments                                                                                  | Persistent store — a pure cache; delete it and the gateway re-scans NATS     |
+| **Virtual key** (`bai_v1.{kid}.{payload}.{sig}`) | Ed25519-signed token encoding `tenant_id` + `vpc_id`                                                                                                        | A session or auth token — stateless, no server-side lookup, no revocation    |
 
 ---
 
@@ -30,7 +30,8 @@ response untouched, and emits token-usage facts for billing.
 client (stock SDK, Bearer/ x-api-key)
    │
    ▼ request_filter
-   ├─ provider = dialect(path) [+ x-beyond-provider override]   (unknown → 400)
+   ├─ provider = first path segment `/{provider}/…` (strip + forward rest verbatim);
+   │             bare `/v1…` → dialect default (openai/anthropic);  unknown → 404
    ├─ extract key                                               (missing → 401)
    ├─ rate guardrails ← BEFORE verify/connect: per-credential (seeded raw-key hash) +
    │                    global BYO aggregate (managed exempt; protects egress IPs); over → 429
@@ -120,13 +121,14 @@ client (stock SDK, Bearer/ x-api-key)
 
   Both tiers are generous circuit breakers, not quotas; `rate_limit_rps = 0` / `byo_rate_limit_rps = 0`
   disable them independently.
-- **Routing is dialect-based** (model isn't known before peer selection); any non-default provider
-  is reached via the `x-beyond-provider: <name>` header. **Providers are data** — a row in
-  `route::KNOWN_PROVIDERS` (name, authority, **base path**, auth scheme) or a config entry — so
-  adding an OpenAI-wire provider is one line, no new code paths. Each row's connection facts are
-  **verified against the provider's official docs (cited inline in `route.rs`)**; the client's `/v1`
-  prefix is rewritten to the provider's mount point (Groq `/openai/v1`, Fireworks `/inference/v1`,
-  OpenRouter `/api/v1`) so a verbatim passthrough can't 404.
+- **Routing is by the first path segment = provider** (model isn't known before peer selection).
+  `/{provider}/…` selects the provider and the rest of the path is forwarded **verbatim** — the
+  gateway holds no per-provider mount knowledge, so the client uses the provider's own native path
+  (`/groq/openai/v1/…`, `/fireworks/inference/v1/…`). A bare path with no provider prefix that starts
+  with `/v1` is the **drop-in default** (dialect → openai/anthropic); an unknown segment is a **404**.
+  **Providers are data** — a row in `route::KNOWN_PROVIDERS` (name, authority, dialect, auth scheme)
+  or a config entry — so adding an OpenAI-wire provider is one line, no new code paths. Each row's
+  authority/auth is **verified against the provider's official docs (cited inline in `route.rs`)**.
 - **Connect retries only** (`fail_to_connect`); no HTTP-status retry (Pingora-idiomatic, SDKs back off).
 - **`ai.usage` carries _both_ models: `model` (resolved) + `requested_model` (alias).** `model` is
   the id the provider resolved + billed, taken from the _response_ (a second `ModelScanner` over the
@@ -173,36 +175,38 @@ All fields configurable via `config.example.toml` and environment (`AI_` prefix,
 Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret` — stray `Debug`/`Serialize`
 output redacts values.
 
-| Field                         | Default                           | Runtime Effect                                                                                                                                                   |
-| ----------------------------- | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `signing_keys`                | _(required)_                      | Map of kid → base64 Ed25519 public key. Multiple kids enable rotation. Missing → all traffic falls through to BYO treatment.                                     |
-| `pool_keys.<name>`            | _(from `AI_POOL_KEY_<NAME>` env)_ | Real provider API key. Missing for a provider → managed requests to that provider return 503.                                                                    |
-| `provider_authorities.<name>` | _(none)_                          | Override or add a provider's `authority` (host:port). Enables config-added providers beyond `KNOWN_PROVIDERS` with zero code change.                             |
-| `snapshot_path`               | _(unset)_                         | Path for the on-disk deny-set cache. Unset → re-scan NATS on every cold boot. Set → load from disk and enforce before NATS reconnects (edge/tunnel deployments). |
-| `rate_limit_rps`              | `100`                             | Per-credential request ceiling (count-min, keyed on raw key hash). `0` disables. Exceeded → 429. Checked before Ed25519 verify.                                  |
-| `byo_rate_limit_rps`          | `1000`                            | Aggregate ceiling for all BYO traffic (single shared bucket). `0` disables. Managed traffic exempt.                                                              |
-| `connect_timeout_secs`        | `10`                              | TCP connect timeout to the upstream provider. Exceeded → retry up to 2×, then 502.                                                                               |
-| `read_timeout_secs`           | `600`                             | Response read timeout. 10 minutes accommodates long-running LLM streams.                                                                                         |
-| `nats_url`                    | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (stale or empty set).                                                                              |
-| `nats_creds`                  | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                 |
-| `listen_addr`                 | `0.0.0.0:8080`                    | Proxy listener address.                                                                                                                                          |
-| `metrics_listen`              | `0.0.0.0:9090`                    | Internal admin/observability listener: `/metrics` (Prometheus scrape), `/livez`, `/readyz`. Separate from the client listener — not externally reachable.        |
+| Field                         | Default                           | Runtime Effect                                                                                                                                                                                                                       |
+| ----------------------------- | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `signing_keys`                | _(empty)_                         | Map of kid → base64 Ed25519 public key. Multiple kids enable rotation. Missing → all traffic falls through to BYO treatment.                                                                                                         |
+| `require_signing_keys`        | `false`                           | When `true`, an empty `signing_keys` is a hard boot failure instead of silent BYO-only mode. Set on managed deployments so a typo'd/absent SSM param fails fast rather than serving for free (no key swap, no deny-set, no billing). |
+| `pool_keys.<name>`            | _(from `AI_POOL_KEY_<NAME>` env)_ | Real provider API key. Missing for a provider → managed requests to that provider return 503.                                                                                                                                        |
+| `provider_authorities.<name>` | _(none)_                          | Override or add a provider's `authority` (host:port). Enables config-added providers beyond `KNOWN_PROVIDERS` with zero code change.                                                                                                 |
+| `snapshot_path`               | _(unset)_                         | Path for the on-disk deny-set cache. Unset → re-scan NATS on every cold boot. Set → load from disk and enforce before NATS reconnects (edge/tunnel deployments).                                                                     |
+| `rate_limit_rps`              | `100`                             | Per-credential request ceiling (count-min, keyed on raw key hash). `0` disables. Exceeded → 429. Checked before Ed25519 verify.                                                                                                      |
+| `byo_rate_limit_rps`          | `1000`                            | Aggregate ceiling for all BYO traffic (single shared bucket). `0` disables. Managed traffic exempt.                                                                                                                                  |
+| `connect_timeout_secs`        | `10`                              | TCP connect timeout to the upstream provider. Exceeded → retry up to 2×, then 502.                                                                                                                                                   |
+| `read_timeout_secs`           | `600`                             | Response read timeout. 10 minutes accommodates long-running LLM streams.                                                                                                                                                             |
+| `nats_url`                    | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (stale or empty set).                                                                                                                                                  |
+| `nats_creds`                  | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                                                                                     |
+| `listen_addr`                 | `0.0.0.0:8080`                    | Proxy listener address.                                                                                                                                                                                                              |
+| `metrics_listen`              | `0.0.0.0:9090`                    | Internal admin/observability listener: `/metrics` (Prometheus scrape), `/livez`, `/readyz`. Separate from the client listener — not externally reachable.                                                                            |
 
 ---
 
 ## Failure Modes
 
-| Failure                                     | What Actually Happens                                                                                       | Recovery                                                                                                        |
-| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- |
-| NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                     | Watcher reconnects; seeds from NATS or disk snapshot on connect.                                                |
-| NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                             | Watcher reconnects (1s→30s exponential backoff, reset on success) and resumes from saved revision — no re-scan. |
-| NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                     | After re-scan, new cursor set; delta watch resumes normally.                                                    |
-| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event.                                    | Billing miss detectable downstream; no security boundary breach.                                                |
-| Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                 | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                                      |
-| Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                              | TTL-cached DNS (60s) serves stale; poisoned-lock guard re-resolves on next request.                             |
-| Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502.                                                       | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                                    |
-| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail. | No action — O(1) tail tap is designed for this; SSE usage is always in the final data line.                     |
-| Gateway crash mid-request                   | In-flight request drops; client receives TCP close, not a structured error. No partial state written.       | Client SDK retries. No DB writes in the request path — no cleanup needed.                                       |
+| Failure                                     | What Actually Happens                                                                                                          | Recovery                                                                                                        |
+| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------- |
+| NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                                        | Watcher reconnects; seeds from NATS or disk snapshot on connect.                                                |
+| NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                                                | Watcher reconnects (1s→30s exponential backoff, reset on success) and resumes from saved revision — no re-scan. |
+| NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                                        | After re-scan, new cursor set; delta watch resumes normally.                                                    |
+| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event.                                                       | Billing miss detectable downstream; no security boundary breach.                                                |
+| `signing_keys` absent (typo'd/missing SSM)  | Default: warn + BYO-only (silently drops all managed billing + deny-set). With `require_signing_keys=true`: hard boot failure. | Set `require_signing_keys=true` on managed deployments so the mis-deploy fails fast and visibly at boot.        |
+| Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                                    | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                                      |
+| Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                                                 | TTL-cached DNS (60s) serves stale; poisoned-lock guard re-resolves on next request.                             |
+| Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502.                                                                          | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                                    |
+| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail.                    | No action — O(1) tail tap is designed for this; SSE usage is always in the final data line.                     |
+| Gateway crash mid-request                   | In-flight request drops; client receives TCP close, not a structured error. No partial state written.                          | Client SDK retries. No DB writes in the request path — no cleanup needed.                                       |
 
 ---
 
diff --git a/Cargo.lock b/Cargo.lock
index ec8bd74..1a9bf79 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -305,6 +305,7 @@ name = "beyond-ai"
 version = "0.1.0"
 dependencies = [
  "arc-swap",
+ "arrayvec",
  "async-trait",
  "base64",
  "beyond-slipstream",
diff --git a/Cargo.toml b/Cargo.toml
index 8f994ce..bee51ed 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,19 @@ rust-version = "1.85"
 unsafe_code = "forbid"
 unused_must_use = "deny"
 
+# Panic surface: a stray `.unwrap()`/`.expect()`/`panic!`/`todo!` in a request path is a worker
+# crash, not an error response. Deny them so a new one is a hard CI failure (mise `check:rs` runs
+# clippy with `-D warnings`). These are clippy *restriction* lints (allow-by-default); naming them
+# here turns them on. The handful of genuine cannot-fail invariants carry a local
+# `#[allow(clippy::expect_used)]` with a SAFETY-style note; test/bench targets allow them wholesale
+# at the file head (asserting a precondition with `.unwrap()` is the point of a test).
+[workspace.lints.clippy]
+unwrap_used = "deny"
+expect_used = "deny"
+panic = "deny"
+todo = "deny"
+unimplemented = "deny"
+
 # Release builds wrap arithmetic silently by default; turn that into a panic so an overflow on a
 # size/count never goes unnoticed. Negligible cost for a proxy (arithmetic isn't the bottleneck).
 [profile.release]
@@ -31,6 +44,7 @@ pingora-limits = "0.8"
 pingora-proxy = "0.8"
 
 arc-swap = "1"
+arrayvec = "0.7"
 async-nats = "0.46"
 async-trait = "0.1"
 base64 = "0.22"
diff --git a/README.md b/README.md
index 38a0931..e852c19 100644
--- a/README.md
+++ b/README.md
@@ -35,16 +35,22 @@ client = OpenAI(base_url="http://ai.internal/v1", api_key="sk-your-openai-key")
 
 ## Providers
 
-Select a non-default provider with `x-beyond-provider: <name>`:
+The provider is the **first path segment** of the base URL — no header, nothing tool-specific. Bare
+`/v1` defaults to OpenAI (and `/v1/messages` to Anthropic), so the two big providers are a host-only
+swap; everything else is `/{provider}/…` using that provider's own path (forwarded verbatim).
 
 ```python
-client = OpenAI(
-    base_url="http://ai.internal/v1",
-    api_key="bai_v1...",
-    default_headers={"x-beyond-provider": "groq"},
-)
+# OpenAI (default) — change only the host
+client = OpenAI(base_url="http://ai.internal/v1", api_key="bai_v1...")
+
+# Groq — its native base path is /openai/v1, so the gateway path is /groq/openai/v1
+client = OpenAI(base_url="http://ai.internal/groq/openai/v1", api_key="bai_v1...")
+
+# Fireworks mounts at /inference/v1 → /fireworks/inference/v1; OpenRouter at /api/v1 → /openrouter/api/v1
 ```
 
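+An unknown first segment is a 404. `route::KNOWN_PROVIDERS` lists the built-in provider names; each provider's native base path comes from that provider's own API docs (the gateway forwards it verbatim and stores no mount paths).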
+
 ## Config
 
 All config keys are overridable by `AI_`-prefixed env vars (`AI_NATS_URL`, `AI_POOL_KEY_OPENAI`, …). See `config.example.toml` for the full reference.
diff --git a/config.example.toml b/config.example.toml
index d31943a..566534a 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -43,9 +43,10 @@ rate_limit_rps = 100
 byo_rate_limit_rps = 1000
 
 # Optional per-provider upstream authority (host:port), BY PROVIDER NAME. For a known provider this
-# overrides its built-in default; for an unknown name it ADDS a new OpenAI-wire provider, reachable
-# via `x-beyond-provider: <name>`. Known providers (zero-config defaults): openai, anthropic,
-# openrouter, fireworks, groq, deepseek, together, cerebras, mistral, xai.
+# overrides its built-in default; for an unknown name it ADDS a new OpenAI-wire provider, then
+# reachable at `/{name}/…` (the provider is the first path segment of the request). Known providers
+# (zero-config defaults): openai, anthropic, openrouter, fireworks, groq, deepseek, together,
+# cerebras, mistral, xai.
 # [provider_authorities]
 # openai = "api.openai.com:443"
 # my-self-hosted = "llm.internal:8443"
diff --git a/crates/gateway/Cargo.toml b/crates/gateway/Cargo.toml
index f04fd8e..f52d86f 100644
--- a/crates/gateway/Cargo.toml
+++ b/crates/gateway/Cargo.toml
@@ -26,6 +26,7 @@ pingora-limits = { workspace = true }
 pingora-proxy = { workspace = true }
 
 arc-swap = { workspace = true }
+arrayvec = { workspace = true }
 async-trait = { workspace = true }
 base64 = { workspace = true }
 bytes = { workspace = true }
diff --git a/crates/gateway/benches/e2e.rs b/crates/gateway/benches/e2e.rs
index 5338c4a..4e39b43 100644
--- a/crates/gateway/benches/e2e.rs
+++ b/crates/gateway/benches/e2e.rs
@@ -1,3 +1,6 @@
+// Bench target: `.unwrap()`/`.expect()` set up the harness; not production code. See tests/e2e.rs.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
 //! A-1 end-to-end bench: the real `beyond-ai` binary + real `nats-server` + a mock upstream,
 //! driven over real HTTP. Run with `mise run bench:e2e` (needs `nats-server` on PATH — mise
 //! provides it). This is the macro counterpart to `unit.rs`: it measures the *whole* request path
diff --git a/crates/gateway/benches/unit.rs b/crates/gateway/benches/unit.rs
index c733529..e35a195 100644
--- a/crates/gateway/benches/unit.rs
+++ b/crates/gateway/benches/unit.rs
@@ -1,3 +1,6 @@
+// Bench target: `.unwrap()`/`.expect()` set up fixtures; not production code. See tests/e2e.rs.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
 //! Unit bench: the pure, IO-free hot paths. Timing **and** allocations come from `divan` — its
 //! `AllocProfiler` (installed as the global allocator below) reports alloc count + bytes per
 //! sample right beside ns/iter, so the design's allocation claims are visible in one table.
diff --git a/crates/gateway/src/admin.rs b/crates/gateway/src/admin.rs
index ecdf542..3e924d4 100644
--- a/crates/gateway/src/admin.rs
+++ b/crates/gateway/src/admin.rs
@@ -29,6 +29,7 @@ impl AdminApp {
     /// Build a `{"status","version"}` JSON health response. `status` is `"ok"`/`"degraded"` so a
     /// human or a probe can read intent without parsing the code. Header values are all static or
     /// integer, so the builder can't fail — `expect` documents that invariant.
+    #[allow(clippy::expect_used)] // builder inputs are all static/integer; cannot fail
     fn health(status: u16, health: &str) -> Response<Vec<u8>> {
         let body = serde_json::json!({ "status": health, "version": VERSION })
             .to_string()
@@ -42,6 +43,7 @@ impl AdminApp {
     }
 
     /// Encode the default Prometheus registry as text (same output as Pingora's built-in service).
+    #[allow(clippy::expect_used)] // builder inputs are encoder-derived/integer; cannot fail
     fn metrics() -> Response<Vec<u8>> {
         let encoder = TextEncoder::new();
         let mut buffer = Vec::new();
diff --git a/crates/gateway/src/config.rs b/crates/gateway/src/config.rs
index 2a40bae..b41c5a3 100644
--- a/crates/gateway/src/config.rs
+++ b/crates/gateway/src/config.rs
@@ -47,6 +47,14 @@ pub struct AiConfig {
     /// → base64 public key. Multiple allowed for zero-downtime rotation. Config, not NATS.
     pub signing_keys: HashMap<String, String>,
 
+    /// Fail the boot if `signing_keys` is empty, instead of degrading to BYO-only. Empty signing
+    /// keys is a *legitimate* mode (a BYO-only deployment) but is far more often a mis-deploy — a
+    /// typo'd/absent SSM param — that looks healthy while silently dropping **all** managed billing
+    /// and deny-set enforcement. A managed deployment should set this `true` so a bad deploy fails
+    /// fast and visibly at boot rather than serving for free. Default `false` to keep BYO-only and
+    /// the test/e2e harnesses (which run keyless) working out of the box.
+    pub require_signing_keys: bool,
+
     /// Managed Beyond pool keys, **by provider name** (`openai`, `anthropic`, `fireworks`, …).
     /// From the `[pool_keys]` TOML table or SSM-injected `AI_POOL_KEY_<NAME>` env (the env form is
     /// the production path — see `load_with_path`). A provider with no pool key here can't serve
@@ -56,8 +64,9 @@ pub struct AiConfig {
 
     /// Per-provider upstream authority (`host:port`), **by provider name**. For a known provider
     /// (see `route::KNOWN_PROVIDERS`) this *overrides* its default; for an unknown name it *adds* a
-    /// new OpenAI-wire provider reachable via `x-beyond-provider`. Empty = every known provider uses
-    /// its built-in default. (The e2e harness points providers at a mock here.)
+    /// new OpenAI-wire provider, then reachable at `/{name}/…` (the provider is the request's first
+    /// path segment). Empty = every known provider uses its built-in default. (The e2e harness points
+    /// providers at a mock here.)
     pub provider_authorities: HashMap<String, String>,
 
     /// Upstream timeouts (seconds). Streaming responses are long, so read/idle are generous.
@@ -118,6 +127,7 @@ impl Default for AiConfig {
             config_bucket: "ai-gateway".to_string(),
             snapshot_path: None,
             signing_keys: HashMap::new(),
+            require_signing_keys: false,
             pool_keys: HashMap::new(),
             provider_authorities: HashMap::new(),
             connect_timeout_secs: 10,
diff --git a/crates/gateway/src/key.rs b/crates/gateway/src/key.rs
index c1ce33f..4cdce56 100644
--- a/crates/gateway/src/key.rs
+++ b/crates/gateway/src/key.rs
@@ -209,6 +209,7 @@ pub fn verifying_key_from_value(bytes: &[u8]) -> Option<VerifyingKey> {
 /// Mint a virtual key. Lives here for tests + determinism checks and as the reference
 /// implementation; production minting is the Go control plane (`crypto/ed25519`), which must
 /// produce byte-identical output for the same inputs.
+#[allow(clippy::expect_used)] // payload is a fixed 22-char base64 of 16 bytes; always fits the cap
 pub fn mint(vk: &VirtualKey, kid: Kid, signing_key: &SigningKey) -> String {
     let payload_b64 = URL_SAFE_NO_PAD.encode(vk.encode_payload());
     let mut signed_buf = [0u8; SIGNED_BYTES_CAP];
diff --git a/crates/gateway/src/lib.rs b/crates/gateway/src/lib.rs
index 7f4ed49..be11bd6 100644
--- a/crates/gateway/src/lib.rs
+++ b/crates/gateway/src/lib.rs
@@ -8,6 +8,11 @@
 // they apply to *both* crate roots — this lib and the `main.rs` binary — not just whichever unit
 // carries a crate-level `#![deny]`. A dropped `Result` (e.g. an unchecked `write_response_*`) is
 // therefore a hard error, and `unsafe` is forbidden, everywhere in the crate.
+//
+// `unwrap_used`/`expect_used`/`panic` are denied in production code (see `[workspace.lints.clippy]`)
+// but a unit test's whole job is to assert a precondition holds — `.unwrap()` *is* the assertion — so
+// allow them in `#[cfg(test)]` modules.
+#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used, clippy::panic))]
 
 pub mod admin;
 pub mod config;
diff --git a/crates/gateway/src/main.rs b/crates/gateway/src/main.rs
index 43de521..046cc0d 100644
--- a/crates/gateway/src/main.rs
+++ b/crates/gateway/src/main.rs
@@ -1,5 +1,8 @@
 //! Beyond AI gateway binary: clap `Run`/`Doctor`, Pingora server bootstrap, services.
 
+// See `lib.rs`: deny the panic surface in production, allow it in `#[cfg(test)]` assertions.
+#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used, clippy::panic))]
+
 use beyond_ai::admin::AdminApp;
 use beyond_ai::config::AiConfig;
 use beyond_ai::doctor;
@@ -61,6 +64,9 @@ fn init_tracing() {
         .init();
 }
 
+// Boot path: every `.expect()` here is a fatal start-up invariant (no runtime to build, no Pingora
+// server) — a panic before we serve a single request is the correct, visible failure.
+#[allow(clippy::expect_used)]
 fn main() {
     // rustls 0.23 requires a process-wide crypto provider for the TLS connections to providers.
     // Idempotent: an `Err` means a provider is already installed (e.g. a second init in tests),
diff --git a/crates/gateway/src/metrics.rs b/crates/gateway/src/metrics.rs
index 9fe8e71..2f669b1 100644
--- a/crates/gateway/src/metrics.rs
+++ b/crates/gateway/src/metrics.rs
@@ -4,7 +4,8 @@
 //! exposes them with no extra wiring. `Metrics::new` is called exactly once (in `main`).
 
 use prometheus::{
-    HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, Opts, default_registry,
+    Histogram, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, Opts,
+    default_registry,
 };
 use std::sync::Arc;
 
@@ -26,6 +27,13 @@ pub struct Metrics {
     /// "cache hit rate fell off a cliff after a deploy" (cache write ≈ 3× input, cache read ≈ 0.1×,
     /// so a regression is a real cost event, not just a latency one).
     pub tokens_total: IntCounterVec,
+    /// The four `tokens_total` children, resolved once at boot. The label set (`input`/`output`/
+    /// `cache_read`/`cache_write`) is fixed and known at compile time, so we pay the
+    /// `with_label_values` map lookup once here instead of four times per metered response.
+    pub tokens_input: IntCounter,
+    pub tokens_output: IntCounter,
+    pub tokens_cache_read: IntCounter,
+    pub tokens_cache_write: IntCounter,
     /// Labeled by provider: TTFT varies by an order of magnitude across providers (Groq/Cerebras
     /// <100ms vs. a large Anthropic/xAI model at seconds), so an unlabeled histogram can't tell you
     /// *which* provider's first-token time regressed.
@@ -92,6 +100,12 @@ impl Metrics {
         )?;
         let tokens_total =
             IntCounterVec::new(Opts::new("ai_tokens_total", "Tokens metered"), &["kind"])?;
+        // Resolve the fixed-label children once. Created against the (about-to-be-registered) vec, so
+        // they export normally; the hot path then bumps a direct handle, no per-call label lookup.
+        let tokens_input = tokens_total.with_label_values(&["input"]);
+        let tokens_output = tokens_total.with_label_values(&["output"]);
+        let tokens_cache_read = tokens_total.with_label_values(&["cache_read"]);
+        let tokens_cache_write = tokens_total.with_label_values(&["cache_write"]);
         let ttft_seconds = HistogramVec::new(
             HistogramOpts::new("ai_ttft_seconds", "Time to first byte from upstream")
                 .buckets(TTFT_BUCKETS.to_vec()),
@@ -138,6 +152,10 @@ impl Metrics {
             upstream_responses_total,
             connect_retries_total,
             tokens_total,
+            tokens_input,
+            tokens_output,
+            tokens_cache_read,
+            tokens_cache_write,
             ttft_seconds,
             upstream_latency_seconds,
             active_streams,
@@ -147,3 +165,64 @@ impl Metrics {
         }))
     }
 }
+
+/// Per-provider metric handles, resolved once at boot and held on the [`Provider`](crate::route::Provider).
+///
+/// Every per-provider metric (`ttft_seconds`, `upstream_latency_seconds`, `upstream_responses_total`,
+/// `connect_retries_total`) is keyed on the provider name — a label known at boot from the provider
+/// registry. Resolving the child handles here lets the response path bump a direct counter/histogram
+/// instead of doing a string-keyed `with_label_values` map lookup on every response.
+pub struct ProviderMetrics {
+    pub ttft_seconds: Histogram,
+    pub upstream_latency_seconds: Histogram,
+    pub connect_retries_total: IntCounter,
+    /// Responses by status class, indexed `[2xx, 3xx, 4xx, 5xx]` (see [`Self::record_response`]).
+    responses: [IntCounter; 4],
+}
+
+impl ProviderMetrics {
+    /// Resolve the child handles for `provider` from the shared label vecs. Called once per provider
+    /// at boot (see `state::build_providers`).
+    pub fn resolve(m: &Metrics, provider: &str) -> Self {
+        ProviderMetrics {
+            ttft_seconds: m.ttft_seconds.with_label_values(&[provider]),
+            upstream_latency_seconds: m.upstream_latency_seconds.with_label_values(&[provider]),
+            connect_retries_total: m.connect_retries_total.with_label_values(&[provider]),
+            responses: [
+                m.upstream_responses_total
+                    .with_label_values(&[provider, "2xx"]),
+                m.upstream_responses_total
+                    .with_label_values(&[provider, "3xx"]),
+                m.upstream_responses_total
+                    .with_label_values(&[provider, "4xx"]),
+                m.upstream_responses_total
+                    .with_label_values(&[provider, "5xx"]),
+            ],
+        }
+    }
+
+    /// Count one upstream response by status class; any status outside 200–499 counts as `5xx`.
+    pub fn record_response(&self, status: u16) {
+        let idx = match status {
+            200..=299 => 0,
+            300..=399 => 1,
+            400..=499 => 2,
+            _ => 3,
+        };
+        self.responses[idx].inc();
+    }
+
+    /// Standalone, **unregistered** handles for tests that build a `Provider` without a live registry.
+    #[cfg(test)]
+    pub fn disconnected() -> Self {
+        let counter = || IntCounter::new("t", "t").expect("valid counter opts");
+        let hist =
+            || Histogram::with_opts(HistogramOpts::new("t", "t")).expect("valid histogram opts");
+        ProviderMetrics {
+            ttft_seconds: hist(),
+            upstream_latency_seconds: hist(),
+            connect_retries_total: counter(),
+            responses: [counter(), counter(), counter(), counter()],
+        }
+    }
+}
diff --git a/crates/gateway/src/proxy.rs b/crates/gateway/src/proxy.rs
index ee48efb..efbe57b 100644
--- a/crates/gateway/src/proxy.rs
+++ b/crates/gateway/src/proxy.rs
@@ -1,7 +1,7 @@
 //! The Pingora `ProxyHttp` passthrough service.
 //!
-//! Flow: verify the virtual key (stateless) → deny-set check (O(1), default-allow) → pick the
-//! provider from the ingress dialect (+ optional `x-beyond-provider` override) → swap the auth
+//! Flow: pick the provider from the **first path segment** (`/{provider}/…`) → verify the virtual
+//! key (stateless) → deny-set check (O(1), default-allow) → swap the auth
 //! header to the pool key (managed only) → **stream the request body straight through** (never
 //! buffered; original framing preserved) while feeding it to a structural scanner that extracts the
 //! exact root-level `model` → relay the response **without buffering** → tap usage from a bounded
@@ -28,12 +28,15 @@
 //! the pool key); anything else is a **BYO** request — the user's own provider token, passed
 //! through unchanged (no swap, no Beyond identity, no deny-set).
 //!
-//! Consequence: routing is by **dialect**, not model — the body (hence model) isn't known when
-//! `upstream_peer` runs. Any non-default provider is reached via the `x-beyond-provider` header
-//! (providers are data — see `route`). Model is still captured (from the streamed body) for usage.
+//! Routing is by the **first path segment** = provider name (`route`, data-driven): `/{provider}/…`
+//! selects the provider and the rest of the path is forwarded **verbatim** (the gateway holds no
+//! per-provider mount knowledge). A bare path with no provider prefix that starts with `/v1` is the
+//! drop-in default — dialect picks openai/anthropic (`dialect_for_path`) — so an OpenAI/Anthropic
+//! client works by changing only the host. An unknown first segment is a 404. Model isn't used for
+//! routing (the body isn't read pre-connect); it's still captured from the body for usage.
 
 use crate::route::{self, Dialect, Provider};
-use crate::state::GatewayState;
+use crate::state::{GatewayState, RequestId};
 use crate::{peek, usage};
 use async_trait::async_trait;
 use bytes::Bytes;
@@ -42,6 +45,7 @@ use pingora_core::Result;
 use pingora_core::protocols::ALPN;
 use pingora_core::upstreams::peer::HttpPeer;
 use pingora_proxy::{ProxyHttp, Session};
+use std::borrow::Cow;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tracing::{info, warn};
@@ -79,6 +83,11 @@ pub struct RequestCtx {
     /// The resolved upstream provider (authority/host + precomputed managed auth value), shared from
     /// the boot-time registry — a cheap `Arc` clone, nothing re-allocated per request.
     provider: Arc<Provider>,
+    /// The path (+ query) to send upstream: the client path with the `/{provider}` segment stripped
+    /// (provider-prefixed request) or unchanged (bare-path default). Forwarded **verbatim** — the
+    /// gateway does no per-provider path rewriting. Applied as the upstream URI in
+    /// `upstream_request_filter`.
+    forward_path: String,
     /// Whether this is a **managed** request (`bai_…` key → swap to the pool key). `false` for
     /// **BYO** — we leave the user's own auth header untouched (passthrough).
     managed: bool,
@@ -116,7 +125,7 @@ pub struct RequestCtx {
     attempt: u8,
     /// Process-unique id for this request (`{instance}-{seq}`), echoed in the `x-beyond-request-id`
     /// response header and the `ai.usage` event so a client report ties back to a log line.
-    request_id: String,
+    request_id: RequestId,
 }
 
 impl AiProxy {
@@ -172,12 +181,16 @@ const MAX_MODEL_LEN: usize = 128;
 /// string or a line-oriented log: control bytes, `"`, `\`, `DEL`. A violating or over-long value is
 /// recorded as `"unknown"` (matching `peek`'s non-UTF-8 fallback) rather than the raw bytes — a
 /// mislabeled-but-safe usage row beats a corrupted or injected one.
-fn sanitize_model(model: String) -> String {
+fn sanitize_model(model: String) -> Cow<'static, str> {
     let bad = model.len() > MAX_MODEL_LEN
         || model
             .bytes()
             .any(|b| b < 0x20 || b == b'"' || b == b'\\' || b == 0x7f);
-    if bad { "unknown".to_string() } else { model }
+    if bad {
+        Cow::Borrowed("unknown")
+    } else {
+        Cow::Owned(model)
+    }
 }
 
 fn dialect_for_path(path: &str) -> Dialect {
@@ -189,11 +202,13 @@ fn dialect_for_path(path: &str) -> Dialect {
     }
 }
 
-/// OpenAI **streaming-capable** endpoints: chat completions + the Responses API. These are the only
-/// requests we buffer for `stream_options.include_usage` injection — embeddings and every other
-/// OpenAI-dialect path never stream, so there's nothing to meter and no reason to buffer them.
-fn openai_streamable_path(path: &str) -> bool {
-    path.starts_with("/v1/chat/completions") || path.starts_with("/v1/responses")
+/// Whether the **forwarded** (provider-native) path targets an OpenAI streaming-capable endpoint
+/// (chat completions / Responses API). Matches the path's *suffix* — any `?query` is ignored — so
+/// it holds under every mount prefix (`/v1/…`, `/openai/v1/…`, `/inference/v1/…`). Only these get
+/// buffered for `stream_options.include_usage` injection; other endpoints never stream.
+fn is_streamable_path(forward_path: &str) -> bool {
+    let path = forward_path.split_once('?').map_or(forward_path, |(path, _)| path);
+    path.ends_with("/chat/completions") || path.ends_with("/responses")
+}
 
 /// Splice `stream_options.include_usage` into a buffered OpenAI chat body when it streams without it
@@ -213,17 +228,6 @@ fn maybe_inject_stream_usage(body: Vec<u8>) -> Vec<u8> {
     }
 }
 
-/// The `x-beyond-provider` override value, if present — a provider *name* resolved against the
-/// registry in `request_filter`. (An unknown name is rejected there, not silently ignored.)
-fn provider_override(session: &Session) -> Option<&str> {
-    session
-        .req_header()
-        .headers
-        .get("x-beyond-provider")?
-        .to_str()
-        .ok()
-}
-
 #[async_trait]
 impl ProxyHttp for AiProxy {
     type CTX = Option<RequestCtx>;
@@ -240,29 +244,49 @@ impl ProxyHttp for AiProxy {
         // admitted path. Cheap: a counter bump + a short `format!` (see `next_request_id`).
         let request_id = self.state.next_request_id();
 
-        // 1. Resolve the upstream provider first — from the ingress dialect (the body/model isn't
-        // available pre-connect), with an explicit `x-beyond-provider` override. Resolving up front
-        // means an unknown provider is a clean 400 before any auth work, and (since it borrows
-        // nothing) keeps the borrow checker happy when the key is extracted next. An `Arc` clone of
-        // the boot-time registry entry — nothing re-allocated per request.
-        let dialect = dialect_for_path(session.req_header().uri.path());
-        let provider = match provider_override(session) {
-            Some(name) => self.state.provider(name).cloned(),
-            None => self
-                .state
-                .provider(route::dialect_default(dialect))
-                .cloned(),
+        // 1. Route by the **first path segment** = provider; forward the rest of the path verbatim
+        // (native passthrough — the gateway holds no per-provider mount knowledge). A path with no
+        // provider segment that starts with `/v1` is the drop-in default: dialect picks
+        // openai/anthropic and the path is forwarded as-is. Anything else → unknown provider (404).
+        // We resolve before auth (an unknown route is cheap) and compute owned values inside the
+        // block so the session borrow ends before any `&mut session` reject below.
+        let (provider_opt, forward_path) = {
+            let uri = &session.req_header().uri;
+            let path = uri.path();
+            let query = uri.query();
+            // `nth(1)`: `/openai/v1/…` → "openai"; `/v1/…` → "v1"; "/" or "" → "".
+            let first = path.split('/').nth(1).unwrap_or("");
+            let with_query = |p: &str| match query {
+                Some(q) => format!("{p}?{q}"),
+                None => p.to_string(),
+            };
+            if let Some(p) = self.state.provider(first) {
+                // Provider-prefixed: strip the leading `/{first}` segment, forward the remainder.
+                let rest = &path[1 + first.len()..];
+                (
+                    Some(p.clone()),
+                    with_query(if rest.is_empty() { "/" } else { rest }),
+                )
+            } else if path.starts_with(route::DEFAULT_PREFIX) {
+                // Bare default: dialect picks the provider; forward the path unchanged.
+                let name = route::dialect_default(dialect_for_path(path));
+                (self.state.provider(name).cloned(), with_query(path))
+            } else {
+                (None, String::new())
+            }
         };
-        let Some(provider) = provider else {
+        let Some(provider) = provider_opt else {
             return Self::reject(
                 session,
                 &request_id,
-                400,
+                404,
                 "invalid_request_error",
                 "unknown provider",
             )
             .await;
         };
+        // Dialect now comes from the resolved provider (usage parsing + injection eligibility).
+        let dialect = provider.dialect;
 
         // 2. Extract the presented key — a managed virtual key (`bai_…`) or a raw BYO provider token.
         let Some(raw_key) = extract_virtual_key(session) else {
@@ -381,16 +405,16 @@ impl ProxyHttp for AiProxy {
         // Mark OpenAI managed chat/responses streams for body buffering + `stream_options` injection
         // (handled in `request_body_filter`). Scoped tight: managed only (BYO stays pure
         // passthrough), OpenAI dialect only, streaming-capable paths only — so everything else still
-        // streams through untouched.
-        let inject_eligible = managed
-            && dialect == Dialect::OpenAI
-            && openai_streamable_path(session.req_header().uri.path());
+        // streams through untouched. Checked on the forwarded path (suffix), so it's prefix-agnostic.
+        let inject_eligible =
+            managed && dialect == Dialect::OpenAI && is_streamable_path(&forward_path);
 
         *ctx = Some(RequestCtx {
             tenant_id,
             vpc_id,
             dialect,
             provider,
+            forward_path,
             managed,
             model: String::new(),
             model_scanner: peek::ModelScanner::new(),
@@ -441,7 +465,7 @@ impl ProxyHttp for AiProxy {
                 // already formatted into `e`) instead of discarding it behind an opaque static string.
                 // `error_because` chains `e` as the cause so it shows in the Pingora error log.
                 warn!(
-                    request_id = rc.request_id,
+                    request_id = %rc.request_id,
                     provider = rc.provider.name.as_str(),
                     authority = rc.provider.authority.as_str(),
                     error = %e,
@@ -509,22 +533,23 @@ impl ProxyHttp for AiProxy {
             }
         }
 
-        // Point Host at the upstream. The body passes through untouched, so the client's original
-        // framing (Content-Length / chunked) is preserved — true passthrough.
+        // Point Host at the upstream.
         upstream_request.insert_header("host", rc.provider.host.as_str())?;
 
-        // Rewrite the path to the provider's mount point when it isn't `/v1` (e.g. Groq serves the
-        // OpenAI surface under `/openai/v1`, Fireworks under `/inference/v1`). Most providers mount
-        // at `/v1`, so `upstream_path` returns `None` and the URI is left untouched (no realloc).
-        // The query string is preserved.
-        if let Some(new_path) = rc.provider.upstream_path(upstream_request.uri.path()) {
-            let pq = match upstream_request.uri.query() {
-                Some(q) => format!("{new_path}?{q}"),
-                None => new_path,
-            };
-            if let Ok(uri) = pq.parse() {
-                upstream_request.set_uri(uri);
-            }
+        // Forward the provider-native path (computed in `request_filter`): the client path with the
+        // `/{provider}` segment stripped, or unchanged for a bare-path default. We send it verbatim —
+        // no per-provider rewriting. Only set the URI when it actually differs from the inbound path
+        // (i.e. a `/{provider}` prefix was stripped); the bare-path case needs no change, so we skip
+        // the parse + realloc. The body's framing (Content-Length / chunked) is preserved.
+        if rc.forward_path
+            != upstream_request
+                .uri
+                .path_and_query()
+                .map(|pq| pq.as_str())
+                .unwrap_or("")
+            && let Ok(uri) = rc.forward_path.parse()
+        {
+            upstream_request.set_uri(uri);
         }
 
         // Injection-eligible (OpenAI managed stream): the body is rewritten in `request_body_filter`,
@@ -599,7 +624,7 @@ impl ProxyHttp for AiProxy {
 
         if end_of_stream && rc.model.is_empty() {
             if let Some(m) = rc.model_scanner.take_model() {
-                rc.model = sanitize_model(m);
+                rc.model = sanitize_model(m).into_owned();
             }
         }
         Ok(())
@@ -612,27 +637,19 @@ impl ProxyHttp for AiProxy {
         ctx: &mut Self::CTX,
     ) -> Result<()> {
         if let Some(rc) = ctx.as_mut() {
-            // Headers arrived ≈ time-to-first-byte. Labeled by provider — first-token latency is
-            // per-provider, so an unlabeled histogram can't tell you which one regressed.
-            self.state
+            // Headers arrived ≈ time-to-first-byte. Per-provider handle resolved once at boot (see
+            // `ProviderMetrics`) — first-token latency is per-provider, so an unlabeled histogram
+            // can't tell you which one regressed.
+            rc.provider
                 .metrics
                 .ttft_seconds
-                .with_label_values(&[rc.provider.name.as_str()])
                 .observe(rc.start.elapsed().as_secs_f64());
 
             // Per-provider response counter, bucketed by status class — the signal that a provider
             // is degrading (429/5xx) before it shows up only as latency or a missing usage event.
-            let status_class = match upstream_response.status.as_u16() {
-                s if (200..300).contains(&s) => "2xx",
-                s if (300..400).contains(&s) => "3xx",
-                s if (400..500).contains(&s) => "4xx",
-                _ => "5xx",
-            };
-            self.state
+            rc.provider
                 .metrics
-                .upstream_responses_total
-                .with_label_values(&[rc.provider.name.as_str(), status_class])
-                .inc();
+                .record_response(upstream_response.status.as_u16());
 
             // Derive streaming from the response, not the request: SSE ⇒ use the streaming usage
             // parser; otherwise the body is a single JSON object.
@@ -651,7 +668,7 @@ impl ProxyHttp for AiProxy {
             // Echo the request id so a client (or an oncall reading a captured response) can quote it
             // and land on this request's log line. `insert_header` only fails on an invalid value;
             // our id is `[0-9a-f-]`, always valid — but surface a failure rather than silently drop.
-            upstream_response.insert_header(REQUEST_ID_HEADER, &rc.request_id)?;
+            upstream_response.insert_header(REQUEST_ID_HEADER, rc.request_id.as_str())?;
         }
         Ok(())
     }
@@ -705,13 +722,9 @@ impl ProxyHttp for AiProxy {
                 // egress-IP ban — connect is where that first bites) shows up only as extra latency
                 // on `upstream_latency_seconds`, indistinguishable from a slow model. The counter is
                 // the dashboard signal; the `warn!` carries the request_id to grep.
-                self.state
-                    .metrics
-                    .connect_retries_total
-                    .with_label_values(&[rc.provider.name.as_str()])
-                    .inc();
+                rc.provider.metrics.connect_retries_total.inc();
                 warn!(
-                    request_id = rc.request_id,
+                    request_id = %rc.request_id,
                     provider = rc.provider.name.as_str(),
                     attempt = rc.attempt,
                     error = %e,
@@ -742,7 +755,7 @@ impl ProxyHttp for AiProxy {
         // 42 get 502s for 5 minutes" is one grep on the request_id, not a reconstruction.
         if let Some(e) = e {
             warn!(
-                request_id = rc.request_id,
+                request_id = %rc.request_id,
                 tenant_id = rc.tenant_id,
                 vpc_id = rc.vpc_id,
                 provider = rc.provider.name.as_str(),
@@ -766,22 +779,16 @@ impl ProxyHttp for AiProxy {
         .unwrap_or_default();
 
         let m = &self.state.metrics;
-        m.tokens_total
-            .with_label_values(&["input"])
-            .inc_by(usage.input_tokens);
-        m.tokens_total
-            .with_label_values(&["output"])
-            .inc_by(usage.output_tokens);
+        // Pre-resolved fixed-label children (see `Metrics`) — no per-call `with_label_values` lookup.
+        m.tokens_input.inc_by(usage.input_tokens);
+        m.tokens_output.inc_by(usage.output_tokens);
         // Cache tokens, too — these are in the `ai.usage` billing log below, but that ships with lag;
         // the counter is the alerting surface for a cache-hit-rate cliff after a deploy.
-        m.tokens_total
-            .with_label_values(&["cache_read"])
-            .inc_by(usage.cache_read_tokens);
-        m.tokens_total
-            .with_label_values(&["cache_write"])
-            .inc_by(usage.cache_write_tokens);
-        m.upstream_latency_seconds
-            .with_label_values(&[rc.provider.name.as_str()])
+        m.tokens_cache_read.inc_by(usage.cache_read_tokens);
+        m.tokens_cache_write.inc_by(usage.cache_write_tokens);
+        rc.provider
+            .metrics
+            .upstream_latency_seconds
             .observe(rc.start.elapsed().as_secs_f64());
         // Balance the `active_streams` increment from `response_filter`. `logging` runs exactly once
         // per request (including on upstream errors / client disconnects), so a stream that opened is
@@ -803,18 +810,17 @@ impl ProxyHttp for AiProxy {
             // product analytics ("what they asked for") and a fallback rate when a snapshot is newer
             // than the downstream price table. They're equal when the response carried no model (e.g.
             // an error body), where `model` falls back to the request alias. Both sanitized.
-            let billed_model = rc
-                .resp_model_scanner
-                .take_model()
-                .map(sanitize_model)
-                .unwrap_or_else(|| rc.model.clone());
+            let billed = rc.resp_model_scanner.take_model().map(sanitize_model);
+            // Borrow the requested model as the fallback rather than cloning it — it's still read as
+            // `requested_model` below, so a clone would be pure waste on every managed response.
+            let billed_model = billed.as_deref().unwrap_or(&rc.model);
             info!(
                 target: "ai.usage",
-                request_id = rc.request_id,
+                request_id = %rc.request_id,
                 tenant_id = rc.tenant_id,
                 vpc_id = rc.vpc_id,
                 provider = rc.provider.name.as_str(),
-                model = %billed_model,
+                model = billed_model,
                 requested_model = %rc.model,
                 stream = rc.streaming,
                 input_tokens = usage.input_tokens,
diff --git a/crates/gateway/src/route.rs b/crates/gateway/src/route.rs
index 9f69e07..587f848 100644
--- a/crates/gateway/src/route.rs
+++ b/crates/gateway/src/route.rs
@@ -1,20 +1,27 @@
 //! Provider routing and per-provider wire details — **data-driven**.
 //!
-//! Passthrough-first: the ingress *dialect* (which API surface the client called) picks the default
-//! provider; an `x-beyond-provider: <name>` header selects any registered provider by name. A
-//! provider is a *row* in [`KNOWN_PROVIDERS`] (name, upstream authority, base path, auth scheme) —
+//! The provider is the **first path segment** of the request (`/{provider}/…`); the rest of the path
+//! is forwarded to the upstream **verbatim** (native passthrough — the gateway holds no per-provider
+//! path knowledge). A path with no provider prefix that starts with `/v1` routes by *dialect* —
+//! `/v1/messages*` → `anthropic`, else → `openai` — so an OpenAI/Anthropic client is drop-in by
+//! changing only the host. An unrecognized first segment is a 404 (see `proxy::request_filter`).
+//!
+//! A provider is a *row* in [`KNOWN_PROVIDERS`] (name, upstream authority, dialect, auth scheme) —
 //! adding an OpenAI-wire provider (Groq, DeepSeek, Together, …) is one line there, no new code
-//! paths, no enum, no match arms. Operators can also add/override providers from config (see
-//! `state`/`config`). We do not translate between dialects — that's deliberately out of scope.
+//! paths. Operators can also add/override providers from config (see `state`/`config`). We do not
+//! translate between dialects — that's deliberately out of scope.
 
+use crate::metrics::ProviderMetrics;
 use crate::secret::Secret;
 
-/// The path prefix client SDKs use (OpenAI + Anthropic both mount their API under `/v1`). A provider
-/// whose `base_path` differs has this leading segment rewritten to its prefix (see
-/// [`Provider::upstream_path`]).
-pub const CLIENT_PREFIX: &str = "/v1";
+/// The default API prefix OpenAI/Anthropic clients use. A request with no provider segment that
+/// starts with this is routed to a default provider by [`dialect_for_path`](crate::proxy) (the
+/// bare-path drop-in case); anything else with an unknown first segment is a 404.
+pub const DEFAULT_PREFIX: &str = "/v1";
 
-/// Which API surface the client called. Drives usage parsing and the default provider.
+/// Which API surface the client called. Drives usage parsing and the bare-path default provider.
+/// A request's dialect is always the selected provider's own [`Provider::dialect`]; on a bare-path
+/// request the path (`proxy::dialect_for_path`) only decides which default provider that is.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum Dialect {
     OpenAI,
@@ -53,106 +60,110 @@ pub struct ProviderSpec {
     pub name: &'static str,
     /// Default upstream `host:port` (TLS:443). Overridable per-provider via config.
     pub authority: &'static str,
-    /// Where the provider mounts the OpenAI-wire surface. The client always calls `/v1/…` (its
-    /// SDK's fixed prefix); for a provider whose `base_path != "/v1"` the gateway rewrites that
-    /// leading segment (e.g. Groq serves under `/openai/v1`, so `/v1/chat/completions` →
-    /// `/openai/v1/chat/completions`). Most providers mount at `/v1`, so this is usually `"/v1"`.
-    pub base_path: &'static str,
+    /// The provider's wire format — drives usage parsing and `stream_options` injection eligibility.
+    /// (We forward the client's path verbatim, so the *path* doesn't tell us the wire format; the
+    /// provider does.)
+    pub dialect: Dialect,
     pub auth: AuthScheme,
 }
 
 /// The providers the gateway knows out of the box. All but Anthropic speak the OpenAI wire format
-/// (Bearer auth, chat/completions + embeddings); they differ by authority and where they mount that
-/// surface (`base_path`) — a new one is a single row here, then reachable via `x-beyond-provider:
-/// <name>`. (Config can add further OpenAI-wire providers or override any authority — see
+/// (Bearer auth, chat/completions + embeddings); a new one is a single row here, then reachable at
+/// `/{name}/…`. (Config can add further OpenAI-wire providers or override any authority — see
 /// `state::build_providers`.)
 ///
-/// `base_path` values are deliberate, not cosmetic: Groq/OpenRouter/Fireworks do **not** mount at
-/// `/v1`, so a verbatim path passthrough would 404 against the real provider.
-///
-/// Every row's `authority`/`base_path`/`auth` was verified against the provider's **official** docs
-/// (cited inline) as of 2026-05; the citation is the source of truth if a provider later moves an
-/// endpoint. These are static facts, so doc-verification (not a live call) is the right proof.
+/// We forward the path after `/{name}` **verbatim**, so the gateway carries no per-provider mount
+/// path — the client uses the provider's native base path (e.g. `/groq/openai/v1/chat/completions`,
+/// `/fireworks/inference/v1/chat/completions`), exactly as it would hitting the provider directly.
+/// Each row's `authority`/`auth` is verified against the provider's **official** docs (cited inline)
+/// as of 2026-05; the client-facing native path is noted alongside as a convenience.
 pub const KNOWN_PROVIDERS: &[ProviderSpec] = &[
     // docs: https://platform.openai.com/docs/api-reference/authentication — base https://api.openai.com/v1, Bearer.
+    // Client path: /openai/v1/… (or bare /v1/… as the default).
     ProviderSpec {
         name: "openai",
         authority: "api.openai.com:443",
-        base_path: "/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
     // docs: https://docs.claude.com/en/api/messages — base https://api.anthropic.com, Messages at /v1/messages,
     // auth is `x-api-key` (NOT Bearer). The required `anthropic-version` header is the client's; we pass it through.
+    // Client path: /anthropic/v1/messages (or bare /v1/messages as the default).
     ProviderSpec {
         name: "anthropic",
         authority: "api.anthropic.com:443",
-        base_path: "/v1",
+        dialect: Dialect::Anthropic,
         auth: AuthScheme::XApiKey,
     },
-    // docs: https://openrouter.ai/docs/quickstart — base https://openrouter.ai/api/v1 (note `/api/v1`, not `/v1`), Bearer.
+    // docs: https://openrouter.ai/docs/quickstart — base https://openrouter.ai/api/v1, Bearer.
+    // Client path: /openrouter/api/v1/chat/completions.
     ProviderSpec {
         name: "openrouter",
         authority: "openrouter.ai:443",
-        base_path: "/api/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
     // docs: https://docs.fireworks.ai/tools-sdks/openai-compatibility — base https://api.fireworks.ai/inference/v1, Bearer.
+    // Client path: /fireworks/inference/v1/chat/completions.
     ProviderSpec {
         name: "fireworks",
         authority: "api.fireworks.ai:443",
-        base_path: "/inference/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
-    // docs: https://console.groq.com/docs/openai — base https://api.groq.com/openai/v1 (note `/openai/v1`), Bearer.
+    // docs: https://console.groq.com/docs/openai — base https://api.groq.com/openai/v1, Bearer.
+    // Client path: /groq/openai/v1/chat/completions.
     ProviderSpec {
         name: "groq",
         authority: "api.groq.com:443",
-        base_path: "/openai/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
     // docs: https://api-docs.deepseek.com/ — base https://api.deepseek.com/v1 (the `/v1` is an OpenAI-compat alias,
-    // not API versioning); /v1/chat/completions is officially supported. Bearer.
+    // not API versioning); /v1/chat/completions is officially supported. Bearer. Client path: /deepseek/v1/….
     ProviderSpec {
         name: "deepseek",
         authority: "api.deepseek.com:443",
-        base_path: "/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
     // docs: https://docs.together.ai/docs/openai-api-compatibility — base https://api.together.ai/v1, Bearer.
     // Canonical host is `api.together.ai`; the legacy `api.together.xyz` is still live but no longer documented.
+    // Client path: /together/v1/….
     ProviderSpec {
         name: "together",
         authority: "api.together.ai:443",
-        base_path: "/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
     // docs: https://inference-docs.cerebras.ai/resources/openai — base https://api.cerebras.ai/v1, Bearer.
+    // Client path: /cerebras/v1/….
     ProviderSpec {
         name: "cerebras",
         authority: "api.cerebras.ai:443",
-        base_path: "/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
-    // docs: https://docs.mistral.ai/api/ — base https://api.mistral.ai/v1, Bearer.
+    // docs: https://docs.mistral.ai/api/ — base https://api.mistral.ai/v1, Bearer. Client path: /mistral/v1/….
     ProviderSpec {
         name: "mistral",
         authority: "api.mistral.ai:443",
-        base_path: "/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
     // docs: https://docs.x.ai/docs/api-reference — base https://api.x.ai/v1, Bearer. Reasoning models are slow:
-    // the generous read/idle timeouts (see `config`) matter here.
+    // the generous read/idle timeouts (see `config`) matter here. Client path: /xai/v1/….
     ProviderSpec {
         name: "xai",
         authority: "api.x.ai:443",
-        base_path: "/v1",
+        dialect: Dialect::OpenAI,
         auth: AuthScheme::Bearer,
     },
 ];
 
-/// The default provider name for a dialect, used when no `x-beyond-provider` override is given.
-/// (Model-based auto-routing isn't possible — the body isn't read before peer selection — so the
-/// long tail is reached explicitly via the header.)
+/// The default provider name for a dialect — used only for the **bare-path** request (no provider
+/// segment), where the dialect is derived from the path. A provider-prefixed request names its
+/// provider directly.
 pub fn dialect_default(d: Dialect) -> &'static str {
     match d {
         Dialect::OpenAI => "openai",
@@ -170,24 +181,29 @@ pub struct Provider {
     pub authority: String,
     /// Bare upstream host (SNI / `Host` header) = authority without the port.
     pub host: String,
-    /// Where the provider mounts the OpenAI-wire surface (see [`ProviderSpec::base_path`]).
-    pub base_path: String,
+    /// The provider's wire format (usage parsing + injection eligibility). See [`ProviderSpec::dialect`].
+    pub dialect: Dialect,
     pub auth: AuthScheme,
     /// Precomputed managed auth header value (`Bearer <key>` / bare key). `None` ⇒ no pool key is
     /// configured for this provider ⇒ managed requests to it are rejected (503). Kept in `Secret`
     /// for the redacting-`Debug` + zeroize-on-drop hygiene of the underlying key.
     pub pool_auth_value: Option<Secret>,
+    /// Per-provider metric handles, resolved once here so the response path bumps a direct
+    /// counter/histogram instead of a string-keyed label lookup per response.
+    pub metrics: ProviderMetrics,
 }
 
 impl Provider {
-    /// Resolve a provider from its name, upstream authority, base path, auth scheme, and (optional)
-    /// pool key. Derives the bare host and precomputes the managed auth header value once.
+    /// Resolve a provider from its name, upstream authority, dialect, auth scheme, (optional) pool
+    /// key, and pre-resolved per-provider metric handles. Derives the bare host and precomputes the
+    /// managed auth header value once.
     pub fn resolve(
         name: &str,
         authority: String,
-        base_path: &str,
+        dialect: Dialect,
         auth: AuthScheme,
         pool_key: Option<&str>,
+        metrics: ProviderMetrics,
     ) -> Self {
         let host = authority
             .split(':')
@@ -199,28 +215,12 @@ impl Provider {
             name: name.to_string(),
             authority,
             host,
-            base_path: base_path.to_string(),
+            dialect,
             auth,
             pool_auth_value,
+            metrics,
         }
     }
-
-    /// Map a client request path to the upstream path for this provider. The client's SDK uses the
-    /// fixed `/v1` prefix; if this provider mounts elsewhere (`base_path != "/v1"`) the leading
-    /// `/v1` is replaced. Returns `None` when no rewrite is needed (the common `/v1` case, or a
-    /// path that doesn't start with `/v1`), so the hot path skips reallocating the URI.
-    pub fn upstream_path(&self, client_path: &str) -> Option<String> {
-        if self.base_path == CLIENT_PREFIX {
-            return None;
-        }
-        // Only remap when the segment is exactly `/v1` (followed by `/` or end), never a prefix
-        // match like `/v1beta`. `rest` keeps the remainder (incl. its leading `/`, or empty).
-        let rest = client_path.strip_prefix(CLIENT_PREFIX)?;
-        if !rest.is_empty() && !rest.starts_with('/') {
-            return None;
-        }
-        Some(format!("{}{}", self.base_path, rest))
-    }
 }
 
 #[cfg(test)]
@@ -246,6 +246,19 @@ mod tests {
         );
     }
 
+    #[test]
+    fn anthropic_is_the_only_anthropic_dialect() {
+        // The dialect selects the usage parser (and injection eligibility), so a wrong entry
+        // mis-meters: `anthropic` alone is on the Anthropic wire, everything else is OpenAI-wire.
+        for spec in KNOWN_PROVIDERS {
+            let expected = match spec.name {
+                "anthropic" => Dialect::Anthropic,
+                _ => Dialect::OpenAI,
+            };
+            assert_eq!(spec.dialect, expected, "{} dialect", spec.name);
+        }
+    }
+
     #[test]
     fn auth_scheme_formats_and_headers() {
         assert_eq!(AuthScheme::Bearer.header(), "authorization");
@@ -260,49 +273,24 @@ mod tests {
         let p = Provider::resolve(
             "openai",
             "api.openai.com:443".to_string(),
-            "/v1",
+            Dialect::OpenAI,
             AuthScheme::Bearer,
             Some("sk-x"),
+            ProviderMetrics::disconnected(),
         );
         assert_eq!(p.host, "api.openai.com");
+        assert_eq!(p.dialect, Dialect::OpenAI);
         assert_eq!(p.pool_auth_value.as_ref().unwrap().expose(), "Bearer sk-x");
 
         // No pool key ⇒ no managed auth value (managed requests to it would 503).
         let a = Provider::resolve(
             "anthropic",
             "api.anthropic.com:443".to_string(),
-            "/v1",
+            Dialect::Anthropic,
             AuthScheme::XApiKey,
             None,
+            ProviderMetrics::disconnected(),
         );
         assert!(a.pool_auth_value.is_none());
     }
-
-    #[test]
-    fn upstream_path_rewrites_only_non_v1_bases() {
-        let v1 = Provider::resolve("openai", "h:443".into(), "/v1", AuthScheme::Bearer, None);
-        // `/v1` provider: no rewrite (None) — the hot path passes the client path through verbatim.
-        assert_eq!(v1.upstream_path("/v1/chat/completions"), None);
-
-        let groq = Provider::resolve(
-            "groq",
-            "h:443".into(),
-            "/openai/v1",
-            AuthScheme::Bearer,
-            None,
-        );
-        assert_eq!(
-            groq.upstream_path("/v1/chat/completions").as_deref(),
-            Some("/openai/v1/chat/completions")
-        );
-        // Anthropic-style messages path under a remapped base, and the bare prefix.
-        assert_eq!(
-            groq.upstream_path("/v1/embeddings").as_deref(),
-            Some("/openai/v1/embeddings")
-        );
-        assert_eq!(groq.upstream_path("/v1").as_deref(), Some("/openai/v1"));
-        // A non-`/v1` path (e.g. a health probe) is left alone, as is a `/v1beta`-style false match.
-        assert_eq!(groq.upstream_path("/healthz"), None);
-        assert_eq!(groq.upstream_path("/v1beta/models"), None);
-    }
 }
diff --git a/crates/gateway/src/state.rs b/crates/gateway/src/state.rs
index c2324c5..b4b42dd 100644
--- a/crates/gateway/src/state.rs
+++ b/crates/gateway/src/state.rs
@@ -9,11 +9,13 @@ use crate::config::AiConfig;
 use crate::deny::DenySet;
 use crate::error::{GatewayError, Result};
 use crate::key::Keyring;
-use crate::metrics::Metrics;
+use crate::metrics::{Metrics, ProviderMetrics};
 use crate::ratelimit::RateLimit;
-use crate::route::{self, AuthScheme, Provider};
+use crate::route::{self, AuthScheme, Dialect, Provider};
 use arc_swap::ArcSwap;
+use arrayvec::ArrayString;
 use std::collections::HashMap;
+use std::fmt::Write as _;
 use std::net::SocketAddr;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicU64, Ordering};
@@ -23,11 +25,16 @@ use tracing::warn;
 /// How long a resolved upstream address is reused before re-resolving.
 const DNS_TTL: Duration = Duration::from_secs(60);
 
+/// A process-unique request id, `{instance:x}-{seq:x}`. Two `u64`s in hex (≤16 chars each) plus the
+/// `-` separator never exceed 33 bytes, so the id is stored inline in a fixed 33-byte buffer
+/// rather than a heap `String`. (It is minted for every request, fast rejects included.)
+pub type RequestId = ArrayString<33>;
+
 /// Build the resolved provider registry from the static known set + config: every known provider
 /// (its authority overridable by `provider_authorities`), plus any config-only OpenAI-wire provider
 /// (a `provider_authorities` entry whose name isn't known). Each provider's pool key (if any) is
 /// looked up by name and its managed auth header value precomputed.
-fn build_providers(config: &AiConfig) -> HashMap<String, Arc<Provider>> {
+fn build_providers(config: &AiConfig, metrics: &Metrics) -> HashMap<String, Arc<Provider>> {
     let mut providers = HashMap::new();
     for spec in route::KNOWN_PROVIDERS {
         let authority = config
@@ -41,9 +48,10 @@ fn build_providers(config: &AiConfig) -> HashMap<String, Arc<Provider>> {
             Arc::new(Provider::resolve(
                 spec.name,
                 authority,
-                spec.base_path,
+                spec.dialect,
                 spec.auth,
                 pool_key,
+                ProviderMetrics::resolve(metrics, spec.name),
             )),
         );
     }
@@ -57,9 +65,10 @@ fn build_providers(config: &AiConfig) -> HashMap<String, Arc<Provider>> {
                 Arc::new(Provider::resolve(
                     name,
                     authority.clone(),
-                    route::CLIENT_PREFIX,
+                    Dialect::OpenAI,
                     AuthScheme::Bearer,
                     pool_key,
+                    ProviderMetrics::resolve(metrics, name),
                 )),
             );
         }
@@ -90,11 +99,11 @@ pub struct GatewayState {
     /// only writes are the ~10 providers' entries refreshed once per `DNS_TTL`, applied via `rcu`.
     dns_cache: ArcSwap<HashMap<String, (SocketAddr, Instant)>>,
 
-    /// Per-process instance token (hex of 8 OS-random bytes), the high half of every `request_id`.
+    /// Per-process instance token (8 OS-random bytes), the high half of every `request_id`.
     /// Random rather than a uuid dep, so log lines from two gateways don't collide when aggregated —
     /// and random rather than the boot wall-clock, which collides when a rapid scale-up boots several
     /// instances within the same nanosecond.
-    instance_id: String,
+    instance_id: u64,
     /// Monotonic per-request counter, the low half of `request_id`. A relaxed `fetch_add` — the only
     /// requirement is uniqueness within the process, not cross-request ordering.
     request_seq: AtomicU64,
@@ -106,15 +115,24 @@ impl GatewayState {
         // No signing keys ⇒ every `bai_…` fails verify and falls through to BYO treatment: no key
         // swap, no deny-set, no `ai.usage` billing. That's a *valid* mode (a BYO-only deployment),
         // but a far more common cause is a missing/typo'd `signing_keys` (SSM param, env) — which
-        // looks healthy while silently dropping all billing. Warn loudly so the boot logs flag it;
-        // don't `exit` (BYO-only is legitimate and tests rely on it).
+        // looks healthy while silently dropping all billing. A managed deployment sets
+        // `require_signing_keys = true` so this mis-deploy is a hard, visible boot failure; otherwise
+        // we warn loudly and continue (BYO-only is legitimate and the test/e2e harnesses run keyless).
         if config.signing_keys.is_empty() {
+            if config.require_signing_keys {
+                return Err(GatewayError::Config(
+                    "require_signing_keys is set but no signing_keys are configured — refusing to \
+                     boot into silent BYO-only mode (no key swap, no deny-set, no billing). Check \
+                     the signing_keys config / SSM param."
+                        .to_string(),
+                ));
+            }
             warn!(
                 "no signing_keys configured — all managed (bai_) traffic will be treated as BYO \
                  (no key swap, no deny-set, no billing). Expected only for a BYO-only deployment."
             );
         }
-        let providers = build_providers(&config);
+        let providers = build_providers(&config, &metrics);
         let rate_limit = RateLimit::new(config.rate_limit_rps, config.byo_rate_limit_rps);
 
         // 8 OS-random bytes as the instance token, so two gateways' request_ids never collide when
@@ -140,7 +158,7 @@ impl GatewayState {
             deny: ArcSwap::from_pointee(DenySet::new()),
             rate_limit,
             dns_cache: ArcSwap::from_pointee(HashMap::new()),
-            instance_id: format!("{instance:x}"),
+            instance_id: instance,
             request_seq: AtomicU64::new(0),
             config,
         }))
@@ -149,14 +167,21 @@ impl GatewayState {
     /// A process-unique request id (`{instance}-{seq}`) for log correlation and the
     /// `x-beyond-request-id` response header. Deliberately *not* a uuid: a per-process instance
     /// token (computed once at boot) plus a relaxed atomic counter is unique across the fleet, costs
-    /// one `fetch_add` + one small `format!`, and needs no extra crate or randomness.
-    pub fn next_request_id(&self) -> String {
+    /// one `fetch_add` + a hex format into a stack buffer (no heap allocation), and needs no
+    /// randomness per request.
+    pub fn next_request_id(&self) -> RequestId {
         let seq = self.request_seq.fetch_add(1, Ordering::Relaxed);
-        format!("{}-{seq:x}", self.instance_id)
+        let mut id = RequestId::new();
+        // Can't overflow: two `u64`s in hex + `-` is ≤33 bytes, exactly the buffer's capacity. The
+        // `write!` is infallible here, but if a future format change ever exceeded the cap we'd
+        // rather emit a truncated id than panic on a correlation aid — so swallow the result.
+        let _ = write!(id, "{:x}-{seq:x}", self.instance_id);
+        id
     }
 
-    /// The resolved provider for `name` (`x-beyond-provider` value or dialect default), or `None`
-    /// if no such provider is registered.
+    /// The resolved provider for `name` (the request's first path segment, or the bare-path dialect
+    /// default), or `None` if no such provider is registered — which `request_filter` turns into a
+    /// 404.
     pub fn provider(&self, name: &str) -> Option<&Arc<Provider>> {
         self.providers.get(name)
     }
@@ -181,9 +206,20 @@ impl GatewayState {
         // and both rcu; that's harmless (same answer, last writer wins) and far cheaper than holding
         // a lock across `getaddrinfo`. The clone-on-write copies a ~10-entry map — trivial, and only
         // on the rare miss/refresh path, never on a hit.
+        //
+        // Sweep entries that are long dead while we're already paying for the clone. The cache keys
+        // are provider authorities, which come entirely from the boot-time registry (so in practice
+        // the map is bounded by the provider count, not by traffic) — this sweep is belt-and-
+        // suspenders against authorities ever becoming dynamic, and it's a *TTL* drop, not an
+        // eviction *policy*: there's no capacity contest here, so LRU/SIEVE would be machinery for a
+        // problem we don't have. We keep anything within `2 × DNS_TTL` so a still-live provider whose
+        // entry just expired (and is about to be refreshed) is never dropped out from under a
+        // concurrent resolve.
+        let now = Instant::now();
         self.dns_cache.rcu(|cur| {
             let mut next = HashMap::clone(cur);
-            next.insert(authority.to_string(), (addr, Instant::now()));
+            next.retain(|_, (_, at)| now.duration_since(*at) < DNS_TTL * 2);
+            next.insert(authority.to_string(), (addr, now));
             next
         });
         Ok(addr)
@@ -222,7 +258,7 @@ mod tests {
             ]),
             ..Default::default()
         };
-        let providers = build_providers(&config);
+        let providers = build_providers(&config, &test_metrics());
 
         // Known provider: authority overridden, pool auth precomputed in the right scheme.
         let openai = providers.get("openai").unwrap();
diff --git a/crates/gateway/src/usage.rs b/crates/gateway/src/usage.rs
index f5281fb..958846a 100644
--- a/crates/gateway/src/usage.rs
+++ b/crates/gateway/src/usage.rs
@@ -5,6 +5,8 @@
 //! an SSE stream. For streaming we scan the relayed bytes for the usage event but never block the
 //! relay on it (see `proxy`).
 
+use serde::Deserialize;
+
 #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
 pub struct Usage {
     pub input_tokens: u64,
@@ -13,59 +15,104 @@ pub struct Usage {
     pub cache_write_tokens: u64,
 }
 
-fn u64_at(v: &serde_json::Value, ptr: &str) -> u64 {
-    v.pointer(ptr).and_then(|x| x.as_u64()).unwrap_or(0)
+// Typed views of just the fields we meter. Deserializing into these (rather than a
+// `serde_json::Value` DOM) lets serde skip every field we don't read without allocating a node for
+// it — no `Map`/`String`/`Number` tree to build and drop per body or per SSE line. Every field is
+// `#[serde(default)]` so a missing field reads as zero, as with the prior `unwrap_or(0)` pointers;
+// unlike them, a present field that fails to deserialize as its declared type fails the payload.
+
+/// OpenAI-wire `usage` block: `prompt`/`completion` → in/out, `prompt_tokens_details.cached_tokens`
+/// → cache read (no cache-write on this wire). `Option`s: an explicit `null` meters as 0, not Err.
+#[derive(Deserialize, Default)]
+struct OpenAiUsage {
+    #[serde(default)]
+    prompt_tokens: Option<u64>,
+    #[serde(default)]
+    completion_tokens: Option<u64>,
+    #[serde(default)]
+    prompt_tokens_details: Option<OpenAiPromptDetails>,
+}
+
+#[derive(Deserialize, Default)]
+struct OpenAiPromptDetails {
+    #[serde(default)]
+    cached_tokens: Option<u64>,
+}
+
+impl From<OpenAiUsage> for Usage {
+    fn from(u: OpenAiUsage) -> Self {
+        Usage {
+            input_tokens: u.prompt_tokens.unwrap_or(0),
+            output_tokens: u.completion_tokens.unwrap_or(0),
+            cache_read_tokens: u.prompt_tokens_details.and_then(|d| d.cached_tokens).unwrap_or(0),
+            cache_write_tokens: 0,
+        }
+    }
+}
+
+/// Anthropic `usage` block (body + stream events); `Option` because the API may send `null` counts.
+#[derive(Deserialize, Default)]
+struct AnthropicUsage {
+    #[serde(default)]
+    input_tokens: Option<u64>,
+    #[serde(default)]
+    output_tokens: Option<u64>,
+    #[serde(default)]
+    cache_read_input_tokens: Option<u64>,
+    #[serde(default)]
+    cache_creation_input_tokens: Option<u64>,
 }
 
-/// OpenAI non-streaming: `usage.{prompt_tokens, completion_tokens}` (+ cached details).
+/// OpenAI non-streaming: top-level `usage`. `None` (absent/`null`) ⇒ no usage to meter.
 pub fn openai_body(body: &[u8]) -> Option<Usage> {
-    let v: serde_json::Value = serde_json::from_slice(body).ok()?;
-    let u = v.get("usage")?;
-    Some(Usage {
-        input_tokens: u64_at(u, "/prompt_tokens"),
-        output_tokens: u64_at(u, "/completion_tokens"),
-        cache_read_tokens: u64_at(u, "/prompt_tokens_details/cached_tokens"),
-        cache_write_tokens: 0,
-    })
+    #[derive(Deserialize)]
+    struct Body {
+        usage: Option<OpenAiUsage>,
+    }
+    serde_json::from_slice::<Body>(body)
+        .ok()?
+        .usage
+        .map(Usage::from)
 }
 
-/// Anthropic non-streaming: `usage.{input_tokens, output_tokens, cache_*}`.
+/// Anthropic non-streaming: top-level `usage.{input,output,cache_*}`; a `null` count meters as 0.
 pub fn anthropic_body(body: &[u8]) -> Option<Usage> {
-    let v: serde_json::Value = serde_json::from_slice(body).ok()?;
-    let u = v.get("usage")?;
+    #[derive(Deserialize)]
+    struct Body {
+        usage: Option<AnthropicUsage>,
+    }
+    let u = serde_json::from_slice::<Body>(body).ok()?.usage?;
     Some(Usage {
-        input_tokens: u64_at(u, "/input_tokens"),
-        output_tokens: u64_at(u, "/output_tokens"),
-        cache_read_tokens: u64_at(u, "/cache_read_input_tokens"),
-        cache_write_tokens: u64_at(u, "/cache_creation_input_tokens"),
+        input_tokens: u.input_tokens.unwrap_or(0),
+        output_tokens: u.output_tokens.unwrap_or(0),
+        cache_read_tokens: u.cache_read_input_tokens.unwrap_or(0),
+        cache_write_tokens: u.cache_creation_input_tokens.unwrap_or(0),
     })
 }
 
-/// Iterate the JSON objects carried on `data:` lines of an SSE byte stream. `[DONE]` and
-/// non-JSON payloads are skipped. Used by both stream parsers below.
-fn sse_data_objects(sse: &[u8]) -> impl Iterator<Item = serde_json::Value> + '_ {
+/// Iterate the raw JSON payloads carried on `data:` lines of an SSE byte stream. `[DONE]` and the
+/// `data:` framing are stripped; each caller deserializes the payload into its own typed view.
+fn sse_data_lines(sse: &[u8]) -> impl Iterator<Item = &[u8]> + '_ {
     sse.split(|&b| b == b'\n').filter_map(|line| {
         let line = line.strip_prefix(b"data:")?;
         let line = line.strip_prefix(b" ").unwrap_or(line);
-        if line == b"[DONE]" {
-            return None;
-        }
-        serde_json::from_slice::<serde_json::Value>(line).ok()
+        (line != b"[DONE]").then_some(line)
     })
 }
 
 /// OpenAI streaming (requires `stream_options.include_usage`): the penultimate chunk carries a
 /// top-level `usage` object. Last one with usage wins.
 pub fn openai_stream(sse: &[u8]) -> Option<Usage> {
+    #[derive(Deserialize)]
+    struct Chunk {
+        usage: Option<OpenAiUsage>,
+    }
     let mut found = None;
-    for v in sse_data_objects(sse) {
-        if let Some(u) = v.get("usage").filter(|u| !u.is_null()) {
-            found = Some(Usage {
-                input_tokens: u64_at(u, "/prompt_tokens"),
-                output_tokens: u64_at(u, "/completion_tokens"),
-                cache_read_tokens: u64_at(u, "/prompt_tokens_details/cached_tokens"),
-                cache_write_tokens: 0,
-            });
+    for line in sse_data_lines(sse) {
+        if let Ok(chunk) = serde_json::from_slice::<Chunk>(line) {
+            if let Some(u) = chunk.usage {
+                found = Some(Usage::from(u));
+            }
         }
     }
     found
@@ -74,20 +121,32 @@ pub fn openai_stream(sse: &[u8]) -> Option<Usage> {
 /// Anthropic streaming: input + cache tokens arrive in `message_start.message.usage`; output
 /// accumulates in `message_delta.usage.output_tokens` (last delta is the cumulative total).
 pub fn anthropic_stream(sse: &[u8]) -> Option<Usage> {
+    #[derive(Deserialize)]
+    struct Message {
+        usage: Option<AnthropicUsage>,
+    }
+    #[derive(Deserialize)]
+    struct Chunk {
+        // `message_start` nests usage under `message`; `message_delta` carries it top-level.
+        message: Option<Message>,
+        usage: Option<AnthropicUsage>,
+    }
     let mut usage = Usage::default();
     let mut saw_any = false;
-    for v in sse_data_objects(sse) {
-        if let Some(u) = v.pointer("/message/usage") {
-            usage.input_tokens = u64_at(u, "/input_tokens");
-            usage.cache_read_tokens = u64_at(u, "/cache_read_input_tokens");
-            usage.cache_write_tokens = u64_at(u, "/cache_creation_input_tokens");
+    for line in sse_data_lines(sse) {
+        let Ok(chunk) = serde_json::from_slice::<Chunk>(line) else {
+            continue;
+        };
+        if let Some(u) = chunk.message.and_then(|m| m.usage) {
+            usage.input_tokens = u.input_tokens.unwrap_or(0);
+            usage.cache_read_tokens = u.cache_read_input_tokens.unwrap_or(0);
+            usage.cache_write_tokens = u.cache_creation_input_tokens.unwrap_or(0);
             saw_any = true;
         }
-        if let Some(u) = v.get("usage") {
+        if let Some(u) = chunk.usage {
             // message_delta carries the running output token count.
-            let out = u64_at(u, "/output_tokens");
-            if out > 0 {
-                usage.output_tokens = out;
+            if let Some(out) = u.output_tokens.filter(|&n| n > 0) {
+                usage.output_tokens = out;
             }
             saw_any = true;
         }
diff --git a/crates/gateway/tests/common/mod.rs b/crates/gateway/tests/common/mod.rs
index 2b59fff..215fbdf 100644
--- a/crates/gateway/tests/common/mod.rs
+++ b/crates/gateway/tests/common/mod.rs
@@ -6,6 +6,8 @@
 //! deny-set. Every component picks a free port and cleans up on drop, so tests run in parallel.
 
 #![allow(dead_code)]
+// Test harness: `.unwrap()`/`.expect()`/`panic!` are assertions, not production code. See e2e.rs.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
 
 use std::io::Write;
 use std::net::TcpListener as StdTcpListener;
diff --git a/crates/gateway/tests/e2e.rs b/crates/gateway/tests/e2e.rs
index 8f8e511..2bf7e38 100644
--- a/crates/gateway/tests/e2e.rs
+++ b/crates/gateway/tests/e2e.rs
@@ -3,6 +3,10 @@
 //!
 //! Signing key + pool key come from the gateway's *config*; NATS carries only the deny-set.
 
+// Test target: `.unwrap()`/`.expect()`/`panic!` are assertions, not production code — allow the
+// panic-surface restriction lints denied workspace-wide in `[workspace.lints.clippy]`.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
 mod common;
 
 use beyond_ai::key::{VirtualKey, mint};
@@ -24,17 +28,18 @@ async fn post_status(client: &reqwest::Client, url: &str, key: &str, body: Strin
         .unwrap_or(0)
 }
 
-async fn post_status_provider(
+/// POST to an arbitrary gateway path with a Bearer key — exercises provider routing by the first
+/// path segment (`/{provider}/…`) vs the bare-path default.
+async fn post_path_status(
     client: &reqwest::Client,
     url: &str,
+    path: &str,
     key: &str,
-    provider: &str,
     body: String,
 ) -> u16 {
     client
-        .post(format!("{url}/v1/chat/completions"))
+        .post(format!("{url}{path}"))
         .header("authorization", format!("Bearer {key}"))
-        .header("x-beyond-provider", provider)
         .header("content-type", "application/json")
         .body(body)
         .send()
@@ -165,7 +170,7 @@ async fn byo_passes_user_token_through_unchanged() {
 }
 
 #[tokio::test]
-async fn fireworks_provider_header_routes_and_swaps_pool_key() {
+async fn fireworks_path_prefix_strips_and_swaps_pool_key() {
     let nats = Nats::start().await;
     let (pubkey, sk) = test_keypair(4);
     let mock = MockUpstream::start(Mode::Json).await;
@@ -180,18 +185,19 @@ async fn fireworks_provider_header_routes_and_swaps_pool_key() {
         &sk,
     );
     let client = reqwest::Client::new();
-    // Fireworks model ids contain `/`, so it's reached via the `x-beyond-provider` header, not
-    // model inference. A managed key must swap to the Fireworks-specific pool key.
+    // Fireworks is selected by the `/fireworks` path segment; the client uses its native base path
+    // (`/inference/v1`). The gateway strips `/fireworks` and forwards the rest VERBATIM, and a
+    // managed key swaps to the Fireworks-specific pool key.
     {
         let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
         wait_for_status(200, move || {
             let (c, u, k) = (c.clone(), u.clone(), k.clone());
             async move {
-                post_status_provider(
+                post_path_status(
                     &c,
                     &u,
+                    "/fireworks/inference/v1/chat/completions",
                     &k,
-                    "fireworks",
                     body_for("accounts/fireworks/models/llama-v3p1-70b-instruct"),
                 )
                 .await
@@ -206,14 +212,86 @@ async fn fireworks_provider_header_routes_and_swaps_pool_key() {
         Some("Bearer sk-fireworks-pool"),
         "managed Fireworks request must swap to the Fireworks pool key"
     );
-    // Fireworks mounts the OpenAI surface under `/inference/v1`, not `/v1`: the client's
-    // `/v1/chat/completions` must be rewritten or the real upstream would 404.
+    // The `/fireworks` segment is stripped; the provider-native remainder is forwarded verbatim
+    // (the gateway does no per-provider path rewriting).
     assert_eq!(
         cap.path, "/inference/v1/chat/completions",
-        "client `/v1` path must be remapped to the provider's base path"
+        "first segment (provider) stripped; remainder forwarded verbatim"
     );
 }
 
+#[tokio::test]
+async fn openai_prefix_matches_bare_default() {
+    // The explicit `/openai/...` prefix must route like the bare `/v1/chat/completions` dialect
+    // default. Only the prefixed request is sent; the asserts pin its key swap + stripped path.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(8);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 1,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                post_path_status(
+                    &c,
+                    &u,
+                    "/openai/v1/chat/completions",
+                    &k,
+                    body_for("gpt-4o"),
+                )
+                .await
+            }
+        })
+        .await;
+    }
+    let cap = mock.captured().expect("mock received a request");
+    assert_eq!(cap.authorization.as_deref(), Some("Bearer sk-pool-secret"));
+    assert_eq!(
+        cap.path, "/v1/chat/completions",
+        "`/openai` stripped → same upstream path as the bare `/v1` default"
+    );
+}
+
+#[tokio::test]
+async fn unknown_provider_segment_returns_404() {
+    // Routing happens before auth: a first path segment that names no registered provider (and
+    // isn't the bare `/v1` default) must be answered by the gateway itself with a 404, whatever
+    // key is presented, rather than being forwarded and surfacing as an opaque upstream error.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(9);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let identity = VirtualKey {
+        tenant_id: 1,
+        vpc_id: 1,
+    };
+    let vkey = mint(&identity, 1, &sk);
+    let client = reqwest::Client::new();
+    let url = gw.url();
+
+    // Hand the request closure to `wait_for_status`, expecting the gateway's own 404 for the
+    // unregistered `/bogus` provider segment.
+    let probe = move || {
+        let (c, u, k) = (client.clone(), url.clone(), vkey.clone());
+        async move {
+            post_path_status(&c, &u, "/bogus/v1/chat/completions", &k, body_for("gpt-4o")).await
+        }
+    };
+    wait_for_status(404, probe).await;
+}
+
 #[tokio::test]
 async fn streaming_relays_sse_and_meters_usage() {
     let nats = Nats::start().await;
@@ -462,7 +540,7 @@ async fn managed_key_via_x_api_key_header_is_accepted() {
 #[tokio::test]
 async fn managed_key_for_unconfigured_provider_returns_503() {
     // The default gateway configures OpenAI + Fireworks pool keys, but NOT Anthropic. A managed key
-    // routed to Anthropic (via the override header) has no pool key → 503, surfaced in
+    // routed to Anthropic (via the `/anthropic` path segment) has no pool key → 503, surfaced in
     // request_filter before any upstream connect.
     let nats = Nats::start().await;
     let (pubkey, sk) = test_keypair(23);
@@ -481,7 +559,14 @@ async fn managed_key_for_unconfigured_provider_returns_503() {
     wait_for_status(503, move || {
         let (c, u, k) = (c.clone(), u.clone(), k.clone());
         async move {
-            post_status_provider(&c, &u, &k, "anthropic", body_for("claude-opus-4-8")).await
+            post_path_status(
+                &c,
+                &u,
+                "/anthropic/v1/messages",
+                &k,
+                body_for("claude-opus-4-8"),
+            )
+            .await
         }
     })
     .await;
diff --git a/crates/gateway/tests/smoke.rs b/crates/gateway/tests/smoke.rs
index 318b887..622e56a 100644
--- a/crates/gateway/tests/smoke.rs
+++ b/crates/gateway/tests/smoke.rs
@@ -22,6 +22,9 @@
 //! Model ids are the cheapest small model per provider as of 2026-05; adjust if a provider retires
 //! one (a model-not-found is a stale id here, not a gateway bug).
 
+// Test target: `.unwrap()`/`.expect()`/`panic!` are assertions, not production code. See e2e.rs.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
 mod common;
 
 use beyond_ai::key::{VirtualKey, mint};
@@ -54,9 +57,10 @@ async fn managed_gateway(nats: &Nats, provider: &str, real_key: &str) -> (Gatewa
     (gw, vkey)
 }
 
-/// Drive one OpenAI-wire provider through the gateway as a managed request. `openai` is the dialect
-/// default (no header); everything else is selected by `x-beyond-provider`.
-async fn smoke_openai_wire(provider: &str, key_env: &str, model: &str) {
+/// Drive one OpenAI-wire provider through the gateway as a managed request. The provider is selected
+/// by the first path segment; `chat_path` is the full gateway path — the selector followed by the
+/// provider's own base path (forwarded verbatim): `/{provider}/{native-base}/chat/completions`.
+async fn smoke_openai_wire(provider: &str, key_env: &str, model: &str, chat_path: &str) {
     let Some(key) = env_key(key_env) else {
         eprintln!("smoke[{provider}]: {key_env} unset — skipping");
         return;
@@ -68,21 +72,20 @@ async fn smoke_openai_wire(provider: &str, key_env: &str, model: &str) {
     let body = format!(
         r#"{{"model":"{model}","max_tokens":16,"messages":[{{"role":"user","content":"Reply with the single word: ping"}}]}}"#
     );
-    let mut req = client
-        .post(format!("{}/v1/chat/completions", gw.url()))
+    let resp = client
+        .post(format!("{}{chat_path}", gw.url()))
         .header("authorization", format!("Bearer {vkey}"))
-        .header("content-type", "application/json");
-    if provider != "openai" {
-        req = req.header("x-beyond-provider", provider);
-    }
-
-    let resp = req.body(body).send().await.expect("request to gateway");
+        .header("content-type", "application/json")
+        .body(body)
+        .send()
+        .await
+        .expect("request to gateway");
     let status = resp.status();
     let text = resp.text().await.unwrap_or_default();
     assert!(
         status.is_success(),
-        "smoke[{provider}] model={model}: expected 2xx, got {status}.\n\
-         404 ⇒ base-path rewrite wrong; 401 ⇒ pool-key swap/verify; 403 ⇒ deny-set; \
+        "smoke[{provider}] model={model} path={chat_path}: expected 2xx, got {status}.\n\
+         404 ⇒ wrong native path / provider segment; 401 ⇒ pool-key swap/verify; 403 ⇒ deny-set; \
          a model error ⇒ stale model id. body: {text}"
     );
     assert!(
@@ -103,14 +106,15 @@ async fn smoke_anthropic() {
     let (gw, vkey) = managed_gateway(&nats, "anthropic", &key).await;
     let client = reqwest::Client::new();
 
-    // `/v1/messages` → Anthropic dialect → provider `anthropic`. The minted virtual key is presented
-    // in `x-api-key` (the Anthropic SDK's header); the gateway verifies it and swaps in the real key
-    // — again in `x-api-key` (not Bearer). The required `anthropic-version` header passes through.
-    // This is the *only* test covering the x-api-key auth scheme + a real TLS handshake to
-    // api.anthropic.com via the full managed path.
+    // `/anthropic/v1/messages` → provider `anthropic` (selected by the path segment, stripped to
+    // `/v1/messages` upstream). The minted virtual key is presented in `x-api-key` (the Anthropic
+    // SDK's header); the gateway verifies it and swaps in the real key — again in `x-api-key` (not
+    // Bearer). The required `anthropic-version` header passes through. This is the *only* test
+    // covering the x-api-key auth scheme + a real TLS handshake to api.anthropic.com via the full
+    // managed path.
     let body = r#"{"model":"claude-haiku-4-5","max_tokens":16,"messages":[{"role":"user","content":"Reply with the single word: ping"}]}"#;
     let resp = client
-        .post(format!("{}/v1/messages", gw.url()))
+        .post(format!("{}/anthropic/v1/messages", gw.url()))
         .header("x-api-key", &vkey)
         .header("anthropic-version", "2023-06-01")
         .header("content-type", "application/json")
@@ -137,24 +141,38 @@ async fn smoke_anthropic() {
 #[tokio::test]
 #[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
 async fn smoke_openai() {
-    smoke_openai_wire("openai", "OPENAI_API_KEY", "gpt-4o-mini").await;
+    smoke_openai_wire(
+        "openai",
+        "OPENAI_API_KEY",
+        "gpt-4o-mini",
+        "/openai/v1/chat/completions",
+    )
+    .await;
 }
 
 #[tokio::test]
 #[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
 async fn smoke_groq() {
-    // Proves the `/v1` → `/openai/v1` rewrite against a real mount (the highest-value rewrite case).
-    smoke_openai_wire("groq", "GROQ_API_KEY", "llama-3.1-8b-instant").await;
+    // Groq mounts under `/openai/v1`; the client sends `/groq/openai/v1/...` and the gateway strips
+    // `/groq` and forwards the rest verbatim. The highest-value non-`/v1` native-path case.
+    smoke_openai_wire(
+        "groq",
+        "GROQ_API_KEY",
+        "llama-3.1-8b-instant",
+        "/groq/openai/v1/chat/completions",
+    )
+    .await;
 }
 
 #[tokio::test]
 #[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
 async fn smoke_fireworks() {
-    // Proves the `/v1` → `/inference/v1` rewrite against a real mount.
+    // Fireworks mounts under `/inference/v1`: client sends `/fireworks/inference/v1/...`.
     smoke_openai_wire(
         "fireworks",
         "FIREWORKS_API_KEY",
         "accounts/fireworks/models/llama-v3p1-8b-instruct",
+        "/fireworks/inference/v1/chat/completions",
     )
     .await;
 }
@@ -162,14 +180,26 @@ async fn smoke_fireworks() {
 #[tokio::test]
 #[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
 async fn smoke_openrouter() {
-    // Proves the `/v1` → `/api/v1` rewrite against a real mount.
-    smoke_openai_wire("openrouter", "OPENROUTER_API_KEY", "openai/gpt-4o-mini").await;
+    // OpenRouter mounts under `/api/v1`: client sends `/openrouter/api/v1/...`.
+    smoke_openai_wire(
+        "openrouter",
+        "OPENROUTER_API_KEY",
+        "openai/gpt-4o-mini",
+        "/openrouter/api/v1/chat/completions",
+    )
+    .await;
 }
 
 #[tokio::test]
 #[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
 async fn smoke_deepseek() {
-    smoke_openai_wire("deepseek", "DEEPSEEK_API_KEY", "deepseek-chat").await;
+    smoke_openai_wire(
+        "deepseek",
+        "DEEPSEEK_API_KEY",
+        "deepseek-chat",
+        "/deepseek/v1/chat/completions",
+    )
+    .await;
 }
 
 #[tokio::test]
@@ -179,6 +209,7 @@ async fn smoke_together() {
         "together",
         "TOGETHER_API_KEY",
         "meta-llama/Llama-3.1-8B-Instruct-Turbo",
+        "/together/v1/chat/completions",
     )
     .await;
 }
@@ -186,17 +217,35 @@ async fn smoke_together() {
 #[tokio::test]
 #[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
 async fn smoke_cerebras() {
-    smoke_openai_wire("cerebras", "CEREBRAS_API_KEY", "llama3.1-8b").await;
+    smoke_openai_wire(
+        "cerebras",
+        "CEREBRAS_API_KEY",
+        "llama3.1-8b",
+        "/cerebras/v1/chat/completions",
+    )
+    .await;
 }
 
 #[tokio::test]
 #[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
 async fn smoke_mistral() {
-    smoke_openai_wire("mistral", "MISTRAL_API_KEY", "mistral-small-latest").await;
+    smoke_openai_wire(
+        "mistral",
+        "MISTRAL_API_KEY",
+        "mistral-small-latest",
+        "/mistral/v1/chat/completions",
+    )
+    .await;
 }
 
 #[tokio::test]
 #[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
 async fn smoke_xai() {
-    smoke_openai_wire("xai", "XAI_API_KEY", "grok-3-mini").await;
+    smoke_openai_wire(
+        "xai",
+        "XAI_API_KEY",
+        "grok-3-mini",
+        "/xai/v1/chat/completions",
+    )
+    .await;
 }

From fd968c80769bd77b933a6dd581e3a48ac39b0448 Mon Sep 17 00:00:00 2001
From: Jared Lunde <jared.lunde@gmail.com>
Date: Sun, 31 May 2026 13:42:54 -0700
Subject: [PATCH 5/7] refactor(ai): flatten workspace-of-one into a single
 crate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The repo was a Cargo workspace with exactly one member (`crates/gateway`) — a
remnant of a planned multi-crate layout (SDK, control-plane) that was dropped.
Over-structured: a workspace's only leverage (`[workspace.dependencies]`,
`[workspace.package]`, shared lints across crates) needs 2+ members to pay off.

Flattened to a single crate at the repo root:
- `crates/gateway/{src,tests,benches}` → `{src,tests,benches}` (git-tracked moves,
  history preserved).
- The two manifests merged into one root `Cargo.toml`: `[workspace.*]` tables
  inlined as `[package]`/`[dependencies]`/`[dev-dependencies]`, dep versions
  resolved from `[workspace.dependencies]`. Dropped the unused `async-nats`
  workspace dep (pulled transitively via slipstream; never named directly).

Rigor preserved exactly — same `[lints.rust]` (forbid unsafe, deny
unused_must_use) and `[lints.clippy]` panic-surface denies (unwrap/expect/panic/
todo/unimplemented), same `[profile.release]` overflow-checks. `[lints]` at the
package level still binds every target (lib/bin/tests/benches), so the bin-root
gap stays closed.

`mise check:rs` drops the now-meaningless `--workspace` flag; the comments in
`.github/workflows/ci.yml` now reference `[lints]` instead of `[workspace.lints]`.

All CI steps pass locally (RUSTFLAGS=-D warnings): dprint check, cargo fmt
--check, clippy --all-targets -D warnings, 68 unit + 18 e2e (10 smoke ignored),
release build. Live Anthropic smoke green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml                      | 10 +--
 Cargo.toml                                    | 56 ++++++++++----
 {crates/gateway/benches => benches}/e2e.rs    |  0
 {crates/gateway/benches => benches}/unit.rs   |  0
 crates/gateway/Cargo.toml                     | 73 -------------------
 mise.toml                                     |  2 +-
 {crates/gateway/src => src}/admin.rs          |  0
 {crates/gateway/src => src}/config.rs         |  0
 {crates/gateway/src => src}/deny.rs           |  0
 {crates/gateway/src => src}/doctor.rs         |  0
 {crates/gateway/src => src}/error.rs          |  0
 {crates/gateway/src => src}/key.rs            |  0
 {crates/gateway/src => src}/lib.rs            |  0
 {crates/gateway/src => src}/main.rs           |  0
 {crates/gateway/src => src}/metrics.rs        |  0
 {crates/gateway/src => src}/peek.rs           |  0
 {crates/gateway/src => src}/proxy.rs          |  0
 {crates/gateway/src => src}/ratelimit.rs      |  0
 {crates/gateway/src => src}/route.rs          |  0
 {crates/gateway/src => src}/secret.rs         |  0
 {crates/gateway/src => src}/state.rs          |  0
 {crates/gateway/src => src}/store_watch.rs    |  0
 {crates/gateway/src => src}/usage.rs          |  0
 {crates/gateway/tests => tests}/common/mod.rs |  0
 {crates/gateway/tests => tests}/e2e.rs        |  0
 {crates/gateway/tests => tests}/smoke.rs      |  0
 26 files changed, 49 insertions(+), 92 deletions(-)
 rename {crates/gateway/benches => benches}/e2e.rs (100%)
 rename {crates/gateway/benches => benches}/unit.rs (100%)
 delete mode 100644 crates/gateway/Cargo.toml
 rename {crates/gateway/src => src}/admin.rs (100%)
 rename {crates/gateway/src => src}/config.rs (100%)
 rename {crates/gateway/src => src}/deny.rs (100%)
 rename {crates/gateway/src => src}/doctor.rs (100%)
 rename {crates/gateway/src => src}/error.rs (100%)
 rename {crates/gateway/src => src}/key.rs (100%)
 rename {crates/gateway/src => src}/lib.rs (100%)
 rename {crates/gateway/src => src}/main.rs (100%)
 rename {crates/gateway/src => src}/metrics.rs (100%)
 rename {crates/gateway/src => src}/peek.rs (100%)
 rename {crates/gateway/src => src}/proxy.rs (100%)
 rename {crates/gateway/src => src}/ratelimit.rs (100%)
 rename {crates/gateway/src => src}/route.rs (100%)
 rename {crates/gateway/src => src}/secret.rs (100%)
 rename {crates/gateway/src => src}/state.rs (100%)
 rename {crates/gateway/src => src}/store_watch.rs (100%)
 rename {crates/gateway/src => src}/usage.rs (100%)
 rename {crates/gateway/tests => tests}/common/mod.rs (100%)
 rename {crates/gateway/tests => tests}/e2e.rs (100%)
 rename {crates/gateway/tests => tests}/smoke.rs (100%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0e23c17..3827dd3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4,9 +4,9 @@ on:
     branches: [main]
 env:
   CARGO_TERM_COLOR: always
-  # Belt-and-suspenders: the panic-surface + `unused_must_use` denies live in `[workspace.lints]`
-  # (Cargo.toml) so they bind locally too, but escalate *every* warning to an error in CI in case a
-  # lint isn't expressible there (build scripts, future targets).
+  # Belt-and-suspenders: the panic-surface + `unused_must_use` denies live in `[lints]` (Cargo.toml)
+  # so they bind locally too, but escalate *every* warning to an error in CI in case a lint isn't
+  # expressible there (build scripts, future targets).
   RUSTFLAGS: -D warnings
 jobs:
   check:
@@ -19,8 +19,8 @@ jobs:
       # Formatting: dprint (config/json/etc) + rustfmt.
       - run: mise check:fmt
       - run: cargo fmt --all --check
-      # Lints: clippy `-D warnings` across all targets. With `[workspace.lints.clippy]` denying the
-      # panic surface (unwrap/expect/panic/todo/unimplemented), a new `.unwrap()` in production code
+      # Lints: clippy `-D warnings` across all targets. With `[lints.clippy]` denying the panic
+      # surface (unwrap/expect/panic/todo/unimplemented), a new `.unwrap()` in production code
       # fails the build here.
       - run: mise check:rs
       - run: mise test:unit:rs
diff --git a/Cargo.toml b/Cargo.toml
index bee51ed..c73b00c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,16 +1,23 @@
-[workspace]
-resolver = "2"
-members = ["crates/gateway"]
-
-[workspace.package]
+[package]
+name = "beyond-ai"
+version = "0.1.0"
 edition = "2024"
 license = "MIT"
 rust-version = "1.85"
+description = "Beyond AI gateway — egress L7 proxy to LLM providers"
+
+[lib]
+name = "beyond_ai"
+path = "src/lib.rs"
 
-# Applied to every crate root in the workspace (lib *and* bin) via `[lints] workspace = true`.
-# Crate-level `#![deny(...)]` attributes only cover the unit they're written in, so a binary root
-# (`main.rs`) would otherwise escape the library's denies — this closes that gap structurally.
-[workspace.lints.rust]
+[[bin]]
+name = "beyond-ai"
+path = "src/main.rs"
+
+# `[lints]` binds every target in the crate — lib, bin, tests, benches. That matters: a crate-level
+# `#![deny(...)]` attribute only covers the unit it's written in, so the binary root (`main.rs`)
+# would otherwise escape the library's denies. Declaring them here closes that gap structurally.
+[lints.rust]
 unsafe_code = "forbid"
 unused_must_use = "deny"
 
@@ -20,7 +27,7 @@ unused_must_use = "deny"
 # here turns them on. The handful of genuine boot-time invariants carry a local
 # `#[allow(clippy::expect_used)]` with a SAFETY-style note; test/bench targets allow them wholesale
 # at the file head (asserting a precondition with `.unwrap()` is the point of a test).
-[workspace.lints.clippy]
+[lints.clippy]
 unwrap_used = "deny"
 expect_used = "deny"
 panic = "deny"
@@ -32,11 +39,10 @@ unimplemented = "deny"
 [profile.release]
 overflow-checks = true
 
-[workspace.dependencies]
+[dependencies]
 # slipstream is published — consume it from crates.io, aliased to `store` so the code's
 # `use store::...` is unchanged. No path deps into the `beyond` repo: this crate builds standalone.
 store = { package = "beyond-slipstream", version = "0.1.0" }
-zeroize = "1"
 
 pingora = { version = "0.8", default-features = false, features = ["rustls"] }
 pingora-core = "0.8"
@@ -45,7 +51,6 @@ pingora-proxy = "0.8"
 
 arc-swap = "1"
 arrayvec = "0.7"
-async-nats = "0.46"
 async-trait = "0.1"
 base64 = "0.22"
 bytes = "1"
@@ -65,3 +70,28 @@ thiserror = "2"
 tokio = { version = "1", features = ["full"] }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
+zeroize = "1"
+
+[dev-dependencies]
+# Bench harnesses. Best tool per job: `divan` for the unit micro-bench (it measures allocations
+# natively via AllocProfiler, alongside timing); `criterion` for the e2e macro-bench (`async_tokio`
+# drives the round-trips, and its saved-baseline comparison tracks latency/RPS over time).
+criterion = { version = "0.5", features = ["async_tokio"] }
+divan = "0.1"
+http-body-util = "0.1"
+# `http2` + hyper-util's `server-auto` let the mock upstream serve H1 *and* H2 on one TLS listener
+# (protocol chosen by ALPN), so the concurrency bench can drive the gateway's H2 path. `rcgen` mints a
+# throwaway self-signed cert for that listener; `tokio-rustls` terminates TLS in front of hyper.
+hyper = { version = "1", features = ["server", "http1", "http2"] }
+hyper-util = { version = "0.1", features = ["tokio", "server-auto"] }
+rcgen = "0.13"
+reqwest = { version = "0.13", default-features = false, features = ["json", "rustls"] }
+tokio-rustls = "0.26"
+
+[[bench]]
+name = "unit"
+harness = false
+
+[[bench]]
+name = "e2e"
+harness = false
diff --git a/crates/gateway/benches/e2e.rs b/benches/e2e.rs
similarity index 100%
rename from crates/gateway/benches/e2e.rs
rename to benches/e2e.rs
diff --git a/crates/gateway/benches/unit.rs b/benches/unit.rs
similarity index 100%
rename from crates/gateway/benches/unit.rs
rename to benches/unit.rs
diff --git a/crates/gateway/Cargo.toml b/crates/gateway/Cargo.toml
deleted file mode 100644
index f52d86f..0000000
--- a/crates/gateway/Cargo.toml
+++ /dev/null
@@ -1,73 +0,0 @@
-[package]
-name = "beyond-ai"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-rust-version.workspace = true
-description = "Beyond AI gateway — egress L7 proxy to LLM providers"
-
-[lints]
-workspace = true
-
-[lib]
-name = "beyond_ai"
-path = "src/lib.rs"
-
-[[bin]]
-name = "beyond-ai"
-path = "src/main.rs"
-
-[dependencies]
-store = { workspace = true }
-
-pingora = { workspace = true }
-pingora-core = { workspace = true }
-pingora-limits = { workspace = true }
-pingora-proxy = { workspace = true }
-
-arc-swap = { workspace = true }
-arrayvec = { workspace = true }
-async-trait = { workspace = true }
-base64 = { workspace = true }
-bytes = { workspace = true }
-clap = { workspace = true }
-ed25519-dalek = { workspace = true }
-figment = { workspace = true }
-getrandom = { workspace = true }
-http = { workspace = true }
-memchr = { workspace = true }
-prometheus = { workspace = true }
-rustls = { workspace = true }
-serde = { workspace = true }
-serde_json = { workspace = true }
-thiserror = { workspace = true }
-tokio = { workspace = true }
-tracing = { workspace = true }
-tracing-subscriber = { workspace = true }
-zeroize = { workspace = true }
-
-[dev-dependencies]
-# e2e harness: real gateway subprocess + real nats-server + a mock HTTP upstream.
-base64 = { workspace = true }
-# Bench harnesses. Best tool per job: `divan` for the unit micro-bench (it measures allocations
-# natively via AllocProfiler, alongside timing); `criterion` for the e2e macro-bench (`async_tokio`
-# drives the round-trips, and its saved-baseline comparison tracks latency/RPS over time).
-criterion = { version = "0.5", features = ["async_tokio"] }
-divan = "0.1"
-http-body-util = "0.1"
-# `http2` + hyper-util's `server-auto` let the mock upstream serve H1 *and* H2 on one TLS listener
-# (protocol chosen by ALPN), so the concurrency bench can drive the gateway's H2 path. `rcgen` mints a
-# throwaway self-signed cert for that listener; `tokio-rustls` terminates TLS in front of hyper.
-hyper = { version = "1", features = ["server", "http1", "http2"] }
-hyper-util = { version = "0.1", features = ["tokio", "server-auto"] }
-rcgen = "0.13"
-reqwest = { version = "0.13", default-features = false, features = ["json", "rustls"] }
-tokio-rustls = "0.26"
-
-[[bench]]
-name = "unit"
-harness = false
-
-[[bench]]
-name = "e2e"
-harness = false
diff --git a/mise.toml b/mise.toml
index 55c7edc..be93405 100644
--- a/mise.toml
+++ b/mise.toml
@@ -14,7 +14,7 @@ run = "cargo build"
 run = "cargo build --release"
 
 [tasks."check:rs"]
-run = "cargo clippy --workspace --all-targets -- -D warnings"
+run = "cargo clippy --all-targets -- -D warnings"
 
 [tasks."check:fmt"]
 run = "dprint check"
diff --git a/crates/gateway/src/admin.rs b/src/admin.rs
similarity index 100%
rename from crates/gateway/src/admin.rs
rename to src/admin.rs
diff --git a/crates/gateway/src/config.rs b/src/config.rs
similarity index 100%
rename from crates/gateway/src/config.rs
rename to src/config.rs
diff --git a/crates/gateway/src/deny.rs b/src/deny.rs
similarity index 100%
rename from crates/gateway/src/deny.rs
rename to src/deny.rs
diff --git a/crates/gateway/src/doctor.rs b/src/doctor.rs
similarity index 100%
rename from crates/gateway/src/doctor.rs
rename to src/doctor.rs
diff --git a/crates/gateway/src/error.rs b/src/error.rs
similarity index 100%
rename from crates/gateway/src/error.rs
rename to src/error.rs
diff --git a/crates/gateway/src/key.rs b/src/key.rs
similarity index 100%
rename from crates/gateway/src/key.rs
rename to src/key.rs
diff --git a/crates/gateway/src/lib.rs b/src/lib.rs
similarity index 100%
rename from crates/gateway/src/lib.rs
rename to src/lib.rs
diff --git a/crates/gateway/src/main.rs b/src/main.rs
similarity index 100%
rename from crates/gateway/src/main.rs
rename to src/main.rs
diff --git a/crates/gateway/src/metrics.rs b/src/metrics.rs
similarity index 100%
rename from crates/gateway/src/metrics.rs
rename to src/metrics.rs
diff --git a/crates/gateway/src/peek.rs b/src/peek.rs
similarity index 100%
rename from crates/gateway/src/peek.rs
rename to src/peek.rs
diff --git a/crates/gateway/src/proxy.rs b/src/proxy.rs
similarity index 100%
rename from crates/gateway/src/proxy.rs
rename to src/proxy.rs
diff --git a/crates/gateway/src/ratelimit.rs b/src/ratelimit.rs
similarity index 100%
rename from crates/gateway/src/ratelimit.rs
rename to src/ratelimit.rs
diff --git a/crates/gateway/src/route.rs b/src/route.rs
similarity index 100%
rename from crates/gateway/src/route.rs
rename to src/route.rs
diff --git a/crates/gateway/src/secret.rs b/src/secret.rs
similarity index 100%
rename from crates/gateway/src/secret.rs
rename to src/secret.rs
diff --git a/crates/gateway/src/state.rs b/src/state.rs
similarity index 100%
rename from crates/gateway/src/state.rs
rename to src/state.rs
diff --git a/crates/gateway/src/store_watch.rs b/src/store_watch.rs
similarity index 100%
rename from crates/gateway/src/store_watch.rs
rename to src/store_watch.rs
diff --git a/crates/gateway/src/usage.rs b/src/usage.rs
similarity index 100%
rename from crates/gateway/src/usage.rs
rename to src/usage.rs
diff --git a/crates/gateway/tests/common/mod.rs b/tests/common/mod.rs
similarity index 100%
rename from crates/gateway/tests/common/mod.rs
rename to tests/common/mod.rs
diff --git a/crates/gateway/tests/e2e.rs b/tests/e2e.rs
similarity index 100%
rename from crates/gateway/tests/e2e.rs
rename to tests/e2e.rs
diff --git a/crates/gateway/tests/smoke.rs b/tests/smoke.rs
similarity index 100%
rename from crates/gateway/tests/smoke.rs
rename to tests/smoke.rs

From c27476b022898f45e385372b7c66a8add5f57e08 Mon Sep 17 00:00:00 2001
From: Jared Lunde <jared.lunde@gmail.com>
Date: Sun, 31 May 2026 15:10:07 -0700
Subject: [PATCH 6/7] fixes

---
 ARCHITECTURE.md | 559 +++++++++++++++++++++++++++++-------------------
 src/admin.rs    |  49 +++--
 src/config.rs   | 178 ++++++++++++++-
 src/main.rs     |   7 +-
 src/metrics.rs  |  46 +++-
 src/proxy.rs    |  26 ++-
 src/usage.rs    |  22 +-
 7 files changed, 632 insertions(+), 255 deletions(-)

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 564560b..142a15a 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -1,11 +1,14 @@
 # Beyond AI Gateway — Architecture
 
-A centralized, internal **egress L7 proxy** to LLM providers, built on **Pingora** + tokio. Apps point their stock
-OpenAI/Anthropic SDK at it; the gateway authenticates, swaps in the real provider key, relays the
-response untouched, and emits token-usage facts for billing.
+Takes HTTP requests carrying an OpenAI- or Anthropic-dialect payload, authenticates the caller via
+Ed25519 virtual key or BYO provider token, swaps in a pool key for managed traffic, relays the
+request and response byte-for-byte to the upstream provider, and emits a token-usage billing fact
+(`ai.usage`) on completion — all without buffering the body or response stream.
 
-**Self-contained:** no `path` deps into the `beyond` repo. Depends only on crates.io + the published
-`beyond-slipstream` — so it clones/CI-builds/publishes anywhere.
+**Self-contained:** no `path` deps into the `beyond` repo. Depends only on crates.io + the
+published `beyond-slipstream` — clones, CI-builds, and publishes anywhere.
+
+---
 
 ## Concepts & Terminology
 
@@ -14,132 +17,236 @@ response untouched, and emits token-usage facts for billing.
 | **Managed key** (`bai_v1.…`)                     | Ed25519-verified identity; enables key swap, deny-set check, and `ai.usage` billing                                                                         | A session token or capability grant — just tenant attribution                |
 | **BYO key** (anything else)                      | Forwarded as-is to the provider; no swap, no billing, no deny-set                                                                                           | A lesser tier — same proxy, minus attribution and billing                    |
 | **Pool key**                                     | Real provider API key held by the gateway; swapped in for managed traffic                                                                                   | Per-tenant — one key per provider, shared by all managed callers             |
-| **Tenant**                                       | The billing entity from the virtual key payload (`tenant_id: u32`)                                                                                          | An org, user, or namespace — an opaque integer the gateway doesn't interpret |
+| **Tenant**                                       | The billing entity from the virtual key payload (`tenant_id: u64`)                                                                                          | An org, user, or namespace — an opaque integer the gateway doesn't interpret |
 | **Dialect**                                      | A provider attribute (OpenAI-wire vs Anthropic-wire) driving usage parsing; for a bare-path request it's derived from the path to pick the default provider | The provider — a prefixed request uses its provider's dialect, not the path  |
 | **Provider**                                     | The request's **first path segment** (`/{provider}/…`); a named row in the routing table: authority, dialect, auth scheme                                   | A vendor relationship — just connection facts and auth wiring                |
-| **Deny-set**                                     | Sparse set of denied `tenant_id`s; gates managed traffic; default-allow                                                                                     | An allowlist or ACL — misses are allowed, not blocked                        |
+| **Deny-set**                                     | Sparse map of denied `tenant_id`s → reason; gates managed traffic; default-allow                                                                            | An allowlist or ACL — misses are allowed, not blocked                        |
 | **Tail tap**                                     | Bounded 64KB window kept from the end of the response for usage extraction                                                                                  | A buffer or copy — the response is relayed unbuffered; only the tail is kept |
 | **Snapshot**                                     | On-disk deny-set cache (entries + NATS cursor) for edge/tunnel deployments                                                                                  | Persistent store — a pure cache; delete it and the gateway re-scans NATS     |
-| **Virtual key** (`bai_v1.{kid}.{payload}.{sig}`) | Ed25519-signed token encoding `tenant_id` + `vpc_id`                                                                                                        | A session or auth token — stateless, no server-side lookup, no revocation    |
+| **Virtual key** (`bai_v1.{kid}.{payload}.{sig}`) | Ed25519-signed token encoding `tenant_id` + `vpc_id` (16-byte fixed payload)                                                                                | A session or auth token — stateless, no server-side lookup, no revocation    |
 
 ---
 
-## Request flow (`proxy.rs`)
+## Data Flow
+
+### Happy Path
+
+```
+Client (stock OpenAI/Anthropic SDK)
+  │
+  ▼  request_filter (proxy.rs)
+  │  ├─ Route: first segment → provider row (authority, dialect, auth scheme)
+  │  ├─ Extract key from Authorization: Bearer or x-api-key
+  │  ├─ Rate guardrails (BEFORE verify — keeps forged-key floods at ns cost)
+  │  │    per-credential count-min  ──────────────────────────────► 429
+  │  │    global BYO aggregate (managed exempt)  ─────────────────► 429
+  │  ├─ Content-Length abuse guard  ──────────────────────────────► 413
+  │  └─ Identity branch:
+  │       bai_v1.…  → Ed25519 verify → deny-set check (O(1))
+  │       │               │                    │
+  │       │             401 (bad sig)     402 Spend / 403 Fraud
+  │       │                                    │
+  │       │           pool key required ───────────────────────── 503
+  │       └─ BYO: pass through (no verify, no deny-set, no billing)
+  │
+  ▼  upstream_peer (proxy.rs)
+  │  TTL-cached DNS resolve (60s) → HttpPeer (TLS, H2 pref, timeouts)
+  │  DNS fail ──────────────────────────────────────────────────── 502
+  │  TCP connect fail (retry 2×) ──────────────────────────────── 502
+  │
+  ▼  upstream_request_filter (proxy.rs)
+  │  Managed: remove both auth headers → inject pool key
+  │  BYO: leave auth header unchanged
+  │  Set Host; forward path verbatim (/{provider} prefix stripped)
+  │
+  ▼  request_body_filter (proxy.rs)  — body streamed through, never buffered
+  │  Feed chunks → ModelScanner (peek.rs) — extract root-level `model`, O(1) mem
+  │  Enforce running size cap (chunked-safe) ──────────────────── 413
+  │  Injection-eligible (managed OpenAI chat/responses + stream):
+  │    buffer full body → inject stream_options.include_usage → re-frame chunked
+  │
+  ▼  Provider upstream  (OpenAI / Anthropic / Groq / DeepSeek / …)
+  │
+  ▼  response_filter (proxy.rs)
+  │  Record TTFT; detect streaming (Content-Type: text/event-stream)
+  │  Count upstream response by provider + status class
+  │  Set x-beyond-request-id header
+  │
+  ▼  response_body_filter (proxy.rs)  — response relayed chunk-by-chunk, never buffered
+  │  Feed chunks → ModelScanner over response head → extract billed model
+  │  Append to bounded 64KB tail (compact drain(..half) if tail > 128KB)
+  │
+  ▼  logging (proxy.rs)
+     Parse usage from tail (by dialect + streaming flag)
+     Emit ai.usage fact: tenant, vpc, model, requested_model, token counts (managed only)
+     Decrement requests_in_flight gauge
+```
+
+### Background: Deny-Set Watcher
 
 ```
-client (stock SDK, Bearer/ x-api-key)
-   │
-   ▼ request_filter
-   ├─ provider = first path segment `/{provider}/…` (strip + forward rest verbatim);
-   │             bare `/v1…` → dialect default (openai/anthropic);  unknown → 404
-   ├─ extract key                                               (missing → 401)
-   ├─ rate guardrails ← BEFORE verify/connect: per-credential (seeded raw-key hash) +
-   │                    global BYO aggregate (managed exempt; protects egress IPs); over → 429
-   ├─ Content-Length abuse guard (declared size; streamed total enforced in body filter too)
-   ├─ key format branch:
-   │    • bai_…  → MANAGED: Ed25519 verify (stateless) → {tenant_id, vpc_id}
-   │              → deny-set check (O(1), default-allow) → require pool key
-   │    • else   → BYO: the user's own provider token, passed through unchanged
-   ▼ upstream_peer        — TTL-cached DNS resolve → HttpPeer (no blocking getaddrinfo)
-   ▼ upstream_request_filter — managed: swap auth header to pool key; BYO: leave it. Set Host.
-   ▼ request_body_filter  — STREAM BODY THROUGH (never buffered); feed bytes to a structural
-   │                         scanner that extracts the exact root-level `model` (O(1), memchr-fast);
-   │                         enforce the body cap on the running total (chunked-safe)
-   ▼ response_filter      — TTFT; streaming? = response Content-Type is text/event-stream; count
-   │                         upstream response by provider+status class; set x-beyond-request-id
-   ▼ response_body_filter — relay unbuffered; keep a bounded 64KB tail for the usage tap
-   ▼ logging              — parse usage from tail (by dialect+streaming); emit `ai.usage` fact
-   │                         (managed only — BYO has no tenant to bill); metrics count all traffic.
-   │                         Every terminal path (reject + usage) logs the request_id for correlation
-        upstream: a registered provider (openai, anthropic, openrouter, fireworks,
-                  groq, deepseek, together, cerebras, mistral, xai — + config-added)
+NATS (blackhole.* KV entries)
+  │
+  ▼  store_watch.rs (Pingora BackgroundService)
+  │  On connect: seed from disk snapshot (if snapshot_path set) or full NATS scan
+  │  Resume watch from saved revision (gap-free — no entry lost mid-connect)
+  │  Reconnect backoff: 1s → 30s exponential
+  │
+  ▼  ArcSwap<DenySet>  (state.rs)
+     Lock-free read on every managed request
+     Written only by the watcher on entry add/remove
 ```
 
-## What lives where
-
-- **NATS / slipstream:** exactly one thing — the **deny-set** (`blackhole.{tenant}`). Watched,
-  fail-open. Auth and keys do **not** depend on NATS.
-- **Config (boot, SSM/env):** `signing_keys` (Ed25519 **public** keys by kid — multiple for
-  rotation), `pool_keys` (managed pool keys **by provider name**, from `AI_POOL_KEY_<NAME>` env),
-  `provider_authorities` (per-name authority overrides / additions), `rate_limit_rps` (per-credential
-  request ceiling; 0 disables), `byo_rate_limit_rps` (aggregate ceiling for _all_ BYO traffic — the
-  egress-IP guard; 0 disables), `snapshot_path` (optional on-disk deny-set cache; see below),
-  timeouts. Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret`, so a stray
-  `Debug`/`Serialize` of the config can't leak them. See `config.example.toml`.
-- **The virtual key (`bai_v1.{kid}.{payload}.{sig}`):** Ed25519-signed, payload = `{tenant_id,
-  vpc_id}`, verified with a public key — stateless, no lookup. Minted by the control plane (it holds
-  the private key); a compromised/OSS gateway can verify but not mint.
-
-## Key invariants
-
-- **Managed vs BYO by key format.** `bai_…` → verify + swap to pool key. Anything else → the user's
-  real token, passed through (no swap, no deny-set, no per-tenant attribution, and **no `ai.usage`
-  billing event** — it would be an unbillable `tenant_id=0` row; aggregate metrics still count it).
-- **Request body is never buffered** — it streams through with original framing; a streaming
-  structural scanner (`peek::ModelScanner`, O(1), SIMD `memchr` skip over big values) extracts the
-  exact root-level `model`. **One exception:** a _managed_ OpenAI chat/responses request is buffered
-  so the gateway can inject `stream_options.include_usage` when the client streams without it —
-  otherwise OpenAI emits no usage chunk and the request is unmeterable. Works out of the box (no
-  client/SDK cooperation), framed upstream as chunked, bounded by `MAX_REQUEST_BODY`, scoped to that
-  one path — BYO and everything else stay pure passthrough.
-- **Response is never buffered** — relayed chunk-by-chunk; a bounded 64KB tail feeds the usage tap.
-- **Deny-set is `O(denied)`, default-allow, fail-open.** Restore = explicit delete or TTL expiry.
-  Seeding is **gap-free**: the seed records the stream revision it reflects, and the watch _resumes
-  from that revision_ (`watch_prefix_from`) rather than starting live — so a deny entry written in
-  the window between seeding and the watch attaching can't be lost (a plain `watch_prefix` uses NATS
-  `DeliverPolicy::New` and would silently drop it). The resume revision is kept across reconnects, so
-  a NATS blip resumes from where it left off instead of re-scanning.
-- **Deny-set seeding has two modes (`snapshot_path`).** Unset (ephemeral/Fargate): scan
-  `blackhole.*` from NATS each cold boot. Set (edge/tunnel, durable disk): load slipstream's on-disk
-  snapshot (entries + saved cursor), enforce immediately on restart **before NATS reconnects**, and
-  append each applied delta back to the file. The snapshot is a pure cache — delete it and the
-  gateway falls back to scanning; a `CursorExpired` (history compacted past the cursor) does the same.
-- **Auth works without NATS** (keys from config); a NATS outage only staleens the deny-set.
-- **Two-tier rate guardrail, checked _before_ verify/connect, not a spend control.** The deny-set is
-  the spend/fraud authority but reacts on a lag and never sees floods that don't bill (auth failures,
-  4xx, BYO). Two fixed-memory count-min tiers (`ratelimit`, pingora-limits) cap velocity:
-  - **Per-credential** — keyed by a seeded hash of the raw presented key (so collisions can't be
-    precomputed to false-throttle another caller). Bounds a leaked/runaway key during deny-set lag, a
-    retry-storm flood, **and the Ed25519-verify cost of a forged-key flood**: keying on the raw
-    credential (not the verified tenant) is what lets the guard sit _ahead of_ the verify (the
-    gateway's one ~28µs/req CPU cost; see Benchmarking), so a single bad token can't drive unbounded
-    crypto work. Granularity is per-credential ≈ per-(tenant, app), since virtual keys are
-    deterministic per that pair — not a per-tenant aggregate.
-  - **Global BYO aggregate** — one shared bucket for _all_ BYO traffic. BYO connects outward to
-    providers _from our egress IPs_ carrying the caller's token, so a flood of distinct **junk** BYO
-    tokens (which slip past per-credential keying — each is its own bucket) would get those IPs
-    rate-limited or banned by the provider, hurting _everyone_. This bounds that aggregate regardless
-    of token variation. **Managed traffic is exempt** — it's verified before any upstream connect and
-    can't be forged, so a random `bai_…` flood fails verify and never reaches a provider; exempting it
-    keeps this shared bucket from ever shedding core tenant load. **Per-source-IP was considered and
-    rejected** as the primary control: it depends on the calling task's real IP surviving ECS Service
-    Connect (unconfirmed), and is worse than nothing if the peer is a collapsed mesh hop — so we chose
-    the topology-independent aggregate. The blunt cap's residual (it sheds legit BYO under a flood; the
-    default is an untuned guess; the real selective fix is a provider-feedback circuit breaker on
-    upstream 401s) is recorded in full in the `ratelimit` **module-doc decision block** — read it
-    before changing the knob or reaching for per-IP.
-
-  Both tiers are generous circuit breakers, not quotas; `rate_limit_rps = 0` / `byo_rate_limit_rps = 0`
-  disable them independently.
-- **Routing is by the first path segment = provider** (model isn't known before peer selection).
-  `/{provider}/…` selects the provider and the rest of the path is forwarded **verbatim** — the
-  gateway holds no per-provider mount knowledge, so the client uses the provider's own native path
-  (`/groq/openai/v1/…`, `/fireworks/inference/v1/…`). A bare path with no provider prefix that starts
-  with `/v1` is the **drop-in default** (dialect → openai/anthropic); an unknown segment is a **404**.
-  **Providers are data** — a row in `route::KNOWN_PROVIDERS` (name, authority, dialect, auth scheme)
-  or a config entry — so adding an OpenAI-wire provider is one line, no new code paths. Each row's
-  authority/auth is **verified against the provider's official docs (cited inline in `route.rs`)**.
-- **Connect retries only** (`fail_to_connect`); no HTTP-status retry (Pingora-idiomatic, SDKs back off).
-- **`ai.usage` carries _both_ models: `model` (resolved) + `requested_model` (alias).** `model` is
-  the id the provider resolved + billed, taken from the _response_ (a second `ModelScanner` over the
-  response head; works for SSE — it skips the `data:` prefix and reads the first chunk's root
-  `model`). It's the key for pricing **and** for reconciling against the provider's invoice, which
-  itemizes by the pinned snapshot (`gpt-4o-2024-08-06`), not the alias. `requested_model` is what the
-  client sent (`gpt-4o`) — product analytics, and a fallback rate when a snapshot is newer than the
-  downstream price table. The two are equal when the response carried no model (error body), where
-  `model` falls back to the alias. Emitting both is additive: a consumer that keyed on the alias
-  doesn't break, and reconciliation still gets the exact id.
-- **Pricing is never here** — emit token _facts_; a closed downstream consumer prices.
+---
+
+## Core Mechanism
+
+### Routing (`route.rs`)
+
+Providers are **data rows**, not code paths. `KNOWN_PROVIDERS` in `route.rs` lists 10 built-in
+providers (openai, anthropic, openrouter, fireworks, groq, deepseek, together, cerebras, mistral,
+xai); each row carries its authority (host:port), dialect (OpenAI-wire vs Anthropic-wire), and auth
+scheme (Bearer vs x-api-key). The `provider_authorities` config key adds or overrides rows at boot
+with zero code change.
+
+The routing rule: **first path segment = provider name**. `/groq/openai/v1/chat/completions` routes
+to Groq and forwards `/openai/v1/chat/completions` verbatim. A bare `/v1/…` path routes to the
+default provider for its dialect (`openai` or `anthropic`, derived from the path). Unknown segment
+→ 404. Model is not known at peer-selection time and is never used for routing.
+
+### Identity (`key.rs`)
+
+Virtual key format: `bai_v1.{kid}.{payload}.{sig}` where payload is exactly 16 bytes (8-byte
+`tenant_id` + 8-byte `vpc_id`, little-endian u64). Verification is **stateless Ed25519** — no
+database, no network call. The keyring holds multiple `kid` → public key mappings simultaneously
+(zero-downtime rotation: add the new kid, deploy, remove the old kid). A tampered or forged key
+is rejected with 401 (see Data Flow); the error does not reveal which part failed.
+
+Verification cost ≈ 28µs per request — this is the gateway's only meaningful per-request CPU cost
+(everything else runs in nanoseconds; see Benchmarking). The rate guardrails sit **before** verify
+precisely because of this: a forged-key flood is rejected in tens of nanoseconds, not 28µs each.
+
+### Model Extraction (`peek.rs:ModelScanner`)
+
+A streaming structural scanner fed body or response chunks as they arrive. Tracks JSON nesting
+depth, string-escape state, and quote boundaries. Captures the **root-level `model` field only**
+(depth 0 in the object), ignoring nested `model` keys in tool calls or message content.
+SIMD-accelerated via `memchr2` to skip over large string values (base64-encoded images, long
+prompts). O(1) memory: one struct, no heap growth with payload size — proven by the unit bench
+which shows a single allocation independent of whether the body is 0 bytes, 4 KB, or 256 KB.
+
+The billing fact carries **two model fields**:
+
+- `requested_model` — what the client sent (extracted from the request body)
+- `model` — what the provider resolved and billed (extracted from the response head; falls back to
+  `requested_model` when the response carries no model field, e.g. an error body)
+
+`model` is what reconciles against the provider's invoice (which itemizes by pinned snapshot, e.g.
+`gpt-4o-2024-08-06`, not alias). `requested_model` serves product analytics and as a fallback rate
+when the snapshot is newer than the downstream price table.
+
+### Usage Extraction (`usage.rs`)
+
+The parser reads the tail tap during the `logging` phase. Two dialects:
+
+| Dialect   | Format     | Fields                                                                                                            |
+| --------- | ---------- | ----------------------------------------------------------------------------------------------------------------- |
+| OpenAI    | JSON body  | `usage.prompt_tokens`, `usage.completion_tokens`, `usage.prompt_tokens_details.cached_tokens`                     |
+| OpenAI    | SSE stream | Terminal `data:` line (before `[DONE]`), same fields                                                              |
+| Anthropic | JSON body  | `usage.input_tokens`, `usage.output_tokens`, `usage.cache_read_input_tokens`, `usage.cache_creation_input_tokens` |
+| Anthropic | SSE stream | `message_delta` event with `usage` block                                                                          |
+
+Missing usage fields deserialize to zero (safe default). If the tail is truncated by the
+compaction drain, the usage chunk is still present because SSE usage arrives at the very end of the
+stream (for OpenAI, the last `data:` line before `[DONE]`) and the tail keeps the last 64KB.
+
+### Deny-Set (`deny.rs`)
+
+A `HashMap<u64, DenyReason>` (tenant_id → reason). Only denied tenants are stored — the map is
+`O(denied)` in memory regardless of total tenant count. Lookup is one hash probe. Written
+exclusively by the NATS watcher via `ArcSwap`; reads on the hot path are lock-free.
+
+Reasons: `Spend` (→ 402), `Fraud` (→ 403), `Unknown` (→ 403, fail-safe for unrecognized values).
+Restore = explicit delete from NATS KV or TTL expiry — no gateway-side timer.
+
+### Rate Guardrails (`ratelimit.rs`)
+
+Two fixed-memory count-min sketch tiers, checked before Ed25519 verify and before any upstream
+connection:
+
+| Tier                 | Key             | Storage      | Default ceiling | Managed exempt? |
+| -------------------- | --------------- | ------------ | --------------- | --------------- |
+| Per-credential       | Hash of raw key | 5 MB sketch  | 100 req/s       | No              |
+| Global BYO aggregate | Single bucket   | 1 bucket     | 1000 req/s      | **Yes**         |
+
+The per-credential tier is keyed on the **raw presented credential** (not the verified tenant),
+which has two consequences: (1) the guard sits ahead of verify, so forged tokens are rejected
+before any crypto work; (2) virtual keys are deterministic per `(tenant, app)`, so this is
+effectively per-(tenant, app) granularity without a registry lookup.
+
+The global BYO aggregate exists because BYO traffic exits from the gateway's own egress IPs
+carrying the caller's raw token. In a flood of distinct junk BYO tokens, each token gets its own
+per-credential bucket and slips through that tier — the aggregate caps total BYO egress rate to
+protect the gateway's IP reputation with providers. Managed traffic is exempt because it's verified
+before any upstream connection and cannot be forged.
+
+Both tiers are generous circuit breakers, not quotas. `rate_limit_rps = 0` / `byo_rate_limit_rps =
+0` disable them independently.
+
+---
+
+## Why It Behaves This Way
+
+### Why rate guardrails sit before Ed25519 verify
+
+Ed25519 verify is ~28µs — roughly 350–650× more expensive than every other per-request operation.
+A flood of forged `bai_v1` tokens could drive unbounded crypto work if the rate limit came after
+verify. By checking the per-credential bucket first (keyed on the raw token, no crypto), a
+forged-key flood is rejected in tens of nanoseconds per request. Legit traffic is unaffected: the
+rate guard passes through, then verify runs as normal. The unit bench (`benches/unit.rs`) asserts
+this: `key/verify` ≈ 28µs; `ratelimit::check` ≈ 43–83ns; 0 allocations for either.
+
+### Why the body injection exception exists (`managed + OpenAI + streaming`)
+
+OpenAI streams no usage chunk unless `stream_options.include_usage: true` is set. Without it, a
+streaming managed request is unmeterable: no usage block in the response means no billing fact. The
+gateway injects this field server-side so callers using stock SDKs get metered without any
+cooperation. The request is buffered (`MAX_REQUEST_BODY` cap), the field injected, and the body
+re-framed as chunked upstream. Scoped to managed + OpenAI-dialect + streaming only — BYO and
+non-streaming requests remain pure passthrough.
+
+### Why the deny-set watch resumes from a saved revision
+
+A plain `watch_prefix` (NATS `DeliverPolicy::New`) would miss any entry written in the window
+between the initial seed scan and the live watch attaching. `store_watch.rs` records the stream
+revision at which the seed was complete and calls `watch_prefix_from` to resume from that revision
+— so a deny written during the gap is delivered, not silently dropped. This revision is also
+persisted across reconnects, so a NATS blip resumes from the last-seen point instead of re-scanning
+the entire keyspace.
+
+### Why BYO token validity is never checked
+
+Checking a BYO token requires a round-trip to the provider. The provider does that check anyway and
+returns 401 if the token is invalid — the client sees the same rejection it would get going direct,
+just routed through the gateway. Adding a gateway-side preflight check would add a provider
+round-trip to every BYO request, valid or not, with no security benefit at the gateway layer.
+
+### Why pricing is absent from the gateway
+
+The gateway emits token _facts_ (`ai.usage`): counts and model identifiers. Applying prices to
+those facts is a downstream concern. Provider pricing changes frequently, varies by contract tier,
+and is sometimes retroactively corrected on invoices. A downstream consumer can reprice historical
+facts; the gateway's facts cannot be regenerated once the request is gone.
+
+### Why routing uses the first path segment, not a header
+
+Path-based routing makes the target provider explicit in every request URL — visible in logs,
+traces, and curl output without inspecting headers. It also survives transparent proxies and load
+balancers that strip custom headers. A `/{provider}/` prefix was preferred over a separate header
+because SDKs already let callers set the base URL; swapping in the gateway's URL with a provider
+prefix requires no SDK modification.
+
+---
 
 ## Trust Boundaries
 
@@ -147,9 +254,9 @@ client (stock SDK, Bearer/ x-api-key)
 
 - Virtual key signature (Ed25519, stateless — no DB lookup)
 - Virtual key format (`bai_v1.{kid}.{payload}.{sig}`, fixed 16-byte payload)
-- Tenant not in deny-set (managed traffic only)
-- Pool key configured for the requested provider (managed traffic only)
-- Request body size ≤ `MAX_REQUEST_BODY` (declared Content-Length + streaming running total)
+- Tenant not in deny-set (managed traffic only; O(1) HashMap lookup)
+- Pool key configured for the requested provider (managed traffic only — else 503)
+- Request body size ≤ `MAX_REQUEST_BODY` (declared `Content-Length` + streaming running total)
 - Per-credential request rate within ceiling; aggregate BYO rate within ceiling
 
 **What passes through unchecked:**
@@ -164,7 +271,7 @@ client (stock SDK, Bearer/ x-api-key)
 
 - Body schema validation belongs to the provider — duplicate validation adds latency without a
   security benefit at the gateway layer
-- Model validation would require a per-provider allowlist coupled to model release cadence
+- Model allowlisting would require a per-provider list coupled to model release cadence
 - BYO token validation requires a provider round-trip — the provider does it anyway
 
 ---
@@ -172,24 +279,26 @@ client (stock SDK, Bearer/ x-api-key)
 ## Configuration
 
 All fields configurable via `config.example.toml` and environment (`AI_` prefix, flat merge).
-Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret` — stray `Debug`/`Serialize`
-output redacts values.
-
-| Field                         | Default                           | Runtime Effect                                                                                                                                                                                                                       |
-| ----------------------------- | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `signing_keys`                | _(required)_                      | Map of kid → base64 Ed25519 public key. Multiple kids enable rotation. Missing → all traffic falls through to BYO treatment.                                                                                                         |
-| `require_signing_keys`        | `false`                           | When `true`, an empty `signing_keys` is a hard boot failure instead of silent BYO-only mode. Set on managed deployments so a typo'd/absent SSM param fails fast rather than serving for free (no key swap, no deny-set, no billing). |
-| `pool_keys.<name>`            | _(from `AI_POOL_KEY_<NAME>` env)_ | Real provider API key. Missing for a provider → managed requests to that provider return 503.                                                                                                                                        |
-| `provider_authorities.<name>` | _(none)_                          | Override or add a provider's `authority` (host:port). Enables config-added providers beyond `KNOWN_PROVIDERS` with zero code change.                                                                                                 |
-| `snapshot_path`               | _(unset)_                         | Path for the on-disk deny-set cache. Unset → re-scan NATS on every cold boot. Set → load from disk and enforce before NATS reconnects (edge/tunnel deployments).                                                                     |
-| `rate_limit_rps`              | `100`                             | Per-credential request ceiling (count-min, keyed on raw key hash). `0` disables. Exceeded → 429. Checked before Ed25519 verify.                                                                                                      |
-| `byo_rate_limit_rps`          | `1000`                            | Aggregate ceiling for all BYO traffic (single shared bucket). `0` disables. Managed traffic exempt.                                                                                                                                  |
-| `connect_timeout_secs`        | `10`                              | TCP connect timeout to the upstream provider. Exceeded → retry up to 2×, then 502.                                                                                                                                                   |
-| `read_timeout_secs`           | `600`                             | Response read timeout. 10 minutes accommodates long-running LLM streams.                                                                                                                                                             |
-| `nats_url`                    | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (stale or empty set).                                                                                                                                                  |
-| `nats_creds`                  | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                                                                                     |
-| `listen_addr`                 | `0.0.0.0:8080`                    | Proxy listener address.                                                                                                                                                                                                              |
-| `metrics_listen`              | `0.0.0.0:9090`                    | Internal admin/observability listener: `/metrics` (Prometheus scrape), `/livez`, `/readyz`. Separate from the client listener — not externally reachable.                                                                            |
+Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret<T>` — stray `Debug` or
+`Serialize` output redacts to `"***"` and the value is zeroized on drop (`secret.rs`).
+
+| Field                         | Default                           | Runtime Effect                                                                                                                                                                                         |
+| ----------------------------- | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `signing_keys`                | _(empty)_                         | Map of kid → base64 Ed25519 public key. Multiple kids enable rotation. Missing → all traffic falls through to BYO treatment.                                                                           |
+| `require_signing_keys`        | `false`                           | When `true`, an empty `signing_keys` is a hard boot failure instead of silent BYO-only mode. Set on managed deployments so a typo'd/absent SSM param fails fast rather than silently serving for free. |
+| `pool_keys.<name>`            | _(from `AI_POOL_KEY_<NAME>` env)_ | Real provider API key. Missing for a provider → managed requests to that provider return 503 before any upstream connection.                                                                           |
+| `provider_authorities.<name>` | _(none)_                          | Override or add a provider's `authority` (host:port). Enables config-added providers beyond `KNOWN_PROVIDERS` with zero code change.                                                                   |
+| `snapshot_path`               | _(unset)_                         | Path for the on-disk deny-set cache. Unset → re-scan NATS on every cold boot. Set → load from disk and enforce before NATS reconnects (edge/tunnel deployments).                                       |
+| `rate_limit_rps`              | `100`                             | Per-credential request ceiling (count-min, keyed on raw key hash). `0` disables. Exceeded → 429. Checked before Ed25519 verify.                                                                        |
+| `byo_rate_limit_rps`          | `1000`                            | Aggregate ceiling for all BYO traffic (single shared bucket). `0` disables. Managed traffic exempt. Exceeded → 429.                                                                                    |
+| `connect_timeout_secs`        | `10`                              | TCP connect timeout to the upstream provider. Exceeded → retry up to 2×, then 502.                                                                                                                     |
+| `read_timeout_secs`           | `600`                             | Response read timeout (10 min accommodates long-running LLM streams).                                                                                                                                  |
+| `write_timeout_secs`          | `60`                              | Upstream request-write timeout (sending the request to the provider).                                                                                                                                  |
+| `idle_timeout_secs`           | `90`                              | Idle timeout on a pooled upstream connection before it's closed.                                                                                                                                       |
+| `nats_url`                    | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (deny-set stays empty or stale).                                                                                                         |
+| `nats_creds`                  | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                                                       |
+| `listen_addr`                 | `0.0.0.0:8080`                    | Proxy listener address (client traffic).                                                                                                                                                               |
+| `metrics_listen`              | `0.0.0.0:9090`                    | Internal admin/observability listener: `/metrics` (Prometheus scrape), `/livez`, `/readyz`. Separate from the client listener — not externally reachable.                                              |
 
 ---
 
@@ -200,33 +309,56 @@ output redacts values.
 | NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                                        | Watcher reconnects; seeds from NATS or disk snapshot on connect.                                                |
 | NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                                                | Watcher reconnects (1s→30s exponential backoff, reset on success) and resumes from saved revision — no re-scan. |
 | NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                                        | After re-scan, new cursor set; delta watch resumes normally.                                                    |
-| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event.                                                       | Billing miss detectable downstream; no security boundary breach.                                                |
+| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event. No error reveals which part failed.                   | Billing miss detectable downstream; no security boundary breach.                                                |
 | `signing_keys` absent (typo'd/missing SSM)  | Default: warn + BYO-only (silently drops all managed billing + deny-set). With `require_signing_keys=true`: hard boot failure. | Set `require_signing_keys=true` on managed deployments so the mis-deploy fails fast and visibly at boot.        |
 | Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                                    | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                                      |
 | Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                                                 | TTL-cached DNS (60s) serves stale; poisoned-lock guard re-resolves on next request.                             |
 | Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502.                                                                          | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                                    |
-| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail.                    | No action — O(1) tail tap is designed for this; SSE usage is always in the final data line.                     |
-| Gateway crash mid-request                   | In-flight request drops; client receives TCP close, not a structured error. No partial state written.                          | Client SDK retries. No DB writes in the request path — no cleanup needed.                                       |
+| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail.                    | No action — SSE usage is always in the final `data:` line, which always lands in the tail.                      |
+| Gateway crash mid-request                   | In-flight request drops; client receives TCP close. No partial state written.                                                  | Client SDK retries. No DB writes in the request path — no cleanup needed.                                       |
+
+---
+
+## Metrics
+
+Prometheus on the default registry, exposed at `/metrics` on `metrics_listen`.
+
+| Metric                        | Type      | Labels               | What It Measures                                                         |
+| ----------------------------- | --------- | -------------------- | ------------------------------------------------------------------------ |
+| `ai_requests_total`           | Counter   | —                    | Total admitted requests                                                  |
+| `ai_rejections_total`         | Counter   | `reason`             | Rejected requests by cause (auth, deny_spend, deny_fraud, rate_limit, …) |
+| `ai_upstream_responses_total` | Counter   | `provider`, `status` | Upstream responses by provider and status class                          |
+| `ai_tokens_total`             | Counter   | `kind`               | input / output / cache_read / cache_write token counts                   |
+| `ai_ttft_seconds`             | Histogram | `provider`           | Time to first token (50ms–30s buckets)                                   |
+| `ai_upstream_latency_seconds` | Histogram | `provider`           | Full request latency (100ms–600s buckets)                                |
+| `ai_active_streams`           | Gauge     | —                    | Open SSE streams                                                         |
+| `ai_requests_in_flight`       | Gauge     | —                    | All in-flight requests (streaming + non-streaming)                       |
+| `ai_deny_set_size`            | Gauge     | —                    | Current number of denied tenants                                         |
+| `ai_nats_connected`           | Gauge     | —                    | 1 if NATS watcher is connected, 0 otherwise                              |
 
 ---
 
 ## Modules
 
-| Module                    | Role                                                                          | Tested        |
-| ------------------------- | ----------------------------------------------------------------------------- | ------------- |
-| `key`                     | `bai_v1` parse + Ed25519 verify + mint; stateless identity                    | unit ✓        |
-| `route`                   | data-driven provider table (name/authority/auth) + dialect default            | unit ✓        |
-| `peek`                    | `ModelScanner` — streaming structural scan for the exact root-level `model`   | unit ✓        |
-| `usage`                   | token extraction (OpenAI/Anthropic, body + SSE)                               | unit ✓        |
-| `deny`                    | sparse deny-set, default-allow, reason → status                               | unit ✓        |
-| `ratelimit`               | two-tier guardrail: per-credential + global BYO (count-min, fixed mem, no GC) | unit ✓        |
-| `secret`                  | redacting, zeroize-on-drop `Secret` newtype                                   | unit ✓        |
-| `config`                  | Figment config; build keyring; pool keys/authorities by provider name         | unit ✓        |
-| `state`                   | keyring + resolved provider registry + watched deny-set + TTL DNS cache       | unit ✓        |
-| `store_watch`             | the single NATS watcher (deny-set), as a Pingora `BackgroundService`          | —             |
-| `proxy`                   | the `ProxyHttp` impl                                                          | e2e ✓         |
-| `admin`                   | `ServeHttp` on the metrics listener: `/livez`, `/readyz`, `/metrics`          | e2e ✓         |
-| `metrics`/`doctor`/`main` | Prometheus, diagnostics, bootstrap                                            | e2e/compile ✓ |
+| Module        | Role                                                                                        | Tested    |
+| ------------- | ------------------------------------------------------------------------------------------- | --------- |
+| `proxy`       | `ProxyHttp` impl — request/response pipeline (request_filter through logging)               | e2e ✓     |
+| `key`         | `bai_v1` parse + Ed25519 verify + mint; keyring with multi-kid rotation support             | unit ✓    |
+| `route`       | Data-driven provider table (name / authority / auth) + dialect default routing              | unit ✓    |
+| `peek`        | `ModelScanner` — streaming structural scan for the root-level `model`; O(1) memory          | unit ✓    |
+| `usage`       | Token extraction (OpenAI / Anthropic, body + SSE)                                           | unit ✓    |
+| `deny`        | Sparse deny-set, default-allow, reason → HTTP status                                        | unit ✓    |
+| `ratelimit`   | Two-tier guardrail: per-credential + global BYO (count-min sketches, fixed memory, no GC)   | unit ✓    |
+| `state`       | Keyring + resolved provider registry + watched deny-set (ArcSwap) + TTL DNS cache           | unit ✓    |
+| `store_watch` | NATS watcher — gap-free deny-set seeding + delta watch as Pingora `BackgroundService`       | e2e ✓     |
+| `config`      | Figment config; build keyring; pool keys / authorities by provider name                     | unit ✓    |
+| `secret`      | Redacting, zeroize-on-drop `Secret<T>` newtype for pool keys and NATS creds                 | unit ✓    |
+| `admin`       | `ServeHttp` on the metrics listener: `/livez`, `/readyz`, `/metrics`                        | e2e ✓     |
+| `metrics`     | Prometheus counter/histogram/gauge registration and update helpers                          | compile ✓ |
+| `doctor`      | Boot-time diagnostics (`beyond-ai doctor`)                                                  | compile ✓ |
+| `main`        | CLI (`run` / `doctor`), rustls init, config load, Pingora server + three services bootstrap | compile ✓ |
+
+---
 
 ## Verification
 
@@ -236,71 +368,54 @@ output redacts values.
   nats-server + mock upstream. Covers managed key-swap + passthrough fidelity + usage metering
   (OpenAI JSON + SSE, **Anthropic `/v1/messages`** with `x-api-key` swap + metering), **BYO
   passthrough** (raw token unchanged), the **virtual key in either inbound header** (`Bearer` or
-  `x-api-key`), and deny-set propagation: spend (write `blackhole.{tenant}` → 402, delete → 200) and
-  **fraud** (→ 403). Error/edge paths: **missing key → 401**, **oversized `Content-Length` → 413**,
-  **managed key for an unconfigured provider → 503**, **streaming tail compaction** (>128KB before
-  the usage chunk still meters), **deny-set fail-open** (kill NATS → stale set retained, auth still
-  works), and **on-disk snapshot survival** (blackhole a tenant, restart with NATS down → the hold is
-  still enforced from disk). Managed/BYO/streaming seed **nothing** in NATS (signkey/pool keys from
-  config), demonstrating auth's independence from NATS.
+  `x-api-key`), and deny-set propagation: spend (write `blackhole.{tenant}` → 402, delete → 200)
+  and **fraud** (→ 403). Error/edge paths: **missing key → 401**, **oversized `Content-Length` →
+  413**, **managed key for an unconfigured provider → 503**, **streaming tail compaction** (>128KB
+  before the usage chunk still meters), **deny-set fail-open** (kill NATS → stale set retained,
+  auth still works), and **on-disk snapshot survival** (blackhole a tenant, restart with NATS down
+  → the hold is still enforced from disk).
 - **Live smoke (`tests/smoke.rs`, `mise run test:smoke`):** the real `beyond-ai` binary against the
-  **real** provider hosts over TLS, one per provider in `KNOWN_PROVIDERS`. Proves what docs and the
-  mock can't — real TLS/SNI, the `/v1`→base-path rewrite landing on a live mount (200, not 404), and
-  auth passthrough. Traffic is BYO (the env key forwarded as the caller's token). Doubly guarded:
-  every test is `#[ignore]` (a plain `cargo test` skips them) **and** skips unless its provider's API
-  key env var (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GROQ_API_KEY`, …) is set — so CI stays
-  hermetic and you only hit providers you have keys for.
+  **real** provider hosts over TLS, one per provider in `KNOWN_PROVIDERS`. Proves real TLS/SNI,
+  the `/v1` → base-path rewrite landing on a live mount (200, not 404), and BYO auth passthrough.
+  Every test is `#[ignore]` and skips unless its provider's API key env var is set — CI stays
+  hermetic; you only hit providers you have keys for.
+
+---
 
 ## Benchmarking
 
-Two harnesses, best-tool-per-job, mirroring the unit/e2e split of the tests. The framing is
-**Theory of Constraints**: a proxy's steady-state constraint is upstream I/O, not gateway CPU — the
-whole design exists to _stay off the critical path_. So the benches don't chase micro-optimizations;
-they **prove the gateway's added cost is negligible and bounded**, i.e. that we never become the
-constraint. Every bench maps to a function that runs on the per-request hot path (`proxy.rs`).
-
-- **Unit micro (`benches/unit.rs`, `mise run bench:unit`) — `divan`.** Times the IO-free hot paths
-  **and** measures allocations natively: divan's `AllocProfiler` (installed as the global allocator)
-  reports alloc/dealloc/grow **count + bytes** beside ns/iter, no extra plumbing — and stays clear of
-  the crate's `#![deny(unsafe_code)]` (a hand-rolled `GlobalAlloc` would need `unsafe impl`). Coverage
-  follows the hot path: `key` verify/mint; `peek::ModelScanner` over 0/4KB/256KB bodies with `model`
-  placed _last_ = worst case; `usage` parsers; `route`; `deny` (both the off-path ingest parse,
-  `parse_key`/`parse_reason`, **and** the on-path `reason()` lookup run on every managed request); and
-  `ratelimit::check` (both tiers — `check_managed` runs the per-credential tier only; `check_byo` runs
-  the per-credential tier **plus** the global BYO aggregate bucket). This makes the design's
-  allocation/complexity claims _assertable_: `key/verify` shows **0 allocs** (stack-only
-  decode — divan omits the alloc rows entirely), `peek` a flat **1 alloc** independent of body size
-  (the O(1)-memory claim), `route`/`deny::parse_key` **0 allocs**, **`deny::reason` is 0-alloc and flat
-  across 0→1M denied tenants** (the O(1)-lookup, `O(denied)`-memory claim — ~1ns/8ns), and
-  **`ratelimit::check` is 0-alloc** (~43ns managed / ~83ns BYO — the delta is the second tier's bucket
-  `observe` plus hashing a longer token; fixed-memory count-min, no per-credential entry). A regression
-  surfaces as a non-zero / grown / size-scaling number. **The headline this bench exists to assert:
-  `key/verify` ≈ 28µs is ~350–650× every other per-request op** (deny lookup, ratelimit, route all in
-  the **nanoseconds**), so verify is the gateway's one real per-request CPU cost — the constraint that
-  motivates checking the rate guardrails _before_ it (`proxy::request_filter`), so a forged-key flood
-  is rejected for tens of ns instead of ~28µs each. Everything else is allocation-free and invisible
-  against a network round trip.
-- **A-1 end-to-end (`benches/e2e.rs`, `mise run bench:e2e`) — `criterion`.** The real `beyond-ai`
-  binary + real nats-server + mock upstream (reuses `tests/common` verbatim), driven over real HTTP —
-  measures the whole request path across four cases that **decompose** where time goes:
-  `reject_missing_key_latency` (401, short-circuited before any upstream connection — the bare
-  transport floor), `byo_json_latency` (pure passthrough), `managed_json_latency` (verify + deny +
-  key swap), and `managed_sse_latency` (exercises the streaming response tap: tail buffer + bounded
-  compaction). Plus a concurrent-throughput group. criterion is chosen for its saved-baseline
-  comparison (`--save-baseline`), which tracks latency/RPS drift across runs. Allocations are _not_
-  measured (the gateway is a separate process — its heap is invisible to the bench); that's the unit
-  bench's job. Needs `nats-server` on PATH (mise provides it).
-  - **What the decomposition shows (loopback laptop) — and its limit:** all four cases land in a
-    ~110–120µs band, and run-to-run variance is **±15–20µs** (loopback sub-150µs round-trips are
-    dominated by OS scheduling jitter). That noise floor is _larger_ than the gateway's own per-request
-    CPU (verify ≈28µs, everything else ns) — so this harness **cannot resolve** the verify cost, and the
-    reject/BYO/managed cases are statistically indistinguishable here. Two honest conclusions follow:
-    (1) the right tool for the gateway's CPU cost is the in-process `unit` bench, not this one; (2) for
-    _legitimate_ managed traffic the e2e latency is **expected to be flat** across the verify reorder —
-    moving the rate guard before verify doesn't change the legit path (verify still runs); its win is on
-    the _throttled_ path (verify skipped, proven at the unit level: 42ns vs 28µs) and in per-request
-    allocator pressure (the lazy `resp_tail`, below this harness's resolution). What this harness _is_
-    good for: catching gross regressions (a buffering mistake, a dropped connection-pool, an O(n) added
-    to the path would move the band by far more than 20µs) and the saved-baseline RPS trend over time.
+Two harnesses, mirroring the unit/e2e split of the tests. The framing is **Theory of Constraints**:
+a proxy's steady-state constraint is upstream I/O, not gateway CPU. The benches **prove the
+gateway's added cost is negligible and bounded** — i.e. it never becomes the constraint.
+
+- **Unit micro (`benches/unit.rs`, `mise run bench:unit`) — `divan`.** Times IO-free hot paths and
+  measures allocations natively (divan's `AllocProfiler` reports alloc/dealloc/grow count + bytes
+  beside ns/iter, no `unsafe` needed). Coverage: `key` verify/mint; `peek::ModelScanner` over
+  0/4KB/256KB bodies with `model` placed last (worst case); `usage` parsers; `route`; `deny`
+  (`parse_key`/`parse_reason` off-path + `reason()` on-path); `ratelimit::check` (managed tier
+  only vs. BYO which runs both tiers).
+
+  What the alloc numbers assert:
+  | Operation           | Cost     | Allocations                  | Claim verified                |
+  | ------------------- | -------- | ---------------------------- | ----------------------------- |
+  | `key/verify`        | ~28µs    | 0                            | Stack-only Ed25519 decode     |
+  | `peek/ModelScanner` | varies   | 1 (independent of body size) | O(1) memory                   |
+  | `route`             | ~ns      | 0                            | —                             |
+  | `deny::reason`      | ~1–8ns   | 0, flat 0→1M entries         | O(1) lookup, O(denied) memory |
+  | `ratelimit::check`  | ~43–83ns | 0                            | Fixed-memory count-min        |
+
+  **Headline: `key/verify` ≈ 28µs is ~350–650× every other per-request op.** This is why the rate
+  guardrail sits before verify in `proxy::request_filter`.
+
+- **End-to-end (`benches/e2e.rs`, `mise run bench:e2e`) — `criterion`.** Real `beyond-ai` binary +
+  real nats-server + mock upstream (reuses `tests/common`). Four decomposed cases:
+  `reject_missing_key_latency` (401, short-circuit before any upstream connection — transport floor),
+  `byo_json_latency` (pure passthrough), `managed_json_latency` (verify + deny + key swap),
+  `managed_sse_latency` (streaming response tap). Plus a concurrent-throughput group.
+
+  All four cases land in ~110–120µs on loopback with ±15–20µs jitter — larger than the gateway's
+  own CPU cost. This harness cannot resolve the verify cost (that's the unit bench's job). Its value:
+  catching gross regressions (a buffering mistake, a dropped connection pool, or an added O(n) path
+  would each move the band by far more than 20µs) and tracking the RPS trend via `--save-baseline`.
 
 `mise run bench` runs both.
diff --git a/src/admin.rs b/src/admin.rs
index 3e924d4..8b52d3d 100644
--- a/src/admin.rs
+++ b/src/admin.rs
@@ -2,28 +2,38 @@
 //! `/metrics`.
 //!
 //! Matches the Beyond service convention (cf. `auth`, `objects`): the body is `{"status",
-//! "version"}` and there are two probes. Both return 200 once the process is answering, because
-//! the gateway is **fail-open by design** — auth + key swap come from boot config, and a NATS
-//! outage degrades only the (stale) deny-set, never the ability to serve. So readiness must *not*
-//! gate on NATS: a cold boot with NATS down can still serve correctly, and reporting not-ready
-//! would pull a healthy gateway out of the load balancer for no reason. Readiness here therefore
-//! means "listeners up + boot config loaded" — which is true the instant we can answer this
-//! request (state is built before the server starts; a build failure `exit`s in `main`).
-//! `readyz` is kept distinct from `livez` only to honor the orchestrator's two-probe convention.
+//! "version"}` and there are two probes. **Both always return HTTP 200** once the process is
+//! answering, because the gateway is **fail-open by design** — auth + key swap come from boot
+//! config, and a NATS outage degrades only the (stale) deny-set, never the ability to serve. So
+//! readiness must *not* gate on NATS: a cold boot with NATS down can still serve correctly, and a
+//! non-200 would pull a healthy gateway out of the load balancer for no reason.
+//!
+//! `readyz` does, however, carry a distinct *body* signal that `livez` doesn't: when the deny-set
+//! watcher is disconnected from NATS, `readyz` reports `"status":"degraded"` (still 200). This lets
+//! an operator alert on "readyz has been degraded for >N minutes" — the spend/fraud enforcement is
+//! stale — without ever risking an LB eviction. `livez` is pure liveness: 200/`"ok"` whenever the
+//! process can answer. (The `ai_nats_connected` gauge is the same signal in Prometheus; the body
+//! flag is for orchestrators that probe HTTP but don't scrape.)
 //!
 //! Implemented as a Pingora `ServeHttp` app so all three paths share the one (internal) metrics
 //! port — Pingora's built-in prometheus service only serves `/metrics`, so we hand-route all three.
 
+use crate::metrics::Metrics;
 use async_trait::async_trait;
 use http::Response;
 use pingora_core::apps::http_app::ServeHttp;
 use pingora_core::protocols::http::ServerSession;
 use prometheus::{Encoder, TextEncoder};
+use std::sync::Arc;
 
 /// Compile-time service version, surfaced in every health body (matches the sibling services).
 const VERSION: &str = env!("CARGO_PKG_VERSION");
 
-pub struct AdminApp;
+pub struct AdminApp {
+    /// Read-only handle to the metric gauges. Used by `/readyz` to reflect NATS connectivity in the
+    /// health body (never to gate the HTTP status — see module docs).
+    pub metrics: Arc<Metrics>,
+}
 
 impl AdminApp {
     /// Build a `{"status","version"}` JSON health response. `status` is `"ok"`/`"degraded"` so a
@@ -46,7 +56,10 @@ impl AdminApp {
     #[allow(clippy::expect_used)] // builder inputs are encoder-derived/integer; cannot fail
     fn metrics() -> Response<Vec<u8>> {
         let encoder = TextEncoder::new();
-        let mut buffer = Vec::new();
+        // Pre-size for a typical scrape: the gateway's fixed metric set renders to a few KiB of
+        // text, so one allocation up front avoids the handful of reallocs `Vec::new` would incur as
+        // the encoder appends. 8 KiB comfortably covers the current set with headroom.
+        let mut buffer = Vec::with_capacity(8 * 1024);
         // `encode` only errors if the writer fails; a `Vec` never does, so the result is infallible
         // here — discard it explicitly (the crate denies `unused_must_use`).
         let _ = encoder.encode(&prometheus::gather(), &mut buffer);
@@ -63,9 +76,19 @@ impl AdminApp {
 impl ServeHttp for AdminApp {
     async fn response(&self, session: &mut ServerSession) -> Response<Vec<u8>> {
         match session.req_header().uri.path() {
-            // Liveness + readiness are the same signal here (see module docs): the gateway is
-            // fail-open, so "can answer" ⇒ "can serve". Both 200 once the process is up.
-            "/livez" | "/readyz" => Self::health(200, "ok"),
+            // Pure liveness: 200/ok whenever the process can answer.
+            "/livez" => Self::health(200, "ok"),
+            // Readiness: always 200 (fail-open — never pull a serving gateway from the LB), but the
+            // body reports `degraded` when the deny-set watcher is disconnected from NATS, so an
+            // operator can alert on stale spend/fraud enforcement without an eviction.
+            "/readyz" => {
+                let health = if self.metrics.nats_connected.get() == 1 {
+                    "ok"
+                } else {
+                    "degraded"
+                };
+                Self::health(200, health)
+            }
             "/metrics" => Self::metrics(),
             _ => Self::health(404, "not_found"),
         }
diff --git a/src/config.rs b/src/config.rs
index b41c5a3..10949f2 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -13,10 +13,13 @@ use std::collections::HashMap;
 use std::path::Path;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
-// `default` so every field is optional. We deliberately do NOT set `deny_unknown_fields`: config is
-// merged from `Env::prefixed("AI_")`, a namespace shared with foreign variables the platform injects
-// (e.g. `AI_AGENT`, `AI_LOG`), so rejecting unknown keys would fail load on a valid environment
-// rather than catch a typo.
+// `default` so every field is optional. We deliberately do NOT set serde's `deny_unknown_fields`:
+// config is merged from `Env::prefixed("AI_")`, a namespace shared with foreign variables the
+// platform injects (e.g. `AI_AGENT`, `AI_LOG`), so rejecting unknown keys at the serde layer would
+// fail load on a valid environment. Typo protection is instead enforced one layer down, against the
+// *TOML file only* (`reject_unknown_toml_keys`): the file is ours alone — not a shared namespace —
+// so an unrecognized key there is unambiguously a mistake, and a silent one (it loads its default
+// and the setting does nothing), worth a hard, visible boot failure.
 #[serde(default)]
 pub struct AiConfig {
     /// Downstream listener for client (app) traffic. Internal-only in production (Service Connect
@@ -75,6 +78,23 @@ pub struct AiConfig {
     pub write_timeout_secs: u64,
     pub idle_timeout_secs: u64,
 
+    /// Graceful-shutdown drain window (seconds): after SIGTERM, how long Pingora lets **in-flight
+    /// requests finish** before tearing the runtimes down. Maps to Pingora's `grace_period_seconds`
+    /// (left unset, Pingora silently defaults to 300s — this knob makes the window explicit and
+    /// tunable without a code change). The tension: a streaming completion can run up to
+    /// `read_timeout_secs` (600s default), so any in-flight stream longer than this window is cut at
+    /// the boundary (client sees a mid-stream TCP close). Set it to cover your p99 stream, not the
+    /// max — covering the max means every deploy waits the full window. **It is also capped by the
+    /// orchestrator**: ECS SIGKILLs at `stopTimeout` (default 30s, max 120s), so a grace larger than
+    /// `stopTimeout` is wasted unless `stopTimeout` is raised to match. Default 120 = the ECS ceiling.
+    pub shutdown_grace_period_secs: u64,
+    /// Final runtime-teardown timeout (seconds) **after** the drain window: how long Pingora waits for
+    /// the tokio runtimes to exit before forcing the process down. Maps to Pingora's
+    /// `graceful_shutdown_timeout_seconds` (unset ⇒ a silent 5s default). A few seconds is enough to
+    /// flush logs/metrics; this is a backstop against a wedged runtime hanging shutdown forever, not a
+    /// second drain window (that's `shutdown_grace_period_secs`).
+    pub shutdown_runtime_timeout_secs: u64,
+
     /// TLS to the upstream provider. Real providers are HTTPS (true); the e2e harness sets false
     /// to talk to a plaintext mock.
     pub upstream_tls: bool,
@@ -135,6 +155,12 @@ impl Default for AiConfig {
             read_timeout_secs: 600,
             write_timeout_secs: 60,
             idle_timeout_secs: 90,
+            // Drain in-flight requests for up to the ECS SIGKILL ceiling (120s) on SIGTERM, then a
+            // short runtime-teardown backstop. Explicit so the window is a documented operational
+            // knob, not Pingora's silent 300s default. See the field docs for the read_timeout /
+            // orchestrator-stopTimeout tradeoffs.
+            shutdown_grace_period_secs: 120,
+            shutdown_runtime_timeout_secs: 10,
             upstream_tls: true,
             // Prefer H2 to providers by default (all of `KNOWN_PROVIDERS` offer it; H1 fallback is
             // automatic). Flip to false for an all-H1 upstream without recompiling.
@@ -154,8 +180,15 @@ impl Default for AiConfig {
 
 impl AiConfig {
     pub fn load_with_path(path: Option<&Path>) -> Result<Self> {
+        let toml_path = path.unwrap_or_else(|| Path::new("config.toml"));
+        // Catch a typo'd key in the operator's own TOML *before* any of it merges — a misspelled
+        // `require_signing_keys` would otherwise load its default and silently drop all managed
+        // billing while the gateway looks healthy. Only the TOML file is checked (see the
+        // `deny_unknown_fields` note on `AiConfig`); the env layer must stay lenient.
+        reject_unknown_toml_keys(toml_path)?;
+
         let mut fig = Figment::from(figment::providers::Serialized::defaults(AiConfig::default()));
-        fig = fig.merge(Toml::file(path.unwrap_or_else(|| Path::new("config.toml"))));
+        fig = fig.merge(Toml::file(toml_path));
         // Flat mapping: `AI_READ_TIMEOUT_SECS` → `read_timeout_secs`. (No `.split('_')` — these are
         // flat fields, not nested tables.) Unknown `AI_*` vars are tolerated (see the
         // `deny_unknown_fields` note on `AiConfig`) — which is also why pool keys are collected
@@ -165,9 +198,31 @@ impl AiConfig {
             .extract()
             .map_err(|e| GatewayError::Config(e.to_string()))?;
         cfg.merge_pool_key_env(std::env::vars());
+        cfg.validate()?;
         Ok(cfg)
     }
 
+    /// Reject nonsensical values that would otherwise fail silently at runtime. A `0` connect/read
+    /// timeout (a typo'd SSM param) becomes a `Duration::from_secs(0)` deadline that fails every
+    /// upstream call immediately — surfacing only as a 502 cascade, not a loud boot failure. Catch it
+    /// here so a mis-deploy fails fast and visibly. Write/idle are not load-bearing for correctness
+    /// (Pingora treats them as best-effort), so they're left unconstrained.
+    fn validate(&self) -> Result<()> {
+        if self.connect_timeout_secs == 0 {
+            return Err(GatewayError::Config(
+                "connect_timeout_secs must be > 0 (a 0 connect timeout fails every upstream connect)"
+                    .to_string(),
+            ));
+        }
+        if self.read_timeout_secs == 0 {
+            return Err(GatewayError::Config(
+                "read_timeout_secs must be > 0 (a 0 read timeout aborts every response before it arrives)"
+                    .to_string(),
+            ));
+        }
+        Ok(())
+    }
+
     /// Fold `AI_POOL_KEY_<NAME>` environment variables into `pool_keys` (provider name lowercased).
     /// This is the production secret path (SSM-injected env); a flat figment merge can't target a
     /// map field, and env must win over any `[pool_keys]` value baked into a config file.
@@ -196,6 +251,50 @@ impl AiConfig {
     }
 }
 
+/// The set of top-level keys a config file may set, derived from `AiConfig` itself by serializing
+/// its defaults — so it tracks the struct automatically and can never drift from the field list.
+fn known_config_keys() -> std::collections::BTreeSet<String> {
+    use figment::Provider as _;
+    figment::providers::Serialized::defaults(AiConfig::default())
+        .data()
+        .map(|profiles| {
+            profiles
+                .into_values()
+                .flat_map(|dict| dict.into_keys())
+                .collect()
+        })
+        .unwrap_or_default()
+}
+
+/// Fail the load if the TOML file at `path` carries any key that isn't an `AiConfig` field. A
+/// missing file is fine (the gateway runs on defaults + env), so an unreadable/absent file yields no
+/// keys and passes. See the `deny_unknown_fields` note on `AiConfig` for why this is scoped to the
+/// TOML file and not the env layer.
+fn reject_unknown_toml_keys(path: &Path) -> Result<()> {
+    use figment::Provider as _;
+    let known = known_config_keys();
+    let unknown: std::collections::BTreeSet<String> = Toml::file(path)
+        .data()
+        .map(|profiles| {
+            profiles
+                .into_values()
+                .flat_map(|dict| dict.into_keys())
+                .filter(|k| !known.contains(k))
+                .collect()
+        })
+        .unwrap_or_default();
+    if unknown.is_empty() {
+        return Ok(());
+    }
+    let unknown: Vec<String> = unknown.into_iter().collect();
+    Err(GatewayError::Config(format!(
+        "unknown key(s) in {}: {} — check for a typo (known keys: {})",
+        path.display(),
+        unknown.join(", "),
+        known.into_iter().collect::<Vec<_>>().join(", "),
+    )))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -214,6 +313,75 @@ mod tests {
         assert_eq!(c.listen, "0.0.0.0:8080");
     }
 
+    #[test]
+    fn validate_rejects_zero_connect_and_read_timeouts() {
+        // A 0 connect/read timeout (a typo'd SSM param) must fail boot loudly, not degrade into a
+        // 502 cascade at runtime.
+        assert!(
+            AiConfig {
+                connect_timeout_secs: 0,
+                ..Default::default()
+            }
+            .validate()
+            .is_err()
+        );
+        assert!(
+            AiConfig {
+                read_timeout_secs: 0,
+                ..Default::default()
+            }
+            .validate()
+            .is_err()
+        );
+        // Defaults are valid.
+        assert!(AiConfig::default().validate().is_ok());
+    }
+
+    /// Write `body` to a uniquely-named temp TOML file (the literal `label` keeps parallel tests
+    /// from colliding) and return its path; the caller removes it.
+    fn temp_toml(label: &str, body: &str) -> std::path::PathBuf {
+        use std::io::Write as _;
+        let path = std::env::temp_dir().join(format!("beyond-ai-cfg-{label}.toml"));
+        let mut f = std::fs::File::create(&path).unwrap();
+        f.write_all(body.as_bytes()).unwrap();
+        path
+    }
+
+    #[test]
+    fn rejects_typod_toml_key() {
+        // A misspelled key in the operator's own TOML is a silent footgun (loads its default, the
+        // setting does nothing) — load must fail loudly and name the offending key, not boot healthy.
+        let path = temp_toml(
+            "typo",
+            "listen = \"0.0.0.0:1234\"\nreqiure_signing_keys = true\n",
+        );
+        let err = AiConfig::load_with_path(Some(&path)).unwrap_err();
+        let _ = std::fs::remove_file(&path);
+        match err {
+            GatewayError::Config(msg) => assert!(
+                msg.contains("reqiure_signing_keys"),
+                "error must name the typo'd key, got: {msg}"
+            ),
+            other => panic!("expected Config error, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn accepts_known_toml_keys() {
+        // Every key here is a real `AiConfig` field (including the `[signing_keys]` table) — load
+        // must succeed and apply the values.
+        let path = temp_toml(
+            "known",
+            "listen = \"0.0.0.0:1234\"\nrequire_signing_keys = true\nrate_limit_rps = 7\n\n[signing_keys]\n1 = \"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\"\n",
+        );
+        let c = AiConfig::load_with_path(Some(&path)).unwrap();
+        let _ = std::fs::remove_file(&path);
+        assert_eq!(c.listen, "0.0.0.0:1234");
+        assert!(c.require_signing_keys);
+        assert_eq!(c.rate_limit_rps, 7);
+        assert!(c.signing_keys.contains_key("1"));
+    }
+
     #[test]
     fn build_keyring_rejects_non_numeric_kid() {
         // `kid` is parsed as `u32`; a non-numeric map key must fail boot (loud) rather than
diff --git a/src/main.rs b/src/main.rs
index 046cc0d..634144f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -133,7 +133,12 @@ fn main() {
 
     // Metrics listener now also serves /livez + /readyz for the ECS/k8s probes. Pingora's built-in
     // prometheus service only does /metrics, so we hand-route all three in one small ServeHttp.
-    let mut admin = ListeningService::new("ai-admin".to_string(), HttpServer::new_app(AdminApp));
+    let mut admin = ListeningService::new(
+        "ai-admin".to_string(),
+        HttpServer::new_app(AdminApp {
+            metrics: state.metrics.clone(),
+        }),
+    );
     admin.add_tcp(&metrics_listen);
     server.add_service(admin);
 
diff --git a/src/metrics.rs b/src/metrics.rs
index 2f669b1..b3ac19e 100644
--- a/src/metrics.rs
+++ b/src/metrics.rs
@@ -176,8 +176,8 @@ pub struct ProviderMetrics {
     pub ttft_seconds: Histogram,
     pub upstream_latency_seconds: Histogram,
     pub connect_retries_total: IntCounter,
-    /// Responses by status class, indexed `[2xx, 3xx, 4xx, 5xx]` (see [`Self::record_response`]).
-    responses: [IntCounter; 4],
+    /// Responses by status class, indexed `[1xx, 2xx, 3xx, 4xx, 5xx]` (see [`Self::record_response`]).
+    responses: [IntCounter; 5],
 }
 
 impl ProviderMetrics {
@@ -189,6 +189,8 @@ impl ProviderMetrics {
             upstream_latency_seconds: m.upstream_latency_seconds.with_label_values(&[provider]),
             connect_retries_total: m.connect_retries_total.with_label_values(&[provider]),
             responses: [
+                m.upstream_responses_total
+                    .with_label_values(&[provider, "1xx"]),
                 m.upstream_responses_total
                     .with_label_values(&[provider, "2xx"]),
                 m.upstream_responses_total
@@ -201,13 +203,17 @@ impl ProviderMetrics {
         }
     }
 
-    /// Count one upstream response, bucketed by status class (`2xx`/`3xx`/`4xx`/`5xx`).
+    /// Count one upstream response, bucketed by status class (`1xx`/`2xx`/`3xx`/`4xx`/`5xx`).
+    /// A `1xx` (e.g. `100 Continue`, `101 Switching Protocols`) gets its own bucket rather than
+    /// falling through to `5xx` — providers don't normally emit it, but a misbucketed informational
+    /// status would otherwise read as a phantom upstream-error spike on the dashboard.
     pub fn record_response(&self, status: u16) {
         let idx = match status {
-            200..=299 => 0,
-            300..=399 => 1,
-            400..=499 => 2,
-            _ => 3,
+            100..=199 => 0,
+            200..=299 => 1,
+            300..=399 => 2,
+            400..=499 => 3,
+            _ => 4,
         };
         self.responses[idx].inc();
     }
@@ -222,7 +228,31 @@ impl ProviderMetrics {
             ttft_seconds: hist(),
             upstream_latency_seconds: hist(),
             connect_retries_total: counter(),
-            responses: [counter(), counter(), counter(), counter()],
+            responses: [counter(), counter(), counter(), counter(), counter()],
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn record_response_buckets_by_status_class() {
+        // Lock the index mapping: a 1xx must land in its own bucket, never the 5xx fallback (which
+        // would read as a phantom upstream-error spike on the provider dashboard).
+        let pm = ProviderMetrics::disconnected();
+        pm.record_response(100); // 1xx
+        pm.record_response(204); // 2xx
+        pm.record_response(301); // 3xx
+        pm.record_response(404); // 4xx
+        pm.record_response(503); // 5xx
+        for (idx, status) in [100u16, 204, 301, 404, 503].iter().enumerate() {
+            assert_eq!(
+                pm.responses[idx].get(),
+                1,
+                "status {status} landed in the wrong class bucket"
+            );
         }
     }
 }
diff --git a/src/proxy.rs b/src/proxy.rs
index efbe57b..db31468 100644
--- a/src/proxy.rs
+++ b/src/proxy.rs
@@ -327,13 +327,13 @@ impl ProxyHttp for AiProxy {
         }
 
         // 4. Reject oversized bodies up front (Content-Length) so we never buffer a huge upload.
-        if let Some(len) = session
+        let declared_len = session
             .req_header()
             .headers
             .get("content-length")
             .and_then(|v| v.to_str().ok())
-            .and_then(|v| v.parse::<usize>().ok())
-        {
+            .and_then(|v| v.parse::<usize>().ok());
+        if let Some(len) = declared_len {
             if len > MAX_REQUEST_BODY {
                 return Self::reject(
                     session,
@@ -367,9 +367,15 @@ impl ProxyHttp for AiProxy {
             };
             // Deny-set: O(1), default-allow. The gateway never learns *why*, only the reason code.
             if let Some(reason) = self.state.deny.load().reason(identity.tenant_id) {
+                // Distinct label per reason — `Unknown` is *not* folded into `deny_fraud`. An
+                // `Unknown` arises when the control plane writes a reason string this gateway
+                // doesn't recognize (a control-plane deploy ahead of a gateway deploy), which would
+                // otherwise spike the fraud counter and mask the real fraud signal. A `deny_unknown`
+                // label surfaces it as the deployment-coordination issue it is.
                 let label = match reason {
                     crate::deny::DenyReason::Spend => "deny_spend",
-                    _ => "deny_fraud",
+                    crate::deny::DenyReason::Fraud => "deny_fraud",
+                    crate::deny::DenyReason::Unknown => "deny_unknown",
                 };
                 self.state
                     .metrics
@@ -421,7 +427,17 @@ impl ProxyHttp for AiProxy {
             resp_model_scanner: peek::ModelScanner::new(),
             streaming: false,
             inject_eligible,
-            req_buf: Vec::new(),
+            // Only the inject-eligible path ever buffers the request body (to splice
+            // `stream_options` after the root `{`; the `stream` key can appear anywhere in the root
+            // object, so the decision needs the whole body — buffering is inherent here, not
+            // incidental). When it does, pre-size from the declared Content-Length so accumulation is
+            // a single allocation instead of a geometric realloc chain; capped at `MAX_REQUEST_BODY`
+            // so a lying header can't pre-allocate unbounded memory. Every other request leaves this
+            // empty and never buffers.
+            req_buf: match (inject_eligible, declared_len) {
+                (true, Some(len)) => Vec::with_capacity(len.min(MAX_REQUEST_BODY)),
+                _ => Vec::new(),
+            },
             // Grown lazily by the response tap (`response_body_filter`), not pre-reserved: a
             // non-streaming response — the common case — is a few hundred bytes, so reserving the
             // full 64KB cap up front would waste an allocation on every request to hold ~200B. A
diff --git a/src/usage.rs b/src/usage.rs
index 958846a..22443a0 100644
--- a/src/usage.rs
+++ b/src/usage.rs
@@ -95,7 +95,10 @@ pub fn anthropic_body(body: &[u8]) -> Option<Usage> {
 fn sse_data_lines(sse: &[u8]) -> impl Iterator<Item = &[u8]> + '_ {
     sse.split(|&b| b == b'\n').filter_map(|line| {
         let line = line.strip_prefix(b"data:")?;
-        let line = line.strip_prefix(b" ").unwrap_or(line);
+        // Trim *all* leading whitespace (laxer than SSE's strip-one-space rule) — OpenAI/Anthropic
+        // emit `data: ` (one space), but a config-added OpenAI-wire provider that pads with more
+        // would otherwise leave whitespace in the payload and fail the JSON parse → silent zero usage.
+        let line = line.trim_ascii_start();
         (line != b"[DONE]").then_some(line)
     })
 }
@@ -241,6 +244,23 @@ mod tests {
         );
     }
 
+    #[test]
+    fn tolerates_extra_leading_spaces_after_data_colon() {
+        // The parser trims all leading spaces (the SSE spec strips only one). A provider padding
+        // `data:   {…}` must still parse — the alternative is a silent zero-usage row for that request.
+        let sse =
+            b"data:   {\"choices\":[],\"usage\":{\"prompt_tokens\":3,\"completion_tokens\":7}}\n\n";
+        assert_eq!(
+            openai_stream(sse).unwrap(),
+            Usage {
+                input_tokens: 3,
+                output_tokens: 7,
+                cache_read_tokens: 0,
+                cache_write_tokens: 0
+            }
+        );
+    }
+
     #[test]
     fn no_usage_returns_none() {
         assert!(openai_stream(b"data: {\"choices\":[]}\n\n").is_none());

From 1f1e67d780f21cb31ef1f54ef7875ce091b700aa Mon Sep 17 00:00:00 2001
From: Jared Lunde <jared.lunde@gmail.com>
Date: Sun, 31 May 2026 16:51:38 -0700
Subject: [PATCH 7/7] fixes

---
 ARCHITECTURE.md        | 151 ++++---
 config.example.toml    |  24 ++
 src/circuit_breaker.rs | 935 +++++++++++++++++++++++++++++++++++++++++
 src/config.rs          |  79 +++-
 src/lib.rs             |   1 +
 src/main.rs            |  24 +-
 src/proxy.rs           |  57 ++-
 src/route.rs           |   9 +
 src/state.rs           |  11 +
 src/store_watch.rs     |   5 +-
 tests/common/mod.rs    |  58 ++-
 tests/e2e.rs           |  81 ++++
 12 files changed, 1356 insertions(+), 79 deletions(-)
 create mode 100644 src/circuit_breaker.rs

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 142a15a..8f9b40d 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -48,6 +48,8 @@ Client (stock OpenAI/Anthropic SDK)
   │       │                                    │
   │       │           pool key required ───────────────────────── 503
   │       └─ BYO: pass through (no verify, no deny-set, no billing)
+  │  └─ Circuit breaker (per provider, all traffic): if OPEN ─────► 503
+  │       (claims a half-open probe permit only on an actual attempt)
   │
   ▼  upstream_peer (proxy.rs)
   │  TTL-cached DNS resolve (60s) → HttpPeer (TLS, H2 pref, timeouts)
@@ -79,6 +81,7 @@ Client (stock OpenAI/Anthropic SDK)
   ▼  logging (proxy.rs)
      Parse usage from tail (by dialect + streaming flag)
      Emit ai.usage fact: tenant, vpc, model, requested_model, token counts (managed only)
+     Record circuit-breaker outcome (once): 5xx / connect-fail → failure; else → success (429 incl.)
      Decrement requests_in_flight gauge
 ```
 
@@ -193,6 +196,30 @@ before any upstream connection and cannot be forged.
 Both tiers are generous circuit breakers, not quotas. `rate_limit_rps = 0` / `byo_rate_limit_rps =
 0` disable them independently.
 
+### Circuit Breaker (`circuit_breaker.rs`)
+
+A per-provider, lock-free circuit breaker (single packed `AtomicU64`; windowed failure policy) sits
+on the upstream path. It protects against a **broken provider**, which is a different failure than
+the rate guardrails (which protect against abusive _inbound_ load):
+
+- **Failure = the provider is broken** — a `5xx` response or a connect failure. After
+  `circuit_breaker_threshold` failures within `circuit_breaker_window_secs`, the breaker **opens**:
+  requests to that provider fast-fail with `503` (`ai_rejections_total{reason="circuit_open"}`)
+  instead of piling up against `read_timeout_secs` and exhausting connection / in-flight slots for
+  _every_ provider (head-of-line blocking by one sick dependency). After `circuit_breaker_reset_secs`
+  it half-opens and admits a probe; success closes it, failure reopens it.
+- **A `429` is NOT a failure.** It means the provider is healthy and throttling our pool key — a
+  velocity/spend signal the rate limiter and the client's `Retry-After` backoff own. Tripping on it
+  would convert a self-healing throttle into a self-inflicted outage. The breaker records any response
+  that _arrived_ (2xx/3xx/4xx incl. 429) as a **success**; only 5xx and transport failures count
+  against it.
+- **Applies to all traffic** (managed + BYO) — a down provider is down regardless of whose key is
+  used. One breaker per provider, built at boot, shared lock-free across callers.
+- The `allow()` check is the **last** thing in `request_filter` (after every other rejection), so a
+  scarce half-open probe permit is only claimed for a request that will actually attempt the upstream;
+  the outcome is recorded exactly once in `logging`, so a permit can never leak.
+- `circuit_breaker_threshold = 0` disables it.
+
 ---
 
 ## Why It Behaves This Way
@@ -282,40 +309,47 @@ All fields configurable via `config.example.toml` and environment (`AI_` prefix,
 Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret<T>` — stray `Debug` or
 `Serialize` output redacts to `"***"` and the value is zeroized on drop (`secret.rs`).
 
-| Field                         | Default                           | Runtime Effect                                                                                                                                                                                         |
-| ----------------------------- | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `signing_keys`                | _(required)_                      | Map of kid → base64 Ed25519 public key. Multiple kids enable rotation. Missing → all traffic falls through to BYO treatment.                                                                           |
-| `require_signing_keys`        | `false`                           | When `true`, an empty `signing_keys` is a hard boot failure instead of silent BYO-only mode. Set on managed deployments so a typo'd/absent SSM param fails fast rather than silently serving for free. |
-| `pool_keys.<name>`            | _(from `AI_POOL_KEY_<NAME>` env)_ | Real provider API key. Missing for a provider → managed requests to that provider return 503 before any upstream connection.                                                                           |
-| `provider_authorities.<name>` | _(none)_                          | Override or add a provider's `authority` (host:port). Enables config-added providers beyond `KNOWN_PROVIDERS` with zero code change.                                                                   |
-| `snapshot_path`               | _(unset)_                         | Path for the on-disk deny-set cache. Unset → re-scan NATS on every cold boot. Set → load from disk and enforce before NATS reconnects (edge/tunnel deployments).                                       |
-| `rate_limit_rps`              | `100`                             | Per-credential request ceiling (count-min, keyed on raw key hash). `0` disables. Exceeded → 429. Checked before Ed25519 verify.                                                                        |
-| `byo_rate_limit_rps`          | `1000`                            | Aggregate ceiling for all BYO traffic (single shared bucket). `0` disables. Managed traffic exempt. Exceeded → 429.                                                                                    |
-| `connect_timeout_secs`        | `10`                              | TCP connect timeout to the upstream provider. Exceeded → retry up to 2×, then 502.                                                                                                                     |
-| `read_timeout_secs`           | `600`                             | Response read timeout (10 min accommodates long-running LLM streams).                                                                                                                                  |
-| `write_timeout_secs`          | `60`                              | Upstream request-write timeout (sending the request to the provider).                                                                                                                                  |
-| `idle_timeout_secs`           | `90`                              | Idle timeout on a pooled upstream connection before it's closed.                                                                                                                                       |
-| `nats_url`                    | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (deny-set stays empty or stale).                                                                                                         |
-| `nats_creds`                  | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                                                       |
-| `listen_addr`                 | `0.0.0.0:8080`                    | Proxy listener address (client traffic).                                                                                                                                                               |
-| `metrics_listen`              | `0.0.0.0:9090`                    | Internal admin/observability listener: `/metrics` (Prometheus scrape), `/livez`, `/readyz`. Separate from the client listener — not externally reachable.                                              |
+| Field                           | Default                           | Runtime Effect                                                                                                                                                                                         |
+| ------------------------------- | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `signing_keys`                  | _(required)_                      | Map of kid → base64 Ed25519 public key. Multiple kids enable rotation. Missing → all traffic falls through to BYO treatment.                                                                           |
+| `require_signing_keys`          | `false`                           | When `true`, an empty `signing_keys` is a hard boot failure instead of silent BYO-only mode. Set on managed deployments so a typo'd/absent SSM param fails fast rather than silently serving for free. |
+| `pool_keys.<name>`              | _(from `AI_POOL_KEY_<NAME>` env)_ | Real provider API key. Missing for a provider → managed requests to that provider return 503 before any upstream connection.                                                                           |
+| `provider_authorities.<name>`   | _(none)_                          | Override or add a provider's `authority` (host:port). Enables config-added providers beyond `KNOWN_PROVIDERS` with zero code change.                                                                   |
+| `snapshot_path`                 | _(unset)_                         | Path for the on-disk deny-set cache. Unset → re-scan NATS on every cold boot. Set → load from disk and enforce before NATS reconnects (edge/tunnel deployments).                                       |
+| `rate_limit_rps`                | `100`                             | Per-credential request ceiling (count-min, keyed on raw key hash). `0` disables. Exceeded → 429. Checked before Ed25519 verify.                                                                        |
+| `byo_rate_limit_rps`            | `1000`                            | Aggregate ceiling for all BYO traffic (single shared bucket). `0` disables. Managed traffic exempt. Exceeded → 429.                                                                                    |
+| `circuit_breaker_threshold`     | `20`                              | Per-provider upstream failures (5xx / connect; **not** 429) within the window before the breaker opens. While open, requests to that provider fast-fail with 503. `0` disables.                        |
+| `circuit_breaker_window_secs`   | `10`                              | Rolling window over which failures are counted (trips on a burst, not a slow trickle).                                                                                                                 |
+| `circuit_breaker_reset_secs`    | `30`                              | How long the breaker stays open before admitting a half-open probe. Probe success closes it; failure reopens it.                                                                                       |
+| `connect_timeout_secs`          | `10`                              | TCP connect timeout to the upstream provider. Exceeded → retry up to 2×, then 502.                                                                                                                     |
+| `read_timeout_secs`             | `600`                             | Response read timeout (10 min accommodates long-running LLM streams).                                                                                                                                  |
+| `write_timeout_secs`            | `60`                              | Upstream request-write timeout (sending the request to the provider).                                                                                                                                  |
+| `idle_timeout_secs`             | `90`                              | Idle timeout on a pooled upstream connection before it's closed.                                                                                                                                       |
+| `shutdown_grace_period_secs`    | `600`                             | SIGTERM drain window for in-flight requests (= `read_timeout_secs` so a deploy never truncates a stream). Capped by the orchestrator's stop timeout (ECS Fargate: 120s).                               |
+| `shutdown_runtime_timeout_secs` | `10`                              | Final runtime-teardown backstop after the drain window.                                                                                                                                                |
+| `nats_url`                      | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (deny-set stays empty or stale).                                                                                                         |
+| `nats_creds`                    | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                                                       |
+| `listen`                        | `0.0.0.0:8080`                    | Proxy listener address (client traffic).                                                                                                                                                               |
+| `metrics_listen`                | `0.0.0.0:9090`                    | Internal admin/observability listener: `/metrics` (Prometheus scrape), `/livez`, `/readyz`. Separate from the client listener — not externally reachable.                                              |
 
 ---
 
 ## Failure Modes
 
-| Failure                                     | What Actually Happens                                                                                                          | Recovery                                                                                                        |
-| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------- |
-| NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                                        | Watcher reconnects; seeds from NATS or disk snapshot on connect.                                                |
-| NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                                                | Watcher reconnects (1s→30s exponential backoff, reset on success) and resumes from saved revision — no re-scan. |
-| NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                                        | After re-scan, new cursor set; delta watch resumes normally.                                                    |
-| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event. No error reveals which part failed.                   | Billing miss detectable downstream; no security boundary breach.                                                |
-| `signing_keys` absent (typo'd/missing SSM)  | Default: warn + BYO-only (silently drops all managed billing + deny-set). With `require_signing_keys=true`: hard boot failure. | Set `require_signing_keys=true` on managed deployments so the mis-deploy fails fast and visibly at boot.        |
-| Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                                    | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                                      |
-| Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                                                 | TTL-cached DNS (60s) serves stale; poisoned-lock guard re-resolves on next request.                             |
-| Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502.                                                                          | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                                    |
-| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail.                    | No action — SSE usage is always in the final `data:` line, which always lands in the tail.                      |
-| Gateway crash mid-request                   | In-flight request drops; client receives TCP close. No partial state written.                                                  | Client SDK retries. No DB writes in the request path — no cleanup needed.                                       |
+| Failure                                     | What Actually Happens                                                                                                                                                          | Recovery                                                                                                                                                                  |
+| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                                                                                        | Watcher reconnects; seeds from NATS or disk snapshot on connect.                                                                                                          |
+| NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                                                                                                | Watcher reconnects (1s→30s exponential backoff, reset on success) and resumes from saved revision — no re-scan.                                                           |
+| NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                                                                                        | After re-scan, new cursor set; delta watch resumes normally.                                                                                                              |
+| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event. No error reveals which part failed.                                                                   | Billing miss detectable downstream; no security boundary breach.                                                                                                          |
+| `signing_keys` absent (typo'd/missing SSM)  | Default: warn + BYO-only (silently drops all managed billing + deny-set). With `require_signing_keys=true`: hard boot failure.                                                 | Set `require_signing_keys=true` on managed deployments so the mis-deploy fails fast and visibly at boot.                                                                  |
+| Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                                                                                    | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                                                                                                |
+| Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                                                                                                 | TTL-cached DNS (60s) serves stale; poisoned-lock guard re-resolves on next request.                                                                                       |
+| Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502. Counts as a circuit-breaker failure.                                                                                     | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                                                                                              |
+| Provider brownout (sustained 5xx)           | After `circuit_breaker_threshold` 5xx/connect failures in the window, the breaker opens; requests fast-fail 503 (`circuit_open`) instead of stalling against the read timeout. | Auto: after `circuit_breaker_reset_secs` a half-open probe is admitted — success closes the breaker, failure reopens it. Per-provider, so other providers are unaffected. |
+| Provider throttles (429 storm)              | Relayed to the client as 429; the client's `Retry-After` backoff applies. Does **not** trip the breaker (provider is healthy).                                                 | Backpressure via client + the rate guardrails; no gateway-side circuit action.                                                                                            |
+| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail.                                                                    | No action — SSE usage is always in the final `data:` line, which always lands in the tail.                                                                                |
+| Gateway crash mid-request                   | In-flight request drops; client receives TCP close. No partial state written.                                                                                                  | Client SDK retries. No DB writes in the request path — no cleanup needed.                                                                                                 |
 
 ---
 
@@ -323,40 +357,41 @@ Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret<T>` — st
 
 Prometheus on the default registry, exposed at `/metrics` on `metrics_listen`.
 
-| Metric                        | Type      | Labels               | What It Measures                                                         |
-| ----------------------------- | --------- | -------------------- | ------------------------------------------------------------------------ |
-| `ai_requests_total`           | Counter   | —                    | Total admitted requests                                                  |
-| `ai_rejections_total`         | Counter   | `reason`             | Rejected requests by cause (auth, deny_spend, deny_fraud, rate_limit, …) |
-| `ai_upstream_responses_total` | Counter   | `provider`, `status` | Upstream responses by provider and status class                          |
-| `ai_tokens_total`             | Counter   | `kind`               | input / output / cache_read / cache_write token counts                   |
-| `ai_ttft_seconds`             | Histogram | `provider`           | Time to first token (50ms–30s buckets)                                   |
-| `ai_upstream_latency_seconds` | Histogram | `provider`           | Full request latency (100ms–600s buckets)                                |
-| `ai_active_streams`           | Gauge     | —                    | Open SSE streams                                                         |
-| `ai_requests_in_flight`       | Gauge     | —                    | All in-flight requests (streaming + non-streaming)                       |
-| `ai_deny_set_size`            | Gauge     | —                    | Current number of denied tenants                                         |
-| `ai_nats_connected`           | Gauge     | —                    | 1 if NATS watcher is connected, 0 otherwise                              |
+| Metric                        | Type      | Labels               | What It Measures                                                                       |
+| ----------------------------- | --------- | -------------------- | -------------------------------------------------------------------------------------- |
+| `ai_requests_total`           | Counter   | —                    | Total admitted requests                                                                |
+| `ai_rejections_total`         | Counter   | `reason`             | Rejected requests by cause (auth, deny_spend, deny_fraud, rate_limit, circuit_open, …) |
+| `ai_upstream_responses_total` | Counter   | `provider`, `status` | Upstream responses by provider and status class                                        |
+| `ai_tokens_total`             | Counter   | `kind`               | input / output / cache_read / cache_write token counts                                 |
+| `ai_ttft_seconds`             | Histogram | `provider`           | Time to first token (50ms–30s buckets)                                                 |
+| `ai_upstream_latency_seconds` | Histogram | `provider`           | Full request latency (100ms–600s buckets)                                              |
+| `ai_active_streams`           | Gauge     | —                    | Open SSE streams                                                                       |
+| `ai_requests_in_flight`       | Gauge     | —                    | All in-flight requests (streaming + non-streaming)                                     |
+| `ai_deny_set_size`            | Gauge     | —                    | Current number of denied tenants                                                       |
+| `ai_nats_connected`           | Gauge     | —                    | 1 if NATS watcher is connected, 0 otherwise                                            |
 
 ---
 
 ## Modules
 
-| Module        | Role                                                                                        | Tested    |
-| ------------- | ------------------------------------------------------------------------------------------- | --------- |
-| `proxy`       | `ProxyHttp` impl — request/response pipeline (request_filter through logging)               | e2e ✓     |
-| `key`         | `bai_v1` parse + Ed25519 verify + mint; keyring with multi-kid rotation support             | unit ✓    |
-| `route`       | Data-driven provider table (name / authority / auth) + dialect default routing              | unit ✓    |
-| `peek`        | `ModelScanner` — streaming structural scan for the root-level `model`; O(1) memory          | unit ✓    |
-| `usage`       | Token extraction (OpenAI / Anthropic, body + SSE)                                           | unit ✓    |
-| `deny`        | Sparse deny-set, default-allow, reason → HTTP status                                        | unit ✓    |
-| `ratelimit`   | Two-tier guardrail: per-credential + global BYO (count-min sketches, fixed memory, no GC)   | unit ✓    |
-| `state`       | Keyring + resolved provider registry + watched deny-set (ArcSwap) + TTL DNS cache           | unit ✓    |
-| `store_watch` | NATS watcher — gap-free deny-set seeding + delta watch as Pingora `BackgroundService`       | e2e ✓     |
-| `config`      | Figment config; build keyring; pool keys / authorities by provider name                     | unit ✓    |
-| `secret`      | Redacting, zeroize-on-drop `Secret<T>` newtype for pool keys and NATS creds                 | unit ✓    |
-| `admin`       | `ServeHttp` on the metrics listener: `/livez`, `/readyz`, `/metrics`                        | e2e ✓     |
-| `metrics`     | Prometheus counter/histogram/gauge registration and update helpers                          | compile ✓ |
-| `doctor`      | Boot-time diagnostics (`beyond-ai doctor`)                                                  | compile ✓ |
-| `main`        | CLI (`run` / `doctor`), rustls init, config load, Pingora server + three services bootstrap | compile ✓ |
+| Module            | Role                                                                                                 | Tested         |
+| ----------------- | ---------------------------------------------------------------------------------------------------- | -------------- |
+| `proxy`           | `ProxyHttp` impl — request/response pipeline (request_filter through logging)                        | e2e ✓          |
+| `key`             | `bai_v1` parse + Ed25519 verify + mint; keyring with multi-kid rotation support                      | unit ✓         |
+| `route`           | Data-driven provider table (name / authority / auth) + dialect default routing                       | unit ✓         |
+| `peek`            | `ModelScanner` — streaming structural scan for the root-level `model`; O(1) memory                   | unit ✓         |
+| `usage`           | Token extraction (OpenAI / Anthropic, body + SSE)                                                    | unit ✓         |
+| `deny`            | Sparse deny-set, default-allow, reason → HTTP status                                                 | unit ✓         |
+| `ratelimit`       | Two-tier guardrail: per-credential + global BYO (count-min sketches, fixed memory, no GC)            | unit ✓         |
+| `circuit_breaker` | Per-provider lock-free breaker (packed `AtomicU64`, windowed policy) — trips on 5xx/connect, not 429 | unit ✓ + e2e ✓ |
+| `state`           | Keyring + resolved provider registry + watched deny-set (ArcSwap) + TTL DNS cache                    | unit ✓         |
+| `store_watch`     | NATS watcher — gap-free deny-set seeding + delta watch as Pingora `BackgroundService`                | e2e ✓          |
+| `config`          | Figment config; build keyring; pool keys / authorities by provider name                              | unit ✓         |
+| `secret`          | Redacting, zeroize-on-drop `Secret<T>` newtype for pool keys and NATS creds                          | unit ✓         |
+| `admin`           | `ServeHttp` on the metrics listener: `/livez`, `/readyz`, `/metrics`                                 | e2e ✓          |
+| `metrics`         | Prometheus counter/histogram/gauge registration and update helpers                                   | compile ✓      |
+| `doctor`          | Boot-time diagnostics (`beyond-ai doctor`)                                                           | compile ✓      |
+| `main`            | CLI (`run` / `doctor`), rustls init, config load, Pingora server + three services bootstrap          | compile ✓      |
 
 ---
 
diff --git a/config.example.toml b/config.example.toml
index 566534a..ed7f7af 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -24,6 +24,18 @@ read_timeout_secs = 600
 write_timeout_secs = 60
 idle_timeout_secs = 90
 
+# Graceful shutdown. On SIGTERM, in-flight requests drain for up to grace_period_secs before the
+# runtimes are torn down (then runtime_timeout_secs is the final teardown backstop).
+# Default = read_timeout_secs so a deploy NEVER truncates an in-flight stream — the gateway is a
+# transparent proxy and must not mangle a paid-for generation (a half-delivered SSE can't be cleanly
+# retried). Pingora stops accepting new connections at SIGTERM, so this only waits out the longest
+# existing stream, not new work; slower rollouts are the deliberate price.
+# The orchestrator must grant the same window or it caps us (it SIGKILLs at its own stop timeout):
+# set k8s terminationGracePeriodSeconds (or the EC2 agent's ECS_CONTAINER_STOP_TIMEOUT) to match.
+# NOTE: ECS Fargate caps stopTimeout at 120s — there, streams past 120s are still cut (a Fargate limit).
+shutdown_grace_period_secs = 600
+shutdown_runtime_timeout_secs = 10
+
 # upstream_tls = true   # set false only for a plaintext mock (tests)
 
 # Per-credential request-rate ceiling (requests/sec) — a blast-radius circuit breaker, not a spend
@@ -33,6 +45,18 @@ idle_timeout_secs = 90
 # set 0 to disable. Tune from the `ai_rejections_total{reason="rate_limit"}` metric.
 rate_limit_rps = 100
 
+# Per-provider circuit breaker. Trips when `circuit_breaker_threshold` upstream FAILURES occur within
+# `circuit_breaker_window_secs`; while open, requests to that provider fast-fail with 503
+# (ai_rejections_total{reason="circuit_open"}) instead of piling up against read_timeout_secs and
+# exhausting connection/in-flight slots for every provider. After `circuit_breaker_reset_secs` a probe
+# is allowed — success closes it, failure reopens it. A FAILURE is a 5xx response or a connect failure
+# (the provider is broken); a 429 is NOT a failure (a healthy provider throttling our pool key — the
+# rate limiter and the client's Retry-After own that). Applies to all traffic (managed + BYO). Set
+# threshold 0 to disable. Defaults are generous so normal background 5xx noise never trips it.
+circuit_breaker_threshold = 20
+circuit_breaker_window_secs = 10
+circuit_breaker_reset_secs = 30
+
 # Aggregate request-rate ceiling (requests/sec) for ALL BYO traffic combined — one shared bucket.
 # BYO is unverified and upstream-bound: a flood of *distinct* random BYO tokens slips past the
 # per-credential ceiling and would open junk-auth connections to providers from our egress IPs,
diff --git a/src/circuit_breaker.rs b/src/circuit_breaker.rs
new file mode 100644
index 0000000..af8c1e1
--- /dev/null
+++ b/src/circuit_breaker.rs
@@ -0,0 +1,935 @@
+//! Lock-free circuit breaker for protecting external service calls.
+//!
+//! State transitions are designed to be race-free through:
+//! 1. One packed atomic word for breaker state (windowed mode adds a window-start atomic)
+//! 2. Compare-and-swap loops for all state transitions
+//! 3. Second-resolution timestamps from a pluggable clock (default: wall clock, not monotonic)
+//!
+//! # States
+//!
+//! ```text
+//!                 failure_threshold reached
+//!     ┌─────────┐ ──────────────────────────► ┌────────┐
+//!     │ Closed  │                             │  Open  │
+//!     └─────────┘ ◄────────────────────────── └────────┘
+//!          ▲        success in half-open           │
+//!          │                                       │ reset_timeout elapsed
+//!          │        ┌─────────────┐                │
+//!          └─────── │  Half-Open  │ ◄──────────────┘
+//!            success└─────────────┘
+//!                         │
+//!                         │ failure
+//!                         ▼
+//!                    back to Open
+//! ```
+//!
+//! # Failure Policies
+//!
+//! Two failure detection policies are supported:
+//!
+//! - **Consecutive**: Opens after N failures in a row. Any success resets the count.
+//!   Good for detecting complete backend failures.
+//!
+//! - **Windowed**: Opens after N failures within a time window. Failures outside
+//!   the window are forgotten. Good for detecting degraded backends with partial failures.
+//!
+//! # Example
+//!
+//! ```rust
+//! use beyond_ai::circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, FailurePolicy};
+//! use std::time::Duration;
+//!
+//! // Consecutive failures (default)
+//! let cb = CircuitBreaker::new(CircuitBreakerConfig::default());
+//!
+//! // Windowed failures (better for edge proxies)
+//! let cb = CircuitBreaker::new(
+//!     CircuitBreakerConfig::windowed(3, Duration::from_secs(10))
+//!         .reset_timeout(Duration::from_secs(30))
+//! );
+//!
+//! // Before calling external service
+//! if cb.allow().is_err() {
+//!     // return Err("service temporarily unavailable");
+//! }
+//!
+//! // match call_external_service().await {
+//! //     Ok(result) => {
+//! //         cb.record_success();
+//! //         Ok(result)
+//! //     }
+//! //     Err(e) if is_connectivity_error(&e) => {
+//! //         cb.record_failure();
+//! //         Err(e)
+//! //     }
+//! //     Err(e) => Err(e), // Don't count business logic errors
+//! // }
+//! ```
+
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::Duration;
+
+/// How failures are counted before opening the circuit.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum FailurePolicy {
+    /// N consecutive failures opens the circuit. Any success resets the count.
+    Consecutive {
+        /// Number of consecutive failures before opening (packed counter: 14 bits).
+        threshold: u32,
+    },
+    /// N failures within the window opens the circuit. Failures older than the window are
+    /// forgotten; `record_success` in the closed state also zeroes the count and the window.
+    Windowed {
+        /// Number of failures within the window before opening (packed counter: 14 bits).
+        threshold: u32,
+        /// Time window for counting failures; consumed as whole seconds (`Duration::as_secs`).
+        window: Duration,
+    },
+}
+
+impl Default for FailurePolicy {
+    fn default() -> Self {
+        FailurePolicy::Consecutive { threshold: 5 }
+    }
+}
+
+/// Circuit breaker configuration.
+#[derive(Debug, Clone)]
+pub struct CircuitBreakerConfig {
+    /// How failures are counted.
+    pub failure_policy: FailurePolicy,
+    /// Time spent open before half-open; compared in whole seconds (sub-second part is dropped).
+    pub reset_timeout: Duration,
+    /// Number of probe requests allowed in half-open state (packed into a 16-bit field).
+    pub half_open_permits: u32,
+}
+
+impl Default for CircuitBreakerConfig {
+    fn default() -> Self {
+        Self {
+            failure_policy: FailurePolicy::default(),
+            reset_timeout: Duration::from_secs(30),
+            half_open_permits: 3,
+        }
+    }
+}
+
+impl CircuitBreakerConfig {
+    /// Build a config using `FailurePolicy::Consecutive`; other fields take their defaults.
+    pub fn consecutive(threshold: u32) -> Self {
+        Self {
+            failure_policy: FailurePolicy::Consecutive { threshold },
+            ..Default::default()
+        }
+    }
+
+    /// Build a config using `FailurePolicy::Windowed`; other fields take their defaults.
+    pub fn windowed(threshold: u32, window: Duration) -> Self {
+        Self {
+            failure_policy: FailurePolicy::Windowed { threshold, window },
+            ..Default::default()
+        }
+    }
+
+    /// Builder-style setter for the reset timeout (time in open state before half-open).
+    pub fn reset_timeout(mut self, timeout: Duration) -> Self {
+        self.reset_timeout = timeout;
+        self
+    }
+
+    /// Builder-style setter for the number of probes admitted while half-open.
+    pub fn half_open_permits(mut self, permits: u32) -> Self {
+        self.half_open_permits = permits;
+        self
+    }
+
+    /// Threshold of the configured policy. NOTE(review): marked dead_code - confirm still needed.
+    #[allow(dead_code)]
+    fn threshold(&self) -> u32 {
+        match &self.failure_policy {
+            FailurePolicy::Consecutive { threshold } => *threshold,
+            FailurePolicy::Windowed { threshold, .. } => *threshold,
+        }
+    }
+}
+
+/// Lock-free circuit breaker.
+///
+/// All state is packed into a single 64-bit atomic:
+/// - Bits 62-63: State (0=closed, 1=open, 2=half-open)
+/// - Bits 48-61: Failure count (14 bits, max 16383)
+/// - Bits 32-47: Half-open permits remaining (16 bits)
+/// - Bits 0-31: Timestamp of last state change (seconds since epoch, wraps every 136 years)
+///
+/// For windowed mode, a second atomic tracks the window start timestamp.
+///
+/// This packing ensures all state transitions are atomic via single CAS operations.
+pub struct CircuitBreaker {
+    /// Packed state word.
+    state: AtomicU64,
+    /// Window start timestamp (only used in windowed mode).
+    /// Stores seconds since epoch when the first failure in the current window occurred.
+    /// 0 means no active window.
+    window_start: AtomicU64,
+    /// Configuration (immutable after construction).
+    config: CircuitBreakerConfig,
+    /// Clock function for getting current time in seconds.
+    clock: fn() -> u64,
+}
+
+impl std::fmt::Debug for CircuitBreaker {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CircuitBreaker")
+            .field("state", &self.state)
+            .field("window_start", &self.window_start)
+            .field("config", &self.config)
+            .finish_non_exhaustive()
+    }
+}
+
+// State encoding constants
+const STATE_CLOSED: u64 = 0;
+const STATE_OPEN: u64 = 1;
+const STATE_HALF_OPEN: u64 = 2;
+
+const STATE_SHIFT: u32 = 62;
+const STATE_MASK: u64 = 0b11;
+
+const FAILURE_SHIFT: u32 = 48;
+const FAILURE_MASK: u64 = 0x3FFF; // 14 bits
+
+const PERMIT_SHIFT: u32 = 32;
+const PERMIT_MASK: u64 = 0xFFFF; // 16 bits
+
+const TIMESTAMP_MASK: u64 = 0xFFFF_FFFF; // 32 bits
+
+impl CircuitBreaker {
+    /// Create a new circuit breaker with the given configuration.
+    pub fn new(config: CircuitBreakerConfig) -> Self {
+        Self::with_clock(config, Self::system_clock)
+    }
+
+    /// Create a closed breaker with a custom clock (for testing); stamps keep its low 32 bits.
+    pub fn with_clock(config: CircuitBreakerConfig, clock: fn() -> u64) -> Self {
+        let initial = Self::pack(STATE_CLOSED, 0, 0, clock());
+        Self {
+            state: AtomicU64::new(initial),
+            window_start: AtomicU64::new(0),
+            config,
+            clock,
+        }
+    }
+
+    /// Wall-clock seconds since the Unix epoch, masked to 32 bits; 0 if the clock is pre-epoch.
+    #[inline]
+    fn system_clock() -> u64 {
+        std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .map(|d| d.as_secs() & TIMESTAMP_MASK)
+            .unwrap_or(0)
+    }
+
+    /// Get current time from the configured clock.
+    #[inline]
+    fn now_secs(&self) -> u64 {
+        (self.clock)()
+    }
+
+    /// Pack state components into one u64; counters saturate at field width, timestamp wraps.
+    #[inline]
+    fn pack(state: u64, failures: u64, permits: u64, timestamp: u64) -> u64 {
+        ((state & STATE_MASK) << STATE_SHIFT)
+            | (failures.min(FAILURE_MASK) << FAILURE_SHIFT)
+            | (permits.min(PERMIT_MASK) << PERMIT_SHIFT)
+            | (timestamp & TIMESTAMP_MASK)
+    }
+
+    /// Unpack a u64 into `(state, failures, permits, timestamp)`, mirroring `pack`'s layout.
+    #[inline]
+    fn unpack(packed: u64) -> (u64, u64, u64, u64) {
+        let state = (packed >> STATE_SHIFT) & STATE_MASK;
+        let failures = (packed >> FAILURE_SHIFT) & FAILURE_MASK;
+        let permits = (packed >> PERMIT_SHIFT) & PERMIT_MASK;
+        let timestamp = packed & TIMESTAMP_MASK;
+        (state, failures, permits, timestamp)
+    }
+
+    /// Check if a request should be allowed through the circuit.
+    ///
+    /// Returns `Ok(())` if the request is allowed, `Err(CircuitOpen)` if the circuit is open or
+    /// half-open with no permits left. Open flips to half-open once `reset_timeout` has elapsed.
+    ///
+    /// Half-open claims one permit per call via CAS. NOTE(review): nothing here re-arms permits.
+    pub fn allow(&self) -> Result<(), CircuitOpen> {
+        loop {
+            let packed = self.state.load(Ordering::Acquire);
+            let (state, failures, permits, timestamp) = Self::unpack(packed);
+
+            match state {
+                STATE_CLOSED => return Ok(()),
+
+                STATE_OPEN => {
+                    let now = self.now_secs();
+                    let elapsed = now.wrapping_sub(timestamp);
+
+                    if elapsed >= self.config.reset_timeout.as_secs() {
+                        // Timeout elapsed (a backwards clock step wraps `elapsed` and lands here too)
+                        let new_packed = Self::pack(
+                            STATE_HALF_OPEN,
+                            0,
+                            u64::from(self.config.half_open_permits),
+                            now,
+                        );
+
+                        match self.state.compare_exchange_weak(
+                            packed,
+                            new_packed,
+                            Ordering::AcqRel,
+                            Ordering::Acquire,
+                        ) {
+                            Ok(_) => continue,  // Transitioned, retry allow()
+                            Err(_) => continue, // Someone else modified, retry
+                        }
+                    }
+                    return Err(CircuitOpen);
+                }
+
+                STATE_HALF_OPEN => {
+                    if permits == 0 {
+                        return Err(CircuitOpen);
+                    }
+
+                    // Claim one permit; failure count and timestamp are carried over unchanged
+                    let new_packed = Self::pack(STATE_HALF_OPEN, failures, permits - 1, timestamp);
+
+                    match self.state.compare_exchange_weak(
+                        packed,
+                        new_packed,
+                        Ordering::AcqRel,
+                        Ordering::Acquire,
+                    ) {
+                        Ok(_) => return Ok(()),
+                        Err(_) => continue, // CAS failed, retry
+                    }
+                }
+
+                _ => {
+                    // Invalid state bits: best-effort reset to closed, request admitted either way
+                    let new_packed = Self::pack(STATE_CLOSED, 0, 0, self.now_secs());
+                    let _ = self.state.compare_exchange(
+                        packed,
+                        new_packed,
+                        Ordering::AcqRel,
+                        Ordering::Acquire,
+                    );
+                    return Ok(());
+                }
+            }
+        }
+    }
+
+    /// Record a successful request.
+    ///
+    /// Closed: zeroes the failure count under either policy. NOTE(review): for `Windowed` this
+    /// forgets in-window failures on any success - confirm. Half-open: closes. Open: no change.
+    pub fn record_success(&self) {
+        // Window start is cleared unconditionally, before the state is even inspected
+        self.window_start.store(0, Ordering::Release);
+
+        loop {
+            let packed = self.state.load(Ordering::Acquire);
+            let (state, _, _, _) = Self::unpack(packed);
+
+            let new_packed = match state {
+                STATE_CLOSED => {
+                    // Reset failure count, keep closed (timestamp is refreshed as well)
+                    Self::pack(STATE_CLOSED, 0, 0, self.now_secs())
+                }
+                STATE_HALF_OPEN => {
+                    // Success in half-open: close the circuit
+                    Self::pack(STATE_CLOSED, 0, 0, self.now_secs())
+                }
+                STATE_OPEN => return, // Shouldn't record success while open
+                _ => return,
+            };
+
+            match self.state.compare_exchange_weak(
+                packed,
+                new_packed,
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            ) {
+                Ok(_) => return,
+                Err(_) => continue,
+            }
+        }
+    }
+
+    /// Record a failed request.
+    ///
+    /// Dispatches on the configured [`FailurePolicy`]: consecutive counting or windowed
+    /// counting. In both, a closed circuit bumps its failure counter and opens once the
+    /// threshold is reached, and a half-open circuit reopens immediately.
+    pub fn record_failure(&self) {
+        match self.config.failure_policy {
+            FailurePolicy::Windowed { threshold, window } => {
+                self.record_failure_windowed(threshold, window.as_secs());
+            }
+            FailurePolicy::Consecutive { threshold } => {
+                self.record_failure_consecutive(threshold);
+            }
+        }
+    }
+
+    /// Record a failure under the consecutive-failures policy.
+    ///
+    /// Closed: increment the counter and open once it reaches `threshold`. Half-open: reopen
+    /// at once. Open or an unrecognised state tag: nothing to do.
+    fn record_failure_consecutive(&self, threshold: u32) {
+        let limit = u64::from(threshold);
+        loop {
+            let current = self.state.load(Ordering::Acquire);
+            let (phase, failures, ..) = Self::unpack(current);
+            let now = self.now_secs();
+
+            let next = match phase {
+                // A failed probe sends the circuit straight back to open.
+                STATE_HALF_OPEN => Self::pack(STATE_OPEN, 0, 0, now),
+                // Tripping clears the counter; otherwise store the incremented count.
+                STATE_CLOSED if failures + 1 >= limit => Self::pack(STATE_OPEN, 0, 0, now),
+                STATE_CLOSED => Self::pack(STATE_CLOSED, failures + 1, 0, now),
+                // Already open, or a state tag we do not know: leave it alone.
+                _ => return,
+            };
+            // Retry on a lost (or spuriously failed) CAS so the failure is not dropped.
+            let swapped = self.state.compare_exchange_weak(
+                current,
+                next,
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            );
+            if swapped.is_ok() {
+                return;
+            }
+        }
+    }
+
+    /// Record failure with windowed failure tracking.
+    ///
+    /// Failures are counted inside a window of `window_secs` that starts at the first failure
+    /// after a success/reset/trip (`window_start == 0`) or once the previous window has expired.
+    /// Reaching `threshold` inside one window opens the circuit; a failure while half-open
+    /// reopens it immediately; failures while already open are ignored.
+    /// `window_secs == 0` is treated as a one-second window.
+    fn record_failure_windowed(&self, threshold: u32, window_secs: u64) {
+        let now = self.now_secs();
+        // Timestamps have whole-second resolution: a zero-length window would look expired on
+        // every call and pin the count at 1, so treat it as the shortest real window.
+        let window_secs = window_secs.max(1);
+
+        // `0` is the "no window running" marker.
+        let window_start = self.window_start.load(Ordering::Acquire);
+        let stale = window_start == 0 || now.wrapping_sub(window_start) >= window_secs;
+
+        // Only the thread that wins this CAS starts the new window and restarts the count.
+        // A loser raced with another failure that already started the window and must count
+        // *into* it: if every racer restarted the count, a concurrent burst of failures would
+        // keep collapsing back to 1 and the breaker would trip late or not at all.
+        let reset_count = stale
+            && self
+                .window_start
+                .compare_exchange(window_start, now, Ordering::Release, Ordering::Relaxed)
+                .is_ok();
+
+        // Now update the main state
+        loop {
+            let packed = self.state.load(Ordering::Acquire);
+            let (state, failures, _, _) = Self::unpack(packed);
+
+            let new_packed = match state {
+                STATE_CLOSED => {
+                    let new_failures = if reset_count { 1 } else { failures + 1 };
+                    if new_failures >= u64::from(threshold) {
+                        // Reset window when opening circuit
+                        self.window_start.store(0, Ordering::Release);
+                        Self::pack(STATE_OPEN, 0, 0, now)
+                    } else {
+                        Self::pack(STATE_CLOSED, new_failures, 0, now)
+                    }
+                }
+                STATE_HALF_OPEN => {
+                    self.window_start.store(0, Ordering::Release);
+                    Self::pack(STATE_OPEN, 0, 0, now)
+                }
+                STATE_OPEN => return,
+                _ => return,
+            };
+
+            match self.state.compare_exchange_weak(
+                packed,
+                new_packed,
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            ) {
+                Ok(_) => return,
+                Err(_) => continue,
+            }
+        }
+    }
+
+    /// Get the current circuit state for observability (unknown state tags report as closed).
+    pub fn state(&self) -> CircuitState {
+        let (state, failures, permits, _) = Self::unpack(self.state.load(Ordering::Acquire));
+        // Saturate rather than silently truncate if a packed counter ever exceeds `u32`.
+        let to_u32 = |count: u64| u32::try_from(count).unwrap_or(u32::MAX);
+        match state {
+            STATE_CLOSED => CircuitState::Closed {
+                failure_count: to_u32(failures),
+            },
+            STATE_OPEN => CircuitState::Open,
+            STATE_HALF_OPEN => CircuitState::HalfOpen {
+                permits_remaining: to_u32(permits),
+            },
+            _ => CircuitState::Closed { failure_count: 0 },
+        }
+    }
+
+    /// Force the breaker back to closed: zero failures, zero permits, no failure window running.
+    pub fn reset(&self) {
+        let closed = Self::pack(STATE_CLOSED, 0, 0, self.now_secs());
+        self.window_start.store(0, Ordering::Release);
+        self.state.store(closed, Ordering::Release);
+    }
+
+    /// Overwrite the breaker with `new_state`, stamped with the current time.
+    ///
+    /// Compiled for tests only; the windowed-mode window marker is not touched.
+    #[cfg(test)]
+    pub fn force_state(&self, new_state: CircuitState) {
+        let (tag, failures, permits) = match new_state {
+            CircuitState::Open => (STATE_OPEN, 0, 0),
+            CircuitState::Closed { failure_count } => (STATE_CLOSED, u64::from(failure_count), 0),
+            CircuitState::HalfOpen { permits_remaining } => {
+                (STATE_HALF_OPEN, 0, u64::from(permits_remaining))
+            }
+        };
+        let packed = Self::pack(tag, failures, permits, self.now_secs());
+        self.state.store(packed, Ordering::Release);
+    }
+}
+
+/// Returned by `allow()` when the breaker refuses to admit a request.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct CircuitOpen;
+
+impl std::fmt::Display for CircuitOpen {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("circuit breaker is open")
+    }
+}
+
+impl std::error::Error for CircuitOpen {}
+
+/// Snapshot of the breaker as reported by `state()`; also the input to the test-only `force_state`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CircuitState {
+    /// Circuit is closed, requests flow through normally.
+    Closed {
+        /// Failures counted since the last success, reset, or (windowed mode) window restart.
+        failure_count: u32,
+    },
+    /// Circuit is open, requests are rejected immediately.
+    Open,
+    /// Circuit is half-open, limited probe requests allowed.
+    HalfOpen {
+        /// Number of probe requests still allowed.
+        permits_remaining: u32,
+    },
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+    use std::thread;
+
+    // =========================================================================
+    // Consecutive mode: N back-to-back failures trip the breaker
+    // =========================================================================
+
+    #[test]
+    fn test_initial_state_is_closed() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::default());
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+    }
+
+    #[test]
+    fn test_allow_when_closed() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::default());
+        assert!(cb.allow().is_ok());
+    }
+
+    #[test]
+    fn test_consecutive_failures_increment() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::consecutive(5));
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 1 });
+
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 3 });
+    }
+
+    #[test]
+    fn test_consecutive_success_resets_failures() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::consecutive(5));
+
+        cb.record_failure();
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 3 });
+
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+    }
+
+    #[test]
+    fn test_consecutive_opens_at_threshold() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::consecutive(3));
+
+        cb.record_failure();
+        cb.record_failure();
+        assert!(matches!(cb.state(), CircuitState::Closed { .. }));
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    #[test]
+    fn test_rejects_when_open() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1).reset_timeout(Duration::from_secs(3600)),
+        );
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+        assert!(cb.allow().is_err());
+    }
+
+    #[test]
+    fn test_half_open_after_timeout() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1)
+                .reset_timeout(Duration::from_millis(1))
+                .half_open_permits(2),
+        );
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+
+        thread::sleep(Duration::from_millis(10));
+
+        assert!(cb.allow().is_ok());
+        assert!(matches!(cb.state(), CircuitState::HalfOpen { .. }));
+    }
+
+    #[test]
+    fn test_half_open_permits_decrement() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1)
+                .reset_timeout(Duration::from_millis(1))
+                .half_open_permits(3),
+        );
+
+        cb.record_failure();
+        thread::sleep(Duration::from_millis(10));
+
+        assert!(cb.allow().is_ok());
+        assert_eq!(
+            cb.state(),
+            CircuitState::HalfOpen {
+                permits_remaining: 2
+            }
+        );
+
+        assert!(cb.allow().is_ok());
+        assert_eq!(
+            cb.state(),
+            CircuitState::HalfOpen {
+                permits_remaining: 1
+            }
+        );
+
+        assert!(cb.allow().is_ok());
+        assert_eq!(
+            cb.state(),
+            CircuitState::HalfOpen {
+                permits_remaining: 0
+            }
+        );
+
+        assert!(cb.allow().is_err());
+    }
+
+    #[test]
+    fn test_half_open_success_closes() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1)
+                .reset_timeout(Duration::from_millis(1))
+                .half_open_permits(3),
+        );
+
+        cb.record_failure();
+        thread::sleep(Duration::from_millis(10));
+        let _ = cb.allow();
+
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+    }
+
+    #[test]
+    fn test_half_open_failure_reopens() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1)
+                .reset_timeout(Duration::from_millis(1))
+                .half_open_permits(3),
+        );
+
+        cb.record_failure();
+        thread::sleep(Duration::from_millis(10));
+        let _ = cb.allow();
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    // =========================================================================
+    // Windowed mode: N failures inside one window trip the breaker
+    // =========================================================================
+
+    #[test]
+    fn test_windowed_opens_at_threshold() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::windowed(3, Duration::from_secs(10)));
+
+        cb.record_failure();
+        cb.record_failure();
+        assert!(matches!(cb.state(), CircuitState::Closed { .. }));
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    #[test]
+    fn test_windowed_resets_after_window() {
+        // The window has whole-second resolution, so 1s is the shortest window that can expire
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::windowed(3, Duration::from_secs(1)));
+
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 2 });
+
+        // Wait for window to expire (1 second + buffer)
+        thread::sleep(Duration::from_millis(1100));
+
+        // This failure starts a new window
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 1 });
+
+        // Two more to hit threshold
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    #[test]
+    fn test_windowed_success_resets_window() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::windowed(3, Duration::from_secs(10)));
+
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 2 });
+
+        // Success resets the failure count (and clears the window)
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+
+        // Need 3 fresh failures to open
+        cb.record_failure();
+        cb.record_failure();
+        assert!(matches!(cb.state(), CircuitState::Closed { .. }));
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    #[test]
+    fn test_windowed_half_open_recovery() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::windowed(2, Duration::from_secs(10))
+                .reset_timeout(Duration::from_millis(1)),
+        );
+
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+
+        thread::sleep(Duration::from_millis(10));
+
+        assert!(cb.allow().is_ok());
+        assert!(matches!(cb.state(), CircuitState::HalfOpen { .. }));
+
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+    }
+
+    // =========================================================================
+    // Concurrency: racing threads must still trip the breaker and never over-issue permits
+    // =========================================================================
+
+    #[test]
+    fn test_concurrent_failures_open_exactly_once() {
+        for _ in 0..100 {
+            let cb = Arc::new(CircuitBreaker::new(
+                CircuitBreakerConfig::consecutive(10).reset_timeout(Duration::from_secs(3600)),
+            ));
+
+            let handles: Vec<_> = (0..20)
+                .map(|_| {
+                    let cb = Arc::clone(&cb);
+                    thread::spawn(move || {
+                        cb.record_failure();
+                    })
+                })
+                .collect();
+
+            for h in handles {
+                h.join().unwrap();
+            }
+
+            assert_eq!(cb.state(), CircuitState::Open);
+        }
+    }
+
+    #[test]
+    fn test_concurrent_allow_in_half_open_respects_permits() {
+        for _ in 0..100 {
+            let cb = Arc::new(CircuitBreaker::new(
+                CircuitBreakerConfig::consecutive(1)
+                    .reset_timeout(Duration::from_millis(1))
+                    .half_open_permits(5),
+            ));
+
+            cb.record_failure();
+            thread::sleep(Duration::from_millis(10));
+
+            let allowed = Arc::new(std::sync::atomic::AtomicU32::new(0));
+
+            let handles: Vec<_> = (0..20)
+                .map(|_| {
+                    let cb = Arc::clone(&cb);
+                    let allowed = Arc::clone(&allowed);
+                    thread::spawn(move || {
+                        if cb.allow().is_ok() {
+                            allowed.fetch_add(1, Ordering::SeqCst);
+                        }
+                    })
+                })
+                .collect();
+
+            for h in handles {
+                h.join().unwrap();
+            }
+
+            let total_allowed = allowed.load(Ordering::SeqCst);
+            assert!(
+                total_allowed <= 5,
+                "allowed {} requests but only 5 permits",
+                total_allowed
+            );
+        }
+    }
+
+    #[test]
+    fn test_concurrent_windowed_failures() {
+        for _ in 0..50 {
+            let cb = Arc::new(CircuitBreaker::new(CircuitBreakerConfig::windowed(
+                10,
+                Duration::from_secs(60),
+            )));
+
+            let handles: Vec<_> = (0..20)
+                .map(|_| {
+                    let cb = Arc::clone(&cb);
+                    thread::spawn(move || {
+                        cb.record_failure();
+                    })
+                })
+                .collect();
+
+            for h in handles {
+                h.join().unwrap();
+            }
+
+            assert_eq!(cb.state(), CircuitState::Open);
+        }
+    }
+
+    // =========================================================================
+    // Pack/unpack: every field must survive a round trip, including all-ones masks
+    // =========================================================================
+
+    #[test]
+    fn test_pack_unpack_roundtrip() {
+        let test_cases = [
+            (STATE_CLOSED, 0, 0, 0),
+            (STATE_OPEN, 0, 0, 12345),
+            (STATE_HALF_OPEN, 100, 50, 999999),
+            (STATE_CLOSED, FAILURE_MASK, PERMIT_MASK, TIMESTAMP_MASK),
+        ];
+
+        for (state, failures, permits, timestamp) in test_cases {
+            let packed = CircuitBreaker::pack(state, failures, permits, timestamp);
+            let (s, f, p, t) = CircuitBreaker::unpack(packed);
+            assert_eq!(s, state, "state mismatch");
+            assert_eq!(f, failures, "failures mismatch");
+            assert_eq!(p, permits, "permits mismatch");
+            assert_eq!(t, timestamp, "timestamp mismatch");
+        }
+    }
+
+    // =========================================================================
+    // Builder API: each setter must land in the matching config field
+    // =========================================================================
+
+    #[test]
+    fn test_builder_consecutive() {
+        let config = CircuitBreakerConfig::consecutive(5)
+            .reset_timeout(Duration::from_secs(60))
+            .half_open_permits(10);
+
+        assert_eq!(
+            config.failure_policy,
+            FailurePolicy::Consecutive { threshold: 5 }
+        );
+        assert_eq!(config.reset_timeout, Duration::from_secs(60));
+        assert_eq!(config.half_open_permits, 10);
+    }
+
+    #[test]
+    fn test_builder_windowed() {
+        let config = CircuitBreakerConfig::windowed(3, Duration::from_secs(10))
+            .reset_timeout(Duration::from_secs(30))
+            .half_open_permits(5);
+
+        assert_eq!(
+            config.failure_policy,
+            FailurePolicy::Windowed {
+                threshold: 3,
+                window: Duration::from_secs(10)
+            }
+        );
+        assert_eq!(config.reset_timeout, Duration::from_secs(30));
+        assert_eq!(config.half_open_permits, 5);
+    }
+}
diff --git a/src/config.rs b/src/config.rs
index 10949f2..c454ad9 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -80,13 +80,21 @@ pub struct AiConfig {
 
     /// Graceful-shutdown drain window (seconds): after SIGTERM, how long Pingora lets **in-flight
     /// requests finish** before tearing the runtimes down. Maps to Pingora's `grace_period_seconds`
-    /// (left unset, Pingora silently defaults to 300s — this knob makes the window explicit and
-    /// tunable without a code change). The tension: a streaming completion can run up to
-    /// `read_timeout_secs` (600s default), so any in-flight stream longer than this window is cut at
-    /// the boundary (client sees a mid-stream TCP close). Set it to cover your p99 stream, not the
-    /// max — covering the max means every deploy waits the full window. **It is also capped by the
-    /// orchestrator**: ECS SIGKILLs at `stopTimeout` (default 30s, max 120s), so a grace larger than
-    /// `stopTimeout` is wasted unless `stopTimeout` is raised to match. Default 120 = the ECS ceiling.
+    /// (left unset, Pingora silently defaults to 300s — this knob makes the window explicit).
+    ///
+    /// **Default to `read_timeout_secs` so we never truncate a response.** The gateway is a
+    /// transparent man-in-the-middle: cutting an in-flight stream on deploy corrupts a generation the
+    /// caller is paying for and can't cleanly retry (a half-delivered SSE isn't idempotent). The
+    /// longest a request can live is `read_timeout_secs`, so a drain window of at least that
+    /// guarantees every accepted request finishes — Pingora stops *accepting* new connections the
+    /// instant SIGTERM lands, so this only ever waits out the existing longest stream, not new work.
+    /// Slower rollouts are the deliberate price of not mangling responses.
+    ///
+    /// **The orchestrator must grant the same window**, or it caps us: the platform SIGKILLs at its
+    /// own stop timeout regardless of this value. Set k8s `terminationGracePeriodSeconds` (or the EC2
+    /// agent's `ECS_CONTAINER_STOP_TIMEOUT`) to match. Note **ECS Fargate caps `stopTimeout` at 120s**
+    /// — there, full coverage of a 600s stream is impossible and the longest streams will still be
+    /// cut at 120s; that's a Fargate limitation, not a reason to default to truncating.
     pub shutdown_grace_period_secs: u64,
     /// Final runtime-teardown timeout (seconds) **after** the drain window: how long Pingora waits for
     /// the tokio runtimes to exit before forcing the process down. Maps to Pingora's
@@ -134,6 +142,27 @@ pub struct AiConfig {
     /// deliberately doesn't cover, and why the real fix for egress-reputation pain is a
     /// provider-feedback circuit breaker rather than a bigger number here.
     pub byo_rate_limit_rps: u32,
+
+    /// Per-provider circuit breaker: number of upstream **failures within `circuit_breaker_window_secs`**
+    /// that trips the breaker open for that provider. A failure is a **5xx response or a connect
+    /// failure** — i.e. the *provider is broken*. A `429` is deliberately **not** a failure: it means
+    /// the provider is healthy and throttling our pool key (a velocity/spend signal the rate limiter
+    /// and the client's `Retry-After` backoff own), so tripping on it would convert a self-healing
+    /// throttle into a self-inflicted outage. While open, requests to that provider fast-fail with a
+    /// `503` (`ai_rejections_total{reason="circuit_open"}`) instead of piling up against
+    /// `read_timeout_secs` and exhausting connection/in-flight slots for *every* provider. After
+    /// `circuit_breaker_reset_secs` a probe request is allowed; success closes it, failure reopens it.
+    /// Applies to **all** traffic to the provider (managed + BYO) — a down provider is down regardless
+    /// of whose key is used. `0` disables the breaker entirely. Default is generous so normal
+    /// background 5xx noise never trips it.
+    pub circuit_breaker_threshold: u32,
+    /// Window (seconds) over which `circuit_breaker_threshold` failures are counted. It is a fixed
+    /// window that opens at the first failure; once it expires the count restarts, and any success
+    /// resets it — so it trips on a *burst* of failures, not a slow trickle across a healthy day.
+    pub circuit_breaker_window_secs: u64,
+    /// How long the breaker stays open before allowing a half-open probe request (seconds). Long enough
+    /// to let a provider recover, short enough that recovery is detected promptly.
+    pub circuit_breaker_reset_secs: u64,
 }
 
 impl Default for AiConfig {
@@ -155,11 +184,12 @@ impl Default for AiConfig {
             read_timeout_secs: 600,
             write_timeout_secs: 60,
             idle_timeout_secs: 90,
-            // Drain in-flight requests for up to the ECS SIGKILL ceiling (120s) on SIGTERM, then a
-            // short runtime-teardown backstop. Explicit so the window is a documented operational
-            // knob, not Pingora's silent 300s default. See the field docs for the read_timeout /
-            // orchestrator-stopTimeout tradeoffs.
-            shutdown_grace_period_secs: 120,
+            // Drain for the full request lifetime (= read_timeout_secs) so a deploy never truncates
+            // an in-flight stream — we're a transparent proxy and must not mangle a paid-for
+            // generation. Pingora stops accepting new connections at SIGTERM, so this only waits out
+            // the longest existing stream. The orchestrator's stop timeout must match (see field
+            // docs; ECS Fargate's 120s cap is a hard limit there). Then a short teardown backstop.
+            shutdown_grace_period_secs: 600,
             shutdown_runtime_timeout_secs: 10,
             upstream_tls: true,
             // Prefer H2 to providers by default (all of `KNOWN_PROVIDERS` offer it; H1 fallback is
@@ -174,6 +204,12 @@ impl Default for AiConfig {
             // throughput, low enough that a junk-auth flood can't get our egress IPs flagged by the
             // providers. Tune from the metric; set 0 to disable. (Managed traffic is exempt.)
             byo_rate_limit_rps: 1_000,
+            // Per-provider breaker: trip after 20 upstream failures (5xx/connect) within 10s, stay
+            // open 30s, then probe. Generous enough that a provider's occasional background 5xx never
+            // trips it — only a sustained brownout does. Set threshold 0 to disable.
+            circuit_breaker_threshold: 20,
+            circuit_breaker_window_secs: 10,
+            circuit_breaker_reset_secs: 30,
         }
     }
 }
@@ -223,6 +259,25 @@ impl AiConfig {
         Ok(())
     }
 
+    /// The per-provider circuit-breaker config, or `None` when disabled (`circuit_breaker_threshold
+    /// == 0`). Windowed policy: trips on a *burst* of failures, not a slow trickle. A window of 0s
+    /// is clamped to 1s, since it would restart the count on every failure and never reach a
+    /// threshold above 1. Each provider gets its own breaker (see `state::build_providers`).
+    pub fn circuit_breaker_config(&self) -> Option<crate::circuit_breaker::CircuitBreakerConfig> {
+        use crate::circuit_breaker::CircuitBreakerConfig;
+        use std::time::Duration;
+        if self.circuit_breaker_threshold == 0 {
+            return None;
+        }
+        // Clamp, don't reject: a misconfigured 0 must not silently disable the breaker.
+        let window = Duration::from_secs(self.circuit_breaker_window_secs.max(1));
+        let reset = Duration::from_secs(self.circuit_breaker_reset_secs);
+        Some(
+            CircuitBreakerConfig::windowed(self.circuit_breaker_threshold, window)
+                .reset_timeout(reset),
+        )
+    }
+
     /// Fold `AI_POOL_KEY_<NAME>` environment variables into `pool_keys` (provider name lowercased).
     /// This is the production secret path (SSM-injected env); a flat figment merge can't target a
     /// map field, and env must win over any `[pool_keys]` value baked into a config file.
diff --git a/src/lib.rs b/src/lib.rs
index be11bd6..bfdf419 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,6 +15,7 @@
 #![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used, clippy::panic))]
 
 pub mod admin;
+pub mod circuit_breaker;
 pub mod config;
 pub mod deny;
 pub mod doctor;
diff --git a/src/main.rs b/src/main.rs
index 634144f..6d77699 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -13,6 +13,7 @@ use beyond_ai::store_watch::WatcherService;
 use clap::{Parser, Subcommand};
 use pingora_core::apps::http_app::HttpServer;
 use pingora_core::server::Server;
+use pingora_core::server::configuration::ServerConf;
 use pingora_core::services::background::background_service;
 use pingora_core::services::listening::Service as ListeningService;
 use pingora_proxy::http_proxy_service;
@@ -95,6 +96,9 @@ fn main() {
     let config = load_config(cli.config.as_deref());
     let listen = config.listen.clone();
     let metrics_listen = config.metrics_listen.clone();
+    // Capture the shutdown knobs before `config` is moved into the gateway state below.
+    let grace_period_secs = config.shutdown_grace_period_secs;
+    let runtime_timeout_secs = config.shutdown_runtime_timeout_secs;
     let metrics = match Metrics::new() {
         Ok(m) => m,
         Err(e) => {
@@ -110,7 +114,17 @@ fn main() {
         }
     };
 
-    let mut server = Server::new(None).expect("failed to init pingora server");
+    // Make the graceful-shutdown drain window explicit instead of inheriting Pingora's silent
+    // defaults (300s grace / 5s runtime teardown). `grace_period_seconds` is how long in-flight
+    // requests get to finish after SIGTERM before teardown; `graceful_shutdown_timeout_seconds` is
+    // the final runtime-exit backstop. See the `AiConfig` field docs for the read_timeout /
+    // orchestrator-stopTimeout tradeoffs.
+    let conf = ServerConf {
+        grace_period_seconds: Some(grace_period_secs),
+        graceful_shutdown_timeout_seconds: Some(runtime_timeout_secs),
+        ..ServerConf::default()
+    };
+    let mut server = Server::new_with_opt_and_conf(None, conf);
     server.bootstrap();
 
     // Client (app) traffic.
@@ -142,6 +156,12 @@ fn main() {
     admin.add_tcp(&metrics_listen);
     server.add_service(admin);
 
-    tracing::info!(%listen, %metrics_listen, "starting beyond-ai");
+    tracing::info!(
+        %listen,
+        %metrics_listen,
+        grace_period_secs,
+        runtime_timeout_secs,
+        "starting beyond-ai"
+    );
     server.run_forever();
 }
diff --git a/src/proxy.rs b/src/proxy.rs
index db31468..9083ea7 100644
--- a/src/proxy.rs
+++ b/src/proxy.rs
@@ -111,6 +111,11 @@ pub struct RequestCtx {
     /// Running total of request-body bytes seen, to enforce `MAX_REQUEST_BODY` even when the client
     /// uses chunked transfer encoding (no `Content-Length` to check up front).
     body_bytes_fed: usize,
+    /// Upstream HTTP status, set in `response_filter` once the response head arrives. Drives the
+    /// circuit-breaker outcome recorded once in `logging`: `5xx` → failure, any other response →
+    /// success (the provider answered — a `429` is a healthy throttle, not a breaker trip), and a
+    /// `None` here with an upstream error → failure (connect/read failed before any response).
+    upstream_status: Option<u16>,
     /// Managed OpenAI chat/responses request: buffer the body and inject
     /// `stream_options.include_usage` if it streams without it, so the usage chunk (hence the
     /// billable token count) is guaranteed. The single, deliberate exception to "never buffer the
@@ -415,6 +420,31 @@ impl ProxyHttp for AiProxy {
         let inject_eligible =
             managed && dialect == Dialect::OpenAI && is_streamable_path(&forward_path);
 
+        // Circuit breaker (per provider, all traffic — a down provider is down regardless of whose
+        // key is used). Checked here, after every other rejection, so claiming a half-open probe
+        // permit corresponds to an *actual* upstream attempt — and balanced by exactly one
+        // `record_*` in `logging` (which runs once per admitted request), so a permit can't leak.
+        // When open, fast-fail 503 instead of piling the request against `read_timeout_secs` and
+        // exhausting connection/in-flight slots for every provider. 5xx/connect failures trip it;
+        // 429 never does (that's a healthy provider throttling — see `logging`).
+        if let Some(breaker) = &provider.breaker {
+            if breaker.allow().is_err() {
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&["circuit_open"])
+                    .inc();
+                return Self::reject(
+                    session,
+                    &request_id,
+                    503,
+                    "api_error",
+                    "provider temporarily unavailable",
+                )
+                .await;
+            }
+        }
+
         *ctx = Some(RequestCtx {
             tenant_id,
             vpc_id,
@@ -445,6 +475,7 @@ impl ProxyHttp for AiProxy {
             // reallocs is lost in the network noise of a stream we're already relaying chunk by chunk.
             resp_tail: Vec::new(),
             body_bytes_fed: 0,
+            upstream_status: None,
             start,
             attempt: 0,
             request_id,
@@ -663,9 +694,12 @@ impl ProxyHttp for AiProxy {
 
             // Per-provider response counter, bucketed by status class — the signal that a provider
             // is degrading (429/5xx) before it shows up only as latency or a missing usage event.
-            rc.provider
-                .metrics
-                .record_response(upstream_response.status.as_u16());
+            let status = upstream_response.status.as_u16();
+            rc.provider.metrics.record_response(status);
+            // Remember the status for the circuit-breaker outcome resolved in `logging` (a response
+            // arrived, so the provider is reachable — even a 429/5xx is a real answer, not a connect
+            // failure). `logging` decides failure-vs-success from this.
+            rc.upstream_status = Some(status);
 
             // Derive streaming from the response, not the request: SSE ⇒ use the streaming usage
             // parser; otherwise the body is a single JSON object.
@@ -780,6 +814,23 @@ impl ProxyHttp for AiProxy {
             );
         }
 
+        // Resolve the circuit-breaker outcome exactly once per admitted request (every request that
+        // claimed a permit in `request_filter` records here, so a half-open probe permit can't leak).
+        // Failure = the provider is *broken*: a 5xx response, or no response at all paired with an
+        // upstream error (connect/read failure). Success = the provider *answered* — 2xx/3xx, and
+        // deliberately **4xx/429 too**: a 429 is a healthy provider throttling our pool key, which the
+        // rate limiter and the client's `Retry-After` own, NOT a reason to cut all traffic to it.
+        if let Some(breaker) = &rc.provider.breaker {
+            match rc.upstream_status {
+                Some(s) if s >= 500 => breaker.record_failure(),
+                Some(_) => breaker.record_success(),
+                None if e.is_some() => breaker.record_failure(),
+                // No response and no error ⇒ client went away before the upstream answered; don't
+                // blame the provider — record success so the probe permit resolves.
+                None => breaker.record_success(),
+            }
+        }
+
         // The buffer may transiently hold up to 2× the cap before compaction; the usage event is
         // always in the last cap bytes, so slice to that bounded tail before parsing.
         let tail_start = rc.resp_tail.len().saturating_sub(USAGE_TAIL_CAP);
diff --git a/src/route.rs b/src/route.rs
index 587f848..e59c5bf 100644
--- a/src/route.rs
+++ b/src/route.rs
@@ -11,6 +11,7 @@
 //! paths. Operators can also add/override providers from config (see `state`/`config`). We do not
 //! translate between dialects — that's deliberately out of scope.
 
+use crate::circuit_breaker::CircuitBreaker;
 use crate::metrics::ProviderMetrics;
 use crate::secret::Secret;
 
@@ -191,6 +192,10 @@ pub struct Provider {
     /// Per-provider metric handles, resolved once here so the response path bumps a direct
     /// counter/histogram instead of a string-keyed label lookup per response.
     pub metrics: ProviderMetrics,
+    /// Per-provider circuit breaker, shared across all callers to this provider. `None` when the
+    /// breaker is disabled (`circuit_breaker_threshold == 0`). Checked before connect and fed the
+    /// 5xx/connect outcome — see `proxy`. Lock-free, so the hot path reads it without contention.
+    pub breaker: Option<CircuitBreaker>,
 }
 
 impl Provider {
@@ -204,6 +209,7 @@ impl Provider {
         auth: AuthScheme,
         pool_key: Option<&str>,
         metrics: ProviderMetrics,
+        breaker: Option<CircuitBreaker>,
     ) -> Self {
         let host = authority
             .split(':')
@@ -219,6 +225,7 @@ impl Provider {
             auth,
             pool_auth_value,
             metrics,
+            breaker,
         }
     }
 }
@@ -277,6 +284,7 @@ mod tests {
             AuthScheme::Bearer,
             Some("sk-x"),
             ProviderMetrics::disconnected(),
+            None,
         );
         assert_eq!(p.host, "api.openai.com");
         assert_eq!(p.dialect, Dialect::OpenAI);
@@ -290,6 +298,7 @@ mod tests {
             AuthScheme::XApiKey,
             None,
             ProviderMetrics::disconnected(),
+            None,
         );
         assert!(a.pool_auth_value.is_none());
     }
diff --git a/src/state.rs b/src/state.rs
index b4b42dd..26a7446 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -35,6 +35,15 @@ pub type RequestId = ArrayString<33>;
 /// (a `provider_authorities` entry whose name isn't known). Each provider's pool key (if any) is
 /// looked up by name and its managed auth header value precomputed.
 fn build_providers(config: &AiConfig, metrics: &Metrics) -> HashMap<String, Arc<Provider>> {
+    // One independent breaker per provider, all built from the same config (the breaker holds
+    // atomics so it can't be cloned — we mint a fresh one per provider). `None` ⇒ breaker disabled.
+    let cb_config = config.circuit_breaker_config();
+    let breaker = || {
+        cb_config
+            .clone()
+            .map(crate::circuit_breaker::CircuitBreaker::new)
+    };
+
     let mut providers = HashMap::new();
     for spec in route::KNOWN_PROVIDERS {
         let authority = config
@@ -52,6 +61,7 @@ fn build_providers(config: &AiConfig, metrics: &Metrics) -> HashMap<String, Arc<
                 spec.auth,
                 pool_key,
                 ProviderMetrics::resolve(metrics, spec.name),
+                breaker(),
             )),
         );
     }
@@ -69,6 +79,7 @@ fn build_providers(config: &AiConfig, metrics: &Metrics) -> HashMap<String, Arc<
                     AuthScheme::Bearer,
                     pool_key,
                     ProviderMetrics::resolve(metrics, name),
+                    breaker(),
                 )),
             );
         }
diff --git a/src/store_watch.rs b/src/store_watch.rs
index 8dd29e0..72c3602 100644
--- a/src/store_watch.rs
+++ b/src/store_watch.rs
@@ -110,7 +110,10 @@ impl BackgroundService for WatcherService {
             // down and `connect` is retrying its own backoff) rather than blocking teardown.
             let store = tokio::select! {
                 _ = shutdown.changed() => {
-                    info!("shutdown signaled; deny-set watcher exiting");
+                    info!(
+                        in_flight = self.state.metrics.requests_in_flight.get(),
+                        "shutdown signaled; deny-set watcher exiting"
+                    );
                     return;
                 }
                 outcome = connect(&self.state) => match outcome {
diff --git a/tests/common/mod.rs b/tests/common/mod.rs
index 215fbdf..a5d4f1e 100644
--- a/tests/common/mod.rs
+++ b/tests/common/mod.rs
@@ -176,6 +176,9 @@ pub enum Mode {
     /// OpenAI-shaped SSE stream with >128 KiB of content *before* the usage chunk — forces the
     /// proxy's response-tail compaction path.
     SseLarge,
+    /// Always reply with this HTTP status and a small JSON error body — for circuit-breaker tests
+    /// (5xx trips the breaker; 4xx/429 do not).
+    Status(u16),
 }
 
 #[derive(Default, Clone)]
@@ -190,6 +193,7 @@ pub struct Captured {
 pub struct MockUpstream {
     pub port: u16,
     captured: Arc<Mutex<Option<Captured>>>,
+    hits: Arc<std::sync::atomic::AtomicUsize>,
     task: tokio::task::JoinHandle<()>,
 }
 
@@ -227,6 +231,11 @@ fn canned_body(mode: Mode) -> (&'static str, Bytes) {
             Bytes::from_static(CANNED_ANTHROPIC_JSON.as_bytes()),
         ),
         Mode::SseLarge => ("text/event-stream", Bytes::from(large_sse())),
+        // The status is applied by `mock_handle`; the body is a stock error shape.
+        Mode::Status(_) => (
+            "application/json",
+            Bytes::from_static(br#"{"error":{"message":"mock"}}"#),
+        ),
     }
 }
 
@@ -245,8 +254,10 @@ fn proto_label(version: hyper::Version) -> &'static str {
 async fn mock_handle(
     req: Request<hyper::body::Incoming>,
     cap: Arc<Mutex<Option<Captured>>>,
+    hits: Arc<std::sync::atomic::AtomicUsize>,
     mode: Mode,
 ) -> Result<Response<Full<Bytes>>, std::convert::Infallible> {
+    hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
     let version = req.version();
     let path = req.uri().path().to_string();
     // Pull the headers we record before consuming the body (which moves `req`).
@@ -269,8 +280,12 @@ async fn mock_handle(
         body,
     });
     let (ct, payload) = canned_body(mode);
+    let status = match mode {
+        Mode::Status(s) => s,
+        _ => 200,
+    };
     Ok(Response::builder()
-        .status(200)
+        .status(status)
         .header("content-type", ct)
         .header("x-mock-proto", proto_label(version))
         .body(Full::new(payload))
@@ -284,7 +299,9 @@ impl MockUpstream {
         let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
         let port = listener.local_addr().unwrap().port();
         let captured: Arc<Mutex<Option<Captured>>> = Arc::new(Mutex::new(None));
+        let hits = Arc::new(std::sync::atomic::AtomicUsize::new(0));
         let cap = captured.clone();
+        let hit_counter = hits.clone();
         let task = tokio::spawn(async move {
             loop {
                 let Ok((stream, _)) = listener.accept().await else {
@@ -292,8 +309,11 @@ impl MockUpstream {
                 };
                 let io = TokioIo::new(stream);
                 let cap = cap.clone();
+                let hit_counter = hit_counter.clone();
                 tokio::spawn(async move {
-                    let svc = service_fn(move |req| mock_handle(req, cap.clone(), mode));
+                    let svc = service_fn(move |req| {
+                        mock_handle(req, cap.clone(), hit_counter.clone(), mode)
+                    });
                     let _ = hyper::server::conn::http1::Builder::new()
                         .serve_connection(io, svc)
                         .await;
@@ -303,6 +323,7 @@ impl MockUpstream {
         MockUpstream {
             port,
             captured,
+            hits,
             task,
         }
     }
@@ -336,7 +357,9 @@ impl MockUpstream {
         let acceptor = TlsAcceptor::from(Arc::new(tls));
 
         let captured: Arc<Mutex<Option<Captured>>> = Arc::new(Mutex::new(None));
+        let hits = Arc::new(std::sync::atomic::AtomicUsize::new(0));
         let cap = captured.clone();
+        let hit_counter = hits.clone();
         let task = tokio::spawn(async move {
             loop {
                 let Ok((stream, _)) = listener.accept().await else {
@@ -344,12 +367,15 @@ impl MockUpstream {
                 };
                 let acceptor = acceptor.clone();
                 let cap = cap.clone();
+                let hit_counter = hit_counter.clone();
                 tokio::spawn(async move {
                     let Ok(tls_stream) = acceptor.accept(stream).await else {
                         return;
                     };
                     let io = TokioIo::new(tls_stream);
-                    let svc = service_fn(move |req| mock_handle(req, cap.clone(), mode));
+                    let svc = service_fn(move |req| {
+                        mock_handle(req, cap.clone(), hit_counter.clone(), mode)
+                    });
                     // Auto builder: serves H2 or H1 per the negotiated ALPN.
                     let _ = auto::Builder::new(TokioExecutor::new())
                         .serve_connection(io, svc)
@@ -360,6 +386,7 @@ impl MockUpstream {
         MockUpstream {
             port,
             captured,
+            hits,
             task,
         }
     }
@@ -371,6 +398,12 @@ impl MockUpstream {
     pub fn captured(&self) -> Option<Captured> {
         self.captured.lock().unwrap().clone()
     }
+
+    /// Total requests the mock has received — lets a test prove whether requests actually reached
+    /// the upstream (relayed) or were shed by the gateway before connecting.
+    pub fn hits(&self) -> usize {
+        self.hits.load(std::sync::atomic::Ordering::Relaxed)
+    }
 }
 
 impl Drop for MockUpstream {
@@ -417,6 +450,9 @@ pub struct GatewayBuilder {
     tls_upstream: bool,
     /// Override the gateway's `upstream_http2` (H2H1 vs H1 ALPN). `None` ⇒ leave the gateway default.
     upstream_http2: Option<bool>,
+    /// Override the per-provider circuit-breaker threshold (failures in the window before opening).
+    /// `None` ⇒ leave the gateway default; `Some(0)` disables the breaker.
+    circuit_breaker_threshold: Option<u32>,
 }
 
 impl GatewayBuilder {
@@ -481,6 +517,13 @@ impl GatewayBuilder {
         self
     }
 
+    /// Set the per-provider circuit-breaker failure threshold. A 60s failure window and a 1s reset
+    /// are written alongside it, so the breaker trips and recovers fast in-test. `0` disables it.
+    pub fn circuit_breaker_threshold(mut self, threshold: u32) -> Self {
+        self.circuit_breaker_threshold = Some(threshold);
+        self
+    }
+
     pub async fn start(self) -> Gateway {
         let port = free_port();
         let metrics_port = free_port();
@@ -511,6 +554,14 @@ impl GatewayBuilder {
         if let Some(rps) = self.byo_rate_limit_rps {
             cfg.push_str(&format!("byo_rate_limit_rps = {rps}\n"));
         }
+        if let Some(threshold) = self.circuit_breaker_threshold {
+            // 60s failure window + 1s reset so the test trips and recovers quickly.
+            cfg.push_str(&format!(
+                "circuit_breaker_threshold = {threshold}\n\
+                 circuit_breaker_window_secs = 60\n\
+                 circuit_breaker_reset_secs = 1\n"
+            ));
+        }
         if self.real_upstreams {
             // Real-host smoke mode: built-in provider defaults (no authority overrides). For a
             // *managed* smoke we still write the caller-supplied pool key(s) — the real provider key
@@ -588,6 +639,7 @@ impl Gateway {
             byo_rate_limit_rps: None,
             tls_upstream: false,
             upstream_http2: None,
+            circuit_breaker_threshold: None,
         }
     }
 
diff --git a/tests/e2e.rs b/tests/e2e.rs
index 2bf7e38..a12207d 100644
--- a/tests/e2e.rs
+++ b/tests/e2e.rs
@@ -846,3 +846,84 @@ async fn health_endpoints_report_ready_on_the_metrics_listener() {
     let (nf_status, _) = gw.admin_get("/nope").await;
     assert_eq!(nf_status, 404);
 }
+
+#[tokio::test]
+async fn circuit_breaker_opens_on_5xx_and_sheds() {
+    // A provider returning 5xx is *broken*: after `threshold` failures the per-provider breaker
+    // opens and the gateway fast-fails with 503 — without connecting upstream — instead of piling
+    // requests against `read_timeout_secs`. BYO traffic (no minting needed); the breaker gates all
+    // traffic to the provider.
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(1);
+    let mock = MockUpstream::start(Mode::Status(500)).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .circuit_breaker_threshold(3)
+        .start()
+        .await;
+    let client = reqwest::Client::new();
+
+    // While closed the gateway relays the mock's 500; once the breaker trips it returns its own 503.
+    // Poll until we observe the trip (each failure is recorded in `logging`, which lags the response
+    // slightly — polling absorbs that).
+    {
+        let (c, u) = (client.clone(), gw.url());
+        wait_for_status(503, move || {
+            let (c, u) = (c.clone(), u.clone());
+            async move { post_status(&c, &u, "sk-byo-test", body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+
+    // The trip is visible as circuit_open rejections — the breaker shed requests before the upstream.
+    assert!(
+        parse_metric(&gw.metrics().await, "ai_rejections_total", "circuit_open") >= 1.0,
+        "expected ai_rejections_total{{reason=\"circuit_open\"}} >= 1 after the breaker tripped"
+    );
+}
+
+#[tokio::test]
+async fn circuit_breaker_does_not_trip_on_429() {
+    // A 429 is a *healthy* provider throttling our pool key — the rate limiter and the client's
+    // Retry-After own that, NOT the breaker. So a 429 storm must never open the circuit: every
+    // request is relayed (429) and reaches the upstream; none is shed (503).
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(1);
+    let mock = MockUpstream::start(Mode::Status(429)).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .circuit_breaker_threshold(3)
+        // Don't let the BYO rate limiter shed these — we want every request to reach the upstream.
+        .byo_rate_limit_rps(0)
+        .start()
+        .await;
+    let client = reqwest::Client::new();
+
+    // Warm up until the gateway is serving and relaying the mock's 429 (the readiness pattern the
+    // other e2e tests use — avoids racing the first request against gateway startup under load).
+    {
+        let (c, u) = (client.clone(), gw.url());
+        wait_for_status(429, move || {
+            let (c, u) = (c.clone(), u.clone());
+            async move { post_status(&c, &u, "sk-byo-test", body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+
+    // Well past the failure threshold (3): all relayed as 429, never the breaker's 503.
+    for _ in 0..10 {
+        assert_eq!(
+            post_status(&client, &gw.url(), "sk-byo-test", body_for("gpt-4o")).await,
+            429
+        );
+    }
+
+    assert_eq!(
+        parse_metric(&gw.metrics().await, "ai_rejections_total", "circuit_open"),
+        0.0,
+        "a 429 storm must not open the circuit breaker"
+    );
+    assert!(
+        mock.hits() >= 10,
+        "every request should have reached the upstream (got {} hits)",
+        mock.hits()
+    );
+}