diff --git a/.cargo/audit.toml b/.cargo/audit.toml
new file mode 100644
index 0000000..d561358
--- /dev/null
+++ b/.cargo/audit.toml
@@ -0,0 +1,32 @@
+# cargo-audit configuration.
+#
+# The advisories ignored below are ALL transitively pinned by dependencies we cannot bump from this
+# repo, and each has been assessed for actual exposure. They are listed individually (not blanket
+# `informational` suppression) so a NEW advisory still fails the audit. Re-evaluate when the upstream
+# pins move — chiefly when `pingora` publishes past 0.8 (the latest published release as of
+# this writing) and when `beyond-slipstream` relaxes its `async-nats ^0.46` requirement.
+[advisories]
+ignore = [
+  # rustls-webpki 0.102.8: reachable panic in CRL parsing; name-constraint acceptance bugs (URI /
+  # wildcard); CRL distribution-point matching. Pulled ONLY by async-nats 0.46 (pinned by
+  # beyond-slipstream `^0.46`), used for the NATS/slipstream control-channel TLS — NOT the client-
+  # or provider-facing TLS, which already resolve the patched rustls-webpki 0.103.13. Blast radius
+  # is limited to MITM of the deny-set channel, which is fail-open and carries only deny entries.
+  # Fix path: a beyond-slipstream release on async-nats >= 0.47 (uses rustls-webpki 0.103+).
+  "RUSTSEC-2026-0104",
+  "RUSTSEC-2026-0098",
+  "RUSTSEC-2026-0099",
+  "RUSTSEC-2026-0049",
+
+  # protobuf 2.28.0: DoS via uncontrolled recursion when PARSING protobuf. Pulled by prometheus
+  # 0.13 (both our direct dep — kept at 0.13 to share pingora-core's default registry — and
+  # pingora-core 0.8 itself). We never parse untrusted protobuf: metrics are exposed in the text
+  # exposition format via pingora's prometheus_http_service. Fix path: pingora past 0.8 (drops the
+  # prometheus 0.13 / protobuf 2.x chain).
+  "RUSTSEC-2024-0437",
+
+  # Unmaintained-crate warnings (no known vulnerability); transitive via pingora 0.8 / async-nats:
+  "RUSTSEC-2025-0134", # rustls-pemfile (via rustls-native-certs <- pingora-rustls / async-nats)
+  "RUSTSEC-2025-0069", # daemonize (via pingora-core)
+  "RUSTSEC-2024-0388", # derivative (via a pingora dependency)
+]
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..3827dd3
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,28 @@
+name: CI
+on:
+  pull_request:
+    branches: [main]
+env:
+  CARGO_TERM_COLOR: always
+  # Belt-and-suspenders: the panic-surface + `unused_must_use` denies live in `[lints]` (Cargo.toml)
+  # so they bind locally too, but escalate *every* warning to an error in CI in case a lint isn't
+  # expressible there (build scripts, future targets).
+  RUSTFLAGS: -D warnings
+jobs:
+  check:
+    name: Check
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - uses: jdx/mise-action@v2
+      - uses: Swatinem/rust-cache@v2
+      # Formatting: dprint (config/json/etc) + rustfmt.
+      - run: mise check:fmt
+      - run: cargo fmt --all --check
+      # Lints: clippy `-D warnings` across all targets. With `[lints.clippy]` denying the panic
+      # surface (unwrap/expect/panic/todo/unimplemented), a new `.unwrap()` in production code
+      # fails the build here.
+      - run: mise check:rs
+      - run: mise test:unit:rs
+      - run: mise test:integration:rs
+      - run: mise build:rs:release
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..18aa6a2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+target/
+dist/
+**/*.rs.bk
+.env
+.env.*
+!.env.example
+.claude/settings.local.json
+.sqlx
+.wiki
+node_modules/
+bench/out/
+.mcp.json
+.claude
+.env
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
new file mode 100644
index 0000000..8f9b40d
--- /dev/null
+++ b/ARCHITECTURE.md
@@ -0,0 +1,456 @@
+# Beyond AI Gateway — Architecture
+
+Takes HTTP requests carrying an OpenAI- or Anthropic-dialect payload, authenticates the caller via
+Ed25519 virtual key or BYO provider token, swaps in a pool key for managed traffic, relays the
+request and response byte-for-byte to the upstream provider, and emits a token-usage billing fact
+(`ai.usage`) on completion — all without buffering the body or response stream.
+
+**Self-contained:** no `path` deps into the `beyond` repo. Depends only on crates.io + the
+published `beyond-slipstream` — clones, CI-builds, and publishes anywhere.
+
+---
+
+## Concepts & Terminology
+
+| Term                                             | What It Controls / Gates                                                                                                                                    | NOT                                                                          |
+| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
+| **Managed key** (`bai_v1.…`)                     | Ed25519-verified identity; enables key swap, deny-set check, and `ai.usage` billing                                                                         | A session token or capability grant — just tenant attribution                |
+| **BYO key** (anything else)                      | Forwarded as-is to the provider; no swap, no billing, no deny-set                                                                                           | A lesser tier — same proxy, minus attribution and billing                    |
+| **Pool key**                                     | Real provider API key held by the gateway; swapped in for managed traffic                                                                                   | Per-tenant — one key per provider, shared by all managed callers             |
+| **Tenant**                                       | The billing entity from the virtual key payload (`tenant_id: u64`)                                                                                          | An org, user, or namespace — an opaque integer the gateway doesn't interpret |
+| **Dialect**                                      | A provider attribute (OpenAI-wire vs Anthropic-wire) driving usage parsing; for a bare-path request it's derived from the path to pick the default provider | The provider — a prefixed request uses its provider's dialect, not the path  |
+| **Provider**                                     | The request's **first path segment** (`/{provider}/…`); a named row in the routing table: authority, dialect, auth scheme                                   | A vendor relationship — just connection facts and auth wiring                |
+| **Deny-set**                                     | Sparse map of denied `tenant_id`s → reason; gates managed traffic; default-allow                                                                            | An allowlist or ACL — misses are allowed, not blocked                        |
+| **Tail tap**                                     | Bounded 64KB window kept from the end of the response for usage extraction                                                                                  | A buffer or copy — the response is relayed unbuffered; only the tail is kept |
+| **Snapshot**                                     | On-disk deny-set cache (entries + NATS cursor) for edge/tunnel deployments                                                                                  | Persistent store — a pure cache; delete it and the gateway re-scans NATS     |
+| **Virtual key** (`bai_v1.{kid}.{payload}.{sig}`) | Ed25519-signed token encoding `tenant_id` + `vpc_id` (16-byte fixed payload)                                                                                | A session or auth token — stateless, no server-side lookup, no revocation    |
+
+---
+
+## Data Flow
+
+### Happy Path
+
+```
+Client (stock OpenAI/Anthropic SDK)
+  │
+  ▼  request_filter (proxy.rs)
+  │  ├─ Route: first segment → provider row (authority, dialect, auth scheme)
+  │  ├─ Extract key from Authorization: Bearer or x-api-key
+  │  ├─ Rate guardrails (BEFORE verify — keeps forged-key floods at ns cost)
+  │  │    per-credential count-min  ──────────────────────────────► 429
+  │  │    global BYO aggregate (managed exempt)  ─────────────────► 429
+  │  ├─ Content-Length abuse guard  ──────────────────────────────► 413
+  │  └─ Identity branch:
+  │       bai_v1.…  → Ed25519 verify → deny-set check (O(1))
+  │       │               │                    │
+  │       │             401 (bad sig)     402 Spend / 403 Fraud
+  │       │                                    │
+  │       │           pool key required ───────────────────────── 503
+  │       └─ BYO: pass through (no verify, no deny-set, no billing)
+  │  └─ Circuit breaker (per provider, all traffic): if OPEN ─────► 503
+  │       (claims a half-open probe permit only on an actual attempt)
+  │
+  ▼  upstream_peer (proxy.rs)
+  │  TTL-cached DNS resolve (60s) → HttpPeer (TLS, H2 pref, timeouts)
+  │  DNS fail ──────────────────────────────────────────────────── 502
+  │  TCP connect fail (retry 2×) ──────────────────────────────── 502
+  │
+  ▼  upstream_request_filter (proxy.rs)
+  │  Managed: remove both auth headers → inject pool key
+  │  BYO: leave auth header unchanged
+  │  Set Host; forward path verbatim (/{provider} prefix stripped)
+  │
+  ▼  request_body_filter (proxy.rs)  — body streamed through, never buffered
+  │  Feed chunks → ModelScanner (peek.rs) — extract root-level `model`, O(1) mem
+  │  Enforce running size cap (chunked-safe) ──────────────────── 413
+  │  Injection-eligible (managed OpenAI chat/responses + stream):
+  │    buffer full body → inject stream_options.include_usage → re-frame chunked
+  │
+  ▼  Provider upstream  (OpenAI / Anthropic / Groq / DeepSeek / …)
+  │
+  ▼  response_filter (proxy.rs)
+  │  Record TTFT; detect streaming (Content-Type: text/event-stream)
+  │  Count upstream response by provider + status class
+  │  Set x-beyond-request-id header
+  │
+  ▼  response_body_filter (proxy.rs)  — response relayed chunk-by-chunk, never buffered
+  │  Feed chunks → ModelScanner over response head → extract billed model
+  │  Append to bounded 64KB tail (compact drain(..half) if tail > 128KB)
+  │
+  ▼  logging (proxy.rs)
+     Parse usage from tail (by dialect + streaming flag)
+     Emit ai.usage fact: tenant, vpc, model, requested_model, token counts (managed only)
+     Record circuit-breaker outcome (once): 5xx / connect-fail → failure; else → success (429 incl.)
+     Decrement requests_in_flight gauge
+```
+
+### Background: Deny-Set Watcher
+
+```
+NATS (blackhole.* KV entries)
+  │
+  ▼  store_watch.rs (Pingora BackgroundService)
+  │  On connect: seed from disk snapshot (if snapshot_path set) or full NATS scan
+  │  Resume watch from saved revision (gap-free — no entry lost mid-connect)
+  │  Reconnect backoff: 1s → 30s exponential
+  │
+  ▼  ArcSwap<DenySet>  (state.rs)
+     Lock-free read on every managed request
+     Written only by the watcher on entry add/remove
+```
+
+---
+
+## Core Mechanism
+
+### Routing (`route.rs`)
+
+Providers are **data rows**, not code paths. `KNOWN_PROVIDERS` in `route.rs` lists 10 built-in
+providers (openai, anthropic, openrouter, fireworks, groq, deepseek, together, cerebras, mistral,
+xai); each row carries its authority (host:port), dialect (OpenAI-wire vs Anthropic-wire), and auth
+scheme (Bearer vs x-api-key). The `provider_authorities` config key adds or overrides rows at boot
+with zero code change.
+
+The routing rule: **first path segment = provider name**. `/groq/openai/v1/chat/completions` routes
+to Groq and forwards `/openai/v1/chat/completions` verbatim. A bare `/v1/…` path is routed to the
+default provider for its path-derived dialect (OpenAI or Anthropic). Unknown segment → 404. Model
+is not known at peer-selection time and is never used for routing.
+
+### Identity (`key.rs`)
+
+Virtual key format: `bai_v1.{kid}.{payload}.{sig}` where payload is exactly 16 bytes (8-byte
+`tenant_id` + 8-byte `vpc_id`, little-endian u64). Verification is **stateless Ed25519** — no
+database, no network call. The keyring holds multiple `kid` → public key mappings simultaneously
+(zero-downtime rotation: add the new kid, deploy, remove the old kid). A tampered or forged key
+falls through to BYO treatment; it does not error in a way that reveals which part failed.
+
+Verification cost ≈ 28µs per request — this is the gateway's only meaningful per-request CPU cost
+(everything else runs in nanoseconds; see Benchmarking). The rate guardrails sit **before** verify
+precisely because of this: a forged-key flood is rejected in tens of nanoseconds, not 28µs each.
+
+### Model Extraction (`peek.rs:ModelScanner`)
+
+A streaming structural scanner fed body or response chunks as they arrive. Tracks JSON nesting
+depth, string-escape state, and quote boundaries. Captures the **root-level `model` field only**
+(depth 0 in the object), ignoring nested `model` keys in tool calls or message content.
+SIMD-accelerated via `memchr2` to skip over large string values (base64-encoded images, long
+prompts). O(1) memory: one struct, no heap growth with payload size — proven by the unit bench
+which shows a single allocation independent of whether the body is 0 bytes, 4 KB, or 256 KB.
+
+The billing fact carries **two model fields**:
+
+- `requested_model` — what the client sent (extracted from the request body)
+- `model` — what the provider resolved and billed (extracted from the response head; falls back to
+  `requested_model` when the response carries no model field, e.g. an error body)
+
+`model` is what reconciles against the provider's invoice (which itemizes by pinned snapshot, e.g.
+`gpt-4o-2024-08-06`, not alias). `requested_model` serves product analytics and as a fallback rate
+when the snapshot is newer than the downstream price table.
+
+### Usage Extraction (`usage.rs`)
+
+The tail tap feeds the parser after `logging` fires. Two dialects:
+
+| Dialect   | Format     | Fields                                                                                                            |
+| --------- | ---------- | ----------------------------------------------------------------------------------------------------------------- |
+| OpenAI    | JSON body  | `usage.prompt_tokens`, `usage.completion_tokens`, `usage.prompt_tokens_details.cached_tokens`                     |
+| OpenAI    | SSE stream | Terminal `data:` line (before `[DONE]`), same fields                                                              |
+| Anthropic | JSON body  | `usage.input_tokens`, `usage.output_tokens`, `usage.cache_read_input_tokens`, `usage.cache_creation_input_tokens` |
+| Anthropic | SSE stream | `message_delta` event with `usage` block                                                                          |
+
+Missing usage fields deserialize to zero (safe default). If the tail is truncated by the
+compaction drain, the usage chunk is still present because SSE usage always arrives at the very
+end of the stream (just before the terminator) and the tail keeps the last 64KB.
+
+### Deny-Set (`deny.rs`)
+
+A `HashMap<u64, DenyReason>` (tenant_id → reason). Only denied tenants are stored — the map is
+`O(denied)` in memory regardless of total tenant count. Lookup is one hash probe. Written
+exclusively by the NATS watcher via `ArcSwap`; reads on the hot path are lock-free.
+
+Reasons: `Spend` (→ 402), `Fraud` (→ 403), `Unknown` (→ 403, fail-safe for unrecognized values).
+Restore = explicit delete from NATS KV or TTL expiry — no gateway-side timer.
+
+### Rate Guardrails (`ratelimit.rs`)
+
+Two fixed-memory count-min sketch tiers, checked before Ed25519 verify and before any upstream
+connection:
+
+| Tier                 | Key             | Bucket count | Default ceiling | Managed exempt? |
+| -------------------- | --------------- | ------------ | --------------- | --------------- |
+| Per-credential       | Hash of raw key | 5 MB sketch  | 100 req/s       | No              |
+| Global BYO aggregate | Single bucket   | 1 bucket     | 1000 req/s      | **Yes**         |
+
+The per-credential tier is keyed on the **raw presented credential** (not the verified tenant),
+which has two consequences: (1) the guard sits ahead of verify, so forged tokens are rejected
+before any crypto work; (2) virtual keys are deterministic per `(tenant_id, vpc_id)`, so this is
+effectively per-(tenant, VPC) granularity without a registry lookup.
+
+The global BYO aggregate exists because BYO traffic exits from the gateway's own egress IPs
+carrying the caller's raw token. A flood of distinct junk BYO tokens each get their own
+per-credential bucket and slip through that tier — the aggregate caps total BYO egress rate to
+protect the gateway's IP reputation with providers. Managed traffic is exempt because it's verified
+before any upstream connection and cannot be forged.
+
+Both tiers are generous safety ceilings, not quotas (and distinct from the per-provider circuit
+breaker). Setting `rate_limit_rps = 0` or `byo_rate_limit_rps = 0` disables each independently.
+
+### Circuit Breaker (`circuit_breaker.rs`)
+
+A per-provider, lock-free circuit breaker (single packed `AtomicU64`; windowed failure policy) sits
+on the upstream path. It protects against a **broken provider**, which is a different failure than
+the rate guardrails (which protect against abusive _inbound_ load):
+
+- **Failure = the provider is broken** — a `5xx` response or a connect failure. After
+  `circuit_breaker_threshold` failures within `circuit_breaker_window_secs`, the breaker **opens**:
+  requests to that provider fast-fail with `503` (`ai_rejections_total{reason="circuit_open"}`)
+  instead of piling up against `read_timeout_secs` and exhausting connection / in-flight slots for
+  _every_ provider (head-of-line blocking by one sick dependency). After `circuit_breaker_reset_secs`
+  it half-opens and admits a probe; success closes it, failure reopens it.
+- **A `429` is NOT a failure.** It means the provider is healthy and throttling our pool key — a
+  velocity/spend signal the rate limiter and the client's `Retry-After` backoff own. Tripping on it
+  would convert a self-healing throttle into a self-inflicted outage. The breaker records any response
+  that _arrived_ (2xx/3xx/4xx incl. 429) as a **success**; only 5xx and transport failures count
+  against it.
+- **Applies to all traffic** (managed + BYO) — a down provider is down regardless of whose key is
+  used. One breaker per provider, built at boot, shared lock-free across callers.
+- The `allow()` check is the **last** thing in `request_filter` (after every other rejection), so a
+  scarce half-open probe permit is only claimed for a request that will actually attempt the upstream;
+  the outcome is recorded exactly once in `logging`, so a permit can never leak.
+- `circuit_breaker_threshold = 0` disables it.
+
+---
+
+## Why It Behaves This Way
+
+### Why rate guardrails sit before Ed25519 verify
+
+Ed25519 verify is ~28µs — roughly 350–650× more expensive than every other per-request operation.
+A flood of forged `bai_v1` tokens could drive unbounded crypto work if the rate limit came after
+verify. By checking the per-credential bucket first (keyed on the raw token, no crypto), a
+forged-key flood is rejected in tens of nanoseconds per request. Legit traffic is unaffected: the
+rate guard passes through, then verify runs as normal. The unit bench (`benches/unit.rs`) asserts
+this: `key/verify` ≈ 28µs; `ratelimit::check` ≈ 43–83ns; 0 allocations for either.
+
+### Why the body injection exception exists (`managed + OpenAI + streaming`)
+
+OpenAI streams no usage chunk unless `stream_options.include_usage: true` is set. Without it, a
+streaming managed request is unmeterable: no usage block in the response means no billing fact. The
+gateway injects this field server-side so callers using stock SDKs get metered without any
+cooperation. The request is buffered (`MAX_REQUEST_BODY` cap), the field injected, and the body
+re-framed as chunked upstream. Scoped to managed + OpenAI-dialect + streaming only — BYO and
+non-streaming requests remain pure passthrough.
+
+### Why the deny-set watch resumes from a saved revision
+
+A plain `watch_prefix` (NATS `DeliverPolicy::New`) would miss any entry written in the window
+between the initial seed scan and the live watch attaching. `store_watch.rs` records the stream
+revision at which the seed was complete and calls `watch_prefix_from` to resume from that revision
+— so a deny written during the gap is delivered, not silently dropped. This revision is also
+persisted across reconnects, so a NATS blip resumes from the last-seen point instead of re-scanning
+the entire keyspace.
+
+### Why BYO token validity is never checked
+
+Checking a BYO token requires a round-trip to the provider. The provider does that check anyway and
+returns 401 if the token is invalid — the client sees the same rejection it would get going direct,
+just routed through the gateway. A gateway-side preflight check would add a provider round-trip to
+every BYO request (valid or not) with no security benefit at the gateway layer.
+
+### Why pricing is absent from the gateway
+
+The gateway emits token _facts_ (`ai.usage`): counts and model identifiers. Applying prices to
+those facts is a downstream concern. Provider pricing changes frequently, varies by contract tier,
+and is sometimes retroactively corrected on invoices. A downstream consumer can reprice historical
+facts; the gateway's facts cannot be regenerated once the request is gone.
+
+### Why routing uses the first path segment, not a header
+
+Path-based routing makes the target provider explicit in every request URL — visible in logs,
+traces, and curl output without inspecting headers. It also survives transparent proxies and load
+balancers that strip custom headers. A `/{provider}/` prefix was preferred over a separate header
+because SDKs already let callers set the base URL; swapping in the gateway's URL with a provider
+prefix requires no SDK modification.
+
+---
+
+## Trust Boundaries
+
+**What the gateway verifies (rejects if invalid):**
+
+- Virtual key signature (Ed25519, stateless — no DB lookup)
+- Virtual key format (`bai_v1.{kid}.{payload}.{sig}`, fixed 16-byte payload)
+- Tenant not in deny-set (managed traffic only; O(1) HashMap lookup)
+- Pool key configured for the requested provider (managed traffic only — else 503)
+- Request body size ≤ `MAX_REQUEST_BODY` (declared `Content-Length` + streaming running total)
+- Per-credential request rate within ceiling; aggregate BYO rate within ceiling
+
+**What passes through unchecked:**
+
+- Request body content and schema — no validation at the gateway layer
+- Model name in the request — extracted for billing facts, never validated against an allowlist
+- Provider response content — relayed byte-for-byte
+- BYO token validity — forwarded as-is; the provider rejects it if invalid
+- `vpc_id` in the virtual key — decoded and emitted in billing facts, not used for access control
+
+**Why these boundaries are where they are:**
+
+- Body schema validation belongs to the provider — duplicate validation adds latency without a
+  security benefit at the gateway layer
+- Model allowlisting would require a per-provider list coupled to model release cadence
+- BYO token validation requires a provider round-trip — the provider does it anyway
+
+---
+
+## Configuration
+
+All fields configurable via `config.example.toml` and environment (`AI_` prefix, flat merge).
+Secret-bearing fields (`pool_keys`, `nats_creds`) are held as `Secret<T>` — stray `Debug` or
+`Serialize` output redacts to `"***"` and the value is zeroized on drop (`secret.rs`).
+
+| Field                           | Default                           | Runtime Effect                                                                                                                                                                                         |
+| ------------------------------- | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `signing_keys`                  | _(none)_                          | Map of kid → base64 Ed25519 public key. Multiple kids enable rotation. Missing → all traffic falls through to BYO treatment.                                                                           |
+| `require_signing_keys`          | `false`                           | When `true`, an empty `signing_keys` is a hard boot failure instead of silent BYO-only mode. Set on managed deployments so a typo'd/absent SSM param fails fast rather than silently serving for free. |
+| `pool_keys.<name>`              | _(from `AI_POOL_KEY_<NAME>` env)_ | Real provider API key. Missing for a provider → managed requests to that provider return 503 before any upstream connection.                                                                           |
+| `provider_authorities.<name>`   | _(none)_                          | Override or add a provider's `authority` (host:port). Enables config-added providers beyond `KNOWN_PROVIDERS` with zero code change.                                                                   |
+| `snapshot_path`                 | _(unset)_                         | Path for the on-disk deny-set cache. Unset → re-scan NATS on every cold boot. Set → load from disk and enforce before NATS reconnects (edge/tunnel deployments).                                       |
+| `rate_limit_rps`                | `100`                             | Per-credential request ceiling (count-min, keyed on raw key hash). `0` disables. Exceeded → 429. Checked before Ed25519 verify.                                                                        |
+| `byo_rate_limit_rps`            | `1000`                            | Aggregate ceiling for all BYO traffic (single shared bucket). `0` disables. Managed traffic exempt. Exceeded → 429.                                                                                    |
+| `circuit_breaker_threshold`     | `20`                              | Per-provider upstream failures (5xx / connect; **not** 429) within the window before the breaker opens. While open, requests to that provider fast-fail with 503. `0` disables.                        |
+| `circuit_breaker_window_secs`   | `10`                              | Rolling window over which failures are counted (trips on a burst, not a slow trickle).                                                                                                                 |
+| `circuit_breaker_reset_secs`    | `30`                              | How long the breaker stays open before admitting a half-open probe. Probe success closes it; failure reopens it.                                                                                       |
+| `connect_timeout_secs`          | `10`                              | TCP connect timeout to the upstream provider. Exceeded → retry up to 2×, then 502.                                                                                                                     |
+| `read_timeout_secs`             | `600`                             | Response read timeout (10 min accommodates long-running LLM streams).                                                                                                                                  |
+| `write_timeout_secs`            | `60`                              | Upstream request-write timeout (sending the request to the provider).                                                                                                                                  |
+| `idle_timeout_secs`             | `90`                              | Idle timeout on a pooled upstream connection before it's closed.                                                                                                                                       |
+| `shutdown_grace_period_secs`    | `600`                             | SIGTERM drain window for in-flight requests (= `read_timeout_secs` so a deploy never truncates a stream). Capped by the orchestrator's stop timeout (ECS Fargate: 120s).                               |
+| `shutdown_runtime_timeout_secs` | `10`                              | Final runtime-teardown backstop after the drain window.                                                                                                                                                |
+| `nats_url`                      | `nats://localhost:4222`           | NATS server for the deny-set watcher. Unreachable → fail-open (deny-set stays empty or stale).                                                                                                         |
+| `nats_creds`                    | _(unset)_                         | NATS credentials file path. Required for authenticated clusters.                                                                                                                                       |
+| `listen_addr`                   | `0.0.0.0:8080`                    | Proxy listener address (client traffic).                                                                                                                                                               |
+| `metrics_listen`                | `0.0.0.0:9090`                    | Internal admin/observability listener: `/metrics` (Prometheus scrape), `/livez`, `/readyz`. Separate from the client listener — not externally reachable.                                              |
+
+---
+
+## Failure Modes
+
+| Failure                                     | What Actually Happens                                                                                                                                                          | Recovery                                                                                                                                                                  |
+| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| NATS unreachable at boot                    | Deny-set starts empty (fail-open). Auth still works — keys from config.                                                                                                        | Watcher reconnects; seeds from NATS or disk snapshot on connect.                                                                                                          |
+| NATS disconnects mid-run                    | Last-known deny-set stays active. New deny entries not applied until reconnect.                                                                                                | Watcher reconnects (1s→30s exponential backoff, reset on success) and resumes from saved revision — no re-scan.                                                           |
+| NATS history compacted past snapshot cursor | `CursorExpired` → full re-scan from current NATS state.                                                                                                                        | After re-scan, new cursor set; delta watch resumes normally.                                                                                                              |
+| Virtual key tampered or forged              | Ed25519 verify fails → falls through to BYO treatment. No billing event. No error reveals which part failed.                                                                   | Billing miss detectable downstream; no security boundary breach.                                                                                                          |
+| `signing_keys` absent (typo'd/missing SSM)  | Default: warn + BYO-only (silently drops all managed billing + deny-set). With `require_signing_keys=true`: hard boot failure.                                                 | Set `require_signing_keys=true` on managed deployments so the mis-deploy fails fast and visibly at boot.                                                                  |
+| Pool key missing for provider               | Managed request returns 503 before any upstream connection.                                                                                                                    | Add `AI_POOL_KEY_<NAME>` env and redeploy.                                                                                                                                |
+| Provider DNS fails                          | `upstream_peer` returns error → 502 to client.                                                                                                                                 | TTL-cached DNS (60s) serves stale; poisoned-lock guard re-resolves on next request.                                                                                       |
+| Provider TCP connect fails                  | `fail_to_connect` retries up to 2×, then returns 502. Counts as a circuit-breaker failure.                                                                                     | Client SDK retries with backoff. No HTTP-status retries (Pingora-idiomatic).                                                                                              |
+| Provider brownout (sustained 5xx)           | After `circuit_breaker_threshold` 5xx/connect failures in the window, the breaker opens; requests fast-fail 503 (`circuit_open`) instead of stalling against the read timeout. | Auto: after `circuit_breaker_reset_secs` a half-open probe is admitted — success closes the breaker, failure reopens it. Per-provider, so other providers are unaffected. |
+| Provider throttles (429 storm)              | Relayed to the client as 429; the client's `Retry-After` backoff applies. Does **not** trip the breaker (provider is healthy).                                                 | Backpressure via client + the rate guardrails; no gateway-side circuit action.                                                                                            |
+| Response body > 128KB before usage chunk    | Tail compaction fires: `drain(..half)` discards first half, keeps tail. Usage extracted from retained tail.                                                                    | No action — SSE usage is always in the final `data:` line, which always lands in the tail.                                                                                |
+| Gateway crash mid-request                   | In-flight request drops; client receives TCP close. No partial state written.                                                                                                  | Client SDK retries. No DB writes in the request path — no cleanup needed.                                                                                                 |
+
+---
+
+## Metrics
+
+Prometheus on the default registry, exposed at `/metrics` on `metrics_listen`.
+
+| Metric                        | Type      | Labels               | What It Measures                                                                       |
+| ----------------------------- | --------- | -------------------- | -------------------------------------------------------------------------------------- |
+| `ai_requests_total`           | Counter   | —                    | Total admitted requests                                                                |
+| `ai_rejections_total`         | Counter   | `reason`             | Rejected requests by cause (auth, deny_spend, deny_fraud, rate_limit, circuit_open, …) |
+| `ai_upstream_responses_total` | Counter   | `provider`, `status` | Upstream responses by provider and status class                                        |
+| `ai_tokens_total`             | Counter   | `kind`               | input / output / cache_read / cache_write token counts                                 |
+| `ai_ttft_seconds`             | Histogram | `provider`           | Time to first token (50ms–30s buckets)                                                 |
+| `ai_upstream_latency_seconds` | Histogram | `provider`           | Full request latency (100ms–600s buckets)                                              |
+| `ai_active_streams`           | Gauge     | —                    | Open SSE streams                                                                       |
+| `ai_requests_in_flight`       | Gauge     | —                    | All in-flight requests (streaming + non-streaming)                                     |
+| `ai_deny_set_size`            | Gauge     | —                    | Current number of denied tenants                                                       |
+| `ai_nats_connected`           | Gauge     | —                    | 1 if NATS watcher is connected, 0 otherwise                                            |
+
+---
+
+## Modules
+
+| Module            | Role                                                                                                 | Tested         |
+| ----------------- | ---------------------------------------------------------------------------------------------------- | -------------- |
+| `proxy`           | `ProxyHttp` impl — request/response pipeline (request_filter through logging)                        | e2e ✓          |
+| `key`             | `bai_v1` parse + Ed25519 verify + mint; keyring with multi-kid rotation support                      | unit ✓         |
+| `route`           | Data-driven provider table (name / authority / auth) + dialect default routing                       | unit ✓         |
+| `peek`            | `ModelScanner` — streaming structural scan for the root-level `model`; O(1) memory                   | unit ✓         |
+| `usage`           | Token extraction (OpenAI / Anthropic, body + SSE)                                                    | unit ✓         |
+| `deny`            | Sparse deny-set, default-allow, reason → HTTP status                                                 | unit ✓         |
+| `ratelimit`       | Two-tier guardrail: per-credential + global BYO (count-min sketches, fixed memory, no GC)            | unit ✓         |
+| `circuit_breaker` | Per-provider lock-free breaker (packed `AtomicU64`, windowed policy) — trips on 5xx/connect, not 429 | unit ✓ + e2e ✓ |
+| `state`           | Keyring + resolved provider registry + watched deny-set (ArcSwap) + TTL DNS cache                    | unit ✓         |
+| `store_watch`     | NATS watcher — gap-free deny-set seeding + delta watch as Pingora `BackgroundService`                | e2e ✓          |
+| `config`          | Figment config; build keyring; pool keys / authorities by provider name                              | unit ✓         |
+| `secret`          | Redacting, zeroize-on-drop `Secret<T>` newtype for pool keys and NATS creds                          | unit ✓         |
+| `admin`           | `ServeHttp` on the metrics listener: `/livez`, `/readyz`, `/metrics`                                 | e2e ✓          |
+| `metrics`         | Prometheus counter/histogram/gauge registration and update helpers                                   | compile ✓      |
+| `doctor`          | Boot-time diagnostics (`beyond-ai doctor`)                                                           | compile ✓      |
+| `main`            | CLI (`run` / `doctor`), rustls init, config load, Pingora server + three services bootstrap          | compile ✓      |
+
+---
+
+## Verification
+
+- **Unit (`cargo test --lib`):** key, route, peek, usage, deny, ratelimit, circuit_breaker, state,
+  config, secret. `clippy --all-targets -D warnings` clean.
+- **End-to-end (`tests/e2e.rs`, `mise run test:integration:rs`):** real `beyond-ai` binary + real
+  nats-server + mock upstream. Covers managed key-swap + passthrough fidelity + usage metering
+  (OpenAI JSON + SSE, **Anthropic `/v1/messages`** with `x-api-key` swap + metering), **BYO
+  passthrough** (raw token unchanged), the **virtual key in either inbound header** (`Bearer` or
+  `x-api-key`), and deny-set propagation: spend (write `blackhole.{tenant}` → 402, delete → 200)
+  and **fraud** (→ 403). Error/edge paths: **missing key → 401**, **oversized `Content-Length` →
+  413**, **managed key for an unconfigured provider → 503**, **streaming tail compaction** (>128KB
+  before the usage chunk still meters), **deny-set fail-open** (kill NATS → stale set retained,
+  auth still works), and **on-disk snapshot survival** (blackhole a tenant, restart with NATS down
+  → the hold is still enforced from disk).
+- **Live smoke (`tests/smoke.rs`, `mise run test:smoke`):** the real `beyond-ai` binary against the
+  **real** provider hosts over TLS, one per provider in `KNOWN_PROVIDERS`. Proves real TLS/SNI,
+  the `/v1` → base-path rewrite landing on a live mount (200, not 404), and BYO auth passthrough.
+  Every test is `#[ignore]` and skips unless its provider's API key env var is set — CI stays
+  hermetic; you only hit providers you have keys for.
+
+---
+
+## Benchmarking
+
+Two harnesses, mirroring the unit/e2e split of the tests. The framing is **Theory of Constraints**:
+a proxy's steady-state constraint is upstream I/O, not gateway CPU. The benches **prove the
+gateway's added cost is negligible and bounded** — i.e. it never becomes the constraint.
+
+- **Unit micro (`benches/unit.rs`, `mise run bench:unit`) — `divan`.** Times IO-free hot paths and
+  measures allocations natively (divan's `AllocProfiler` reports alloc/dealloc/grow count + bytes
+  beside ns/iter, no `unsafe` needed). Coverage: `key` verify/mint; `peek::ModelScanner` over
+  0/4KB/256KB bodies with `model` placed last (worst case); `usage` parsers; `route`; `deny`
+  (`parse_key`/`parse_reason` off-path + `reason()` on-path); `ratelimit::check` (managed tier
+  only vs. BYO which runs both tiers).
+
+  What the alloc numbers assert:
+  | Operation           | Cost     | Allocations                  | Claim verified                |
+  | ------------------- | -------- | ---------------------------- | ----------------------------- |
+  | `key/verify`        | ~28µs    | 0                            | Stack-only Ed25519 decode     |
+  | `peek/ModelScanner` | varies   | 1 (independent of body size) | O(1) memory                   |
+  | `route`             | ~ns      | 0                            | —                             |
+  | `deny::reason`      | ~1–8ns   | 0, flat 0→1M entries         | O(1) lookup, O(denied) memory |
+  | `ratelimit::check`  | ~43–83ns | 0                            | Fixed-memory count-min        |
+
+  **Headline: `key/verify` ≈ 28µs dwarfs every other fixed-cost per-request op (~350–650×
+  `ratelimit::check`).** This is why the rate guardrail sits before verify in `proxy::request_filter`.
+
+- **End-to-end (`benches/e2e.rs`, `mise run bench:e2e`) — `criterion`.** Real `beyond-ai` binary,
+  real nats-server, and mock upstream (reuses `tests/common`). Four decomposed cases:
+  `reject_missing_key_latency` (401, short-circuit before any upstream connection — transport floor),
+  `byo_json_latency` (pure passthrough), `managed_json_latency` (verify + deny + key swap),
+  `managed_sse_latency` (streaming response tap). Plus a concurrent-throughput group.
+
+  All four cases land in ~110–120µs on loopback with ±15–20µs jitter — larger than the gateway's
+  own CPU cost. This harness cannot resolve the verify cost (that's the unit bench's job). Its value:
+  catching gross regressions (a buffering mistake, a dropped connection pool, or an added O(n) path
+  would move the band by far more than 20µs) and tracking the RPS trend via `--save-baseline`.
+
+`mise run bench` runs both.
diff --git a/CLAUDE.md b/CLAUDE.md
index b7a270e..67e4bef 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -51,24 +51,3 @@ Apply the **Theory of Constraints**: a system's throughput is limited by its sin
 5. **Repeat.** The bottleneck has shifted. Go back to step 1.
 
 The corollary: if you can't name the current constraint, you aren't ready to optimize.
-
-<!-- wiki-managed:start (managed by `wiki claude install`; edits inside this block will be overwritten) -->
-
-## Wiki
-
-This repo uses [agent-wiki](.wiki/): `.wiki/` indexes repo markdown docs and code symbols into a queryable knowledge graph.
-
-**Read the wiki before grepping the codebase or reading ARCHITECTURE.md.** Pages are pre-indexed — searching them is faster and ~5–10× cheaper than re-deriving from raw files.
-
-Wiki tools — pick based on what you need:
-
-- `wiki_query "<term>"` — first move for any specific question. BM25++ over repo docs and code symbols; returns ranked hits with paths, scores, and inline snippets.
-- `wiki_answer "<question>"` — returns top-ranked pages with query-relevant passage extracts in one round-trip. Best when you expect the answer exists and want it immediately.
-- `wiki_read "path/to/page.md"` (optionally `section: "..."` or `paths: [...]`) — full page, one section, or multiple pages in one call.
-- `wiki_search_code "<query>"` — search exported symbols, signatures, and doc comments when you need to locate a declaration or understand an API.
-- `wiki_usage_examples "<symbol>"` — real call sites with surrounding source code. Use before changing a function (to see every calling convention you must preserve) or when learning how an unfamiliar API is actually used.
-- `wiki_impact "<symbol>"` — blast radius: every symbol that transitively calls this one, ranked by hop distance. Use before refactoring or renaming to know what breaks.
-- `wiki_callees "<symbol>"` — outgoing call hierarchy (rust-analyzer equivalent): every function this symbol transitively calls, ranked by hop distance. Use when you need to understand what a function depends on before touching it — its DB calls, service calls, and abstractions.
-- `wiki_implementors "<symbol>"` — go-to-implementations (rust-analyzer equivalent): every concrete type that implements a trait or interface. Use when you need to know what's behind a trait object, or how many types a trait change will affect.
-
-<!-- wiki-managed:end -->
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..1a9bf79
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,4504 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "addr2line"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+
+[[package]]
+name = "ahash"
+version = "0.8.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
+dependencies = [
+ "cfg-if",
+ "getrandom 0.3.4",
+ "once_cell",
+ "version_check",
+ "zerocopy",
+]
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "aliasable"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd"
+
+[[package]]
+name = "alloc-no-stdlib"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
+
+[[package]]
+name = "alloc-stdlib"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
+dependencies = [
+ "alloc-no-stdlib",
+]
+
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
+[[package]]
+name = "anstream"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
+
+[[package]]
+name = "anstyle-parse"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.102"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+
+[[package]]
+name = "arc-swap"
+version = "1.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
+[[package]]
+name = "asn1-rs"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048"
+dependencies = [
+ "asn1-rs-derive",
+ "asn1-rs-impl",
+ "displaydoc",
+ "nom",
+ "num-traits",
+ "rusticata-macros",
+ "thiserror 1.0.69",
+ "time",
+]
+
+[[package]]
+name = "asn1-rs-derive"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "synstructure",
+]
+
+[[package]]
+name = "asn1-rs-impl"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "async-nats"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df5af9ebfb0a14481d3eaf6101e6391261e4f30d25b26a7635ade8a39482ded0"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-util",
+ "memchr",
+ "nkeys",
+ "nuid",
+ "once_cell",
+ "pin-project",
+ "portable-atomic",
+ "rand 0.8.6",
+ "regex",
+ "ring",
+ "rustls-native-certs 0.7.3",
+ "rustls-pki-types",
+ "rustls-webpki 0.102.8",
+ "serde",
+ "serde_json",
+ "serde_nanos",
+ "serde_repr",
+ "thiserror 1.0.69",
+ "time",
+ "tokio",
+ "tokio-rustls",
+ "tokio-stream",
+ "tokio-util",
+ "tokio-websockets",
+ "tracing",
+ "tryhard",
+ "url",
+]
+
+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "atomic"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340"
+dependencies = [
+ "bytemuck",
+]
+
+[[package]]
+name = "atomic-waker"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+
+[[package]]
+name = "autocfg"
+version = "1.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53"
+
+[[package]]
+name = "aws-lc-rs"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ec2f1fc3ec205783a5da9a7e6c1509cc69dedf09a1949e412c1e18469326d00"
+dependencies = [
+ "aws-lc-sys",
+ "zeroize",
+]
+
+[[package]]
+name = "aws-lc-sys"
+version = "0.41.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a2f9779ce85b93ab6170dd940ad0169b5766ff848247aff13bb788b832fe3f4"
+dependencies = [
+ "cc",
+ "cmake",
+ "dunce",
+ "fs_extra",
+]
+
+[[package]]
+name = "backtrace"
+version = "0.3.76"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6"
+dependencies = [
+ "addr2line",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+ "windows-link",
+]
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "base64ct"
+version = "1.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06"
+
+[[package]]
+name = "beyond-ai"
+version = "0.1.0"
+dependencies = [
+ "arc-swap",
+ "arrayvec",
+ "async-trait",
+ "base64",
+ "beyond-slipstream",
+ "bytes",
+ "clap",
+ "criterion",
+ "divan",
+ "ed25519-dalek",
+ "figment",
+ "getrandom 0.3.4",
+ "http",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "memchr",
+ "pingora",
+ "pingora-core",
+ "pingora-limits",
+ "pingora-proxy",
+ "prometheus",
+ "rcgen",
+ "reqwest",
+ "rustls",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.18",
+ "tokio",
+ "tokio-rustls",
+ "tracing",
+ "tracing-subscriber",
+ "zeroize",
+]
+
+[[package]]
+name = "beyond-slipstream"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b07e54aae9b02cf7d2e9d935bd99cbc4a045f19d00738f069f44ba238a01600"
+dependencies = [
+ "async-nats",
+ "async-trait",
+ "base64",
+ "crc32fast",
+ "futures",
+ "serde_json",
+ "tempfile",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "url",
+]
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bitflags"
+version = "2.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
+
+[[package]]
+name = "blake2"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe"
+dependencies = [
+ "digest",
+]
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "brotli"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d640d25bc63c50fb1f0b545ffd80207d2e10a4c965530809b40ba3386825c391"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+ "brotli-decompressor",
+]
+
+[[package]]
+name = "brotli-decompressor"
+version = "2.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+]
+
+[[package]]
+name = "bstr"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
+dependencies = [
+ "memchr",
+ "regex-automata",
+ "serde",
+]
+
+[[package]]
+name = "bumpalo"
+version = "3.20.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649"
+
+[[package]]
+name = "bytemuck"
+version = "1.25.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "bytes"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
+[[package]]
+name = "cc"
+version = "1.2.63"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f"
+dependencies = [
+ "find-msvc-tools",
+ "jobserver",
+ "libc",
+ "shlex",
+]
+
+[[package]]
+name = "cf-rustracing"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6565523d8145e63e0cf1b397a5f1bd4e90d5652a7dffb2de8cec460ff23ef6b1"
+dependencies = [
+ "backtrace",
+ "rand 0.10.1",
+ "tokio",
+ "trackable",
+]
+
+[[package]]
+name = "cf-rustracing-jaeger"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16c0e4d8cce27f6a6eaff58d2b66f063a18b8ed0d6ef0947ae7a263afa3b7c08"
+dependencies = [
+ "cf-rustracing",
+ "hostname",
+ "local-ip-address",
+ "percent-encoding",
+ "rand 0.10.1",
+ "thrift_codec",
+ "tokio",
+ "trackable",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
+[[package]]
+name = "chacha20"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.3.0",
+ "rand_core 0.10.1",
+]
+
+[[package]]
+name = "chrono"
+version = "0.4.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
+[[package]]
+name = "clap"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+ "terminal_size",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
+dependencies = [
+ "heck 0.5.0",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "clap_lex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
+
+[[package]]
+name = "cmake"
+version = "0.1.58"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "colorchoice"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
+
+[[package]]
+name = "combine"
+version = "4.6.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd"
+dependencies = [
+ "bytes",
+ "memchr",
+]
+
+[[package]]
+name = "condtype"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf0a07a401f374238ab8e2f11a104d2851bf9ce711ec69804834de8af45c7af"
+
+[[package]]
+name = "const-oid"
+version = "0.9.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
+
+[[package]]
+name = "core-foundation"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "core-foundation"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "cpufeatures"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "crc32fast"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "futures",
+ "is-terminal",
+ "itertools",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "tokio",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-queue"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
+[[package]]
+name = "crypto-common"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+
+[[package]]
+name = "curve25519-dalek"
+version = "4.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.2.17",
+ "curve25519-dalek-derive",
+ "digest",
+ "fiat-crypto",
+ "rustc_version",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "curve25519-dalek-derive"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "daemonize"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab8bfdaacb3c887a54d41bdf48d3af8873b3f5566469f8ba21b92057509f116e"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "daggy"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70def8d72740e44d9f676d8dab2c933a236663d86dd24319b57a2bed4d694774"
+dependencies = [
+ "petgraph",
+]
+
+[[package]]
+name = "darling"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee"
+dependencies = [
+ "darling_core",
+ "darling_macro",
+]
+
+[[package]]
+name = "darling_core"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e"
+dependencies = [
+ "fnv",
+ "ident_case",
+ "proc-macro2",
+ "quote",
+ "strsim",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "darling_macro"
+version = "0.20.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
+dependencies = [
+ "darling_core",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "data-encoding"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8"
+
+[[package]]
+name = "der"
+version = "0.7.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
+dependencies = [
+ "const-oid",
+ "pem-rfc7468",
+ "zeroize",
+]
+
+[[package]]
+name = "der-parser"
+version = "9.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553"
+dependencies = [
+ "asn1-rs",
+ "displaydoc",
+ "nom",
+ "num-bigint",
+ "num-traits",
+ "rusticata-macros",
+]
+
+[[package]]
+name = "deranged"
+version = "0.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
+dependencies = [
+ "powerfmt",
+ "serde_core",
+]
+
+[[package]]
+name = "derivative"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "derive_builder"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947"
+dependencies = [
+ "derive_builder_macro",
+]
+
+[[package]]
+name = "derive_builder_core"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8"
+dependencies = [
+ "darling",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "derive_builder_macro"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
+dependencies = [
+ "derive_builder_core",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+ "subtle",
+]
+
+[[package]]
+name = "displaydoc"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "divan"
+version = "0.1.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a405457ec78b8fe08b0e32b4a3570ab5dff6dd16eb9e76a5ee0a9d9cbd898933"
+dependencies = [
+ "cfg-if",
+ "clap",
+ "condtype",
+ "divan-macros",
+ "libc",
+ "regex-lite",
+]
+
+[[package]]
+name = "divan-macros"
+version = "0.1.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9556bc800956545d6420a640173e5ba7dfa82f38d3ea5a167eb555bc69ac3323"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "dunce"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
+
+[[package]]
+name = "ed25519"
+version = "2.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
+dependencies = [
+ "pkcs8",
+ "signature",
+]
+
+[[package]]
+name = "ed25519-dalek"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9"
+dependencies = [
+ "curve25519-dalek",
+ "ed25519",
+ "serde",
+ "sha2",
+ "signature",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "either"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
+name = "errno"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "fastrand"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
+
+[[package]]
+name = "fiat-crypto"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
+
+[[package]]
+name = "figment"
+version = "0.10.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8cb01cd46b0cf372153850f4c6c272d9cbea2da513e07538405148f95bd789f3"
+dependencies = [
+ "atomic",
+ "pear",
+ "serde",
+ "toml",
+ "uncased",
+ "version_check",
+]
+
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
+
+[[package]]
+name = "fixedbitset"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+
+[[package]]
+name = "flate2"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
+dependencies = [
+ "crc32fast",
+ "libz-ng-sys",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
+[[package]]
+name = "form_urlencoded"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
+dependencies = [
+ "percent-encoding",
+]
+
+[[package]]
+name = "fs_extra"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
+
+[[package]]
+name = "futures"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-io"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"
+
+[[package]]
+name = "futures-macro"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
+
+[[package]]
+name = "futures-task"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
+
+[[package]]
+name = "futures-util"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "slab",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "wasi",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "r-efi 5.3.0",
+ "wasip2",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 6.0.0",
+ "rand_core 0.10.1",
+ "wasip2",
+ "wasip3",
+]
+
+[[package]]
+name = "getset"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9cf0fc11e47561d47397154977bc219f4cf809b2974facc3ccb3b89e2436f912"
+dependencies = [
+ "proc-macro-error2",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "gimli"
+version = "0.32.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7"
+
+[[package]]
+name = "h2"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "indexmap 2.14.0",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+
+[[package]]
+name = "half"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "zerocopy",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+dependencies = [
+ "foldhash 0.1.5",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.17.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
+]
+
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
+[[package]]
+name = "hex"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
+
+[[package]]
+name = "hostname"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "windows-link",
+]
+
+[[package]]
+name = "http"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0"
+dependencies = [
+ "bytes",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "httparse"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
+[[package]]
+name = "hyper"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "h2",
+ "http",
+ "http-body",
+ "httparse",
+ "httpdate",
+ "itoa",
+ "pin-project-lite",
+ "smallvec",
+ "tokio",
+ "want",
+]
+
+[[package]]
+name = "hyper-rustls"
+version = "0.27.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f"
+dependencies = [
+ "http",
+ "hyper",
+ "hyper-util",
+ "rustls",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-util"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "ipnet",
+ "libc",
+ "percent-encoding",
+ "pin-project-lite",
+ "socket2",
+ "tokio",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "icu_collections"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c"
+dependencies = [
+ "displaydoc",
+ "potential_utf",
+ "utf8_iter",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locale_core"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4"
+dependencies = [
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38"
+
+[[package]]
+name = "icu_properties"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de"
+dependencies = [
+ "icu_collections",
+ "icu_locale_core",
+ "icu_properties_data",
+ "icu_provider",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14"
+
+[[package]]
+name = "icu_provider"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421"
+dependencies = [
+ "displaydoc",
+ "icu_locale_core",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
+[[package]]
+name = "ident_case"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
+
+[[package]]
+name = "idna"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
+dependencies = [
+ "idna_adapter",
+ "smallvec",
+ "utf8_iter",
+]
+
+[[package]]
+name = "idna_adapter"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714"
+dependencies = [
+ "icu_normalizer",
+ "icu_properties",
+]
+
+[[package]]
+name = "indexmap"
+version = "1.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
+dependencies = [
+ "autocfg",
+ "hashbrown 0.12.3",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.17.1",
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "inlinable_string"
+version = "0.1.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8fae54786f62fb2918dcfae3d568594e50eb9b5c25bf04371af6fe7516452fb"
+
+[[package]]
+name = "ipnet"
+version = "2.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
+
+[[package]]
+name = "is-terminal"
+version = "0.4.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+
+[[package]]
+name = "jni"
+version = "0.22.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5efd9a482cf3a427f00d6b35f14332adc7902ce91efb778580e180ff90fa3498"
+dependencies = [
+ "cfg-if",
+ "combine",
+ "jni-macros",
+ "jni-sys",
+ "log",
+ "simd_cesu8",
+ "thiserror 2.0.18",
+ "walkdir",
+ "windows-link",
+]
+
+[[package]]
+name = "jni-macros"
+version = "0.22.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a00109accc170f0bdb141fed3e393c565b6f5e072365c3bd58f5b062591560a3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "rustc_version",
+ "simd_cesu8",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "jni-sys"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2"
+dependencies = [
+ "jni-sys-macros",
+]
+
+[[package]]
+name = "jni-sys-macros"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
+dependencies = [
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom 0.3.4",
+ "libc",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11"
+dependencies = [
+ "cfg-if",
+ "futures-util",
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
+[[package]]
+name = "libc"
+version = "0.2.186"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
+
+[[package]]
+name = "libz-ng-sys"
+version = "1.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be734b33b7bc6a42d92d23e25e69758f866cf564a88d0bf80866fcf5a52c2255"
+dependencies = [
+ "cmake",
+ "libc",
+]
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
+
+[[package]]
+name = "litemap"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
+
+[[package]]
+name = "local-ip-address"
+version = "0.6.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa08fb2b1ec3ea84575e94b489d06d4ce0cbf052d12acd515838f50e3c3d63e3"
+dependencies = [
+ "libc",
+ "neli",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
+[[package]]
+name = "log"
+version = "0.4.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
+
+[[package]]
+name = "lru"
+version = "0.16.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
+dependencies = [
+ "hashbrown 0.16.1",
+]
+
+[[package]]
+name = "lru-slab"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
+
+[[package]]
+name = "matchers"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
+dependencies = [
+ "regex-automata",
+]
+
+[[package]]
+name = "memchr"
+version = "2.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+
+[[package]]
+name = "memoffset"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
+dependencies = [
+ "adler2",
+ "simd-adler32",
+]
+
+[[package]]
+name = "mio"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda"
+dependencies = [
+ "libc",
+ "wasi",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "neli"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22f9786d56d972959e1408b6a93be6af13b9c1392036c5c1fafa08a1b0c6ee87"
+dependencies = [
+ "bitflags 2.11.1",
+ "byteorder",
+ "derive_builder",
+ "getset",
+ "libc",
+ "log",
+ "neli-proc-macros",
+ "parking_lot",
+]
+
+[[package]]
+name = "neli-proc-macros"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05d8d08c6e98f20a62417478ebf7be8e1425ec9acecc6f63e22da633f6b71609"
+dependencies = [
+ "either",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "nix"
+version = "0.24.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa52e972a9a719cecb6864fb88568781eb706bac2cd1d4f04a648542dbf78069"
+dependencies = [
+ "bitflags 1.3.2",
+ "cfg-if",
+ "libc",
+ "memoffset",
+]
+
+[[package]]
+name = "nkeys"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
+dependencies = [
+ "data-encoding",
+ "ed25519",
+ "ed25519-dalek",
+ "getrandom 0.2.17",
+ "log",
+ "rand 0.8.6",
+ "signatory",
+]
+
+[[package]]
+name = "no_debug"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f23a60c850e1144fc1dd9435152e0cfdc7dd18725350b4243584118013a52a4"
+
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
+[[package]]
+name = "nu-ansi-term"
+version = "0.50.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "nuid"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
+dependencies = [
+ "rand 0.8.6",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-conv"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441"
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "object"
+version = "0.37.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "oid-registry"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9"
+dependencies = [
+ "asn1-rs",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+
+[[package]]
+name = "openssl-probe"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
+
+[[package]]
+name = "openssl-probe"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
+
+[[package]]
+name = "ouroboros"
+version = "0.18.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59"
+dependencies = [
+ "aliasable",
+ "ouroboros_macro",
+ "static_assertions",
+]
+
+[[package]]
+name = "ouroboros_macro"
+version = "0.18.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c7028bdd3d43083f6d8d4d5187680d0d3560d54df4cc9d752005268b41e64d0"
+dependencies = [
+ "heck 0.4.1",
+ "proc-macro2",
+ "proc-macro2-diagnostics",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "parking_lot"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
+
+[[package]]
+name = "parking_lot_core"
+version = "0.9.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec",
+ "windows-link",
+]
+
+[[package]]
+name = "pear"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bdeeaa00ce488657faba8ebf44ab9361f9365a97bd39ffb8a60663f57ff4b467"
+dependencies = [
+ "inlinable_string",
+ "pear_codegen",
+ "yansi",
+]
+
+[[package]]
+name = "pear_codegen"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bab5b985dc082b345f812b7df84e1bef27e7207b39e448439ba8bd69c93f147"
+dependencies = [
+ "proc-macro2",
+ "proc-macro2-diagnostics",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "pem"
+version = "3.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be"
+dependencies = [
+ "base64",
+ "serde_core",
+]
+
+[[package]]
+name = "pem-rfc7468"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
+dependencies = [
+ "base64ct",
+]
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
+
+[[package]]
+name = "petgraph"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
+dependencies = [
+ "fixedbitset",
+ "indexmap 2.14.0",
+]
+
+[[package]]
+name = "pin-project"
+version = "1.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
+
+[[package]]
+name = "pingora"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "844a13b16e556293f4ea96dc5ac0923ac6f36855a9dfc13b640d0da183f6b5b7"
+dependencies = [
+ "pingora-cache",
+ "pingora-core",
+ "pingora-http",
+ "pingora-load-balancing",
+ "pingora-proxy",
+ "pingora-timeout",
+]
+
+[[package]]
+name = "pingora-cache"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c59d8c4c939a3a193a3da0e061aa7acf7432431f92ee62a26f5a9e5167a0ade2"
+dependencies = [
+ "ahash",
+ "async-trait",
+ "blake2",
+ "bstr",
+ "bytes",
+ "cf-rustracing",
+ "cf-rustracing-jaeger",
+ "hex",
+ "http",
+ "httparse",
+ "httpdate",
+ "indexmap 1.9.3",
+ "log",
+ "lru",
+ "once_cell",
+ "parking_lot",
+ "pingora-core",
+ "pingora-error",
+ "pingora-header-serde",
+ "pingora-http",
+ "pingora-lru",
+ "pingora-timeout",
+ "rand 0.8.6",
+ "regex",
+ "rmp",
+ "rmp-serde",
+ "serde",
+ "strum",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-core"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08973c4853cef4c682f7a592907e81a32dcad69476c4846e5de079f16448b177"
+dependencies = [
+ "ahash",
+ "async-trait",
+ "brotli",
+ "bstr",
+ "bytes",
+ "chrono",
+ "clap",
+ "daemonize",
+ "daggy",
+ "derivative",
+ "flate2",
+ "futures",
+ "h2",
+ "http",
+ "httparse",
+ "httpdate",
+ "libc",
+ "log",
+ "nix",
+ "once_cell",
+ "openssl-probe 0.1.6",
+ "ouroboros",
+ "parking_lot",
+ "percent-encoding",
+ "pingora-error",
+ "pingora-http",
+ "pingora-pool",
+ "pingora-runtime",
+ "pingora-rustls",
+ "pingora-timeout",
+ "prometheus",
+ "rand 0.8.6",
+ "regex",
+ "serde",
+ "serde_yaml",
+ "sfv",
+ "socket2",
+ "strum",
+ "strum_macros",
+ "tokio",
+ "tokio-stream",
+ "tokio-test",
+ "unicase",
+ "windows-sys 0.59.0",
+ "x509-parser",
+ "zstd",
+]
+
+[[package]]
+name = "pingora-error"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9fa97a500e7e5c27a7b8609b9294c8922c9656322285268bfad9520f12feb38"
+
+[[package]]
+name = "pingora-header-serde"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2705feb8b50d4e734e0c7d3879aa040e655a45656276323ff530e254585dd816"
+dependencies = [
+ "bytes",
+ "http",
+ "httparse",
+ "pingora-error",
+ "pingora-http",
+ "thread_local",
+ "zstd",
+ "zstd-safe",
+]
+
+[[package]]
+name = "pingora-http"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fbb52d4651b687fab6abf669539cfd97b7cd94b301fde8f57c63354f9c9cc5e2"
+dependencies = [
+ "bytes",
+ "http",
+ "pingora-error",
+]
+
+[[package]]
+name = "pingora-ketama"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0286fb5a0359dca1e2e137dfe14ca4d94f676635a5eae4616bb3d8d4ce06d120"
+dependencies = [
+ "crc32fast",
+]
+
+[[package]]
+name = "pingora-limits"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7568624fc0e2f11fa32d27053ac862048b40bad98140b07a11d82f1b4989700"
+dependencies = [
+ "ahash",
+]
+
+[[package]]
+name = "pingora-load-balancing"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2606e9e22e72927a69772cefe56b0d41d251c3ffdfcd548a6020fe157fb79ad"
+dependencies = [
+ "arc-swap",
+ "async-trait",
+ "derivative",
+ "fnv",
+ "futures",
+ "http",
+ "log",
+ "pingora-core",
+ "pingora-error",
+ "pingora-http",
+ "pingora-ketama",
+ "pingora-runtime",
+ "rand 0.8.6",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-lru"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91bb5030596a3d442c0866ac68afe29c14ba558e77c726dcdf7016b0dbb359d9"
+dependencies = [
+ "arrayvec",
+ "hashbrown 0.17.1",
+ "parking_lot",
+ "rand 0.8.6",
+]
+
+[[package]]
+name = "pingora-pool"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67f034be36772f318370d058913db43dbd22c3763ad974c995ba2e4afb2bb52a"
+dependencies = [
+ "crossbeam-queue",
+ "log",
+ "lru",
+ "parking_lot",
+ "pingora-timeout",
+ "thread_local",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-proxy"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e1e070a98a70d0d05f2fdcfb706237e06a043b2fbc9261e8772a3459cc2175e"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "clap",
+ "futures",
+ "h2",
+ "http",
+ "log",
+ "once_cell",
+ "pingora-cache",
+ "pingora-core",
+ "pingora-error",
+ "pingora-http",
+ "rand 0.8.6",
+ "regex",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-runtime"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e371315b1c44c2e5a8788fdc61577527b785e121e6ff49144755f40d86511430"
+dependencies = [
+ "once_cell",
+ "rand 0.8.6",
+ "thread_local",
+ "tokio",
+]
+
+[[package]]
+name = "pingora-rustls"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "239b663618bb822ddeddaf6d8384177a8ab226cb22febc627a72c2fd55e7bb75"
+dependencies = [
+ "log",
+ "no_debug",
+ "pingora-error",
+ "ring",
+ "rustls",
+ "rustls-native-certs 0.7.3",
+ "rustls-pemfile",
+ "rustls-pki-types",
+ "tokio-rustls",
+]
+
+[[package]]
+name = "pingora-timeout"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a853fee5ce510a7f5db2561f99c752724112ed13fc3820e70d462d278d704ea"
+dependencies = [
+ "once_cell",
+ "parking_lot",
+ "pin-project-lite",
+ "thread_local",
+ "tokio",
+]
+
+[[package]]
+name = "pkcs8"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
+dependencies = [
+ "der",
+ "spki",
+]
+
+[[package]]
+name = "pkg-config"
+version = "0.3.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
+
+[[package]]
+name = "plotters"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "plotters-backend"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
+
+[[package]]
+name = "plotters-svg"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
+dependencies = [
+ "plotters-backend",
+]
+
+[[package]]
+name = "portable-atomic"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
+
+[[package]]
+name = "potential_utf"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564"
+dependencies = [
+ "zerovec",
+]
+
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "proc-macro-error-attr2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
+name = "proc-macro-error2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802"
+dependencies = [
+ "proc-macro-error-attr2",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "proc-macro2-diagnostics"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "version_check",
+ "yansi",
+]
+
+[[package]]
+name = "prometheus"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+dependencies = [
+ "cfg-if",
+ "fnv",
+ "lazy_static",
+ "memchr",
+ "parking_lot",
+ "protobuf",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "protobuf"
+version = "2.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
+
+[[package]]
+name = "quinn"
+version = "0.11.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
+dependencies = [
+ "bytes",
+ "cfg_aliases",
+ "pin-project-lite",
+ "quinn-proto",
+ "quinn-udp",
+ "rustc-hash",
+ "rustls",
+ "socket2",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-proto"
+version = "0.11.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
+dependencies = [
+ "aws-lc-rs",
+ "bytes",
+ "getrandom 0.3.4",
+ "lru-slab",
+ "rand 0.9.4",
+ "ring",
+ "rustc-hash",
+ "rustls",
+ "rustls-pki-types",
+ "slab",
+ "thiserror 2.0.18",
+ "tinyvec",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-udp"
+version = "0.5.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
+dependencies = [
+ "cfg_aliases",
+ "libc",
+ "once_cell",
+ "socket2",
+ "tracing",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
+[[package]]
+name = "r-efi"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
+
+[[package]]
+name = "rand"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
+dependencies = [
+ "libc",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
+dependencies = [
+ "chacha20",
+ "getrandom 0.4.2",
+ "rand_core 0.10.1",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom 0.2.17",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom 0.3.4",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69"
+
+[[package]]
+name = "rayon"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "rcgen"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75e669e5202259b5314d1ea5397316ad400819437857b90861765f24c4cf80a2"
+dependencies = [
+ "pem",
+ "ring",
+ "rustls-pki-types",
+ "time",
+ "yasna",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
+dependencies = [
+ "bitflags 2.11.1",
+]
+
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-lite"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973"
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+
+[[package]]
+name = "reqwest"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "219c5811de6525e5416c7d5d53bb656d3afdbc6c5af816e0802bcfa42dbdc1c3"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-rustls",
+ "hyper-util",
+ "js-sys",
+ "log",
+ "percent-encoding",
+ "pin-project-lite",
+ "quinn",
+ "rustls",
+ "rustls-pki-types",
+ "rustls-platform-verifier",
+ "serde",
+ "serde_json",
+ "sync_wrapper",
+ "tokio",
+ "tokio-rustls",
+ "tower",
+ "tower-http",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+
+[[package]]
+name = "ring"
+version = "0.17.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom 0.2.17",
+ "libc",
+ "untrusted",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rmp"
+version = "0.8.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ba8be72d372b2c9b35542551678538b562e7cf86c3315773cae48dfbfe7790c"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "rmp-serde"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72f81bee8c8ef9b577d1681a70ebbc962c232461e397b22c208c43c04b67a155"
+dependencies = [
+ "rmp",
+ "serde",
+]
+
+[[package]]
+name = "rust_decimal"
+version = "1.42.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c5108e3d4d903e21aac27f12ba5377b6b34f9f44b325e4894c7924169d06995"
+dependencies = [
+ "arrayvec",
+ "num-traits",
+]
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d"
+
+[[package]]
+name = "rustc-hash"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
+
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
+[[package]]
+name = "rusticata-macros"
+version = "4.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "rustix"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
+dependencies = [
+ "bitflags 2.11.1",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustls"
+version = "0.23.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b"
+dependencies = [
+ "aws-lc-rs",
+ "log",
+ "once_cell",
+ "ring",
+ "rustls-pki-types",
+ "rustls-webpki 0.103.13",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-native-certs"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5"
+dependencies = [
+ "openssl-probe 0.1.6",
+ "rustls-pemfile",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework 2.11.1",
+]
+
+[[package]]
+name = "rustls-native-certs"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
+dependencies = [
+ "openssl-probe 0.2.1",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework 3.7.0",
+]
+
+[[package]]
+name = "rustls-pemfile"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "rustls-pki-types"
+version = "1.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9"
+dependencies = [
+ "web-time",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-platform-verifier"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26d1e2536ce4f35f4846aa13bff16bd0ff40157cdb14cc056c7b14ba41233ba0"
+dependencies = [
+ "core-foundation 0.10.1",
+ "core-foundation-sys",
+ "jni",
+ "log",
+ "once_cell",
+ "rustls",
+ "rustls-native-certs 0.8.3",
+ "rustls-platform-verifier-android",
+ "rustls-webpki 0.103.13",
+ "security-framework 3.7.0",
+ "security-framework-sys",
+ "webpki-root-certs",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustls-platform-verifier-android"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
+
+[[package]]
+name = "rustls-webpki"
+version = "0.102.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
+dependencies = [
+ "rustls-pki-types",
+ "untrusted",
+]
+
+[[package]]
+name = "rustls-webpki"
+version = "0.103.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
+dependencies = [
+ "aws-lc-rs",
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "ryu"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "schannel"
+version = "0.1.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
+[[package]]
+name = "security-framework"
+version = "2.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
+dependencies = [
+ "bitflags 2.11.1",
+ "core-foundation 0.9.4",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework"
+version = "3.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
+dependencies = [
+ "bitflags 2.11.1",
+ "core-foundation 0.10.1",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework-sys"
+version = "2.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.150"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "serde_nanos"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "serde_repr"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "serde_spanned"
+version = "0.6.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "serde_yaml"
+version = "0.9.34+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
+dependencies = [
+ "indexmap 2.14.0",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+
+[[package]]
+name = "sfv"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fa1f336066b758b7c9df34ed049c0e693a426afe2b27ff7d5b14f410ab1a132"
+dependencies = [
+ "base64",
+ "indexmap 2.14.0",
+ "rust_decimal",
+]
+
+[[package]]
+name = "sha2"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.2.17",
+ "digest",
+]
+
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "shlex"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba"
+
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b"
+dependencies = [
+ "errno",
+ "libc",
+]
+
+[[package]]
+name = "signatory"
+version = "0.27.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
+dependencies = [
+ "pkcs8",
+ "rand_core 0.6.4",
+ "signature",
+ "zeroize",
+]
+
+[[package]]
+name = "signature"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
+dependencies = [
+ "digest",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "simd-adler32"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
+
+[[package]]
+name = "simd_cesu8"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94f90157bb87cddf702797c5dadfa0be7d266cdf49e22da2fcaa32eff75b2c33"
+dependencies = [
+ "rustc_version",
+ "simdutf8",
+]
+
+[[package]]
+name = "simdutf8"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
+
+[[package]]
+name = "slab"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
+
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+
+[[package]]
+name = "socket2"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "spki"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d"
+dependencies = [
+ "base64ct",
+ "der",
+]
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "static_assertions"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "strum"
+version = "0.26.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
+dependencies = [
+ "strum_macros",
+]
+
+[[package]]
+name = "strum_macros"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
+dependencies = [
+ "heck 0.5.0",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "sync_wrapper"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+dependencies = [
+ "futures-core",
+]
+
+[[package]]
+name = "synstructure"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tempfile"
+version = "3.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
+dependencies = [
+ "fastrand",
+ "getrandom 0.4.2",
+ "once_cell",
+ "rustix",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "terminal_size"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
+dependencies = [
+ "rustix",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
+dependencies = [
+ "thiserror-impl 2.0.18",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "thread_local"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "thrift_codec"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83d957f535b242b91aa9f47bde08080f9a6fef276477e55b0079979d002759d5"
+dependencies = [
+ "byteorder",
+ "trackable",
+]
+
+[[package]]
+name = "time"
+version = "0.3.47"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
+dependencies = [
+ "deranged",
+ "itoa",
+ "num-conv",
+ "powerfmt",
+ "serde_core",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
+
+[[package]]
+name = "time-macros"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
+dependencies = [
+ "num-conv",
+ "time-core",
+]
+
+[[package]]
+name = "tinystr"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "tinyvec"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
+name = "tokio"
+version = "1.52.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe"
+dependencies = [
+ "bytes",
+ "libc",
+ "mio",
+ "parking_lot",
+ "pin-project-lite",
+ "signal-hook-registry",
+ "socket2",
+ "tokio-macros",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "tokio-macros"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tokio-rustls"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
+dependencies = [
+ "rustls",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-test"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f6d24790a10a7af737693a3e8f1d03faef7e6ca0cc99aae5066f533766de545"
+dependencies = [
+ "futures-core",
+ "tokio",
+ "tokio-stream",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-websockets"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "httparse",
+ "rand 0.8.6",
+ "ring",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tokio-util",
+ "webpki-roots 0.26.11",
+]
+
+[[package]]
+name = "toml"
+version = "0.8.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit",
+]
+
+[[package]]
+name = "toml_datetime"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "toml_edit"
+version = "0.22.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
+dependencies = [
+ "indexmap 2.14.0",
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_write",
+ "winnow",
+]
+
+[[package]]
+name = "toml_write"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
+
+[[package]]
+name = "tower"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-http"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840"
+dependencies = [
+ "bitflags 2.11.1",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "pin-project-lite",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "url",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
+dependencies = [
+ "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-serde"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1"
+dependencies = [
+ "serde",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319"
+dependencies = [
+ "matchers",
+ "nu-ansi-term",
+ "once_cell",
+ "regex-automata",
+ "serde",
+ "serde_json",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
+ "tracing-serde",
+]
+
+[[package]]
+name = "trackable"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15bd114abb99ef8cee977e517c8f37aee63f184f2d08e3e6ceca092373369ae"
+dependencies = [
+ "trackable_derive",
+]
+
+[[package]]
+name = "trackable_derive"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebeb235c5847e2f82cfe0f07eb971d1e5f6804b18dac2ae16349cc604380f82f"
+dependencies = [
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "try-lock"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+
+[[package]]
+name = "tryhard"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
+dependencies = [
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "typenum"
+version = "1.20.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20"
+
+[[package]]
+name = "uncased"
+version = "0.9.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697"
+dependencies = [
+ "version_check",
+]
+
+[[package]]
+name = "unicase"
+version = "2.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
+[[package]]
+name = "unsafe-libyaml"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
+
+[[package]]
+name = "untrusted"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+
+[[package]]
+name = "url"
+version = "2.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+]
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "valuable"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
+
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "want"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
+dependencies = [
+ "try-lock",
+]
+
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+
+[[package]]
+name = "wasip2"
+version = "1.0.3+wasi-0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
+dependencies = [
+ "wit-bindgen 0.57.1",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+dependencies = [
+ "wit-bindgen 0.51.0",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.122"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.72"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9473dbd2991ae90b6291c3c32c30c6187ac49aa32f9905d1cce280ec1e110b0f"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.122"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.122"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.122"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap 2.14.0",
+ "wasm-encoder",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags 2.11.1",
+ "hashbrown 0.15.5",
+ "indexmap 2.14.0",
+ "semver",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.99"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d621441cfc37b84979402712047321980c178f299193a3589d05b99e8763436"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "webpki-root-certs"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "0.26.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
+dependencies = [
+ "webpki-roots 1.0.7",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "winapi-util"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "winnow"
+version = "0.7.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "wit-bindgen"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+dependencies = [
+ "wit-bindgen-rust-macro",
+]
+
+[[package]]
+name = "wit-bindgen"
+version = "0.57.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
+
+[[package]]
+name = "wit-bindgen-core"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck 0.5.0",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck 0.5.0",
+ "indexmap 2.14.0",
+ "prettyplease",
+ "syn 2.0.117",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags 2.11.1",
+ "indexmap 2.14.0",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
+dependencies = [
+ "anyhow",
+ "id-arena",
+ "indexmap 2.14.0",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
+]
+
+[[package]]
+name = "writeable"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
+
+[[package]]
+name = "x509-parser"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69"
+dependencies = [
+ "asn1-rs",
+ "data-encoding",
+ "der-parser",
+ "lazy_static",
+ "nom",
+ "oid-registry",
+ "rusticata-macros",
+ "thiserror 1.0.69",
+ "time",
+]
+
+[[package]]
+name = "yansi"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
+
+[[package]]
+name = "yasna"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd"
+dependencies = [
+ "time",
+]
+
+[[package]]
+name = "yoke"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca"
+dependencies = [
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "synstructure",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "zerofrom"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "synstructure",
+]
+
+[[package]]
+name = "zeroize"
+version = "1.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+
+[[package]]
+name = "zerotrie"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+]
+
+[[package]]
+name = "zerovec"
+version = "0.11.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239"
+dependencies = [
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+
+[[package]]
+name = "zerovec-derive"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
+
+[[package]]
+name = "zstd"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "7.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
+dependencies = [
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.16+zstd.1.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
+dependencies = [
+ "cc",
+ "pkg-config",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..c73b00c
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,97 @@
+[package]
+name = "beyond-ai"
+version = "0.1.0"
+edition = "2024"
+license = "MIT"
+rust-version = "1.85"
+description = "Beyond AI gateway — egress L7 proxy to LLM providers"
+
+[lib]
+name = "beyond_ai"
+path = "src/lib.rs"
+
+[[bin]]
+name = "beyond-ai"
+path = "src/main.rs"
+
+# `[lints]` binds every target in the crate — lib, bin, tests, benches. That matters: a crate-level
+# `#![deny(...)]` attribute only covers the unit it's written in, so the binary root (`main.rs`)
+# would otherwise escape the library's denies. Declaring them here closes that gap structurally.
+[lints.rust]
+unsafe_code = "forbid"
+unused_must_use = "deny"
+
+# Panic surface: a stray `.unwrap()`/`.expect()`/`panic!`/`todo!` in a request path is a worker
+# crash, not an error response. Deny them so a new one is a hard CI failure (mise `check:rs` runs
+# clippy with `-D warnings`). These are clippy *restriction* lints (allow-by-default); naming them
+# here turns them on. The handful of genuine boot-time invariants carry a local
+# `#[allow(clippy::expect_used)]` with a SAFETY-style note; test/bench targets allow them wholesale
+# at the file head (asserting a precondition with `.unwrap()` is the point of a test).
+[lints.clippy]
+unwrap_used = "deny"
+expect_used = "deny"
+panic = "deny"
+todo = "deny"
+unimplemented = "deny"
+
+# Release builds wrap arithmetic silently by default; turn that into a panic so an overflow on a
+# size/count never goes unnoticed. Negligible cost for a proxy (arithmetic isn't the bottleneck).
+[profile.release]
+overflow-checks = true
+
+[dependencies]
+# slipstream is published — consume it from crates.io, aliased to `store` so the code's
+# `use store::...` is unchanged. No path deps into the `beyond` repo: this crate builds standalone.
+store = { package = "beyond-slipstream", version = "0.1.0" }
+
+pingora = { version = "0.8", default-features = false, features = ["rustls"] }
+pingora-core = "0.8"
+pingora-limits = "0.8"
+pingora-proxy = "0.8"
+
+arc-swap = "1"
+arrayvec = "0.7"
+async-trait = "0.1"
+base64 = "0.22"
+bytes = "1"
+clap = { version = "4", features = ["derive", "env"] }
+ed25519-dalek = "2.2"
+figment = { version = "0.10", features = ["toml", "env"] }
+getrandom = "0.3"
+# The types Pingora's `ServeHttp` trait speaks (`Response<Vec<u8>>`); pin to the same 1.x already in
+# the tree via Pingora so the admin app can name them directly.
+http = "1"
+memchr = "2"
+prometheus = "0.13"
+rustls = { version = "0.23", default-features = false, features = ["ring"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+thiserror = "2"
+tokio = { version = "1", features = ["full"] }
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
+zeroize = "1"
+
+[dev-dependencies]
+# Bench harnesses. Best tool per job: `divan` for the unit micro-bench (it measures allocations
+# natively via AllocProfiler, alongside timing); `criterion` for the e2e macro-bench (`async_tokio`
+# drives the round-trips, and its saved-baseline comparison tracks latency/RPS over time).
+criterion = { version = "0.5", features = ["async_tokio"] }
+divan = "0.1"
+http-body-util = "0.1"
+# `http2` + hyper-util's `server-auto` let the mock upstream serve H1 *and* H2 on one TLS listener
+# (protocol chosen by ALPN), so the concurrency bench can drive the gateway's H2 path. `rcgen` mints a
+# throwaway self-signed cert for that listener; `tokio-rustls` terminates TLS in front of hyper.
+hyper = { version = "1", features = ["server", "http1", "http2"] }
+hyper-util = { version = "0.1", features = ["tokio", "server-auto"] }
+rcgen = "0.13"
+reqwest = { version = "0.13", default-features = false, features = ["json", "rustls"] }
+tokio-rustls = "0.26"
+
+[[bench]]
+name = "unit"
+harness = false
+
+[[bench]]
+name = "e2e"
+harness = false
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e852c19
--- /dev/null
+++ b/README.md
@@ -0,0 +1,84 @@
+# beyond/ai
+
+Route LLM traffic through one internal proxy. Apps use their stock OpenAI or Anthropic SDK unchanged — the gateway authenticates, swaps in the real provider key, and meters every token.
+
+## Quick Start
+
+```sh
+cp config.example.toml config.toml
+# Set at minimum: signing_keys and one pool key
+AI_POOL_KEY_OPENAI=sk-... cargo run --release
+```
+
+Point any OpenAI-wire SDK at `http://ai.internal` with a virtual key:
+
+```python
+from openai import OpenAI
+client = OpenAI(base_url="http://ai.internal/v1", api_key="bai_v1.1.<payload>.<sig>")
+```
+
+Or pass your own provider key directly (BYO — forwarded unchanged, no swap):
+
+```python
+client = OpenAI(base_url="http://ai.internal/v1", api_key="sk-your-openai-key")
+```
+
+## What It Does
+
+- **Managed keys** (`bai_v1…`) — Ed25519-verified, stateless. Swaps to the pool key. Attributes usage to tenant + VPC. Deny-set checked (spend/fraud).
+- **BYO keys** — any other token passes through to the provider untouched. No key-swap, no deny-set, no attribution, no `ai.usage` billing event (aggregate throughput metrics still count it).
+- **10 providers, zero config** — openai, anthropic, openrouter, fireworks, groq, deepseek, together, cerebras, mistral, xai. Add more in `config.toml` under `[provider_authorities]`.
+- **Never buffers** — request and response stream through; a SIMD scanner extracts `model` in O(1) memory, and a 64 KB tail buffer taps usage without holding the body.
+- **Token facts, not pricing** — emits `ai.usage` token-count events as structured logs (stdout → logfwd/OTLP → ClickHouse). A closed downstream consumer prices; slipstream carries only the deny-set.
+- **Rate guardrail** — per-key request ceiling (`rate_limit_rps`). Circuit breaker against runaway keys. Deny-set owns spend control.
+- **Fail-open NATS** — auth works without NATS. A NATS outage only leaves the deny-set stale; existing allows stay allowed.
+
+## Providers
+
+The provider is the **first path segment** of the base URL — no header, nothing tool-specific. Bare
+`/v1` defaults to OpenAI (and `/v1/messages` to Anthropic), so the two big providers are a host-only
+swap; everything else is `/{provider}/…` using that provider's own path (forwarded verbatim).
+
+```python
+# OpenAI (default) — change only the host
+client = OpenAI(base_url="http://ai.internal/v1", api_key="bai_v1...")
+
+# Groq — its native base path is /openai/v1, so the gateway path is /groq/openai/v1
+client = OpenAI(base_url="http://ai.internal/groq/openai/v1", api_key="bai_v1...")
+
+# Fireworks mounts at /inference/v1 → /fireworks/inference/v1; OpenRouter at /api/v1 → /openrouter/api/v1
+```
+
+An unknown first segment is a 404. See `route::KNOWN_PROVIDERS` for each provider's native base path.
+
+## Config
+
+All config keys are overridable by `AI_`-prefixed env vars (`AI_NATS_URL`, `AI_POOL_KEY_OPENAI`, …). See `config.example.toml` for the full reference.
+
+Required to serve managed traffic:
+
+| Key                  | Source        | Purpose                                                 |
+| -------------------- | ------------- | ------------------------------------------------------- |
+| `signing_keys`       | `config.toml` | Ed25519 public keys by `kid` — verifies `bai_v1` tokens |
+| `AI_POOL_KEY_<NAME>` | env (SSM)     | Provider key swapped in for managed requests            |
+
+Optional:
+
+| Key                      | Default   | Purpose                                                                  |
+| ------------------------ | --------- | ------------------------------------------------------------------------ |
+| `snapshot_path`          | unset     | On-disk deny-set snapshot — set on durable nodes, leave unset on Fargate |
+| `rate_limit_rps`         | `100`     | Per-key request ceiling; `0` disables                                    |
+| `[provider_authorities]` | built-ins | Override or add upstream hosts                                           |
+
+## Running Tests
+
+```sh
+mise run test:unit:rs        # pure-logic unit tests (no network)
+mise run test:integration:rs # gateway + mock upstream + NATS
+mise run test:smoke          # live providers — needs API keys in env, bills real (tiny) requests
+mise run bench               # unit micro-benchmarks + end-to-end throughput
+```
+
+## Architecture
+
+[ARCHITECTURE.md](ARCHITECTURE.md) — request flow, module map, key invariants.
diff --git a/benches/e2e.rs b/benches/e2e.rs
new file mode 100644
index 0000000..4e39b43
--- /dev/null
+++ b/benches/e2e.rs
@@ -0,0 +1,360 @@
+// Bench target: `.unwrap()`/`.expect()` set up the harness; not production code. See tests/e2e.rs.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
+//! A-1 end-to-end bench: the real `beyond-ai` binary + real `nats-server` + a mock upstream,
+//! driven over real HTTP. Run with `mise run bench:e2e` (needs `nats-server` on PATH — mise
+//! provides it). This is the macro counterpart to `unit.rs`: it measures the *whole* request path
+//! (TCP accept → Pingora filters → key verify → key swap → body stream → upstream → usage tap),
+//! not a single function.
+//!
+//! Reuses the e2e test harness (`tests/common`) verbatim so the bench and the integration tests
+//! exercise the same stack. Allocations are deliberately *not* measured here — the gateway is a
+//! separate process, so its heap is invisible to this binary; allocation regressions belong to the
+//! in-process `unit` bench.
+//!
+//! The stack starts **once** and stays warm for the whole run; each iteration is one (or, for the
+//! throughput group, N concurrent) HTTP round-trip(s) against that live gateway.
+
+#[path = "../tests/common/mod.rs"]
+mod common;
+
+use std::time::Duration;
+
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use tokio::runtime::Runtime;
+use tokio::task::JoinSet;
+
+use beyond_ai::key::{VirtualKey, mint};
+use common::*;
+
+const MANAGED_BODY: &str = r#"{"model":"gpt-4o","messages":[{"role":"user","content":"hi"}]}"#;
+
+/// A plausible BYO provider token (anything not starting with `bai_` is BYO — passed through
+/// unchanged, no verify/deny/swap). The mock upstream accepts any token.
+const BYO_KEY: &str = "sk-byo-provider-token-1234567890";
+
+/// Concurrency level for the throughput group — enough in-flight requests to expose per-request
+/// overhead and connection-pool behavior without saturating a laptop.
+const CONCURRENCY: u64 = 32;
+
+/// A live, warmed-up stack: every piece is owned here so nothing is torn down mid-bench. Fields
+/// drop in declaration order, so the gateway goes first, then the mock upstream, then NATS.
+struct Stack {
+    // RAII guards: held only so their `Drop` (kill subprocess / abort task / clean tempdir) fires
+    // when the bench ends. Never read directly — the requests go through `url`/`client`.
+    #[allow(dead_code)]
+    gw: Gateway,
+    #[allow(dead_code)]
+    mock: MockUpstream,
+    #[allow(dead_code)]
+    nats: Nats,
+    client: reqwest::Client, // shared HTTP client; the benches clone it per request/task
+    vkey: String,            // managed virtual key minted in `start_stack_with`
+    url: String,             // gateway base URL, as returned by `Gateway::url()`
+}
+
+async fn start_stack() -> Stack {
+    start_stack_with(Mode::Json).await // default stack: mock upstream started in `Mode::Json`
+}
+
+async fn start_stack_with(mode: Mode) -> Stack {
+    let nats = Nats::start().await;
+    let (verifying_key, signing_key) = test_keypair(1);
+    let upstream = MockUpstream::start(mode).await;
+    let gateway = Gateway::start(nats.port, &upstream.authority(), &b64(&verifying_key)).await;
+    let token = mint(
+        &VirtualKey {
+            tenant_id: 42,
+            vpc_id: 7,
+        },
+        1,
+        &signing_key,
+    );
+    let http = reqwest::Client::new();
+    let base_url = gateway.url();
+
+    // Send real managed requests until one comes back 200, so the NATS watcher connection and the
+    // first-call DNS cache fill happen here instead of inside the timed loop.
+    {
+        let (c, u, k) = (http.clone(), base_url.clone(), token.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                // A failed send is reported as status 0, which never equals the awaited 200.
+                c.post(format!("{u}/v1/chat/completions"))
+                    .header("authorization", format!("Bearer {k}"))
+                    .header("content-type", "application/json")
+                    .body(MANAGED_BODY)
+                    .send()
+                    .await
+                    .map_or(0, |resp| resp.status().as_u16())
+            }
+        })
+        .await;
+    }
+
+    Stack {
+        gw: gateway,
+        mock: upstream,
+        nats,
+        client: http,
+        vkey: token,
+        url: base_url,
+    }
+}
+
+/// One full managed round-trip: key swap + body relay + non-streaming usage tap. The body is
+/// drained so the connection returns to reqwest's pool (else each iteration would bench `connect`).
+/// The status is a hard `assert_eq!`: `cargo bench` compiles `debug_assert!` out by default.
+async fn managed_roundtrip(s: &Stack) {
+    let resp = s
+        .client
+        .post(format!("{}/v1/chat/completions", s.url))
+        .header("authorization", format!("Bearer {}", s.vkey))
+        .header("content-type", "application/json")
+        .body(MANAGED_BODY)
+        .send()
+        .await
+        .expect("request");
+    assert_eq!(resp.status().as_u16(), 200, "managed request was not proxied");
+    let _ = resp.bytes().await.expect("body");
+}
+
+/// One **BYO** round-trip: a non-`bai_` token passed straight through (no key verify, deny-set
+/// check, or key swap). Hard-asserts 200, since `cargo bench` compiles `debug_assert!` out.
+async fn byo_roundtrip(s: &Stack) {
+    let resp = s
+        .client
+        .post(format!("{}/v1/chat/completions", s.url))
+        .header("authorization", format!("Bearer {BYO_KEY}"))
+        .header("content-type", "application/json")
+        .body(MANAGED_BODY)
+        .send()
+        .await
+        .expect("request");
+    assert_eq!(resp.status().as_u16(), 200, "BYO request was not proxied");
+    let _ = resp.bytes().await.expect("body");
+}
+
+/// One **rejected** request: no API key ⇒ 401, short-circuited in `request_filter` **before** any
+/// upstream connection. Benched to prove a flood of rejects costs far less than a proxied request
+/// (a reject must not consume upstream connections); the gap to `managed_json_latency` is the
+/// reject headroom. Hard-asserts the 401, since `cargo bench` compiles `debug_assert!` out.
+async fn reject_roundtrip(s: &Stack) {
+    let resp = s
+        .client
+        .post(format!("{}/v1/chat/completions", s.url))
+        .header("content-type", "application/json")
+        .body(MANAGED_BODY)
+        .send()
+        .await
+        .expect("request");
+    assert_eq!(resp.status().as_u16(), 401, "keyless request was not rejected");
+    let _ = resp.bytes().await.expect("body");
+}
+
+fn bench_e2e(c: &mut Criterion) {
+    let runtime = Runtime::new().expect("tokio runtime");
+    let json_stack = runtime.block_on(start_stack());
+    // The SSE twin: its mock streams the response, which is what actually drives the response-tap
+    // hot path (tail buffer + compaction) — a single-shot JSON body barely touches it.
+    let sse_stack = runtime.block_on(start_stack_with(Mode::Sse));
+
+    let mut grp = c.benchmark_group("e2e");
+    // Loopback round-trips are sub-millisecond, yet still ~100× a micro-bench iteration; a smaller
+    // sample count keeps the whole suite in seconds rather than minutes.
+    grp.sample_size(50);
+    grp.measurement_time(Duration::from_secs(10));
+
+    // Single-request latency across four paths: managed (verify + deny + key swap), BYO (pure
+    // passthrough), SSE relay (streaming response tap), and the reject fast-path (401, no
+    // upstream). Read side by side, the four numbers show where the time goes.
+    grp.bench_function("managed_json_latency", |bencher| {
+        bencher.to_async(&runtime).iter(|| managed_roundtrip(&json_stack));
+    });
+    grp.bench_function("byo_json_latency", |bencher| {
+        bencher.to_async(&runtime).iter(|| byo_roundtrip(&json_stack));
+    });
+    grp.bench_function("managed_sse_latency", |bencher| {
+        bencher.to_async(&runtime).iter(|| managed_roundtrip(&sse_stack));
+    });
+    grp.bench_function("reject_missing_key_latency", |bencher| {
+        bencher.to_async(&runtime).iter(|| reject_roundtrip(&json_stack));
+    });
+
+    // Throughput: each iteration keeps CONCURRENCY requests in flight at once; with
+    // `Throughput::Elements` criterion reports the result as requests/sec.
+    grp.throughput(Throughput::Elements(CONCURRENCY));
+    grp.bench_function("managed_json_throughput", |bencher| {
+        bencher.to_async(&runtime).iter(|| async {
+            let mut in_flight = JoinSet::new();
+            for _ in 0..CONCURRENCY {
+                let client = json_stack.client.clone();
+                let url = json_stack.url.clone();
+                let vkey = json_stack.vkey.clone();
+                in_flight.spawn(async move {
+                    let response = client
+                        .post(format!("{url}/v1/chat/completions"))
+                        .header("authorization", format!("Bearer {vkey}"))
+                        .header("content-type", "application/json")
+                        .body(MANAGED_BODY)
+                        .send()
+                        .await
+                        .expect("request");
+                    let _ = response.bytes().await.expect("body");
+                });
+            }
+            while let Some(joined) = in_flight.join_next().await {
+                joined.expect("task");
+            }
+        });
+    });
+
+    grp.finish();
+
+    // Both stacks must outlive every bench above; tear them down explicitly only now.
+    drop(json_stack);
+    drop(sse_stack);
+}
+
+/// Concurrency levels swept by `bench_concurrency`. Spans below and above hyper's default
+/// `SETTINGS_MAX_CONCURRENT_STREAMS` (200) so an H2 stream-concurrency cliff (if any) shows up against
+/// H1's connection pool.
+const SWEEP: &[u64] = &[1, 8, 32, 128, 512];
+
+/// One bench iteration: launch `conc` concurrent managed requests against `url`, draining every
+/// body so its connection goes back to the pool. Paired with `Throughput::Elements(conc)` ⇒ req/s.
+async fn drive(client: &reqwest::Client, url: &str, vkey: &str, conc: u64) {
+    let mut tasks = JoinSet::new();
+    for _ in 0..conc {
+        let (c, u, k) = (client.clone(), url.to_string(), vkey.to_string());
+        tasks.spawn(async move {
+            let response = c
+                .post(format!("{u}/v1/chat/completions"))
+                .header("authorization", format!("Bearer {k}"))
+                .header("content-type", "application/json")
+                .body(MANAGED_BODY)
+                .send()
+                .await
+                .expect("request");
+            let _ = response.bytes().await.expect("body");
+        });
+    }
+    while let Some(joined) = tasks.join_next().await {
+        joined.expect("task");
+    }
+}
+
+/// Warm a gateway until it answers 200, then report which protocol it spoke to the upstream, taken
+/// from the `x-mock-proto` header the TLS mock stamps and the gateway relays ("unknown" if absent).
+/// This is the evidence that the "h2"/"h1" bench labels match what was actually negotiated.
+async fn warm_and_proto(client: &reqwest::Client, url: &str, vkey: &str) -> String {
+    {
+        let (c, u, k) = (client.clone(), url.to_string(), vkey.to_string());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                // A failed send is reported as status 0, which never equals the awaited 200.
+                c.post(format!("{u}/v1/chat/completions"))
+                    .header("authorization", format!("Bearer {k}"))
+                    .header("content-type", "application/json")
+                    .body(MANAGED_BODY)
+                    .send()
+                    .await
+                    .map_or(0, |resp| resp.status().as_u16())
+            }
+        })
+        .await;
+    }
+    let probe = client
+        .post(format!("{url}/v1/chat/completions"))
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(MANAGED_BODY)
+        .send()
+        .await
+        .expect("warm request");
+    let proto = probe
+        .headers()
+        .get("x-mock-proto")
+        .and_then(|value| value.to_str().ok());
+    proto.unwrap_or("unknown").to_string()
+}
+
+/// H2-vs-H1 to the upstream, under concurrency. One TLS+H2 mock; two gateways against it — one with
+/// `upstream_http2 = true` (ALPN H2H1 → h2), one `false` (ALPN H1). Same client→gateway transport
+/// (plain H1) for both, so the only variable is the gateway→upstream protocol. The sweep exposes
+/// whether H2 multiplexing wins or hits its stream-concurrency cap vs H1's connection pool.
+fn bench_concurrency(c: &mut Criterion) {
+    let rt = Runtime::new().expect("tokio runtime");
+    let nats = rt.block_on(Nats::start());
+    let mock = rt.block_on(MockUpstream::start_tls(Mode::Json));
+    let (pubkey, sk) = test_keypair(1);
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 42,
+            vpc_id: 7,
+        },
+        1,
+        &sk,
+    );
+
+    // Two gateways at the same self-signed TLS mock; ALPN is the only difference. Rate limits OFF
+    // (both tiers): the sweep drives one credential well past the 100 rps default, and a rate-limited
+    // 429 short-circuits *before* the upstream — it would measure the reject path, not H2-vs-H1.
+    let gw_h2 = rt.block_on(
+        Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+            .tls_upstream()
+            .upstream_http2(true)
+            .rate_limit_rps(0)
+            .byo_rate_limit_rps(0)
+            .start(),
+    );
+    let gw_h1 = rt.block_on(
+        Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+            .tls_upstream()
+            .upstream_http2(false)
+            .rate_limit_rps(0)
+            .byo_rate_limit_rps(0)
+            .start(),
+    );
+    let client = reqwest::Client::new(); // one client shared by both sweeps
+    let (url_h2, url_h1) = (gw_h2.url(), gw_h1.url());
+
+    // Prove the gateways actually negotiated what we asked for before trusting the labels; a
+    // mismatch aborts the bench here rather than producing mislabelled numbers.
+    let proto_h2 = rt.block_on(warm_and_proto(&client, &url_h2, &vkey));
+    let proto_h1 = rt.block_on(warm_and_proto(&client, &url_h1, &vkey));
+    assert_eq!(
+        proto_h2, "h2",
+        "upstream_http2=true should negotiate h2 to the mock"
+    );
+    assert_eq!(
+        proto_h1, "http/1.1",
+        "upstream_http2=false should stay http/1.1 to the mock"
+    );
+    eprintln!("e2e_concurrency: confirmed gw_h2→upstream=h2, gw_h1→upstream=http/1.1");
+
+    let mut group = c.benchmark_group("e2e_concurrency");
+    group.sample_size(10); // fewer samples than the `e2e` group: one iteration is up to 512 requests
+    group.measurement_time(Duration::from_secs(6));
+    for &conc in SWEEP {
+        group.throughput(Throughput::Elements(conc)); // report req/s at this concurrency level
+        group.bench_with_input(BenchmarkId::new("h2", conc), &conc, |b, &conc| {
+            b.to_async(&rt)
+                .iter(|| drive(&client, &url_h2, &vkey, conc));
+        });
+        group.bench_with_input(BenchmarkId::new("h1", conc), &conc, |b, &conc| {
+            b.to_async(&rt)
+                .iter(|| drive(&client, &url_h1, &vkey, conc));
+        });
+    }
+    group.finish();
+
+    drop(gw_h2); // explicit teardown order: both gateways, then the mock upstream, then NATS
+    drop(gw_h1);
+    drop(mock);
+    drop(nats);
+}
+
+criterion_group!(benches, bench_e2e, bench_concurrency);
+criterion_main!(benches);
diff --git a/benches/unit.rs b/benches/unit.rs
new file mode 100644
index 0000000..e35a195
--- /dev/null
+++ b/benches/unit.rs
@@ -0,0 +1,232 @@
+// Bench target: `.unwrap()`/`.expect()` set up fixtures; not production code. See tests/e2e.rs.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
+//! Unit bench: the pure, IO-free hot paths. Timing **and** allocations come from `divan` — its
+//! `AllocProfiler` (installed as the global allocator below) reports alloc count + bytes per
+//! sample right beside ns/iter, so the design's allocation claims are visible in one table.
+//! Run with `mise run bench:unit` (or `cargo bench --bench unit`).
+//!
+//! The headline invariant to watch: managed-key **verify** is 0 allocs — it decodes onto the
+//! stack (see `key.rs`). `peek` should hold a flat, tiny alloc count independent of body size
+//! (the O(1)-memory claim). A regression shows up as a non-zero / grown number in the alloc
+//! columns the moment this runs.
+//!
+//! Fixtures are built *outside* the closure handed to `Bencher::bench` (or in `args`), so only the
+//! measured call is timed and counted — setup allocations don't pollute the numbers.
+
+use std::hint::black_box;
+
+use divan::Bencher;
+use divan::counter::BytesCount;
+
+#[global_allocator]
+static ALLOC: divan::AllocProfiler = divan::AllocProfiler::system();
+
+fn main() {
+    divan::main(); // hands control to divan, which runs the `#[divan::bench]` functions below
+}
+
+mod key {
+    use super::*;
+    use beyond_ai::key::{Keyring, VirtualKey, mint};
+    use ed25519_dalek::SigningKey;
+
+    const IDENTITY: VirtualKey = VirtualKey {
+        tenant_id: 42,
+        vpc_id: 7,
+    };
+
+    /// Stateless verify of a minted token — must stay off the heap (stack-only decode + sig check).
+    #[divan::bench]
+    fn verify(bencher: Bencher) {
+        let signer = SigningKey::from_bytes(&[1u8; 32]);
+        let mut keyring = Keyring::new();
+        keyring.insert(1, signer.verifying_key());
+        let minted = mint(&IDENTITY, 1, &signer);
+        bencher.bench(|| keyring.verify(black_box(&minted)));
+    }
+
+    /// Reference mint path (it allocates: output string + base64 segments). Tracked so the Go
+    /// control-plane parity implementation has a baseline to compare against.
+    #[divan::bench]
+    fn mint_key(bencher: Bencher) {
+        let signer = SigningKey::from_bytes(&[1u8; 32]);
+        bencher.bench(|| mint(black_box(&IDENTITY), 1, &signer));
+    }
+}
+
+mod route {
+    use super::*;
+    use beyond_ai::route::{Dialect, dialect_default};
+
+    /// Dialect → default provider name: the per-request routing decision (sans override). 0-alloc.
+    #[divan::bench(args = [Dialect::OpenAI, Dialect::Anthropic])]
+    fn dialect_default_name(bencher: Bencher, dialect: Dialect) {
+        bencher.bench(|| dialect_default(black_box(dialect)));
+    }
+}
+
+mod deny {
+    use super::*;
+    use beyond_ai::deny::{self, DenyReason, DenySet};
+
+    // --- ingest path: turning a watched NATS key/value into a set entry (not per-request work) ---
+
+    #[divan::bench]
+    fn parse_key() -> Option<u64> {
+        deny::parse_key(black_box("blackhole.123456789"))
+    }
+
+    #[divan::bench]
+    fn parse_reason_bare() -> DenyReason {
+        deny::parse_reason(black_box(b"spend"))
+    }
+
+    #[divan::bench]
+    fn parse_reason_json() -> DenyReason {
+        deny::parse_reason(black_box(br#"{"reason":"fraud","exp":123}"#))
+    }
+
+    // --- request hot path: the lookup performed on EVERY managed request (`proxy::request_filter`) ---
+
+    /// A deny-set of `n` cut-off tenants with ids `0..n`. Callers build it outside the timed closure.
+    fn populated(n: u64) -> DenySet {
+        (0..n).map(|tenant| (tenant, DenyReason::Spend)).collect()
+    }
+
+    /// Default-allow, the common case: the tenant is **absent** from the set. The invariant under
+    /// watch is O(1) and **0-alloc regardless of set size**, hence the args: an empty set and a
+    /// large one (1M cut-off tenants). ns/iter and the (absent) alloc columns must stay flat; any
+    /// size-dependent regression shows as the big-`n` row pulling away from the small one.
+    #[divan::bench(args = [0, 1_000_000])]
+    fn reason_miss(bencher: Bencher, n: u64) {
+        let denied = populated(n);
+        // Ids stop at `n - 1`, so `n + 1` can never be present → always the allow path.
+        bencher.bench(|| denied.reason(black_box(n + 1)));
+    }
+
+    /// The deny case: the tenant is present. Same O(1) hash lookup, now returning the reason —
+    /// shows the enforce path costs what the allow path does (no surprise when rejecting).
+    #[divan::bench(args = [1, 1_000_000])]
+    fn reason_hit(bencher: Bencher, n: u64) {
+        let denied = populated(n);
+        bencher.bench(|| denied.reason(black_box(n / 2)));
+    }
+}
+
+mod ratelimit {
+    use super::*;
+    use beyond_ai::ratelimit::RateLimit;
+
+    /// The guardrail charged on **every request before verify** (`proxy::request_filter`), managed
+    /// flavour: a seeded hash of the raw credential plus the per-credential sketch `observe` (the BYO
+    /// global tier is skipped). Memory is fixed whatever the key cardinality ⇒ flat and low-alloc.
+    #[divan::bench]
+    fn check_managed(bencher: Bencher) {
+        let limiter = RateLimit::new(1_000_000, 1_000_000).expect("enabled");
+        let credential = "bai_v1.1.AAAAAAAAAAAAAAAAAAAAAA.signature-base64url-payload-here";
+        bencher.bench(|| limiter.check(black_box(credential), black_box(true)));
+    }
+
+    /// BYO flavour with a longer provider token of realistic length: both tiers run (global BYO
+    /// bucket + per-credential sketch), i.e. the full per-request BYO cost.
+    #[divan::bench]
+    fn check_byo(bencher: Bencher) {
+        let limiter = RateLimit::new(1_000_000, 1_000_000).expect("enabled");
+        let byo_token = "sk-some-byo-provider-token-of-realistic-length-abcdef0123456789";
+        bencher.bench(|| limiter.check(black_box(byo_token), black_box(false)));
+    }
+}
+
+mod usage {
+    use super::*;
+    use beyond_ai::usage::{self, Usage};
+
+    const OPENAI_JSON: &[u8] = br#"{"usage":{"prompt_tokens":12,"completion_tokens":34,"prompt_tokens_details":{"cached_tokens":4}}}"#;
+    const ANTHROPIC_JSON: &[u8] = br#"{"usage":{"input_tokens":100,"output_tokens":50,"cache_read_input_tokens":10,"cache_creation_input_tokens":7}}"#;
+    const OPENAI_SSE: &[u8] = b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\ndata: {\"choices\":[],\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":9}}\n\ndata: [DONE]\n\n";
+    const ANTHROPIC_SSE: &[u8] = b"event: message_start\ndata: {\"type\":\"message_start\",\"message\":{\"usage\":{\"input_tokens\":20,\"output_tokens\":0}}}\n\nevent: message_delta\ndata: {\"type\":\"message_delta\",\"usage\":{\"output_tokens\":15}}\n\n";
+
+    #[divan::bench]
+    fn openai_body() -> Option<Usage> {
+        usage::openai_body(black_box(OPENAI_JSON))
+    }
+
+    #[divan::bench]
+    fn anthropic_body() -> Option<Usage> {
+        usage::anthropic_body(black_box(ANTHROPIC_JSON))
+    }
+
+    #[divan::bench]
+    fn openai_stream() -> Option<Usage> {
+        usage::openai_stream(black_box(OPENAI_SSE))
+    }
+
+    #[divan::bench]
+    fn anthropic_stream() -> Option<Usage> {
+        usage::anthropic_stream(black_box(ANTHROPIC_SSE))
+    }
+}
+
+mod peek {
+    use super::*;
+    use beyond_ai::peek::{ModelScanner, plan_stream_usage_injection};
+
+    /// Chat body carrying `padding` bytes of message content, with the root `model` field placed
+    /// **last** — the scanner has to walk the entire body (worst case for the streaming scan).
+    fn body_with_model_last(padding: usize) -> Vec<u8> {
+        let content = "x".repeat(padding);
+        format!(r#"{{"messages":[{{"role":"user","content":"{content}"}}],"stream":true,"model":"claude-opus-4-8"}}"#)
+            .into_bytes()
+    }
+
+    /// Three sizes: a tiny request, a typical prompt, and a large one (think pasted document or
+    /// base64 image) that leans on the SIMD fast-skip over uninteresting string content. With the
+    /// `BytesCount` counter divan reports bytes/sec; alloc columns should be flat across sizes.
+    #[divan::bench(args = [0, 4 * 1024, 256 * 1024])]
+    fn scan_model_last(bencher: Bencher, padding: usize) {
+        let payload = body_with_model_last(padding);
+        bencher.counter(BytesCount::of_slice(&payload)).bench(|| {
+            let mut scan = ModelScanner::new();
+            scan.feed(black_box(&payload));
+            scan.take_model()
+        });
+    }
+
+    // --- stream-usage injection planner ---
+
+    /// Streaming body whose large `content` value sits before the root `stream` field: the
+    /// planner's worst case, since `padding` bytes of uninteresting string content (the SIMD
+    /// fast-skip target) must be passed before a decision is possible.
+    fn streaming_body(padding: usize) -> Vec<u8> {
+        let content = "x".repeat(padding);
+        format!(r#"{{"messages":[{{"role":"user","content":"{content}"}}],"model":"gpt-4o","stream":true}}"#)
+            .into_bytes()
+    }
+
+    /// Non-streaming body (no `stream` field), the common case. The planner has to prove absence,
+    /// which today is a full structural walk — the case the `memmem` pre-filter short-circuits.
+    fn non_streaming_body(padding: usize) -> Vec<u8> {
+        let content = "x".repeat(padding);
+        format!(r#"{{"messages":[{{"role":"user","content":"{content}"}}],"model":"gpt-4o"}}"#)
+            .into_bytes()
+    }
+
+    /// Planner on a **streaming** body: `stream` is only found after the big content value.
+    #[divan::bench(args = [0, 4 * 1024, 256 * 1024])]
+    fn plan_inject_streaming(bencher: Bencher, padding: usize) {
+        let payload = streaming_body(padding);
+        bencher
+            .counter(BytesCount::of_slice(&payload))
+            .bench(|| plan_stream_usage_injection(black_box(&payload)));
+    }
+
+    /// Planner on a **non-streaming** body: there is no `stream` key (the majority case).
+    #[divan::bench(args = [0, 4 * 1024, 256 * 1024])]
+    fn plan_inject_non_streaming(bencher: Bencher, padding: usize) {
+        let payload = non_streaming_body(padding);
+        bencher
+            .counter(BytesCount::of_slice(&payload))
+            .bench(|| plan_stream_usage_injection(black_box(&payload)));
+    }
+}
diff --git a/config.example.toml b/config.example.toml
new file mode 100644
index 0000000..ed7f7af
--- /dev/null
+++ b/config.example.toml
@@ -0,0 +1,89 @@
+# Beyond AI gateway — example config. Every key is overridable by an `AI_`-prefixed env var
+# (e.g. `AI_NATS_URL`, `AI_POOL_KEY_OPENAI`, `AI_READ_TIMEOUT_SECS`). Values below are defaults.
+
+listen = "0.0.0.0:8080" # client (app) traffic; internal-only, fronted as ai.internal
+metrics_listen = "0.0.0.0:9090" # internal admin: /metrics (Prometheus), /livez, /readyz
+
+# NATS / slipstream — carries ONLY the deny-set (`blackhole.*`). Auth + keys come from this file,
+# so the gateway authenticates + serves managed traffic even if NATS is down.
+nats_url = "nats://localhost:4222"
+# nats_creds = "<base64 .creds>"        # ECS via SOPS
+# nats_creds_file = "/path/to/nats.creds"
+config_bucket = "ai-gateway"
+
+# Optional on-disk deny-set snapshot (slipstream append-log + resume cursor). Set this ONLY on
+# durable storage (edge/tunnel nodes): a restart then seeds the deny-set from disk and resumes the
+# NATS watch from the saved revision — enforcing immediately, even before NATS reconnects, and
+# skipping the boot scan. Leave unset on ephemeral hosts (e.g. Fargate); the gateway seeds from a
+# NATS scan each boot. It's a pure cache: deleting the file just forces a rescan.
+# snapshot_path = "/var/lib/beyond-ai/denyset.snap"
+
+# Upstream timeouts. read/idle are generous: LLM streams can run for minutes.
+connect_timeout_secs = 10
+read_timeout_secs = 600
+write_timeout_secs = 60
+idle_timeout_secs = 90
+
+# Graceful shutdown. On SIGTERM, in-flight requests drain for up to grace_period_secs before the
+# runtimes are torn down (then runtime_timeout_secs is the final teardown backstop).
+# Default = read_timeout_secs so a deploy NEVER truncates an in-flight stream — the gateway is a
+# transparent proxy and must not mangle a paid-for generation (a half-delivered SSE can't be cleanly
+# retried). Pingora stops accepting new connections at SIGTERM, so this only waits out the longest
+# existing stream, not new work; slower rollouts are the deliberate price.
+# The orchestrator must grant the same window or it caps us (it SIGKILLs at its own stop timeout):
+# set k8s terminationGracePeriodSeconds (or the EC2 agent's ECS_CONTAINER_STOP_TIMEOUT) to match.
+# NOTE: ECS Fargate caps stopTimeout at 120s — there, streams past 120s are still cut (a Fargate limit).
+shutdown_grace_period_secs = 600
+shutdown_runtime_timeout_secs = 10
+
+# upstream_tls = true   # set false only for a plaintext mock (tests)
+
+# Per-credential request-rate ceiling (requests/sec) — a blast-radius circuit breaker, not a spend
+# control (the deny-set owns spend). Caps how fast one credential (managed virtual key ≈ a tenant+app,
+# or a BYO token) can drive the gateway, bounding a leaked/runaway key during the deny-set's reaction
+# lag and a failure flood that never bills. Generous by default so legitimate traffic never trips it;
+# set 0 to disable. Tune from the `ai_rejections_total{reason="rate_limit"}` metric.
+rate_limit_rps = 100
+
+# Per-provider circuit breaker. Trips when `circuit_breaker_threshold` upstream FAILURES occur within
+# `circuit_breaker_window_secs`; while open, requests to that provider fast-fail with 503
+# (ai_rejections_total{reason="circuit_open"}) instead of piling up against read_timeout_secs and
+# exhausting connection/in-flight slots for every provider. After `circuit_breaker_reset_secs` a probe
+# is allowed — success closes it, failure reopens it. A FAILURE is a 5xx response or a connect failure
+# (the provider is broken); a 429 is NOT a failure (a healthy provider throttling our pool key — the
+# rate limiter and the client's Retry-After own that). Applies to all traffic (managed + BYO). Set
+# threshold 0 to disable. Defaults are generous so normal background 5xx noise never trips it.
+circuit_breaker_threshold = 20
+circuit_breaker_window_secs = 10
+circuit_breaker_reset_secs = 30
+
+# Aggregate request-rate ceiling (requests/sec) for ALL BYO traffic combined — one shared bucket.
+# BYO is unverified and upstream-bound: a flood of *distinct* random BYO tokens slips past the
+# per-credential ceiling and would open junk-auth connections to providers from our egress IPs,
+# getting them rate-limited or banned. This bounds that aggregate regardless of token variation.
+# Managed traffic is EXEMPT (verified before any upstream connect, can't be forged), so this bucket
+# never sheds core tenant load. Generous by default; set 0 to disable. Tune from the
+# `ai_rejections_total{reason="rate_limit_byo_global"}` metric.
+byo_rate_limit_rps = 1000
+
+# Optional per-provider upstream authority (host:port), BY PROVIDER NAME. For a known provider this
+# overrides its built-in default; for an unknown name it ADDS a new OpenAI-wire provider, then
+# reachable at `/{name}/…` (the provider is the first path segment of the request). Known providers
+# (zero-config defaults): openai, anthropic, openrouter, fireworks, groq, deepseek, together,
+# cerebras, mistral, xai.
+# [provider_authorities]
+# openai = "api.openai.com:443"
+# my-self-hosted = "llm.internal:8443"
+
+# Managed Beyond pool keys, BY PROVIDER NAME. Inject via SSM-backed env in production
+# (AI_POOL_KEY_OPENAI, AI_POOL_KEY_GROQ, …) rather than this file; env wins over any value here.
+# A provider with no pool key can't serve managed traffic (→ 503); BYO is unaffected.
+# [pool_keys]
+# openai = "sk-..."
+# anthropic = "sk-ant-..."
+# fireworks = "fw-..."
+
+# Trusted Ed25519 signing PUBLIC keys (kid -> base64). Multiple for zero-downtime rotation.
+# Managed virtual keys (bai_…) are verified against these; BYO raw tokens skip verification.
+[signing_keys]
+# 1 = "<base64-ed25519-public-key>"
diff --git a/mise.toml b/mise.toml
new file mode 100644
index 0000000..be93405
--- /dev/null
+++ b/mise.toml
@@ -0,0 +1,52 @@
+[tools]
+dprint = "latest"
+rust = { version = "1.92", components = "rustfmt,clippy", targets = "aarch64-unknown-linux-gnu,x86_64-unknown-linux-gnu" }
+yamlfmt = "latest"
+cargo-binstall = "latest"
+"cargo:cross" = "latest"
+# nats-server for the e2e harness (real JetStream KV backing slipstream).
+"ubi:nats-io/nats-server" = { version = "latest", exe = "nats-server" }
+
+[tasks."build:rs"]
+run = "cargo build"
+
+[tasks."build:rs:release"]
+run = "cargo build --release"
+
+[tasks."check:rs"]
+run = "cargo clippy --all-targets -- -D warnings"
+
+[tasks."check:fmt"]
+run = "dprint check"
+
+[tasks."format"]
+run = "dprint fmt && cargo fmt"
+
+[tasks."test:unit:rs"]
+description = "Unit tests for the pure-logic modules (key/route/peek/usage/deny/config/resolver)."
+run = "cargo test --lib"
+
+# Integration tests (gateway driven against a mock upstream + NATS) — to be added; see ARCHITECTURE.md.
+[tasks."test:integration:rs"]
+description = "End-to-end gateway tests against a mock provider + docker-compose NATS (TODO)."
+run = "cargo test --test '*'"
+
+[tasks."test:smoke"]
+description = "Live smoke tests against REAL providers. Auto-loads .env if present; set the API keys you have (ANTHROPIC_API_KEY, OPENAI_API_KEY, GROQ_API_KEY, …) there or in the environment. Each test skips if its key is unset. Bills real (tiny, max_tokens-capped) requests."
+run = """
+# Auto-load .env if it exists (export every assignment), so the keys reach the test process.
+if [ -f .env ]; then set -a; . ./.env; set +a; fi
+cargo test -p beyond-ai --test smoke -- --ignored --nocapture
+"""
+
+[tasks."bench:unit"]
+description = "divan micro-benchmarks of the IO-free hot paths (key/peek/usage/route/deny): timing + native allocation counts."
+run = "cargo bench --bench unit"
+
+[tasks."bench:e2e"]
+description = "A-1 end-to-end bench: real beyond-ai binary + nats-server + mock upstream over HTTP."
+run = "cargo bench --bench e2e"
+
+[tasks."bench"]
+description = "Run both bench harnesses (unit micro + e2e macro)."
+depends = ["bench:unit", "bench:e2e"]
diff --git a/src/admin.rs b/src/admin.rs
new file mode 100644
index 0000000..8b52d3d
--- /dev/null
+++ b/src/admin.rs
@@ -0,0 +1,96 @@
+//! Admin / observability HTTP surface served on the metrics listener: `/livez`, `/readyz`,
+//! `/metrics`.
+//!
+//! Matches the Beyond service convention (cf. `auth`, `objects`): the body is `{"status",
+//! "version"}` and there are two probes. **Both always return HTTP 200** once the process is
+//! answering, because the gateway is **fail-open by design** — auth + key swap come from boot
+//! config, and a NATS outage degrades only the (stale) deny-set, never the ability to serve. So
+//! readiness must *not* gate on NATS: a cold boot with NATS down can still serve correctly, and a
+//! non-200 would pull a healthy gateway out of the load balancer for no reason.
+//!
+//! `readyz` does, however, carry a distinct *body* signal that `livez` doesn't: when the deny-set
+//! watcher is disconnected from NATS, `readyz` reports `"status":"degraded"` (still 200). This lets
+//! an operator alert on "readyz has been degraded for >N minutes" — the spend/fraud enforcement is
+//! stale — without ever risking an LB eviction. `livez` is pure liveness: 200/`"ok"` whenever the
+//! process can answer. (The `ai_nats_connected` gauge is the same signal in Prometheus; the body
+//! flag is for orchestrators that probe HTTP but don't scrape.)
+//!
+//! Implemented as a Pingora `ServeHttp` app so all three paths share the one (internal) metrics
+//! port — Pingora's built-in prometheus service only serves `/metrics`, so we hand-route all three.
+
+use crate::metrics::Metrics;
+use async_trait::async_trait;
+use http::Response;
+use pingora_core::apps::http_app::ServeHttp;
+use pingora_core::protocols::http::ServerSession;
+use prometheus::{Encoder, TextEncoder};
+use std::sync::Arc;
+
+/// Compile-time service version, surfaced in every health body (matches the sibling services).
+const VERSION: &str = env!("CARGO_PKG_VERSION");
+
+/// Admin app for the metrics listener; serves `/livez`, `/readyz` and `/metrics`.
+pub struct AdminApp {
+    /// Metric gauges; `/readyz` reflects NATS connectivity in its body, never in its status.
+    pub metrics: Arc<Metrics>,
+}
+
+impl AdminApp {
+    /// Build a `{"status","version"}` JSON health response with HTTP code `status`. `health` is
+    /// the body's `"status"` value (`"ok"`/`"degraded"`/…), readable without parsing the code.
+    /// All header values are static or integer, so the builder can't fail — hence `expect`.
+    #[allow(clippy::expect_used)] // builder inputs are all static/integer; cannot fail
+    fn health(status: u16, health: &str) -> Response<Vec<u8>> {
+        let payload = serde_json::json!({ "status": health, "version": VERSION });
+        let bytes = payload.to_string().into_bytes();
+        let length = bytes.len();
+        Response::builder()
+            .status(status)
+            .header(http::header::CONTENT_TYPE, "application/json")
+            .header(http::header::CONTENT_LENGTH, length)
+            .body(bytes)
+            .expect("static health response is always valid")
+    }
+
+    /// Render the default Prometheus registry as text (same output as Pingora's built-in service).
+    #[allow(clippy::expect_used)] // builder inputs are encoder-derived/integer; cannot fail
+    fn metrics() -> Response<Vec<u8>> {
+        // Start at 8 KiB: the gateway's fixed metric set renders to a few KiB of text, so one
+        // up-front allocation spares the reallocs an empty `Vec` would incur while growing.
+        let mut text = Vec::with_capacity(8 * 1024);
+        let encoder = TextEncoder::new();
+        let families = prometheus::gather();
+        // The encode result is dropped on purpose (the crate denies `unused_must_use`): the
+        // writer is a `Vec`, which never fails.
+        let _ = encoder.encode(&families, &mut text);
+        Response::builder()
+            .status(200)
+            .header(http::header::CONTENT_TYPE, encoder.format_type())
+            .header(http::header::CONTENT_LENGTH, text.len())
+            .body(text)
+            .expect("metrics response is always valid")
+    }
+}
+
+#[async_trait]
+impl ServeHttp for AdminApp {
+    /// Route one admin request by exact path. The three known paths (`/livez`, `/readyz`,
+    /// `/metrics`) always answer 200; any other path gets a 404 carrying the same JSON health
+    /// shape.
+    async fn response(&self, session: &mut ServerSession) -> Response<Vec<u8>> {
+        match session.req_header().uri.path() {
+            // Pure liveness: being able to answer at all is the signal.
+            "/livez" => Self::health(200, "ok"),
+            // Readiness is 200 in both arms (fail-open — never pull a serving gateway from the
+            // LB). Only the body differs: `ok` while the `nats_connected` gauge reads 1,
+            // `degraded` otherwise, so an operator can alert on stale spend/fraud enforcement
+            // without an eviction.
+            "/readyz" if self.metrics.nats_connected.get() == 1 => Self::health(200, "ok"),
+            "/readyz" => Self::health(200, "degraded"),
+            // Prometheus text exposition of the default registry.
+            "/metrics" => Self::metrics(),
+            // Anything else is not an admin endpoint.
+            _ => Self::health(404, "not_found"),
+        }
+    }
+}
diff --git a/src/circuit_breaker.rs b/src/circuit_breaker.rs
new file mode 100644
index 0000000..af8c1e1
--- /dev/null
+++ b/src/circuit_breaker.rs
@@ -0,0 +1,935 @@
+//! Lock-free circuit breaker for protecting external service calls.
+//!
+//! The breaker stays lock-free by relying on:
+//! 1. One packed atomic word for the breaker state (plus a best-effort window-start atomic)
+//! 2. Compare-and-swap loops for all state transitions
+//! 3. Wrapping, second-resolution wall-clock timestamps (`SystemTime`) for timeout detection
+//!
+//! # States
+//!
+//! ```text
+//!                 failure_threshold reached
+//!     ┌─────────┐ ──────────────────────────► ┌────────┐
+//!     │ Closed  │                             │  Open  │
+//!     └─────────┘ ◄────────────────────────── └────────┘
+//!          ▲        success in half-open           │
+//!          │                                       │ reset_timeout elapsed
+//!          │        ┌─────────────┐                │
+//!          └─────── │  Half-Open  │ ◄──────────────┘
+//!            success└─────────────┘
+//!                         │
+//!                         │ failure
+//!                         ▼
+//!                    back to Open
+//! ```
+//!
+//! # Failure Policies
+//!
+//! Two failure detection policies are supported:
+//!
+//! - **Consecutive**: Opens after N failures in a row. Any success resets the count.
+//!   Good for detecting complete backend failures.
+//!
+//! - **Windowed**: Opens after N failures within a time window. Failures outside
+//!   the window are forgotten. Good for detecting degraded backends with partial failures.
+//!
+//! # Example
+//!
+//! ```rust
+//! use beyond_ai::circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, FailurePolicy};
+//! use std::time::Duration;
+//!
+//! // Consecutive failures (default)
+//! let cb = CircuitBreaker::new(CircuitBreakerConfig::default());
+//!
+//! // Windowed failures (better for edge proxies)
+//! let cb = CircuitBreaker::new(
+//!     CircuitBreakerConfig::windowed(3, Duration::from_secs(10))
+//!         .reset_timeout(Duration::from_secs(30))
+//! );
+//!
+//! // Before calling external service
+//! if cb.allow().is_err() {
+//!     // return Err("service temporarily unavailable");
+//! }
+//!
+//! // match call_external_service().await {
+//! //     Ok(result) => {
+//! //         cb.record_success();
+//! //         Ok(result)
+//! //     }
+//! //     Err(e) if is_connectivity_error(&e) => {
+//! //         cb.record_failure();
+//! //         Err(e)
+//! //     }
+//! //     Err(e) => Err(e), // Don't count business logic errors
+//! // }
+//! ```
+
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::Duration;
+
+/// How failures are counted before opening the circuit.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum FailurePolicy {
+    /// N consecutive failures opens the circuit. Any success resets the count.
+    Consecutive {
+        /// Number of consecutive failures before opening.
+        threshold: u32,
+    },
+    /// N failures within the window opens the circuit.
+    /// Failures older than the window are forgotten.
+    Windowed {
+        /// Number of failures within the window before opening.
+        threshold: u32,
+        /// Time window for counting failures.
+        window: Duration,
+    },
+}
+
+impl Default for FailurePolicy {
+    fn default() -> Self {
+        FailurePolicy::Consecutive { threshold: 5 }
+    }
+}
+
+/// Circuit breaker configuration.
+#[derive(Debug, Clone)]
+pub struct CircuitBreakerConfig {
+    /// How failures are counted.
+    pub failure_policy: FailurePolicy,
+    /// Time to wait in open state before transitioning to half-open.
+    pub reset_timeout: Duration,
+    /// Number of probe requests allowed in half-open state.
+    pub half_open_permits: u32,
+}
+
+impl Default for CircuitBreakerConfig {
+    fn default() -> Self {
+        Self {
+            failure_policy: FailurePolicy::default(),
+            reset_timeout: Duration::from_secs(30),
+            half_open_permits: 3,
+        }
+    }
+}
+
+impl CircuitBreakerConfig {
+    /// Config that opens after `threshold` consecutive failures; other fields use defaults.
+    pub fn consecutive(threshold: u32) -> Self {
+        Self {
+            failure_policy: FailurePolicy::Consecutive { threshold },
+            ..Self::default()
+        }
+    }
+
+    /// Config that opens after `threshold` failures within `window`; other fields use defaults.
+    pub fn windowed(threshold: u32, window: Duration) -> Self {
+        Self {
+            failure_policy: FailurePolicy::Windowed { threshold, window },
+            ..Self::default()
+        }
+    }
+
+    /// Builder-style setter for how long the circuit stays open before going half-open.
+    pub fn reset_timeout(mut self, timeout: Duration) -> Self {
+        self.reset_timeout = timeout;
+        self
+    }
+
+    /// Builder-style setter for the number of probe requests allowed while half-open.
+    pub fn half_open_permits(mut self, permits: u32) -> Self {
+        self.half_open_permits = permits;
+        self
+    }
+
+    /// Failure threshold of whichever policy is configured.
+    #[allow(dead_code)] // NOTE(review): appears to have no non-test caller — confirm before removing
+    fn threshold(&self) -> u32 {
+        match &self.failure_policy {
+            FailurePolicy::Consecutive { threshold }
+            | FailurePolicy::Windowed { threshold, .. } => *threshold,
+        }
+    }
+}
+
+/// Lock-free circuit breaker.
+///
+/// All state is packed into a single 64-bit atomic:
+/// - Bits 62-63: State (0=closed, 1=open, 2=half-open)
+/// - Bits 48-61: Failure count (14 bits, max 16383)
+/// - Bits 32-47: Half-open permits remaining (16 bits)
+/// - Bits 0-31: Timestamp of last state change (seconds since epoch, wraps every 136 years)
+///
+/// For windowed mode, a second atomic tracks the window start timestamp.
+///
+/// This packing ensures all state transitions are atomic via single CAS operations.
+pub struct CircuitBreaker {
+    /// Packed state word.
+    state: AtomicU64,
+    /// Window start timestamp (only used in windowed mode).
+    /// Stores seconds since epoch when the first failure in the current window occurred.
+    /// 0 means no active window.
+    window_start: AtomicU64,
+    /// Configuration (immutable after construction).
+    config: CircuitBreakerConfig,
+    /// Clock function for getting current time in seconds.
+    clock: fn() -> u64,
+}
+
+impl std::fmt::Debug for CircuitBreaker {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CircuitBreaker")
+            .field("state", &self.state)
+            .field("window_start", &self.window_start)
+            .field("config", &self.config)
+            .finish_non_exhaustive()
+    }
+}
+
+// State encoding constants
+const STATE_CLOSED: u64 = 0;
+const STATE_OPEN: u64 = 1;
+const STATE_HALF_OPEN: u64 = 2;
+
+const STATE_SHIFT: u32 = 62;
+const STATE_MASK: u64 = 0b11;
+
+const FAILURE_SHIFT: u32 = 48;
+const FAILURE_MASK: u64 = 0x3FFF; // 14 bits
+
+const PERMIT_SHIFT: u32 = 32;
+const PERMIT_MASK: u64 = 0xFFFF; // 16 bits
+
+const TIMESTAMP_MASK: u64 = 0xFFFF_FFFF; // 32 bits
+
+impl CircuitBreaker {
+    /// Create a new circuit breaker with the given configuration.
+    pub fn new(config: CircuitBreakerConfig) -> Self {
+        Self::with_clock(config, Self::system_clock)
+    }
+
+    /// Create a circuit breaker that reads time from `clock` (for testing); it starts closed.
+    pub fn with_clock(config: CircuitBreakerConfig, clock: fn() -> u64) -> Self {
+        let closed_now = Self::pack(STATE_CLOSED, 0, 0, clock());
+        Self {
+            config,
+            clock,
+            state: AtomicU64::new(closed_now),
+            window_start: AtomicU64::new(0),
+        }
+    }
+
+    /// Wall-clock seconds since the Unix epoch, reduced to 32 bits; 0 if the clock predates it.
+    #[inline]
+    fn system_clock() -> u64 {
+        use std::time::{SystemTime, UNIX_EPOCH};
+        SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .map_or(0, |elapsed| elapsed.as_secs() & TIMESTAMP_MASK)
+    }
+
+    /// Get current time from the configured clock.
+    #[inline]
+    fn now_secs(&self) -> u64 {
+        (self.clock)()
+    }
+
+    /// Pack state components into a single u64, masking each one to its field width.
+    #[inline]
+    fn pack(state: u64, failures: u64, permits: u64, timestamp: u64) -> u64 {
+        ((state & STATE_MASK) << STATE_SHIFT)
+            | ((failures & FAILURE_MASK) << FAILURE_SHIFT)
+            | ((permits & PERMIT_MASK) << PERMIT_SHIFT)
+            | (timestamp & TIMESTAMP_MASK)
+    }
+
+    /// Unpack a u64 into `(state, failures, permits, timestamp)`.
+    #[inline]
+    fn unpack(packed: u64) -> (u64, u64, u64, u64) {
+        let state = (packed >> STATE_SHIFT) & STATE_MASK;
+        let failures = (packed >> FAILURE_SHIFT) & FAILURE_MASK;
+        let permits = (packed >> PERMIT_SHIFT) & PERMIT_MASK;
+        let timestamp = packed & TIMESTAMP_MASK;
+        (state, failures, permits, timestamp)
+    }
+
+    /// Check if a request should be allowed through the circuit.
+    ///
+    /// Returns `Ok(())` if the request is allowed, `Err(CircuitOpen)` if the circuit is open (or
+    /// half-open with no probe permits left) and the request should be rejected.
+    ///
+    /// The open → half-open transition happens here, lazily, once `reset_timeout` has elapsed.
+    /// In half-open state each admitted request atomically consumes one probe permit.
+    pub fn allow(&self) -> Result<(), CircuitOpen> {
+        loop {
+            let packed = self.state.load(Ordering::Acquire);
+            let (state, failures, permits, timestamp) = Self::unpack(packed);
+
+            match state {
+                STATE_CLOSED => return Ok(()),
+
+                STATE_OPEN => {
+                    // Timestamps are stored in 32 bits, so subtract modulo 2^32 as well; otherwise
+                    // a wrapped (or wider-than-32-bit custom) clock yields a bogus, huge `elapsed`.
+                    let now = self.now_secs() & TIMESTAMP_MASK;
+                    let elapsed = now.wrapping_sub(timestamp) & TIMESTAMP_MASK;
+                    if elapsed < self.config.reset_timeout.as_secs() {
+                        return Err(CircuitOpen);
+                    }
+
+                    // Timeout elapsed: try to move to half-open. Saturate the permits at the
+                    // 16-bit field's maximum — `pack` masks them, so 65_536 would be stored as 0.
+                    let probes = u64::from(self.config.half_open_permits).min(PERMIT_MASK);
+                    let new_packed = Self::pack(STATE_HALF_OPEN, 0, probes, now);
+
+                    // Won or lost the race, re-read and go around again: even the winner still
+                    // has to claim a permit through the half-open arm.
+                    let _ = self.state.compare_exchange_weak(
+                        packed,
+                        new_packed,
+                        Ordering::AcqRel,
+                        Ordering::Acquire,
+                    );
+                    continue;
+                }
+
+                STATE_HALF_OPEN => {
+                    if permits == 0 {
+                        return Err(CircuitOpen);
+                    }
+
+                    // Try to claim a permit
+                    let new_packed = Self::pack(STATE_HALF_OPEN, failures, permits - 1, timestamp);
+
+                    match self.state.compare_exchange_weak(
+                        packed,
+                        new_packed,
+                        Ordering::AcqRel,
+                        Ordering::Acquire,
+                    ) {
+                        Ok(_) => return Ok(()),
+                        Err(_) => continue, // CAS failed, retry
+                    }
+                }
+
+                _ => {
+                    // Invalid state, reset to closed
+                    let new_packed = Self::pack(STATE_CLOSED, 0, 0, self.now_secs());
+                    let _ = self.state.compare_exchange(
+                        packed,
+                        new_packed,
+                        Ordering::AcqRel,
+                        Ordering::Acquire,
+                    );
+                    return Ok(());
+                }
+            }
+        }
+    }
+
+    /// Record a successful request.
+    ///
+    /// Closed and half-open both end up closed with a zeroed failure counter: for closed that
+    /// is a counter reset, for half-open it is the probe succeeding and the circuit closing.
+    ///
+    /// A success reported while the circuit is open leaves the packed state untouched. The
+    /// windowed-mode window start is cleared unconditionally, before the state is inspected.
+    pub fn record_success(&self) {
+        // Reset window start for windowed mode
+        self.window_start.store(0, Ordering::Release);
+
+        loop {
+            let packed = self.state.load(Ordering::Acquire);
+            let (state, _, _, _) = Self::unpack(packed);
+
+            // Open (or an invalid encoding): there is nothing to update.
+            if !matches!(state, STATE_CLOSED | STATE_HALF_OPEN) {
+                return;
+            }
+
+            // Both remaining states move to "closed, zero failures", stamped with the current
+            // time.
+            let new_packed = Self::pack(STATE_CLOSED, 0, 0, self.now_secs());
+
+            let swapped = self.state.compare_exchange_weak(
+                packed,
+                new_packed,
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            );
+            if swapped.is_ok() {
+                return;
+            }
+            // CAS lost (a concurrent update or a spurious failure): reload and retry.
+        }
+    }
+
+    /// Record a failed request.
+    ///
+    /// Dispatches on the configured [`FailurePolicy`]. In either mode a failure in closed
+    /// state increments the failure counter and opens the circuit if the threshold is
+    /// reached, and a failure in half-open state reopens the circuit immediately.
+    ///
+    /// Windowed mode receives its window length in whole seconds.
+    pub fn record_failure(&self) {
+        match self.config.failure_policy {
+            FailurePolicy::Consecutive { threshold } => self.record_failure_consecutive(threshold),
+            FailurePolicy::Windowed { threshold, window } => {
+                self.record_failure_windowed(threshold, window.as_secs());
+            }
+        }
+    }
+
+    /// Record failure with consecutive failure tracking.
+    ///
+    /// The count lives in a 14-bit field, so `threshold` is capped at `FAILURE_MASK`: a larger
+    /// value could never be reached (the packed count wraps to 0) and the circuit never opens.
+    fn record_failure_consecutive(&self, threshold: u32) {
+        let threshold = u64::from(threshold).min(FAILURE_MASK);
+        loop {
+            let packed = self.state.load(Ordering::Acquire);
+            let (state, failures, _, _) = Self::unpack(packed);
+            let now = self.now_secs();
+
+            let new_packed = match state {
+                // Closed: count this failure; open once the threshold is reached.
+                STATE_CLOSED if failures + 1 >= threshold => Self::pack(STATE_OPEN, 0, 0, now),
+                STATE_CLOSED => Self::pack(STATE_CLOSED, failures + 1, 0, now),
+                // A failed probe sends the circuit straight back to open.
+                STATE_HALF_OPEN => Self::pack(STATE_OPEN, 0, 0, now),
+                // Already open (or an invalid encoding): nothing to record.
+                _ => return,
+            };
+
+            let swapped = self.state.compare_exchange_weak(
+                packed,
+                new_packed,
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            );
+            if swapped.is_ok() {
+                return;
+            }
+        }
+    }
+
+    /// Record failure with windowed failure tracking.
+    ///
+    /// `window_start` is kept best-effort beside the packed state. The count lives in a 14-bit
+    /// field, so `threshold` is capped at `FAILURE_MASK`; a larger value could never be reached.
+    fn record_failure_windowed(&self, threshold: u32, window_secs: u64) {
+        let threshold = u64::from(threshold).min(FAILURE_MASK);
+        let now = self.now_secs();
+
+        // Handle window timing: a new window starts on the first failure (`window_start == 0`)
+        // or once the current one has expired; otherwise keep counting within it.
+        let window_start = self.window_start.load(Ordering::Acquire);
+        let window_over = window_start == 0 || now.wrapping_sub(window_start) >= window_secs;
+        let (new_window_start, reset_count) = if window_over {
+            (now, true)
+        } else {
+            (window_start, false)
+        };
+
+        // Update window start if needed (best-effort, races are acceptable)
+        if new_window_start != window_start {
+            let _ = self.window_start.compare_exchange(
+                window_start,
+                new_window_start,
+                Ordering::Release,
+                Ordering::Relaxed,
+            );
+        }
+
+        // Now update the main state
+        loop {
+            let packed = self.state.load(Ordering::Acquire);
+            let (state, failures, _, _) = Self::unpack(packed);
+
+            let new_packed = match state {
+                STATE_CLOSED => {
+                    let new_failures = if reset_count { 1 } else { failures + 1 };
+                    if new_failures >= threshold {
+                        // Reset window when opening circuit
+                        self.window_start.store(0, Ordering::Release);
+                        Self::pack(STATE_OPEN, 0, 0, now)
+                    } else {
+                        Self::pack(STATE_CLOSED, new_failures, 0, now)
+                    }
+                }
+                STATE_HALF_OPEN => {
+                    self.window_start.store(0, Ordering::Release);
+                    Self::pack(STATE_OPEN, 0, 0, now)
+                }
+                _ => return, // already open (or an invalid encoding): nothing to record
+            };
+
+            match self.state.compare_exchange_weak(
+                packed,
+                new_packed,
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            ) {
+                Ok(_) => return,
+                Err(_) => continue,
+            }
+        }
+    }
+
+    /// Snapshot the current circuit state for observability. An unrecognised state encoding is
+    /// reported as closed with zero failures.
+    pub fn state(&self) -> CircuitState {
+        let (state, failures, permits, _) = Self::unpack(self.state.load(Ordering::Acquire));
+
+        match state {
+            STATE_OPEN => CircuitState::Open,
+            STATE_HALF_OPEN => CircuitState::HalfOpen {
+                permits_remaining: permits as u32,
+            },
+            STATE_CLOSED => CircuitState::Closed {
+                failure_count: failures as u32,
+            },
+            _ => CircuitState::Closed { failure_count: 0 },
+        }
+    }
+
+    /// Reset the circuit breaker to closed state.
+    pub fn reset(&self) {
+        self.window_start.store(0, Ordering::Release);
+        let packed = Self::pack(STATE_CLOSED, 0, 0, self.now_secs());
+        self.state.store(packed, Ordering::Release);
+    }
+
+    /// Force the circuit to a specific state (for testing/admin).
+    #[cfg(test)]
+    pub fn force_state(&self, new_state: CircuitState) {
+        let now = self.now_secs();
+        let packed = match new_state {
+            CircuitState::Closed { failure_count } => {
+                Self::pack(STATE_CLOSED, u64::from(failure_count), 0, now)
+            }
+            CircuitState::Open => Self::pack(STATE_OPEN, 0, 0, now),
+            CircuitState::HalfOpen { permits_remaining } => {
+                Self::pack(STATE_HALF_OPEN, 0, u64::from(permits_remaining), now)
+            }
+        };
+        self.state.store(packed, Ordering::Release);
+    }
+}
+
+/// Error returned when the circuit is open.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct CircuitOpen;
+
+impl std::fmt::Display for CircuitOpen {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("circuit breaker is open")
+    }
+}
+
+impl std::error::Error for CircuitOpen {} // default methods only: no `source`
+
+/// Point-in-time view of a breaker, as returned by `CircuitBreaker::state`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CircuitState {
+    /// Closed: requests flow through normally while failures are tallied.
+    Closed {
+        /// Failures counted toward the threshold; back to 0 after a success or `reset`.
+        failure_count: u32,
+    },
+    /// Open: requests are rejected immediately until the reset timeout has elapsed.
+    Open,
+    /// Half-open: a limited number of probe requests is let through.
+    HalfOpen {
+        /// Probe permits not yet handed out; at 0 further requests are rejected.
+        permits_remaining: u32,
+    },
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+    use std::thread;
+
+    // =========================================================================
+    // Consecutive mode tests
+    // =========================================================================
+
+    #[test]
+    fn test_initial_state_is_closed() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::default());
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+    }
+
+    #[test]
+    fn test_allow_when_closed() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::default());
+        assert!(cb.allow().is_ok());
+    }
+
+    #[test]
+    fn test_consecutive_failures_increment() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::consecutive(5));
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 1 });
+
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 3 });
+    }
+
+    #[test]
+    fn test_consecutive_success_resets_failures() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::consecutive(5));
+
+        cb.record_failure();
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 3 });
+
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+    }
+
+    #[test]
+    fn test_consecutive_opens_at_threshold() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::consecutive(3));
+
+        cb.record_failure();
+        cb.record_failure();
+        assert!(matches!(cb.state(), CircuitState::Closed { .. }));
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    #[test]
+    fn test_rejects_when_open() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1).reset_timeout(Duration::from_secs(3600)),
+        );
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+        assert!(cb.allow().is_err());
+    }
+
+    #[test]
+    fn test_half_open_after_timeout() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1)
+                .reset_timeout(Duration::from_millis(1))
+                .half_open_permits(2),
+        );
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+        // Outlast the 1ms reset timeout so the next `allow` may move the breaker to half-open.
+        thread::sleep(Duration::from_millis(10));
+
+        assert!(cb.allow().is_ok());
+        assert!(matches!(cb.state(), CircuitState::HalfOpen { .. }));
+    }
+
+    #[test]
+    fn test_half_open_permits_decrement() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1)
+                .reset_timeout(Duration::from_millis(1))
+                .half_open_permits(3),
+        );
+
+        cb.record_failure();
+        thread::sleep(Duration::from_millis(10));
+
+        assert!(cb.allow().is_ok());
+        assert_eq!(
+            cb.state(),
+            CircuitState::HalfOpen {
+                permits_remaining: 2
+            }
+        );
+
+        assert!(cb.allow().is_ok());
+        assert_eq!(
+            cb.state(),
+            CircuitState::HalfOpen {
+                permits_remaining: 1
+            }
+        );
+
+        assert!(cb.allow().is_ok());
+        assert_eq!(
+            cb.state(),
+            CircuitState::HalfOpen {
+                permits_remaining: 0
+            }
+        );
+        // All three permits are spent, so the fourth probe is refused.
+        assert!(cb.allow().is_err());
+    }
+
+    #[test]
+    fn test_half_open_success_closes() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1)
+                .reset_timeout(Duration::from_millis(1))
+                .half_open_permits(3),
+        );
+
+        cb.record_failure();
+        thread::sleep(Duration::from_millis(10));
+        let _ = cb.allow();
+
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+    }
+
+    #[test]
+    fn test_half_open_failure_reopens() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::consecutive(1)
+                .reset_timeout(Duration::from_millis(1))
+                .half_open_permits(3),
+        );
+
+        cb.record_failure();
+        thread::sleep(Duration::from_millis(10));
+        let _ = cb.allow();
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    // =========================================================================
+    // Windowed mode tests
+    // =========================================================================
+
+    #[test]
+    fn test_windowed_opens_at_threshold() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::windowed(3, Duration::from_secs(10)));
+
+        cb.record_failure();
+        cb.record_failure();
+        assert!(matches!(cb.state(), CircuitState::Closed { .. }));
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    #[test]
+    fn test_windowed_resets_after_window() {
+        // Note: window uses second-level precision, so use 1 second window
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::windowed(3, Duration::from_secs(1)));
+
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 2 });
+
+        // Wait for window to expire (1 second + buffer)
+        thread::sleep(Duration::from_millis(1100));
+
+        // This failure starts a new window
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 1 });
+
+        // Two more to hit threshold
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    #[test]
+    fn test_windowed_success_resets_window() {
+        let cb = CircuitBreaker::new(CircuitBreakerConfig::windowed(3, Duration::from_secs(10)));
+
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 2 });
+
+        // Success resets the failure count
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+
+        // Need 3 fresh failures to open
+        cb.record_failure();
+        cb.record_failure();
+        assert!(matches!(cb.state(), CircuitState::Closed { .. }));
+
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+    }
+
+    #[test]
+    fn test_windowed_half_open_recovery() {
+        let cb = CircuitBreaker::new(
+            CircuitBreakerConfig::windowed(2, Duration::from_secs(10))
+                .reset_timeout(Duration::from_millis(1)),
+        );
+
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+
+        thread::sleep(Duration::from_millis(10));
+
+        assert!(cb.allow().is_ok());
+        assert!(matches!(cb.state(), CircuitState::HalfOpen { .. }));
+
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::Closed { failure_count: 0 });
+    }
+
+    // =========================================================================
+    // Concurrency tests
+    // =========================================================================
+
+    #[test]
+    fn test_concurrent_failures_open_exactly_once() {
+        for _ in 0..100 {
+            let cb = Arc::new(CircuitBreaker::new(
+                CircuitBreakerConfig::consecutive(10).reset_timeout(Duration::from_secs(3600)),
+            ));
+            // 20 racing failures against a threshold of 10: the breaker must end up open.
+            let handles: Vec<_> = (0..20)
+                .map(|_| {
+                    let cb = Arc::clone(&cb);
+                    thread::spawn(move || {
+                        cb.record_failure();
+                    })
+                })
+                .collect();
+
+            for h in handles {
+                h.join().unwrap();
+            }
+
+            assert_eq!(cb.state(), CircuitState::Open);
+        }
+    }
+
+    #[test]
+    fn test_concurrent_allow_in_half_open_respects_permits() {
+        for _ in 0..100 {
+            let cb = Arc::new(CircuitBreaker::new(
+                CircuitBreakerConfig::consecutive(1)
+                    .reset_timeout(Duration::from_millis(1))
+                    .half_open_permits(5),
+            ));
+
+            cb.record_failure();
+            thread::sleep(Duration::from_millis(10));
+
+            let allowed = Arc::new(std::sync::atomic::AtomicU32::new(0));
+
+            let handles: Vec<_> = (0..20)
+                .map(|_| {
+                    let cb = Arc::clone(&cb);
+                    let allowed = Arc::clone(&allowed);
+                    thread::spawn(move || {
+                        if cb.allow().is_ok() {
+                            allowed.fetch_add(1, Ordering::SeqCst);
+                        }
+                    })
+                })
+                .collect();
+
+            for h in handles {
+                h.join().unwrap();
+            }
+            // `<=`, not `==`: this test only bounds how many probes may be admitted.
+            let total_allowed = allowed.load(Ordering::SeqCst);
+            assert!(
+                total_allowed <= 5,
+                "allowed {} requests but only 5 permits",
+                total_allowed
+            );
+        }
+    }
+
+    #[test]
+    fn test_concurrent_windowed_failures() {
+        for _ in 0..50 {
+            let cb = Arc::new(CircuitBreaker::new(CircuitBreakerConfig::windowed(
+                10,
+                Duration::from_secs(60),
+            )));
+
+            let handles: Vec<_> = (0..20)
+                .map(|_| {
+                    let cb = Arc::clone(&cb);
+                    thread::spawn(move || {
+                        cb.record_failure();
+                    })
+                })
+                .collect();
+
+            for h in handles {
+                h.join().unwrap();
+            }
+
+            assert_eq!(cb.state(), CircuitState::Open);
+        }
+    }
+
+    // =========================================================================
+    // Pack/unpack tests
+    // =========================================================================
+
+    #[test]
+    fn test_pack_unpack_roundtrip() {
+        let test_cases = [
+            (STATE_CLOSED, 0, 0, 0),
+            (STATE_OPEN, 0, 0, 12345),
+            (STATE_HALF_OPEN, 100, 50, 999999),
+            (STATE_CLOSED, FAILURE_MASK, PERMIT_MASK, TIMESTAMP_MASK),
+        ];
+
+        for (state, failures, permits, timestamp) in test_cases {
+            let packed = CircuitBreaker::pack(state, failures, permits, timestamp);
+            let (s, f, p, t) = CircuitBreaker::unpack(packed);
+            assert_eq!(s, state, "state mismatch");
+            assert_eq!(f, failures, "failures mismatch");
+            assert_eq!(p, permits, "permits mismatch");
+            assert_eq!(t, timestamp, "timestamp mismatch");
+        }
+    }
+
+    // =========================================================================
+    // Builder API tests
+    // =========================================================================
+
+    #[test]
+    fn test_builder_consecutive() {
+        let config = CircuitBreakerConfig::consecutive(5)
+            .reset_timeout(Duration::from_secs(60))
+            .half_open_permits(10);
+
+        assert_eq!(
+            config.failure_policy,
+            FailurePolicy::Consecutive { threshold: 5 }
+        );
+        assert_eq!(config.reset_timeout, Duration::from_secs(60));
+        assert_eq!(config.half_open_permits, 10);
+    }
+
+    #[test]
+    fn test_builder_windowed() {
+        let config = CircuitBreakerConfig::windowed(3, Duration::from_secs(10))
+            .reset_timeout(Duration::from_secs(30))
+            .half_open_permits(5);
+
+        assert_eq!(
+            config.failure_policy,
+            FailurePolicy::Windowed {
+                threshold: 3,
+                window: Duration::from_secs(10)
+            }
+        );
+        assert_eq!(config.reset_timeout, Duration::from_secs(30));
+        assert_eq!(config.half_open_permits, 5);
+    }
+}
diff --git a/src/config.rs b/src/config.rs
new file mode 100644
index 0000000..c454ad9
--- /dev/null
+++ b/src/config.rs
@@ -0,0 +1,482 @@
+//! Layered configuration (PATTERNS.md: Figment defaults → TOML → `AI_`-prefixed env).
+//!
+//! Auth + key material come from config (signing public keys, managed pool keys), so the gateway
+//! is fully functional from boot config alone — NATS is only needed for the deny-set.
+
+use crate::error::{GatewayError, Result};
+use crate::key::{Keyring, Kid};
+use crate::secret::Secret;
+use figment::Figment;
+use figment::providers::{Env, Format, Toml};
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::path::Path;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+// `default` so every field is optional. We deliberately do NOT set serde's `deny_unknown_fields`:
+// config is merged from `Env::prefixed("AI_")`, a namespace shared with foreign variables the
+// platform injects (e.g. `AI_AGENT`, `AI_LOG`), so rejecting unknown keys at the serde layer would
+// fail load on a valid environment. Typo protection is instead enforced one layer down, against the
+// *TOML file only* (`reject_unknown_toml_keys`): the file is ours alone — not a shared namespace —
+// so an unrecognized key there is unambiguously a mistake, and a silent one (it loads its default
+// and the setting does nothing), worth a hard, visible boot failure.
+#[serde(default)]
+pub struct AiConfig {
+    /// Downstream listener for client (app) traffic. Internal-only in production (Service Connect
+    /// fronts it as `ai.internal`); no public ingress, so plain HTTP here is fine.
+    pub listen: String,
+    /// Prometheus metrics listener.
+    pub metrics_listen: String,
+
+    /// NATS / slipstream connection (cf. `_envcommon/ecs-service.hcl`: `tls://connect.ngs.global`).
+    /// Used only for the watched deny-set (`blackhole.*`).
+    pub nats_url: String,
+    /// Base64 `.creds` (ECS via SOPS) — takes priority over `nats_creds_file`. Held in `Secret` so
+    /// it can't leak through the `Debug`/`Serialize` this struct derives (a stray `?config` log).
+    pub nats_creds: Option<Secret>,
+    pub nats_creds_file: Option<String>, // fallback credentials source when `nats_creds` is unset
+    /// slipstream bucket holding `blackhole.*` (the deny-set — the only thing in NATS).
+    pub config_bucket: String,
+
+    /// Optional path to an on-disk deny-set snapshot (slipstream's append-log + resume cursor). When
+    /// set **and on durable storage** (the edge/tunnel deployment model), a restart seeds the
+    /// deny-set from this file and *resumes the NATS watch from the saved revision* — skipping the
+    /// boot scan and surviving a restart with enforcement intact even before NATS reconnects. Unset
+    /// (the default, e.g. ephemeral/Fargate) ⇒ seed from a NATS scan each boot, unchanged. The file
+    /// is a pure cache: delete it (or point at scratch) and the gateway falls back to scanning.
+    pub snapshot_path: Option<String>,
+
+    /// Trusted Ed25519 signing **public** keys: `kid` (as string — TOML/JSON map keys are strings)
+    /// → base64 public key. Multiple allowed for zero-downtime rotation. Config, not NATS.
+    pub signing_keys: HashMap<String, String>,
+
+    /// Fail the boot if `signing_keys` is empty, instead of degrading to BYO-only. Empty signing
+    /// keys is a *legitimate* mode (a BYO-only deployment) but is far more often a mis-deploy — a
+    /// typo'd/absent SSM param — that looks healthy while silently dropping **all** managed billing
+    /// and deny-set enforcement. A managed deployment should set this `true` so a bad deploy fails
+    /// fast and visibly at boot rather than serving for free. Default `false` to keep BYO-only and
+    /// the test/e2e harnesses (which run keyless) working out of the box.
+    pub require_signing_keys: bool,
+
+    /// Managed Beyond pool keys, **by provider name** (`openai`, `anthropic`, `fireworks`, …).
+    /// From the `[pool_keys]` TOML table or SSM-injected `AI_POOL_KEY_<NAME>` env (the env form is
+    /// the production path — see `load_with_path`). A provider with no pool key here can't serve
+    /// managed traffic (→ 503); BYO is unaffected. Values are `Secret` so a key can't leak through
+    /// the `Debug`/`Serialize` this struct derives; read the plaintext via `expose` at the use site.
+    pub pool_keys: HashMap<String, Secret>,
+
+    /// Per-provider upstream authority (`host:port`), **by provider name**. For a known provider
+    /// (see `route::KNOWN_PROVIDERS`) this *overrides* its default; for an unknown name it *adds* a
+    /// new OpenAI-wire provider, then reachable at `/{name}/…` (the provider is the request's first
+    /// path segment). Empty = every known provider uses its built-in default. (The e2e harness points
+    /// providers at a mock here.)
+    pub provider_authorities: HashMap<String, String>,
+
+    /// Upstream timeouts (seconds). Read/idle are generous for long streams; connect/read must be > 0.
+    pub connect_timeout_secs: u64,
+    pub read_timeout_secs: u64,
+    pub write_timeout_secs: u64,
+    pub idle_timeout_secs: u64,
+
+    /// Graceful-shutdown drain window (seconds): after SIGTERM, how long Pingora lets **in-flight
+    /// requests finish** before tearing the runtimes down. Maps to Pingora's `grace_period_seconds`
+    /// (left unset, Pingora silently defaults to 300s — this knob makes the window explicit).
+    ///
+    /// **Defaults to 600s — the default `read_timeout_secs` — so we never truncate a response.** It
+    /// is a fixed default, not derived: raise it too when raising `read_timeout_secs`. The gateway is
+    /// a transparent man-in-the-middle: cutting an in-flight stream on deploy corrupts a generation
+    /// the caller is paying for and can't cleanly retry (a half-delivered SSE isn't idempotent). A
+    /// drain window of at least `read_timeout_secs` lets accepted requests finish — Pingora stops
+    /// *accepting* new connections the instant SIGTERM lands, so this only waits out existing
+    /// streams, not new work. Slower rollouts are the deliberate price of not mangling responses.
+    ///
+    /// **The orchestrator must grant the same window**, or it caps us: the platform SIGKILLs at its
+    /// own stop timeout regardless of this value. Set k8s `terminationGracePeriodSeconds` (or the EC2
+    /// agent's `ECS_CONTAINER_STOP_TIMEOUT`) to match. Note **ECS Fargate caps `stopTimeout` at 120s**
+    /// — there, full coverage of a 600s stream is impossible and the longest streams will still be
+    /// cut at 120s; that's a Fargate limitation, not a reason to default to truncating.
+    pub shutdown_grace_period_secs: u64,
+    /// Final runtime-teardown timeout (seconds) **after** the drain window: how long Pingora waits for
+    /// the tokio runtimes to exit before forcing the process down. Maps to Pingora's
+    /// `graceful_shutdown_timeout_seconds` (unset ⇒ a silent 5s default). A few seconds is enough to
+    /// flush logs/metrics; this is a backstop against a wedged runtime hanging shutdown forever, not a
+    /// second drain window (that's `shutdown_grace_period_secs`).
+    pub shutdown_runtime_timeout_secs: u64,
+
+    /// TLS to the upstream provider. Real providers are HTTPS (true); the e2e harness sets false
+    /// to talk to a plaintext mock.
+    pub upstream_tls: bool,
+
+    /// Prefer HTTP/2 (with HTTP/1.1 fallback) to the upstream. `true` ⇒ peer ALPN `H2H1`: every
+    /// provider that offers `h2` over TLS is reached over a multiplexed H2 connection (fewer sockets
+    /// and TLS handshakes from our egress IPs), and any host that doesn't offer it negotiates down to
+    /// H1. `false` ⇒ ALPN `H1` (one connection per in-flight request, pooled). The knob exists so an
+    /// operator can fall back to H1 without a code redeploy if a provider's h2 stack misbehaves, and
+    /// so the e2e concurrency bench can compare the two. Only consulted over TLS — a plaintext upstream
+    /// (the mock) has no ALPN and is always H1 regardless.
+    pub upstream_http2: bool,
+
+    /// Verify the upstream's TLS certificate (and that it matches the SNI). `true` everywhere in
+    /// production. The **only** intended `false` is the e2e concurrency bench, whose TLS mock presents
+    /// a self-signed cert — turning verification off there lets us exercise the real TLS+ALPN+H2 path
+    /// against a local mock without a CA. Never set this `false` against a real provider.
+    pub upstream_verify_cert: bool,
+
+    /// Per-credential request-rate ceiling (requests/sec). A blast-radius guardrail (see `ratelimit`),
+    /// not a spend control: it caps how fast a single credential (managed virtual key ≈ a `(tenant,
+    /// app)`, or a BYO token) can drive the gateway, bounding a leaked/runaway key during the
+    /// deny-set's reaction lag and a failure flood that never bills. `0` disables it. The default is
+    /// generous — a circuit breaker, not a quota; tune from `ai_rejections_total{reason="rate_limit"}`.
+    pub rate_limit_rps: u32,
+
+    /// Aggregate request-rate ceiling (requests/sec) for **all BYO traffic combined** — a single
+    /// shared bucket. BYO is unverified and upstream-bound, so a flood of *distinct* random BYO tokens
+    /// slips past the per-credential ceiling and would open junk-auth connections to providers from
+    /// our egress IPs (getting them rate-limited or banned). This bounds that aggregate regardless of
+    /// token variation. Managed traffic is **exempt** (it's Ed25519-verified before any upstream
+    /// connect and can't be forged), so this shared bucket never sheds core tenant load. `0` disables
+    /// it. Generous by default; tune from `ai_rejections_total{reason="rate_limit_byo_global"}`.
+    ///
+    /// Before changing this (or reaching for per-IP limiting), read the **design-decision** block in
+    /// the `ratelimit` module docs: it records why this is a global cap and not per-source-IP, what it
+    /// deliberately doesn't cover, and why the real fix for egress-reputation pain is a
+    /// provider-feedback circuit breaker rather than a bigger number here.
+    pub byo_rate_limit_rps: u32,
+
+    /// Per-provider circuit breaker: number of upstream **failures within `circuit_breaker_window_secs`**
+    /// that trips the breaker open for that provider. A failure is a **5xx response or a connect
+    /// failure** — i.e. the *provider is broken*. A `429` is deliberately **not** a failure: it means
+    /// the provider is healthy and throttling our pool key (a velocity/spend signal the rate limiter
+    /// and the client's `Retry-After` backoff own), so tripping on it would convert a self-healing
+    /// throttle into a self-inflicted outage. While open, requests to that provider fast-fail with a
+    /// `503` (`ai_rejections_total{reason="circuit_open"}`) instead of piling up against
+    /// `read_timeout_secs` and exhausting connection/in-flight slots for *every* provider. After
+    /// `circuit_breaker_reset_secs` a probe request is allowed; success closes it, failure reopens it.
+    /// Applies to **all** traffic to the provider (managed + BYO) — a down provider is down regardless
+    /// of whose key is used. `0` disables the breaker entirely. Default is generous so normal
+    /// background 5xx noise never trips it.
+    pub circuit_breaker_threshold: u32,
+    /// Rolling window (seconds) over which `circuit_breaker_threshold` failures are counted. Failures
+    /// older than the window are forgotten — so it trips on a *burst* of failures, not on a slow trickle
+    /// spread across a healthy day.
+    pub circuit_breaker_window_secs: u64,
+    /// How long the breaker stays open before allowing a half-open probe request (seconds). Long enough
+    /// to let a provider recover, short enough that recovery is detected promptly.
+    pub circuit_breaker_reset_secs: u64,
+}
+
+impl Default for AiConfig {
+    fn default() -> Self {
+        AiConfig {
+            listen: String::from("0.0.0.0:8080"),
+            metrics_listen: String::from("0.0.0.0:9090"),
+            nats_url: String::from("nats://localhost:4222"),
+            nats_creds: None,
+            nats_creds_file: None,
+            config_bucket: String::from("ai-gateway"),
+            snapshot_path: None,
+            signing_keys: HashMap::new(),
+            require_signing_keys: false,
+            pool_keys: HashMap::new(),
+            provider_authorities: HashMap::new(),
+            connect_timeout_secs: 10,
+            // LLM streams can run for minutes, so the read timeout is deliberately generous.
+            read_timeout_secs: 600,
+            write_timeout_secs: 60,
+            idle_timeout_secs: 90,
+            // Drain window equal to the default read timeout above, so a deploy never truncates an
+            // in-flight stream — a transparent proxy must not mangle a paid-for generation. Pingora
+            // stops accepting new connections at SIGTERM, so this only waits out existing streams.
+            // The orchestrator's stop timeout must match (see the field docs; ECS Fargate's 120s
+            // cap is a hard limit there). A short runtime-teardown backstop follows the drain.
+            shutdown_grace_period_secs: 600,
+            shutdown_runtime_timeout_secs: 10,
+            upstream_tls: true,
+            // H2 to providers by default (all of `KNOWN_PROVIDERS` offer it; H1 fallback is
+            // automatic). Set false for an all-H1 upstream without recompiling.
+            upstream_http2: true,
+            // Upstream certs are verified by default; only the bench's self-signed mock opts out.
+            upstream_verify_cert: true,
+            // Generous per-credential rate ceiling, on by default. Leaves legitimate steady-state
+            // traffic alone; caps a runaway/leaked key or a retry-storm flood. Set 0 to disable.
+            rate_limit_rps: 100,
+            // Generous aggregate BYO ceiling, on by default — well above expected legitimate BYO
+            // throughput, low enough that a junk-auth flood can't get our egress IPs flagged by
+            // the providers. Tune from the metric; set 0 to disable. (Managed traffic is exempt.)
+            byo_rate_limit_rps: 1_000,
+            // Per-provider breaker: trip after 20 upstream failures (5xx/connect) within 10s,
+            // stay open 30s, then probe. Generous enough that occasional background 5xx never
+            // trips it — only a sustained brownout does. Set the threshold to 0 to disable.
+            circuit_breaker_threshold: 20,
+            circuit_breaker_window_secs: 10,
+            circuit_breaker_reset_secs: 30,
+        }
+    }
+}
+
+impl AiConfig {
+    /// Load config by layering defaults → TOML at `path` (else `config.toml`) → `AI_*` env vars.
+    pub fn load_with_path(path: Option<&Path>) -> Result<Self> {
+        let toml_path = match path {
+            Some(explicit) => explicit,
+            None => Path::new("config.toml"),
+        };
+        // Vet the operator's own TOML for typo'd keys before anything merges: a misspelled key
+        // would otherwise silently keep its default. The env layer is deliberately not checked.
+        reject_unknown_toml_keys(toml_path)?;
+
+        // Later layers win. The env mapping is flat (`AI_READ_TIMEOUT_SECS` → `read_timeout_secs`,
+        // no `.split('_')`) and tolerates unknown `AI_*` vars, so pool keys are folded in below.
+        let mut cfg = Figment::from(figment::providers::Serialized::defaults(AiConfig::default()))
+            .merge(Toml::file(toml_path))
+            .merge(Env::prefixed("AI_"))
+            .extract::<AiConfig>()
+            .map_err(|e| GatewayError::Config(e.to_string()))?;
+        // Pool keys arrive as `AI_POOL_KEY_<NAME>` env vars; then sanity-check the merged result.
+        cfg.merge_pool_key_env(std::env::vars());
+        cfg.validate()?;
+        Ok(cfg)
+    }
+
+    /// Reject values that would otherwise fail silently at runtime. A `0` connect/read timeout
+    /// (e.g. a typo'd SSM param) becomes a zero-length deadline that fails every upstream call —
+    /// visible only as a 502 cascade — so it is refused here and a mis-deploy fails at boot.
+    /// Write/idle are left unconstrained (Pingora treats them as best-effort).
+    fn validate(&self) -> Result<()> {
+        let must_be_positive = [
+            (
+                self.connect_timeout_secs,
+                "connect_timeout_secs must be > 0 (a 0 connect timeout fails every upstream connect)",
+            ),
+            (
+                self.read_timeout_secs,
+                "read_timeout_secs must be > 0 (a 0 read timeout aborts every response before it arrives)",
+            ),
+        ];
+        match must_be_positive.iter().find(|(secs, _)| *secs == 0) {
+            Some((_, why)) => Err(GatewayError::Config(why.to_string())),
+            None => Ok(()),
+        }
+    }
+
+    /// Per-provider circuit-breaker settings, or `None` when `circuit_breaker_threshold == 0`
+    /// (breaker disabled). Always the windowed policy — `circuit_breaker_threshold` failures within
+    /// `circuit_breaker_window_secs` — with `circuit_breaker_reset_secs` as the reset timeout.
+    /// Each provider gets its own breaker built from this (see `state::build_providers`).
+    pub fn circuit_breaker_config(&self) -> Option<crate::circuit_breaker::CircuitBreakerConfig> {
+        use crate::circuit_breaker::CircuitBreakerConfig;
+        use std::time::Duration;
+
+        let threshold = self.circuit_breaker_threshold;
+        if threshold == 0 {
+            return None;
+        }
+
+        let window = Duration::from_secs(self.circuit_breaker_window_secs);
+        let reset_after = Duration::from_secs(self.circuit_breaker_reset_secs);
+        // NOTE(review): a zero window/reset is passed through unvalidated — confirm that's intended.
+        Some(CircuitBreakerConfig::windowed(threshold, window).reset_timeout(reset_after))
+    }
+
+    /// Fold `AI_POOL_KEY_<NAME>` env vars into `pool_keys` (name lowercased); env wins over any
+    /// `[pool_keys]` file value. This is the production secret path (SSM-injected env): the flat
+    /// figment merge can't target a map field. A bare `AI_POOL_KEY_` (empty name) is skipped.
+    fn merge_pool_key_env(&mut self, vars: impl Iterator<Item = (String, String)>) {
+        let from_env = vars.filter_map(|(var, value)| {
+            let name = var.strip_prefix("AI_POOL_KEY_")?;
+            // An empty suffix names no provider; don't register a junk `""` entry for it.
+            (!name.is_empty()).then(|| (name.to_ascii_lowercase(), Secret::new(value)))
+        });
+        self.pool_keys.extend(from_env);
+    }
+
+    /// Build the trusted keyring from `signing_keys`, failing on the first unparsable kid or key.
+    pub fn build_keyring(&self) -> Result<Keyring> {
+        let mut ring = Keyring::new();
+        for (kid_str, b64) in self.signing_keys.iter() {
+            // Each entry must yield both a valid `Kid` and a valid verifying key, or we bail.
+            let bad_id = |_| GatewayError::Config(format!("invalid signing key id {kid_str}"));
+            let kid = kid_str.parse::<Kid>().map_err(bad_id)?;
+            let bad_key =
+                || GatewayError::Config(format!("invalid signing public key for kid {kid}"));
+            let vk = crate::key::verifying_key_from_value(b64.as_bytes()).ok_or_else(bad_key)?;
+            ring.insert(kid, vk);
+        }
+        Ok(ring)
+    }
+}
+
+/// Every top-level key a config file is allowed to set. The set comes from serializing
+/// `AiConfig::default()` through figment and collecting the resulting keys, so it follows the
+/// struct's field list automatically. If that serialization fails, the set is empty.
+fn known_config_keys() -> std::collections::BTreeSet<String> {
+    use figment::Provider as _;
+    let defaults = figment::providers::Serialized::defaults(AiConfig::default());
+    match defaults.data() {
+        Ok(profiles) => profiles
+            .into_values()
+            .flat_map(|dict| dict.into_keys())
+            .collect(),
+        Err(_) => std::collections::BTreeSet::new(),
+    }
+}
+
+/// Reject the TOML file at `path` if it sets any top-level key that is not an `AiConfig` field.
+/// A missing file is fine (the gateway runs on defaults + env): when figment yields no data for
+/// the file there are no keys to reject and the check passes. See the `deny_unknown_fields` note
+/// on `AiConfig` for why this is scoped to the TOML file and not the env layer.
+fn reject_unknown_toml_keys(path: &Path) -> Result<()> {
+    use figment::Provider as _;
+    let known = known_config_keys();
+
+    // Ordered set: the offending keys are reported sorted and without duplicates.
+    let mut unknown = std::collections::BTreeSet::new();
+    if let Ok(profiles) = Toml::file(path).data() {
+        for dict in profiles.into_values() {
+            unknown.extend(dict.into_keys().filter(|key| !known.contains(key)));
+        }
+    }
+    if unknown.is_empty() {
+        return Ok(());
+    }
+
+    let offending = unknown.into_iter().collect::<Vec<String>>().join(", ");
+    let allowed = known.into_iter().collect::<Vec<String>>().join(", ");
+    Err(GatewayError::Config(format!(
+        "unknown key(s) in {}: {} — check for a typo (known keys: {})",
+        path.display(),
+        offending,
+        allowed,
+    )))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn defaults_are_sane() {
+        let c = AiConfig::default();
+        // Read timeout must comfortably exceed a long stream.
+        assert!(c.read_timeout_secs >= 300);
+        assert_eq!(c.config_bucket, "ai-gateway");
+    }
+
+    #[test]
+    fn loads_without_a_file() {
+        let c = AiConfig::load_with_path(None).unwrap();
+        assert_eq!(c.listen, "0.0.0.0:8080");
+    }
+
+    #[test]
+    fn validate_rejects_zero_connect_and_read_timeouts() {
+        // A 0 connect/read timeout (a typo'd SSM param) must fail boot loudly, not degrade into a
+        // 502 cascade at runtime.
+        assert!(
+            AiConfig {
+                connect_timeout_secs: 0,
+                ..Default::default()
+            }
+            .validate()
+            .is_err()
+        );
+        assert!(
+            AiConfig {
+                read_timeout_secs: 0,
+                ..Default::default()
+            }
+            .validate()
+            .is_err()
+        );
+        // Defaults are valid.
+        assert!(AiConfig::default().validate().is_ok());
+    }
+
+    /// Write `body` to a temp TOML file and return its path; the caller removes it. The name
+    /// combines `label` (distinct per test, so parallel tests in one run don't collide) with the
+    /// process id (so concurrent test runs sharing a temp dir don't clobber each other's file).
+    fn temp_toml(label: &str, body: &str) -> std::path::PathBuf {
+        let file_name = format!("beyond-ai-cfg-{}-{label}.toml", std::process::id());
+        let path = std::env::temp_dir().join(file_name);
+        std::fs::write(&path, body).unwrap();
+        path
+    }
+
+    #[test]
+    fn rejects_typod_toml_key() {
+        // A misspelled key in the operator's own TOML is a silent footgun (loads its default, the
+        // setting does nothing) — load must fail loudly and name the offending key, not boot healthy.
+        let path = temp_toml(
+            "typo",
+            "listen = \"0.0.0.0:1234\"\nreqiure_signing_keys = true\n",
+        );
+        let err = AiConfig::load_with_path(Some(&path)).unwrap_err();
+        let _ = std::fs::remove_file(&path);
+        match err {
+            GatewayError::Config(msg) => assert!(
+                msg.contains("reqiure_signing_keys"),
+                "error must name the typo'd key, got: {msg}"
+            ),
+            other => panic!("expected Config error, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn accepts_known_toml_keys() {
+        // Every key here is a real `AiConfig` field (including the `[signing_keys]` table) — load
+        // must succeed and apply the values.
+        let path = temp_toml(
+            "known",
+            "listen = \"0.0.0.0:1234\"\nrequire_signing_keys = true\nrate_limit_rps = 7\n\n[signing_keys]\n1 = \"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\"\n",
+        );
+        let c = AiConfig::load_with_path(Some(&path)).unwrap();
+        let _ = std::fs::remove_file(&path);
+        assert_eq!(c.listen, "0.0.0.0:1234");
+        assert!(c.require_signing_keys);
+        assert_eq!(c.rate_limit_rps, 7);
+        assert!(c.signing_keys.contains_key("1"));
+    }
+
+    #[test]
+    fn build_keyring_rejects_non_numeric_kid() {
+        // `kid` is parsed as `u32`; a non-numeric map key must fail boot (loud) rather than
+        // silently drop a trusted signing key (which would 401 every token under it).
+        let c = AiConfig {
+            signing_keys: HashMap::from([("not-a-number".to_string(), "AAAA".to_string())]),
+            ..Default::default()
+        };
+        assert!(c.build_keyring().is_err());
+    }
+
+    #[test]
+    fn build_keyring_rejects_invalid_public_key() {
+        // A value that is neither raw 32 bytes nor base64 of 32 bytes must fail boot, not install a
+        // bogus key that can never verify anything.
+        let c = AiConfig {
+            signing_keys: HashMap::from([("1".to_string(), "!!! not base64 !!!".to_string())]),
+            ..Default::default()
+        };
+        assert!(c.build_keyring().is_err());
+    }
+
+    #[test]
+    fn pool_key_env_merges_and_overrides() {
+        // `AI_POOL_KEY_<NAME>` → `pool_keys[name]` (lowercased), and env wins over a config-file
+        // value (the production secret path). A non-pool `AI_*` var is ignored.
+        let mut c = AiConfig {
+            pool_keys: HashMap::from([("openai".to_string(), Secret::new("from-file"))]),
+            ..Default::default()
+        };
+        c.merge_pool_key_env(
+            [
+                ("AI_POOL_KEY_OPENAI".to_string(), "from-env".to_string()),
+                ("AI_POOL_KEY_GROQ".to_string(), "gsk-x".to_string()),
+                ("AI_LOG".to_string(), "debug".to_string()),
+            ]
+            .into_iter(),
+        );
+        assert_eq!(c.pool_keys.get("openai").unwrap().expose(), "from-env");
+        assert_eq!(c.pool_keys.get("groq").unwrap().expose(), "gsk-x");
+        assert!(!c.pool_keys.contains_key("log"));
+    }
+}
diff --git a/src/deny.rs b/src/deny.rs
new file mode 100644
index 0000000..af9964f
--- /dev/null
+++ b/src/deny.rs
@@ -0,0 +1,155 @@
+//! Sparse per-tenant deny-set — the gateway's *entire* spend/fraud surface.
+//!
+//! Design (deliberate, see plan): the gateway only ever asks "is this tenant cut off?" and
+//! default-**allows** on a miss. We hold **only the exceptions** (the cut-off tenants), so memory
+//! is `O(denied)`, not `O(tenants)` — this scales to millions of tenants because `denied` stays a
+//! tiny slice (tens of MB even at 1M entries; ~16 bytes per map entry). The gateway never decides
+//! *why* a tenant is denied — the control plane writes/removes entries; we just enforce + log.
+//!
+//! TTL/auto-restore is handled by slipstream, not here: spend holds are written with a TTL to the
+//! next budget reset, so they expire into a `Del` event that removes them; fraud holds have no TTL
+//! (sticky). This struct only reflects current membership.
+
+use std::collections::HashMap;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DenyReason {
+    /// Tenant is over budget. Usually stored with a TTL up to the next reset, so it lapses.
+    Spend,
+    /// Abuse / fraud hold. No TTL: it stays until a human clears it.
+    Fraud,
+    /// The entry value named no reason we recognize. The tenant is denied anyway (fail safe).
+    Unknown,
+}
+
+impl DenyReason {
+    /// Status code for a denied request: 402 Payment Required for `Spend`, 403 Forbidden for
+    /// `Fraud` and `Unknown` — a meaningful signal for clients and dashboards, with no detail.
+    pub fn http_status(self) -> u16 {
+        match self {
+            Self::Spend => 402,
+            Self::Fraud | Self::Unknown => 403,
+        }
+    }
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct DenySet {
+    denied: HashMap<u64, DenyReason>, // tenant id → why that tenant is cut off
+}
+
+impl DenySet {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Whether `tenant_id` is currently cut off. Absence from the set means allowed — the
+    /// safe-for-availability default: a tenant we've never heard of is served, not blocked.
+    pub fn is_denied(&self, tenant_id: u64) -> bool {
+        self.reason(tenant_id).is_some()
+    }
+
+    pub fn reason(&self, tenant_id: u64) -> Option<DenyReason> {
+        self.denied.get(&tenant_id).copied()
+    }
+
+    pub fn insert(&mut self, tenant_id: u64, reason: DenyReason) {
+        self.denied.insert(tenant_id, reason);
+    }
+
+    pub fn remove(&mut self, tenant_id: u64) {
+        self.denied.remove(&tenant_id);
+    }
+
+    pub fn len(&self) -> usize {
+        self.denied.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.denied.is_empty()
+    }
+}
+
+impl FromIterator<(u64, DenyReason)> for DenySet {
+    fn from_iter<I: IntoIterator<Item = (u64, DenyReason)>>(iter: I) -> Self {
+        // A tenant id that appears more than once still ends up as a single entry.
+        let denied: HashMap<u64, DenyReason> = iter.into_iter().collect();
+        Self { denied }
+    }
+}
+
+/// Parse a slipstream deny key `blackhole.{tenant_id}` into its tenant id; `None` otherwise.
+pub fn parse_key(key: &str) -> Option<u64> {
+    let tenant = key.strip_prefix("blackhole.")?; // unrelated watched keys stop here
+    tenant.parse().ok()
+}
+
+/// Map a deny entry's value to a `DenyReason`. Accepted forms: a bare token (`spend`/`fraud`,
+/// surrounding whitespace ignored) or a JSON object `{"reason":"spend", ...}`. Anything else —
+/// invalid UTF-8, malformed JSON, an unknown reason — is `Unknown` (still denied: fail safe).
+pub fn parse_reason(value: &[u8]) -> DenyReason {
+    fn from_token(token: &str) -> DenyReason {
+        match token {
+            "spend" => DenyReason::Spend,
+            "fraud" => DenyReason::Fraud,
+            _ => DenyReason::Unknown,
+        }
+    }
+
+    let text = std::str::from_utf8(value).unwrap_or("").trim();
+    if !text.starts_with('{') {
+        return from_token(text);
+    }
+    // JSON form: classify the `reason` string while the parsed document is still alive.
+    let document = serde_json::from_slice::<serde_json::Value>(value).ok();
+    let reason = document.as_ref().and_then(|doc| doc.get("reason")?.as_str());
+    from_token(reason.unwrap_or(""))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn default_allows_unknown_tenants() {
+        let set = DenySet::new();
+        assert!(!set.is_denied(12345));
+    }
+
+    #[test]
+    fn insert_remove_and_reason() {
+        let mut set = DenySet::new();
+        set.insert(1, DenyReason::Spend);
+        set.insert(2, DenyReason::Fraud);
+        assert!(set.is_denied(1));
+        assert_eq!(set.reason(1), Some(DenyReason::Spend));
+        assert_eq!(set.reason(2).unwrap().http_status(), 403);
+        set.remove(1);
+        assert!(!set.is_denied(1)); // restored
+        assert_eq!(set.len(), 1);
+    }
+
+    #[test]
+    fn key_parsing() {
+        assert_eq!(parse_key("blackhole.42"), Some(42));
+        assert_eq!(parse_key("blackhole.notanumber"), None);
+        assert_eq!(parse_key("signkey.1"), None);
+    }
+
+    #[test]
+    fn reason_parsing_bare_and_json() {
+        assert_eq!(parse_reason(b"spend"), DenyReason::Spend);
+        assert_eq!(parse_reason(b" fraud "), DenyReason::Fraud);
+        assert_eq!(
+            parse_reason(br#"{"reason":"spend","exp":123}"#),
+            DenyReason::Spend
+        );
+        assert_eq!(parse_reason(b"weird"), DenyReason::Unknown);
+    }
+
+    #[test]
+    fn spend_is_402_fraud_is_403() {
+        assert_eq!(DenyReason::Spend.http_status(), 402);
+        assert_eq!(DenyReason::Fraud.http_status(), 403);
+    }
+}
diff --git a/src/doctor.rs b/src/doctor.rs
new file mode 100644
index 0000000..3effe4e
--- /dev/null
+++ b/src/doctor.rs
@@ -0,0 +1,227 @@
+//! Diagnostics (PATTERNS.md `doctor` pattern): fast prerequisite checks, exit 0/1.
+//!
+//! The point is to catch a misconfiguration *before* traffic lands on the instance, where it would
+//! otherwise surface as a first-request failure (a 401 from an empty keyring, a 503 from a missing
+//! pool key, a 502 from an unresolvable provider). We check the things boot does lazily or never:
+//! NATS reachability, the signing keyring, managed pool keys, and provider DNS.
+
+use crate::config::AiConfig;
+use crate::route;
+use std::collections::BTreeMap;
+use std::time::Duration;
+
+pub struct CheckResult {
+    pub name: &'static str, // label `print_results` shows for this check
+    pub passed: bool, // `true` when the check succeeded
+    pub message: String, // detail printed next to the label
+    pub hint: Option<String>, // remediation advice; `print_results` shows it only on failure
+}
+
+fn pass(name: &'static str, message: impl Into<String>) -> CheckResult {
+    outcome(name, message.into(), None)
+}
+
+fn fail(name: &'static str, message: impl Into<String>, hint: &str) -> CheckResult {
+    outcome(name, message.into(), Some(hint.to_string()))
+}
+
+/// Shared constructor behind `pass`/`fail`: a result passes exactly when it carries no hint.
+fn outcome(name: &'static str, message: String, hint: Option<String>) -> CheckResult {
+    CheckResult {
+        name,
+        passed: hint.is_none(),
+        message,
+        hint,
+    }
+}
+
+/// Run every doctor check and collect the results in a fixed order: NATS connectivity, the
+/// signing keyring, the pool keys, then one DNS result per provider authority.
+///
+/// `config` supplies everything the checks inspect: the NATS URL and credentials, the signing
+/// and pool keys, and the provider authority overrides.
+///
+/// No check short-circuits the others — a failure is recorded as a failed `CheckResult` and
+/// the remaining checks still run, so a single invocation reports everything it can find.
+pub async fn run_checks(config: &AiConfig) -> Vec<CheckResult> {
+    // NATS / slipstream reachability — without it we can't load signing keys or the deny-set.
+    // Only the outcome is of interest here: on success the returned connection is dropped
+    // straight away rather than kept.
+    let creds = config.nats_creds.as_ref().map(|s| s.expose());
+    let creds_file = config.nats_creds_file.as_deref();
+    let nats = match store::nats_connect(&config.nats_url, creds, creds_file).await {
+        Ok(_) => pass("nats", format!("connected to {}", config.nats_url)),
+        Err(e) => fail("nats", e.to_string(), "check AI_NATS_URL and credentials"),
+    };
+
+    // The two key checks are synchronous and only read `config`; the DNS check is awaited
+    // last and contributes one entry per provider.
+    let mut results = vec![nats, check_signing_keys(config), check_pool_keys(config)];
+    results.extend(check_provider_dns(config).await);
+    results
+}
+
+/// Check the signing keyring, which is what authenticates managed traffic. An empty or invalid
+/// keyring isn't a hard boot failure (the gateway still serves BYO), but it silently turns
+/// *every* `bai_…` key into a 401 — a footgun worth surfacing loudly here.
+///
+/// `build_keyring` already rejects a non-numeric kid or an unparseable public key, so only the
+/// key count is inspected: zero fails, and a successful build means the configured keys installed.
+fn check_signing_keys(config: &AiConfig) -> CheckResult {
+    let name = "signing_keys";
+    match config.build_keyring().map(|ring| ring.len()) {
+        Ok(0) => fail(
+            name,
+            "no signing keys configured — all managed (bai_…) traffic will 401, only BYO works",
+            "set [signing_keys] (kid → base64 Ed25519 public key) in config or AI_ env",
+        ),
+        Ok(count) => pass(name, format!("{count} signing key(s) loaded")),
+        Err(e) => fail(
+            name,
+            e.to_string(),
+            "every kid must be numeric and every value a base64 (or raw 32-byte) Ed25519 public key",
+        ),
+    }
+}
+
+/// Pool keys back managed traffic (swapped in per provider). Configured signing keys mean the
+/// operator *intends* to serve managed traffic, so having no pool keys then is a failure (every
+/// managed request would 503); with no signing keys either it is a legitimate BYO-only setup.
+fn check_pool_keys(config: &AiConfig) -> CheckResult {
+    if !config.pool_keys.is_empty() {
+        let mut names: Vec<&str> = config.pool_keys.keys().map(String::as_str).collect();
+        names.sort_unstable();
+        return pass("pool_keys", format!("pool keys for: {}", names.join(", ")));
+    }
+    if config.signing_keys.is_empty() {
+        return pass(
+            "pool_keys",
+            "none configured (BYO-only deployment — no signing keys either)",
+        );
+    }
+    fail(
+        "pool_keys",
+        "signing keys are configured (managed traffic expected) but no pool keys are set — \
+         every managed request will 503",
+        "set AI_POOL_KEY_<PROVIDER> (e.g. AI_POOL_KEY_OPENAI) for each provider you serve",
+    )
+}
+
+/// Resolve every provider authority the gateway might dial (known providers + config overrides/adds),
+/// so a DNS or typo'd-authority problem shows up here rather than as a 502 on the first request. Each
+/// lookup is bounded so one black-holed host can't hang the doctor. We don't connect (no auth, no TLS
+/// handshake) — reachability of the *name* is the prerequisite; live auth is proven by the smoke test.
+async fn check_provider_dns(config: &AiConfig) -> Vec<CheckResult> {
+    // A single source for the lookup budget: the timer and the timeout message both read it.
+    const TIMEOUT_SECS: u64 = 3;
+
+    // Effective authority per provider name: the known default unless config overrides it, plus any
+    // config-only provider. A BTreeMap dedups and keeps the output stable/ordered.
+    let mut authorities: BTreeMap<&str, String> = BTreeMap::new();
+    for spec in route::KNOWN_PROVIDERS {
+        authorities.insert(spec.name, spec.authority.to_string());
+    }
+    for (name, authority) in &config.provider_authorities {
+        authorities.insert(name.as_str(), authority.clone());
+    }
+
+    let mut results = Vec::with_capacity(authorities.len());
+    for (name, authority) in authorities {
+        // `CheckResult.name` is `&'static str`: a known provider lends its static name; a config-only
+        // provider (non-'static) reports under a generic label, with the real name in the message.
+        let check_name: &'static str = route::KNOWN_PROVIDERS
+            .iter()
+            .find(|s| s.name == name)
+            .map_or("provider_dns", |s| s.name);
+        // Borrow the authority for the lookup — the result messages below still need it.
+        let lookup = tokio::net::lookup_host(authority.as_str());
+        let res = match tokio::time::timeout(Duration::from_secs(TIMEOUT_SECS), lookup).await {
+            Ok(Ok(mut addrs)) => match addrs.next() {
+                Some(addr) => pass(check_name, format!("{name} → {authority} ({addr})")),
+                None => fail(
+                    check_name,
+                    format!("{name}: {authority} resolved to no addresses"),
+                    "check the provider authority (host:port) in provider_authorities",
+                ),
+            },
+            Ok(Err(e)) => fail(
+                check_name,
+                format!("{name}: {authority}: {e}"),
+                "check the provider authority (host:port) and DNS",
+            ),
+            Err(_) => fail(
+                check_name,
+                format!("{name}: {authority}: DNS lookup timed out (>{TIMEOUT_SECS}s)"),
+                "the upstream host may be unreachable or DNS is slow",
+            ),
+        };
+        results.push(res);
+    }
+    results
+}
+
+pub fn print_results(title: &str, results: &[CheckResult]) {
+    println!("== {title} ==");
+    for check in results {
+        let mark = if check.passed { "ok" } else { "FAIL" };
+        println!("[{mark}] {}: {}", check.name, check.message);
+        if let Some(hint) = check.hint.as_deref().filter(|_| !check.passed) {
+            println!("       hint: {hint}");
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::secret::Secret;
+    use std::collections::HashMap;
+
+    #[test]
+    fn signing_keys_empty_fails() {
+        // No keys ⇒ every managed token 401s; doctor must flag it, not pass silently.
+        let c = AiConfig::default();
+        assert!(!check_signing_keys(&c).passed);
+    }
+
+    #[test]
+    fn signing_keys_valid_passes() {
+        let c = AiConfig {
+            // 32 zero bytes, base64 — a structurally valid Ed25519 public key.
+            signing_keys: HashMap::from([(
+                "1".to_string(),
+                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(),
+            )]),
+            ..Default::default()
+        };
+        assert!(check_signing_keys(&c).passed);
+    }
+
+    #[test]
+    fn pool_keys_missing_with_signing_keys_fails() {
+        // Signing keys present (managed intended) but no pool keys ⇒ every managed request 503s.
+        let c = AiConfig {
+            signing_keys: HashMap::from([(
+                "1".to_string(),
+                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA".to_string(),
+            )]),
+            ..Default::default()
+        };
+        assert!(!check_pool_keys(&c).passed);
+    }
+
+    #[test]
+    fn pool_keys_absent_byo_only_passes() {
+        // No signing keys and no pool keys is a legitimate BYO-only deployment — must not fail.
+        assert!(check_pool_keys(&AiConfig::default()).passed);
+    }
+
+    #[test]
+    fn pool_keys_present_passes() {
+        let c = AiConfig {
+            pool_keys: HashMap::from([("openai".to_string(), Secret::new("sk-x"))]),
+            ..Default::default()
+        };
+        assert!(check_pool_keys(&c).passed);
+    }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..9f841ea
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,15 @@
+//! Structured error type (PATTERNS.md convention: `thiserror` enum, `From` for foreign errors).
+
+#[derive(Debug, thiserror::Error)]
+pub enum GatewayError {
+    #[error("configuration error: {0}")]
+    Config(String), // invalid configuration; the message says what is wrong
+
+    #[error("store error: {0}")]
+    Store(#[from] store::KvError), // `#[from]` lets `?` convert a `store::KvError` directly
+
+    #[error("dns resolution error: {0}")]
+    Dns(String), // NOTE(review): presumably raised by provider DNS resolution — no use in view
+}
+
+pub type Result<T> = std::result::Result<T, GatewayError>;
diff --git a/src/key.rs b/src/key.rs
new file mode 100644
index 0000000..4cdce56
--- /dev/null
+++ b/src/key.rs
@@ -0,0 +1,363 @@
+//! Stateless virtual API key: `bai_v1.{kid}.{payload}.{sig}`.
+//!
+//! The gateway authenticates every request from a `{payload}` it can verify **without a
+//! lookup**: tenant/app identity lives *inside* the token, signed with Ed25519. We hold only
+//! the *public* keys (by `kid`), so a compromised — or third-party / OSS — gateway can verify
+//! but **cannot mint** new tenant keys; the private signing key lives only in the control plane.
+//!
+//! Why signed-token instead of opaque-token + registry lookup: at millions of tenants we don't
+//! want a per-request lookup (latency + a state dependency) just to learn *who* is calling.
+//! Identity is stateless here; the only per-request state is the sparse deny-set (see `deny`),
+//! which is a membership check, not an identity lookup.
+//!
+//! Why deterministic (no nonce/timestamp in the payload): `mint(tenant, app)` is reproducible,
+//! so the control plane can re-derive a tenant's key on demand and store nothing. Revocation is
+//! handled out-of-band by the deny-set, not by per-key expiry.
+
+use base64::Engine;
+use base64::engine::general_purpose::URL_SAFE_NO_PAD;
+use ed25519_dalek::{Signature, Signer, SigningKey, Verifier, VerifyingKey};
+use std::collections::HashMap;
+
+/// Wire prefix + version. Bumping the version is a breaking change to the token format;
+/// the version is inside the signed bytes so it cannot be downgraded by an attacker.
+pub const PREFIX: &str = "bai_v1";
+
+/// Signing-key identifier. Lets the control plane rotate signing keys: new tokens are minted
+/// under a new `kid` while the gateway still trusts the public keys of older, un-retired `kid`s.
+pub type Kid = u32;
+
+/// The identity carried by — and the entire contents of — a virtual key. Both ids are `u64` to
+/// match the platform's id width (cf. ClickHouse `tenant_id UInt64` / `vpc_id UInt64`), which
+/// also keeps the payload at a fixed 16 bytes.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct VirtualKey {
+    pub tenant_id: u64,
+    pub vpc_id: u64,
+}
+
+impl VirtualKey {
+    /// Fixed 16-byte little-endian payload: `tenant_id ++ vpc_id`. The layout is fixed (not
+    /// JSON) so the encoding is deterministic byte-for-byte, as `mint` needs to be reproducible.
+    fn encode_payload(&self) -> [u8; 16] {
+        let mut payload = [0u8; 16];
+        let (tenant, vpc) = payload.split_at_mut(8);
+        tenant.copy_from_slice(&self.tenant_id.to_le_bytes());
+        vpc.copy_from_slice(&self.vpc_id.to_le_bytes());
+        payload
+    }
+
+    /// Inverse of `encode_payload`; `None` unless `bytes` is exactly 16 bytes long.
+    fn decode_payload(bytes: &[u8]) -> Option<Self> {
+        let fixed: &[u8; 16] = bytes.try_into().ok()?;
+        let (tenant, vpc) = fixed.split_at(8);
+        Some(Self {
+            tenant_id: u64::from_le_bytes(tenant.try_into().ok()?),
+            vpc_id: u64::from_le_bytes(vpc.try_into().ok()?),
+        })
+    }
+}
+
+#[derive(Debug, thiserror::Error, PartialEq, Eq)]
+pub enum KeyError {
+    #[error("malformed virtual key")]
+    Malformed, // wrong shape: missing part, non-`bai_v` prefix, bad kid/base64, or wrong size
+    #[error("unsupported key version")]
+    BadVersion, // prefix starts with `bai_v` but is not `PREFIX`
+    #[error("unknown signing key id {0}")]
+    UnknownKid(Kid), // the keyring holds no public key for this kid
+    #[error("signature verification failed")]
+    BadSignature, // the signature does not verify under the kid's public key
+}
+
+/// The set of trusted Ed25519 public keys, indexed by `kid`. Built once at boot from config
+/// (`signing_keys`); multiple kids may be trusted at once for zero-downtime rotation via redeploy.
+#[derive(Debug, Default, Clone)]
+pub struct Keyring {
+    keys: HashMap<Kid, VerifyingKey>,
+}
+
+impl Keyring {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn insert(&mut self, kid: Kid, key: VerifyingKey) {
+        self.keys.insert(kid, key);
+    }
+
+    pub fn get(&self, kid: Kid) -> Option<&VerifyingKey> {
+        self.keys.get(&kid)
+    }
+
+    pub fn remove(&mut self, kid: Kid) {
+        self.keys.remove(&kid);
+    }
+
+    pub fn len(&self) -> usize {
+        self.keys.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.keys.is_empty()
+    }
+
+    /// Verify a virtual key string and extract its identity. Stateless: the only input besides
+    /// the token is the public keyring. A kid that isn't in canonical decimal is `Malformed`.
+    pub fn verify(&self, token: &str) -> Result<VirtualKey, KeyError> {
+        // Split into exactly 4 parts: `bai_v1`, kid, payload, sig. `splitn(4, '.')` rejects any
+        // token with fewer separators; a payload/sig never contains '.' (base64url has none).
+        let mut parts = token.splitn(4, '.');
+        let prefix = parts.next().ok_or(KeyError::Malformed)?;
+        let kid_str = parts.next().ok_or(KeyError::Malformed)?;
+        let payload_b64 = parts.next().ok_or(KeyError::Malformed)?;
+        let sig_b64 = parts.next().ok_or(KeyError::Malformed)?;
+
+        if prefix != PREFIX {
+            // Distinguish "wrong version of our token" from "not our token at all" only loosely;
+            // both are unauthenticated. A `bai_vN` with N != 1 reports BadVersion for clarity.
+            return if prefix.starts_with("bai_v") {
+                Err(KeyError::BadVersion)
+            } else {
+                Err(KeyError::Malformed)
+            };
+        }
+
+        let kid: Kid = kid_str.parse().map_err(|_| KeyError::Malformed)?;
+
+        // Decode the fixed-size fields straight onto the stack — no heap allocation on the verify
+        // hot path. `decode_slice` bounds-checks against a (ceil) length estimate, hence the few
+        // spare bytes; the exact-size checks below turn any other length into `Malformed`.
+        let mut payload_buf = [0u8; 24]; // ≥ estimate for a 22-char (16-byte) payload
+        let plen = URL_SAFE_NO_PAD
+            .decode_slice(payload_b64, &mut payload_buf)
+            .map_err(|_| KeyError::Malformed)?;
+        let payload = &payload_buf[..plen];
+
+        let mut sig_buf = [0u8; 72]; // ≥ estimate for an 86-char (64-byte) signature
+        let slen = URL_SAFE_NO_PAD
+            .decode_slice(sig_b64, &mut sig_buf)
+            .map_err(|_| KeyError::Malformed)?;
+        let sig_arr: [u8; 64] = sig_buf[..slen]
+            .try_into()
+            .map_err(|_| KeyError::Malformed)?;
+        let signature = Signature::from_bytes(&sig_arr);
+
+        // Resolve the public key *before* the cryptographic check so an unknown kid is a distinct,
+        // cheap rejection (no signature math on keys we don't trust).
+        let vk = self.get(kid).ok_or(KeyError::UnknownKid(kid))?;
+
+        // The signed message binds version + kid + payload, so none can be swapped independently.
+        // It is built into a stack buffer; input too long to fit can't be valid → `Malformed`.
+        // It must also be a literal prefix of `token`: minting writes the kid in canonical
+        // decimal, whereas `parse` above also accepts aliases like `+7` or `007`, which would
+        // let a single signature verify under arbitrarily many distinct token strings.
+        let mut signed_buf = [0u8; SIGNED_BYTES_CAP];
+        let signed = write_signed_bytes(&mut signed_buf, kid, payload_b64)
+            .filter(|signed| token.as_bytes().starts_with(signed))
+            .ok_or(KeyError::Malformed)?;
+        vk.verify(signed, &signature)
+            .map_err(|_| KeyError::BadSignature)?;
+
+        VirtualKey::decode_payload(payload).ok_or(KeyError::Malformed)
+    }
+}
+
+/// Upper bound on `bai_v1.{kid}.{payload}`: `PREFIX` (6) + `.` + a `u32` kid (≤ 10 digits) + `.`
+/// + a 16-byte base64url payload (22 chars) = 40 bytes. 64 leaves headroom.
+const SIGNED_BYTES_CAP: usize = 64;
+
+/// Write the signature-covered bytes `bai_v1.{kid}.{payload}` into `buf` and return the written
+/// prefix, or `None` if they don't fit. Covering kid + payload is what stops an attacker from
+/// re-pointing a valid signature at another kid or a tampered payload. A well-formed key always
+/// fits (see `SIGNED_BYTES_CAP`), so `None` means malformed input — never a panic or truncation.
+fn write_signed_bytes<'a>(
+    buf: &'a mut [u8; SIGNED_BYTES_CAP],
+    kid: Kid,
+    payload_b64: &str,
+) -> Option<&'a [u8]> {
+    use std::io::Write as _;
+    // `&mut [u8]` is itself a writer: every write consumes the front of the slice, so what is
+    // left over afterwards tells us how many bytes were written.
+    let mut unwritten: &mut [u8] = buf.as_mut_slice();
+    write!(unwritten, "{PREFIX}.{kid}.{payload_b64}").ok()?;
+    let written = SIGNED_BYTES_CAP - unwritten.len();
+    Some(&buf[..written])
+}
+
+/// Parse an Ed25519 public key from a slipstream `signkey.*` value. Accepted forms: the raw 32
+/// bytes, or base64 text of 32 bytes in either alphabet (standard or url-safe), padded or not,
+/// with surrounding whitespace ignored — so the control plane can store whichever form it likes.
+/// Returns `None` for anything else, including 32 bytes that are not a valid public key.
+pub fn verifying_key_from_value(bytes: &[u8]) -> Option<VerifyingKey> {
+    use base64::engine::general_purpose::{STANDARD, STANDARD_NO_PAD, URL_SAFE};
+    // Exactly 32 bytes are always taken as the raw key, never as text.
+    if let Ok(arr) = <[u8; 32]>::try_from(bytes) {
+        return VerifyingKey::from_bytes(&arr).ok();
+    }
+    let s = std::str::from_utf8(bytes).ok()?.trim();
+    // Each engine is strict about its own alphabet and padding, and engines that accept the same
+    // text decode it to the same bytes, so the order is irrelevant. They are tried lazily: the
+    // first 32-byte decode wins and the remaining engines never run.
+    let key_bytes = [&STANDARD, &STANDARD_NO_PAD, &URL_SAFE, &URL_SAFE_NO_PAD]
+        .into_iter()
+        .filter_map(|engine| engine.decode(s).ok())
+        .find_map(|decoded| <[u8; 32]>::try_from(decoded.as_slice()).ok())?;
+    VerifyingKey::from_bytes(&key_bytes).ok()
+}
+
+/// Mint a virtual key. This is the reference implementation, kept here for tests and
+/// determinism checks; production minting is the Go control plane (`crypto/ed25519`), which
+/// must produce byte-identical output for the same inputs.
+#[allow(clippy::expect_used)] // payload is a fixed 22-char base64 of 16 bytes; always fits the cap
+pub fn mint(vk: &VirtualKey, kid: Kid, signing_key: &SigningKey) -> String {
+    let payload = URL_SAFE_NO_PAD.encode(vk.encode_payload());
+    let mut scratch = [0u8; SIGNED_BYTES_CAP];
+    // The payload is produced right here from controlled inputs (22 base64 chars of 16 bytes),
+    // so the signed bytes always fit: the `expect` asserts an invariant, not a runtime failure.
+    let message = write_signed_bytes(&mut scratch, kid, &payload)
+        .expect("minted signed bytes fit in SIGNED_BYTES_CAP");
+    let signature: Signature = signing_key.sign(message);
+    let encoded_sig = URL_SAFE_NO_PAD.encode(signature.to_bytes());
+    format!("{PREFIX}.{kid}.{payload}.{encoded_sig}")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Deterministic test keypair from a fixed seed — avoids an RNG dep and keeps tests reproducible.
+    fn test_keypair(seed: u8) -> (SigningKey, VerifyingKey) {
+        let sk = SigningKey::from_bytes(&[seed; 32]);
+        let vk = sk.verifying_key();
+        (sk, vk)
+    }
+
+    fn ring_with(kid: Kid, vk: VerifyingKey) -> Keyring {
+        let mut r = Keyring::new();
+        r.insert(kid, vk);
+        r
+    }
+
+    #[test]
+    fn mint_then_verify_roundtrips_identity() {
+        let (sk, vk) = test_keypair(1);
+        let ring = ring_with(7, vk);
+        let id = VirtualKey {
+            tenant_id: 42,
+            vpc_id: 99,
+        };
+
+        let token = mint(&id, 7, &sk);
+        assert_eq!(ring.verify(&token).unwrap(), id);
+    }
+
+    #[test]
+    fn mint_is_deterministic() {
+        let (sk, _) = test_keypair(2);
+        let id = VirtualKey {
+            tenant_id: 1,
+            vpc_id: 2,
+        };
+        // Ed25519 is deterministic (RFC 8032) and the payload has no nonce, so two mints match.
+        assert_eq!(mint(&id, 1, &sk), mint(&id, 1, &sk));
+    }
+
+    #[test]
+    fn tampered_payload_is_rejected() {
+        let (sk, vk) = test_keypair(3);
+        let ring = ring_with(1, vk);
+        let token = mint(
+            &VirtualKey {
+                tenant_id: 10,
+                vpc_id: 20,
+            },
+            1,
+            &sk,
+        );
+
+        // Flip a byte in the payload segment; the signature no longer covers it.
+        let mut parts: Vec<&str> = token.split('.').collect();
+        let mut payload = URL_SAFE_NO_PAD.decode(parts[2]).unwrap();
+        payload[0] ^= 0xff;
+        let tampered_payload = URL_SAFE_NO_PAD.encode(&payload);
+        parts[2] = &tampered_payload;
+        let tampered = parts.join(".");
+
+        assert_eq!(ring.verify(&tampered), Err(KeyError::BadSignature));
+    }
+
+    #[test]
+    fn tampered_signature_is_rejected() {
+        let (sk, vk) = test_keypair(4);
+        let ring = ring_with(1, vk);
+        let token = mint(
+            &VirtualKey {
+                tenant_id: 5,
+                vpc_id: 6,
+            },
+            1,
+            &sk,
+        );
+
+        let mut sig = URL_SAFE_NO_PAD
+            .decode(token.rsplit('.').next().unwrap())
+            .unwrap();
+        sig[0] ^= 0xff;
+        let bad_sig = URL_SAFE_NO_PAD.encode(&sig);
+        let base = &token[..token.rfind('.').unwrap()];
+        let tampered = format!("{base}.{bad_sig}");
+
+        assert_eq!(ring.verify(&tampered), Err(KeyError::BadSignature));
+    }
+
+    #[test]
+    fn unknown_kid_is_rejected_without_crypto() {
+        let (sk, vk) = test_keypair(5);
+        let ring = ring_with(1, vk); // trusts kid=1 only
+        let token = mint(
+            &VirtualKey {
+                tenant_id: 1,
+                vpc_id: 1,
+            },
+            2,
+            &sk,
+        ); // minted under kid=2
+        assert_eq!(ring.verify(&token), Err(KeyError::UnknownKid(2)));
+    }
+
+    #[test]
+    fn signature_from_a_different_kid_is_rejected() {
+        // A valid signature minted under kid=2 must not verify when presented as kid=1, even if
+        // the gateway trusts both — because kid is part of the signed bytes. Both kids map to the
+        // SAME key here, so only the kid binding (not a key mismatch) can make verification fail.
+        let (sk, vk) = test_keypair(6);
+        let mut ring = Keyring::new();
+        ring.insert(1, test_keypair(6).1); // same seed → the very same verifying key
+        ring.insert(2, vk);
+
+        let id = VirtualKey {
+            tenant_id: 3,
+            vpc_id: 4,
+        };
+        let token2 = mint(&id, 2, &sk);
+        assert_eq!(ring.verify(&token2).unwrap(), id); // sanity: the untouched token verifies
+        // Re-label the kid segment as 1 while keeping kid=2's signature.
+        let parts: Vec<&str> = token2.split('.').collect();
+        let relabeled = format!("{}.1.{}.{}", parts[0], parts[2], parts[3]);
+        assert_eq!(ring.verify(&relabeled), Err(KeyError::BadSignature));
+    }
+
+    #[test]
+    fn malformed_and_version_errors() {
+        let (_, vk) = test_keypair(8);
+        let ring = ring_with(1, vk);
+        assert_eq!(ring.verify("garbage"), Err(KeyError::Malformed));
+        assert_eq!(ring.verify("bai_v1.1.only-three"), Err(KeyError::Malformed));
+        assert_eq!(ring.verify("bai_v2.1.aaaa.bbbb"), Err(KeyError::BadVersion));
+        assert_eq!(
+            ring.verify("sk-openai.1.aaaa.bbbb"),
+            Err(KeyError::Malformed)
+        );
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..bfdf419
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,32 @@
+//! Beyond AI gateway library.
+//!
+//! `src/main.rs` wires these modules into a Pingora `ProxyHttp` service. The load-bearing logic
+//! (virtual-key verification, deny-set, usage parsing, routing, request peek) lives in modules
+//! free of Pingora/IO so it is unit-tested without a running proxy or live providers.
+
+// Lint gates (`unsafe_code = "forbid"`, `unused_must_use = "deny"`) live in `[workspace.lints]` so
+// they apply to *both* crate roots — this lib and the `main.rs` binary — not just whichever unit
+// carries a crate-level `#![deny]`. A dropped `Result` (e.g. an unchecked `write_response_*`) is
+// therefore a hard error, and `unsafe` is forbidden, everywhere in the crate.
+//
+// `unwrap_used`/`expect_used`/`panic` are denied in production code (see `[workspace.lints.clippy]`)
+// but a unit test's whole job is to assert a precondition holds — `.unwrap()` *is* the assertion — so
+// allow them crate-wide whenever this crate is compiled with `cfg(test)` (i.e. for its unit tests).
+#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used, clippy::panic))]
+
+pub mod admin;
+pub mod circuit_breaker;
+pub mod config;
+pub mod deny;
+pub mod doctor;
+pub mod error;
+pub mod key;
+pub mod metrics;
+pub mod peek;
+pub mod proxy;
+pub mod ratelimit;
+pub mod route;
+pub mod secret;
+pub mod state;
+pub mod store_watch;
+pub mod usage;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..6d77699
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,167 @@
+//! Beyond AI gateway binary: clap `Run`/`Doctor`, Pingora server bootstrap, services.
+
+// See `lib.rs`: deny the panic surface in production, allow it in `#[cfg(test)]` assertions.
+#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used, clippy::panic))]
+
+use beyond_ai::admin::AdminApp;
+use beyond_ai::config::AiConfig;
+use beyond_ai::doctor;
+use beyond_ai::metrics::Metrics;
+use beyond_ai::proxy::AiProxy;
+use beyond_ai::state::GatewayState;
+use beyond_ai::store_watch::WatcherService;
+use clap::{Parser, Subcommand};
+use pingora_core::apps::http_app::HttpServer;
+use pingora_core::server::Server;
+use pingora_core::server::configuration::ServerConf;
+use pingora_core::services::background::background_service;
+use pingora_core::services::listening::Service as ListeningService;
+use pingora_proxy::http_proxy_service;
+use std::path::Path;
+use std::process::exit;
+use tracing_subscriber::EnvFilter;
+use tracing_subscriber::layer::SubscriberExt;
+use tracing_subscriber::util::SubscriberInitExt;
+
+#[derive(Parser)]
+#[command(
+    name = "beyond-ai",
+    about = "Beyond AI gateway — egress proxy to LLM providers"
+)]
+struct Cli {
+    /// Path to config file (defaults to ./config.toml).
+    #[arg(short, long, env = "AI_CONFIG_PATH", global = true)]
+    config: Option<std::path::PathBuf>, // `None` is handed through to `AiConfig::load_with_path`
+
+    #[command(subcommand)]
+    command: Option<Commands>, // `None` (no subcommand) starts the gateway, same as `run`
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    /// Run prerequisite diagnostics and exit.
+    Doctor, // handled first in `main`, before tracing or the Pingora server is set up
+    /// Start the gateway (default).
+    Run, // never matched explicitly: `main` treats everything except `Doctor` as a run
+}
+
+/// Load the gateway configuration or terminate the process. Only called while booting, where
+/// a config that cannot be loaded is fatal: the error is printed to stderr and we exit with
+/// status 1.
+fn load_config(path: Option<&Path>) -> AiConfig {
+    AiConfig::load_with_path(path).unwrap_or_else(|e| {
+        eprintln!("failed to load config: {e}");
+        exit(1)
+    })
+}
+
+fn init_tracing() {
+    // JSON to stdout; the `ai.usage` target carries billing facts that logfwd/OTLP ships to
+    // ClickHouse. `AI_LOG` overrides the level filter; unset or unparsable falls back to `info`.
+    let filter = EnvFilter::try_from_env("AI_LOG").unwrap_or_else(|_| EnvFilter::new("info"));
+    tracing_subscriber::registry()
+        .with(tracing_subscriber::fmt::layer().json())
+        .with(filter)
+        .init();
+}
+
+// Boot path: every `.expect()` here is a fatal start-up invariant (no runtime to build, no Pingora
+// server) — a panic before we serve a single request is the correct, visible failure.
+#[allow(clippy::expect_used)]
+fn main() {
+    // rustls 0.23 requires a process-wide crypto provider for the TLS connections to providers.
+    // Idempotent: an `Err` means a provider is already installed (e.g. a second init in tests),
+    // which is fine to ignore — the provider we want is in place either way.
+    let _ = rustls::crypto::ring::default_provider().install_default();
+
+    let cli = Cli::parse();
+
+    // Doctor runs before any server setup (minimal current-thread runtime), exits 0/1.
+    if matches!(cli.command, Some(Commands::Doctor)) {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("runtime");
+        let config = load_config(cli.config.as_deref());
+        let results = rt.block_on(doctor::run_checks(&config));
+        doctor::print_results("Beyond AI Gateway Doctor", &results);
+        exit(if results.iter().all(|r| r.passed) {
+            0 // every check passed
+        } else {
+            1 // at least one check failed
+        });
+    }
+
+    init_tracing(); // run path only — doctor reports through `doctor::print_results` instead
+    let config = load_config(cli.config.as_deref());
+    let listen = config.listen.clone();
+    let metrics_listen = config.metrics_listen.clone();
+    // Capture the shutdown knobs before `config` is moved into the gateway state below.
+    let grace_period_secs = config.shutdown_grace_period_secs;
+    let runtime_timeout_secs = config.shutdown_runtime_timeout_secs;
+    let metrics = match Metrics::new() {
+        Ok(m) => m,
+        Err(e) => {
+            eprintln!("failed to register metrics: {e}");
+            exit(1);
+        }
+    };
+    let state = match GatewayState::new(config, metrics) {
+        Ok(s) => s,
+        Err(e) => {
+            eprintln!("failed to build gateway state: {e}");
+            exit(1);
+        }
+    };
+
+    // Make the graceful-shutdown drain window explicit instead of inheriting Pingora's silent
+    // defaults (300s grace / 5s runtime teardown). `grace_period_seconds` is how long in-flight
+    // requests get to finish after SIGTERM before teardown; `graceful_shutdown_timeout_seconds` is
+    // the final runtime-exit backstop. See the `AiConfig` field docs for the read_timeout /
+    // orchestrator-stopTimeout tradeoffs.
+    let conf = ServerConf {
+        grace_period_seconds: Some(grace_period_secs),
+        graceful_shutdown_timeout_seconds: Some(runtime_timeout_secs),
+        ..ServerConf::default()
+    };
+    let mut server = Server::new_with_opt_and_conf(None, conf);
+    server.bootstrap();
+
+    // Client (app) traffic.
+    let mut proxy_svc = http_proxy_service(
+        &server.configuration,
+        AiProxy {
+            state: state.clone(),
+        },
+    );
+    proxy_svc.add_tcp(&listen);
+    server.add_service(proxy_svc);
+
+    // slipstream watchers + NATS connectivity (connects on Pingora's runtime; see WatcherService).
+    server.add_service(background_service(
+        "ai-watchers",
+        WatcherService {
+            state: state.clone(),
+        },
+    ));
+
+    // The metrics listener also serves /livez + /readyz for the ECS/k8s probes. Pingora's built-in
+    // prometheus service only does /metrics, so `AdminApp` hand-routes all three in one ServeHttp.
+    let mut admin = ListeningService::new(
+        "ai-admin".to_string(),
+        HttpServer::new_app(AdminApp {
+            metrics: state.metrics.clone(),
+        }),
+    );
+    admin.add_tcp(&metrics_listen);
+    server.add_service(admin);
+
+    tracing::info!(
+        %listen,
+        %metrics_listen,
+        grace_period_secs,
+        runtime_timeout_secs,
+        "starting beyond-ai"
+    );
+    server.run_forever();
+}
diff --git a/src/metrics.rs b/src/metrics.rs
new file mode 100644
index 0000000..b3ac19e
--- /dev/null
+++ b/src/metrics.rs
@@ -0,0 +1,258 @@
+//! Prometheus metrics (PATTERNS.md: `Arc<Metrics>`).
+//!
+//! Registered on the **default** registry so Pingora's built-in `prometheus_http_service`
+//! exposes them with no extra wiring. `Metrics::new` is called exactly once (in `main`).
+
+use prometheus::{
+    Histogram, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, Opts,
+    default_registry,
+};
+use std::sync::Arc;
+
+pub struct Metrics {
+    pub requests_total: IntCounter, // exported as `ai_requests_total`
+    /// Labeled by reason ("auth", "deny_spend", "deny_fraud") so we can see *why* we rejected.
+    pub rejections_total: IntCounterVec,
+    /// Upstream responses by provider + status class ("2xx"/"4xx"/"5xx"). A provider degrading
+    /// (429/5xx) is otherwise invisible until it surfaces as latency or missing usage events —
+    /// this is the per-provider error-rate signal an oncall pages on.
+    pub upstream_responses_total: IntCounterVec,
+    /// Upstream **connect** retries by provider (see `proxy::fail_to_connect`). A partially-down
+    /// provider TCP layer (or an egress-IP ban) silently retries up to `MAX_CONNECT_RETRIES` times
+    /// per request; without this, the extra latency looks like a slow provider, not a connect
+    /// problem. Pairs with a `warn!` on the same path so the dashboard spike has a log to grep.
+    pub connect_retries_total: IntCounterVec,
+    /// Labeled by kind: input|output|cache_read|cache_write. Cache tokens are also in the `ai.usage`
+    /// billing log, but that ships with lag — the Prometheus counter is the alerting surface for
+    /// "cache hit rate fell off a cliff after a deploy" (cache write ≈ 3× input, cache read ≈ 0.1×,
+    /// so a regression is a real cost event, not just a latency one).
+    pub tokens_total: IntCounterVec,
+    /// The four `tokens_total` children, resolved once at boot. The label set (`input`/`output`/
+    /// `cache_read`/`cache_write`) is fixed and known at compile time, so we pay the
+    /// `with_label_values` map lookup once here instead of four times per metered response.
+    pub tokens_input: IntCounter,
+    pub tokens_output: IntCounter, // `ai_tokens_total{kind="output"}`
+    pub tokens_cache_read: IntCounter, // `ai_tokens_total{kind="cache_read"}`
+    pub tokens_cache_write: IntCounter, // `ai_tokens_total{kind="cache_write"}`
+    /// Labeled by provider: TTFT varies by an order of magnitude across providers (Groq/Cerebras
+    /// <100ms vs. a large Anthropic/xAI model at seconds), so an unlabeled histogram can't tell you
+    /// *which* provider's first-token time regressed.
+    pub ttft_seconds: HistogramVec,
+    /// Labeled by provider, same rationale as `ttft_seconds`: full-request duration is dominated by
+    /// the model's generation time, which is per-provider.
+    pub upstream_latency_seconds: HistogramVec,
+    pub active_streams: IntGauge, // exported as `ai_active_streams` (in-flight streaming responses)
+    /// Total in-flight requests (streaming + non-streaming), incremented once a request is admitted
+    /// in `request_filter` and decremented in `logging`. `active_streams` only covers SSE; under a
+    /// burst or a stalled upstream this is what distinguishes "high rps, fast upstreams" from
+    /// "connections piling up" — the difference between a perf blip and a connection-exhaustion
+    /// incident.
+    pub requests_in_flight: IntGauge,
+    /// Current deny-set cardinality (denied tenants). The set is `O(denied)` and fed from NATS; a
+    /// fraud event or a control-plane bug that mass-denies tenants would otherwise grow it invisibly
+    /// until it shows up as memory pressure. Updated on every seed and every applied delta.
+    pub deny_set_size: IntGauge,
+    /// NATS connectivity for the deny-set watcher (1 = connected, 0 = disconnected). The gateway is
+    /// fail-open — it serves on the last-known set when NATS is down — so staleness is otherwise
+    /// silent; this is the metric to alert "deny-set has been stale for >N minutes" on.
+    pub nats_connected: IntGauge,
+}
+
+/// TTFT buckets (seconds). Tuned for LLM latency: sub-second prompts up through the multi-second
+/// first-token times of large models. The default prometheus buckets top out at 10s, but TTFT for a
+/// busy model can exceed that, so the tail goes to 30s.
+const TTFT_BUCKETS: &[f64] = &[0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0];
+
+/// Full-request duration buckets (seconds). A streaming completion runs far longer than the
+/// default 10s ceiling (`read_timeout_secs` defaults to 600), so the tail reaches 600s — without
+/// these, every long stream lands in `+Inf` and the p99/p999 tail is unrecoverable.
+const LATENCY_BUCKETS: &[f64] = &[
+    0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0, 600.0,
+];
+
+impl Metrics {
+    /// Build and register every metric on the default registry. Fallible: registering a name that
+    /// already exists (a second `Metrics::new()` against the process-wide default registry) returns
+    /// `prometheus::Error::AlreadyReg` rather than panicking, so a double-init surfaces as an error
+    /// the caller can report instead of crashing the process.
+    pub fn new() -> prometheus::Result<Arc<Self>> {
+        let r = default_registry();
+
+        let requests_total =
+            IntCounter::with_opts(Opts::new("ai_requests_total", "Total requests handled"))?;
+        let rejections_total = IntCounterVec::new(
+            Opts::new("ai_rejections_total", "Requests rejected before upstream"),
+            &["reason"],
+        )?;
+        let upstream_responses_total = IntCounterVec::new(
+            Opts::new(
+                "ai_upstream_responses_total",
+                "Upstream responses by provider and status class",
+            ),
+            &["provider", "status"],
+        )?;
+        let connect_retries_total = IntCounterVec::new(
+            Opts::new(
+                "ai_connect_retries_total",
+                "Upstream connect retries by provider",
+            ),
+            &["provider"],
+        )?;
+        let tokens_total =
+            IntCounterVec::new(Opts::new("ai_tokens_total", "Tokens metered"), &["kind"])?;
+        // Resolve the fixed-label children once. Created against the (about-to-be-registered) vec, so
+        // they export normally; the hot path then bumps a direct handle, no per-call label lookup.
+        let tokens_input = tokens_total.with_label_values(&["input"]);
+        let tokens_output = tokens_total.with_label_values(&["output"]);
+        let tokens_cache_read = tokens_total.with_label_values(&["cache_read"]);
+        let tokens_cache_write = tokens_total.with_label_values(&["cache_write"]);
+        let ttft_seconds = HistogramVec::new(
+            HistogramOpts::new("ai_ttft_seconds", "Time to first byte from upstream")
+                .buckets(TTFT_BUCKETS.to_vec()),
+            &["provider"],
+        )?;
+        let upstream_latency_seconds = HistogramVec::new(
+            HistogramOpts::new(
+                "ai_upstream_latency_seconds",
+                "Full upstream request duration",
+            )
+            .buckets(LATENCY_BUCKETS.to_vec()),
+            &["provider"],
+        )?;
+        let active_streams = IntGauge::with_opts(Opts::new(
+            "ai_active_streams",
+            "In-flight streaming responses",
+        ))?;
+        let requests_in_flight = IntGauge::with_opts(Opts::new(
+            "ai_requests_in_flight",
+            "In-flight requests (streaming + non-streaming)",
+        ))?;
+        let deny_set_size =
+            IntGauge::with_opts(Opts::new("ai_deny_set_size", "Currently denied tenants"))?;
+        let nats_connected = IntGauge::with_opts(Opts::new(
+            "ai_nats_connected",
+            "Deny-set watcher NATS connectivity (1=connected, 0=disconnected)",
+        ))?;
+
+        r.register(Box::new(requests_total.clone()))?; // in order; a failure keeps earlier ones registered
+        r.register(Box::new(rejections_total.clone()))?;
+        r.register(Box::new(upstream_responses_total.clone()))?;
+        r.register(Box::new(connect_retries_total.clone()))?;
+        r.register(Box::new(tokens_total.clone()))?;
+        r.register(Box::new(ttft_seconds.clone()))?;
+        r.register(Box::new(upstream_latency_seconds.clone()))?;
+        r.register(Box::new(active_streams.clone()))?;
+        r.register(Box::new(requests_in_flight.clone()))?;
+        r.register(Box::new(deny_set_size.clone()))?;
+        r.register(Box::new(nats_connected.clone()))?;
+
+        Ok(Arc::new(Self {
+            requests_total,
+            rejections_total,
+            upstream_responses_total,
+            connect_retries_total,
+            tokens_total,
+            tokens_input,
+            tokens_output,
+            tokens_cache_read,
+            tokens_cache_write,
+            ttft_seconds,
+            upstream_latency_seconds,
+            active_streams,
+            requests_in_flight,
+            deny_set_size,
+            nats_connected,
+        }))
+    }
+}
+
+/// Per-provider metric handles, resolved once at boot and held on the [`Provider`](crate::route::Provider).
+///
+/// Every per-provider metric (`ttft_seconds`, `upstream_latency_seconds`, `upstream_responses_total`,
+/// `connect_retries_total`) is keyed on the provider name — a label known at boot from the provider
+/// registry. Resolving the child handles here lets the response path bump a direct counter/histogram
+/// instead of doing a string-keyed `with_label_values` map lookup on every response.
+pub struct ProviderMetrics {
+    pub ttft_seconds: Histogram, // this provider's child of `Metrics::ttft_seconds`
+    pub upstream_latency_seconds: Histogram, // child of `Metrics::upstream_latency_seconds`
+    pub connect_retries_total: IntCounter, // child of `Metrics::connect_retries_total`
+    /// Responses by status class, indexed `[1xx, 2xx, 3xx, 4xx, 5xx]` (see [`Self::record_response`]).
+    responses: [IntCounter; 5],
+}
+
+impl ProviderMetrics {
+    /// Look up `provider`'s child handle in each of the shared label vecs. Meant to run once
+    /// per provider at boot (see `state::build_providers`), so that recording a response later
+    /// needs no label lookup. The five status-class counters are resolved eagerly, whether or
+    /// not the provider ever returns that class.
+    pub fn resolve(m: &Metrics, provider: &str) -> Self {
+        let by_class = |class: &str| {
+            m.upstream_responses_total
+                .with_label_values(&[provider, class])
+        };
+        ProviderMetrics {
+            ttft_seconds: m.ttft_seconds.with_label_values(&[provider]),
+            upstream_latency_seconds: m.upstream_latency_seconds.with_label_values(&[provider]),
+            connect_retries_total: m.connect_retries_total.with_label_values(&[provider]),
+            // Array order is the index contract used by `record_response`.
+            responses: ["1xx", "2xx", "3xx", "4xx", "5xx"].map(by_class),
+        }
+    }
+
+    /// Bump the counter for the class of `status` (`1xx`/`2xx`/`3xx`/`4xx`/`5xx`).
+    /// Informational statuses (e.g. `100 Continue`, `101 Switching Protocols`) have a bucket of
+    /// their own: providers don't normally send them, but lumping one into `5xx` would show up
+    /// as a phantom upstream-error spike on the dashboard.
+    pub fn record_response(&self, status: u16) {
+        let slot = match status / 100 {
+            // 1xx..4xx map onto indices 0..=3.
+            class @ 1..=4 => usize::from(class) - 1,
+            // Everything else — 5xx, but also anything below 100 or above 599 — shares the
+            // last slot.
+            _ => 4,
+        };
+        self.responses[slot].inc();
+    }
+
+    /// Standalone handles that are **not** registered anywhere, for tests that build a
+    /// `Provider` without a live registry.
+    #[cfg(test)]
+    pub fn disconnected() -> Self {
+        fn counter() -> IntCounter {
+            IntCounter::new("t", "t").expect("valid counter opts")
+        }
+        fn hist() -> Histogram {
+            Histogram::with_opts(HistogramOpts::new("t", "t")).expect("valid histogram opts")
+        }
+        ProviderMetrics {
+            ttft_seconds: hist(),
+            upstream_latency_seconds: hist(),
+            connect_retries_total: counter(),
+            responses: std::array::from_fn(|_| counter()),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn record_response_buckets_by_status_class() {
+        // Pins the status → index mapping. In particular a 1xx must get its own bucket, not the
+        // 5xx fallback, where it would look like a phantom upstream-error spike on the dashboard.
+        let pm = ProviderMetrics::disconnected();
+        // One representative status per class, listed in bucket order (1xx → 5xx).
+        let samples = [100u16, 204, 301, 404, 503];
+        for status in samples {
+            pm.record_response(status);
+        }
+        for (idx, status) in samples.iter().enumerate() {
+            assert_eq!(
+                pm.responses[idx].get(),
+                1,
+                "status {status} landed in the wrong class bucket"
+            );
+        }
+    }
+}
diff --git a/src/peek.rs b/src/peek.rs
new file mode 100644
index 0000000..c8ac81b
--- /dev/null
+++ b/src/peek.rs
@@ -0,0 +1,509 @@
+//! Streaming, 100%-accurate extraction of the **root-level `model`** from a JSON request body.
+//!
+//! Both OpenAI and Anthropic require `model` as a top-level field of the request object. We extract
+//! it with a structural state machine fed the body chunks *as they stream through* — the body is
+//! never buffered or reordered. This is exact (not a byte-heuristic): it tracks nesting depth and
+//! string/escape state, so a `"model"` appearing inside a nested object (e.g. a message) or inside
+//! a string value is correctly ignored, and field order is irrelevant. Memory is O(1): only short
+//! root-level *keys* and the `model` value are accumulated. Large uninteresting string content
+//! (system prompts, base64 images) is skipped with a SIMD-accelerated `memchr2` search to the next
+//! `"`/`\`, not inspected byte-by-byte — so even a multi-MB request is walked cheaply.
+
+#[derive(Clone, Copy, PartialEq, Default)]
+enum Cap {
+    #[default]
+    No, // not accumulating: the current string's bytes are skipped
+    Key, // a root-object key; compared against `model` at its closing quote
+    ModelValue, // the string value that follows a root-level `model` key
+}
+
+#[derive(Default)]
+pub struct ModelScanner {
+    model: Option<String>, // the extracted value; handed out (and cleared) by `take_model`
+    done: bool, // set once the model value is captured; `feed` is a no-op afterwards
+    /// Nesting depth: number of currently-open `{`/`[`. Root object contents are at depth 1.
+    depth: u32,
+    root_is_object: bool, // the outermost container was opened with `{` rather than `[`
+    in_string: bool, // the scanner is currently between a string's quotes
+    escaped: bool, // the previous string byte was an unescaped backslash
+    /// Whether the next root-level string is a key (`{`/`,` → key; `:` → value).
+    expect_key: bool,
+    /// The most recent root-level key was exactly `model`.
+    last_key_is_model: bool,
+    /// What (if anything) we're accumulating into `cur` for the current string.
+    cap: Cap,
+    cur: Vec<u8>, // bytes of the key / model value being captured; cleared per string
+}
+
+impl ModelScanner {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Take the extracted model, if found (available as soon as the value is seen); leaves `None`.
+    pub fn take_model(&mut self) -> Option<String> {
+        self.model.take()
+    }
+
+    #[inline]
+    fn at_root_object(&self) -> bool {
+        self.depth == 1 && self.root_is_object
+    }
+
+    pub fn feed(&mut self, bytes: &[u8]) {
+        if self.done {
+            return; // model already captured — the rest of the body is ignored
+        }
+        let mut i = 0; // cursor into this chunk; all parse state lives in `self` across calls
+        let n = bytes.len();
+        while i < n {
+            if self.in_string {
+                // Fast path: the content of a string we don't accumulate (a big base64 image, a long
+                // prompt, anything nested) — jump straight to the next `"` or `\` with a
+                // SIMD-accelerated search instead of inspecting every byte.
+                if self.cap == Cap::No && !self.escaped {
+                    match memchr::memchr2(b'"', b'\\', &bytes[i..]) {
+                        Some(rel) => i += rel,
+                        None => return, // rest of this chunk is skippable string content
+                    }
+                }
+                let b = bytes[i];
+                i += 1;
+                if self.escaped {
+                    self.escaped = false;
+                    if self.cap != Cap::No {
+                        self.cur.push(b); // escaped byte kept as-is (no decoding): \" -> ", \n -> n
+                    }
+                } else if b == b'\\' {
+                    self.escaped = true; // the backslash itself is never stored
+                } else if b == b'"' {
+                    self.in_string = false;
+                    match self.cap {
+                        Cap::Key => self.last_key_is_model = self.cur == b"model",
+                        Cap::ModelValue => {
+                            // A valid JSON string value is UTF-8; if a malformed/adversarial body
+                            // smuggles non-UTF-8 bytes here we record "unknown" rather than emitting
+                            // a `U+FFFD`-corrupted model into the billing log. Either way we're done.
+                            self.model = Some(
+                                String::from_utf8(std::mem::take(&mut self.cur))
+                                    .unwrap_or_else(|_| "unknown".to_string()),
+                            );
+                            self.done = true;
+                            return;
+                        }
+                        Cap::No => {}
+                    }
+                    self.cap = Cap::No;
+                    self.cur.clear();
+                } else if self.cap != Cap::No {
+                    self.cur.push(b);
+                }
+                continue;
+            }
+
+            let b = bytes[i];
+            i += 1;
+            match b {
+                b'"' => {
+                    self.in_string = true;
+                    self.cur.clear();
+                    // Decide whether this string is worth accumulating — only root-object keys and
+                    // the `model` value matter.
+                    self.cap = if self.at_root_object() {
+                        if self.expect_key {
+                            Cap::Key
+                        } else if self.last_key_is_model {
+                            Cap::ModelValue
+                        } else {
+                            Cap::No
+                        }
+                    } else {
+                        Cap::No
+                    };
+                }
+                b'{' => {
+                    if self.depth == 0 {
+                        self.root_is_object = true;
+                        self.expect_key = true;
+                    }
+                    self.depth += 1;
+                }
+                b'[' => {
+                    if self.depth == 0 {
+                        self.root_is_object = false;
+                    }
+                    self.depth += 1;
+                }
+                b'}' | b']' => self.depth = self.depth.saturating_sub(1), // never underflows
+                b':' if self.depth == 1 => self.expect_key = false, // a value follows
+                b',' if self.depth == 1 => {
+                    self.expect_key = true;
+                    self.last_key_is_model = false;
+                }
+                _ => {} // whitespace, scalars, and any depth-0 noise are not structural here
+            }
+        }
+    }
+}
+
+/// Decide whether an OpenAI **chat** request body needs `stream_options.include_usage` injected,
+/// and where. Returns `Some(offset)` — the byte index just after the root object's opening `{`, where
+/// the caller splices `"stream_options":{"include_usage":true},` — **only** when the body is a JSON
+/// object with a root-level `"stream": true` and **no** root-level `"stream_options"` key. Otherwise
+/// `None` (not a stream, options already set, or not an object) → forward unchanged.
+///
+/// Why this exists: OpenAI only emits a usage chunk on a stream when the request carries
+/// `stream_options.include_usage = true`. A stock client that omits it would stream with no usage,
+/// so managed traffic couldn't be metered. We can't ask for it via a header and can't set it in a
+/// client SDK we don't control, so the gateway injects it — for every OpenAI streaming chat request,
+/// out of the box.
+///
+/// Structural (depth + string + escape aware), so a `"stream"` inside a message object or inside a
+/// string value never triggers injection — only the genuine root-level field. The returned offset is
+/// always inside a non-empty object (a root `"stream"` is present), so the caller always follows the
+/// fragment with a comma.
+pub fn plan_stream_usage_injection(body: &[u8]) -> Option<usize> {
+    let n = body.len();
+    // Cheap pre-filter: injection is only ever needed when a root-level `"stream"` key is present.
+    // If the substring `"stream"` doesn't occur *anywhere*, the structural answer is unconditionally
+    // `None`, so a single SIMD `memmem` pass lets us skip the whole walk — the common case, since
+    // most requests aren't streaming. (The needle is a substring of `"stream_options"` too, so a
+    // body carrying only stream_options still passes the filter and is correctly resolved to `None`
+    // by the walk below.)
+    memchr::memmem::find(body, b"\"stream\"")?; // `?` on the `Option`: no match → return `None`
+    let mut i = 0;
+    while i < n && body[i].is_ascii_whitespace() {
+        i += 1;
+    }
+    // Must be a JSON object; anything else (array, scalar, garbage) we never rewrite.
+    if i >= n || body[i] != b'{' {
+        return None;
+    }
+    let insert_at = i + 1; // just past the root `{`
+
+    let mut depth = 0u32; // the walk below starts on the root `{`, which bumps this to 1
+    let mut in_string = false;
+    let mut escaped = false;
+    let mut expect_key = false;
+    let mut capturing_key = false;
+    // Start index (just past the opening `"`) of the root-level key currently being scanned. The
+    // body is fully in hand, so we slice the key out of it at the closing quote — no accumulation
+    // buffer, zero-copy. (Escaped keys are sliced raw; since neither `stream` nor `stream_options`
+    // contains an escape, an escaped key simply doesn't match either needle — the correct answer.)
+    let mut key_start = 0usize;
+    // The current root-level key is exactly `stream` (so the next literal is its value).
+    let mut last_key_is_stream = false;
+    let mut stream_true = false;
+
+    let mut j = i;
+    while j < n {
+        if in_string {
+            // Fast path: inside a string we're not capturing (any non-root-key string — message
+            // content, system prompts, base64 images), jump straight to the next `"`/`\` with a
+            // SIMD search instead of inspecting every byte. Mirrors the skip in `ModelScanner::feed`.
+            if !capturing_key && !escaped {
+                match memchr::memchr2(b'"', b'\\', &body[j..]) {
+                    Some(rel) => j += rel,
+                    None => break, // rest of the body is skippable string content
+                }
+            }
+            let b = body[j];
+            if escaped {
+                escaped = false;
+            } else if b == b'\\' {
+                escaped = true;
+            } else if b == b'"' {
+                in_string = false;
+                if capturing_key {
+                    capturing_key = false;
+                    // Only root-level (`depth == 1`) keys matter.
+                    if depth == 1 {
+                        let key = &body[key_start..j];
+                        // A root `stream_options` means the client already controls usage — the
+                        // answer is `None` regardless of anything else in the body, so stop now
+                        // rather than walking the remainder for a result we already know.
+                        if key == b"stream_options" {
+                            return None;
+                        }
+                        last_key_is_stream = key == b"stream";
+                    }
+                }
+            }
+            j += 1;
+            continue;
+        }
+        let b = body[j];
+        match b {
+            b'"' => {
+                // A root-level key starts only where one is expected (just after `{` or `,`).
+                if depth == 1 && expect_key {
+                    capturing_key = true;
+                    key_start = j + 1; // first key byte is just past this opening quote
+                } else {
+                    capturing_key = false;
+                }
+                in_string = true;
+            }
+            b'{' => {
+                depth += 1;
+                if depth == 1 {
+                    expect_key = true;
+                }
+            }
+            b'[' => depth += 1,
+            b'}' | b']' => depth = depth.saturating_sub(1), // never underflows
+            b':' if depth == 1 => expect_key = false, // a value follows
+            b',' if depth == 1 => {
+                expect_key = true;
+                last_key_is_stream = false;
+            }
+            // The value of a root-level `stream` key: a bare `true` literal.
+            b't' if depth == 1 && last_key_is_stream => {
+                if body[j..].starts_with(b"true") {
+                    stream_true = true; // NOTE(review): prefix match — the byte after `true` is unchecked
+                }
+                last_key_is_stream = false; // value consumed, whether or not it was `true`
+            }
+            _ => {}
+        }
+        j += 1;
+    }
+
+    // `stream_options` would have already returned `None` above, so reaching here means it's absent.
+    if stream_true { Some(insert_at) } else { None }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn scan(body: &[u8]) -> Option<String> {
+        let mut s = ModelScanner::new();
+        s.feed(body); // whole body as a single chunk
+        s.take_model()
+    }
+
+    #[test]
+    fn extracts_model_from_sse_first_chunk() {
+        // The response-side model tap feeds SSE through this same scanner. `data: ` is non-structural
+        // noise at depth 0, so the scanner reads the first chunk's root `model` — the provider's
+        // resolved/billed id — and stops. This is what makes the billing model authoritative.
+        let sse = b"data: {\"id\":\"chatcmpl-x\",\"object\":\"chat.completion.chunk\",\"model\":\"gpt-4o-2024-08-06\",\"choices\":[]}\n\n";
+        assert_eq!(scan(sse).as_deref(), Some("gpt-4o-2024-08-06"));
+    }
+
+    /// Apply `plan_stream_usage_injection` and return the rewritten body (or unchanged if no plan),
+    /// so tests assert the *resulting* JSON — the thing the upstream actually receives.
+    fn inject(body: &str) -> String {
+        match plan_stream_usage_injection(body.as_bytes()) {
+            Some(at) => {
+                let frag = br#""stream_options":{"include_usage":true},"#;
+                let mut out = Vec::with_capacity(body.len() + frag.len());
+                out.extend_from_slice(&body.as_bytes()[..at]);
+                out.extend_from_slice(frag);
+                out.extend_from_slice(&body.as_bytes()[at..]);
+                String::from_utf8(out).unwrap()
+            }
+            None => body.to_string(),
+        }
+    }
+
+    #[test]
+    fn injects_when_streaming_and_absent() {
+        let out = inject(r#"{"model":"gpt-4o","stream":true,"messages":[]}"#);
+        assert_eq!(
+            out,
+            r#"{"stream_options":{"include_usage":true},"model":"gpt-4o","stream":true,"messages":[]}"#
+        );
+        // The result must be valid JSON with the option set.
+        let v: serde_json::Value = serde_json::from_str(&out).unwrap();
+        assert_eq!(
+            v["stream_options"]["include_usage"],
+            serde_json::json!(true)
+        );
+    }
+
+    #[test]
+    fn stream_can_be_the_only_or_last_key() {
+        assert!(plan_stream_usage_injection(br#"{"stream":true}"#).is_some());
+        let v: serde_json::Value =
+            serde_json::from_str(&inject(r#"{"model":"x","stream":true}"#)).unwrap();
+        assert_eq!(
+            v["stream_options"]["include_usage"],
+            serde_json::json!(true)
+        );
+    }
+
+    #[test]
+    fn skips_when_options_already_present() {
+        // Client already asked for usage (in any form) — never touch it.
+        assert_eq!(
+            plan_stream_usage_injection(
+                br#"{"stream":true,"stream_options":{"include_usage":false}}"#
+            ),
+            None
+        );
+        // Order-independent: options before stream.
+        assert_eq!(
+            plan_stream_usage_injection(br#"{"stream_options":{},"stream":true}"#),
+            None
+        );
+    }
+
+    #[test]
+    fn skips_when_not_streaming() {
+        assert_eq!(
+            plan_stream_usage_injection(br#"{"model":"x","stream":false}"#),
+            None
+        );
+        assert_eq!(plan_stream_usage_injection(br#"{"model":"x"}"#), None);
+    }
+
+    #[test]
+    fn ignores_nested_or_in_string_stream() {
+        // `stream` inside a message object is not the root field.
+        assert_eq!(
+            plan_stream_usage_injection(
+                br#"{"messages":[{"role":"u","stream":true}],"model":"x"}"#
+            ),
+            None
+        );
+        // `stream` mentioned inside a string value must not trigger.
+        assert_eq!(
+            plan_stream_usage_injection(br#"{"system":"set stream:true please","model":"x"}"#),
+            None
+        );
+    }
+
+    #[test]
+    fn injects_with_large_content_before_stream() {
+        // Exercises the SIMD fast-skip in the planner: a large content value must be skipped, and
+        // the genuine root `stream` after it still triggers injection.
+        let big = "x".repeat(64 * 1024);
+        let body = format!(r#"{{"messages":[{{"content":"{big}"}}],"stream":true}}"#);
+        let v: serde_json::Value = serde_json::from_str(&inject(&body)).unwrap();
+        assert_eq!(
+            v["stream_options"]["include_usage"],
+            serde_json::json!(true)
+        );
+    }
+
+    #[test]
+    fn skips_word_stream_inside_large_value() {
+        // The word `stream` (even `"stream"`) buried in a big string value must not trigger — the
+        // memmem pre-filter passes, but the structural walk correctly skips over the string content.
+        let big = "x".repeat(64 * 1024);
+        let body = format!(r#"{{"system":"{big} \"stream\":true","model":"x"}}"#);
+        assert_eq!(plan_stream_usage_injection(body.as_bytes()), None);
+    }
+
+    #[test]
+    fn stream_options_after_large_content_suppresses() {
+        // The early-return-on-stream_options path: stream_options appearing (in any order, after a
+        // big value) must suppress injection even though `stream:true` is also present.
+        let big = "x".repeat(64 * 1024);
+        let body = format!(
+            r#"{{"content":"{big}","stream":true,"stream_options":{{"include_usage":false}}}}"#
+        );
+        assert_eq!(plan_stream_usage_injection(body.as_bytes()), None);
+    }
+
+    #[test]
+    fn tolerates_whitespace_and_non_objects() {
+        assert!(plan_stream_usage_injection(b"  {  \"stream\" : true }").is_some());
+        assert_eq!(plan_stream_usage_injection(b"[1,2,3]"), None);
+        assert_eq!(plan_stream_usage_injection(b"not json"), None);
+    }
+
+    #[test]
+    fn simple() {
+        assert_eq!(
+            scan(br#"{"model":"gpt-4o","messages":[]}"#).as_deref(),
+            Some("gpt-4o")
+        );
+    }
+
+    #[test]
+    fn model_last_after_huge_array() {
+        let body = br#"{"messages":[{"role":"user","content":"...lots of text..."}],"stream":true,"model":"claude-opus-4-8"}"#;
+        assert_eq!(scan(body).as_deref(), Some("claude-opus-4-8"));
+    }
+
+    #[test]
+    fn nested_model_is_ignored() {
+        // `"model"` inside a message object must NOT win over the real root-level one.
+        let body = br#"{"messages":[{"role":"x","model":"NESTED"}],"model":"real"}"#;
+        assert_eq!(scan(body).as_deref(), Some("real"));
+    }
+
+    #[test]
+    fn model_word_inside_a_string_value_is_ignored() {
+        let body = br#"{"system":"use the model called \"gpt\" please","model":"real"}"#;
+        assert_eq!(scan(body).as_deref(), Some("real"));
+    }
+
+    #[test]
+    fn whitespace_tolerant() {
+        assert_eq!(
+            scan(br#"{  "model" :  "m1" , "x":1 }"#).as_deref(),
+            Some("m1")
+        );
+    }
+
+    #[test]
+    fn vendor_prefixed_value() {
+        assert_eq!(
+            scan(br#"{"model":"openrouter/meta-llama/llama-3.1"}"#).as_deref(),
+            Some("openrouter/meta-llama/llama-3.1")
+        );
+    }
+
+    #[test]
+    fn split_across_feeds() {
+        let mut s = ModelScanner::new();
+        for part in [
+            &b"{\"messages\":[],\"mod"[..],
+            &b"el\":\"gp"[..],
+            &b"t-4o\"}"[..],
+        ] {
+            s.feed(part); // both the `model` key and its value straddle a chunk boundary
+        }
+        assert_eq!(s.take_model().as_deref(), Some("gpt-4o"));
+    }
+
+    #[test]
+    fn absent_is_none() {
+        assert_eq!(scan(br#"{"messages":[]}"#), None); // a valid object with no `model` key
+        assert_eq!(scan(b"not json"), None); // no `{` ever opens a root object
+    }
+
+    #[test]
+    fn large_skipped_value_then_model() {
+        // Exercises the SIMD fast-skip: a ~256KB content string (with an escaped quote) then the
+        // real model. Must skip the bulk and still find the root model exactly.
+        let big = "x".repeat(256 * 1024);
+        let body =
+            format!(r#"{{"messages":[{{"content":"{big}\"still in string"}}],"model":"gpt-4o"}}"#);
+        assert_eq!(scan(body.as_bytes()).as_deref(), Some("gpt-4o"));
+    }
+
+    #[test]
+    fn nested_object_value_then_root_model() {
+        // A root key whose value is an object, followed by the real model.
+        let body = br#"{"response_format":{"type":"json_object"},"model":"gpt-4o"}"#;
+        assert_eq!(scan(body).as_deref(), Some("gpt-4o"));
+    }
+
+    #[test]
+    fn escaped_quote_inside_model_value_does_not_terminate_it() {
+        // An escaped `"` *inside the model value itself* exercises the `Cap::ModelValue` escape
+        // branch of `feed`: the backslash-escaped quote must be kept in the accumulated value rather
+        // than ending the string early. (Model ids never really contain quotes, but a structural
+        // regression here would truncate the model — and thus mislabel usage — for any value that
+        // happens to contain an escape.)
+        assert_eq!(
+            scan(br#"{"model":"gpt-4\"o"}"#).as_deref(),
+            Some("gpt-4\"o")
+        );
+    }
+}
diff --git a/src/proxy.rs b/src/proxy.rs
new file mode 100644
index 0000000..9083ea7
--- /dev/null
+++ b/src/proxy.rs
@@ -0,0 +1,943 @@
+//! The Pingora `ProxyHttp` passthrough service.
+//!
+//! Flow: pick the provider from the **first path segment** (`/{provider}/…`) → verify the virtual
+//! key (stateless) → deny-set check (O(1), default-allow) → swap the auth
+//! header to the pool key (managed only) → **stream the request body straight through** (never
+//! buffered; original framing preserved) while feeding it to a structural scanner that extracts the
+//! exact root-level `model` → relay the response **without buffering** → tap usage from a bounded
+//! tail → emit a usage fact. Whether the call is streaming is derived from the *response*
+//! Content-Type.
+//!
+//! Verified end-to-end (`tests/e2e.rs`): a real `beyond-ai` binary against real nats-server + a
+//! mock upstream — passthrough fidelity, key swap, usage metering (non-streaming + SSE), BYO
+//! passthrough, and deny-set propagation all pass.
+//!
+//! We never read the request body in `request_filter`: Pingora's body-forward phase reads the
+//! downstream body itself, so draining it earlier would make Pingora send `Content-Length` bytes
+//! with no body and the upstream would hang. We let the body flow through `request_body_filter`
+//! (the supported hook), feeding each chunk to a streaming structural scanner (`peek::ModelScanner`,
+//! O(1) memory) — never withholding or buffering it.
+//!
+//! One deliberate exception to the no-buffer rule: a **managed** OpenAI chat/responses request is
+//! buffered and gets `stream_options.include_usage` injected when it streams without it — otherwise
+//! OpenAI emits no usage chunk and the request couldn't be metered. We can't set that option in a
+//! client SDK we don't control, so the gateway guarantees it, out of the box. Scoped to exactly that
+//! path (managed + OpenAI dialect + streaming-capable); BYO and everything else stay pure passthrough.
+//!
+//! Auth branches on key format: `bai_…` is a managed virtual key (verify → deny-check → swap to
+//! the pool key); anything else is a **BYO** request — the user's own provider token, passed
+//! through unchanged (no swap, no Beyond identity, no deny-set).
+//!
+//! Routing is by the **first path segment** = provider name (`route`, data-driven): `/{provider}/…`
+//! selects the provider and the rest of the path is forwarded **verbatim** (the gateway holds no
+//! per-provider mount knowledge). A bare path with no provider prefix that starts with `/v1` is the
+//! drop-in default — dialect picks openai/anthropic (`dialect_for_path`) — so an OpenAI/Anthropic
+//! client works by changing only the host. An unknown first segment is a 404. Model isn't used for
+//! routing (the body isn't read pre-connect); it's still captured from the body for usage.
+
+use crate::route::{self, Dialect, Provider};
+use crate::state::{GatewayState, RequestId};
+use crate::{peek, usage};
+use async_trait::async_trait;
+use bytes::Bytes;
+use pingora::http::ResponseHeader;
+use pingora_core::Result;
+use pingora_core::protocols::ALPN;
+use pingora_core::upstreams::peer::HttpPeer;
+use pingora_proxy::{ProxyHttp, Session};
+use std::borrow::Cow;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+use tracing::{info, warn};
+
+/// Response header carrying the per-request id (`{instance}-{seq}`). Set on both the proxied
+/// response and every reject body so a client can quote it and an oncall can grep for it.
+const REQUEST_ID_HEADER: &str = "x-beyond-request-id";
+
+/// Reject requests whose body exceeds this — an abuse guard checked up front via Content-Length and
+/// as a running byte total for chunked bodies; it also bounds the one buffered (inject) path.
+const MAX_REQUEST_BODY: usize = 100 * 1024 * 1024;
+
+/// Bounded tail of the response kept for usage extraction. The usage event is the final SSE chunk
+/// / the whole non-streaming body; keeping a tail means we never buffer a long stream.
+const USAGE_TAIL_CAP: usize = 64 * 1024;
+
+/// Max upstream **connect** retries before surfacing the failure to the client.
+///
+/// We retry connect failures only (the idiomatic Pingora pattern, same as edge). Retrying on a
+/// received **5xx/429 response** is deliberately *not* done: Pingora 0.8 has no clean
+/// post-response retry hook for a streaming passthrough (edge doesn't do it either), the upstream
+/// may have started streaming, and the provider SDKs already back off on 429/5xx + `Retry-After`.
+const MAX_CONNECT_RETRIES: u8 = 2;
+
+pub struct AiProxy {
+    pub state: Arc<GatewayState>, // shared gateway state, held behind an `Arc`
+}
+
+/// Per-request context. `None` until `request_filter` admits the request; short-circuited
+/// requests (auth/deny failures) leave it `None`, so later filters no-op.
+pub struct RequestCtx {
+    tenant_id: u64, // NOTE(review): presumably from the verified virtual key — confirm
+    vpc_id: u64, // NOTE(review): presumably from the verified virtual key — confirm
+    dialect: Dialect, // NOTE(review): presumably set during routing (cf. `dialect_for_path`) — confirm
+    /// The resolved upstream provider (authority/host + precomputed managed auth value), shared from
+    /// the boot-time registry — a cheap `Arc` clone, nothing re-allocated per request.
+    provider: Arc<Provider>,
+    /// The path (+ query) to send upstream: the client path with the `/{provider}` segment stripped
+    /// (provider-prefixed request) or unchanged (bare-path default). Forwarded **verbatim** — the
+    /// gateway does no per-provider path rewriting. Applied as the upstream URI in
+    /// `upstream_request_filter`.
+    forward_path: String,
+    /// Whether this is a **managed** request (`bai_…` key → swap to the pool key). `false` for
+    /// **BYO** — we leave the user's own auth header untouched (passthrough).
+    managed: bool,
+    /// Model the client *requested*, extracted from the request body. This is the billing-log
+    /// **fallback** — the authoritative value is the model the provider echoes in its response (see
+    /// `resp_model_scanner`), because a client may send an alias (`gpt-4o`) that the provider resolves
+    /// to and bills under a pinned id (`gpt-4o-2024-08-06`).
+    model: String,
+    model_scanner: peek::ModelScanner, // request-side scanner; its result populates `model`
+    /// Extracts the model the **provider** reports in its response (the resolved/billed id), fed the
+    /// response stream in `response_body_filter`. Preferred over `model` in the `ai.usage` event so
+    /// the billed model is authoritative, not the requested alias. Works for SSE too: the scanner
+    /// skips the `data: ` prefix and reads the first chunk's root `model`. Falls back to `model` when
+    /// the response carries none (e.g. an error body).
+    resp_model_scanner: peek::ModelScanner,
+    /// Whether the upstream response is an SSE stream — set in `response_filter` from the response
+    /// Content-Type (we don't read the request to learn this).
+    streaming: bool,
+    /// Bounded tail of the response, for the usage tap.
+    resp_tail: Vec<u8>,
+    /// Running total of request-body bytes seen, to enforce `MAX_REQUEST_BODY` even when the client
+    /// uses chunked transfer encoding (no `Content-Length` to check up front).
+    body_bytes_fed: usize,
+    /// Upstream HTTP status, set in `response_filter` once the response head arrives. Drives the
+    /// circuit-breaker outcome recorded once in `logging`: `5xx` → failure, any other response →
+    /// success (the provider answered — a `429` is a healthy throttle, not a breaker trip), and a
+    /// `None` here with an upstream error → failure (connect/read failed before any response).
+    upstream_status: Option<u16>,
+    /// Managed OpenAI chat/responses request: buffer the body and inject
+    /// `stream_options.include_usage` if it streams without it, so the usage chunk (hence the
+    /// billable token count) is guaranteed. The single, deliberate exception to "never buffer the
+    /// request body" — scoped to the managed OpenAI streaming-capable path and bounded by
+    /// `MAX_REQUEST_BODY`. BYO and every other request still stream straight through.
+    inject_eligible: bool,
+    /// Accumulated request body — populated only when `inject_eligible`; otherwise stays empty and
+    /// the body is never buffered.
+    req_buf: Vec<u8>,
+    start: Instant, // NOTE(review): presumably captured at admission, for latency metrics — confirm
+    /// Connect-retry counter (see `fail_to_connect`).
+    attempt: u8,
+    /// Process-unique id for this request (`{instance}-{seq}`), echoed in the `x-beyond-request-id`
+    /// response header and the `ai.usage` event so a client report ties back to a log line.
+    request_id: RequestId,
+}
+
+impl AiProxy {
+    /// Write a small JSON error and signal `request_filter` to short-circuit. The body is built with
+    /// `serde_json` (not `format!`) so a `typ`/`msg` containing `"` or `\` can never break out of the
+    /// JSON structure — keeps this safe if a future caller passes a non-literal message.
+    ///
+    /// Every rejection logs one structured `warn` line (the rejection counter only says *how many*,
+    /// not *which request* — this is what an oncall greps when a `deny_fraud`/`rate_limit` spike
+    /// shows on the dashboard) and echoes the `request_id` in a response header so a client report
+    /// quoting that id lands on this line.
+    async fn reject(
+        session: &mut Session,
+        request_id: &str,
+        status: u16,
+        typ: &str,
+        msg: &str,
+    ) -> Result<bool> {
+        warn!(request_id, status, error_type = typ, "request rejected"); // `msg` is not logged
+        let body = Bytes::from(
+            serde_json::json!({ "error": { "type": typ, "message": msg } }).to_string(),
+        );
+        let mut resp = ResponseHeader::build(status, None)?;
+        resp.insert_header("content-type", "application/json")?;
+        resp.insert_header("content-length", body.len().to_string())?; // byte length of the JSON
+        resp.insert_header(REQUEST_ID_HEADER, request_id)?;
+        session.write_response_header(Box::new(resp), false).await?; // `false`: a body follows
+        session.write_response_body(Some(body), true).await?; // `true`: this ends the response
+        Ok(true) // always `Ok(true)` on success: the short-circuit signal for `request_filter`
+    }
+}
+
+fn extract_virtual_key(session: &Session) -> Option<&str> {
+    let h = session.req_header();
+    // Anthropic SDK sends `x-api-key`; OpenAI SDK sends `Authorization: Bearer`. One neutral virtual
+    // key works in either; `x-api-key` wins if both are set. Borrowed from the header — no copy.
+    if let Some(v) = h.headers.get("x-api-key").and_then(|v| v.to_str().ok()) {
+        return Some(v);
+    }
+    h.headers
+        .get("authorization")
+        .and_then(|v| v.to_str().ok()) // a value `to_str` rejects is treated as absent
+        .and_then(|v| v.strip_prefix("Bearer ")) // NOTE(review): exact-case `Bearer ` only
+}
+
+/// Upper bound on a model id we'll record. Real ids are short (`claude-opus-4-8`,
+/// `accounts/fireworks/models/…`); anything longer is junk or an attempt to bloat the billing log.
+const MAX_MODEL_LEN: usize = 128; // bytes — compared against `String::len` in `sanitize_model`
+
+/// Sanitize the model id extracted from the (client-controlled) request body before it lands in the
+/// `ai.usage` billing log. `tracing`'s JSON layer escapes the value, but a downstream consumer
+/// (logfwd/OTLP → ClickHouse) may re-handle it, so we refuse anything that could break out of a JSON
+/// string or a line-oriented log: control bytes, `"`, `\`, `DEL`. A violating or over-long value is
+/// recorded as `"unknown"` (matching `peek`'s non-UTF-8 fallback) rather than the raw bytes — a
+/// mislabeled-but-safe usage row beats a corrupted or injected one.
+fn sanitize_model(model: String) -> Cow<'static, str> {
+    let bad = model.len() > MAX_MODEL_LEN // byte length, not character count
+        || model
+            .bytes()
+            .any(|b| b < 0x20 || b == b'"' || b == b'\\' || b == 0x7f); // controls, `"`, `\`, DEL
+    if bad {
+        Cow::Borrowed("unknown") // static fallback, no allocation
+    } else {
+        Cow::Owned(model) // moved in, not copied; note that an empty id also passes through
+    }
+}
+
+fn dialect_for_path(path: &str) -> Dialect {
+    // The Messages API prefix is the only Anthropic route; every other path (chat completions,
+    // embeddings, …) is handled as OpenAI dialect.
+    if !path.starts_with("/v1/messages") {
+        return Dialect::OpenAI;
+    }
+    Dialect::Anthropic
+}
+
+/// Whether the **forwarded** (provider-native) path targets an OpenAI streaming-capable endpoint —
+/// chat completions or the Responses API. Matched on the *suffix* of the path component — a
+/// trailing `?query` (which `forward_path` carries) is ignored — so it holds for any provider
+/// mount prefix (`/v1/…`, `/openai/v1/…`). Only these are buffered for `include_usage` injection.
+fn is_streamable_path(forward_path: &str) -> bool {
+    let path = forward_path.split_once('?').map_or(forward_path, |(path, _)| path);
+    path.ends_with("/chat/completions") || path.ends_with("/responses")
+}
+
+/// Ensure a streamed OpenAI chat body asks the provider for a usage chunk. If
+/// `peek::plan_stream_usage_injection` yields a byte offset, the `stream_options.include_usage`
+/// fragment is spliced in there; if it yields `None`, the body comes back untouched. This is what
+/// guarantees a usage chunk — hence a billable token count — from a stock client that never set
+/// the option.
+fn maybe_inject_stream_usage(body: Vec<u8>) -> Vec<u8> {
+    const FRAG: &[u8] = br#""stream_options":{"include_usage":true},"#;
+    let Some(at) = peek::plan_stream_usage_injection(&body) else {
+        // No injection planned: hand the original buffer back without copying it.
+        return body;
+    };
+    // Rebuild as `prefix + FRAG + suffix`; `concat` sizes the output once, so there is a single
+    // allocation for the rewritten body.
+    let (prefix, suffix) = body.split_at(at);
+    [prefix, FRAG, suffix].concat()
+}
+
+#[async_trait]
+impl ProxyHttp for AiProxy {
+    type CTX = Option<RequestCtx>;
+
+    fn new_ctx(&self) -> Self::CTX {
+        None // stays `None` unless `request_filter` admits the request and stores a `RequestCtx`
+    }
+
+    /// Admission: route → key → rate limit → body cap → identity/deny-set → circuit breaker.
+    /// Returns `Ok(true)` after writing a rejection, `Ok(false)` after storing the `RequestCtx`.
+    async fn request_filter(&self, session: &mut Session, ctx: &mut Self::CTX) -> Result<bool> {
+        self.state.metrics.requests_total.inc();
+        let start = Instant::now();
+        // One id per request, generated before any reject path so even a 400/401 carries it (in the
+        // log line and the `x-beyond-request-id` header). Moved into `ctx` at the end for the
+        // admitted path. Cheap: a counter bump + a short `format!` (see `next_request_id`).
+        let request_id = self.state.next_request_id();
+
+        // 1. Route by the **first path segment** = provider; forward the rest of the path verbatim
+        // (native passthrough — the gateway holds no per-provider mount knowledge). A path with no
+        // provider segment that starts with `/v1` is the drop-in default: dialect picks
+        // openai/anthropic and the path is forwarded as-is. Anything else → unknown provider (404).
+        // We resolve before auth (an unknown route is cheap) and compute owned values inside the
+        // block so the session borrow ends before any `&mut session` reject below.
+        let (provider_opt, forward_path) = {
+            let uri = &session.req_header().uri;
+            let path = uri.path();
+            let query = uri.query();
+            // `nth(1)`: `/openai/v1/…` → "openai"; `/v1/…` → "v1"; "/" or "" → "".
+            let first = path.split('/').nth(1).unwrap_or("");
+            let with_query = |p: &str| match query {
+                Some(q) => format!("{p}?{q}"),
+                None => p.to_string(),
+            };
+            if let Some(p) = self.state.provider(first) {
+                // Provider-prefixed: strip the leading `/{first}` segment, forward the remainder.
+                let rest = &path[1 + first.len()..];
+                (
+                    Some(p.clone()),
+                    with_query(if rest.is_empty() { "/" } else { rest }),
+                )
+            } else if path.starts_with(route::DEFAULT_PREFIX) {
+                // Bare default: dialect picks the provider; forward the path unchanged.
+                let name = route::dialect_default(dialect_for_path(path));
+                (self.state.provider(name).cloned(), with_query(path))
+            } else {
+                (None, String::new())
+            }
+        };
+        let Some(provider) = provider_opt else {
+            return Self::reject(
+                session,
+                &request_id,
+                404,
+                "invalid_request_error",
+                "unknown provider",
+            )
+            .await;
+        };
+        // Dialect now comes from the resolved provider (usage parsing + injection eligibility).
+        let dialect = provider.dialect;
+
+        // 2. Extract the presented key — a managed virtual key (`bai_…`) or a raw BYO provider token.
+        let Some(raw_key) = extract_virtual_key(session) else {
+            return Self::reject(
+                session,
+                &request_id,
+                401,
+                "authentication_error",
+                "missing API key",
+            )
+            .await;
+        };
+
+        // 3. Rate guardrails (see `ratelimit`), charged on the *raw presented key* **before** any
+        // verification or upstream connect. Keying on the credential we already hold (rather than the
+        // verified tenant id) is what lets this sit ahead of the Ed25519 verify: a single leaked,
+        // runaway, or forged key can't drive unbounded crypto work (per-credential tier), and a flood
+        // of distinct random BYO tokens can't drive junk-auth connects to providers from our egress
+        // IPs (global BYO tier — managed traffic is exempt, see `ratelimit`). `raw_key` is unused
+        // after the over-limit return, so its borrow doesn't block the `&mut session` reject.
+        if let Some(rl) = &self.state.rate_limit {
+            if let Some(reason) = rl.check(raw_key, raw_key.starts_with("bai_")) {
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&[reason.label()])
+                    .inc();
+                return Self::reject(
+                    session,
+                    &request_id,
+                    429,
+                    "rate_limit_error",
+                    "rate limit exceeded",
+                )
+                .await;
+            }
+        }
+
+        // 4. Reject oversized bodies up front (Content-Length) so we never buffer a huge upload.
+        // Counted as `body_too_large`, the same label the streamed cap in `request_body_filter` uses.
+        let declared_len = session
+            .req_header()
+            .headers
+            .get("content-length")
+            .and_then(|v| v.to_str().ok())
+            .and_then(|v| v.parse::<usize>().ok());
+        if declared_len.is_some_and(|len| len > MAX_REQUEST_BODY) {
+            self.state
+                .metrics
+                .rejections_total
+                .with_label_values(&["body_too_large"])
+                .inc();
+            return Self::reject(
+                session,
+                &request_id,
+                413,
+                "invalid_request_error",
+                "request body too large",
+            )
+            .await;
+        }
+
+        // 5. Identity + key handling. `bai_…` → managed (stateless verify → deny-check → swap to the
+        // pool key). Anything else → BYO: the user's own provider token, passed through unchanged
+        // (no Beyond identity, so no deny-set and no per-tenant attribution).
+        let (tenant_id, vpc_id, managed) = if raw_key.starts_with("bai_") {
+            let Ok(identity) = self.state.keyring.verify(raw_key) else {
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&["auth"])
+                    .inc();
+                return Self::reject(
+                    session,
+                    &request_id,
+                    401,
+                    "authentication_error",
+                    "invalid API key",
+                )
+                .await;
+            };
+            // Deny-set: O(1), default-allow. The gateway never learns *why*, only the reason code.
+            if let Some(reason) = self.state.deny.load().reason(identity.tenant_id) {
+                // Distinct label per reason — `Unknown` is *not* folded into `deny_fraud`. It
+                // arises when the control plane writes a reason string this gateway doesn't
+                // recognize (control-plane deploy ahead of a gateway deploy); folded in, it would
+                // spike the fraud counter and mask the real signal. `deny_unknown` keeps it apart.
+                let label = match reason {
+                    crate::deny::DenyReason::Spend => "deny_spend",
+                    crate::deny::DenyReason::Fraud => "deny_fraud",
+                    crate::deny::DenyReason::Unknown => "deny_unknown",
+                };
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&[label])
+                    .inc();
+                return Self::reject(
+                    session,
+                    &request_id,
+                    reason.http_status(),
+                    "access_denied",
+                    "tenant is over limit or suspended",
+                )
+                .await;
+            }
+            // The actual `Bearer …`/`x-api-key` value is precomputed in the provider registry and
+            // applied in `upstream_request_filter`; here we only confirm a pool key exists.
+            if provider.pool_auth_value.is_none() {
+                return Self::reject(
+                    session,
+                    &request_id,
+                    503,
+                    "api_error",
+                    "no provider key available",
+                )
+                .await;
+            }
+            (identity.tenant_id, identity.vpc_id, true)
+        } else {
+            (0, 0, false)
+        };
+
+        // Mark OpenAI managed chat/responses streams for body buffering + `stream_options` injection
+        // (handled in `request_body_filter`). Scoped tight: managed only (BYO stays pure
+        // passthrough), OpenAI dialect only, streaming-capable paths only — so everything else still
+        // streams through untouched. Checked on the forwarded path (suffix), so it's prefix-agnostic.
+        let inject_eligible =
+            managed && dialect == Dialect::OpenAI && is_streamable_path(&forward_path);
+
+        // Circuit breaker (per provider, all traffic — a down provider is down regardless of whose
+        // key is used). Checked here, after every other rejection, so claiming a half-open probe
+        // permit corresponds to an *actual* upstream attempt — balanced by exactly one `record_*`
+        // in `logging` (once per admitted request), so a permit can't leak. When open, fast-fail
+        // 503 rather than pile the request against `read_timeout_secs` and exhaust in-flight slots
+        // for every provider. 5xx/connect failures trip it; a 429 (healthy throttling) never does.
+        if let Some(breaker) = &provider.breaker {
+            if breaker.allow().is_err() {
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&["circuit_open"])
+                    .inc();
+                return Self::reject(
+                    session,
+                    &request_id,
+                    503,
+                    "api_error",
+                    "provider temporarily unavailable",
+                )
+                .await;
+            }
+        }
+
+        *ctx = Some(RequestCtx {
+            tenant_id,
+            vpc_id,
+            dialect,
+            provider,
+            forward_path,
+            managed,
+            model: String::new(),
+            model_scanner: peek::ModelScanner::new(),
+            resp_model_scanner: peek::ModelScanner::new(),
+            streaming: false,
+            inject_eligible,
+            // Only the inject-eligible path buffers the request body: `stream` can appear anywhere
+            // in the root object, so deciding whether to splice `stream_options` after the root `{`
+            // needs the whole body. Pre-size from the declared Content-Length (one allocation, no
+            // geometric realloc chain), capped at `MAX_REQUEST_BODY` so a lying header can't
+            // pre-allocate unbounded memory. Every other request leaves this empty, never buffers.
+            req_buf: match (inject_eligible, declared_len) {
+                (true, Some(len)) => Vec::with_capacity(len.min(MAX_REQUEST_BODY)),
+                _ => Vec::new(),
+            },
+            // Grown lazily by the response tap (`response_body_filter`), not pre-reserved: the
+            // common non-streaming response is a few hundred bytes, so reserving the full 64KB cap
+            // up front would waste an allocation per request. A long stream grows it geometrically
+            // to the bounded 2×cap and compacts — a handful of reallocs, lost in network noise.
+            resp_tail: Vec::new(),
+            body_bytes_fed: 0,
+            upstream_status: None,
+            start,
+            attempt: 0,
+            request_id,
+        });
+        // Admitted: count it in-flight. Balanced by the decrement in `logging`, which runs exactly
+        // once per admitted request (rejected requests leave `ctx` None and never reach that path,
+        // so the gauge can't leak). `active_streams` only covers SSE; this covers every request.
+        self.state.metrics.requests_in_flight.inc();
+        Ok(false)
+    }
+
+    async fn upstream_peer(
+        &self,
+        _session: &mut Session,
+        ctx: &mut Self::CTX,
+    ) -> Result<Box<HttpPeer>> {
+        // `ctx` is set by `request_filter` for every admitted request; a missing ctx here means an
+        // unadmitted request reached `upstream_peer` (a Pingora ordering change or future refactor).
+        // Surface it as an error rather than panicking the worker.
+        let Some(rc) = ctx.as_ref() else {
+            return Err(pingora_core::Error::new_str(
+                "upstream_peer reached without request context",
+            ));
+        };
+
+        // Resolve via the TTL cache (async, non-blocking) rather than `HttpPeer::new`'s eager
+        // blocking `getaddrinfo`. SNI/Host = the configured host; TLS on for real providers (the
+        // e2e harness flips `upstream_tls=false` for a plaintext mock).
+        let addr = match self.state.resolve(&rc.provider.authority).await {
+            Ok(a) => a,
+            Err(e) => {
+                // DNS failures are rare and usually mean a misconfigured `provider_authorities`
+                // override — so keep the diagnostic (provider name + authority + the resolver error,
+                // already formatted into `e`) instead of discarding it behind an opaque static string.
+                // `Error::because` attaches `e` as the cause of the returned `ConnectError`.
+                warn!(
+                    request_id = %rc.request_id,
+                    provider = rc.provider.name.as_str(),
+                    authority = rc.provider.authority.as_str(),
+                    error = %e,
+                    "upstream dns resolution failed",
+                );
+                return Err(pingora_core::Error::because(
+                    pingora_core::ErrorType::ConnectError,
+                    "upstream dns resolution failed",
+                    e,
+                ));
+            }
+        };
+        let mut peer = HttpPeer::new(
+            addr,
+            self.state.config.upstream_tls,
+            rc.provider.host.clone(),
+        );
+        // Prefer HTTP/2 to the provider (config `upstream_http2`, default on), fall back to HTTP/1.1.
+        // Every provider in `KNOWN_PROVIDERS` negotiates `h2` over TLS (verified by handshake), and H2
+        // multiplexes many concurrent requests/streams over one connection — fewer sockets and TLS
+        // handshakes from our egress IPs (which also eases the egress-reputation pressure `ratelimit`
+        // guards). `H2H1` is strictly ≥ `H1` on compatibility: ALPN negotiates down to H1 for any host
+        // that doesn't offer h2, and a plaintext upstream (the mock, `upstream_tls=false`) has no ALPN
+        // at all and stays H1. The negotiated protocol is then visible per-request as
+        // `upstream_request.version` (see `upstream_request_filter`), which is what lets the
+        // body-injection path frame correctly. The knob lets an operator force all-H1 without a code
+        // redeploy, and lets the e2e bench compare the two head-to-head.
+        peer.options.alpn = if self.state.config.upstream_http2 {
+            ALPN::H2H1
+        } else {
+            ALPN::H1
+        };
+        // Cert verification is on everywhere except the bench's self-signed TLS mock (see config).
+        if !self.state.config.upstream_verify_cert {
+            peer.options.verify_cert = false;
+            peer.options.verify_hostname = false;
+        }
+        peer.options.connection_timeout =
+            Some(Duration::from_secs(self.state.config.connect_timeout_secs));
+        peer.options.read_timeout = Some(Duration::from_secs(self.state.config.read_timeout_secs));
+        peer.options.write_timeout =
+            Some(Duration::from_secs(self.state.config.write_timeout_secs));
+        peer.options.idle_timeout = Some(Duration::from_secs(self.state.config.idle_timeout_secs));
+        Ok(Box::new(peer))
+    }
+
+    async fn upstream_request_filter(
+        &self,
+        _session: &mut Session,
+        upstream_request: &mut pingora::http::RequestHeader,
+        ctx: &mut Self::CTX,
+    ) -> Result<()> {
+        let Some(rc) = ctx.as_ref() else {
+            return Ok(());
+        };
+
+        // Managed: swap the virtual key for the real pool key (precomputed at boot) in the scheme
+        // the upstream wants — removing *both* inbound auth headers first so the virtual key never
+        // leaks upstream. BYO (`!managed`): leave the user's own auth header exactly as presented.
+        if rc.managed {
+            if let Some(av) = &rc.provider.pool_auth_value {
+                upstream_request.remove_header("authorization");
+                upstream_request.remove_header("x-api-key");
+                upstream_request.insert_header(rc.provider.auth.header(), av.expose())?;
+            }
+        }
+
+        // Point Host at the upstream.
+        upstream_request.insert_header("host", rc.provider.host.as_str())?;
+
+        // Forward the provider-native path (computed in `request_filter`): the client path with the
+        // `/{provider}` segment stripped, or unchanged for a bare-path default. We send it verbatim —
+        // no per-provider rewriting — and only set the URI when it differs from the inbound one (a
+        // prefix was stripped), skipping the parse + realloc otherwise; body framing is untouched here.
+        // NOTE(review): a `forward_path` that fails to parse silently keeps the inbound URI — intended?
+        if rc.forward_path
+            != upstream_request
+                .uri
+                .path_and_query()
+                .map(|pq| pq.as_str())
+                .unwrap_or("")
+            && let Ok(uri) = rc.forward_path.parse()
+        {
+            upstream_request.set_uri(uri);
+        }
+
+        // Injection-eligible (OpenAI managed stream): the body is rewritten in `request_body_filter`,
+        // changing its length, and we can't know the new length here (headers go out before the body
+        // filter runs). So drop the client's `Content-Length`; how the now-unknown length is framed
+        // depends on the **negotiated upstream protocol**, which is reliably readable here as
+        // `upstream_request.version`: pingora-proxy sets it to HTTP/2 before this filter on the H2 path
+        // (`proxy_h2.rs`) and to HTTP/1.1 on the H1 path (`proxy_h1.rs`).
+        //
+        //   - **H1**: a body with neither `content-length` nor `transfer-encoding` is framed as
+        //     *zero-length* by pingora's H1 client (RFC 9112 §6.3) — the injected body would be
+        //     silently dropped. So we must set `transfer-encoding: chunked`.
+        //   - **H2**: bodies are delimited by `END_STREAM`, and `transfer-encoding` is a forbidden
+        //     connection-specific header — the `h2` crate *rejects the whole request*
+        //     (`UserError::MalformedHeaders`) if it's present. So we must NOT set it; removing
+        //     `content-length` is sufficient and correct.
+        if rc.inject_eligible {
+            upstream_request.remove_header("content-length");
+            if upstream_request.version != http::Version::HTTP_2 {
+                upstream_request.insert_header("transfer-encoding", "chunked")?;
+            }
+        }
+        Ok(())
+    }
+
+    async fn request_body_filter(
+        &self,
+        _session: &mut Session,
+        body: &mut Option<Bytes>,
+        end_of_stream: bool,
+        ctx: &mut Self::CTX,
+    ) -> Result<()> {
+        let Some(rc) = ctx.as_mut() else {
+            return Ok(());
+        };
+        // Feed each chunk through the structural scanner as it passes to extract the exact
+        // root-level `model`. Only the inject-eligible path below buffers/withholds the body.
+        if let Some(chunk) = body.as_ref() {
+            // Enforce the body cap on the *streamed* size too: the up-front `Content-Length` check in
+            // `request_filter` can't see a chunked-encoded body (no declared length). We don't buffer
+            // — we just count — and abort the proxied request once the running total crosses the cap.
+            // Aborting (vs. a clean 413) is acceptable here: headers are already away to the upstream,
+            // and this is an abuse guard, not a normal client path.
+            rc.body_bytes_fed = rc.body_bytes_fed.saturating_add(chunk.len());
+            if rc.body_bytes_fed > MAX_REQUEST_BODY {
+                self.state
+                    .metrics
+                    .rejections_total
+                    .with_label_values(&["body_too_large"])
+                    .inc();
+                return Err(pingora_core::Error::new_str("request body exceeds limit"));
+            }
+            rc.model_scanner.feed(chunk);
+            // Eligible requests are buffered so we can splice the root object before any byte reaches
+            // the upstream (injection inserts near the front, so we can't have forwarded it already).
+            if rc.inject_eligible {
+                rc.req_buf.extend_from_slice(chunk);
+            }
+        }
+
+        if rc.inject_eligible {
+            if end_of_stream {
+                // Emit the whole (possibly rewritten) body in one shot; `upstream_request_filter`
+                // already dropped `content-length` (chunked on H1), so the changed length is fine.
+                let buf = std::mem::take(&mut rc.req_buf);
+                *body = Some(Bytes::from(maybe_inject_stream_usage(buf)));
+            } else {
+                // Withhold — the bytes are buffered above; nothing goes upstream until end-of-stream.
+                *body = None;
+            }
+        }
+
+        if end_of_stream && rc.model.is_empty() {
+            if let Some(m) = rc.model_scanner.take_model() {
+                rc.model = sanitize_model(m).into_owned();
+            }
+        }
+        Ok(())
+    }
+
+    async fn response_filter(
+        &self,
+        _session: &mut Session,
+        upstream_response: &mut ResponseHeader,
+        ctx: &mut Self::CTX,
+    ) -> Result<()> {
+        // No context ⇒ nothing to record or tag; pass the response head through untouched.
+        let Some(rc) = ctx.as_mut() else {
+            return Ok(());
+        };
+
+        // The response head arriving ≈ time-to-first-byte. It is observed on the per-provider
+        // handle (resolved once at boot, see `ProviderMetrics`): first-token latency is a
+        // per-provider property, and an unlabeled histogram can't say which provider regressed.
+        let ttft = rc.start.elapsed().as_secs_f64();
+        rc.provider.metrics.ttft_seconds.observe(ttft);
+
+        // Per-provider response counter, bucketed by status class — the signal that a provider is
+        // degrading (429/5xx) before it shows up only as latency or a missing usage event. The
+        // status is also kept for `logging`, which derives the circuit-breaker outcome from it:
+        // any response means the provider is reachable — a 429/5xx is a real answer.
+        let status = upstream_response.status.as_u16();
+        rc.provider.metrics.record_response(status);
+        rc.upstream_status = Some(status);
+
+        // Streaming is decided by the *response*, not the request: an SSE content type selects the
+        // streaming usage parser; anything else is treated as a single JSON object.
+        let content_type = upstream_response
+            .headers
+            .get("content-type")
+            .and_then(|v| v.to_str().ok());
+        rc.streaming = content_type.is_some_and(|ct| ct.contains("event-stream"));
+        // `active_streams` gauges in-flight SSE streams: bumped here, once the response head is
+        // in, and decremented in `logging` when the stream completes, so it doesn't only ever
+        // climb. Non-streaming responses never touch it.
+        if rc.streaming {
+            self.state.metrics.active_streams.inc();
+        }
+
+        // Echo the request id so a client (or an oncall reading a captured response) can quote it
+        // and land on this request's log line. `insert_header` only fails on an invalid value;
+        // our id is `[0-9a-f-]`, always valid — but surface a failure rather than silently drop.
+        upstream_response.insert_header(REQUEST_ID_HEADER, rc.request_id.as_str())?;
+        Ok(())
+    }
+
+    fn response_body_filter(
+        &self,
+        _session: &mut Session,
+        body: &mut Option<Bytes>,
+        _end_of_stream: bool,
+        ctx: &mut Self::CTX,
+    ) -> Result<Option<Duration>>
+    where
+        Self::CTX: Send + Sync,
+    {
+        // Passive tap: each chunk is copied for later parsing but never withheld — `body` is left
+        // as-is, so the stream is relayed with no added buffering. Always returns `Ok(None)`.
+        let (Some(rc), Some(chunk)) = (ctx.as_mut(), body.as_ref()) else {
+            return Ok(None);
+        };
+
+        // Head tap: the provider-reported (resolved/billed) model sits at the start of the
+        // response, so it gets its own scanner instead of sharing the tail — the usage event is
+        // at the other end. The scanner stops at the first root `model`, so later chunks are cheap.
+        rc.resp_model_scanner.feed(chunk);
+
+        // Tail tap: keep the newest bytes for usage parsing. The buffer is allowed to reach 2× the
+        // cap; once past that, a single `drain` discards everything except the last
+        // `USAGE_TAIL_CAP` bytes. Compacting only at the 2× mark bounds memory while moving bytes
+        // O(stream_len / cap) times rather than once per chunk — for a long stream of small
+        // chunks, one memmove per cap-worth of data instead of one per chunk.
+        rc.resp_tail.extend_from_slice(chunk);
+        if rc.resp_tail.len() > 2 * USAGE_TAIL_CAP {
+            let surplus = rc.resp_tail.len() - USAGE_TAIL_CAP;
+            rc.resp_tail.drain(..surplus);
+        }
+        Ok(None)
+    }
+
+    fn fail_to_connect(
+        &self,
+        _session: &mut Session,
+        _peer: &HttpPeer,
+        ctx: &mut Self::CTX,
+        mut e: Box<pingora_core::Error>,
+    ) -> Box<pingora_core::Error> {
+        // Only an admitted request with retry budget left is retried; in every other case the
+        // error is handed back exactly as it came in.
+        let Some(rc) = ctx.as_mut().filter(|rc| rc.attempt < MAX_CONNECT_RETRIES) else {
+            return e;
+        };
+        rc.attempt += 1;
+        // Make the retry visible. Otherwise a partially-down provider TCP layer (or an egress-IP
+        // ban — connect is where that first bites) would show up only as extra latency, the same
+        // as a slow model. The counter feeds the dashboard; the `warn!` carries the request_id.
+        rc.provider.metrics.connect_retries_total.inc();
+        warn!(
+            request_id = %rc.request_id,
+            provider = rc.provider.name.as_str(),
+            attempt = rc.attempt,
+            error = %e,
+            "upstream connect failed; retrying",
+        );
+        // Flag the error as retryable (Pingora then re-invokes `upstream_peer`).
+        e.set_retry(true);
+        e
+    }
+
+    /// End-of-request hook: settle gauges and breaker, record usage metrics, emit `ai.usage`.
+    async fn logging(
+        &self,
+        _session: &mut Session,
+        e: Option<&pingora_core::Error>,
+        ctx: &mut Self::CTX,
+    ) {
+        let Some(rc) = ctx.as_mut() else { return };
+
+        // Balance the in-flight gauge incremented at admission. `logging` runs exactly once per
+        // admitted request — including on upstream errors and client disconnects — so the gauge
+        // always returns to baseline and can't drift upward.
+        self.state.metrics.requests_in_flight.dec();
+
+        // An upstream error (DNS/connect timeout, read timeout, abort) lands here with `Some(e)` but
+        // no `ai.usage` row (no parseable body) — and the earlier `warn!` in `upstream_peer` only
+        // fires for DNS, not connect/read failures. Log it with the full identity so "why did tenant
+        // 42 get 502s for 5 minutes" is one grep on the request_id, not a reconstruction.
+        if let Some(e) = e {
+            warn!(
+                request_id = %rc.request_id,
+                tenant_id = rc.tenant_id,
+                vpc_id = rc.vpc_id,
+                provider = rc.provider.name.as_str(),
+                error = %e,
+                "upstream request errored",
+            );
+        }
+
+        // Resolve the circuit-breaker outcome exactly once per admitted request (every request that
+        // claimed a permit in `request_filter` records here, so a half-open probe permit can't leak).
+        // Failure = the provider is *broken*: a 5xx response, or no response at all paired with an
+        // upstream error (connect/read failure). Success = the provider *answered* — 2xx/3xx, and
+        // deliberately **4xx/429 too**: a 429 is a healthy provider throttling our pool key, which the
+        // rate limiter and the client's `Retry-After` own, NOT a reason to cut all traffic to it.
+        if let Some(breaker) = &rc.provider.breaker {
+            match rc.upstream_status {
+                Some(s) if s >= 500 => breaker.record_failure(),
+                Some(_) => breaker.record_success(),
+                None if e.is_some() => breaker.record_failure(),
+                // No response and no error ⇒ client went away before the upstream answered; don't
+                // blame the provider — record success so the probe permit resolves.
+                None => breaker.record_success(),
+            }
+        }
+
+        // The buffer may transiently hold up to 2× the cap before compaction; the usage event is
+        // always in the last cap bytes, so slice to that bounded tail before parsing.
+        let tail_start = rc.resp_tail.len().saturating_sub(USAGE_TAIL_CAP);
+        let tail = &rc.resp_tail[tail_start..];
+
+        // Extract usage facts from the tail (shape depends on dialect + streaming).
+        let usage = match (rc.dialect, rc.streaming) {
+            (Dialect::OpenAI, true) => usage::openai_stream(tail),
+            (Dialect::OpenAI, false) => usage::openai_body(tail),
+            (Dialect::Anthropic, true) => usage::anthropic_stream(tail),
+            (Dialect::Anthropic, false) => usage::anthropic_body(tail),
+        }
+        .unwrap_or_default();
+
+        let m = &self.state.metrics;
+        // Pre-resolved fixed-label children (see `Metrics`) — no per-call `with_label_values` lookup.
+        m.tokens_input.inc_by(usage.input_tokens);
+        m.tokens_output.inc_by(usage.output_tokens);
+        // Cache tokens, too — these are in the `ai.usage` billing log below, but that ships with lag;
+        // the counter is the alerting surface for a cache-hit-rate cliff after a deploy.
+        m.tokens_cache_read.inc_by(usage.cache_read_tokens);
+        m.tokens_cache_write.inc_by(usage.cache_write_tokens);
+        // Read the clock once so the histogram and the `ai.usage` row report the same latency.
+        let elapsed = rc.start.elapsed();
+        let latency_hist = &rc.provider.metrics.upstream_latency_seconds;
+        latency_hist.observe(elapsed.as_secs_f64());
+        // Balance the `active_streams` increment from `response_filter`; `logging` runs once per
+        // request (errors and disconnects included), so an opened stream is always closed here.
+        // NOTE(review): confirm that increment is gated on this same flag, or the gauge underflows.
+        if rc.streaming {
+            m.active_streams.dec();
+        }
+
+        // Emit the usage *fact* on a dedicated target — **managed only**. The event is an
+        // identity-keyed billing record (logfwd/OTLP ships `ai.usage` → ClickHouse → a closed
+        // pricing consumer); BYO carries no Beyond identity, so a BYO event would be a billing row
+        // with `tenant_id=0` — unbillable, unattributable, and a footgun for any consumer that sums
+        // without filtering it out. Aggregate gateway throughput (incl. BYO) is already covered by
+        // the Prometheus metrics above, which is the right tool for non-billing observability.
+        if rc.managed {
+            // Emit BOTH models. `model` is the one the *provider* resolved + billed (echoed in its
+            // response) — the key for pricing AND for reconciling against the provider's invoice,
+            // which itemizes by the pinned snapshot. `requested_model` is the alias the client sent —
+            // product analytics ("what they asked for") and a fallback rate when a snapshot is newer
+            // than the downstream price table. They're equal when the response carried no model (e.g.
+            // an error body), where `model` falls back to the request alias. Both sanitized.
+            let billed = rc.resp_model_scanner.take_model().map(sanitize_model);
+            // Borrow (don't clone) the request alias as the fallback — it is read again just below.
+            let billed_model = billed.as_deref().unwrap_or(&rc.model);
+            info!(
+                target: "ai.usage",
+                request_id = %rc.request_id,
+                tenant_id = rc.tenant_id,
+                vpc_id = rc.vpc_id,
+                provider = rc.provider.name.as_str(),
+                model = billed_model,
+                requested_model = %rc.model,
+                stream = rc.streaming,
+                input_tokens = usage.input_tokens,
+                output_tokens = usage.output_tokens,
+                cache_read_tokens = usage.cache_read_tokens,
+                cache_write_tokens = usage.cache_write_tokens,
+                latency_ms = u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX),
+                "usage"
+            );
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanitize_model_passes_real_ids() {
+        let genuine = [
+            "gpt-4o",
+            "claude-opus-4-8",
+            "openrouter/meta-llama/llama-3.1",
+            "accounts/fireworks/models/llama-v3p1-70b-instruct",
+            "gpt-4o-mini-2024-07-18",
+        ];
+        for model in genuine {
+            assert_eq!(sanitize_model(model.to_owned()), model);
+        }
+    }
+
+    #[test]
+    fn sanitize_model_rejects_json_and_log_injection() {
+        // A quote would terminate the JSON string, a backslash could escape, a newline splits the
+        // record for line-oriented shippers. Each must be recorded as "unknown", never injected.
+        for hostile in [
+            r#"real","injected":"x"#,
+            r#"a\b"#,
+            "line1\nline2",
+            "ctrl\u{0}byte",
+        ] {
+            assert_eq!(sanitize_model(hostile.to_owned()), "unknown");
+        }
+    }
+
+    #[test]
+    fn sanitize_model_rejects_overlong() {
+        assert_eq!(sanitize_model("a".repeat(MAX_MODEL_LEN + 1)), "unknown");
+        // A name of exactly the cap length still passes through unchanged.
+        let at_cap = "a".repeat(MAX_MODEL_LEN);
+        assert_eq!(sanitize_model(at_cap.clone()), at_cap);
+    }
+}
diff --git a/src/ratelimit.rs b/src/ratelimit.rs
new file mode 100644
index 0000000..19caa9d
--- /dev/null
+++ b/src/ratelimit.rs
@@ -0,0 +1,233 @@
+//! Request-rate guardrails — blast-radius circuit breakers, **not** a spend control.
+//!
+//! The deny-set (see `deny`) is the spend/fraud authority, but it's *cumulative* and reacts on a
+//! lag: it only learns of spend after usage facts round-trip through the control plane, and it's
+//! structurally blind to request floods that never bill — auth failures (rejected here, never reach
+//! upstream), provider 4xx, and BYO traffic (on the caller's own key, no Beyond identity). Two tiers
+//! cap velocity, both charged in `proxy::request_filter` *before* the Ed25519 verify and the upstream
+//! connect, so a flood can't drive unbounded crypto/socket work:
+//!
+//! 1. **Per-credential** — keyed by the raw presented credential (the whole `bai_…` virtual key or
+//!    BYO token). Catches a single leaked/runaway key. Granularity is per-credential: managed virtual
+//!    keys are deterministic per `(tenant, app)`, so this is effectively a per-(tenant, app) ceiling —
+//!    one credential's runaway can't throttle another. A flood of *distinct* credentials slips past
+//!    it (every random string is its own bucket), which is what tier 2 exists for.
+//!
+//! 2. **Global BYO aggregate** — a single bucket for *all* BYO traffic combined. BYO is unverified
+//!    and upstream-bound: a flood of distinct random BYO tokens would otherwise open junk-auth
+//!    connections to providers from our egress IPs, getting them rate-limited or banned (we put
+//!    ourselves in the firing line). This bounds that aggregate regardless of how the tokens vary.
+//!    **Managed traffic is exempt** — it's Ed25519-verified before any upstream connect and can't be
+//!    forged (the signing key lives only in the control plane), so a random `bai_` flood fails verify
+//!    and never reaches a provider (CPU only, no egress impact). Exempting it means this shared bucket
+//!    only ever sheds BYO load under a flood, never the core managed tenants.
+//!
+//! Both tiers are deliberately generous: ceilings well above legitimate steady state, so they never
+//! trip in normal operation. Tune from `ai_rejections_total{reason="rate_limit"}` (per-credential)
+//! and `{reason="rate_limit_byo_global"}` (BYO aggregate).
+//!
+//! ## Design decision: why a global BYO cap and not per-source-IP (READ BEFORE CHANGING)
+//!
+//! The threat that shaped tier 2 is **egress-IP reputation**, not gateway CPU. We are an egress proxy:
+//! BYO requests connect outward to OpenAI/Anthropic/OpenRouter/… *from our IPs* carrying the caller's
+//! token. A flood of distinct **junk** BYO tokens makes those providers see a torrent of failed-auth
+//! connections from us and rate-limit or ban our egress IPs — taking down BYO for *everyone*, and
+//! degrading managed traffic that shares the same egress. That blast radius is why this lives here and
+//! is on by default, rather than being pushed entirely to the mesh/ingress.
+//!
+//! **Per-source-IP limiting was considered and rejected** as the primary control. It's the surgical
+//! answer in principle (throttle only the noisy source), but it depends on the calling task's real IP
+//! being visible here — and in production we front this with ECS Service Connect, where it is unclear
+//! whether the peer address is the client task or a collapsed mesh/proxy hop. If it's collapsed,
+//! per-IP keying is worse than nothing: it either does nothing (all sources share one IP, so no single
+//! key trips) or throttles every tenant at once. We refused to hinge an egress-protection control on
+//! an unverified topology assumption. The global BYO cap is **topology-independent** — it bounds the
+//! aggregate no matter how source identity is mangled. (If/when we confirm real per-task IPs reach us,
+//! a per-IP tier is a reasonable *addition* in front of this — not a replacement.)
+//!
+//! ## What this deliberately does NOT cover (the residual — don't assume it's solved)
+//!
+//! - **The BYO cap is a shared bucket.** A flood large enough to hit `byo_rate_limit_rps` *does* shed
+//!   legitimate BYO callers along with the attacker — they're indistinguishable at admit time (we
+//!   reject before we know a token is junk). The trust segmentation (managed exempt) bounds the blast
+//!   radius to BYO only; it does not make the BYO shedding selective.
+//! - **The default ceiling is a guess.** `byo_rate_limit_rps = 1000` was picked without real BYO
+//!   traffic numbers — high enough to clear plausible legitimate use, low enough that a junk flood
+//!   can't realistically get us banned. It is meant to be tuned from the metric, not trusted as-is.
+//! - **A more selective control is the next step, not this.** The surgical fix for egress reputation
+//!   is a **provider-feedback circuit breaker**: watch upstream responses and back BYO off a provider
+//!   when we see a burst of `401`s (junk auth) from it, instead of capping all BYO blindly. That reacts
+//!   to the actual signal (providers rejecting us) and spares legitimate BYO. It's a real feature, not
+//!   a guardrail, so it's intentionally out of scope here. If you're here because the blunt cap hurt,
+//!   build that — don't just raise the number.
+//!
+//! Backed by pingora-limits' `Rate`: count-min-sketch estimators with **fixed memory regardless of
+//! key cardinality** (no per-credential entry, no background GC), matching the deny-set's O(denied)
+//! ethos. A sketch can *over*estimate a key's rate on hash collision but never under, so a cap is
+//! always enforced; `SLOTS` is sized wide enough that overestimation stays negligible.
+
+use pingora_limits::rate::Rate;
+use std::hash::{BuildHasher, RandomState};
+use std::time::Duration;
+
+/// Count-min sketch dimensions for the per-credential tier. The estimator can only *over*estimate a
+/// key's rate (never under — so the cap always holds); the additive error is bounded by
+/// `(e / SLOTS) × N`, where `N` is total req/s across *all* credentials on the node. Sized for a
+/// single high-volume node: at `SLOTS = 65536` that error is ≈ 4.1 (so ≤ ~5) at ~100k req/s —
+/// far under the per-credential ceiling, so a legitimate caller near its limit isn't false-throttled.
+/// `HASHES = 5` sets the tail confidence (≈ `e^-5` ≈ 0.7% of checks may exceed that bound; the
+/// estimate is the min over the 5 rows). Memory is `2 × HASHES × SLOTS × 8 B` = 5 242 880 B ≈
+/// **5 MiB, fixed** regardless of credential cardinality (no per-key entry, no GC). To resize:
+/// `SLOTS ≈ e × peak_N / tolerable_error`.
+const SLOTS: usize = 65536;
+const HASHES: usize = 5; // number of sketch rows; documented together with `SLOTS` above
+
+/// The rate window. Every ceiling is expressed per this interval, i.e. requests/second.
+const WINDOW: Duration = Duration::from_secs(1);
+
+/// The single sketch key the global BYO tier counts everything under (one shared bucket).
+const BYO_GLOBAL_KEY: u8 = 0;
+
+/// The ceiling a rejected request ran into. Handed back to the caller so the rejection metric can
+/// be labelled per tier, which tells an operator *which* limit tripped (and which knob to tune).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Throttled {
+    /// One credential went over its own per-credential ceiling.
+    PerCredential,
+    /// BYO traffic as a whole went over the shared global ceiling.
+    ByoGlobal,
+}
+
+impl Throttled {
+    /// Value for the `reason` label of `ai_rejections_total`. `PerCredential` deliberately keeps
+    /// the original `"rate_limit"` string so dashboards and alerts built on it stay unbroken.
+    pub fn label(self) -> &'static str {
+        match self {
+            Self::PerCredential => "rate_limit",
+            Self::ByoGlobal => "rate_limit_byo_global",
+        }
+    }
+}
+
+/// Two-tier request-rate limiter: each enabled tier is a `Rate` sketch paired with its ceiling.
+pub struct RateLimit {
+    /// `(sketch, max_per_window)` for the per-credential tier. `None` disables it.
+    per_cred: Option<(Rate, isize)>,
+    /// `(sketch, max_per_window)` for the global BYO aggregate tier. `None` disables it.
+    byo_global: Option<(Rate, isize)>,
+    /// Process-random hash state that reduces the raw credential to its per-credential sketch
+    /// key, so the SipHash key is per-process and secret. With a precomputable digest
+    /// (`DefaultHasher` keys on zeros) an attacker could craft tokens that collide into another
+    /// caller's slots and inflate its counter — false throttling. Random seeding prevents that.
+    hasher: RandomState,
+}
+
+impl RateLimit {
+    /// `per_cred_rps` is the per-credential ceiling; `byo_global_rps` is the aggregate BYO ceiling.
+    /// Either tier is disabled by passing `0`. Returns `None` (no limiter at all) only when both are
+    /// `0`, so the hot path can skip it entirely.
+    pub fn new(per_cred_rps: u32, byo_global_rps: u32) -> Option<Self> {
+        if per_cred_rps == 0 && byo_global_rps == 0 {
+            return None;
+        }
+        // Saturate instead of `as`: a ceiling wrapped negative on 32-bit would throttle everything.
+        let ceiling = |rps: u32| isize::try_from(rps).unwrap_or(isize::MAX);
+        Some(Self {
+            per_cred: (per_cred_rps != 0).then(|| {
+                (
+                    Rate::new_with_estimator_config(WINDOW, HASHES, SLOTS),
+                    ceiling(per_cred_rps),
+                )
+            }),
+            // One bucket, so the default estimator is plenty — no need for the wide sketch.
+            byo_global: (byo_global_rps != 0).then(|| (Rate::new(WINDOW), ceiling(byo_global_rps))),
+            // `RandomState::new()` draws a fresh SipHash key from the OS RNG per process.
+            hasher: RandomState::new(),
+        })
+    }
+
+    /// Charge one request. `managed` is `true` for a verified-path (`bai_…`) credential, `false` for
+    /// BYO. Returns `None` when within budget, or `Some(reason)` once a ceiling is crossed — the very
+    /// request that crosses the line is the first one rejected (`observe` returns the running total).
+    /// The per-credential tier runs first, then (BYO only) the shared bucket. The credential itself
+    /// is never stored; only its seeded digest feeds the per-credential sketch.
+    ///
+    /// `#[must_use]`: `observe` has already incremented the counters by the time this returns, so a
+    /// caller that drops the result has *charged* the request but skipped enforcement. The crate's
+    /// `#![deny(unused_must_use)]` only bites with this attribute present, so it's load-bearing.
+    #[must_use = "the throttle decision must be enforced — dropping it charges the request but lets it through"]
+    pub fn check(&self, raw_credential: &str, managed: bool) -> Option<Throttled> {
+        // Per-credential ceiling first: a runaway key (managed or BYO) is rejected here *before* it
+        // is charged to the shared BYO bucket, so one hot key can't get every other BYO caller shed.
+        if let Some((rate, max)) = &self.per_cred {
+            let key = self.hasher.hash_one(raw_credential);
+            if rate.observe(&key, 1) > *max {
+                return Some(Throttled::PerCredential);
+            }
+        }
+        // Global BYO backstop (managed is exempt): bounds a flood of distinct unverified tokens.
+        let byo_bucket = if managed { None } else { self.byo_global.as_ref() };
+        if let Some((rate, max)) = byo_bucket {
+            if rate.observe(&BYO_GLOBAL_KEY, 1) > *max {
+                return Some(Throttled::ByoGlobal);
+            }
+        }
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn both_zero_disables() {
+        assert!(matches!(RateLimit::new(0, 0), None));
+    }
+
+    #[test]
+    fn per_credential_allows_up_to_ceiling_then_rejects() {
+        let limiter = RateLimit::new(5, 0).unwrap();
+        let key = "bai_v1.1.payload.sig";
+        for _ in 0..5 {
+            assert!(limiter.check(key, true).is_none());
+        }
+        // Ceiling is 5, so the sixth request inside the same 1s window is the first one refused.
+        assert_eq!(limiter.check(key, true), Some(Throttled::PerCredential));
+    }
+
+    #[test]
+    fn credentials_have_independent_budgets() {
+        let limiter = RateLimit::new(2, 0).unwrap();
+        assert!(limiter.check("token-1", false).is_none());
+        assert!(limiter.check("token-1", false).is_none());
+        assert_eq!(limiter.check("token-1", false), Some(Throttled::PerCredential)); // budget spent
+        assert!(limiter.check("token-2", false).is_none()); // another credential, its own budget
+    }
+
+    #[test]
+    fn byo_global_caps_distinct_tokens_but_exempts_managed() {
+        // Only the global BYO tier is on (ceiling 3). Distinct BYO tokens would each get their own
+        // per-credential bucket, yet the shared bucket still bounds them as a group.
+        let limiter = RateLimit::new(0, 3).unwrap();
+        for token in ["byo-aaaa", "byo-bbbb", "byo-cccc"] {
+            assert_eq!(limiter.check(token, false), None);
+        }
+        assert_eq!(limiter.check("byo-dddd", false), Some(Throttled::ByoGlobal)); // 4th distinct token
+
+        // Managed credentials never touch the BYO bucket, so a run of distinct `bai_…` keys passes
+        // even though that bucket is already exhausted.
+        for n in 0..10 {
+            assert!(limiter.check(&format!("bai_v1.1.p{n}.s{n}"), true).is_none());
+        }
+    }
+
+    #[test]
+    fn byo_global_does_not_touch_managed_budget() {
+        // Global BYO tier only, ceiling 1: BYO gets capped while a managed request still passes.
+        let limiter = RateLimit::new(0, 1).unwrap();
+        assert!(limiter.check("byo-1", false).is_none());
+        assert_eq!(limiter.check("byo-2", false), Some(Throttled::ByoGlobal)); // bucket exhausted
+        assert!(limiter.check("bai_v1.1.p.s", true).is_none()); // managed is unaffected
+    }
+}
diff --git a/src/route.rs b/src/route.rs
new file mode 100644
index 0000000..e59c5bf
--- /dev/null
+++ b/src/route.rs
@@ -0,0 +1,305 @@
+//! Provider routing and per-provider wire details — **data-driven**.
+//!
+//! The provider is the **first path segment** of the request (`/{provider}/…`); the rest of the path
+//! is forwarded to the upstream **verbatim** (native passthrough — the gateway holds no per-provider
+//! path knowledge). A path with no provider prefix that starts with `/v1` routes by *dialect* —
+//! `/v1/messages*` → `anthropic`, else → `openai` — so an OpenAI/Anthropic client is drop-in by
+//! changing only the host. An unrecognized first segment is a 404 (see `proxy::request_filter`).
+//!
+//! A provider is a *row* in [`KNOWN_PROVIDERS`] (name, upstream authority, dialect, auth scheme) —
+//! adding an OpenAI-wire provider (Groq, DeepSeek, Together, …) is one line there, no new code
+//! paths. Operators can also add/override providers from config (see `state`/`config`). We do not
+//! translate between dialects — that's deliberately out of scope.
+
+use crate::circuit_breaker::CircuitBreaker;
+use crate::metrics::ProviderMetrics;
+use crate::secret::Secret;
+
+/// The default API prefix OpenAI/Anthropic clients use. A request with no provider segment that
+/// starts with this is routed to a default provider via `proxy::dialect_for_path` (the bare-path
+/// drop-in case); anything else with an unknown first segment is a 404.
+pub const DEFAULT_PREFIX: &str = "/v1";
+
+/// Which API surface the client called. Drives usage parsing and the bare-path default provider.
+/// On a provider-prefixed request it's the selected provider's own [`Provider::dialect`]; on a
+/// bare-path request it's derived from the path (`proxy::dialect_for_path`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Dialect {
+    OpenAI,    // OpenAI wire format; the bare-path default for everything but `/v1/messages*`
+    Anthropic, // Anthropic Messages wire format; the bare-path default for `/v1/messages*`
+}
+
+/// Where and how the upstream wants its API key. OpenAI-wire providers take
+/// `Authorization: Bearer <key>`; Anthropic takes the bare key in `x-api-key`. The gateway swaps
+/// the client's virtual key for the real pool key in whichever header applies (see `proxy`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AuthScheme {
+    Bearer,
+    XApiKey,
+}
+
+impl AuthScheme {
+    /// Name of the request header that carries the key for this scheme.
+    pub fn header(self) -> &'static str {
+        match self {
+            Self::XApiKey => "x-api-key",
+            Self::Bearer => "authorization",
+        }
+    }
+
+    /// Render `key` as the value the upstream expects in [`Self::header`].
+    pub fn format(self, key: &str) -> String {
+        match self {
+            Self::XApiKey => key.to_owned(),
+            Self::Bearer => ["Bearer ", key].concat(),
+        }
+    }
+}
+
+/// Static wire facts for a known provider. Adding a provider = one row in [`KNOWN_PROVIDERS`].
+pub struct ProviderSpec {
+    pub name: &'static str, // provider name; also the `/{name}/…` path segment it is reached at
+    /// Default upstream `host:port` (TLS:443). Overridable per-provider via config.
+    pub authority: &'static str,
+    /// The provider's wire format — drives usage parsing and `stream_options` injection eligibility.
+    /// (We forward the client's path verbatim, so the *path* doesn't tell us the wire format; the
+    /// provider does.)
+    pub dialect: Dialect,
+    pub auth: AuthScheme, // which header the upstream expects the key in, and in what form
+}
+
+/// The providers the gateway knows out of the box. All but Anthropic speak the OpenAI wire format
+/// (Bearer auth, chat/completions + embeddings); a new one is a single row here, then reachable at
+/// `/{name}/…`. (Config can add further OpenAI-wire providers or override any authority — see
+/// `state::build_providers`.)
+///
+/// We forward the path after `/{name}` **verbatim**, so the gateway carries no per-provider mount
+/// path — the client uses the provider's native base path (e.g. `/groq/openai/v1/chat/completions`,
+/// `/fireworks/inference/v1/chat/completions`), exactly as it would hitting the provider directly.
+/// Each row's `authority`/`auth` is verified against the provider's **official** docs (cited inline)
+/// as of 2026-05; the client-facing native path is noted alongside as a convenience.
+pub const KNOWN_PROVIDERS: &[ProviderSpec] = &[
+    // docs: https://platform.openai.com/docs/api-reference/authentication — base https://api.openai.com/v1, Bearer.
+    // Client path: /openai/v1/… (or bare /v1/… as the default).
+    ProviderSpec {
+        name: "openai",
+        authority: "api.openai.com:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.claude.com/en/api/messages — base https://api.anthropic.com, Messages at /v1/messages,
+    // auth is `x-api-key` (NOT Bearer). The required `anthropic-version` header is the client's; we pass it through.
+    // Client path: /anthropic/v1/messages (or bare /v1/messages as the default).
+    ProviderSpec {
+        name: "anthropic",
+        authority: "api.anthropic.com:443",
+        dialect: Dialect::Anthropic,
+        auth: AuthScheme::XApiKey,
+    },
+    // docs: https://openrouter.ai/docs/quickstart — base https://openrouter.ai/api/v1, Bearer.
+    // Client path: /openrouter/api/v1/chat/completions.
+    ProviderSpec {
+        name: "openrouter",
+        authority: "openrouter.ai:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.fireworks.ai/tools-sdks/openai-compatibility — base https://api.fireworks.ai/inference/v1, Bearer.
+    // Client path: /fireworks/inference/v1/chat/completions.
+    ProviderSpec {
+        name: "fireworks",
+        authority: "api.fireworks.ai:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://console.groq.com/docs/openai — base https://api.groq.com/openai/v1, Bearer.
+    // Client path: /groq/openai/v1/chat/completions.
+    ProviderSpec {
+        name: "groq",
+        authority: "api.groq.com:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://api-docs.deepseek.com/ — base https://api.deepseek.com/v1 (the `/v1` is an OpenAI-compat alias,
+    // not API versioning); /v1/chat/completions is officially supported. Bearer. Client path: /deepseek/v1/….
+    ProviderSpec {
+        name: "deepseek",
+        authority: "api.deepseek.com:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.together.ai/docs/openai-api-compatibility — base https://api.together.ai/v1, Bearer.
+    // Canonical host is `api.together.ai`; the legacy `api.together.xyz` is still live but no longer documented.
+    // Client path: /together/v1/….
+    ProviderSpec {
+        name: "together",
+        authority: "api.together.ai:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://inference-docs.cerebras.ai/resources/openai — base https://api.cerebras.ai/v1, Bearer.
+    // Client path: /cerebras/v1/….
+    ProviderSpec {
+        name: "cerebras",
+        authority: "api.cerebras.ai:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.mistral.ai/api/ — base https://api.mistral.ai/v1, Bearer. Client path: /mistral/v1/….
+    ProviderSpec {
+        name: "mistral",
+        authority: "api.mistral.ai:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+    // docs: https://docs.x.ai/docs/api-reference — base https://api.x.ai/v1, Bearer. Reasoning models are slow:
+    // the generous read/idle timeouts (see `config`) matter here. Client path: /xai/v1/….
+    ProviderSpec {
+        name: "xai",
+        authority: "api.x.ai:443",
+        dialect: Dialect::OpenAI,
+        auth: AuthScheme::Bearer,
+    },
+];
+
+/// Name of the provider a dialect falls back to. Only used for the **bare-path** request (no
+/// provider segment), whose dialect is derived from the path; a provider-prefixed request names
+/// its provider directly.
+pub fn dialect_default(d: Dialect) -> &'static str {
+    match d {
+        Dialect::Anthropic => "anthropic",
+        Dialect::OpenAI => "openai",
+    }
+}
+
+/// A *resolved* provider: static wire facts + the boot-resolved upstream authority/host + (for
+/// managed traffic) the precomputed pool auth header value. Built once at boot (see
+/// `state::build_providers`); the request hot path holds an `Arc<Provider>` (cheap clone) and
+/// borrows these fields, so nothing is re-allocated or re-formatted per request.
+pub struct Provider {
+    pub name: String, // provider name (owned copy of the name passed to `resolve`)
+    /// Upstream `host:port`.
+    pub authority: String,
+    /// Bare upstream host (SNI / `Host` header) = authority without the port.
+    pub host: String,
+    /// The provider's wire format (usage parsing + injection eligibility). See [`ProviderSpec::dialect`].
+    pub dialect: Dialect,
+    pub auth: AuthScheme, // header + value format the upstream expects the key in
+    /// Precomputed managed auth header value (`Bearer <key>` / bare key). `None` ⇒ no pool key is
+    /// configured for this provider ⇒ managed requests to it are rejected (503). Kept in `Secret`
+    /// for the redacting-`Debug` + zeroize-on-drop hygiene of the underlying key.
+    pub pool_auth_value: Option<Secret>,
+    /// Per-provider metric handles, resolved once here so the response path bumps a direct
+    /// counter/histogram instead of a string-keyed label lookup per response.
+    pub metrics: ProviderMetrics,
+    /// Per-provider circuit breaker, shared across all callers to this provider. `None` when the
+    /// breaker is disabled (`circuit_breaker_threshold == 0`). Checked before connect and fed the
+    /// 5xx/connect outcome — see `proxy`. Lock-free, so the hot path reads it without contention.
+    pub breaker: Option<CircuitBreaker>,
+}
+
+impl Provider {
+    /// Resolve a provider from its name, upstream authority, dialect, auth scheme, optional pool
+    /// key, and pre-resolved metric/breaker handles. Derives the bare host (port stripped, with a
+    /// bracketed IPv6 literal kept whole) and precomputes the managed auth header value once.
+    pub fn resolve(
+        name: &str,
+        authority: String,
+        dialect: Dialect,
+        auth: AuthScheme,
+        pool_key: Option<&str>,
+        metrics: ProviderMetrics,
+        breaker: Option<CircuitBreaker>,
+    ) -> Self {
+        let host = match (authority.starts_with('['), authority.find(']')) {
+            (true, Some(end)) => &authority[..=end], // `[::1]:443` → `[::1]`, not `[`
+            _ => authority.split(':').next().unwrap_or(&authority),
+        }
+        .to_string();
+        let pool_auth_value = pool_key.map(|k| Secret::new(auth.format(k)));
+        Provider {
+            name: name.to_string(),
+            authority,
+            host,
+            dialect,
+            auth,
+            pool_auth_value,
+            metrics,
+            breaker,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn dialect_defaults() {
+        for (dialect, name) in [(Dialect::OpenAI, "openai"), (Dialect::Anthropic, "anthropic")] {
+            assert_eq!(dialect_default(dialect), name);
+        }
+    }
+
+    #[test]
+    fn known_provider_names_are_unique() {
+        let mut names: Vec<&str> = KNOWN_PROVIDERS.iter().map(|spec| spec.name).collect();
+        names.sort_unstable();
+        // Sorting puts equal names side by side, so `dedup` removes every repeat; if the list
+        // shrinks, at least one name was listed twice.
+        let listed = names.len();
+        names.dedup();
+        let distinct = names.len();
+        assert_eq!(listed, distinct, "duplicate provider name in KNOWN_PROVIDERS");
+    }
+
+    #[test]
+    fn anthropic_is_the_only_anthropic_dialect() {
+        // Dialect drives usage parsing + injection, so a wrong wire format mis-meters the
+        // provider. Only the row named "anthropic" may carry the Anthropic dialect.
+        for spec in KNOWN_PROVIDERS {
+            let expected = match spec.name {
+                "anthropic" => Dialect::Anthropic,
+                _ => Dialect::OpenAI,
+            };
+            assert_eq!(spec.dialect, expected, "{} dialect", spec.name);
+        }
+    }
+
+    #[test]
+    fn auth_scheme_formats_and_headers() {
+        assert_eq!(AuthScheme::Bearer.header(), "authorization");
+        assert_eq!(AuthScheme::Bearer.format("k"), "Bearer k");
+        assert_eq!(AuthScheme::XApiKey.header(), "x-api-key");
+        // Anthropic takes the bare key with no `Bearer` prefix; anything else is an upstream 401.
+        assert_eq!(AuthScheme::XApiKey.format("k"), "k");
+    }
+
+    #[test]
+    fn resolve_derives_host_and_pool_auth() {
+        let with_key = Provider::resolve(
+            "openai",
+            "api.openai.com:443".to_string(),
+            Dialect::OpenAI,
+            AuthScheme::Bearer,
+            Some("sk-x"),
+            ProviderMetrics::disconnected(),
+            None,
+        );
+        assert_eq!(with_key.host, "api.openai.com");
+        assert_eq!(with_key.dialect, Dialect::OpenAI);
+        assert_eq!(with_key.pool_auth_value.as_ref().unwrap().expose(), "Bearer sk-x");
+
+        // Without a pool key there is no managed auth value (managed requests to it would 503).
+        let without_key = Provider::resolve(
+            "anthropic",
+            "api.anthropic.com:443".to_string(),
+            Dialect::Anthropic,
+            AuthScheme::XApiKey,
+            None,
+            ProviderMetrics::disconnected(),
+            None,
+        );
+        assert!(without_key.pool_auth_value.is_none());
+    }
+}
diff --git a/src/secret.rs b/src/secret.rs
new file mode 100644
index 0000000..5fcd8d8
--- /dev/null
+++ b/src/secret.rs
@@ -0,0 +1,77 @@
+//! A string secret that won't leak into logs and is scrubbed on drop.
+//!
+//! Hygiene, not a hard control: provider keys are necessarily long-lived in RAM (held for the
+//! process life, copied into Pingora's request headers we don't own), so zeroize-on-drop only
+//! helps at rotation/shutdown. The real protections are SSM-at-rest + never logging + rotation.
+//! What this newtype buys: a redacting `Debug` (so a stray `{:?}` or `tracing` field can't print a
+//! key) and a best-effort scrub when the value is dropped.
+
+use std::fmt;
+use zeroize::Zeroize;
+
+#[derive(Clone)]
+pub struct Secret(String);
+
+impl Secret {
+    /// Take ownership of the plaintext; anything convertible into a `String` is accepted.
+    pub fn new(s: impl Into<String>) -> Self {
+        Secret(s.into())
+    }
+    /// Borrow the plaintext. Call sites should keep the exposure as narrow as possible.
+    pub fn expose(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl From<String> for Secret {
+    fn from(plaintext: String) -> Self {
+        Secret(plaintext)
+    }
+}
+
+impl fmt::Debug for Secret {
+    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(out, "Secret(***)")
+    }
+}
+
+// Deserialization is transparent: a plain string in config (`AI_POOL_KEY_*`, `nats_creds`) loads
+// straight into a `Secret` / `Option<Secret>` field.
+impl<'de> serde::Deserialize<'de> for Secret {
+    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+        <String as serde::Deserialize>::deserialize(d).map(Secret)
+    }
+}
+
+// Serialization **redacts**, for the same reason `Debug` does: a stray
+// `serde_json::to_string(&config)` in a log line must not leak the key. That is sound for our only
+// serialize path (figment's `Serialized::defaults`, where every secret field defaults to `None` and
+// is skipped); a `Secret` holds a key, it never round-trips config. Use `expose` for the plaintext.
+impl serde::Serialize for Secret {
+    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        serializer.serialize_str("***")
+    }
+}
+
+impl Drop for Secret {
+    fn drop(&mut self) {
+        Zeroize::zeroize(&mut self.0);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Secret;
+
+    #[test]
+    fn debug_redacts() {
+        let rendered = format!("{:?}", Secret::new("sk-supersecret"));
+        assert_eq!(rendered, "Secret(***)");
+        assert!(!rendered.contains("supersecret"));
+    }
+
+    #[test]
+    fn expose_returns_plaintext() {
+        assert_eq!(Secret::new("abc").expose(), "abc");
+    }
+}
diff --git a/src/state.rs b/src/state.rs
new file mode 100644
index 0000000..26a7446
--- /dev/null
+++ b/src/state.rs
@@ -0,0 +1,329 @@
+//! Shared gateway state.
+//!
+//! Only the **deny-set** is dynamic (watched from NATS, behind `ArcSwap` for lock-free reads).
+//! Everything else — the signing keyring and the resolved provider registry (upstreams + pool auth
+//! values) — is built once at boot from config (SSM/env), so the auth + key paths have **no runtime
+//! dependency on NATS**.
+
+use crate::config::AiConfig;
+use crate::deny::DenySet;
+use crate::error::{GatewayError, Result};
+use crate::key::Keyring;
+use crate::metrics::{Metrics, ProviderMetrics};
+use crate::ratelimit::RateLimit;
+use crate::route::{self, AuthScheme, Dialect, Provider};
+use arc_swap::ArcSwap;
+use arrayvec::ArrayString;
+use std::collections::HashMap;
+use std::fmt::Write as _;
+use std::net::SocketAddr;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+use tracing::warn;
+
+/// How long `GatewayState::resolve` serves a cached upstream address before looking it up again.
+const DNS_TTL: Duration = Duration::from_secs(60);
+
+/// A process-unique request id, `{instance:x}-{seq:x}`. Each half is a `u64` in hex (≤16 chars),
+/// so with the `-` separator the id is at most 16 + 1 + 16 = 33 bytes and lives inline on the
+/// stack: no heap allocation per request (ids are minted for every request, even fast rejects).
+pub type RequestId = ArrayString<33>;
+
+/// Build the boot-time provider registry, keyed by provider name, from two sources: every
+/// `route::KNOWN_PROVIDERS` entry (its authority replaced by the matching
+/// `config.provider_authorities` entry, if any), plus every other `provider_authorities` entry —
+/// a name outside the known set — registered as an OpenAI-wire provider with Bearer auth. Either
+/// way the pool key (if any) is looked up in `config.pool_keys` by name and handed to
+/// `Provider::resolve` along with per-provider metrics and a dedicated circuit breaker.
+fn build_providers(config: &AiConfig, metrics: &Metrics) -> HashMap<String, Arc<Provider>> {
+    // The breaker holds atomics so it can't be cloned; clone its *config* instead and mint a
+    // fresh, independent breaker for each provider. `None` ⇒ breaker disabled.
+    let cb_config = config.circuit_breaker_config();
+
+    // What both registration paths share: pool-key lookup, metrics handle, fresh breaker.
+    let resolve_one = |name: &str, authority: String, dialect: Dialect, auth: AuthScheme| {
+        let pool_key = config.pool_keys.get(name).map(|key| key.expose());
+        Arc::new(Provider::resolve(
+            name,
+            authority,
+            dialect,
+            auth,
+            pool_key,
+            ProviderMetrics::resolve(metrics, name),
+            cb_config
+                .clone()
+                .map(crate::circuit_breaker::CircuitBreaker::new),
+        ))
+    };
+
+    let mut providers = HashMap::new();
+    // Known providers: built-in dialect + auth scheme; only the authority is overridable.
+    for spec in route::KNOWN_PROVIDERS {
+        let authority = match config.provider_authorities.get(spec.name) {
+            Some(overridden) => overridden.clone(),
+            None => spec.authority.to_string(),
+        };
+        providers.insert(
+            spec.name.to_string(),
+            resolve_one(spec.name, authority, spec.dialect, spec.auth),
+        );
+    }
+
+    // Config-only providers (name not in the known set): assume OpenAI-wire (Bearer). A non-OpenAI
+    // wire format would need real code, so we don't pretend to support it from config alone.
+    for (name, authority) in &config.provider_authorities {
+        if providers.contains_key(name) {
+            continue;
+        }
+        let provider = resolve_one(
+            name.as_str(),
+            authority.clone(),
+            Dialect::OpenAI,
+            AuthScheme::Bearer,
+        );
+        providers.insert(name.clone(), provider);
+    }
+    providers
+}
+
+/// Process-wide gateway state: boot-time config-derived parts plus the NATS-watched deny-set.
+pub struct GatewayState {
+    pub config: AiConfig,
+    pub metrics: Arc<Metrics>,
+
+    /// Trusted Ed25519 public keys by kid — from config (rotate via redeploy). Static for life.
+    pub keyring: Keyring,
+    /// Resolved providers by name (upstream authority/host + precomputed managed auth value). Built
+    /// once at boot from `route::KNOWN_PROVIDERS` + config; the request path clones the `Arc`.
+    providers: HashMap<String, Arc<Provider>>,
+
+    /// Sparse deny-set — the ONE thing watched from NATS. Default-allow on miss; fail-open.
+    pub deny: ArcSwap<DenySet>,
+
+    /// Per-key request-rate guardrail (see `ratelimit`). `None` when `rate_limit_rps == 0`. Fixed
+    /// memory regardless of tenant count, so it lives in the static state with no GC.
+    pub rate_limit: Option<RateLimit>,
+
+    /// TTL cache of resolved upstream addresses, so `upstream_peer` neither blocks on a synchronous
+    /// `getaddrinfo` nor re-resolves the same provider host every request. `ArcSwap` so the common
+    /// case — a cache hit, on every admitted request after warmup — is a lock-free atomic load; the
+    /// only writes are the ~10 providers' entries refreshed once per `DNS_TTL`, applied via `rcu`.
+    dns_cache: ArcSwap<HashMap<String, (SocketAddr, Instant)>>,
+
+    /// Per-process instance token (8 OS-random bytes), the high half of every `request_id`. Random
+    /// rather than a uuid dep, so log lines from two gateways don't collide when aggregated — and
+    /// rather than the boot wall-clock, which collides when instances boot in the same nanosecond.
+    instance_id: u64,
+    /// Monotonic per-request counter, the low half of `request_id`. A relaxed `fetch_add` — the only
+    /// requirement is uniqueness within the process, not cross-request ordering.
+    request_seq: AtomicU64,
+}
+
+impl GatewayState {
+    /// Build the shared state from config: keyring, provider registry, rate limiter, instance
+    /// token. Errors if the keyring can't be built, or on `require_signing_keys` with no keys.
+    pub fn new(config: AiConfig, metrics: Arc<Metrics>) -> Result<Arc<Self>> {
+        let keyring = config.build_keyring()?;
+        // With no signing keys every `bai_…` fails verify and falls through to BYO treatment: no
+        // key swap, no deny-set, no `ai.usage` billing. That's a *valid* mode (a BYO-only
+        // deployment), but far more often it's a missing/typo'd `signing_keys` (SSM param, env) —
+        // a deploy that looks healthy while silently dropping all billing. A managed deployment
+        // sets `require_signing_keys = true` to turn that mis-deploy into a hard, visible boot
+        // failure; otherwise we warn loudly and continue (the test/e2e harnesses run keyless).
+        let keyless = config.signing_keys.is_empty();
+        if keyless && config.require_signing_keys {
+            return Err(GatewayError::Config(
+                "require_signing_keys is set but no signing_keys are configured — refusing to \
+                 boot into silent BYO-only mode (no key swap, no deny-set, no billing). Check \
+                 the signing_keys config / SSM param."
+                    .to_string(),
+            ));
+        }
+        if keyless {
+            warn!(
+                "no signing_keys configured — all managed (bai_) traffic will be treated as BYO \
+                 (no key swap, no deny-set, no billing). Expected only for a BYO-only deployment."
+            );
+        }
+
+        let providers = build_providers(&config, &metrics);
+        let rate_limit = RateLimit::new(config.rate_limit_rps, config.byo_rate_limit_rps);
+
+        // Instance token: 8 OS-random bytes, so two gateways' request_ids never collide when
+        // aggregated — even if a rapid scale-up boots several instances within the same
+        // nanosecond, which a wall-clock token can't distinguish. If the OS RNG is somehow
+        // unavailable, degrade to the boot wall-clock (or 0) rather than fail to start.
+        let mut seed = [0u8; 8];
+        let instance_id = if getrandom::fill(&mut seed).is_ok() {
+            u64::from_le_bytes(seed)
+        } else {
+            let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH);
+            since_epoch.map_or(0, |elapsed| elapsed.as_nanos() as u64)
+        };
+
+        Ok(Arc::new(Self {
+            metrics,
+            keyring,
+            providers,
+            deny: ArcSwap::from_pointee(DenySet::new()),
+            rate_limit,
+            dns_cache: ArcSwap::from_pointee(HashMap::new()),
+            instance_id,
+            request_seq: AtomicU64::new(0),
+            config,
+        }))
+    }
+
+    /// Mint a process-unique request id (`{instance}-{seq}`), used for log correlation and the
+    /// `x-beyond-request-id` response header. Deliberately *not* a uuid: the instance token is
+    /// computed once at boot, so each id costs one relaxed `fetch_add` plus a hex format into a
+    /// stack buffer — no heap allocation and no per-request randomness — while the token/counter
+    /// pair stays unique across the fleet.
+    pub fn next_request_id(&self) -> RequestId {
+        let mut id = RequestId::new();
+        let seq = self.request_seq.fetch_add(1, Ordering::Relaxed);
+        // Two `u64`s in hex plus `-` need at most 33 bytes — the buffer's exact capacity — so this
+        // `write!` cannot fail today. Should a future format change ever exceed the cap, a truncated
+        // id beats a panic on what is only a correlation aid, hence the discarded result.
+        let _ = write!(id, "{:x}-{:x}", self.instance_id, seq);
+        id
+    }
+
+    /// Look up the resolved provider registered under `name` (the request's first path segment, or
+    /// the bare-path dialect default). `None` means no such provider is registered — which
+    /// `request_filter` turns into a 404.
+    pub fn provider(&self, name: &str) -> Option<&Arc<Provider>> {
+        self.providers.get(name)
+    }
+
+    /// Resolve a `host:port` authority to a `SocketAddr`, reusing a cached answer for `DNS_TTL`.
+    /// Misses go through `tokio::net::lookup_host` (runs `getaddrinfo` on the blocking pool —
+    /// async-safe) instead of `HttpPeer::new`'s eager blocking resolve.
+    pub async fn resolve(&self, authority: &str) -> Result<SocketAddr> {
+        // Cache hit (the common case after warmup): a lock-free `ArcSwap` load — no mutex, no
+        // syscall — so concurrent workers never serialize on an already-resolved host.
+        let cached = self.dns_cache.load().get(authority).copied();
+        if let Some((addr, _)) = cached.filter(|(_, at)| at.elapsed() < DNS_TTL) {
+            return Ok(addr);
+        }
+
+        let mut candidates = tokio::net::lookup_host(authority)
+            .await
+            .map_err(|e| GatewayError::Dns(format!("{authority}: {e}")))?;
+        let Some(addr) = candidates.next() else {
+            return Err(GatewayError::Dns(format!("{authority}: no addresses")));
+        };
+
+        // Publish the new/refreshed entry via `rcu`. Two concurrent misses for one host may both
+        // resolve and both publish; that's harmless (same answer, last writer wins) and far cheaper
+        // than holding a lock across `getaddrinfo`. The clone-on-write copies a ~10-entry map, and
+        // only on the rare miss/refresh path — never on a hit.
+        //
+        // While paying for the clone, also sweep long-dead entries. Keys are provider authorities
+        // from the boot-time registry, so the map is bounded by provider count, not traffic; the
+        // sweep is belt-and-suspenders against authorities ever becoming dynamic. It is a *TTL*
+        // drop, not an eviction *policy* (no capacity contest, so LRU/SIEVE would solve a problem
+        // we don't have), and it keeps anything within `2 × DNS_TTL` so a live provider whose entry
+        // just expired (and is about to be refreshed) is never dropped under a concurrent resolve.
+        let now = Instant::now();
+        self.dns_cache.rcu(|current| {
+            let mut refreshed = HashMap::clone(current);
+            refreshed.retain(|_, (_, at)| now.duration_since(*at) < DNS_TTL * 2);
+            refreshed.insert(authority.to_owned(), (addr, now));
+            refreshed
+        });
+        Ok(addr)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::route::AuthScheme;
+    use crate::secret::Secret;
+
+    /// `Metrics` registers on the default Prometheus registry, which rejects a second
+    /// registration — so every test needing a `GatewayState` shares this one process-wide handle.
+    fn test_metrics() -> Arc<Metrics> {
+        use std::sync::OnceLock;
+        static SHARED: OnceLock<Arc<Metrics>> = OnceLock::new();
+        let metrics = SHARED.get_or_init(|| Metrics::new().expect("register metrics once"));
+        Arc::clone(metrics)
+    }
+
+    #[test]
+    fn registry_resolves_known_overrides_and_additions() {
+        // Fixture: `openai` (known) gets an authority override and a pool key; `custom` is
+        // config-only with a pool key; `custom2` is config-only with **no** pool key — the
+        // condition that makes a managed request to it 503 (no managed auth value to swap in).
+        let entry = |name: &str, value: &str| (name.to_owned(), value.to_owned());
+        let config = AiConfig {
+            provider_authorities: HashMap::from([
+                entry("openai", "127.0.0.1:9"),
+                entry("custom", "llm.internal:8443"),
+                entry("custom2", "other.internal:8443"),
+            ]),
+            pool_keys: HashMap::from([
+                ("openai".to_owned(), Secret::new("sk-openai")),
+                ("custom".to_owned(), Secret::new("sk-custom")),
+            ]),
+            ..Default::default()
+        };
+        let registry = build_providers(&config, &test_metrics());
+
+        // Known provider: authority overridden, pool auth precomputed in the right scheme.
+        let openai = &registry["openai"];
+        assert_eq!(openai.auth, AuthScheme::Bearer);
+        assert_eq!(openai.authority, "127.0.0.1:9");
+        assert_eq!(
+            openai.pool_auth_value.as_ref().unwrap().expose(),
+            "Bearer sk-openai"
+        );
+
+        // Known provider, no override: built-in default + no pool key ⇒ no managed auth value.
+        let anthropic = &registry["anthropic"];
+        assert_eq!(anthropic.auth, AuthScheme::XApiKey);
+        assert_eq!(anthropic.authority, "api.anthropic.com:443");
+        assert!(anthropic.pool_auth_value.is_none());
+
+        // Config-only provider: added as OpenAI-wire (Bearer), reachable by name.
+        let custom = &registry["custom"];
+        assert_eq!(custom.host, "llm.internal");
+        assert_eq!(
+            custom.pool_auth_value.as_ref().unwrap().expose(),
+            "Bearer sk-custom"
+        );
+
+        // Config-only provider with no pool key: still registered (reachable by name, BYO to it
+        // works) but with no managed auth value — the `None` `request_filter` turns into a 503.
+        let custom2 = &registry["custom2"];
+        assert!(
+            custom2.pool_auth_value.is_none(),
+            "a provider with no configured pool key must have no managed auth value (→ 503)"
+        );
+    }
+
+    #[tokio::test]
+    async fn resolve_caches_hit_and_errors_on_bad_host() {
+        // `resolve` sits on the request hot path (every admitted request hits `upstream_peer`), so
+        // cover its three outcomes: a successful resolve, a cache hit returning the same address,
+        // and a lookup failure surfacing as `GatewayError::Dns` (not a panic/hang).
+        const LOOPBACK: &str = "127.0.0.1:9";
+        let config = AiConfig::default();
+        let state = GatewayState::new(config, test_metrics()).unwrap();
+
+        // An IP literal resolves through `lookup_host` without real DNS — deterministic, offline-safe.
+        let first = state.resolve(LOOPBACK).await.unwrap();
+        assert_eq!(first, LOOPBACK.parse().unwrap());
+
+        // Second call is served from the TTL cache: same answer, and the entry is now present.
+        assert_eq!(state.resolve(LOOPBACK).await.unwrap(), first);
+        assert!(state.dns_cache.load().contains_key(LOOPBACK));
+
+        // A guaranteed-NXDOMAIN host (RFC 6761 reserves `.invalid`) must come back as a `Dns`
+        // error, never a panic.
+        let missing = state.resolve("nonexistent.invalid:80").await;
+        assert!(matches!(missing, Err(GatewayError::Dns(_))));
+    }
+}
diff --git a/src/store_watch.rs b/src/store_watch.rs
new file mode 100644
index 0000000..72c3602
--- /dev/null
+++ b/src/store_watch.rs
@@ -0,0 +1,459 @@
+//! slipstream deny-set watcher — the gateway's **only** use of NATS.
+//!
+//! Seeds the deny-set at boot, then streams deltas. **Fail-open**: a NATS blip keeps the last-known
+//! set (we never clear), so an outage degrades to a stale deny-set, not "reject everything". Auth
+//! and pool/signing keys come from config, so they're unaffected by NATS being down — only
+//! spend/fraud enforcement goes stale.
+//!
+//! Seeding has two modes, chosen by `config.snapshot_path`:
+//!
+//! - **Unset (ephemeral, e.g. Fargate):** scan `blackhole.*` from NATS on first connect. The resume
+//!   revision is kept *in memory* across reconnects, so a NATS blip resumes the watch from where it
+//!   left off (gap-free) rather than re-scanning.
+//! - **Set (edge/tunnel, durable disk):** load slipstream's on-disk snapshot (entries + a saved
+//!   watch cursor), seed from it, and resume the watch from that cursor — a restart skips the scan
+//!   and enforces immediately, even before NATS reconnects. Every applied delta is appended back to
+//!   the snapshot so the file tracks the live set.
+//!
+//! Either way the watch resumes from a **revision** (`watch_prefix_from`), never a bare
+//! `watch_prefix`: the latter uses NATS `DeliverPolicy::New` (no replay), so a deny entry written in
+//! the window between seeding and the subscription attaching would be silently lost. Resuming from
+//! the seeded revision closes that window with no gap and no double-apply (it starts strictly after
+//! the seeded revision). If the backend compacted past the cursor (`CursorExpired`), we drop back to
+//! a fresh scan, which re-establishes a valid baseline.
+//!
+//! Runs as a Pingora `BackgroundService` so the NATS client is created on the serving runtime
+//! (async-nats ties its tasks to the runtime it's built on; connecting earlier would break it).
+
+use crate::deny::{self, DenySet};
+use crate::state::GatewayState;
+use async_trait::async_trait;
+use pingora_core::server::ShutdownWatch;
+use pingora_core::services::background::BackgroundService;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+use store::snapshot::SnapshotWriter;
+use store::{
+    Connection, KvEntry, KvError, KvStore, KvUpdate, NatsConnection, NatsConnectionConfig,
+    StoreConfig, WatchCursor,
+};
+use tracing::{error, info, warn};
+
+/// Key prefix of deny-set entries in the KV bucket (`blackhole.{tenant}`); the seed scan uses it.
+const BLACKHOLE_PREFIX: &str = "blackhole.";
+
+/// Compact the on-disk snapshot once it grows past this many bytes of appended deltas. The deny-set
+/// is low-churn, so this is rarely hit; it just bounds the log if a tenant flaps.
+const SNAPSHOT_COMPACT_THRESHOLD: u64 = 1024 * 1024;
+
+/// Reconnect backoff: 1s doubling to a 30s cap — quiet in a long outage, yet quick to recover.
+const RECONNECT_BACKOFF_BASE: Duration = Duration::from_secs(1);
+const RECONNECT_BACKOFF_MAX: Duration = Duration::from_secs(30);
+
+/// Pingora background service that keeps `GatewayState::deny` in sync with the slipstream bucket.
+pub struct WatcherService {
+    pub state: Arc<GatewayState>,
+}
+
+#[async_trait]
+impl BackgroundService for WatcherService {
+    /// Optionally seed the deny-set from the on-disk snapshot (`config.snapshot_path`), then loop
+    /// connect → `watch_deny` → backoff sleep until `shutdown` fires.
+    async fn start(&self, mut shutdown: ShutdownWatch) {
+        // Resume position + on-disk snapshot writer persist across reconnects: a NATS blip resumes
+        // the watch from `cursor` instead of re-scanning, and `seeded` stays true so we don't reseed.
+        let mut cursor = WatchCursor::none();
+        let mut writer: Option<SnapshotWriter> = None;
+        let mut seeded = false;
+
+        if let Some(path) = self.state.config.snapshot_path.clone() {
+            let path = PathBuf::from(path);
+            // Snapshot I/O is synchronous (whole-file read/rewrite) — offload it so we never stall
+            // the serving runtime this BackgroundService shares with the proxy.
+            let load_path = path.clone();
+            match tokio::task::spawn_blocking(move || store::snapshot::load(&load_path)).await {
+                Ok(Ok(Some(snap))) => {
+                    let set = denyset_from_entries(snap.entries.values());
+                    info!(count = set.len(), "seeded deny-set from on-disk snapshot");
+                    self.state.metrics.deny_set_size.set(set.len() as i64);
+                    self.state.deny.store(Arc::new(set));
+                    // Only a snapshot that carries a saved cursor counts as seeded: without a resume
+                    // point a bare watch would race, so we fall through to a NATS scan on connect.
+                    if !snap.cursor.is_none() {
+                        cursor = snap.cursor;
+                        seeded = true;
+                    }
+                }
+                Ok(Ok(None)) => info!("no on-disk snapshot yet; will seed from a NATS scan"),
+                Ok(Err(e)) => warn!(error = %e, "snapshot load failed; will seed from a NATS scan"),
+                Err(e) => warn!(error = %e, "snapshot load task panicked; seeding from NATS"),
+            }
+            let open_path = path.clone();
+            match tokio::task::spawn_blocking(move || {
+                SnapshotWriter::open(&open_path, SNAPSHOT_COMPACT_THRESHOLD)
+            })
+            .await
+            {
+                Ok(Ok(w)) => writer = Some(w),
+                Ok(Err(e)) => warn!(error = %e, "snapshot writer open failed; running without it"),
+                Err(e) => warn!(error = %e, "snapshot writer open task panicked"),
+            }
+        }
+
+        // Reconnect backoff: 1s doubling to a 30s cap, reset on every successful connect. A fixed
+        // short retry would log at a constant rate through a long outage (minutes to hours), burying
+        // other signals; the gateway serves correctly on the stale set throughout regardless.
+        let mut backoff = RECONNECT_BACKOFF_BASE;
+        loop {
+            // Connect, but bail immediately on shutdown mid-connect rather than blocking teardown.
+            let store = tokio::select! {
+                _ = shutdown.changed() => {
+                    info!(
+                        in_flight = self.state.metrics.requests_in_flight.get(),
+                        "shutdown signaled; deny-set watcher exiting"
+                    );
+                    return;
+                }
+                outcome = connect(&self.state) => match outcome {
+                    Ok(store) => store,
+                    Err(e) => {
+                        self.state.metrics.nats_connected.set(0);
+                        error!(error = %e, backoff_secs = backoff.as_secs(), "slipstream connect failed; retrying");
+                        // Reconnect backoff, also interruptible by shutdown.
+                        tokio::select! {
+                            _ = shutdown.changed() => return,
+                            _ = tokio::time::sleep(backoff) => {
+                                backoff = (backoff * 2).min(RECONNECT_BACKOFF_MAX);
+                                continue;
+                            }
+                        }
+                    }
+                },
+            };
+
+            backoff = RECONNECT_BACKOFF_BASE;
+            self.state.metrics.nats_connected.set(1);
+            info!("slipstream connected; watching deny-set");
+            // `watch_deny` returns `true` iff it exited on shutdown — then stop, don't reconnect.
+            if watch_deny(
+                &self.state,
+                store,
+                &mut cursor,
+                &mut writer,
+                &mut seeded,
+                &mut shutdown,
+            )
+            .await
+            {
+                info!("shutdown signaled; deny-set watcher exiting");
+                return;
+            }
+            self.state.metrics.nats_connected.set(0);
+            warn!("deny-set watch exited; reconnecting");
+            // NOTE(review): `backoff` is reset on every successful connect, so when connects succeed
+            // but the watch keeps exiting this doubling never accumulates — confirm intended.
+            tokio::select! {
+                _ = shutdown.changed() => return,
+                _ = tokio::time::sleep(backoff) => {
+                    backoff = (backoff * 2).min(RECONNECT_BACKOFF_MAX);
+                }
+            }
+        }
+    }
+}
+
+/// Collect KV entries into a `DenySet`, skipping any whose key isn't a `blackhole.{tenant}`.
+fn denyset_from_entries<'a>(entries: impl Iterator<Item = &'a KvEntry>) -> DenySet {
+    entries
+        .filter_map(|e| deny::parse_key(&e.key).map(|k| (k, deny::parse_reason(&e.value))))
+        .collect()
+}
+
+/// Rewrite the on-disk snapshot from a fresh scan: remove the old log, write one `Put` per live
+/// entry, and checkpoint `cursor`. Returns the reopened writer, or `None` if the rewrite failed
+/// (the gateway then runs snapshot-less — the in-memory deny-set is unaffected). Synchronous file
+/// I/O, so it runs on a blocking thread off the serving runtime.
+async fn rebuild_snapshot(
+    path: PathBuf,
+    entries: Vec<KvEntry>,
+    cursor: WatchCursor,
+) -> Option<SnapshotWriter> {
+    let res = tokio::task::spawn_blocking(
+        move || -> Result<SnapshotWriter, store::snapshot::SnapshotError> {
+            // Remove the old log first so a later load can't replay a deleted-but-uncompacted key.
+            // A failed removal is *not* ignorable: if `SnapshotWriter::open` then appends to the
+            // surviving file, a compacted-away `Delete` can't undo its stale `Put`, and a later
+            // `load()` resurrects a tenant we no longer deny — the exact corruption this rebuild
+            // exists to prevent. `NotFound` is the expected, benign case (first boot, or scratch
+            // storage); any other error aborts the rebuild so we run snapshot-less instead.
+            match std::fs::remove_file(&path) {
+                Ok(()) => {}
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+                Err(e) => return Err(e.into()),
+            }
+            let mut w = SnapshotWriter::open(&path, SNAPSHOT_COMPACT_THRESHOLD)?;
+            // The closure owns `entries`, so each one moves into its `Put` — no per-entry clone.
+            for entry in entries {
+                w.write_update(&KvUpdate::Put(entry))?;
+            }
+            w.checkpoint(&cursor)?;
+            Ok(w)
+        },
+    )
+    .await;
+    match res {
+        Ok(Ok(w)) => Some(w),
+        Ok(Err(e)) => {
+            warn!(error = %e, "snapshot rebuild failed; running without on-disk snapshot");
+            None
+        }
+        Err(e) => {
+            warn!(error = %e, "snapshot rebuild task panicked");
+            None
+        }
+    }
+}
+
+/// Open a NATS connection from config and return a handle to the `config_bucket` KV store.
+async fn connect(state: &GatewayState) -> crate::error::Result<Arc<dyn KvStore>> {
+    let cfg = &state.config;
+    // `expose().to_owned()` lifts the creds out of our `Secret` into the plain `String` the store's
+    // config requires. That doesn't widen the leak surface: `NatsConnectionConfig` has a hand-written
+    // redacting `Debug` (prints `creds: [redacted]`), so a stray `{:?}` on it can't print the
+    // credential. The copy is necessarily un-zeroized for the connection's life (the store owns it) —
+    // the trade-off pool keys make in Pingora's headers. Redaction, not zeroization, is the control.
+    let creds = cfg.nats_creds.as_ref().map(|key| key.expose().to_owned());
+    let nats = NatsConnectionConfig {
+        url: cfg.nats_url.clone(),
+        creds,
+        creds_file: cfg.nats_creds_file.clone(),
+    };
+    let conn = NatsConnection::new(nats);
+    conn.connect().await?;
+    let bucket = StoreConfig {
+        name: cfg.config_bucket.clone(),
+        ..Default::default()
+    };
+    let store = conn.store_with_config(bucket).await?;
+    Ok(store)
+}
+
+/// Seed (if needed) and stream deny-set deltas until the watch ends or shutdown is signaled.
+/// Returns `true` iff it exited because `shutdown` fired — the caller then stops reconnecting.
+///
+/// Every other exit (scan failure, store without a watcher, watch stream ended) returns `false`.
+///
+/// - `store`: the connected KV bucket that is scanned and watched.
+/// - `cursor`: resume point. Set to the scan baseline when seeding, advanced after each applied
+///   delta, and reset to `WatchCursor::none()` on `CursorExpired`.
+/// - `writer`: optional on-disk snapshot writer. Replaced by the result of `rebuild_snapshot` after
+///   a seed, and handed to `persist_update` for every delta.
+/// - `seeded`: whether a usable baseline exists. Set after a successful scan, cleared on
+///   `CursorExpired` so the next call rescans.
+/// - `shutdown`: polled alongside the delta channel; a change aborts the watch task.
+async fn watch_deny(
+    state: &Arc<GatewayState>,
+    store: Arc<dyn KvStore>,
+    cursor: &mut WatchCursor,
+    writer: &mut Option<SnapshotWriter>,
+    seeded: &mut bool,
+    shutdown: &mut ShutdownWatch,
+) -> bool {
+    // Seed once, on the first connect that lacks a usable resume point (cold boot with no snapshot,
+    // or after a `CursorExpired` reset). A NATS scan is a point-in-time read of the live set; the
+    // highest revision among its entries is the baseline the watch resumes strictly after. An empty
+    // set ⇒ revision 0 ⇒ resume from the start of history (the deny bucket is low-churn, so a full
+    // replay is cheap and still gap-free).
+    if !*seeded {
+        match store.reader().scan(BLACKHOLE_PREFIX).await {
+            Ok(entries) => {
+                // Entries whose version has no `u64` form are ignored when picking the baseline.
+                let baseline_rev = entries
+                    .iter()
+                    .filter_map(|e| e.version.as_u64())
+                    .max()
+                    .unwrap_or(0);
+                let set = denyset_from_entries(entries.iter());
+                info!(
+                    count = set.len(),
+                    revision = baseline_rev,
+                    "seeded deny-set from scan"
+                );
+                state.metrics.deny_set_size.set(set.len() as i64);
+                // Replace the live set wholesale: the scan is authoritative.
+                state.deny.store(Arc::new(set));
+                *cursor = WatchCursor::from_u64(baseline_rev);
+                // Persist the freshly-scanned baseline so a later restart can skip the scan. We
+                // *rebuild* the file (not append): this path runs on a cold boot or after a
+                // `CursorExpired` reset, and a stale prior log could otherwise contain a `Put` for a
+                // tenant deleted while we were offline — whose `Delete` was compacted away — which a
+                // later `load()` would replay and resurrect (wrongly re-denying a tenant). A clean
+                // rewrite from the live scan makes the on-disk state exactly match NATS.
+                if writer.is_some() {
+                    if let Some(path) = state.config.snapshot_path.clone() {
+                        *writer =
+                            rebuild_snapshot(PathBuf::from(path), entries, cursor.clone()).await;
+                    }
+                }
+                *seeded = true;
+            }
+            Err(e) => {
+                // No baseline yet — serve whatever's already in memory (fail-open) and let the
+                // reconnect loop retry the scan.
+                warn!(error = %e, "deny-set scan failed; serving current set, will retry");
+                return false;
+            }
+        }
+    }
+
+    // Stream deltas, resuming from the seeded revision. Never a bare `watch_prefix` (DeliverPolicy::
+    // New) — that would drop anything written in the seed→subscribe window.
+    let Some(watcher) = store.watcher() else {
+        warn!("store has no watcher; deny-set will not update");
+        return false;
+    };
+    // The watch runs in its own task and feeds this loop through a bounded channel. The sender is
+    // moved into that task, and `rx.recv()` returning `None` is how the loop below learns the watch
+    // is over.
+    let (tx, mut rx) = tokio::sync::mpsc::channel::<KvUpdate>(256);
+    let w = watcher.clone();
+    let start_cursor = cursor.clone();
+    let watch = tokio::spawn(async move {
+        w.watch_prefix_from(BLACKHOLE_PREFIX, &start_cursor, tx)
+            .await
+    });
+
+    // Updates are rcu (clone-on-write); the set is tiny (O(denied)). Each applied delta also
+    // advances the in-memory cursor (so a reconnect resumes from here) and is appended to the
+    // on-disk snapshot if one is configured. We `select!` on shutdown so a quiet stream (no deltas
+    // arriving) doesn't pin the task open through teardown; `select!` can only switch at an await
+    // point — between updates — so we never abort mid-`persist_update`, leaving the snapshot intact.
+    loop {
+        let update = tokio::select! {
+            _ = shutdown.changed() => {
+                watch.abort();
+                return true;
+            }
+            update = rx.recv() => match update {
+                Some(u) => u,
+                None => break,
+            },
+        };
+        state.deny.rcu(|cur| {
+            let mut set = (**cur).clone();
+            match &update {
+                // Keys that `parse_key` rejects are ignored rather than treated as errors.
+                KvUpdate::Put(e) => {
+                    if let Some(t) = deny::parse_key(&e.key) {
+                        set.insert(t, deny::parse_reason(&e.value));
+                    }
+                }
+                // Delete/Purge = restore (explicit delete or TTL expiry).
+                KvUpdate::Delete { key, .. } | KvUpdate::Purge { key, .. } => {
+                    if let Some(t) = deny::parse_key(key) {
+                        set.remove(t);
+                    }
+                }
+            }
+            Arc::new(set)
+        });
+        // Reflect the new cardinality. A lock-free load of the set we just swapped in — cheap, and
+        // the deltas are low-churn, so this is far off any hot path.
+        state
+            .metrics
+            .deny_set_size
+            .set(state.deny.load().len() as i64);
+        *cursor = WatchCursor::from_version(update.version().clone());
+        persist_update(writer, &update, cursor).await;
+    }
+
+    // The watch ended (NATS dropped, or the cursor was compacted away). Inspect why so a compacted
+    // cursor forces a fresh scan on the next connect instead of resuming from a dead revision.
+    match watch.await {
+        Ok(Ok(())) => {}
+        Ok(Err(KvError::CursorExpired)) => {
+            warn!("deny-set resume cursor expired (history compacted past it); will rescan");
+            *seeded = false;
+            *cursor = WatchCursor::none();
+        }
+        Ok(Err(e)) => warn!(error = %e, "deny-set watch ended"),
+        // A join error: the spawned watch task did not run to completion.
+        Err(e) => warn!(error = %e, "deny-set watch task panicked"),
+    }
+    false
+}
+
+/// Append one applied delta to the on-disk snapshot (if configured) and checkpoint the cursor.
+/// `write_update`/`checkpoint` are buffered/`write(2)` and cheap; `compact` reads+rewrites the whole
+/// file, so it's offloaded off the serving runtime when the log crosses its threshold.
+///
+/// - `writer`: the snapshot writer, or `None` when snapshots are off. Left as `None` after a write
+///   or compaction failure, so later calls become no-ops.
+/// - `update`: the delta that was just applied to the in-memory deny-set.
+/// - `cursor`: the resume point *after* `update`; only recorded once `update` itself was written.
+///
+/// Never fails: every error is logged and degrades to running snapshot-less. The deny-set itself is
+/// unaffected (it lives in the ArcSwap, fed by NATS).
+async fn persist_update(
+    writer: &mut Option<SnapshotWriter>,
+    update: &KvUpdate,
+    cursor: &WatchCursor,
+) {
+    let Some(w) = writer.as_mut() else {
+        return;
+    };
+
+    // The checkpoint must never run ahead of the log. If the delta could not be written, recording
+    // `cursor` (now, or on a later successful update) would tell the next `load()` to resume *after*
+    // a change the file does not contain, and that change would be lost across a restart. Stop
+    // writing instead: the file keeps its last consistent state + cursor, and the next boot replays
+    // from there.
+    if let Err(e) = w.write_update(update) {
+        warn!(error = %e, "snapshot write failed; disabling snapshot writer");
+        *writer = None;
+        return;
+    }
+
+    // A failed checkpoint is harmless by comparison: the delta is in the log and the stale on-disk
+    // cursor only means it is replayed again after a restart.
+    let needs_compact = match w.checkpoint(cursor) {
+        Ok(b) => b,
+        Err(e) => {
+            warn!(error = %e, "snapshot checkpoint failed");
+            false
+        }
+    };
+    if !needs_compact {
+        return;
+    }
+
+    // Move the writer into a blocking task for the rewrite, then take it back. If it fails we
+    // drop the writer (None) and run snapshot-less until the next restart reopens the file.
+    if let Some(mut w) = writer.take() {
+        match tokio::task::spawn_blocking(move || w.compact().map(|()| w)).await {
+            Ok(Ok(w)) => *writer = Some(w),
+            Ok(Err(e)) => {
+                warn!(error = %e, "snapshot compaction failed; disabling snapshot writer")
+            }
+            Err(e) => warn!(error = %e, "snapshot compaction task panicked"),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::deny::DenyReason;
+    use store::VersionToken;
+
+    /// Build a KV entry at a fixed revision; these tests only care about keys and values.
+    fn entry(key: &str, value: &[u8]) -> KvEntry {
+        let version = VersionToken::from_u64(1);
+        KvEntry {
+            key: key.to_owned(),
+            value: value.to_owned(),
+            version,
+        }
+    }
+
+    #[test]
+    fn denyset_from_entries_seeds_and_skips_malformed() {
+        // Seeding is what turns raw KV entries into the live deny-set on every boot. If it is
+        // wrong — or a foreign key slips past the `filter_map` — the set is silently wrong from the
+        // start: denied tenants get served, or unrelated keys deny real tenants.
+        let raw = [
+            entry("blackhole.42", b"spend"),
+            entry("blackhole.99", b"fraud"),
+            // Not a `blackhole.{tenant}` key: must be dropped, never turned into tenant 0 or junk.
+            entry("signkey.1", b"spend"),
+            // Right prefix, non-numeric tail: `parse_key` rejects it, so it is dropped as well.
+            entry("blackhole.notanumber", b"spend"),
+            // An unrecognized reason still denies (fail-safe), recorded as `DenyReason::Unknown`.
+            entry("blackhole.7", b"mystery"),
+        ];
+
+        let seeded = denyset_from_entries(raw.iter());
+
+        assert_eq!(
+            seeded.len(),
+            3,
+            "only the three valid blackhole keys are seeded"
+        );
+        let expected = [
+            (42, DenyReason::Spend),
+            (99, DenyReason::Fraud),
+            (7, DenyReason::Unknown),
+        ];
+        for (tenant, reason) in expected {
+            assert_eq!(seeded.reason(tenant), Some(reason));
+        }
+        // Neither malformed key produced an entry — in particular, no spurious tenant 0.
+        for tenant in [0, 1] {
+            assert!(!seeded.is_denied(tenant));
+        }
+    }
+
+    #[test]
+    fn denyset_from_entries_empty_is_allow_all() {
+        // A cold or empty scan must leave everyone allowed.
+        let seeded = denyset_from_entries([].iter());
+        assert!(seeded.is_empty());
+        assert!(!seeded.is_denied(42));
+    }
+}
diff --git a/src/usage.rs b/src/usage.rs
new file mode 100644
index 0000000..22443a0
--- /dev/null
+++ b/src/usage.rs
@@ -0,0 +1,269 @@
+//! Token-usage extraction — the "passive tap" the gateway emits as billing *facts*.
+//!
+//! We never compute price here (pricing is a closed downstream consumer); we only extract raw
+//! token counts. Two shapes per provider: the non-streaming JSON body, and the terminal event of
+//! an SSE stream. For streaming we scan the relayed bytes for the usage event but never block the
+//! relay on it (see `proxy`).
+
+use serde::Deserialize;
+
+/// Raw token counts extracted from one provider response. Counts only — no pricing. Every field
+/// defaults to zero.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct Usage {
+    /// Input tokens: OpenAI `prompt_tokens`, Anthropic `input_tokens`.
+    pub input_tokens: u64,
+    /// Output tokens: OpenAI `completion_tokens`, Anthropic `output_tokens`.
+    pub output_tokens: u64,
+    /// Tokens read from cache: OpenAI `prompt_tokens_details.cached_tokens`, Anthropic
+    /// `cache_read_input_tokens`.
+    pub cache_read_tokens: u64,
+    /// Tokens written to cache: Anthropic `cache_creation_input_tokens`; always 0 for OpenAI.
+    pub cache_write_tokens: u64,
+}
+
+// Typed views of just the fields we meter. Deserializing into these (rather than a
+// `serde_json::Value` DOM) lets serde skip every field we don't read without allocating a node for
+// it — no `Map`/`String`/`Number` tree to build and drop per body or per SSE line. Every field is
+// `#[serde(default)]` so a missing or partial `usage` block reads as zeros, matching the prior
+// pointer-with-`unwrap_or(0)` behavior.
+
+/// Deserialize a field, treating an explicit JSON `null` exactly like an absent key
+/// (`T::default()`).
+///
+/// `#[serde(default)]` on its own only covers a *missing* key. A present-but-`null` value would be a
+/// type error that fails the whole enclosing body or SSE chunk, and the request would be metered as
+/// "no usage". Providers do send `null` here: Anthropic's schema marks the cache counters nullable,
+/// and some OpenAI-wire servers emit `"prompt_tokens_details": null`.
+fn null_as_default<'de, D, T>(deserializer: D) -> Result<T, D::Error>
+where
+    D: serde::Deserializer<'de>,
+    T: Default + Deserialize<'de>,
+{
+    Ok(Option::<T>::deserialize(deserializer)?.unwrap_or_default())
+}
+
+/// OpenAI `usage` block (chat/completions + responses). `prompt`/`completion` map to in/out; cached
+/// input rides in `prompt_tokens_details.cached_tokens`. No cache-write concept on the OpenAI wire.
+/// Missing and `null` fields both read as zero.
+#[derive(Deserialize, Default)]
+struct OpenAiUsage {
+    #[serde(default, deserialize_with = "null_as_default")]
+    prompt_tokens: u64,
+    #[serde(default, deserialize_with = "null_as_default")]
+    completion_tokens: u64,
+    #[serde(default, deserialize_with = "null_as_default")]
+    prompt_tokens_details: OpenAiPromptDetails,
+}
+
+/// The `prompt_tokens_details` sub-object of an OpenAI `usage` block; only the cached count is read.
+#[derive(Deserialize, Default)]
+struct OpenAiPromptDetails {
+    #[serde(default, deserialize_with = "null_as_default")]
+    cached_tokens: u64,
+}
+
+impl From<OpenAiUsage> for Usage {
+    /// Map the OpenAI field names onto the provider-neutral counters.
+    fn from(u: OpenAiUsage) -> Self {
+        Usage {
+            input_tokens: u.prompt_tokens,
+            output_tokens: u.completion_tokens,
+            cache_read_tokens: u.prompt_tokens_details.cached_tokens,
+            cache_write_tokens: 0,
+        }
+    }
+}
+
+/// Anthropic `usage` block (`/v1/messages` body + streaming events). Missing and `null` fields both
+/// read as zero, so a `message_delta` that nulls the input-side counters still yields its
+/// `output_tokens`.
+#[derive(Deserialize, Default)]
+struct AnthropicUsage {
+    #[serde(default, deserialize_with = "null_as_default")]
+    input_tokens: u64,
+    #[serde(default, deserialize_with = "null_as_default")]
+    output_tokens: u64,
+    #[serde(default, deserialize_with = "null_as_default")]
+    cache_read_input_tokens: u64,
+    #[serde(default, deserialize_with = "null_as_default")]
+    cache_creation_input_tokens: u64,
+}
+
+/// Extract usage from a non-streaming OpenAI response body (top-level `usage` object).
+///
+/// Returns `None` when the body does not parse or when `usage` is absent/`null` — nothing to meter.
+pub fn openai_body(body: &[u8]) -> Option<Usage> {
+    #[derive(Deserialize)]
+    struct Envelope {
+        usage: Option<OpenAiUsage>,
+    }
+    match serde_json::from_slice::<Envelope>(body) {
+        Ok(Envelope { usage: Some(raw) }) => Some(Usage::from(raw)),
+        _ => None,
+    }
+}
+
+/// Extract usage from a non-streaming Anthropic response body: top-level
+/// `usage.{input,output,cache_*}`.
+///
+/// Returns `None` when the body does not parse or carries no `usage` object.
+pub fn anthropic_body(body: &[u8]) -> Option<Usage> {
+    #[derive(Deserialize)]
+    struct Envelope {
+        usage: Option<AnthropicUsage>,
+    }
+    let envelope: Envelope = serde_json::from_slice(body).ok()?;
+    envelope.usage.map(|raw| Usage {
+        input_tokens: raw.input_tokens,
+        output_tokens: raw.output_tokens,
+        cache_read_tokens: raw.cache_read_input_tokens,
+        cache_write_tokens: raw.cache_creation_input_tokens,
+    })
+}
+
+/// Iterate the raw JSON payloads carried on `data:` lines of an SSE byte stream. `[DONE]` and the
+/// `data:` framing are stripped; each caller deserializes the payload into its own typed view.
+///
+/// Lines are split on `\n`. Lines without a `data:` prefix (`event:`, comments, blanks) are skipped.
+fn sse_data_lines(sse: &[u8]) -> impl Iterator<Item = &[u8]> + '_ {
+    sse.split(|&b| b == b'\n').filter_map(|line| {
+        // Trim ASCII whitespace on both sides of the value. The SSE spec itself removes only a
+        // single leading space, but we are deliberately lenient:
+        //  - leading: a provider that pads `data:   [DONE]` must still hit the sentinel check below
+        //    (JSON payloads would parse either way; the sentinel comparison is exact);
+        //  - trailing: with CRLF framing the split leaves a `\r` on every line, and `[DONE]\r` would
+        //    otherwise slip past the filter and be handed to callers as a payload.
+        let payload = line.strip_prefix(b"data:")?.trim_ascii();
+        (payload != b"[DONE]").then_some(payload)
+    })
+}
+
+/// Extract usage from an OpenAI SSE stream (only present when the request set
+/// `stream_options.include_usage`): the penultimate chunk carries a top-level `usage` object.
+///
+/// Chunks that do not parse, or that have no `usage`, are ignored; if several chunks carry usage
+/// the last one wins. Returns `None` when none does.
+pub fn openai_stream(sse: &[u8]) -> Option<Usage> {
+    #[derive(Deserialize)]
+    struct Event {
+        usage: Option<OpenAiUsage>,
+    }
+    sse_data_lines(sse)
+        .filter_map(|payload| serde_json::from_slice::<Event>(payload).ok()?.usage)
+        .last()
+        .map(Usage::from)
+}
+
+/// Extract usage from an Anthropic SSE stream. Input + cache tokens arrive once in
+/// `message_start.message.usage`; output accumulates in `message_delta.usage.output_tokens` (the
+/// last delta holds the cumulative total).
+///
+/// Returns `None` when no event carried a usage object at all.
+pub fn anthropic_stream(sse: &[u8]) -> Option<Usage> {
+    #[derive(Deserialize)]
+    struct StartMessage {
+        usage: Option<AnthropicUsage>,
+    }
+    #[derive(Deserialize)]
+    struct Event {
+        // Present on `message_start`, where usage is nested one level down.
+        message: Option<StartMessage>,
+        // Present on `message_delta`, where usage sits at the top level.
+        usage: Option<AnthropicUsage>,
+    }
+
+    // Stays `None` until the first usage object is seen, so "saw nothing" needs no separate flag.
+    let mut totals: Option<Usage> = None;
+    let events = sse_data_lines(sse)
+        .filter_map(|payload| serde_json::from_slice::<Event>(payload).ok());
+    for event in events {
+        if let Some(start) = event.message.and_then(|m| m.usage) {
+            let acc = totals.get_or_insert_with(Usage::default);
+            acc.input_tokens = start.input_tokens;
+            acc.cache_read_tokens = start.cache_read_input_tokens;
+            acc.cache_write_tokens = start.cache_creation_input_tokens;
+        }
+        if let Some(delta) = event.usage {
+            let acc = totals.get_or_insert_with(Usage::default);
+            // The delta reports a running output count; a zero never overwrites an earlier value.
+            if delta.output_tokens > 0 {
+                acc.output_tokens = delta.output_tokens;
+            }
+        }
+    }
+    totals
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn openai_nonstreaming() {
+        let body = br#"{"usage":{"prompt_tokens":12,"completion_tokens":34,
+            "prompt_tokens_details":{"cached_tokens":4}}}"#;
+        assert_eq!(
+            openai_body(body).unwrap(),
+            Usage {
+                input_tokens: 12,
+                output_tokens: 34,
+                cache_read_tokens: 4,
+                cache_write_tokens: 0
+            }
+        );
+    }
+
+    #[test]
+    fn anthropic_nonstreaming() {
+        let body = br#"{"usage":{"input_tokens":100,"output_tokens":50,
+            "cache_read_input_tokens":10,"cache_creation_input_tokens":7}}"#;
+        assert_eq!(
+            anthropic_body(body).unwrap(),
+            Usage {
+                input_tokens: 100,
+                output_tokens: 50,
+                cache_read_tokens: 10,
+                cache_write_tokens: 7
+            }
+        );
+    }
+
+    #[test]
+    fn openai_streaming_terminal_usage() {
+        let sse = b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n\
+                    data: {\"choices\":[],\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":9}}\n\n\
+                    data: [DONE]\n\n";
+        assert_eq!(
+            openai_stream(sse).unwrap(),
+            Usage {
+                input_tokens: 5,
+                output_tokens: 9,
+                cache_read_tokens: 0,
+                cache_write_tokens: 0
+            }
+        );
+    }
+
+    #[test]
+    fn anthropic_streaming_accumulates() {
+        let sse = b"event: message_start\n\
+                    data: {\"type\":\"message_start\",\"message\":{\"usage\":{\"input_tokens\":20,\"output_tokens\":0}}}\n\n\
+                    event: message_delta\n\
+                    data: {\"type\":\"message_delta\",\"usage\":{\"output_tokens\":15}}\n\n";
+        assert_eq!(
+            anthropic_stream(sse).unwrap(),
+            Usage {
+                input_tokens: 20,
+                output_tokens: 15,
+                cache_read_tokens: 0,
+                cache_write_tokens: 0
+            }
+        );
+    }
+
+    #[test]
+    fn anthropic_streaming_includes_cache_tokens() {
+        // Cache tokens ride in `message_start.message.usage` alongside input_tokens. The earlier
+        // accumulation test omits them; this guards the `cache_read`/`cache_creation` fields so a
+        // regression can't silently zero cache billing.
+        let sse = b"event: message_start\n\
+                    data: {\"type\":\"message_start\",\"message\":{\"usage\":{\"input_tokens\":20,\"output_tokens\":0,\"cache_read_input_tokens\":12,\"cache_creation_input_tokens\":8}}}\n\n\
+                    event: message_delta\n\
+                    data: {\"type\":\"message_delta\",\"usage\":{\"output_tokens\":15}}\n\n";
+        assert_eq!(
+            anthropic_stream(sse).unwrap(),
+            Usage {
+                input_tokens: 20,
+                output_tokens: 15,
+                cache_read_tokens: 12,
+                cache_write_tokens: 8
+            }
+        );
+    }
+
+    #[test]
+    fn tolerates_extra_leading_spaces_after_data_colon() {
+        // The SSE spec strips a single leading space; we accept any amount of padding. A provider
+        // sending `data:   {…}` must still be metered rather than producing a silent zero-usage row.
+        let sse =
+            b"data:   {\"choices\":[],\"usage\":{\"prompt_tokens\":3,\"completion_tokens\":7}}\n\n";
+        assert_eq!(
+            openai_stream(sse).unwrap(),
+            Usage {
+                input_tokens: 3,
+                output_tokens: 7,
+                cache_read_tokens: 0,
+                cache_write_tokens: 0
+            }
+        );
+    }
+
+    #[test]
+    fn no_usage_returns_none() {
+        // "No usage" must be `None`, not `Some(all zeros)`: callers use the distinction to tell
+        // "nothing to meter" from "metered zero". Assert `is_none()` directly — comparing a mapped
+        // field against 0 would also pass for `Some(Usage::default())`.
+        assert!(openai_body(b"{}").is_none());
+        assert!(anthropic_body(b"{}").is_none());
+        assert!(openai_stream(b"data: {\"choices\":[]}\n\n").is_none());
+        assert!(anthropic_stream(b"data: {\"type\":\"ping\"}\n\n").is_none());
+    }
+}
diff --git a/tests/common/mod.rs b/tests/common/mod.rs
new file mode 100644
index 0000000..a5d4f1e
--- /dev/null
+++ b/tests/common/mod.rs
@@ -0,0 +1,716 @@
+//! e2e harness: a real `beyond-ai` binary, a real `nats-server` (JetStream KV backing the deny-set),
+//! and a mock HTTP upstream that records what the gateway forwarded.
+//!
+//! Requires `nats-server` on PATH — run via `mise run test:integration:rs`.
+//! Signing keys + pool keys are passed via the gateway's *config* (not NATS); NATS carries only the
+//! deny-set. Every component picks a free port and cleans up on drop, so tests run in parallel.
+
+#![allow(dead_code)]
+// Test harness: `.unwrap()`/`.expect()`/`panic!` are assertions, not production code. See e2e.rs.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
+use std::io::Write;
+use std::net::TcpListener as StdTcpListener;
+use std::process::{Child, Command};
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+
+use base64::Engine;
+use bytes::Bytes;
+use http_body_util::{BodyExt, Full};
+use hyper::service::service_fn;
+use hyper::{Request, Response};
+use hyper_util::rt::{TokioExecutor, TokioIo};
+use hyper_util::server::conn::auto;
+use store::Connection;
+use tokio::net::TcpListener;
+use tokio::time::{sleep, timeout};
+use tokio_rustls::TlsAcceptor;
+
+/// Return a loopback TCP port that no earlier `free_port()` call in this test binary has returned.
+///
+/// All tests run as threads of **one** process. Once a `bind(:0)` listener is closed the OS is free
+/// to hand the same ephemeral port to the very next `bind(:0)`, so two calls (a gateway's `listen`
+/// and `metrics` ports, or two tests racing) could get the same number and make some *other* test
+/// fail to bind. A process-wide set of already-returned ports rules that out. When the OS offers a
+/// port that was already handed out, the probing listener is kept open so the next bind has to land
+/// elsewhere, which guarantees the loop makes progress.
+///
+/// What remains is the TOCTOU gap between returning the port and a *subprocess* (nats/gateway)
+/// binding it, relative to other OS processes — unavoidable when someone else does the bind.
+/// In-process servers should bind `:0` themselves and read the port back (see `MockUpstream`).
+///
+/// # Panics
+/// If binding fails, or if 1000 consecutive probes all return already-used ports.
+pub fn free_port() -> u16 {
+    use std::collections::HashSet;
+    use std::sync::OnceLock;
+    static HANDED_OUT: OnceLock<Mutex<HashSet<u16>>> = OnceLock::new();
+    const MAX_ATTEMPTS: usize = 1000;
+
+    let registry = HANDED_OUT.get_or_init(Mutex::default);
+
+    // Listeners on ports we had to reject. They stay open until we return so the OS cannot offer
+    // those ports again during this call.
+    let mut parked: Vec<StdTcpListener> = Vec::new();
+    while parked.len() < MAX_ATTEMPTS {
+        let probe = StdTcpListener::bind("127.0.0.1:0").unwrap();
+        let candidate = probe.local_addr().unwrap().port();
+        let first_time = registry
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner())
+            .insert(candidate);
+        if first_time {
+            // `probe` closes on return, leaving the port free for the (sub)process to bind.
+            return candidate;
+        }
+        parked.push(probe);
+    }
+    panic!("could not find an unused free port after 1000 attempts");
+}
+
+/// Standard-alphabet base64 of `bytes` — how an Ed25519 public key is written into the gateway's
+/// `signing_keys` config.
+pub fn b64(bytes: &[u8]) -> String {
+    use base64::engine::general_purpose::STANDARD;
+    STANDARD.encode(bytes)
+}
+
+/// Deterministic Ed25519 keypair derived from a 32-byte secret filled with `seed`.
+/// Returns `(raw 32-byte public key, signing key)`.
+pub fn test_keypair(seed: u8) -> (Vec<u8>, ed25519_dalek::SigningKey) {
+    let secret = [seed; 32];
+    let signing = ed25519_dalek::SigningKey::from_bytes(&secret);
+    let public = signing.verifying_key().to_bytes().to_vec();
+    (public, signing)
+}
+
+/// Poll `127.0.0.1:port` every 50 ms until a TCP connect succeeds.
+///
+/// # Panics
+/// If nothing accepts within 20 seconds; `what` names the component in the message.
+async fn wait_for_port(port: u16, what: &str) {
+    let until_reachable = async {
+        while tokio::net::TcpStream::connect(("127.0.0.1", port))
+            .await
+            .is_err()
+        {
+            sleep(Duration::from_millis(50)).await;
+        }
+    };
+    if timeout(Duration::from_secs(20), until_reachable)
+        .await
+        .is_err()
+    {
+        panic!("{what} did not come up on port {port}");
+    }
+}
+
+// --- nats-server (JetStream) ------------------------------------------------
+
+/// A `nats-server` child process started with JetStream (`-js`) on `127.0.0.1`. The process is
+/// killed and its storage directory removed when this value is dropped.
+pub struct Nats {
+    /// Handle to the spawned `nats-server` process.
+    child: Child,
+    /// TCP port the server was told to listen on (`-p`).
+    pub port: u16,
+    /// JetStream storage directory passed as `-sd`; lives under the system temp dir.
+    store_dir: std::path::PathBuf,
+}
+
+impl Nats {
+    /// Spawn `nats-server` with JetStream on a fresh loopback port and wait until it accepts TCP
+    /// connections.
+    ///
+    /// # Panics
+    /// If the binary cannot be spawned, or the port does not come up in time.
+    pub async fn start() -> Self {
+        let port = free_port();
+        let store_dir = std::env::temp_dir().join(format!("beyond-ai-nats-{port}"));
+        // Best effort: if this fails, nats-server gets to report the problem itself.
+        let _ = std::fs::create_dir_all(&store_dir);
+
+        let child = Command::new("nats-server")
+            .arg("-js")
+            .args(["-a", "127.0.0.1"])
+            .args(["-p", &port.to_string()])
+            .args(["-sd", store_dir.to_str().unwrap()])
+            .stdout(std::process::Stdio::null())
+            .stderr(std::process::Stdio::null())
+            .spawn()
+            .expect("spawn nats-server (on PATH? run via mise)");
+
+        // Wrap the child *before* waiting, so a readiness timeout (panic) still runs `Drop` and
+        // takes the process down with it.
+        let server = Nats {
+            child,
+            port,
+            store_dir,
+        };
+        wait_for_port(server.port, "nats-server").await;
+        server
+    }
+}
+
+impl Nats {
+    /// Kill the server mid-test (for fail-open coverage) and reap it. Errors are ignored, so this
+    /// is safe to call more than once and is idempotent with `Drop`.
+    pub fn stop(&mut self) {
+        self.child.kill().ok();
+        self.child.wait().ok();
+    }
+}
+
+impl Drop for Nats {
+    /// Kill **and reap** the server, then delete its JetStream directory.
+    ///
+    /// `Child` has no `Drop` of its own: without a `wait()` the killed process lingers as a zombie
+    /// for the rest of the test run. `kill()` also only *sends* the signal, so removing the store
+    /// directory right after it can race a process that is still exiting and leave the temp dir
+    /// behind. Going through `stop()` (kill + wait) makes sure the process is gone first; all
+    /// errors are ignored, so this is fine after an explicit `stop()` too.
+    fn drop(&mut self) {
+        self.stop();
+        let _ = std::fs::remove_dir_all(&self.store_dir);
+    }
+}
+
+/// Write `key = value` into the gateway's KV bucket on the NATS server at `nats_port`.
+/// Panics if the connection or the write fails.
+pub async fn put_kv(nats_port: u16, key: &str, value: &[u8]) {
+    let writer = open_writer(nats_port).await;
+    writer.put(key, value).await.unwrap();
+}
+
+/// Delete `key` from the gateway's KV bucket on the NATS server at `nats_port`.
+/// Panics if the connection or the delete fails.
+pub async fn del_kv(nats_port: u16, key: &str) {
+    let writer = open_writer(nats_port).await;
+    writer.delete(key).await.unwrap();
+}
+
+/// Connect (without credentials) to the local NATS server and return a writer for the
+/// `ai-gateway` bucket. Panics on any failure, including a bucket that exposes no writer.
+async fn open_writer(nats_port: u16) -> std::sync::Arc<dyn store::KvWriter> {
+    let connection_cfg = store::NatsConnectionConfig {
+        url: format!("nats://127.0.0.1:{nats_port}"),
+        creds: None,
+        creds_file: None,
+    };
+    let connection = store::NatsConnection::new(connection_cfg);
+    connection.connect().await.unwrap();
+
+    let bucket_cfg = store::StoreConfig {
+        name: "ai-gateway".into(),
+        ..Default::default()
+    };
+    let bucket = connection.store_with_config(bucket_cfg).await.unwrap();
+    bucket.writer().expect("bucket is writable")
+}
+
+// --- mock upstream provider -------------------------------------------------
+
+/// How the mock upstream answers: selects the canned body and content type, and for `Status` the
+/// HTTP status code too. Every other mode replies 200.
+#[derive(Clone, Copy)]
+pub enum Mode {
+    /// OpenAI-shaped non-streaming JSON body.
+    Json,
+    /// OpenAI-shaped SSE stream with a terminal usage chunk.
+    Sse,
+    /// Anthropic-shaped non-streaming JSON body (`usage.input_tokens`).
+    AnthropicJson,
+    /// OpenAI-shaped SSE stream with >128 KiB of content *before* the usage chunk — forces the
+    /// proxy's response-tail compaction path.
+    SseLarge,
+    /// Always reply with this HTTP status and a small JSON error body — for circuit-breaker tests
+    /// (5xx trips the breaker; 4xx/429 do not).
+    Status(u16),
+}
+
+/// What the mock upstream recorded about one request the gateway forwarded to it.
+#[derive(Default, Clone)]
+pub struct Captured {
+    /// Path component of the request URI (no query string).
+    pub path: String,
+    /// `authorization` header, if present and valid as a string.
+    pub authorization: Option<String>,
+    /// `x-api-key` header, if present and valid as a string.
+    pub x_api_key: Option<String>,
+    /// `host` header, if present and valid as a string.
+    pub host: Option<String>,
+    /// Full request body; empty if the body could not be read.
+    pub body: Vec<u8>,
+}
+
+/// An in-process HTTP server standing in for a provider. It records what the gateway forwarded and
+/// answers with the canned response for its `Mode`.
+pub struct MockUpstream {
+    /// Loopback port the mock is listening on.
+    pub port: u16,
+    /// The most recent request seen (each request overwrites the previous one); `None` until the
+    /// first request arrives.
+    captured: Arc<Mutex<Option<Captured>>>,
+    /// Number of requests handled so far.
+    hits: Arc<std::sync::atomic::AtomicUsize>,
+    /// The accept-loop task.
+    task: tokio::task::JoinHandle<()>,
+}
+
+/// Body for `Mode::Json`: an OpenAI chat completion reporting 11 prompt / 7 completion tokens.
+const CANNED_JSON: &str = r#"{"id":"chatcmpl-mock","object":"chat.completion","model":"gpt-4o-2024-08-06","choices":[{"index":0,"message":{"role":"assistant","content":"hi"},"finish_reason":"stop"}],"usage":{"prompt_tokens":11,"completion_tokens":7,"total_tokens":18}}"#;
+
+/// Body for `Mode::Sse`: one content chunk, one usage chunk (5 prompt / 9 completion tokens), then
+/// the `[DONE]` sentinel.
+const CANNED_SSE: &str = "data: {\"id\":\"chatcmpl-mock\",\"object\":\"chat.completion.chunk\",\"model\":\"gpt-4o-2024-08-06\",\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\ndata: {\"choices\":[],\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":9}}\n\ndata: [DONE]\n\n";
+
+/// Body for `Mode::AnthropicJson`: an Anthropic message reporting 13 input / 7 output tokens.
+const CANNED_ANTHROPIC_JSON: &str = r#"{"id":"msg_mock","type":"message","model":"claude-opus-4-8","content":[{"type":"text","text":"hi"}],"usage":{"input_tokens":13,"output_tokens":7}}"#;
+
+/// Build an OpenAI SSE stream whose first chunk alone holds ~130 KiB of content. That pushes the
+/// proxy's response tail beyond `2 × USAGE_TAIL_CAP` (128 KiB), so it has to compact at least once
+/// before the trailing usage chunk shows up — and the usage event must still be found in the 64 KiB
+/// tail that is kept.
+fn large_sse() -> String {
+    const FILLER_LEN: usize = 130 * 1024;
+    const USAGE_EVENT: &str =
+        "data: {\"choices\":[],\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":9}}\n\n";
+    const DONE_EVENT: &str = "data: [DONE]\n\n";
+
+    let padding = "x".repeat(FILLER_LEN);
+    let content_event = format!(
+        "data: {{\"id\":\"chatcmpl-mock\",\"object\":\"chat.completion.chunk\",\"model\":\"gpt-4o-2024-08-06\",\"choices\":[{{\"delta\":{{\"content\":\"{padding}\"}}}}]}}\n\n"
+    );
+    [content_event.as_str(), USAGE_EVENT, DONE_EVENT].concat()
+}
+
+/// Map a mode to its canned `(content-type, body)`. Only `SseLarge` allocates; every other body is
+/// a static slice.
+fn canned_body(mode: Mode) -> (&'static str, Bytes) {
+    const JSON: &str = "application/json";
+    const EVENT_STREAM: &str = "text/event-stream";
+
+    let fixed = |text: &'static str| Bytes::from_static(text.as_bytes());
+    match mode {
+        Mode::Json => (JSON, fixed(CANNED_JSON)),
+        Mode::Sse => (EVENT_STREAM, fixed(CANNED_SSE)),
+        Mode::AnthropicJson => (JSON, fixed(CANNED_ANTHROPIC_JSON)),
+        Mode::SseLarge => (EVENT_STREAM, Bytes::from(large_sse())),
+        // Only the body is chosen here; `mock_handle` applies the status code itself.
+        Mode::Status(_) => (JSON, fixed(r#"{"error":{"message":"mock"}}"#)),
+    }
+}
+
+/// Label for the protocol the gateway used to *reach the mock*, taken from the version hyper parsed
+/// off the wire: `"h2"` for HTTP/2, `"http/1.1"` for anything else. It is echoed back in
+/// `x-mock-proto`; because the gateway relays response headers untouched, the bench client can read
+/// it to prove which protocol the gateway→upstream hop negotiated.
+fn proto_label(version: hyper::Version) -> &'static str {
+    if matches!(version, hyper::Version::HTTP_2) {
+        "h2"
+    } else {
+        "http/1.1"
+    }
+}
+
+/// Request handler shared by the plaintext and TLS listeners. It counts the hit, records what the
+/// gateway forwarded (overwriting any earlier capture), and replies with the canned body for `mode`,
+/// tagged with the negotiated protocol in `x-mock-proto`.
+///
+/// Never returns an error; a body that cannot be read is recorded as empty.
+async fn mock_handle(
+    req: Request<hyper::body::Incoming>,
+    cap: Arc<Mutex<Option<Captured>>>,
+    hits: Arc<std::sync::atomic::AtomicUsize>,
+    mode: Mode,
+) -> Result<Response<Full<Bytes>>, std::convert::Infallible> {
+    hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+    let version = req.version();
+
+    // Everything that borrows from `req` is read first; collecting the body below consumes it.
+    let mut seen = Captured {
+        path: req.uri().path().to_string(),
+        ..Captured::default()
+    };
+    {
+        let headers = req.headers();
+        let text = |name: &str| -> Option<String> {
+            headers.get(name)?.to_str().ok().map(String::from)
+        };
+        seen.authorization = text("authorization");
+        seen.x_api_key = text("x-api-key");
+        seen.host = text("host");
+    }
+    seen.body = match req.into_body().collect().await {
+        Ok(collected) => collected.to_bytes().to_vec(),
+        Err(_) => Vec::new(),
+    };
+    *cap.lock().unwrap() = Some(seen);
+
+    let status = if let Mode::Status(code) = mode {
+        code
+    } else {
+        200
+    };
+    let (content_type, payload) = canned_body(mode);
+    let response = Response::builder()
+        .status(status)
+        .header("content-type", content_type)
+        .header("x-mock-proto", proto_label(version))
+        .body(Full::new(payload))
+        .unwrap();
+    Ok(response)
+}
+
+impl MockUpstream {
+    pub async fn start(mode: Mode) -> Self {
+        // In-process plaintext HTTP/1 mock: bind `:0`, read the port back, and keep the listener
+        // open throughout — no free_port()→rebind window for another test to slip into.
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let port = listener.local_addr().unwrap().port();
+        let captured: Arc<Mutex<Option<Captured>>> = Arc::new(Mutex::new(None));
+        let hits = Arc::new(std::sync::atomic::AtomicUsize::new(0));
+        let cap = captured.clone(); // clones for the accept task; the originals go into `Self`
+        let hit_counter = hits.clone();
+        let task = tokio::spawn(async move {
+            loop {
+                let Ok((stream, _)) = listener.accept().await else {
+                    break; // accept failed: stop serving
+                };
+                let io = TokioIo::new(stream);
+                let cap = cap.clone();
+                let hit_counter = hit_counter.clone();
+                tokio::spawn(async move {
+                    let svc = service_fn(move |req| {
+                        mock_handle(req, cap.clone(), hit_counter.clone(), mode)
+                    });
+                    let _ = hyper::server::conn::http1::Builder::new()
+                        .serve_connection(io, svc)
+                        .await; // per-connection errors are ignored
+                });
+            }
+        });
+        MockUpstream {
+            port,
+            captured,
+            hits,
+            task,
+        }
+    }
+
+    /// Like [`MockUpstream::start`], but terminates **TLS** and serves H1 *and* H2 on one listener
+    /// (protocol chosen by ALPN, via hyper-util's auto builder). Presents a throwaway self-signed
+    /// cert for `127.0.0.1`/`localhost`, so the gateway must be pointed at it with
+    /// `upstream_tls = true` and `upstream_verify_cert = false` (`GatewayBuilder::tls_upstream`).
+    /// Lets the concurrency bench drive the real TLS+ALPN+H2 path; reach it at `authority()`.
+    pub async fn start_tls(mode: Mode) -> Self {
+        // rustls 0.23 needs a process crypto provider; both ring and aws-lc are compiled in (so there's
+        // no default), pick ring to match the gateway. Idempotent across multiple mocks in one process.
+        let _ = rustls::crypto::ring::default_provider().install_default();
+
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let port = listener.local_addr().unwrap().port();
+
+        let ck = rcgen::generate_simple_self_signed(vec![
+            "127.0.0.1".to_string(),
+            "localhost".to_string(),
+        ])
+        .expect("self-signed cert");
+        let certs = vec![ck.cert.der().clone()];
+        let key = rustls::pki_types::PrivateKeyDer::Pkcs8(ck.key_pair.serialize_der().into());
+        let mut tls = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_single_cert(certs, key)
+            .expect("server tls config");
+        // Offer both so the gateway's ALPN preference decides: H2H1 → h2, H1 → http/1.1.
+        tls.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
+        let acceptor = TlsAcceptor::from(Arc::new(tls));
+
+        let captured: Arc<Mutex<Option<Captured>>> = Arc::new(Mutex::new(None));
+        let hits = Arc::new(std::sync::atomic::AtomicUsize::new(0));
+        let cap = captured.clone(); // clones for the accept task; the originals go into `Self`
+        let hit_counter = hits.clone();
+        let task = tokio::spawn(async move {
+            loop {
+                let Ok((stream, _)) = listener.accept().await else {
+                    break; // accept failed: stop serving
+                };
+                let acceptor = acceptor.clone();
+                let cap = cap.clone();
+                let hit_counter = hit_counter.clone();
+                tokio::spawn(async move {
+                    let Ok(tls_stream) = acceptor.accept(stream).await else {
+                        return; // TLS handshake failed: give up on this connection only
+                    };
+                    let io = TokioIo::new(tls_stream);
+                    let svc = service_fn(move |req| {
+                        mock_handle(req, cap.clone(), hit_counter.clone(), mode)
+                    });
+                    // Auto builder: serves H2 or H1 per the negotiated ALPN.
+                    let _ = auto::Builder::new(TokioExecutor::new())
+                        .serve_connection(io, svc)
+                        .await; // per-connection errors are ignored
+                });
+            }
+        });
+        MockUpstream {
+            port,
+            captured,
+            hits,
+            task,
+        }
+    }
+
+    pub fn authority(&self) -> String {
+        format!("127.0.0.1:{}", self.port) // loopback `host:port` of this mock's listener
+    }
+
+    pub fn captured(&self) -> Option<Captured> {
+        self.captured.lock().unwrap().clone() // copy of the most recently stored request, if any
+    }
+
+    /// Total requests the mock has received — used to prove an open circuit breaker stops requests
+    /// from reaching the upstream at all.
+    pub fn hits(&self) -> usize {
+        self.hits.load(std::sync::atomic::Ordering::Relaxed)
+    }
+}
+
+impl Drop for MockUpstream {
+    fn drop(&mut self) {
+        self.task.abort(); // stops the accept loop; per-connection tasks were spawned detached
+    }
+}
+
+// --- the real beyond-ai binary ----------------------------------------------
+
+pub struct Gateway {
+    child: Child,                    // the spawned `beyond-ai` process; killed in `Drop`
+    pub port: u16,                   // port of the `listen` address in the generated config
+    pub metrics_port: u16,           // port of `metrics_listen`: `/metrics`, `/livez`, `/readyz`
+    config_path: std::path::PathBuf, // generated TOML config file; removed in `Drop`
+}
+
+/// Managed pool key the harness writes into `[pool_keys]` for `provider`. Every provider has its
+/// own value, which lets a test assert that the gateway swapped in the *right* key.
+fn pool_key(provider: &str) -> &'static str {
+    // First matching name wins; anything unlisted falls back to the "unknown" key.
+    let key_for = |name: &str, key: &'static str| (name == provider).then_some(key);
+    key_for("openai", "sk-pool-secret")
+        .or_else(|| key_for("anthropic", "sk-anthropic-pool"))
+        .or_else(|| key_for("fireworks", "sk-fireworks-pool"))
+        .or_else(|| key_for("openrouter", "sk-openrouter-pool"))
+        .unwrap_or("sk-unknown-pool")
+}
+
+/// Builds a gateway config, choosing which providers are *configured* (authority → the mock + a
+/// pool key). A managed request to a provider absent from this list has no pool key → 503.
+pub struct GatewayBuilder {
+    nats_port: u16,                            // port written into `nats_url`
+    authority: String,                         // `host:port` every configured provider points at
+    signkey_b64: String,                       // written as entry `1` of `[signing_keys]`
+    providers: Vec<&'static str>,              // mock mode: names given authority + pool key
+    snapshot_path: Option<String>,             // optional `snapshot_path` config value
+    real_upstreams: bool,                      // no authority overrides; forces `upstream_tls`
+    pool_key_overrides: Vec<(String, String)>, // `[pool_keys]`, written only with real upstreams
+    rate_limit_rps: Option<u32>,               // `None` ⇒ key omitted from the config
+    byo_rate_limit_rps: Option<u32>,           // `None` ⇒ key omitted from the config
+    /// Point at a TLS mock (`MockUpstream::start_tls`): `upstream_tls = true` + skip cert verification
+    /// (the mock is self-signed), while still routing via `provider_authorities`. For the H2 bench.
+    tls_upstream: bool,
+    /// Override the gateway's `upstream_http2` (H2H1 vs H1 ALPN). `None` ⇒ leave the gateway default.
+    upstream_http2: Option<bool>,
+    /// Override the per-provider circuit-breaker threshold (failures in the window before opening).
+    /// `None` ⇒ leave the gateway default; `Some(0)` disables the breaker.
+    circuit_breaker_threshold: Option<u32>,
+}
+
+impl GatewayBuilder {
+    /// Set which providers are configured (mock mode only). Defaults to `["openai", "fireworks"]`.
+    pub fn providers(mut self, providers: &[&'static str]) -> Self {
+        self.providers = providers.to_vec();
+        self
+    }
+
+    /// Point the gateway at the **real** provider hosts over TLS (the `route::KNOWN_PROVIDERS`
+    /// defaults), instead of the plaintext mock. Used by the live smoke tests (`tests/smoke.rs`):
+    /// no authority overrides, no pool keys, no signing keys — smoke traffic is BYO (the caller's
+    /// real provider token, passed through), so none of that is needed.
+    pub fn real_upstreams(mut self) -> Self {
+        self.real_upstreams = true;
+        self
+    }
+
+    /// Set the managed pool key for a provider by name — the *real* provider key the gateway swaps
+    /// in for a managed (`bai_…`) request. Only written in `real_upstreams` mode; mock mode ignores
+    /// it and uses the built-in per-provider keys. Combine with a signing key (the `signkey_b64`
+    /// passed to `builder`) to smoke-test the full managed path against the real provider.
+    pub fn pool_key(mut self, provider: &str, key: &str) -> Self {
+        self.pool_key_overrides
+            .push((provider.to_string(), key.to_string()));
+        self
+    }
+
+    /// Point the gateway at an on-disk deny-set snapshot. Pass the same path to two `start()` calls
+    /// to model a restart that reloads from disk.
+    pub fn snapshot_path(mut self, path: impl Into<String>) -> Self {
+        self.snapshot_path = Some(path.into());
+        self
+    }
+
+    /// Override the per-credential request-rate ceiling (requests/sec). The harness default leaves
+    /// the gateway's own generous default (100) in place; set a small value to exercise the 429 path.
+    pub fn rate_limit_rps(mut self, rps: u32) -> Self {
+        self.rate_limit_rps = Some(rps);
+        self
+    }
+
+    /// Override the aggregate BYO request-rate ceiling (requests/sec). `0` disables that tier so a
+    /// per-credential 429 test isn't perturbed by the shared BYO bucket.
+    pub fn byo_rate_limit_rps(mut self, rps: u32) -> Self {
+        self.byo_rate_limit_rps = Some(rps);
+        self
+    }
+
+    /// Talk to the upstream over TLS without verifying its cert — for a `MockUpstream::start_tls`
+    /// target (self-signed). The gateway still routes via `provider_authorities` (the mock), but with
+    /// real TLS + ALPN, so the H2 path is exercised. Used by the concurrency bench.
+    pub fn tls_upstream(mut self) -> Self {
+        self.tls_upstream = true;
+        self
+    }
+
+    /// Force the gateway's upstream ALPN: `true` ⇒ H2H1 (prefer H2), `false` ⇒ H1 only. The bench
+    /// starts one gateway each way against the same TLS mock to compare them.
+    pub fn upstream_http2(mut self, on: bool) -> Self {
+        self.upstream_http2 = Some(on);
+        self
+    }
+
+    /// Set the per-provider circuit-breaker failure threshold. A 60 s window and 1 s reset are
+    /// written alongside it, so the breaker trips and recovers fast in-test. `0` disables it.
+    pub fn circuit_breaker_threshold(mut self, threshold: u32) -> Self {
+        self.circuit_breaker_threshold = Some(threshold);
+        self
+    }
+
+    pub async fn start(self) -> Gateway {
+        let port = free_port(); // NOTE(review): bound only later, by the child process
+        let metrics_port = free_port();
+        let config_path = std::env::temp_dir().join(format!("beyond-ai-config-{port}.toml"));
+        let nats_port = self.nats_port;
+        // Scalars first, `[…]` tables last: a TOML key after a `[table]` header joins that table.
+        let tls = self.real_upstreams || self.tls_upstream; // TLS to real hosts or to the TLS mock
+        let mut cfg = format!(
+            "listen = \"127.0.0.1:{port}\"\n\
+             metrics_listen = \"127.0.0.1:{metrics_port}\"\n\
+             nats_url = \"nats://127.0.0.1:{nats_port}\"\n\
+             config_bucket = \"ai-gateway\"\n\
+             upstream_tls = {tls}\n"
+        );
+        // TLS mock is self-signed → don't verify its cert (production always verifies).
+        if self.tls_upstream {
+            cfg.push_str("upstream_verify_cert = false\n");
+        }
+        if let Some(h2) = self.upstream_http2 {
+            cfg.push_str(&format!("upstream_http2 = {h2}\n"));
+        }
+        if let Some(path) = &self.snapshot_path { // NOTE(review): interpolated unescaped
+            cfg.push_str(&format!("snapshot_path = \"{path}\"\n"));
+        }
+        if let Some(rps) = self.rate_limit_rps {
+            cfg.push_str(&format!("rate_limit_rps = {rps}\n"));
+        }
+        if let Some(rps) = self.byo_rate_limit_rps {
+            cfg.push_str(&format!("byo_rate_limit_rps = {rps}\n"));
+        }
+        if let Some(threshold) = self.circuit_breaker_threshold {
+            // Fixed 60 s window + 1 s reset so the test trips and recovers quickly.
+            cfg.push_str(&format!(
+                "circuit_breaker_threshold = {threshold}\n\
+                 circuit_breaker_window_secs = 60\n\
+                 circuit_breaker_reset_secs = 1\n"
+            ));
+        }
+        if self.real_upstreams {
+            // Real-host smoke mode: built-in provider defaults (no authority overrides). For a
+            // *managed* smoke we still write the caller-supplied pool key(s) — the real provider key
+            // the gateway swaps in — and the signing key the minted virtual key verifies against.
+            // With neither set, this is a BYO smoke (the caller's token passes through).
+            if !self.pool_key_overrides.is_empty() {
+                cfg.push_str("\n[pool_keys]\n");
+                for (p, k) in &self.pool_key_overrides {
+                    cfg.push_str(&format!("{p} = \"{k}\"\n"));
+                }
+            }
+            if !self.signkey_b64.is_empty() {
+                cfg.push_str(&format!("\n[signing_keys]\n1 = \"{}\"\n", self.signkey_b64));
+            }
+        } else {
+            // Every configured provider points at the one mock upstream...
+            cfg.push_str("\n[provider_authorities]\n");
+            for p in &self.providers {
+                cfg.push_str(&format!("{p} = \"{}\"\n", self.authority));
+            }
+            // ...with a distinct pool key per provider so key-swap assertions can tell them apart.
+            cfg.push_str("\n[pool_keys]\n");
+            for p in &self.providers {
+                cfg.push_str(&format!("{p} = \"{}\"\n", pool_key(p)));
+            }
+            cfg.push_str(&format!("\n[signing_keys]\n1 = \"{}\"\n", self.signkey_b64));
+        }
+        std::fs::File::create(&config_path)
+            .unwrap()
+            .write_all(cfg.as_bytes())
+            .unwrap();
+
+        let child = Command::new(env!("CARGO_BIN_EXE_beyond-ai"))
+            .arg("run")
+            .arg("-c")
+            .arg(&config_path)
+            .env(
+                "AI_LOG",
+                std::env::var("AI_LOG").unwrap_or_else(|_| "warn".into()),
+            )
+            .spawn()
+            .expect("spawn beyond-ai");
+        let gw = Gateway { // built first: if the wait below panics, `Drop` kills the child
+            child,
+            port,
+            metrics_port,
+            config_path,
+        };
+        wait_for_port(port, "beyond-ai").await;
+        gw
+    }
+}
+
+impl Gateway {
+    /// Boot a gateway with the harness defaults: deny-set from the NATS server on `nats_port`, and
+    /// the OpenAI + Fireworks providers pointed at `openai_authority` (the mock upstream). Signing
+    /// key and pool keys come from the generated config, mirroring production where NATS holds
+    /// only the deny-set. Reach for [`Gateway::builder`] when a test needs a different setup.
+    pub async fn start(nats_port: u16, openai_authority: &str, signkey_b64: &str) -> Self {
+        let defaults = Self::builder(nats_port, openai_authority, signkey_b64);
+        defaults.start().await
+    }
+
+    /// Builder seeded with the same defaults [`Gateway::start`] uses; override what a test needs.
+    pub fn builder(nats_port: u16, authority: &str, signkey_b64: &str) -> GatewayBuilder {
+        GatewayBuilder {
+            providers: vec!["openai", "fireworks"],
+            authority: authority.to_owned(),
+            signkey_b64: signkey_b64.to_owned(),
+            nats_port,
+            pool_key_overrides: vec![],
+            snapshot_path: None,
+            rate_limit_rps: None,
+            byo_rate_limit_rps: None,
+            upstream_http2: None,
+            circuit_breaker_threshold: None,
+            real_upstreams: false,
+            tls_upstream: false,
+        }
+    }
+
+    /// Base URL (plain HTTP, loopback) of the gateway's main listener.
+    pub fn url(&self) -> String {
+        format!("http://127.0.0.1:{port}", port = self.port)
+    }
+
+    /// Fetch the body of `/metrics` from the metrics listener.
+    /// Panics if the request or reading the body fails.
+    pub async fn metrics(&self) -> String {
+        let url = format!("http://127.0.0.1:{}/metrics", self.metrics_port);
+        let scrape = reqwest::get(url).await.unwrap();
+        scrape.text().await.unwrap()
+    }
+
+    /// GET `path` on the admin/metrics listener and return `(status, body)`. Used to probe
+    /// `/livez` and `/readyz`, which are served on `metrics_port` alongside `/metrics`.
+    /// Panics if the request or reading the body fails.
+    pub async fn admin_get(&self, path: &str) -> (u16, String) {
+        let url = format!("http://127.0.0.1:{}{path}", self.metrics_port);
+        let resp = reqwest::get(url).await.unwrap();
+        let code = resp.status().as_u16();
+        (code, resp.text().await.unwrap())
+    }
+}
+
+impl Drop for Gateway {
+    fn drop(&mut self) {
+        let _ = self.child.kill().and_then(|()| self.child.wait()); // reap it: no zombie
+        let _ = std::fs::remove_file(&self.config_path);
+    }
+}
+
+// --- assertions -------------------------------------------------------------
+/// First sample named exactly `name` whose line contains `label_value`; `0.0` if absent/unparsable.
+pub fn parse_metric(metrics: &str, name: &str, label_value: &str) -> f64 {
+    metrics
+        .lines()
+        .filter(|l| matches!(l.strip_prefix(name), Some(r) if r.starts_with(['{', ' '])))
+        .find(|l| l.contains(label_value))
+        .and_then(|l| l.rsplit(' ').next()?.parse::<f64>().ok())
+        .unwrap_or(0.0)
+}
+
+pub async fn wait_for_metric(gw: &Gateway, name: &str, label: &str, min: f64) {
+    let poll = async {
+        loop {
+            if parse_metric(&gw.metrics().await, name, label) >= min {
+                break; // the sample has reached the requested floor
+            }
+            sleep(Duration::from_millis(150)).await; // scrape again after 150 ms
+        }
+    };
+    let reached = timeout(Duration::from_secs(5), poll).await.is_ok();
+    assert!(reached, "metric {name}{{{label}}} never reached {min}");
+}
+
+/// Calls `f` repeatedly, 150 ms apart, until it yields `want`.
+/// Panics (failing the calling test) if that has not happened within 10 seconds.
+pub async fn wait_for_status<F, Fut>(want: u16, mut f: F)
+where
+    F: FnMut() -> Fut,
+    Fut: std::future::Future<Output = u16>,
+{
+    // The whole polling loop runs under one deadline, probes included.
+    let poll = async {
+        while f().await != want {
+            sleep(Duration::from_millis(150)).await;
+        }
+    };
+    let reached = timeout(Duration::from_secs(10), poll).await.is_ok();
+    assert!(reached, "status never became {want}");
+}
diff --git a/tests/e2e.rs b/tests/e2e.rs
new file mode 100644
index 0000000..a12207d
--- /dev/null
+++ b/tests/e2e.rs
@@ -0,0 +1,929 @@
+//! End-to-end: real `beyond-ai` binary + real nats-server + mock upstream.
+//! Run via `mise run test:integration:rs` (needs `nats-server` on PATH).
+//!
+//! Signing key + pool key come from the gateway's *config*; NATS carries only the deny-set.
+
+// Test target: `.unwrap()`/`.expect()`/`panic!` are assertions, not production code — allow the
+// panic-surface restriction lints denied workspace-wide in `[workspace.lints.clippy]`.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
+mod common;
+
+use beyond_ai::key::{VirtualKey, mint};
+use common::*;
+
+fn body_for(model: &str) -> String {
+    [r#"{"model":""#, model, r#"","messages":[{"role":"user","content":"hi"}]}"#].concat()
+}
+
+/// POST `body` to `{url}/v1/chat/completions` with `key` as Bearer token; `0` if sending fails.
+async fn post_status(client: &reqwest::Client, url: &str, key: &str, body: String) -> u16 {
+    let endpoint = format!("{url}/v1/chat/completions");
+    let bearer = format!("Bearer {key}");
+    let request = client
+        .post(endpoint)
+        .header("authorization", bearer)
+        .header("content-type", "application/json")
+        .body(body);
+    request.send().await.map_or(0, |r| r.status().as_u16())
+}
+
+/// POST `body` to `{url}{path}` with `key` as Bearer token and return the response status.
+/// The caller picks the path, so this exercises provider routing by the first path segment
+/// (`/{provider}/…`) as well as the bare-path default. Yields `0` if sending fails.
+async fn post_path_status(
+    client: &reqwest::Client,
+    url: &str,
+    path: &str,
+    key: &str,
+    body: String,
+) -> u16 {
+    let endpoint = format!("{url}{path}");
+    let bearer = format!("Bearer {key}");
+    let request = client
+        .post(endpoint)
+        .header("authorization", bearer)
+        .header("content-type", "application/json")
+        .body(body);
+    request.send().await.map_or(0, |r| r.status().as_u16())
+}
+
+/// POST `body` to `{url}{path}`, presenting `key` in the `x-api-key` header (Anthropic-SDK
+/// style) rather than as an `authorization: Bearer` token.
+/// Returns the response status, or `0` if sending fails.
+async fn post_status_xapikey(
+    client: &reqwest::Client,
+    url: &str,
+    path: &str,
+    key: &str,
+    body: String,
+) -> u16 {
+    let endpoint = format!("{url}{path}");
+    let request = client
+        .post(endpoint)
+        .header("x-api-key", key)
+        .header("content-type", "application/json")
+        .body(body);
+    request.send().await.map_or(0, |r| r.status().as_u16())
+}
+
+/// Write a hand-built HTTP/1.1 request to `127.0.0.1:port` and return the response status.
+/// Lets a test declare a Content-Length the body guard must reject *without* transferring that
+/// many bytes (the guard fires on the header, before any body is read). `0` if no status parses.
+async fn raw_status(port: u16, request: &str) -> u16 {
+    use tokio::io::{AsyncReadExt, AsyncWriteExt};
+    let addr = ("127.0.0.1", port);
+    let mut conn = tokio::net::TcpStream::connect(addr).await.unwrap();
+    conn.write_all(request.as_bytes()).await.unwrap();
+    conn.flush().await.unwrap();
+    // Only the first read (at most 256 bytes) is inspected.
+    let mut head = [0u8; 256];
+    let len = conn.read(&mut head).await.unwrap();
+    // The status code is the second whitespace-separated field of what was read.
+    let text = String::from_utf8_lossy(&head[..len]);
+    let mut fields = text.split_whitespace();
+    let code = fields.nth(1).and_then(|c| c.parse().ok());
+    code.unwrap_or(0)
+}
+
+#[tokio::test]
+async fn managed_swaps_key_relays_body_and_meters_usage() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(1); // `pubkey` goes into the gateway config; `sk` mints the key
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 42,
+            vpc_id: 7,
+        },
+        1, // presumably the key id — the harness writes `[signing_keys] 1 = …`; confirm
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+
+    let resp = client
+        .post(format!("{}/v1/chat/completions", gw.url()))
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(body_for("gpt-4o"))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    assert!(resp.text().await.unwrap().contains("\"chatcmpl-mock\""));
+
+    // Managed: the mock saw the real pool key (the harness's OpenAI one), never the virtual key.
+    let cap = mock.captured().expect("mock received a request");
+    assert_eq!(cap.path, "/v1/chat/completions");
+    assert_eq!(cap.authorization.as_deref(), Some("Bearer sk-pool-secret"));
+    assert!(!cap.body.is_empty());
+
+    wait_for_metric(&gw, "ai_tokens_total", "input", 11.0).await; // polled until the sample is ≥ 11
+
+    // Bad managed key → 401.
+    assert_eq!(
+        post_status(
+            &client,
+            &gw.url(),
+            "bai_v1.1.bogus.bogus",
+            body_for("gpt-4o")
+        )
+        .await,
+        401
+    );
+}
+
+#[tokio::test]
+async fn byo_passes_user_token_through_unchanged() {
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(1); // gateway still needs a signing key in config to boot
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let client = reqwest::Client::new();
+    // A raw provider token (not `bai_`) → BYO → forwarded verbatim. The readiness poll doubles as
+    // the request under test: once it returns 200 the mock has stored a capture.
+    {
+        let (c, u) = (client.clone(), gw.url());
+        wait_for_status(200, move || {
+            let (c, u) = (c.clone(), u.clone());
+            async move { post_status(&c, &u, "sk-user-byo", body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+    let cap = mock.captured().expect("mock received a request");
+    assert_eq!(
+        cap.authorization.as_deref(),
+        Some("Bearer sk-user-byo"),
+        "BYO token must pass through unchanged (no swap)"
+    );
+}
+
+#[tokio::test]
+async fn fireworks_path_prefix_strips_and_swaps_pool_key() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(4);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 5,
+            vpc_id: 6,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    // Fireworks is selected by the `/fireworks` path segment; the client uses its native base path
+    // (`/inference/v1`). The gateway strips `/fireworks` and forwards the rest VERBATIM, and a
+    // managed key swaps to the Fireworks-specific pool key.
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                post_path_status(
+                    &c,
+                    &u,
+                    "/fireworks/inference/v1/chat/completions",
+                    &k,
+                    body_for("accounts/fireworks/models/llama-v3p1-70b-instruct"),
+                )
+                .await
+            }
+        })
+        .await;
+    }
+
+    let cap = mock.captured().expect("mock received a request"); // the last request it stored
+    assert_eq!(
+        cap.authorization.as_deref(),
+        Some("Bearer sk-fireworks-pool"),
+        "managed Fireworks request must swap to the Fireworks pool key"
+    );
+    // The `/fireworks` segment is stripped; the provider-native remainder is forwarded verbatim
+    // (the gateway does no per-provider path rewriting).
+    assert_eq!(
+        cap.path, "/inference/v1/chat/completions",
+        "first segment (provider) stripped; remainder forwarded verbatim"
+    );
+}
+
+#[tokio::test]
+async fn openai_prefix_matches_bare_default() {
+    // `/openai/v1/chat/completions` (explicit prefix) must reach OpenAI identically to bare
+    // `/v1/chat/completions` (dialect default): same pool-key swap, same upstream path after strip.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(8);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 1,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move {
+                post_path_status(
+                    &c,
+                    &u,
+                    "/openai/v1/chat/completions",
+                    &k,
+                    body_for("gpt-4o"),
+                )
+                .await
+            }
+        })
+        .await;
+    }
+    let cap = mock.captured().expect("mock received a request"); // only prefixed requests were sent
+    assert_eq!(cap.authorization.as_deref(), Some("Bearer sk-pool-secret"));
+    assert_eq!(
+        cap.path, "/v1/chat/completions",
+        "`/openai` stripped → same upstream path as the bare `/v1` default"
+    );
+}
+
+#[tokio::test]
+async fn unknown_provider_segment_returns_404() {
+    // An unrecognized first path segment that isn't the bare `/v1` default is a routing miss — 404
+    // from the gateway, not a confusing upstream error. Provider resolution is said to run before
+    // auth; NOTE(review): only a valid key is sent here, so that ordering is not itself asserted.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(9);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 1,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+    wait_for_status(404, move || {
+        let (c, u, k) = (c.clone(), u.clone(), k.clone());
+        async move {
+            post_path_status(&c, &u, "/bogus/v1/chat/completions", &k, body_for("gpt-4o")).await
+        }
+    })
+    .await;
+}
+
+#[tokio::test]
+async fn streaming_relays_sse_and_meters_usage() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(3);
+    let mock = MockUpstream::start(Mode::Sse).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 7,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+    let body = r#"{"model":"gpt-4o","stream":true,"messages":[{"role":"user","content":"hi"}]}"#
+        .to_string();
+
+    {
+        let (c, u, k, b) = (client.clone(), gw.url(), vkey.clone(), body.clone());
+        wait_for_status(200, move || {
+            let (c, u, k, b) = (c.clone(), u.clone(), k.clone(), b.clone());
+            async move { post_status(&c, &u, &k, b).await }
+        })
+        .await;
+    }
+
+    let resp = client
+        .post(format!("{}/v1/chat/completions", gw.url()))
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(body)
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    assert!(resp.text().await.unwrap().contains("[DONE]")); // the SSE terminator reached the client
+    wait_for_metric(&gw, "ai_tokens_total", "input", 5.0).await;
+
+    // The client streamed without `stream_options`, so the managed OpenAI path must have buffered the
+    // body and spliced `stream_options.include_usage` in before forwarding — otherwise OpenAI emits no
+    // usage chunk and the request is unbillable. The metric above can't prove this (the mock returns a
+    // usage chunk unconditionally), so assert the *forwarded body* the mock actually received carries
+    // the injected fragment. This is the only coverage that the splice in `request_body_filter` ran.
+    let cap = mock.captured().expect("mock received a request");
+    let needle = br#""stream_options":{"include_usage":true}"#;
+    assert!(
+        cap.body.windows(needle.len()).any(|w| w == needle),
+        "managed OpenAI streaming body must have stream_options.include_usage injected; got: {}",
+        String::from_utf8_lossy(&cap.body)
+    );
+}
+
+#[tokio::test]
+async fn blackhole_denies_then_restores() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(2);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 99,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    let probe = |want: u16| { // polls the same managed request until it returns `want`
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        async move {
+            wait_for_status(want, move || {
+                let (c, u, k) = (c.clone(), u.clone(), k.clone());
+                async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+            })
+            .await
+        }
+    };
+
+    probe(200).await; // ready + allowed
+    put_kv(nats.port, "blackhole.99", b"spend").await;
+    probe(402).await; // denied once the watch delta lands
+    del_kv(nats.port, "blackhole.99").await;
+    probe(200).await; // restored
+}
+
+#[tokio::test]
+async fn blackhole_fraud_returns_403() {
+    // The spend path (402) is covered above; fraud takes the separate `DenyReason::Fraud` branch
+    // and must surface as 403 (not 402, not 200) end-to-end.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(20);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 1234,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    let probe = |want: u16| { // polls the same managed request until it returns `want`
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        async move {
+            wait_for_status(want, move || {
+                let (c, u, k) = (c.clone(), u.clone(), k.clone());
+                async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+            })
+            .await
+        }
+    };
+
+    probe(200).await; // ready + allowed
+    put_kv(nats.port, "blackhole.1234", b"fraud").await; // key suffix is the minted `tenant_id`
+    probe(403).await; // fraud → forbidden
+}
+
+#[tokio::test]
+async fn oversized_content_length_is_rejected_413() {
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(21);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let vkey = mint(
+        &VirtualKey {
+            tenant_id: 1,
+            vpc_id: 1,
+        },
+        1,
+        &sk,
+    );
+    let client = reqwest::Client::new();
+
+    // Wait until the gateway is serving.
+    {
+        let (c, u, k) = (client.clone(), gw.url(), vkey.clone());
+        wait_for_status(200, move || {
+            let (c, u, k) = (c.clone(), u.clone(), k.clone());
+            async move { post_status(&c, &u, &k, body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+
+    // Declare a body of 200 MiB + 1 (209715201 bytes, > the 100 MiB guard) but send no body — the
+    // guard rejects on the Content-Length header in request_filter before any body is read.
+    let req = format!(
+        "POST /v1/chat/completions HTTP/1.1\r\n\
+         Host: x\r\n\
+         Authorization: Bearer {vkey}\r\n\
+         Content-Type: application/json\r\n\
+         Content-Length: 209715201\r\n\
+         Connection: close\r\n\r\n"
+    );
+    assert_eq!(raw_status(gw.port, &req).await, 413); // status parsed from the raw response
+}
+
+#[tokio::test]
+async fn per_credential_rate_limit_returns_429() {
+    // The other rejection codes (401/402/403/413/503) already have e2e coverage; 429 was the
+    // gap. A ceiling that is silently off (e.g. `rate_limit_rps` typo'd to 0 in the env) would
+    // disable the guardrail unnoticed, so this drives the full enforcement path: a burst on one
+    // credential must trip 429, charged on the raw key in `request_filter` before any
+    // verify/upstream connect. The credential is BYO (no key material needed) and the global BYO
+    // tier is disabled, isolating the per-credential ceiling.
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(40);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .rate_limit_rps(5)
+        .byo_rate_limit_rps(0)
+        .start()
+        .await;
+    let client = reqwest::Client::new();
+
+    // Readiness is probed with a *different* credential, so the flood credential reaches the
+    // burst with its budget untouched.
+    let (http, base) = (client.clone(), gw.url());
+    let warmup = move || {
+        let (http, base) = (http.clone(), base.clone());
+        async move { post_status(&http, &base, "sk-byo-warmup", body_for("gpt-4o")).await }
+    };
+    wait_for_status(200, warmup).await;
+
+    // Fire 50 back-to-back requests on one credential against its 5 rps ceiling. Early requests
+    // are served (200), the rest are throttled (429); any other status fails the test at once.
+    let mut served = 0u32;
+    let mut throttled = 0u32;
+    for _ in 0..50 {
+        let status = post_status(&client, &gw.url(), "sk-byo-flood", body_for("gpt-4o")).await;
+        match status {
+            200 => served += 1,
+            429 => throttled += 1,
+            other => panic!("unexpected status under rate limit: {other}"),
+        }
+    }
+    assert!(
+        served > 0,
+        "the first requests under the ceiling must be served"
+    );
+    assert!(
+        throttled > 0,
+        "a burst past the per-credential ceiling must yield 429"
+    );
+    wait_for_metric(&gw, "ai_rejections_total", "rate_limit", 1.0).await;
+}
+
+#[tokio::test]
+async fn managed_key_via_x_api_key_header_is_accepted() {
+    // Anthropic SDKs send the key in `x-api-key` rather than `Authorization: Bearer`, so a
+    // managed virtual key has to be accepted from either header. Here it arrives via x-api-key
+    // on the OpenAI path and must still be swapped for the OpenAI pool key in the Bearer scheme.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(22);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let tenant = VirtualKey {
+        tenant_id: 8,
+        vpc_id: 8,
+    };
+    let vkey = mint(&tenant, 1, &sk);
+    let client = reqwest::Client::new();
+    let path = "/v1/chat/completions";
+
+    // Poll with the key carried in `x-api-key` until the gateway serves the request (200).
+    let (http, base, key) = (client.clone(), gw.url(), vkey.clone());
+    let via_x_api_key = move || {
+        let (http, base, key) = (http.clone(), base.clone(), key.clone());
+        async move { post_status_xapikey(&http, &base, path, &key, body_for("gpt-4o")).await }
+    };
+    wait_for_status(200, via_x_api_key).await;
+
+    // What the mock upstream captured: the pool key, presented as a Bearer token rather than
+    // the inbound x-api-key value.
+    let cap = mock.captured().expect("mock received a request");
+    let forwarded = cap.authorization.as_deref();
+    assert_eq!(forwarded, Some("Bearer sk-pool-secret"));
+}
+
+#[tokio::test]
+async fn managed_key_for_unconfigured_provider_returns_503() {
+    // The default gateway has pool keys for OpenAI and Fireworks but NOT for Anthropic. A managed
+    // key routed to Anthropic (through the `/anthropic` path segment) therefore has no pool key
+    // to swap in → 503, raised in request_filter before any upstream connect.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(23);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let tenant = VirtualKey {
+        tenant_id: 11,
+        vpc_id: 1,
+    };
+    let vkey = mint(&tenant, 1, &sk);
+    let client = reqwest::Client::new();
+
+    // There is no separate readiness gate: the poll below waits for the 503 itself, so it
+    // doubles as the assertion.
+    let (http, base, key) = (client.clone(), gw.url(), vkey.clone());
+    let rejected = move || {
+        let (http, base, key) = (http.clone(), base.clone(), key.clone());
+        async move {
+            post_path_status(
+                &http,
+                &base,
+                "/anthropic/v1/messages",
+                &key,
+                body_for("claude-opus-4-8"),
+            )
+            .await
+        }
+    };
+    wait_for_status(503, rejected).await;
+}
+
+#[tokio::test]
+async fn anthropic_dialect_swaps_key_relays_and_meters() {
+    // `/v1/messages` is the Anthropic path: compared with the OpenAI tests above it exercises a
+    // different dialect, auth scheme (x-api-key instead of Bearer) and usage parser.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(24);
+    let mock = MockUpstream::start(Mode::AnthropicJson).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .providers(&["anthropic"])
+        .start()
+        .await;
+    let tenant = VirtualKey {
+        tenant_id: 77,
+        vpc_id: 2,
+    };
+    let vkey = mint(&tenant, 1, &sk);
+    let client = reqwest::Client::new();
+    let model = "claude-opus-4-8";
+
+    // Readiness gate: poll with the key in `x-api-key` until the gateway answers 200.
+    let (http, base, key) = (client.clone(), gw.url(), vkey.clone());
+    let ready = move || {
+        let (http, base, key) = (http.clone(), base.clone(), key.clone());
+        async move {
+            post_status_xapikey(&http, &base, "/v1/messages", &key, body_for(model)).await
+        }
+    };
+    wait_for_status(200, ready).await;
+
+    // The request under test, sent once the gateway is known to be serving.
+    let endpoint = format!("{}/v1/messages", gw.url());
+    let resp = client
+        .post(endpoint)
+        .header("x-api-key", &vkey)
+        .header("content-type", "application/json")
+        .body(body_for(model))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+
+    let cap = mock.captured().expect("mock received a request");
+    assert_eq!(cap.path, "/v1/messages");
+    // Upstream must receive the pool key in x-api-key; the inbound virtual key must not leak.
+    assert_eq!(cap.x_api_key.as_deref(), Some("sk-anthropic-pool"));
+    assert_eq!(cap.authorization, None);
+
+    wait_for_metric(&gw, "ai_tokens_total", "input", 13.0).await;
+}
+
+#[tokio::test]
+async fn missing_api_key_returns_401() {
+    // With neither Authorization nor x-api-key present the gateway takes its "missing API key"
+    // branch, which is distinct from the malformed-key (invalid) branch the managed test covers.
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(25);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let client = reqwest::Client::new();
+    let endpoint = format!("{}/v1/chat/completions", gw.url());
+
+    let unauthenticated = move || {
+        let request = client
+            .post(endpoint.clone())
+            .header("content-type", "application/json")
+            .body(body_for("gpt-4o"));
+        async move {
+            match request.send().await {
+                Ok(resp) => resp.status().as_u16(),
+                Err(_) => 0, // transport error: report 0, which never equals the wanted 401
+            }
+        }
+    };
+    wait_for_status(401, unauthenticated).await;
+}
+
+#[tokio::test]
+async fn deny_set_is_fail_open_when_nats_drops() {
+    // Losing NATS must not clear the deny-set: the last-known entries are *retained*
+    // (fail-open), while auth and keys — sourced from config, not NATS — carry on working.
+    let mut nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(26);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let client = reqwest::Client::new();
+
+    // Two managed keys in vpc 1: tenant 555 is the one that gets denied, 556 never is.
+    let key_for = |tenant_id| {
+        let tenant = VirtualKey {
+            tenant_id,
+            vpc_id: 1,
+        };
+        mint(&tenant, 1, &sk)
+    };
+    let denied = key_for(555);
+    let allowed = key_for(556);
+
+    // Waits (via `wait_for_status`) until a request made with `key` yields status `want`.
+    let expect_status = |key: String, want: u16| {
+        let (http, base) = (client.clone(), gw.url());
+        async move {
+            let attempt = move || {
+                let (http, base, key) = (http.clone(), base.clone(), key.clone());
+                async move { post_status(&http, &base, &key, body_for("gpt-4o")).await }
+            };
+            wait_for_status(want, attempt).await
+        }
+    };
+
+    // NATS up: the tenant is served until a `spend` blackhole entry lands, after which its
+    // requests come back 402.
+    expect_status(denied.clone(), 200).await;
+    put_kv(nats.port, "blackhole.555", b"spend").await;
+    expect_status(denied.clone(), 402).await;
+
+    // NATS disappears.
+    nats.stop();
+
+    // The stale deny entry is retained rather than cleared, and a tenant that was never denied
+    // is still served without NATS.
+    expect_status(denied.clone(), 402).await;
+    expect_status(allowed.clone(), 200).await;
+}
+
+#[tokio::test]
+async fn streaming_tail_compaction_preserves_usage_event() {
+    // The usage chunk comes after 130 KiB of content, which forces the proxy to compact its
+    // response tail (resp_tail exceeds 2× the 64 KiB cap). The usage event must survive that.
+    let nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(27);
+    let mock = MockUpstream::start(Mode::SseLarge).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    let tenant = VirtualKey {
+        tenant_id: 21,
+        vpc_id: 1,
+    };
+    let vkey = mint(&tenant, 1, &sk);
+    let client = reqwest::Client::new();
+    let body = String::from(
+        r#"{"model":"gpt-4o","stream":true,"messages":[{"role":"user","content":"hi"}]}"#,
+    );
+
+    // Readiness gate: poll with the same streaming body until the gateway answers 200.
+    let (http, base, key, json) = (client.clone(), gw.url(), vkey.clone(), body.clone());
+    let ready = move || {
+        let (http, base, key, json) = (http.clone(), base.clone(), key.clone(), json.clone());
+        async move { post_status(&http, &base, &key, json).await }
+    };
+    wait_for_status(200, ready).await;
+
+    // The streamed request under test; the whole body is read so the trailing chunk is relayed.
+    let endpoint = format!("{}/v1/chat/completions", gw.url());
+    let resp = client
+        .post(endpoint)
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(body)
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    let _ = resp.bytes().await.unwrap(); // consume the full >128 KiB stream
+
+    // Metering picked up the usage event (5.0 is presumably the mock's input-token count).
+    wait_for_metric(&gw, "ai_tokens_total", "input", 5.0).await;
+}
+
+#[tokio::test]
+async fn on_disk_snapshot_enforces_across_restart_without_nats() {
+    // When a snapshot path is configured the deny-set is written to disk as deltas arrive. A
+    // restarted gateway has to seed itself from that file and enforce straight away, even with
+    // NATS unreachable — evidence that boot reads the snapshot instead of re-scanning NATS.
+    let mut nats = Nats::start().await;
+    let (pubkey, sk) = test_keypair(28);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let client = reqwest::Client::new();
+
+    // Snapshot file in the temp dir, named after this test's NATS port; any leftover from an
+    // earlier run is deleted first (the result is ignored: the file may simply not exist).
+    let snap_name = format!("beyond-ai-snap-{}.log", nats.port);
+    let snap = std::env::temp_dir().join(snap_name);
+    let _ = std::fs::remove_file(&snap);
+    let snap_str = snap.to_str().unwrap().to_owned();
+
+    // Managed keys for two tenants in vpc 1: 8800 gets blackholed, 8801 never does.
+    let key_for = |tenant_id| {
+        let tenant = VirtualKey {
+            tenant_id,
+            vpc_id: 1,
+        };
+        mint(&tenant, 1, &sk)
+    };
+    let denied = key_for(8800);
+    let allowed = key_for(8801);
+
+    // Waits (via `wait_for_status`) until a request to `gw_url` made with `key` yields `want`.
+    let expect_status = |gw_url: String, key: String, want: u16| {
+        let http = client.clone();
+        async move {
+            let attempt = move || {
+                let (http, base, key) = (http.clone(), gw_url.clone(), key.clone());
+                async move { post_status(&http, &base, &key, body_for("gpt-4o")).await }
+            };
+            wait_for_status(want, attempt).await
+        }
+    };
+
+    // --- First run: blackhole the tenant; the delta is persisted to the snapshot. ---
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .snapshot_path(&snap_str)
+        .start()
+        .await;
+    expect_status(gw.url(), denied.clone(), 200).await; // ready + allowed
+    put_kv(nats.port, "blackhole.8800", b"fraud").await;
+    // 403 ⇒ the fraud hold is applied in memory (and appended to the snapshot).
+    expect_status(gw.url(), denied.clone(), 403).await;
+    // Grace period for the watcher's apply→persist step to flush the checkpoint to disk before
+    // the gateway is killed.
+    tokio::time::sleep(std::time::Duration::from_millis(400)).await;
+    drop(gw); // dropping the handle kills the gateway process
+
+    // NATS goes away entirely: the restart has nothing to scan and must rely on the snapshot.
+    nats.stop();
+
+    // --- Restart against the same snapshot path, NATS down. ---
+    let gw2 = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .snapshot_path(&snap_str)
+        .start()
+        .await;
+    // Seeded from disk: the fraud hold is enforced although NATS is unreachable.
+    expect_status(gw2.url(), denied, 403).await;
+    // A never-denied tenant is still served (auth/keys come from config, not NATS).
+    expect_status(gw2.url(), allowed, 200).await;
+
+    // Best-effort cleanup of the snapshot file.
+    let _ = std::fs::remove_file(&snap);
+}
+
+#[tokio::test]
+async fn health_endpoints_report_ready_on_the_metrics_listener() {
+    // /livez and /readyz are served by the metrics listener (next to /metrics); once the process
+    // is up both must answer 200 with a `{status:"ok"}` body. Readiness is deliberately *not*
+    // gated on NATS — the gateway is fail-open and can serve from config alone — so NATS is
+    // stopped before probing: a gateway without NATS must still report ready.
+    let mut nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(30);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::start(nats.port, &mock.authority(), &b64(&pubkey)).await;
+    nats.stop();
+
+    // Substring that both health bodies must contain (a plain text match, so it depends on the
+    // body being serialised without whitespace around the colon).
+    let ok_json = "\"status\":\"ok\"";
+
+    // Liveness.
+    let (live_status, live_body) = gw.admin_get("/livez").await;
+    assert_eq!(
+        live_status, 200,
+        "livez should be 200 once the process answers"
+    );
+    assert!(live_body.contains(ok_json), "livez body: {live_body}");
+
+    // Readiness, with NATS already gone.
+    let (ready_status, ready_body) = gw.admin_get("/readyz").await;
+    assert_eq!(
+        ready_status, 200,
+        "readyz should be 200 even with NATS down (fail-open): {ready_body}"
+    );
+    assert!(ready_body.contains(ok_json), "readyz body: {ready_body}");
+
+    // An unknown admin path is a clean 404, not a hang or a 200.
+    let (nf_status, _) = gw.admin_get("/nope").await;
+    assert_eq!(nf_status, 404);
+}
+
+#[tokio::test]
+async fn circuit_breaker_opens_on_5xx_and_sheds() {
+    // A provider answering 5xx is *broken*: after `threshold` failures the per-provider breaker
+    // opens and the gateway fast-fails with 503 — without connecting upstream — rather than
+    // stacking requests up against `read_timeout_secs`. The traffic is BYO (nothing to mint);
+    // the breaker gates all traffic to the provider.
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(1);
+    let mock = MockUpstream::start(Mode::Status(500)).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .circuit_breaker_threshold(3)
+        .start()
+        .await;
+    let client = reqwest::Client::new();
+
+    // While the breaker is closed the gateway relays the mock's 500; once it trips the gateway
+    // answers with its own 503. Failures are recorded in `logging`, slightly after the response
+    // goes out, so poll until the trip is observed instead of counting requests.
+    let (http, base) = (client.clone(), gw.url());
+    let until_shed = move || {
+        let (http, base) = (http.clone(), base.clone());
+        async move { post_status(&http, &base, "sk-byo-test", body_for("gpt-4o")).await }
+    };
+    wait_for_status(503, until_shed).await;
+
+    // The trip shows up as circuit_open rejections: requests were shed before the upstream.
+    let exposition = gw.metrics().await;
+    let shed = parse_metric(&exposition, "ai_rejections_total", "circuit_open");
+    assert!(
+        shed >= 1.0,
+        "expected ai_rejections_total{{reason=\"circuit_open\"}} >= 1 after the breaker tripped"
+    );
+}
+
+#[tokio::test]
+async fn circuit_breaker_does_not_trip_on_429() {
+    // A 429 means a *healthy* provider is throttling our pool key; the rate limiter and the
+    // client's Retry-After deal with that, NOT the breaker. A 429 storm must therefore never open
+    // the circuit: every request is relayed (429) and reaches the upstream, none is shed (503).
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(1);
+    let mock = MockUpstream::start(Mode::Status(429)).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .circuit_breaker_threshold(3)
+        // Keep the BYO rate limiter out of the way — every request should reach the upstream.
+        .byo_rate_limit_rps(0)
+        .start()
+        .await;
+    let client = reqwest::Client::new();
+    let base = gw.url();
+
+    // Warm up until the gateway is serving and relaying the mock's 429 (the readiness pattern
+    // of the other e2e tests), so the first counted request does not race gateway startup.
+    let (http, url) = (client.clone(), base.clone());
+    let relayed = move || {
+        let (http, url) = (http.clone(), url.clone());
+        async move { post_status(&http, &url, "sk-byo-test", body_for("gpt-4o")).await }
+    };
+    wait_for_status(429, relayed).await;
+
+    // Ten requests, well past the failure threshold of 3: each must come back as the relayed
+    // 429 and never as the breaker's 503.
+    for _ in 0..10 {
+        let status = post_status(&client, &base, "sk-byo-test", body_for("gpt-4o")).await;
+        assert_eq!(status, 429);
+    }
+
+    // No circuit_open rejection was ever recorded.
+    let exposition = gw.metrics().await;
+    let shed = parse_metric(&exposition, "ai_rejections_total", "circuit_open");
+    assert_eq!(shed, 0.0, "a 429 storm must not open the circuit breaker");
+    // The warm-up requests count towards `hits` as well, so this is a lower bound on upstream
+    // traffic rather than an exact tally of the ten requests above.
+    let hits = mock.hits();
+    assert!(
+        hits >= 10,
+        "every request should have reached the upstream (got {} hits)",
+        hits
+    );
+}
diff --git a/tests/smoke.rs b/tests/smoke.rs
new file mode 100644
index 0000000..622e56a
--- /dev/null
+++ b/tests/smoke.rs
@@ -0,0 +1,251 @@
+//! Live smoke tests against **real** providers — the proof docs and the mock can't give:
+//! a real TLS/SNI handshake to the provider host, the base-path rewrite landing on a real mount
+//! (200, not 404), the **managed** path (verify → deny-check → pool-key swap), and a real
+//! (non-canned) response body.
+//!
+//! These exercise the **production** path, not BYO: the test generates an Ed25519 keypair, configures
+//! the *real* provider key (from the env var) as the gateway's pool key, mints a `bai_…` virtual key,
+//! and sends that. So the gateway verifies the virtual key, runs the deny-set check, and swaps in the
+//! real provider key before forwarding — the same flow a real managed tenant takes. The real key only
+//! ever lives in the gateway's config; the client presents the minted virtual key.
+//!
+//! Two safety layers so this never runs — or bills — by accident:
+//!   1. Every test is `#[ignore]`, so a plain `cargo test` skips the whole file.
+//!   2. When explicitly run, each test still **skips** (early-returns) unless its provider's API
+//!      key env var is set — so you only ever hit the providers you have keys for.
+//!
+//! Run them:
+//!   ANTHROPIC_API_KEY=sk-ant-… mise run test:smoke
+//!   # or directly:
+//!   ANTHROPIC_API_KEY=sk-ant-… cargo test -p beyond-ai --test smoke -- --ignored --nocapture
+//!
+//! Model ids are the cheapest small model per provider as of 2026-05; adjust if a provider retires
+//! one (a model-not-found is a stale id here, not a gateway bug).
+
+// Test target: `.unwrap()`/`.expect()`/`panic!` are assertions, not production code. See e2e.rs.
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+
+mod common;
+
+use beyond_ai::key::{VirtualKey, mint};
+use common::*;
+
+fn env_key(var: &str) -> Option<String> {
+    let value = std::env::var(var).ok()?; // unset or non-UTF-8 → `None` (the test skips)
+    (!value.trim().is_empty()).then_some(value) // a blank value counts as unset
+}
+
+/// Starts a gateway pointed at the **real** provider hosts over TLS. `provider`'s pool key is
+/// the caller's real key and a signing public key is installed, so the virtual key minted here
+/// (tenant 1 / vpc 1) verifies and is swapped for the real key. Returns the gateway together
+/// with that `bai_…` key, to be presented by the client. (`nats` backs the deny-set, empty here.)
+async fn managed_gateway(nats: &Nats, provider: &str, real_key: &str) -> (Gateway, String) {
+    let (pubkey, sk) = test_keypair(7);
+    let verify_key = b64(&pubkey);
+    // The argument slot that carries the mock's authority in the e2e tests gets a placeholder
+    // here; presumably `.real_upstreams()` is what points the gateway at the live hosts.
+    let gw = Gateway::builder(nats.port, "unused", &verify_key)
+        .real_upstreams()
+        .pool_key(provider, real_key)
+        .start()
+        .await;
+
+    let tenant = VirtualKey {
+        tenant_id: 1,
+        vpc_id: 1,
+    };
+    (gw, mint(&tenant, 1, &sk))
+}
+
+/// Sends one managed chat-completions request for an OpenAI-wire `provider` through the gateway
+/// and asserts a real 2xx with a `choices` body. `chat_path` is the whole gateway path,
+/// `/{provider}/{native-base}/chat/completions`: selector first, then the provider's own path.
+async fn smoke_openai_wire(provider: &str, key_env: &str, model: &str, chat_path: &str) {
+    let Some(key) = env_key(key_env) else {
+        eprintln!("smoke[{provider}]: {key_env} unset — skipping");
+        return;
+    };
+    let nats = Nats::start().await;
+    let (gw, vkey) = managed_gateway(&nats, provider, &key).await;
+    let client = reqwest::Client::new();
+    let endpoint = format!("{}{chat_path}", gw.url());
+    let body = format!(
+        r#"{{"model":"{model}","max_tokens":16,"messages":[{{"role":"user","content":"Reply with the single word: ping"}}]}}"#
+    );
+    let resp = client
+        .post(endpoint)
+        .header("authorization", format!("Bearer {vkey}"))
+        .header("content-type", "application/json")
+        .body(body)
+        .send()
+        .await
+        .expect("request to gateway");
+    let status = resp.status();
+    let text = resp.text().await.unwrap_or_default();
+    assert!(
+        status.is_success(),
+        "smoke[{provider}] model={model} path={chat_path}: expected 2xx, got {status}.\n\
+         404 ⇒ wrong native path / provider segment; 401 ⇒ pool-key swap/verify; 403 ⇒ deny-set; \
+         a model error ⇒ stale model id. body: {text}"
+    );
+    assert!(
+        text.contains("\"choices\""),
+        "smoke[{provider}]: {status} but no `choices` in body: {text}"
+    );
+    eprintln!("smoke[{provider}]: OK ({status}) — verified, swapped, real 2xx");
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_anthropic() {
+    let Some(key) = env_key("ANTHROPIC_API_KEY") else {
+        eprintln!("smoke[anthropic]: ANTHROPIC_API_KEY unset — skipping");
+        return;
+    };
+    let nats = Nats::start().await;
+    let (gw, vkey) = managed_gateway(&nats, "anthropic", &key).await;
+    let client = reqwest::Client::new();
+
+    // `/anthropic/v1/messages` selects provider `anthropic` by its first path segment, which is
+    // stripped so upstream sees `/v1/messages`. The minted virtual key travels in `x-api-key`
+    // (the Anthropic SDK's header); the gateway verifies it and swaps in the real key, again in
+    // `x-api-key` rather than Bearer. The required `anthropic-version` header passes through.
+    // This is the *only* test of the x-api-key scheme over real TLS via the full managed path.
+    let endpoint = format!("{}/anthropic/v1/messages", gw.url());
+    let body = r#"{"model":"claude-haiku-4-5","max_tokens":16,"messages":[{"role":"user","content":"Reply with the single word: ping"}]}"#;
+    let resp = client
+        .post(endpoint)
+        .header("x-api-key", &vkey)
+        .header("anthropic-version", "2023-06-01")
+        .header("content-type", "application/json")
+        .body(body)
+        .send()
+        .await
+        .expect("request to gateway");
+    let status = resp.status();
+    let text = resp.text().await.unwrap_or_default();
+    assert!(
+        status.is_success(),
+        "smoke[anthropic]: expected 2xx, got {status}. body: {text}"
+    );
+    assert!(
+        text.contains("\"content\""),
+        "smoke[anthropic]: {status} but no `content` in body: {text}"
+    );
+    eprintln!("smoke[anthropic]: OK ({status}) — verified, swapped to x-api-key, real 2xx");
+}
+
+// --- OpenAI-wire providers. Same code path; testing more than one confirms each host/base-path/auth
+// row in `route::KNOWN_PROVIDERS` against the real endpoint. ---
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_openai() {
+    // Baseline OpenAI-wire case: selector `/openai`, then the native `/v1` path. Skips unless
+    // OPENAI_API_KEY is set.
+    let provider = "openai";
+    let key_env = "OPENAI_API_KEY";
+    let model = "gpt-4o-mini";
+    let chat_path = "/openai/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_groq() {
+    // Groq mounts its API under `/openai/v1`, so the client sends `/groq/openai/v1/...`; the
+    // gateway strips `/groq` and forwards the rest verbatim. This is the highest-value
+    // native-path case that is not plain `/v1`.
+    // Skips unless GROQ_API_KEY is set.
+    let provider = "groq";
+    let key_env = "GROQ_API_KEY";
+    let model = "llama-3.1-8b-instant";
+    let chat_path = "/groq/openai/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_fireworks() {
+    // Fireworks mounts its API under `/inference/v1`, so the client sends
+    // `/fireworks/inference/v1/...`.
+    // Skips unless FIREWORKS_API_KEY is set.
+    let provider = "fireworks";
+    let key_env = "FIREWORKS_API_KEY";
+    let model = "accounts/fireworks/models/llama-v3p1-8b-instruct";
+    let chat_path = "/fireworks/inference/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_openrouter() {
+    // OpenRouter mounts its API under `/api/v1`, so the client sends
+    // `/openrouter/api/v1/...`.
+    // Skips unless OPENROUTER_API_KEY is set.
+    let provider = "openrouter";
+    let key_env = "OPENROUTER_API_KEY";
+    let model = "openai/gpt-4o-mini";
+    let chat_path = "/openrouter/api/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_deepseek() {
+    // DeepSeek over the OpenAI wire: selector `/deepseek`, then the native `/v1` path.
+    // Skips unless DEEPSEEK_API_KEY is set.
+    let provider = "deepseek";
+    let key_env = "DEEPSEEK_API_KEY";
+    let model = "deepseek-chat";
+    let chat_path = "/deepseek/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_together() {
+    // Together over the OpenAI wire: selector `/together`, then the native `/v1` path.
+    // Skips unless TOGETHER_API_KEY is set.
+    let provider = "together";
+    let key_env = "TOGETHER_API_KEY";
+    let model = "meta-llama/Llama-3.1-8B-Instruct-Turbo";
+    let chat_path = "/together/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_cerebras() {
+    // Cerebras over the OpenAI wire: selector `/cerebras`, then the native `/v1` path.
+    // Skips unless CEREBRAS_API_KEY is set.
+    let provider = "cerebras";
+    let key_env = "CEREBRAS_API_KEY";
+    let model = "llama3.1-8b";
+    let chat_path = "/cerebras/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_mistral() {
+    // Mistral over the OpenAI wire: selector `/mistral`, then the native `/v1` path.
+    // Skips unless MISTRAL_API_KEY is set.
+    let provider = "mistral";
+    let key_env = "MISTRAL_API_KEY";
+    let model = "mistral-small-latest";
+    let chat_path = "/mistral/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}
+
+#[tokio::test]
+#[ignore = "live provider smoke; run via `mise run test:smoke` with API keys set"]
+async fn smoke_xai() {
+    // xAI over the OpenAI wire: selector `/xai`, then the native `/v1` path.
+    // Skips unless XAI_API_KEY is set.
+    let provider = "xai";
+    let key_env = "XAI_API_KEY";
+    let model = "grok-3-mini";
+    let chat_path = "/xai/v1/chat/completions";
+    smoke_openai_wire(provider, key_env, model, chat_path).await;
+}