diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..ebb111d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +target +.git +dist +*.md +.env +.env.* +.claude +.github diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c391b9a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,77 @@ +# syntax=docker/dockerfile:1.7-labs +# +# Multi-stage build for the Beyond AI gateway (`beyond-ai`). +# +# Built from this crate's root (it's a single standalone crate — no workspace, +# all deps come from crates.io): +# +# docker build -t beyond-ai . +# +# We use cargo-chef to cache the (heavy: pingora + tokio) dependency build in a +# layer that's keyed only on the dependency graph, so source-only changes skip +# the slow dep compile. + +# Latest stable 1.x. The crate's MSRV is 1.85, but cargo-chef's own build pulls +# deps that need a newer rustc, so the image toolchain must lead the MSRV. +ARG RUST_VERSION=1 + +# --------------------------------------------------------------------------- +# Stage 1: chef — base with cargo-chef installed. +# --------------------------------------------------------------------------- +FROM rust:${RUST_VERSION}-bookworm AS chef +# cmake builds the bundled zlib-ng (libz-ng-sys, pulled in by pingora); the base +# rust image ships a C/C++ toolchain but not cmake. +RUN apt-get update && apt-get install -y --no-install-recommends cmake \ + && rm -rf /var/lib/apt/lists/* +RUN cargo install cargo-chef --locked --version ^0.1 +WORKDIR /app + +# --------------------------------------------------------------------------- +# Stage 2: planner — compute a dependency-only recipe. +# --------------------------------------------------------------------------- +FROM chef AS planner +COPY . . +RUN cargo chef prepare --recipe-path recipe.json + +# --------------------------------------------------------------------------- +# Stage 3: builder — cook dependencies from the recipe, then build the binary. +# --------------------------------------------------------------------------- +FROM chef AS builder +COPY --from=planner /app/recipe.json recipe.json +# Cook only dependencies — cached until the recipe (the dependency graph) +# changes. Cache mounts keep the cargo registry + git db warm across builds. +# rustls uses ring, which builds C/asm with the toolchain already in the image; +# no extra apt packages are needed (pure-rustls, no OpenSSL/protobuf). +RUN --mount=type=cache,target=/usr/local/cargo/registry \ + --mount=type=cache,target=/usr/local/cargo/git \ + cargo chef cook --release --recipe-path recipe.json + +# Now copy the full source and build just the gateway binary. +COPY . . +RUN --mount=type=cache,target=/usr/local/cargo/registry \ + --mount=type=cache,target=/usr/local/cargo/git \ + --mount=type=cache,target=/app/target \ + cargo build --release --bin beyond-ai \ + && cp /app/target/release/beyond-ai /usr/local/bin/beyond-ai + +# --------------------------------------------------------------------------- +# Stage 4: runtime — minimal Debian slim with just the binary and the CA +# certificates needed to verify outbound TLS to the LLM providers. +# --------------------------------------------------------------------------- +FROM debian:bookworm-slim AS runtime +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Run as a non-root user. +RUN useradd --system --uid 10001 --no-create-home --shell /usr/sbin/nologin beyond +USER beyond + +COPY --from=builder /usr/local/bin/beyond-ai /usr/local/bin/beyond-ai + +# 8080: data-plane listener (client requests). 9090: admin — /metrics +# (Prometheus), /livez, /readyz. Both default to 0.0.0.0; override via the +# mounted config or the AI_LISTEN / AI_METRICS_LISTEN env vars. +EXPOSE 8080/tcp 9090/tcp + +ENTRYPOINT ["/usr/local/bin/beyond-ai"] diff --git a/src/deny.rs b/src/deny.rs index af9964f..135841c 100644 --- a/src/deny.rs +++ b/src/deny.rs @@ -151,5 +151,9 @@ mod tests { fn spend_is_402_fraud_is_403() { assert_eq!(DenyReason::Spend.http_status(), 402); assert_eq!(DenyReason::Fraud.http_status(), 403); + // Unknown is fail-safe: an unrecognized reason still denies, and maps to 403 like fraud + // (not 402) — so a control-plane reason we don't yet parse can't be mistaken for a billing + // block or, worse, leak through as an allow. + assert_eq!(DenyReason::Unknown.http_status(), 403); } } diff --git a/src/proxy.rs b/src/proxy.rs index 9083ea7..9b77e18 100644 --- a/src/proxy.rs +++ b/src/proxy.rs @@ -940,4 +940,33 @@ mod tests { let ok = "a".repeat(MAX_MODEL_LEN); assert_eq!(sanitize_model(ok.clone()), ok); } + + #[test] + fn dialect_for_path_selects_anthropic_only_for_messages() { + // The dialect drives usage parsing *and* stream injection: misclassifying an Anthropic + // `/v1/messages` request as OpenAI mis-meters its tokens. The rule is a `/v1/messages` + // prefix ⇒ Anthropic; everything else (chat completions, embeddings, the bare root) is + // OpenAI-dialect. This locks that mapping so a refactor can't silently flip it. + assert_eq!(dialect_for_path("/v1/messages"), Dialect::Anthropic); + assert_eq!(dialect_for_path("/v1/messages/batches"), Dialect::Anthropic); + assert_eq!(dialect_for_path("/v1/chat/completions"), Dialect::OpenAI); + assert_eq!(dialect_for_path("/v1/embeddings"), Dialect::OpenAI); + assert_eq!(dialect_for_path("/"), Dialect::OpenAI); + } + + #[test] + fn is_streamable_path_matches_generation_suffixes_across_prefixes() { + // Only chat-completions / responses get buffered for `stream_options.include_usage` + // injection. The check is by *suffix* so it holds whatever mount prefix the provider uses; + // a mismatch here either skips injection on a streamable path (lost usage) or needlessly + // buffers a non-streaming one. + assert!(is_streamable_path("/v1/chat/completions")); + assert!(is_streamable_path("/openai/v1/chat/completions")); + assert!(is_streamable_path("/inference/v1/chat/completions")); + assert!(is_streamable_path("/v1/responses")); + // Non-streaming endpoints must not be buffered. + assert!(!is_streamable_path("/v1/embeddings")); + assert!(!is_streamable_path("/v1/messages")); + assert!(!is_streamable_path("/v1/models")); + } } diff --git a/src/usage.rs b/src/usage.rs index 22443a0..7c97a37 100644 --- a/src/usage.rs +++ b/src/usage.rs @@ -263,7 +263,44 @@ mod tests { #[test] fn no_usage_returns_none() { - assert!(openai_stream(b"data: {\"choices\":[]}\n\n").is_none()); - assert!(anthropic_body(b"{}").map(|u| u.input_tokens).unwrap_or(0) == 0); + // Absent `usage` and unparseable bodies must both meter as `None` — never a silent zero-token + // row that bills nothing while *looking* like a successful meter. A provider dropping `usage` + // (an error 200, a wire-version change) or returning non-JSON must surface as "no fact", which + // the proxy logs/alerts on, rather than a phantom 0-token success. + + // --- non-streaming bodies --- + assert!( + openai_body(br#"{"choices":[{"message":{"content":"hi"}}]}"#).is_none(), + "openai body without a `usage` block has nothing to meter" + ); + assert!( + openai_body(b"not json at all").is_none(), + "malformed openai body must not panic or meter zeros" + ); + assert!( + anthropic_body(br#"{"content":[{"type":"text","text":"hi"}]}"#).is_none(), + "anthropic body without a `usage` block has nothing to meter" + ); + assert!( + anthropic_body(b"{ broken").is_none(), + "malformed anthropic body must not panic or meter zeros" + ); + + // --- streaming: well-formed SSE that simply never carries a usage event --- + assert!( + openai_stream( + b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\ndata: [DONE]\n\n" + ) + .is_none(), + "an openai stream with content but no usage chunk meters nothing" + ); + assert!( + anthropic_stream( + b"event: content_block_delta\n\ + data: {\"type\":\"content_block_delta\",\"delta\":{\"text\":\"hi\"}}\n\n" + ) + .is_none(), + "an anthropic stream with no usage-bearing event meters nothing" + ); } } diff --git a/tests/e2e.rs b/tests/e2e.rs index a12207d..5409a13 100644 --- a/tests/e2e.rs +++ b/tests/e2e.rs @@ -505,6 +505,64 @@ async fn per_credential_rate_limit_returns_429() { wait_for_metric(&gw, "ai_rejections_total", "rate_limit", 1.0).await; } +#[tokio::test] +async fn byo_global_rate_limit_caps_distinct_tokens() { + // The global BYO aggregate cap is the primary egress-IP protection: a flood of *distinct* junk + // BYO tokens (each its own per-credential bucket, so the per-credential tier never trips) would + // otherwise open junk-auth connections to providers from our egress IPs and get them banned. + // This proves the shared bucket bounds that aggregate. Per-credential is disabled so the only + // ceiling in play is the global BYO one — and a regression that inverts the `!managed` guard or + // skips the tier would slip past every other test. + let nats = Nats::start().await; + let (pubkey, _sk) = test_keypair(43); + let mock = MockUpstream::start(Mode::Json).await; + let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey)) + .rate_limit_rps(0) // isolate: per-credential tier off + .byo_rate_limit_rps(3) // global BYO ceiling + .start() + .await; + let client = reqwest::Client::new(); + + // Warm with a *managed* path is pointless here (managed is exempt); warm with a distinct BYO + // token whose single hit can't itself exhaust a 3-rps bucket mid-readiness. + { + let (c, u) = (client.clone(), gw.url()); + wait_for_status(200, move || { + let (c, u) = (c.clone(), u.clone()); + async move { post_status(&c, &u, "sk-byo-warmup", body_for("gpt-4o")).await } + }) + .await; + } + + // Each request uses a *different* BYO token, so per-credential keying would never trip — only the + // shared BYO bucket can. The first few are served; once the aggregate ceiling is crossed the rest + // are throttled. + let mut saw_200 = false; + let mut saw_429 = false; + for i in 0..50 { + match post_status( + &client, + &gw.url(), + &format!("sk-byo-distinct-{i}"), + body_for("gpt-4o"), + ) + .await + { + 200 => saw_200 = true, + 429 => saw_429 = true, + other => panic!("unexpected status under BYO global rate limit: {other}"), + } + } + assert!(saw_200, "requests under the BYO ceiling must be served"); + assert!( + saw_429, + "a flood of distinct BYO tokens past the global ceiling must yield 429" + ); + // The dedicated BYO-global reason label — distinct from the per-credential `rate_limit` — must be + // what fired, so an operator can tell which knob tripped. + wait_for_metric(&gw, "ai_rejections_total", "rate_limit_byo_global", 1.0).await; +} + #[tokio::test] async fn managed_key_via_x_api_key_header_is_accepted() { // Anthropic SDKs present the key in `x-api-key`, not `Authorization: Bearer`. A managed virtual