From d636756c2c03260d19bd023dcc79f142caf4ad93 Mon Sep 17 00:00:00 2001 From: intech Date: Sun, 19 Apr 2026 04:31:00 +0400 Subject: [PATCH 1/2] feat(performance-test-server): add OTel OTLP export overhead scenario Adds a 6th server configuration (port 8085) with full interceptor chain and a real OTLP/gRPC exporter pointed at a local OTel collector, plus a k6 scenario that measures p50/p95/p99 latency and throughput delta between the baseline (port 8081) and otel-export configurations under 100 VUs sustained load. Motivation: the existing interceptor-overhead scenario runs the OTel interceptor with the provider uninitialized (no-op spans/metrics), so it cannot answer "what is the CPU cost of actually shipping spans in production". This closes that gap and gives us an end-to-end harness for validating future @opentelemetry/otlp-transformer bumps (R1.2). The OTel scenario is opt-in: - port 8085 is bound only when OTEL_EXPORT_ENABLED=1 - the otel-collector service lives under the "otel-export" docker-compose profile, so default compose runs are unaffected - all other servers/ports and existing scenarios work unchanged Files: - src/index.ts: +6th server, conditional on OTEL_EXPORT_ENABLED, eager initProvider + graceful shutdownProvider - k6/otel-export-overhead.js: new scenario (100 VUs, ~5 min, shuffled baseline vs otel-export calls, JSON summary via K6_OUT) - docker-compose.yml: otel-collector service (profile: otel-export), k6-otel-export runner, OTEL_* env var surface with production-ish BatchSpanProcessor defaults - otel-collector-config.yaml: OTLP gRPC+HTTP receivers -> debug exporter (the goal is export-side CPU measurement, not backend write throughput) - Dockerfile: EXPOSE 8085 - README.md: scenario docs, env-var reference, expected overhead table, links to upstream issues (#6221, #6225, #6390, #6570) Smoke test: 10 requests to :8085 verified spans arrive at collector with correct rpc.system / rpc.service / rpc.method attributes. 
Quick 20-sample comparison on the same host: baseline p95 ~1.87ms, otel-export p95 ~2.25ms (+20%), in line with the expected-overhead range documented in the README. pnpm-lock.yaml is updated to resolve pre-existing drift between manifest (1.0.0-rc.10) and lockfile (1.0.0-rc.7) introduced by the earlier rc.10 bump; install now succeeds with --frozen-lockfile. Co-Authored-By: Claude Opus 4.7 (1M context) --- performance-test-server/Dockerfile | 5 +- performance-test-server/README.md | 66 +++++- performance-test-server/docker-compose.yml | 59 +++++ .../k6/otel-export-overhead.js | 221 ++++++++++++++++++ performance-test-server/k6/results/.gitignore | 2 + .../otel-collector-config.yaml | 55 +++++ performance-test-server/pnpm-lock.yaml | 38 +-- performance-test-server/src/index.ts | 110 +++++++-- 8 files changed, 513 insertions(+), 43 deletions(-) create mode 100644 performance-test-server/k6/otel-export-overhead.js create mode 100644 performance-test-server/k6/results/.gitignore create mode 100644 performance-test-server/otel-collector-config.yaml diff --git a/performance-test-server/Dockerfile b/performance-test-server/Dockerfile index 4d2dd4a..70bb7cc 100644 --- a/performance-test-server/Dockerfile +++ b/performance-test-server/Dockerfile @@ -71,7 +71,8 @@ ENV NODE_ENV=production ENV TLS_DIR=/app/certs # Ports: 8080 (full chain), 8081 (baseline), 8082 (validation), -# 8083 (logger), 8084 (otel) -EXPOSE 8080 8081 8082 8083 8084 +# 8083 (logger), 8084 (otel no-op), 8085 (otel real OTLP export — +# only bound when OTEL_EXPORT_ENABLED=1) +EXPOSE 8080 8081 8082 8083 8084 8085 CMD ["node", "src/index.ts"] diff --git a/performance-test-server/README.md b/performance-test-server/README.md index 764bbfb..0d99753 100644 --- a/performance-test-server/README.md +++ b/performance-test-server/README.md @@ -4,17 +4,18 @@ Dedicated server for k6 performance benchmarking with configurable interceptor c ## Purpose -This server runs **5 parallel instances** on different ports, each 
with a different interceptor configuration: +This server runs **5 parallel instances** on different ports, each with a different interceptor configuration, plus an **optional 6th instance** for measuring OTLP export overhead end-to-end: | Port | Configuration | Purpose | |------|---------------|---------| | 8081 | **Baseline** (no interceptors) | Measure baseline latency without any overhead | | 8082 | **Validation only** | Measure validation interceptor overhead | | 8083 | **Logger only** | Measure logger interceptor overhead | -| 8084 | **Tracing only** | Measure tracing interceptor overhead | -| 8080 | **Full chain** (all interceptors) | Measure total overhead with all interceptors | +| 8084 | **Tracing only** (no-op exporter) | Measure tracing interceptor overhead | +| 8080 | **Full chain** (all interceptors, no-op exporter) | Measure total overhead with all interceptors | +| 8085 | **OTel export** — full chain + real OTLP exporter (opt-in via `OTEL_EXPORT_ENABLED=1`) | Measure end-to-end cost of the stock `@connectum/otel` export path (BatchSpanProcessor + otlp-transformer + OTLP/gRPC) | -This allows k6 benchmarks to accurately measure the overhead introduced by each interceptor. +This allows k6 benchmarks to accurately measure the overhead introduced by each interceptor, and — with the OTel export scenario — the CPU cost of actually shipping spans over the wire. ## Requirements @@ -94,10 +95,39 @@ Stress-tests the full-chain configuration with 100 concurrent VUs for 7 minutes: docker compose --profile load up k6-basic-load --build --abort-on-container-exit ``` +### OTel OTLP Export Overhead + +Measures the p50/p95/p99 latency delta and throughput delta between the baseline (port 8081) and the full-chain-with-real-OTLP-exporter configuration (port 8085). 
Runs for ~5 minutes at 100 VUs: + +```bash +OTEL_EXPORT_ENABLED=1 docker compose --profile otel-export up \ + --build --abort-on-container-exit +``` + +What this measures that the `k6-interceptor-overhead` scenario does *not*: + +- Real `BatchSpanProcessor` + `@opentelemetry/otlp-transformer` serialization cost per exported span +- OTLP/gRPC wire transport cost (`@grpc/grpc-js`) +- End-to-end CPU pressure of the full OTel export pipeline under sustained load + +The collector runs locally in Docker and drops all telemetry via a `debug` exporter — the goal is export-side CPU profiling, not backend write throughput. See `otel-collector-config.yaml`. + +k6 streams machine-readable JSON metric data points (one JSON object per line via `K6_OUT=json=...`, not an aggregated end-of-test summary) to `k6/results/otel-export-overhead.json` (gitignored) for CI / bench-tracking tooling. + +**Expected overhead range** (informational — actual numbers depend on the installed `@opentelemetry/otlp-transformer` version): + +| Metric | Baseline (8081) | OTel export (8085) | Overhead | Relative | +|--------|-----------------|--------------------|----------|----------| +| p50 latency | ~1–3 ms | ~1.5–4 ms | +0.5–1 ms | 1.2×–1.5× | +| p95 latency | ~2–5 ms | ~3–8 ms | +1–3 ms | 1.3×–2× | +| p99 latency | ~5–10 ms | ~8–20 ms | +3–10 ms | 1.5×–2.5× | + +A **relative overhead >1.5×** on p95 — or any sudden jump from a previous run — is a signal to investigate the `@opentelemetry/otlp-transformer` version. See Connectum recommendations R1.2 and upstream issues [#6221](https://github.com/open-telemetry/opentelemetry-js/issues/6221), PR [#6225](https://github.com/open-telemetry/opentelemetry-js/pull/6225), PR [#6390](https://github.com/open-telemetry/opentelemetry-js/pull/6390), issue [#6570](https://github.com/open-telemetry/opentelemetry-js/issues/6570). 
+ ### Cleanup ```bash -docker compose --profile load down --rmi local -v +docker compose --profile load --profile otel-export down --rmi local -v ``` ### Environment Variables @@ -106,14 +136,30 @@ k6 scripts accept the following environment variables (set via `docker-compose.y | Variable | Default | Used by | |----------|---------|---------| -| `PROTOCOL` | `https` | interceptor-overhead | -| `BASE_HOST` | `server` | interceptor-overhead | +| `PROTOCOL` | `https` | interceptor-overhead, otel-export-overhead | +| `BASE_HOST` | `server` | interceptor-overhead, otel-export-overhead | | `BASE_URL` | `https://server:8080` | basic-load | -| `BASELINE_PORT` | `8081` | interceptor-overhead | +| `BASELINE_PORT` | `8081` | interceptor-overhead, otel-export-overhead | | `VALIDATION_PORT` | `8082` | interceptor-overhead | | `LOGGER_PORT` | `8083` | interceptor-overhead | | `TRACING_PORT` | `8084` | interceptor-overhead | | `FULLCHAIN_PORT` | `8080` | interceptor-overhead | +| `OTEL_EXPORT_PORT` | `8085` | otel-export-overhead | + +The server-side OTel export scenario (port 8085) is controlled via `OTEL_*` env vars: the standard OpenTelemetry SDK variables plus the benchmark-specific `OTEL_EXPORT_ENABLED` switch. 
Defaults are set in `docker-compose.yml`; override by exporting before `docker compose up`: + +| Variable | Default | Meaning | +|----------|---------|---------| +| `OTEL_EXPORT_ENABLED` | `0` | Set to `1` to bind port 8085 and initialize the OTel provider | +| `OTEL_SERVICE_NAME` | `performance-test-server` | Resource `service.name` attribute | +| `OTEL_TRACES_EXPORTER` | `otlp/grpc` | `console`, `otlp/http`, `otlp/grpc`, or `none` | +| `OTEL_METRICS_EXPORTER` | `otlp/grpc` | same values as above | +| `OTEL_LOGS_EXPORTER` | `none` | same values as above | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://otel-collector:4317` | Collector endpoint | +| `OTEL_BSP_MAX_EXPORT_BATCH_SIZE` | `512` | BatchSpanProcessor batch size | +| `OTEL_BSP_MAX_QUEUE_SIZE` | `2048` | BatchSpanProcessor queue size | +| `OTEL_BSP_SCHEDULE_DELAY` | `1000` | BatchSpanProcessor flush interval (ms) | +| `OTEL_BSP_EXPORT_TIMEOUT` | `10000` | Single export attempt timeout (ms) | ## Testing @@ -136,6 +182,9 @@ curl http://localhost:8084/grpc.health.v1.Health/Check # Full chain curl http://localhost:8080/grpc.health.v1.Health/Check + +# OTel export (only when OTEL_EXPORT_ENABLED=1) +curl http://localhost:8085/grpc.health.v1.Health/Check ``` ### Manual Test @@ -261,6 +310,7 @@ Benchmark scripts are located in the `k6/` directory: - `k6/interceptor-overhead.js` - Uses **all ports** to compare interceptor overhead - `k6/basic-load.js` - Uses port 8080 (full chain) with ramping VUs +- `k6/otel-export-overhead.js` - Uses **port 8081 (baseline) + port 8085 (OTel export)** to measure end-to-end OTLP export cost under 100 VUs sustained load ## Troubleshooting diff --git a/performance-test-server/docker-compose.yml b/performance-test-server/docker-compose.yml index 24c7929..20bc297 100644 --- a/performance-test-server/docker-compose.yml +++ b/performance-test-server/docker-compose.yml @@ -1,6 +1,22 @@ services: server: build: . + # OTel export scenario (port 8085) is opt-in. 
When OTEL_EXPORT_ENABLED=1 + # the server initializes the @connectum/otel provider with env-driven OTLP + # exporter settings and binds the extra port. All other scenarios work + # without these env vars; the provider stays uninitialized. + environment: + OTEL_EXPORT_ENABLED: "${OTEL_EXPORT_ENABLED:-0}" + OTEL_SERVICE_NAME: "${OTEL_SERVICE_NAME:-performance-test-server}" + OTEL_TRACES_EXPORTER: "${OTEL_TRACES_EXPORTER:-otlp/grpc}" + OTEL_METRICS_EXPORTER: "${OTEL_METRICS_EXPORTER:-otlp/grpc}" + OTEL_LOGS_EXPORTER: "${OTEL_LOGS_EXPORTER:-none}" + OTEL_EXPORTER_OTLP_ENDPOINT: "${OTEL_EXPORTER_OTLP_ENDPOINT:-http://otel-collector:4317}" + # BatchSpanProcessor tuning — realistic production-ish defaults + OTEL_BSP_MAX_EXPORT_BATCH_SIZE: "${OTEL_BSP_MAX_EXPORT_BATCH_SIZE:-512}" + OTEL_BSP_MAX_QUEUE_SIZE: "${OTEL_BSP_MAX_QUEUE_SIZE:-2048}" + OTEL_BSP_SCHEDULE_DELAY: "${OTEL_BSP_SCHEDULE_DELAY:-1000}" + OTEL_BSP_EXPORT_TIMEOUT: "${OTEL_BSP_EXPORT_TIMEOUT:-10000}" healthcheck: test: > node -e "const h=require('node:http2'),c=h.connect('https://localhost:8080', @@ -15,6 +31,25 @@ services: retries: 10 start_period: 5s + # ========================================================================= + # OpenTelemetry Collector (only started for the otel-export profile) + # ========================================================================= + # Accepts OTLP/gRPC on :4317 and OTLP/HTTP on :4318, then drops everything + # via a debug exporter. The goal is to measure export-side CPU cost, not + # backend write throughput — see otel-collector-config.yaml for rationale. 
+ otel-collector: + image: otel/opentelemetry-collector-contrib:0.120.0 + profiles: ["otel-export"] + volumes: + - ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro + command: ["--config=/etc/otelcol-contrib/config.yaml"] + healthcheck: + test: ["CMD", "/otelcol-contrib", "components"] + interval: 5s + timeout: 3s + retries: 5 + start_period: 5s + k6-interceptor-overhead: image: grafana/k6:latest volumes: @@ -36,3 +71,27 @@ services: server: { condition: service_healthy } command: run /scripts/basic-load.js profiles: ["load"] + + # ========================================================================= + # OTel OTLP export overhead scenario (profile: otel-export) + # ========================================================================= + # Measures p50/p95/p99 latency delta and throughput delta between the + # baseline (port 8081) and full-chain-with-real-OTLP-exporter (port 8085). + # Requires OTEL_EXPORT_ENABLED=1 on the server and a running otel-collector. + k6-otel-export: + image: grafana/k6:latest + volumes: + - ./k6:/scripts + - ./k6/results:/results + environment: + PROTOCOL: https + BASE_HOST: server + BASELINE_PORT: "8081" + OTEL_EXPORT_PORT: "8085" + # k6 writes a machine-readable summary here for CI/bench tracking. 
+ K6_OUT: "json=/results/otel-export-overhead.json" + depends_on: + server: { condition: service_healthy } + otel-collector: { condition: service_healthy } + command: run /scripts/otel-export-overhead.js + profiles: ["otel-export"] diff --git a/performance-test-server/k6/otel-export-overhead.js b/performance-test-server/k6/otel-export-overhead.js new file mode 100644 index 0000000..1c13f72 --- /dev/null +++ b/performance-test-server/k6/otel-export-overhead.js @@ -0,0 +1,221 @@ +/** + * OTel OTLP Export Overhead Benchmark + * + * Purpose: Measure the server-side CPU/latency cost of enabling the stock + * @connectum/otel export path (BatchSpanProcessor + otlp-transformer + * + OTLP/gRPC) under a production-like RPC workload. + * + * Why this exists (R1.3, connectum-recommendations.md): + * The existing interceptor-overhead scenario runs the OTel interceptor with + * the provider UNSET — it emits no-op spans/metrics. That's correct for + * measuring interceptor *wiring* cost, but it tells us nothing about the + * expensive parts: span serialization and OTLP export. Those only run when + * a real provider + exporter is initialized. This scenario fills that gap. + * + * Configurations under test: + * - Baseline (port 8081) — no interceptors, no OTel + * - OTel export (port 8085) — full chain + real OTLP/gRPC exporter + * + * Load profile: + * 100 VUs, ramp 30s → 4m steady → 30s ramp down = ~5 min total. + * High enough throughput that BatchSpanProcessor exports continuously. + * + * Output: + * p50/p95/p99 latency per config + * Throughput (requests/sec) per config + * Export-overhead delta printed in teardown + * JSON summary written to /results/otel-export-overhead.json when run via + * docker compose (K6_OUT env var). 
+ */ + +import { check, sleep } from "k6"; +import http from "k6/http"; +import { Rate, Trend } from "k6/metrics"; + +// ============================================================================ +// Custom Metrics +// ============================================================================ + +const baselineDuration = new Trend("baseline_no_otel", true); +const otelExportDuration = new Trend("with_otel_export", true); + +const baselineSuccess = new Rate("baseline_success"); +const otelExportSuccess = new Rate("otel_export_success"); + +// ============================================================================ +// Test Configuration +// ============================================================================ + +export const options = { + scenarios: { + sustained: { + executor: "ramping-vus", + startVUs: 0, + stages: [ + { duration: "30s", target: 100 }, // ramp up + { duration: "4m", target: 100 }, // steady load + { duration: "30s", target: 0 }, // ramp down + ], + gracefulRampDown: "10s", + }, + }, + + thresholds: { + // Both configs should stay healthy under 100 VUs. + baseline_no_otel: ["p(95)<50"], + // Stock OTel export adds BatchSpanProcessor + otlp-transformer on the + // critical path of every 1s batch flush. We set a loose threshold so + // the scenario reports instead of failing — the delta itself is the + // deliverable, not a SLA. + with_otel_export: ["p(95)<200"], + + baseline_success: ["rate>0.99"], + otel_export_success: ["rate>0.99"], + }, + + tags: { + test_type: "otel-export-overhead", + environment: "docker", + }, + + insecureSkipTLSVerify: true, + + // Compact summary — full percentiles for both custom trends. 
+ summaryTrendStats: ["avg", "min", "med", "max", "p(50)", "p(90)", "p(95)", "p(99)"], +}; + +// ============================================================================ +// Server Ports +// ============================================================================ + +const BASELINE_PORT = __ENV.BASELINE_PORT || "8081"; +const OTEL_EXPORT_PORT = __ENV.OTEL_EXPORT_PORT || "8085"; + +const BASE_HOST = __ENV.BASE_HOST || "server"; +const PROTOCOL = __ENV.PROTOCOL || "https"; +const SERVICE_PATH = "/greeter.v1.GreeterService/SayHello"; + +// ============================================================================ +// Helpers +// ============================================================================ + +function callService(port, configName) { + const payload = JSON.stringify({ + name: `OtelBench-${configName}-${__VU}-${__ITER}`, + }); + + const response = http.post(`${PROTOCOL}://${BASE_HOST}:${port}${SERVICE_PATH}`, payload, { + headers: { + "Content-Type": "application/json", + "Connect-Protocol-Version": "1", + "User-Agent": "k6-otel-export-benchmark/1.0", + }, + tags: { + name: "SayHello", + config: configName, + }, + }); + + const success = check(response, { + [`${configName}: status is 200`]: (r) => r.status === 200, + }); + + return { response, success }; +} + +// ============================================================================ +// Test Scenario +// ============================================================================ + +export default function () { + // Alternate baseline / otel-export per iteration to average out JIT/GC + // jitter. Each iteration touches both configs once, matching the + // interceptor-overhead.js pattern. 
+ const testCases = [ + { + run() { + const { response, success } = callService(BASELINE_PORT, "baseline"); + baselineDuration.add(response.timings.duration); + baselineSuccess.add(success); + }, + }, + { + run() { + const { response, success } = callService(OTEL_EXPORT_PORT, "otel_export"); + otelExportDuration.add(response.timings.duration); + otelExportSuccess.add(success); + }, + }, + ]; + + // Fisher-Yates shuffle — eliminate ordering bias. + for (let i = testCases.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + [testCases[i], testCases[j]] = [testCases[j], testCases[i]]; + } + + for (const testCase of testCases) { + testCase.run(); + } + + // Small think time to keep the offered load realistic and to give + // BatchSpanProcessor room to batch exports rather than flush per-request. + sleep(0.05); +} + +// ============================================================================ +// Setup (runs once before test) +// ============================================================================ + +export function setup() { + console.log("\n Starting OTel OTLP Export Overhead Benchmark"); + console.log(" Duration: ~5 minutes (30s ramp + 4m steady + 30s ramp down)"); + console.log(" VUs: 100"); + console.log("\n Configurations to test:"); + console.log(` 1. Baseline (no interceptors, no OTel) - :${BASELINE_PORT}`); + console.log(` 2. 
OTel export (full chain + real OTLP/gRPC) - :${OTEL_EXPORT_PORT}`); + console.log("\n Goal: measure p50/p95/p99 latency delta and throughput delta"); + console.log(" caused by the stock @connectum/otel export path."); + + const ports = [ + { port: BASELINE_PORT, name: "Baseline" }, + { port: OTEL_EXPORT_PORT, name: "OTel Export" }, + ]; + + console.log("\n Health checks:\n"); + for (const { port, name } of ports) { + const healthResponse = http.post( + `${PROTOCOL}://${BASE_HOST}:${port}/greeter.v1.GreeterService/SayHello`, + JSON.stringify({ name: "healthcheck" }), + { + headers: { + "Content-Type": "application/json", + "Connect-Protocol-Version": "1", + }, + }, + ); + if (healthResponse.status === 200) { + console.log(` OK ${name.padEnd(15)} - :${port}`); + } else { + console.error(` FAIL ${name.padEnd(15)} - :${port} (status: ${healthResponse.status})`); + throw new Error(`Health check failed for ${name} on port ${port}. ` + "Did you start the server with OTEL_EXPORT_ENABLED=1?"); + } + } + + console.log("\n"); +} + +// ============================================================================ +// Teardown (runs once after test) +// ============================================================================ + +export function teardown(_data) { + console.log("\n OTel OTLP Export Overhead Benchmark completed"); + console.log("\n Analysis:"); + console.log(" - Compute overhead = with_otel_export(p95) - baseline_no_otel(p95)"); + console.log(" - Compute relative = with_otel_export / baseline_no_otel"); + console.log(" - If relative > 1.5x, investigate otlp-transformer version"); + console.log(" (Connectum recommendations R1.2; see upstream #6221, #6390, #6570)\n"); + console.log(" JSON summary (when running under docker compose):"); + console.log(" examples/performance-test-server/k6/results/otel-export-overhead.json\n"); +} diff --git a/performance-test-server/k6/results/.gitignore b/performance-test-server/k6/results/.gitignore new file mode 100644 index 
0000000..0827618 --- /dev/null +++ b/performance-test-server/k6/results/.gitignore @@ -0,0 +1,2 @@ +*.json +!.gitignore diff --git a/performance-test-server/otel-collector-config.yaml b/performance-test-server/otel-collector-config.yaml new file mode 100644 index 0000000..df3e855 --- /dev/null +++ b/performance-test-server/otel-collector-config.yaml @@ -0,0 +1,55 @@ +# otel-collector-config.yaml +# +# OpenTelemetry Collector configuration used by the OTel export benchmark +# scenario (port 8085 in performance-test-server). +# +# The scenario's goal is to measure server-side CPU cost of the stock OTel +# export path (BatchSpanProcessor + @opentelemetry/otlp-transformer + OTLP gRPC +# transport), NOT the cost of a downstream backend. So the collector accepts +# telemetry and immediately drops it via a logging/debug exporter — no +# ClickHouse, no Prometheus, no network fan-out. +# +# If you want to visually inspect exported spans, raise the debug exporter +# verbosity to "detailed" and re-run. 
+ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 1024 + +exporters: + debug: + verbosity: basic + sampling_initial: 5 + sampling_thereafter: 1000 + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + +service: + extensions: [health_check] + telemetry: + logs: + level: warn + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [debug] + logs: + receivers: [otlp] + processors: [batch] + exporters: [debug] diff --git a/performance-test-server/pnpm-lock.yaml b/performance-test-server/pnpm-lock.yaml index 15cb5a8..bfb9db8 100644 --- a/performance-test-server/pnpm-lock.yaml +++ b/performance-test-server/pnpm-lock.yaml @@ -17,14 +17,14 @@ importers: specifier: ^2.1.1 version: 2.1.1(@bufbuild/protobuf@2.11.0) '@connectum/core': - specifier: 1.0.0-rc.7 - version: 1.0.0-rc.7 + specifier: 1.0.0-rc.10 + version: 1.0.0-rc.10 '@connectum/interceptors': - specifier: 1.0.0-rc.7 - version: 1.0.0-rc.7(@bufbuild/protovalidate@1.1.1(@bufbuild/protobuf@2.11.0)) + specifier: 1.0.0-rc.10 + version: 1.0.0-rc.10(@bufbuild/protovalidate@1.1.1(@bufbuild/protobuf@2.11.0)) '@connectum/otel': - specifier: 1.0.0-rc.7 - version: 1.0.0-rc.7(@bufbuild/protobuf@2.11.0) + specifier: 1.0.0-rc.10 + version: 1.0.0-rc.10(@bufbuild/protobuf@2.11.0) devDependencies: '@bufbuild/buf': specifier: ^1.65.0 @@ -138,17 +138,17 @@ packages: '@bufbuild/protovalidate': ^1.0.0 '@connectrpc/connect': ^2.0.3 - '@connectum/core@1.0.0-rc.7': - resolution: {integrity: sha512-1x5sthjO9yk88MTJKPwMv234/wXxisErtyvxHyD/cDJR6iyIrM31mQmp4MMgV3GlAIcNKqjSVUI3MrSbeYw6fg==} - engines: {node: '>=18.0.0'} + '@connectum/core@1.0.0-rc.10': + resolution: {integrity: sha512-IiF+wLI0f3hbMSEQefgG8M3c4Y9ZQ/wXhTcaT58EN7xfa9tdy/pr03yK0iuN5h7kNVPrIMK5nVnwRBAwWtwUOg==} + engines: {node: '>=20.0.0'} - '@connectum/interceptors@1.0.0-rc.7': 
- resolution: {integrity: sha512-D05/Otft7K56sEjmySW6+SeG/QsxbmGkkVbcxfUXib1BVUG0l0ZwKbQTG1NTE0+koqpWG4VWbywuJ8xpVfPK6A==} - engines: {node: '>=18.0.0'} + '@connectum/interceptors@1.0.0-rc.10': + resolution: {integrity: sha512-B9J7UC7W+oAU6vILBAta++lTRsPkhKizsAjUs6b9qeLDBFrjGSYOCtrHGnIlqJtdttkQL/R2G/t6Rtvkx8o+WA==} + engines: {node: '>=20.0.0'} - '@connectum/otel@1.0.0-rc.7': - resolution: {integrity: sha512-O14qZsL9LGbvy6LZkJV4s1ck5aazSmdWhJw7rAqPc6QIAUxyWJywgF9ynCctZhL4JTsqpyu3XBHZe4tzAT3tHA==} - engines: {node: '>=18.0.0'} + '@connectum/otel@1.0.0-rc.10': + resolution: {integrity: sha512-V/F8Rkakl2TVcxyddqplyPzEazSceM0mgZ1Htk1OzLBJHdXG7MFbPe2FNzYP6AV3kk9+V6f4OLgLZiF6GSAJrg==} + engines: {node: '>=20.0.0'} '@grpc/grpc-js@1.14.3': resolution: {integrity: sha512-Iq8QQQ/7X3Sac15oB6p0FmUg/klxQvXLeileoqrTRGJYLV+/9tubbr9ipz0GKHjmXVsgFPo/+W+2cA8eNcR+XA==} @@ -500,7 +500,7 @@ snapshots: '@bufbuild/protovalidate': 1.1.1(@bufbuild/protobuf@2.11.0) '@connectrpc/connect': 2.1.1(@bufbuild/protobuf@2.11.0) - '@connectum/core@1.0.0-rc.7': + '@connectum/core@1.0.0-rc.10': dependencies: '@bufbuild/protobuf': 2.11.0 '@connectrpc/connect': 2.1.1(@bufbuild/protobuf@2.11.0) @@ -508,17 +508,17 @@ snapshots: env-var: 7.5.0 zod: 4.3.6 - '@connectum/interceptors@1.0.0-rc.7(@bufbuild/protovalidate@1.1.1(@bufbuild/protobuf@2.11.0))': + '@connectum/interceptors@1.0.0-rc.10(@bufbuild/protovalidate@1.1.1(@bufbuild/protobuf@2.11.0))': dependencies: '@bufbuild/protobuf': 2.11.0 '@connectrpc/connect': 2.1.1(@bufbuild/protobuf@2.11.0) '@connectrpc/validate': 0.2.0(@bufbuild/protobuf@2.11.0)(@bufbuild/protovalidate@1.1.1(@bufbuild/protobuf@2.11.0))(@connectrpc/connect@2.1.1(@bufbuild/protobuf@2.11.0)) - '@connectum/core': 1.0.0-rc.7 + '@connectum/core': 1.0.0-rc.10 cockatiel: 3.2.1 transitivePeerDependencies: - '@bufbuild/protovalidate' - '@connectum/otel@1.0.0-rc.7(@bufbuild/protobuf@2.11.0)': + '@connectum/otel@1.0.0-rc.10(@bufbuild/protobuf@2.11.0)': dependencies: '@connectrpc/connect': 
2.1.1(@bufbuild/protobuf@2.11.0) '@opentelemetry/api': 1.9.0 diff --git a/performance-test-server/src/index.ts b/performance-test-server/src/index.ts index de0b2d2..405d831 100644 --- a/performance-test-server/src/index.ts +++ b/performance-test-server/src/index.ts @@ -3,14 +3,18 @@ * * Dedicated server for k6 performance benchmarking. * - * Runs 5 parallel servers with different interceptor configurations: + * Runs up to 6 parallel servers with different interceptor configurations: * - Port 8081: Baseline (no interceptors) * - Port 8082: Validation only * - Port 8083: Logger only - * - Port 8084: OTel (tracing + metrics) only - * - Port 8080: Full chain (all interceptors) + * - Port 8084: OTel (tracing + metrics) only (no-op exporter) + * - Port 8080: Full chain (all interceptors, no-op exporter) + * - Port 8085: OTel export — full chain + real OTLP exporter to a collector + * (enabled only when OTEL_EXPORT_ENABLED=1, which expects an OTel + * collector to be reachable; otherwise this port is skipped). * - * This allows measuring the overhead of each interceptor individually. + * This allows measuring the overhead of each interceptor individually, plus + * the end-to-end overhead of real OTLP export on port 8085. * * Uses the new createServer() API with explicit lifecycle control. 
*/ @@ -21,7 +25,7 @@ import { createDefaultInterceptors, createLoggerInterceptor, } from "@connectum/interceptors"; -import { createOtelInterceptor } from "@connectum/otel"; +import { createOtelInterceptor, initProvider, shutdownProvider } from "@connectum/otel"; import { benchmarkServiceRoutes } from "./services/benchmarkService.ts"; // Optional TLS: set TLS_DIR env var to enable HTTPS (required for HTTP/1.1 compatibility) @@ -124,26 +128,93 @@ const fullChainOptions: CreateServerOptions = { ], }; +// ============================================================================ +// Configuration 6 (OPTIONAL): OTel export — full chain + real OTLP exporter +// ============================================================================ +// +// Enabled only when OTEL_EXPORT_ENABLED=1. +// +// Uses @connectum/otel provider with env-driven OTLP/gRPC exporter pointed at +// a local OTel Collector. This measures the stock OTel-JS export path +// (BatchSpanProcessor + @opentelemetry/otlp-transformer serialization + +// @grpc/grpc-js wire), i.e. exactly what production users pay. +// +// The OTLP exporter and BatchSpanProcessor options are read from standard +// OTEL_* env vars (see @connectum/otel config.ts): +// OTEL_SERVICE_NAME, OTEL_TRACES_EXPORTER, OTEL_METRICS_EXPORTER, +// OTEL_LOGS_EXPORTER, OTEL_EXPORTER_OTLP_ENDPOINT, +// OTEL_BSP_MAX_EXPORT_BATCH_SIZE, OTEL_BSP_MAX_QUEUE_SIZE, +// OTEL_BSP_SCHEDULE_DELAY, OTEL_BSP_EXPORT_TIMEOUT. 
+ +const otelExportEnabled = process.env.OTEL_EXPORT_ENABLED === "1"; + +const otelExportOptions: CreateServerOptions = { + services: [benchmarkServiceRoutes], + port: 8085, + host: "0.0.0.0", + tls: tlsConfig, + interceptors: [ + ...createDefaultInterceptors({ + errorHandler: { + logErrors: true, + includeStackTrace: true, + }, + serializer: true, + validation: true, + }), + createLoggerInterceptor({ + level: "error", + skipHealthCheck: true, + }), + createOtelInterceptor({ + filter: ({ service }) => !service.includes("grpc.health"), + }), + ], +}; + // ============================================================================ // Start all servers // ============================================================================ -console.log("Starting 5 server configurations:\n"); +const serverCount = otelExportEnabled ? 6 : 5; +console.log(`Starting ${serverCount} server configurations:\n`); if (tlsConfig) { console.log(`TLS enabled (certs from ${process.env.TLS_DIR})\n`); } try { + // Initialize OTel provider eagerly when the export scenario is enabled, so + // that the BatchSpanProcessor and exporters are set up before the first + // request reaches the interceptor on port 8085. Without this the provider + // would still auto-init lazily on first use, but eager init fails fast if + // the collector endpoint is misconfigured. + if (otelExportEnabled) { + console.log("OTEL_EXPORT_ENABLED=1 — initializing OTLP provider"); + console.log(` OTEL_SERVICE_NAME=${process.env.OTEL_SERVICE_NAME ?? "(unset)"}`); + console.log(` OTEL_TRACES_EXPORTER=${process.env.OTEL_TRACES_EXPORTER ?? "(unset)"}`); + console.log(` OTEL_METRICS_EXPORTER=${process.env.OTEL_METRICS_EXPORTER ?? "(unset)"}`); + console.log(` OTEL_EXPORTER_OTLP_ENDPOINT=${process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "(unset)"}\n`); + initProvider({ + serviceName: process.env.OTEL_SERVICE_NAME ?? 
"performance-test-server", + }); + } + // createServer() is synchronous - creates unstarted server instances - const servers: Server[] = [ - createServer(baselineOptions), - createServer(validationOptions), - createServer(loggerOptions), - createServer(otelOptions), - createServer(fullChainOptions), + const serverOptions: CreateServerOptions[] = [ + baselineOptions, + validationOptions, + loggerOptions, + otelOptions, + fullChainOptions, ]; + if (otelExportEnabled) { + serverOptions.push(otelExportOptions); + } + + const servers: Server[] = serverOptions.map((opts) => createServer(opts)); + // start() is async - start all servers in parallel await Promise.all(servers.map((server) => server.start())); @@ -153,13 +224,19 @@ try { console.log("8081 | Baseline (no interceptors)"); console.log("8082 | Validation only"); console.log("8083 | Logger only"); - console.log("8084 | OTel (tracing + metrics) only"); - console.log("8080 | Full chain (all interceptors)"); + console.log("8084 | OTel (tracing + metrics) only (no-op exporter)"); + console.log("8080 | Full chain (all interceptors, no-op exporter)"); + if (otelExportEnabled) { + console.log("8085 | OTel export — full chain + real OTLP exporter"); + } console.log("\nReady for k6 benchmarks!"); console.log("\nRun benchmarks with:"); console.log(" k6 run k6/basic-load.js"); console.log(" k6 run k6/interceptor-overhead.js"); + if (otelExportEnabled) { + console.log(" k6 run k6/otel-export-overhead.js"); + } console.log("\nPress Ctrl+C to shutdown all servers\n"); @@ -172,6 +249,11 @@ try { await Promise.all(servers.map((server) => server.stop())); + if (otelExportEnabled) { + console.log("Flushing OTel provider..."); + await shutdownProvider(); + } + console.log("All servers stopped"); process.exit(0); }; From e5de9e6b633974541445604986c1e38ac8e18487 Mon Sep 17 00:00:00 2001 From: intech Date: Wed, 17 Jun 2026 13:48:50 +0400 Subject: [PATCH 2/2] fix(performance-test-server): address CodeRabbit review on otel-export 
scenario MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply all four CodeRabbit findings on the OTLP-export benchmark: 1. Profile command: name `server otel-collector k6-otel-export` explicitly so the profile-less `k6-interceptor-overhead` does not auto-start and contaminate the export measurement with concurrent CPU load. 2. Drop the misleading "throughput delta" claim from the otel-export scenario (README, docker-compose comment, k6 doc-comment + setup log). The paired loop hits both configs once per iteration, so request counts are identical by construction — only the latency delta is a real deliverable. Legitimate throughput references (basic-load SLA, backend-write disclaimer) are kept. 3. Document the 8085 readiness gate: the server healthcheck probes 8080 only, but all ports bind in one Promise.all and the k6 setup() health-checks 8085 and aborts if not serving, so no silent partial-measurement race. 4. Remove the false-positive collector healthcheck (`otelcol-contrib components` exits 0 without opening a socket) and switch k6-otel-export to `service_started`. The contrib image is scratch-based (no shell/HTTP client) so readiness cannot be probed; correctness instead relies on BatchSpanProcessor buffering/retry over the ~5-minute steady-state run. Honest comments added; no "readiness fixed" overclaim. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- performance-test-server/README.md | 10 ++++-- performance-test-server/docker-compose.yml | 33 ++++++++++++++----- .../k6/otel-export-overhead.js | 3 +- .../otel-collector-config.yaml | 4 +++ 4 files changed, 37 insertions(+), 13 deletions(-) diff --git a/performance-test-server/README.md b/performance-test-server/README.md index 4d1f3f8..142ef58 100644 --- a/performance-test-server/README.md +++ b/performance-test-server/README.md @@ -97,13 +97,19 @@ docker compose --profile load up k6-basic-load --build --abort-on-container-exit ### OTel OTLP Export Overhead -Measures the p50/p95/p99 latency delta and throughput delta between the baseline (port 8081) and the full-chain-with-real-OTLP-exporter configuration (port 8085). Runs for ~5 minutes at 100 VUs: +Measures the p50/p95/p99 latency delta between the baseline (port 8081) and the full-chain-with-real-OTLP-exporter configuration (port 8085). Runs for ~5 minutes at 100 VUs: ```bash OTEL_EXPORT_ENABLED=1 docker compose --profile otel-export up \ - --build --abort-on-container-exit + server otel-collector k6-otel-export --build --abort-on-container-exit ``` +Naming the three services explicitly is deliberate: `k6-interceptor-overhead` +has no profile, so a bare `docker compose --profile otel-export up` would start +it too and run the interceptor benchmark concurrently, stealing CPU from and +contaminating the OTLP-export measurement. Listing only the services this +scenario needs keeps the run isolated. 
+ What this measures that the `k6-interceptor-overhead` scenario does *not*: - Real `BatchSpanProcessor` + `@opentelemetry/otlp-transformer` serialization cost per exported span diff --git a/performance-test-server/docker-compose.yml b/performance-test-server/docker-compose.yml index 20bc297..eca97f8 100644 --- a/performance-test-server/docker-compose.yml +++ b/performance-test-server/docker-compose.yml @@ -43,12 +43,16 @@ services: volumes: - ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro command: ["--config=/etc/otelcol-contrib/config.yaml"] - healthcheck: - test: ["CMD", "/otelcol-contrib", "components"] - interval: 5s - timeout: 3s - retries: 5 - start_period: 5s + # No healthcheck: the contrib image is built FROM scratch — it has no shell, + # curl, or wget — so the OTLP listeners (:4317/:4318) and the health_check + # extension (:13133) cannot be probed from inside the container, and the only + # in-image binary (`otelcol-contrib components`) exits 0 without ever opening + # a socket, which would be a false "healthy" signal. We do NOT claim to prove + # collector readiness here. Instead, k6-otel-export gates on `service_started` + # only, and correctness does not depend on the collector being up at t=0: + # the BatchSpanProcessor buffers and retries exports, and over the ~5-minute + # steady-state run any sub-second collector startup gap is negligible relative + # to total spans exported. k6-interceptor-overhead: image: grafana/k6:latest @@ -75,9 +79,16 @@ services: # ========================================================================= # OTel OTLP export overhead scenario (profile: otel-export) # ========================================================================= - # Measures p50/p95/p99 latency delta and throughput delta between the - # baseline (port 8081) and full-chain-with-real-OTLP-exporter (port 8085). 
+ # Measures the p50/p95/p99 latency delta between the baseline (port 8081) + # and full-chain-with-real-OTLP-exporter (port 8085). # Requires OTEL_EXPORT_ENABLED=1 on the server and a running otel-collector. + # + # Readiness note: the server healthcheck below probes port 8080 only. All + # ports (incl. 8085) are bound in a single Promise.all in src/index.ts, so + # 8080 being healthy means 8085 is almost certainly up too — and the k6 + # script closes the remaining gap deterministically: its setup() health-checks + # 8081 AND 8085 and aborts the run if 8085 is not yet serving. So a brief + # bind-order race cannot produce silent, partially-measured results. k6-otel-export: image: grafana/k6:latest volumes: @@ -92,6 +103,10 @@ services: K6_OUT: "json=/results/otel-export-overhead.json" depends_on: server: { condition: service_healthy } - otel-collector: { condition: service_healthy } + # service_started, not service_healthy: the scratch-based collector image + # cannot be health-probed (see the otel-collector comment above). The + # BatchSpanProcessor tolerates a not-yet-ready collector via buffering and + # retries, so a started container is a sufficient precondition here. + otel-collector: { condition: service_started } command: run /scripts/otel-export-overhead.js profiles: ["otel-export"] diff --git a/performance-test-server/k6/otel-export-overhead.js b/performance-test-server/k6/otel-export-overhead.js index 1c13f72..3548e8a 100644 --- a/performance-test-server/k6/otel-export-overhead.js +++ b/performance-test-server/k6/otel-export-overhead.js @@ -22,7 +22,6 @@ * * Output: * p50/p95/p99 latency per config - * Throughput (requests/sec) per config * Export-overhead delta printed in teardown * JSON summary written to /results/otel-export-overhead.json when run via * docker compose (K6_OUT env var). @@ -174,7 +173,7 @@ export function setup() { console.log("\n Configurations to test:"); console.log(` 1. 
Baseline (no interceptors, no OTel) - :${BASELINE_PORT}`); console.log(` 2. OTel export (full chain + real OTLP/gRPC) - :${OTEL_EXPORT_PORT}`); - console.log("\n Goal: measure p50/p95/p99 latency delta and throughput delta"); + console.log("\n Goal: measure the p50/p95/p99 latency delta"); console.log(" caused by the stock @connectum/otel export path."); const ports = [ diff --git a/performance-test-server/otel-collector-config.yaml b/performance-test-server/otel-collector-config.yaml index df3e855..eb89161 100644 --- a/performance-test-server/otel-collector-config.yaml +++ b/performance-test-server/otel-collector-config.yaml @@ -31,6 +31,10 @@ exporters: sampling_initial: 5 sampling_thereafter: 1000 +# The health_check extension exposes readiness on :13133 for anyone running +# this collector outside the benchmark. It is intentionally NOT wired to a +# Docker healthcheck: the contrib image is built FROM scratch and has no shell +# or HTTP client to probe the endpoint with (see docker-compose.yml). extensions: health_check: endpoint: 0.0.0.0:13133