From 6e20a6fc400fe3522711bb82badbdf43cc7a2c96 Mon Sep 17 00:00:00 2001
From: Drew Stone
Date: Fri, 29 May 2026 07:32:53 -0600
Subject: [PATCH] =?UTF-8?q?feat(otel-export):=20exportEvalRuns=20=E2=80=94?=
 =?UTF-8?q?ship=20self-improvement=20provenance=20to=20Tangle=20Intelli?=
 =?UTF-8?q?gence=20(0.31.0)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a reusable client for Intelligence's first-class self-improvement
record (POST /v1/ingest/eval-runs, 'Mode D'), alongside the existing OTLP
span exporter.

A consumer's RSI loop emits one EvalRunEvent per proposal generation
(surfaceHash = proposed-change identity, surface = arbitrary provenance,
labels.measured flags it unmeasured); a later gate-decided event re-emits
the same runId (idempotent upsert) with a real gateDecision + holdoutLift,
so proposal→verdict is one diffable record.

Unlike the best-effort span exporter, exportEvalRuns RESOLVES with the
ingest verdict (accepted/rejected per event) so a loop can assert its
provenance landed. Reads TANGLE_API_KEY + INTELLIGENCE_BASE from env;
tenant resolved server-side from the key. Wire version +
X-Tangle-Wire-Version header handled.

+4 tests (missing-key throw, payload/header shape, 400 rejection
passthrough, empty no-op).

Makes Intelligence the de-facto provenance store for any agent-runtime
consumer's self-improvement loop, not just one benchmark.
---
 package.json              |   2 +-
 src/index.ts              |   8 ++-
 src/otel-export.ts        | 105 ++++++++++++++++++++++++++++++++++++++
 tests/otel-export.test.ts |  67 +++++++++++++++++++++++-
 4 files changed, 179 insertions(+), 3 deletions(-)

diff --git a/package.json b/package.json
index 2e37101..a6c2aca 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.30.1",
+  "version": "0.31.0",
   "description": "Reusable runtime lifecycle for domain-specific agents.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {
diff --git a/src/index.ts b/src/index.ts
index ee034d4..79845b1 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -123,14 +123,20 @@ export {
   validateChatModelId,
 } from './model-resolution'
 export type {
+  EvalRunEvent,
+  EvalRunGeneration,
+  EvalRunsExportConfig,
+  EvalRunsExportResult,
   OtelAttribute,
   OtelExportConfig,
   OtelExporter,
   OtelSpan,
 } from './otel-export'
-// ── OTEL export + trace propagation ──────────────────────────────────
+// ── OTEL export + trace propagation + eval-run provenance ────────────
 export {
   createOtelExporter,
+  exportEvalRuns,
+  INTELLIGENCE_WIRE_VERSION,
   loopEventToOtelSpan,
 } from './otel-export'
 // ── Readiness ─────────────────────────────────────────────────────────
diff --git a/src/otel-export.ts b/src/otel-export.ts
index 370502d..f0a1e54 100644
--- a/src/otel-export.ts
+++ b/src/otel-export.ts
@@ -233,3 +233,108 @@ function generateSpanId(): string {
     .map((b) => b.toString(16).padStart(2, '0'))
     .join('')
 }
+
+// ─── Eval-run ingest (self-improvement provenance) ───────────────────────────
+//
+// Tangle Intelligence has a first-class, non-trace record for self-improvement
+// runs: POST /v1/ingest/eval-runs ("Mode D"). Each generation carries a
+// `surfaceHash` (the proposed-change identity) + arbitrary `surface` provenance;
+// a later `gate-decided` event re-emits the same `runId` (idempotent upsert) with
+// a real `gateDecision` + `holdoutLift`, so proposal→verdict is one diffable
+// record. This is how a consumer's RSI loop records WHAT it changed, WHY, from
+// which evidence — the audit trail behind agentic self-improvement.
+
+/** Wire version the eval-runs ingest enforces (X-Tangle-Wire-Version + body). */
+export const INTELLIGENCE_WIRE_VERSION = '2026-05-26.v1'
+
+export interface EvalRunGeneration {
+  /** 0-based ordinal of this generation within the run (required by ingest). */
+  index: number
+  /** Identity of the proposed surface change (content-addressed hash). */
+  surfaceHash: string
+  /** Arbitrary provenance for this generation (rationale, evidence, source). */
+  surface?: unknown
+  /** Per-scenario results; empty until the generation is measured. */
+  cells?: unknown[]
+  /** Mean composite score (0 when unmeasured — pair with labels.measured). */
+  compositeMean: number
+  costUsd: number
+  durationMs: number
+}
+
+export interface EvalRunEvent {
+  runId: string
+  runDir: string
+  /** ISO timestamp. */
+  timestamp: string
+  status: 'started' | 'baseline-complete' | 'generation-complete' | 'gate-decided' | 'finished' | 'errored'
+  labels?: Record<string, string>
+  baseline?: EvalRunGeneration
+  generations?: EvalRunGeneration[]
+  gateDecision?: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'
+  holdoutLift?: number
+  totalCostUsd: number
+  totalDurationMs: number
+  errorMessage?: string
+}
+
+export interface EvalRunsExportConfig {
+  /** Bearer key — tenant is resolved server-side from it. Reads TANGLE_API_KEY. */
+  apiKey?: string
+  /** Intelligence base. Reads INTELLIGENCE_BASE env, else prod. */
+  base?: string
+  /** Idempotency-Key header (e.g. the runId) — safe retries + upsert. */
+  idempotencyKey?: string
+}
+
+export interface EvalRunsExportResult {
+  ok: boolean
+  status: number
+  accepted: number
+  rejected: Array<{ index: number; reason: string }>
+}
+
+const DEFAULT_INTELLIGENCE_BASE = 'https://intelligence.tangle.tools'
+
+/**
+ * Ship self-improvement eval-run events to Tangle Intelligence. Unlike the
+ * best-effort span exporter, this RESOLVES with the ingest verdict (accepted /
+ * rejected per event) so a consumer's loop can assert its provenance landed.
+ * Throws only on a missing key or network failure; a malformed body never throws.
+ */
+export async function exportEvalRuns(
+  events: EvalRunEvent[],
+  config?: EvalRunsExportConfig,
+): Promise<EvalRunsExportResult> {
+  if (events.length === 0) return { ok: true, status: 0, accepted: 0, rejected: [] }
+  const apiKey =
+    config?.apiKey ?? (typeof process !== 'undefined' ? process.env.TANGLE_API_KEY : undefined)
+  if (!apiKey) throw new Error('exportEvalRuns: apiKey required (pass config.apiKey or set TANGLE_API_KEY)')
+  const base =
+    config?.base ??
+    (typeof process !== 'undefined' ? process.env.INTELLIGENCE_BASE : undefined) ??
+    DEFAULT_INTELLIGENCE_BASE
+  const url = `${base.replace(/\/+$/, '')}/v1/ingest/eval-runs`
+  const res = await fetch(url, {
+    method: 'POST',
+    headers: {
+      'content-type': 'application/json',
+      authorization: `Bearer ${apiKey}`,
+      'X-Tangle-Wire-Version': INTELLIGENCE_WIRE_VERSION,
+      ...(config?.idempotencyKey ? { 'Idempotency-Key': config.idempotencyKey } : {}),
+    },
+    body: JSON.stringify({ wireVersion: INTELLIGENCE_WIRE_VERSION, events }),
+  })
+  let parsed: { accepted?: number; rejected?: Array<{ index: number; reason: string }> } = {}
+  try {
+    parsed = ((await res.json()) ?? {}) as typeof parsed // a JSON `null` body → {}
+  } catch {
+    // non-JSON body (e.g. 5xx HTML) — leave parsed empty
+  }
+  return {
+    ok: res.ok,
+    status: res.status,
+    accepted: typeof parsed.accepted === 'number' ? parsed.accepted : res.ok ? events.length : 0,
+    rejected: Array.isArray(parsed.rejected) ? parsed.rejected : [],
+  }
+}
diff --git a/tests/otel-export.test.ts b/tests/otel-export.test.ts
index 9e1d2b7..6e2a08a 100644
--- a/tests/otel-export.test.ts
+++ b/tests/otel-export.test.ts
@@ -1,5 +1,10 @@
 import { afterEach, describe, expect, it, vi } from 'vitest'
-import { createOtelExporter, loopEventToOtelSpan } from '../src/otel-export'
+import {
+  createOtelExporter,
+  exportEvalRuns,
+  INTELLIGENCE_WIRE_VERSION,
+  loopEventToOtelSpan,
+} from '../src/otel-export'
 
 describe('otel-export', () => {
   afterEach(() => {
@@ -166,3 +171,63 @@ describe('otel-export', () => {
     expect(attrMap['loop.agentRunName']).toEqual({ stringValue: 'coder' })
   })
 })
+
+describe('exportEvalRuns (Intelligence self-improvement provenance)', () => {
+  afterEach(() => {
+    delete process.env.TANGLE_API_KEY
+    delete process.env.INTELLIGENCE_BASE
+    vi.unstubAllGlobals()
+  })
+
+  const event = {
+    runId: 'rsi-1',
+    runDir: 'rsi/fhenix/abc',
+    timestamp: '2026-05-29T00:00:00.000Z',
+    status: 'generation-complete' as const,
+    labels: { stage: 'proposed', measured: 'false' },
+    generations: [
+      { index: 0, surfaceHash: 'h1', surface: { surfaceId: 'completeness-audit' }, cells: [], compositeMean: 0, costUsd: 0, durationMs: 0 },
+    ],
+    totalCostUsd: 0,
+    totalDurationMs: 0,
+  }
+
+  it('throws without an api key', async () => {
+    await expect(exportEvalRuns([event])).rejects.toThrow(/apiKey required/)
+  })
+
+  it('POSTs the wire-versioned envelope to /v1/ingest/eval-runs with bearer + version header', async () => {
+    let captured: { url: string; init: any } | undefined
+    const mockFetch = vi.fn(async (url: string, init: any) => {
+      captured = { url, init }
+      return new Response(JSON.stringify({ accepted: 1, rejected: [] }), { status: 200 })
+    })
+    vi.stubGlobal('fetch', mockFetch)
+    const r = await exportEvalRuns([event], { apiKey: 'sk-tan-test', base: 'https://intel.example', idempotencyKey: 'rsi-1' })
+    expect(r.ok).toBe(true)
+    expect(r.accepted).toBe(1)
+    expect(captured!.url).toBe('https://intel.example/v1/ingest/eval-runs')
+    expect(captured!.init.headers.authorization).toBe('Bearer sk-tan-test')
+    expect(captured!.init.headers['X-Tangle-Wire-Version']).toBe(INTELLIGENCE_WIRE_VERSION)
+    expect(captured!.init.headers['Idempotency-Key']).toBe('rsi-1')
+    const body = JSON.parse(captured!.init.body)
+    expect(body.wireVersion).toBe(INTELLIGENCE_WIRE_VERSION)
+    expect(body.events[0].generations[0].index).toBe(0)
+  })
+
+  it('surfaces per-event rejections from a 400 (does not throw)', async () => {
+    const mockFetch = vi.fn(async () =>
+      new Response(JSON.stringify({ accepted: 0, rejected: [{ index: 0, reason: 'bad' }] }), { status: 400 }),
+    )
+    vi.stubGlobal('fetch', mockFetch)
+    const r = await exportEvalRuns([event], { apiKey: 'k' })
+    expect(r.ok).toBe(false)
+    expect(r.status).toBe(400)
+    expect(r.rejected[0]?.reason).toBe('bad')
+  })
+
+  it('no-ops on empty events', async () => {
+    const r = await exportEvalRuns([], { apiKey: 'k' })
+    expect(r).toEqual({ ok: true, status: 0, accepted: 0, rejected: [] })
+  })
+})