From 6e20a6fc400fe3522711bb82badbdf43cc7a2c96 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Fri, 29 May 2026 07:32:53 -0600
Subject: [PATCH] =?UTF-8?q?feat(otel-export):=20exportEvalRuns=20=E2=80=94?=
 =?UTF-8?q?=20ship=20self-improvement=20provenance=20to=20Tangle=20Intelli?=
 =?UTF-8?q?gence=20(0.31.0)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a reusable client for Intelligence's first-class self-improvement record
(POST /v1/ingest/eval-runs, 'Mode D'), alongside the existing OTLP span exporter.
A consumer's RSI loop emits one EvalRunEvent per proposal generation (surfaceHash =
proposed-change identity, surface = arbitrary provenance, labels.measured flags it
unmeasured); a later gate-decided event re-emits the same runId (idempotent upsert)
with a real gateDecision + holdoutLift, so proposal→verdict is one diffable record.

Unlike the best-effort span exporter, exportEvalRuns RESOLVES with the ingest verdict
(accepted/rejected per event) so a loop can assert its provenance landed. Reads
TANGLE_API_KEY + INTELLIGENCE_BASE from env; tenant resolved server-side from the key.
Wire version + X-Tangle-Wire-Version header handled. +4 tests (missing-key throw,
payload/header shape, 400 rejection passthrough, empty no-op). Makes Intelligence the
de-facto provenance store for any agent-runtime consumer's self-improvement loop, not
just one benchmark.
---
 package.json              |   2 +-
 src/index.ts              |   8 ++-
 src/otel-export.ts        | 105 ++++++++++++++++++++++++++++++++++++++
 tests/otel-export.test.ts |  67 +++++++++++++++++++++++-
 4 files changed, 179 insertions(+), 3 deletions(-)

diff --git a/package.json b/package.json
index 2e37101..a6c2aca 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.30.1",
+  "version": "0.31.0",
   "description": "Reusable runtime lifecycle for domain-specific agents.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {
diff --git a/src/index.ts b/src/index.ts
index ee034d4..79845b1 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -123,14 +123,20 @@ export {
   validateChatModelId,
 } from './model-resolution'
 export type {
+  EvalRunEvent,
+  EvalRunGeneration,
+  EvalRunsExportConfig,
+  EvalRunsExportResult,
   OtelAttribute,
   OtelExportConfig,
   OtelExporter,
   OtelSpan,
 } from './otel-export'
-// ── OTEL export + trace propagation ──────────────────────────────────
+// ── OTEL export + trace propagation + eval-run provenance ────────────
 export {
   createOtelExporter,
+  exportEvalRuns,
+  INTELLIGENCE_WIRE_VERSION,
   loopEventToOtelSpan,
 } from './otel-export'
 // ── Readiness ─────────────────────────────────────────────────────────
diff --git a/src/otel-export.ts b/src/otel-export.ts
index 370502d..f0a1e54 100644
--- a/src/otel-export.ts
+++ b/src/otel-export.ts
@@ -233,3 +233,108 @@ function generateSpanId(): string {
     .map((b) => b.toString(16).padStart(2, '0'))
     .join('')
 }
+
+// ─── Eval-run ingest (self-improvement provenance) ───────────────────────────
+//
+// Tangle Intelligence has a first-class, non-trace record for self-improvement
+// runs: POST /v1/ingest/eval-runs ("Mode D"). Each generation carries a
+// `surfaceHash` (the proposed-change identity) + arbitrary `surface` provenance;
+// a later `gate-decided` event re-emits the same `runId` (idempotent upsert) with
+// a real `gateDecision` + `holdoutLift`, so proposal→verdict is one diffable
+// record. This is how a consumer's RSI loop records WHAT it changed, WHY, from
+// which evidence — the audit trail behind agentic self-improvement.
+
+/** Wire version the eval-runs ingest enforces (X-Tangle-Wire-Version + body). */
+export const INTELLIGENCE_WIRE_VERSION = '2026-05-26.v1'
+
+export interface EvalRunGeneration {
+  /** 0-based ordinal of this generation within the run (required by ingest). */
+  index: number
+  /** Identity of the proposed surface change (content-addressed hash). */
+  surfaceHash: string
+  /** Arbitrary provenance for this generation (rationale, evidence, source). */
+  surface?: unknown
+  /** Per-scenario results; empty until the generation is measured. */
+  cells?: unknown[]
+  /** Mean composite score (0 when unmeasured — pair with labels.measured). */
+  compositeMean: number
+  costUsd: number // NOTE(review): presumably the spend attributed to this generation, in USD — confirm
+  durationMs: number // NOTE(review): presumably wall-clock time for this generation, in ms — confirm
+}
+
+export interface EvalRunEvent {
+  runId: string // run identity; re-emitting the same runId is an idempotent upsert of the record
+  runDir: string // NOTE(review): looks like a path-style locator for the run's artifacts — confirm
+  /** ISO timestamp. */
+  timestamp: string
+  status: 'started' | 'baseline-complete' | 'generation-complete' | 'gate-decided' | 'finished' | 'errored'
+  labels?: Record<string, string> // free-form string tags, e.g. `measured: 'false'` for an unmeasured proposal
+  baseline?: EvalRunGeneration // NOTE(review): presumably the pre-change reference generation — confirm
+  generations?: EvalRunGeneration[] // proposal generations; each carries its own surfaceHash + surface
+  gateDecision?: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling' // set on the gate-decided re-emit
+  holdoutLift?: number // reported together with gateDecision on the gate-decided re-emit
+  totalCostUsd: number // NOTE(review): presumably run-wide spend in USD — confirm
+  totalDurationMs: number // NOTE(review): presumably run-wide duration in ms — confirm
+  errorMessage?: string // NOTE(review): presumably populated when status is 'errored' — confirm
+}
+
+export interface EvalRunsExportConfig {
+  /** Bearer key — tenant is resolved server-side from it. Falls back to the TANGLE_API_KEY env var. */
+  apiKey?: string
+  /** Intelligence base URL (trailing slashes are stripped). Falls back to INTELLIGENCE_BASE env, else prod. */
+  base?: string
+  /** Idempotency-Key header (e.g. the runId) — safe retries + upsert. Header is omitted when unset or empty. */
+  idempotencyKey?: string
+}
+
+export interface EvalRunsExportResult { // outcome that exportEvalRuns resolves with
+  ok: boolean // the ingest response's `ok` flag; true for the empty-input no-op
+  status: number // HTTP status of the ingest response; 0 when no request was sent (empty input)
+  accepted: number // `accepted` from the response body; else events.length on an ok response, 0 otherwise
+  rejected: Array<{ index: number; reason: string }> // `rejected` from the response body; else []
+}
+
+const DEFAULT_INTELLIGENCE_BASE = 'https://intelligence.tangle.tools'
+
+/**
+ * Ship self-improvement eval-run events to Tangle Intelligence. Unlike the
+ * best-effort span exporter, this RESOLVES with the ingest verdict (accepted /
+ * rejected per event) so a consumer's loop can assert its provenance landed.
+ * Empty input is a status-0 no-op; non-2xx and unparseable bodies resolve too.
+ * @throws Error when no API key is configured; `fetch` rejections propagate.
+ */
+export async function exportEvalRuns(
+  events: EvalRunEvent[],
+  config?: EvalRunsExportConfig,
+): Promise<EvalRunsExportResult> {
+  if (events.length === 0) return { ok: true, status: 0, accepted: 0, rejected: [] }
+  const env = typeof process !== 'undefined' ? process.env : undefined
+  const apiKey = config?.apiKey ?? env?.TANGLE_API_KEY
+  if (!apiKey) throw new Error('exportEvalRuns: apiKey required (pass config.apiKey or set TANGLE_API_KEY)')
+  const base = config?.base ?? env?.INTELLIGENCE_BASE ?? DEFAULT_INTELLIGENCE_BASE
+  const url = `${base.replace(/\/+$/, '')}/v1/ingest/eval-runs` // tolerate trailing "/" on base
+  const res = await fetch(url, {
+    method: 'POST',
+    headers: {
+      'content-type': 'application/json',
+      authorization: `Bearer ${apiKey}`,
+      'X-Tangle-Wire-Version': INTELLIGENCE_WIRE_VERSION,
+      ...(config?.idempotencyKey ? { 'Idempotency-Key': config.idempotencyKey } : {}),
+    },
+    body: JSON.stringify({ wireVersion: INTELLIGENCE_WIRE_VERSION, events }),
+  })
+  let body: { accepted?: unknown; rejected?: unknown } | null | undefined
+  try {
+    body = (await res.json()) as typeof body
+  } catch {
+    // non-JSON body (e.g. 5xx HTML) — body stays undefined, fallbacks apply
+  }
+  const accepted = body?.accepted // `?.` also guards a JSON `null`/primitive body
+  const rejected = body?.rejected
+  return {
+    ok: res.ok,
+    status: res.status,
+    accepted: typeof accepted === 'number' ? accepted : res.ok ? events.length : 0,
+    rejected: Array.isArray(rejected) ? (rejected as EvalRunsExportResult['rejected']) : [],
+  }
+}
diff --git a/tests/otel-export.test.ts b/tests/otel-export.test.ts
index 9e1d2b7..6e2a08a 100644
--- a/tests/otel-export.test.ts
+++ b/tests/otel-export.test.ts
@@ -1,5 +1,10 @@
 import { afterEach, describe, expect, it, vi } from 'vitest'
-import { createOtelExporter, loopEventToOtelSpan } from '../src/otel-export'
+import {
+  createOtelExporter,
+  exportEvalRuns,
+  INTELLIGENCE_WIRE_VERSION,
+  loopEventToOtelSpan,
+} from '../src/otel-export'
 
 describe('otel-export', () => {
   afterEach(() => {
@@ -166,3 +171,63 @@ describe('otel-export', () => {
     expect(attrMap['loop.agentRunName']).toEqual({ stringValue: 'coder' })
   })
 })
+
+describe('exportEvalRuns (Intelligence self-improvement provenance)', () => {
+  afterEach(() => {
+    delete process.env.TANGLE_API_KEY
+    delete process.env.INTELLIGENCE_BASE
+    vi.unstubAllGlobals()
+  })
+
+  const event = {
+    runId: 'rsi-1',
+    runDir: 'rsi/fhenix/abc',
+    timestamp: '2026-05-29T00:00:00.000Z',
+    status: 'generation-complete' as const,
+    labels: { stage: 'proposed', measured: 'false' },
+    generations: [
+      { index: 0, surfaceHash: 'h1', surface: { surfaceId: 'completeness-audit' }, cells: [], compositeMean: 0, costUsd: 0, durationMs: 0 },
+    ],
+    totalCostUsd: 0,
+    totalDurationMs: 0,
+  }
+
+  it('throws without an api key', async () => {
+    await expect(exportEvalRuns([event])).rejects.toThrow(/apiKey required/)
+  })
+
+  it('POSTs the wire-versioned envelope to /v1/ingest/eval-runs with bearer + version header', async () => {
+    let captured: { url: string; init: any } | undefined
+    const mockFetch = vi.fn(async (url: string, init: any) => {
+      captured = { url, init }
+      return new Response(JSON.stringify({ accepted: 1, rejected: [] }), { status: 200 })
+    })
+    vi.stubGlobal('fetch', mockFetch)
+    const r = await exportEvalRuns([event], { apiKey: 'sk-tan-test', base: 'https://intel.example', idempotencyKey: 'rsi-1' })
+    expect(r.ok).toBe(true)
+    expect(r.accepted).toBe(1)
+    expect(captured!.url).toBe('https://intel.example/v1/ingest/eval-runs')
+    expect(captured!.init.headers.authorization).toBe('Bearer sk-tan-test')
+    expect(captured!.init.headers['X-Tangle-Wire-Version']).toBe(INTELLIGENCE_WIRE_VERSION)
+    expect(captured!.init.headers['Idempotency-Key']).toBe('rsi-1')
+    const body = JSON.parse(captured!.init.body)
+    expect(body.wireVersion).toBe(INTELLIGENCE_WIRE_VERSION)
+    expect(body.events[0].generations[0].index).toBe(0)
+  })
+
+  it('surfaces per-event rejections from a 400 (does not throw)', async () => {
+    const mockFetch = vi.fn(async () =>
+      new Response(JSON.stringify({ accepted: 0, rejected: [{ index: 0, reason: 'bad' }] }), { status: 400 }),
+    )
+    vi.stubGlobal('fetch', mockFetch)
+    const r = await exportEvalRuns([event], { apiKey: 'k' })
+    expect(r.ok).toBe(false)
+    expect(r.status).toBe(400)
+    expect(r.rejected[0]?.reason).toBe('bad')
+  })
+
+  it('no-ops on empty events', async () => {
+    const r = await exportEvalRuns([], { apiKey: 'k' })
+    expect(r).toEqual({ ok: true, status: 0, accepted: 0, rejected: [] })
+  })
+})