tangle-network · tangletools · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/package.json b/package.json
@@ -76,7 +76,7 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
-    "@tangle-network/agent-eval": "^0.52.0"
+    "@tangle-network/agent-eval": "^0.54.0"
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.0",

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/src/index.ts b/src/index.ts
@@ -136,7 +136,7 @@ export {
 // ── Readiness ─────────────────────────────────────────────────────────
 export { decideKnowledgeReadiness } from './readiness'
 // ── Run loop ─────────────────────────────────────────────────────────
-export { runAgentTask, runAgentTaskStream } from './run'
+export { applyRunRecordDefaults, runAgentTask, runAgentTaskStream } from './run'
 // ── Production run lifecycle ─────────────────────────────────────────
 export type {
   RuntimeRunHandle,

diff --git a/src/run.ts b/src/run.ts
@@ -19,13 +19,46 @@ import {
   type ControlEvalResult,
   type ControlRunResult,
   type DataAcquisitionPlan,
+  FAILURE_CLASSES,
+  type FailureClass,
   type KnowledgeReadinessReport,
+  type RunRecord,
   runAgentControlLoop,
   scoreKnowledgeReadiness,
   type UserQuestion,
   userQuestionsForKnowledgeGaps,
 } from '@tangle-network/agent-eval'
 
+const FAILURE_CLASS_SET = new Set<string>(FAILURE_CLASSES)
+
+/** Returns `value` narrowed to `FailureClass` when it is one of `FAILURE_CLASSES`,
+ *  else `undefined` (also for empty/missing input) — so only real taxonomy tags are
+ *  promoted to `RunRecord.failureClass`; novel strings stay as `failureMode` detail. */
+function asFailureClass(value: string | undefined): FailureClass | undefined {
+  return value && FAILURE_CLASS_SET.has(value) ? (value as FailureClass) : undefined
+}
+
+/** Returns a new array of adapter-projected RunRecords with cross-cutting
+ *  defaults filled in; inputs are never mutated and adapter-set values win:
+ *   - `scenarioId` — the run's scenario, when the record omits one.
+ *   - `failureClass` — the control layer's failure classification promoted
+ *     onto the canonical cross-agent key, but ONLY when it's a real taxonomy
+ *     class. This is what lets the substrate aggregate failures across every
+ *     agent in one vocabulary instead of per-agent ad-hoc strings. */
+export function applyRunRecordDefaults(
+  records: RunRecord[],
+  scenarioId: string,
+  controlFailureClass: string | undefined,
+): RunRecord[] {
+  const fc = asFailureClass(controlFailureClass)
+  return records.map((record) => {
+    let r = record // passed through by reference unless a default below applies
+    if (r.scenarioId === undefined) r = { ...r, scenarioId }
+    if (r.failureClass === undefined && fc) r = { ...r, failureClass: fc }
+    return r
+  })
+}
+
 import { normalizeBackendStreamEvent } from './backends'
 import { BackendTransportError, SessionMismatchError } from './errors'
 import { decideKnowledgeReadiness } from './readiness'
@@ -152,8 +185,10 @@ export async function runAgentTask<
     userAnswers: preflight.userAnswers,
     acquiredEvidenceIds: preflight.acquiredEvidenceIds,
     control,
-    runRecords: (options.adapter.projectRunRecords?.(control, task) ?? []).map((record) =>
-      record.scenarioId === undefined ? { ...record, scenarioId } : record,
+    runRecords: applyRunRecordDefaults(
+      options.adapter.projectRunRecords?.(control, task) ?? [],
+      scenarioId,
+      control.failureClass,
     ),
   }
 }

diff --git a/tests/mcp/in-process-executor.test.ts b/tests/mcp/in-process-executor.test.ts
@@ -113,7 +113,7 @@ describe('createInProcessExecutor', () => {
       const box = await exec.client.create()
       for await (const _ of (
         box as unknown as { streamPrompt: (m: string) => AsyncGenerator<unknown> }
-      ).streamPrompt('task ' + i)) {
+      ).streamPrompt(`task ${i}`)) {
         // drain
       }
     }

diff --git a/tests/runtime.test.ts b/tests/runtime.test.ts
@@ -1,9 +1,11 @@
+import type { RunRecord } from '@tangle-network/agent-eval'
 import { describe, expect, it } from 'vitest'
 import {
   type AgentAdapter,
   type AgentBackendInput,
   type AgentExecutionBackend,
   type AgentTaskSpec,
+  applyRunRecordDefaults,
   type ControlEvalResult,
   createIterableBackend,
   createOpenAICompatibleBackend,
@@ -985,3 +987,53 @@ async function collect(iterable: AsyncIterable<RuntimeStreamEvent>): Promise<Run
   for await (const event of iterable) events.push(event)
   return events
 }
+
+describe('applyRunRecordDefaults — canonical failureClass propagation', () => {
+  function rec(overrides: Partial<RunRecord> = {}): RunRecord {
+    return {
+      runId: 'run-1',
+      experimentId: 'exp-1',
+      candidateId: 'cand-1',
+      seed: 0,
+      model: 'm@v',
+      promptHash: 'sha256:p',
+      configHash: 'sha256:c',
+      commitSha: 'abc',
+      wallMs: 10,
+      costUsd: 0,
+      tokenUsage: { input: 0, output: 0 },
+      outcome: { holdoutScore: 0, raw: {} },
+      splitTag: 'holdout',
+      ...overrides, // spread last so caller-supplied fields replace the baseline
+    }
+  }
+
+  it('promotes a canonical control failureClass onto records that lack one', () => {
+    const out = applyRunRecordDefaults([rec()], 'scn-1', 'tool_recovery_failure')
+    expect(out[0]?.failureClass).toBe('tool_recovery_failure')
+    expect(out[0]?.scenarioId).toBe('scn-1')
+  })
+
+  it('never overrides a failureClass the adapter set explicitly', () => {
+    const out = applyRunRecordDefaults(
+      [rec({ failureClass: 'hallucination', scenarioId: 'kept' })],
+      'scn-1',
+      'tool_recovery_failure',
+    )
+    expect(out[0]?.failureClass).toBe('hallucination')
+    expect(out[0]?.scenarioId).toBe('kept')
+  })
+
+  it('does NOT promote a non-taxonomy control string (stays unclassified)', () => {
+    // agent-builder's ad-hoc "forge_build_unsatisfied" is not a FailureClass —
+    // it must not pollute the canonical cross-agent key.
+    const out = applyRunRecordDefaults([rec()], 'scn-1', 'forge_build_unsatisfied')
+    expect(out[0]?.failureClass).toBeUndefined()
+  })
+
+  it('is a no-op on failureClass when the control did not fail', () => {
+    const out = applyRunRecordDefaults([rec()], 'scn-1', undefined)
+    expect(out[0]?.failureClass).toBeUndefined()
+    expect(out[0]?.scenarioId).toBe('scn-1')
+  })
+})