From 4c05953045fbbdb41c3a66d2862258f9f522a8e3 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Thu, 28 May 2026 07:30:01 -0600
Subject: [PATCH 1/2] feat(run): propagate control failureClass onto canonical
 RunRecord (agent-eval 0.54.0)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

agent-runtime built RunRecords but never set the structured failure tag, starving
the substrate's cross-agent failure aggregation. The control layer already
classifies failures (ControlRunResult.failureClass) — this promotes that onto the
canonical RunRecord.failureClass key (agent-eval 0.54.0) at the projection
boundary, so failures aggregate across every agent in ONE taxonomy vocabulary.

- bump @tangle-network/agent-eval ^0.52.0 -> ^0.54.0
- applyRunRecordDefaults(): stamps scenarioId + failureClass without overriding
  anything the adapter set; promotes ONLY real FAILURE_CLASSES taxonomy tags
  (a non-taxonomy string like "forge_build_unsatisfied" stays unclassified, not
  polluting the canonical key)
- extracted as a pure exported helper + 4 unit tests (promote / don't-override /
  reject-non-taxonomy / no-failure)

Typecheck clean; 374/374 tests pass.
---
 package.json          |  2 +-
 pnpm-lock.yaml        | 10 ++++-----
 src/index.ts          |  2 +-
 src/run.ts            | 39 ++++++++++++++++++++++++++++++--
 tests/runtime.test.ts | 52 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 96 insertions(+), 9 deletions(-)

diff --git a/package.json b/package.json
index c2b4648..a7e854c 100644
--- a/package.json
+++ b/package.json
@@ -76,7 +76,7 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
-    "@tangle-network/agent-eval": "^0.52.0"
+    "@tangle-network/agent-eval": "^0.54.0"
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index e0067fb..ba8960c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -9,8 +9,8 @@ importers:
   .:
     dependencies:
       '@tangle-network/agent-eval':
-        specifier: ^0.52.0
-        version: 0.52.0(@tangle-network/sandbox@0.3.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
+        specifier: ^0.54.0
+        version: 0.54.0(@tangle-network/sandbox@0.3.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
       '@tangle-network/agent-knowledge':
         specifier: '>=1.3.0 <2.0.0'
         version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))
@@ -458,8 +458,8 @@ packages:
     engines: {node: '>=20'}
     hasBin: true
 
-  '@tangle-network/agent-eval@0.52.0':
-    resolution: {integrity: sha512-4/egLYIXzKP4C71GenDDMp1RT7gy9S3Jdg2j55dRZQLB0MxtjiwFaYJvkAjlux5jgo8ir0WGFl+QFIu/rN9TSw==}
+  '@tangle-network/agent-eval@0.54.0':
+    resolution: {integrity: sha512-9dmCfXOBZHbmX//RrN/8iKUfmTB21hwjKEWD6qWFszwNK7/KoCzootKsYr6s1yt2vCoX1F54LjwE9qn1VNfUKw==}
     engines: {node: '>=20'}
     hasBin: true
     peerDependencies:
@@ -1294,7 +1294,7 @@ snapshots:
       - typescript
       - utf-8-validate
 
-  '@tangle-network/agent-eval@0.52.0(@tangle-network/sandbox@0.3.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)':
+  '@tangle-network/agent-eval@0.54.0(@tangle-network/sandbox@0.3.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)':
     dependencies:
       '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2)
       '@ax-llm/ax': 19.0.45(zod@4.4.2)
diff --git a/src/index.ts b/src/index.ts
index 7fbd7cf..ee034d4 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -136,7 +136,7 @@ export {
 // ── Readiness ─────────────────────────────────────────────────────────
 export { decideKnowledgeReadiness } from './readiness'
 // ── Run loop ─────────────────────────────────────────────────────────
-export { runAgentTask, runAgentTaskStream } from './run'
+export { applyRunRecordDefaults, runAgentTask, runAgentTaskStream } from './run'
 // ── Production run lifecycle ─────────────────────────────────────────
 export type {
   RuntimeRunHandle,
diff --git a/src/run.ts b/src/run.ts
index a0e6e2c..d736b69 100644
--- a/src/run.ts
+++ b/src/run.ts
@@ -19,13 +19,46 @@ import {
   type ControlEvalResult,
   type ControlRunResult,
   type DataAcquisitionPlan,
+  FAILURE_CLASSES,
+  type FailureClass,
   type KnowledgeReadinessReport,
+  type RunRecord,
   runAgentControlLoop,
   scoreKnowledgeReadiness,
   type UserQuestion,
   userQuestionsForKnowledgeGaps,
 } from '@tangle-network/agent-eval'
 
+const FAILURE_CLASS_SET = new Set<string>(FAILURE_CLASSES)
+
+/** Narrow a free-form control failure string to a canonical taxonomy class.
+ *  Returns `value` only when it is a member of `FAILURE_CLASSES`; `undefined`,
+ *  an empty string, or any non-taxonomy string yields `undefined`. */
+function asFailureClass(value: string | undefined): FailureClass | undefined {
+  return value && FAILURE_CLASS_SET.has(value) ? (value as FailureClass) : undefined
+}
+
+/** Stamp cross-cutting defaults onto adapter-projected RunRecords without
+ *  overriding anything the adapter set explicitly. Returns a new array; a
+ *  record is shallow-copied only when a field is added, never mutated.
+ *   - `scenarioId` — the run's scenario, when the record omits one.
+ *   - `failureClass` — `controlFailureClass` promoted onto the canonical
+ *     cross-agent key, but ONLY when it is a real taxonomy class, so failures
+ *     aggregate in one vocabulary instead of per-agent ad-hoc strings. */
+export function applyRunRecordDefaults(
+  records: RunRecord[],
+  scenarioId: string,
+  controlFailureClass: string | undefined,
+): RunRecord[] {
+  const fc = asFailureClass(controlFailureClass)
+  return records.map((record) => {
+    let r = record
+    if (r.scenarioId === undefined) r = { ...r, scenarioId }
+    if (r.failureClass === undefined && fc) r = { ...r, failureClass: fc }
+    return r
+  })
+}
+
 import { normalizeBackendStreamEvent } from './backends'
 import { BackendTransportError, SessionMismatchError } from './errors'
 import { decideKnowledgeReadiness } from './readiness'
@@ -152,8 +185,10 @@ export async function runAgentTask<
     userAnswers: preflight.userAnswers,
     acquiredEvidenceIds: preflight.acquiredEvidenceIds,
     control,
-    runRecords: (options.adapter.projectRunRecords?.(control, task) ?? []).map((record) =>
-      record.scenarioId === undefined ? { ...record, scenarioId } : record,
+    runRecords: applyRunRecordDefaults(
+      options.adapter.projectRunRecords?.(control, task) ?? [],
+      scenarioId,
+      control.failureClass,
     ),
   }
 }
diff --git a/tests/runtime.test.ts b/tests/runtime.test.ts
index d5b48f8..f22bc2a 100644
--- a/tests/runtime.test.ts
+++ b/tests/runtime.test.ts
@@ -1,6 +1,8 @@
+import type { RunRecord } from '@tangle-network/agent-eval'
 import { describe, expect, it } from 'vitest'
 import {
   type AgentAdapter,
+  applyRunRecordDefaults,
   type AgentBackendInput,
   type AgentExecutionBackend,
   type AgentTaskSpec,
@@ -985,3 +987,53 @@ async function collect(iterable: AsyncIterable<RuntimeStreamEvent>): Promise<Run
   for await (const event of iterable) events.push(event)
   return events
 }
+
+describe('applyRunRecordDefaults — canonical failureClass propagation', () => {
+  function rec(overrides: Partial<RunRecord> = {}): RunRecord {
+    return {
+      runId: 'run-1',
+      experimentId: 'exp-1',
+      candidateId: 'cand-1',
+      seed: 0,
+      model: 'm@v',
+      promptHash: 'sha256:p',
+      configHash: 'sha256:c',
+      commitSha: 'abc',
+      wallMs: 10,
+      costUsd: 0,
+      tokenUsage: { input: 0, output: 0 },
+      outcome: { holdoutScore: 0, raw: {} },
+      splitTag: 'holdout',
+      ...overrides,
+    }
+  }
+
+  it('promotes a canonical control failureClass onto records that lack one', () => {
+    const out = applyRunRecordDefaults([rec()], 'scn-1', 'tool_recovery_failure')
+    expect(out[0]?.failureClass).toBe('tool_recovery_failure')
+    expect(out[0]?.scenarioId).toBe('scn-1')
+  })
+
+  it('never overrides a failureClass the adapter set explicitly', () => {
+    const out = applyRunRecordDefaults(
+      [rec({ failureClass: 'hallucination', scenarioId: 'kept' })],
+      'scn-1',
+      'tool_recovery_failure',
+    )
+    expect(out[0]?.failureClass).toBe('hallucination')
+    expect(out[0]?.scenarioId).toBe('kept')
+  })
+
+  it('does NOT promote a non-taxonomy control string (stays unclassified)', () => {
+    // agent-builder's ad-hoc "forge_build_unsatisfied" is not a FailureClass —
+    // it must not pollute the canonical cross-agent key.
+    const out = applyRunRecordDefaults([rec()], 'scn-1', 'forge_build_unsatisfied')
+    expect(out[0]?.failureClass).toBeUndefined()
+  })
+
+  it('is a no-op on failureClass when the control did not fail', () => {
+    const out = applyRunRecordDefaults([rec()], 'scn-1', undefined)
+    expect(out[0]?.failureClass).toBeUndefined()
+    expect(out[0]?.scenarioId).toBe('scn-1')
+  })
+})

From 9589dcb5c7914f630afbc1d89b92ec2aab071b27 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Thu, 28 May 2026 07:50:25 -0600
Subject: [PATCH 2/2] fix(lint): biome import sort + useTemplate (CI lint gate)

---
 tests/mcp/in-process-executor.test.ts | 2 +-
 tests/runtime.test.ts                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/mcp/in-process-executor.test.ts b/tests/mcp/in-process-executor.test.ts
index 1b4ca39..18cadcf 100644
--- a/tests/mcp/in-process-executor.test.ts
+++ b/tests/mcp/in-process-executor.test.ts
@@ -113,7 +113,7 @@ describe('createInProcessExecutor', () => {
       const box = await exec.client.create()
       for await (const _ of (
         box as unknown as { streamPrompt: (m: string) => AsyncGenerator<unknown> }
-      ).streamPrompt('task ' + i)) {
+      ).streamPrompt(`task ${i}`)) {
         // drain
       }
     }
diff --git a/tests/runtime.test.ts b/tests/runtime.test.ts
index f22bc2a..9a54179 100644
--- a/tests/runtime.test.ts
+++ b/tests/runtime.test.ts
@@ -2,10 +2,10 @@ import type { RunRecord } from '@tangle-network/agent-eval'
 import { describe, expect, it } from 'vitest'
 import {
   type AgentAdapter,
-  applyRunRecordDefaults,
   type AgentBackendInput,
   type AgentExecutionBackend,
   type AgentTaskSpec,
+  applyRunRecordDefaults,
   type ControlEvalResult,
   createIterableBackend,
   createOpenAICompatibleBackend,