From 4c05953045fbbdb41c3a66d2862258f9f522a8e3 Mon Sep 17 00:00:00 2001
From: Drew Stone
Date: Thu, 28 May 2026 07:30:01 -0600
Subject: [PATCH 1/2] feat(run): propagate control failureClass onto canonical RunRecord (agent-eval 0.54.0)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

agent-runtime built RunRecords but never set the structured failure tag,
starving the substrate's cross-agent failure aggregation. The control
layer already classifies failures (ControlRunResult.failureClass) — this
promotes that onto the canonical RunRecord.failureClass key (agent-eval
0.54.0) at the projection boundary, so failures aggregate across every
agent in ONE taxonomy vocabulary.

- bump @tangle-network/agent-eval ^0.52.0 -> ^0.54.0
- applyRunRecordDefaults(): stamps scenarioId + failureClass without
  overriding anything the adapter set; promotes ONLY real FAILURE_CLASSES
  taxonomy tags (a non-taxonomy string like "forge_build_unsatisfied"
  stays unclassified, not polluting the canonical key)
- extracted as a pure exported helper + 4 unit tests (promote /
  don't-override / reject-non-taxonomy / no-failure)

typecheck clean, 374/374.
--- package.json | 2 +- pnpm-lock.yaml | 10 ++++----- src/index.ts | 2 +- src/run.ts | 39 ++++++++++++++++++++++++++++++-- tests/runtime.test.ts | 52 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 9 deletions(-) diff --git a/package.json b/package.json index c2b4648..a7e854c 100644 --- a/package.json +++ b/package.json @@ -76,7 +76,7 @@ "typecheck": "tsc --noEmit" }, "dependencies": { - "@tangle-network/agent-eval": "^0.52.0" + "@tangle-network/agent-eval": "^0.54.0" }, "devDependencies": { "@biomejs/biome": "^2.4.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e0067fb..ba8960c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,8 +9,8 @@ importers: .: dependencies: '@tangle-network/agent-eval': - specifier: ^0.52.0 - version: 0.52.0(@tangle-network/sandbox@0.3.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) + specifier: ^0.54.0 + version: 0.54.0(@tangle-network/sandbox@0.3.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) '@tangle-network/agent-knowledge': specifier: '>=1.3.0 <2.0.0' version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) @@ -458,8 +458,8 @@ packages: engines: {node: '>=20'} hasBin: true - '@tangle-network/agent-eval@0.52.0': - resolution: {integrity: sha512-4/egLYIXzKP4C71GenDDMp1RT7gy9S3Jdg2j55dRZQLB0MxtjiwFaYJvkAjlux5jgo8ir0WGFl+QFIu/rN9TSw==} + '@tangle-network/agent-eval@0.54.0': + resolution: {integrity: sha512-9dmCfXOBZHbmX//RrN/8iKUfmTB21hwjKEWD6qWFszwNK7/KoCzootKsYr6s1yt2vCoX1F54LjwE9qn1VNfUKw==} engines: {node: '>=20'} hasBin: true peerDependencies: @@ -1294,7 +1294,7 @@ snapshots: - typescript - utf-8-validate - '@tangle-network/agent-eval@0.52.0(@tangle-network/sandbox@0.3.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': + '@tangle-network/agent-eval@0.54.0(@tangle-network/sandbox@0.3.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2) '@ax-llm/ax': 
19.0.45(zod@4.4.2) diff --git a/src/index.ts b/src/index.ts index 7fbd7cf..ee034d4 100644 --- a/src/index.ts +++ b/src/index.ts @@ -136,7 +136,7 @@ export { // ── Readiness ───────────────────────────────────────────────────────── export { decideKnowledgeReadiness } from './readiness' // ── Run loop ───────────────────────────────────────────────────────── -export { runAgentTask, runAgentTaskStream } from './run' +export { applyRunRecordDefaults, runAgentTask, runAgentTaskStream } from './run' // ── Production run lifecycle ───────────────────────────────────────── export type { RuntimeRunHandle, diff --git a/src/run.ts b/src/run.ts index a0e6e2c..d736b69 100644 --- a/src/run.ts +++ b/src/run.ts @@ -19,13 +19,46 @@ import { type ControlEvalResult, type ControlRunResult, type DataAcquisitionPlan, + FAILURE_CLASSES, + type FailureClass, type KnowledgeReadinessReport, + type RunRecord, runAgentControlLoop, scoreKnowledgeReadiness, type UserQuestion, userQuestionsForKnowledgeGaps, } from '@tangle-network/agent-eval' +const FAILURE_CLASS_SET = new Set(FAILURE_CLASSES) + +/** True when a free-form control failure string is a canonical taxonomy + * class — so only real taxonomy tags are promoted to the cross-agent + * `RunRecord.failureClass` key; novel strings stay as `failureMode` detail. */ +function asFailureClass(value: string | undefined): FailureClass | undefined { + return value && FAILURE_CLASS_SET.has(value) ? (value as FailureClass) : undefined +} + +/** Stamp cross-cutting defaults onto adapter-projected RunRecords without + * overriding anything the adapter set explicitly: + * - `scenarioId` — the run's scenario, when the record omits one. + * - `failureClass` — the control layer's failure classification promoted + * onto the canonical cross-agent key, but ONLY when it's a real taxonomy + * class. This is what lets the substrate aggregate failures across every + * agent in one vocabulary instead of per-agent ad-hoc strings. 
*/ +export function applyRunRecordDefaults( + records: RunRecord[], + scenarioId: string, + controlFailureClass: string | undefined, +): RunRecord[] { + const fc = asFailureClass(controlFailureClass) + return records.map((record) => { + let r = record + if (r.scenarioId === undefined) r = { ...r, scenarioId } + if (r.failureClass === undefined && fc) r = { ...r, failureClass: fc } + return r + }) +} + import { normalizeBackendStreamEvent } from './backends' import { BackendTransportError, SessionMismatchError } from './errors' import { decideKnowledgeReadiness } from './readiness' @@ -152,8 +185,10 @@ export async function runAgentTask< userAnswers: preflight.userAnswers, acquiredEvidenceIds: preflight.acquiredEvidenceIds, control, - runRecords: (options.adapter.projectRunRecords?.(control, task) ?? []).map((record) => - record.scenarioId === undefined ? { ...record, scenarioId } : record, + runRecords: applyRunRecordDefaults( + options.adapter.projectRunRecords?.(control, task) ?? [], + scenarioId, + control.failureClass, ), } } diff --git a/tests/runtime.test.ts b/tests/runtime.test.ts index d5b48f8..f22bc2a 100644 --- a/tests/runtime.test.ts +++ b/tests/runtime.test.ts @@ -1,6 +1,8 @@ +import type { RunRecord } from '@tangle-network/agent-eval' import { describe, expect, it } from 'vitest' import { type AgentAdapter, + applyRunRecordDefaults, type AgentBackendInput, type AgentExecutionBackend, type AgentTaskSpec, @@ -985,3 +987,53 @@ async function collect(iterable: AsyncIterable): Promise { + function rec(overrides: Partial = {}): RunRecord { + return { + runId: 'run-1', + experimentId: 'exp-1', + candidateId: 'cand-1', + seed: 0, + model: 'm@v', + promptHash: 'sha256:p', + configHash: 'sha256:c', + commitSha: 'abc', + wallMs: 10, + costUsd: 0, + tokenUsage: { input: 0, output: 0 }, + outcome: { holdoutScore: 0, raw: {} }, + splitTag: 'holdout', + ...overrides, + } + } + + it('promotes a canonical control failureClass onto records that lack one', () => { + 
const out = applyRunRecordDefaults([rec()], 'scn-1', 'tool_recovery_failure') + expect(out[0]?.failureClass).toBe('tool_recovery_failure') + expect(out[0]?.scenarioId).toBe('scn-1') + }) + + it('never overrides a failureClass the adapter set explicitly', () => { + const out = applyRunRecordDefaults( + [rec({ failureClass: 'hallucination', scenarioId: 'kept' })], + 'scn-1', + 'tool_recovery_failure', + ) + expect(out[0]?.failureClass).toBe('hallucination') + expect(out[0]?.scenarioId).toBe('kept') + }) + + it('does NOT promote a non-taxonomy control string (stays unclassified)', () => { + // agent-builder's ad-hoc "forge_build_unsatisfied" is not a FailureClass — + // it must not pollute the canonical cross-agent key. + const out = applyRunRecordDefaults([rec()], 'scn-1', 'forge_build_unsatisfied') + expect(out[0]?.failureClass).toBeUndefined() + }) + + it('is a no-op on failureClass when the control did not fail', () => { + const out = applyRunRecordDefaults([rec()], 'scn-1', undefined) + expect(out[0]?.failureClass).toBeUndefined() + expect(out[0]?.scenarioId).toBe('scn-1') + }) +}) From 9589dcb5c7914f630afbc1d89b92ec2aab071b27 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Thu, 28 May 2026 07:50:25 -0600 Subject: [PATCH 2/2] fix(lint): biome import sort + useTemplate (CI lint gate) --- tests/mcp/in-process-executor.test.ts | 2 +- tests/runtime.test.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mcp/in-process-executor.test.ts b/tests/mcp/in-process-executor.test.ts index 1b4ca39..18cadcf 100644 --- a/tests/mcp/in-process-executor.test.ts +++ b/tests/mcp/in-process-executor.test.ts @@ -113,7 +113,7 @@ describe('createInProcessExecutor', () => { const box = await exec.client.create() for await (const _ of ( box as unknown as { streamPrompt: (m: string) => AsyncGenerator } - ).streamPrompt('task ' + i)) { + ).streamPrompt(`task ${i}`)) { // drain } } diff --git a/tests/runtime.test.ts b/tests/runtime.test.ts index f22bc2a..9a54179 
100644 --- a/tests/runtime.test.ts +++ b/tests/runtime.test.ts @@ -2,10 +2,10 @@ import type { RunRecord } from '@tangle-network/agent-eval' import { describe, expect, it } from 'vitest' import { type AgentAdapter, - applyRunRecordDefaults, type AgentBackendInput, type AgentExecutionBackend, type AgentTaskSpec, + applyRunRecordDefaults, type ControlEvalResult, createIterableBackend, createOpenAICompatibleBackend,