From 570326b5ebe6bf065369c2ad405492196b48322b Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 08:47:22 -0600 Subject: [PATCH 1/5] feat(loops): surface aggregated tokenUsage on LoopResult + reportLoopUsage bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runLoop tracked per-call tokensIn/tokensOut (extractLlmCallEvent) but only aggregated costUsd — token counts were dropped before reaching Iteration or LoopResult. A runProfileMatrix/runCampaign dispatch wrapping runLoop could report cost but had no tokens to report, so agent-eval's backend-integrity guard (assertRealBackend, which keys on tokenUsage) would misread a real run as a stub and throw. - Iteration + LoopResult gain tokenUsage: { input, output }, summed across every llm_call event (per iteration) and across iterations (LoopResult). - reportLoopUsage(cost, result) forwards a finished loop's cost + tokens into a campaign cost meter in one call — the trivial consumption path for the new runProfileMatrix primitive. Typed structurally so loops stay free of an agent-eval import. Extends the existing cost-aggregation test to assert token aggregation + reportLoopUsage forwarding. Full suite 381 green. 
--- src/loops/index.ts | 2 ++ src/loops/report-usage.ts | 41 ++++++++++++++++++++++++++++++++++++++ src/loops/run-loop.ts | 12 +++++++++++ src/loops/types.ts | 15 ++++++++++++++ tests/loops/refine.test.ts | 22 ++++++++++++++++++++ 5 files changed, 92 insertions(+) create mode 100644 src/loops/report-usage.ts diff --git a/src/loops/index.ts b/src/loops/index.ts index ae9fad3..7d74986 100644 --- a/src/loops/index.ts +++ b/src/loops/index.ts @@ -27,6 +27,7 @@ export type { CreateRefineDriverOptions, RefineDecision } from './drivers/refine export { createRefineDriver, refineWinnerIndex } from './drivers/refine' export type { RunLoopOptions } from './run-loop' export { runLoop } from './run-loop' +export { reportLoopUsage, type UsageSink } from './report-usage' export type { AgentRunSpec, DefaultVerdict, @@ -42,6 +43,7 @@ export type { LoopSandboxClient, LoopSandboxPlacement, LoopStartedPayload, + LoopTokenUsage, LoopTraceEmitter, LoopTraceEvent, LoopWinner, diff --git a/src/loops/report-usage.ts b/src/loops/report-usage.ts new file mode 100644 index 0000000..30c9a6d --- /dev/null +++ b/src/loops/report-usage.ts @@ -0,0 +1,41 @@ +/** + * Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix + * dispatch. + * + * `runProfileMatrix` (and `runCampaign`) run the backend-integrity guard over + * the token usage a dispatch reports through `ctx.cost`. A dispatch that wraps + * `runLoop` must forward the loop's cost AND token usage, or the guard reads + * the run as a stub and throws. `reportLoopUsage` is that one line: + * + * const dispatch: ProfileDispatchFn = async (profile, scenario, ctx) => { + * const result = await runLoop({ ...optsFor(profile, scenario), ctx: loopCtx }) + * reportLoopUsage(ctx.cost, result) + * return result.winner?.output as A + * } + * + * Typed structurally against the campaign `DispatchContext.cost` so this module + * stays free of an agent-eval import — it works with any cost meter exposing + * `observe` + `observeTokens`. 
+ */ + +import type { LoopResult } from './types' + +/** The slice of an agent-eval campaign `DispatchContext.cost` this needs. */ +export interface UsageSink { + observe(amountUsd: number, source: string): void + observeTokens(usage: { input: number; output: number }): void +} + +/** + * Forward a `LoopResult`'s aggregated cost + token usage into a campaign cost + * meter so the backend-integrity guard sees real LLM activity. `source` + * defaults to `'loop'`. + */ +export function reportLoopUsage( + cost: UsageSink, + result: Pick, 'costUsd' | 'tokenUsage'>, + source = 'loop', +): void { + cost.observe(result.costUsd, source) + cost.observeTokens({ input: result.tokenUsage.input, output: result.tokenUsage.output }) +} diff --git a/src/loops/run-loop.ts b/src/loops/run-loop.ts index fafc52a..c7c8a77 100644 --- a/src/loops/run-loop.ts +++ b/src/loops/run-loop.ts @@ -149,6 +149,7 @@ export async function runLoop( startedAt: now(), endedAt: 0, costUsd: 0, + tokenUsage: { input: 0, output: 0 }, }) } @@ -288,6 +289,8 @@ async function executeIteration(args: ExecuteIterationArgs( ): LoopResult { const winner = (args.options.selectWinner ?? defaultSelectWinner)(args.iterations) const costUsd = args.iterations.reduce((sum, iter) => sum + (iter.costUsd || 0), 0) + const tokenUsage = args.iterations.reduce( + (acc, iter) => { + acc.input += iter.tokenUsage?.input ?? 0 + acc.output += iter.tokenUsage?.output ?? 0 + return acc + }, + { input: 0, output: 0 }, + ) const result: LoopResult = { decision: args.decision, iterations: args.iterations, winner, durationMs: args.now() - args.startMs, costUsd, + tokenUsage, } void emitTrace(args.options.ctx.traceEmitter, { kind: 'loop.ended', diff --git a/src/loops/types.ts b/src/loops/types.ts index a28eae0..a183cd9 100644 --- a/src/loops/types.ts +++ b/src/loops/types.ts @@ -90,6 +90,15 @@ export interface OutputAdapter { parse(events: SandboxEvent[]): Output } +/** LLM token usage. 
Structurally matches agent-eval's `RunTokenUsage` / + * `CampaignTokenUsage` ({ input, output }) so a loop result maps straight + * onto `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch — without + * which the backend-integrity guard reads the run as a stub. */ +export interface LoopTokenUsage { + input: number + output: number +} + /** @experimental */ export interface Iteration { /** 0-based iteration index assigned by the kernel. */ @@ -105,6 +114,8 @@ export interface Iteration { startedAt: number endedAt: number costUsd: number + /** Summed LLM token usage across every `llm_call` event in this iteration. */ + tokenUsage: LoopTokenUsage } /** @experimental */ @@ -144,6 +155,10 @@ export interface LoopResult { durationMs: number /** Sum of every iteration's `costUsd`. */ costUsd: number + /** Sum of every iteration's token usage. Forward to + * `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch so the + * integrity guard sees real LLM activity. */ + tokenUsage: LoopTokenUsage } /** diff --git a/tests/loops/refine.test.ts b/tests/loops/refine.test.ts index 50121de..a8b4915 100644 --- a/tests/loops/refine.test.ts +++ b/tests/loops/refine.test.ts @@ -11,6 +11,7 @@ import { type LoopTraceEvent, type OutputAdapter, refineWinnerIndex, + reportLoopUsage, runLoop, type Validator, } from '../../src/loops' @@ -242,6 +243,27 @@ describe('runLoop + createRefineDriver', () => { expect(result.iterations[0]?.costUsd).toBeCloseTo(0.01, 9) expect(result.iterations[1]?.costUsd).toBeCloseTo(0.02, 9) expect(result.costUsd).toBeCloseTo(0.03, 9) + // Token usage must aggregate too — a runProfileMatrix dispatch forwards + // this to the backend-integrity guard; if it stayed 0/0 a real run would + // be misread as a stub. 
+ expect(result.iterations[0]?.tokenUsage).toEqual({ input: 100, output: 50 }) + expect(result.iterations[1]?.tokenUsage).toEqual({ input: 80, output: 30 }) + expect(result.tokenUsage).toEqual({ input: 180, output: 80 }) + + // reportLoopUsage forwards both cost AND tokens into a campaign cost meter. + const observed: Array<{ usd: number; src: string }> = [] + let tokens = { input: 0, output: 0 } + reportLoopUsage( + { + observe: (usd, src) => observed.push({ usd, src }), + observeTokens: (u) => { + tokens = u + }, + }, + result, + ) + expect(observed).toEqual([{ usd: 0.03, src: 'loop' }]) + expect(tokens).toEqual({ input: 180, output: 80 }) }) it('refineWinnerIndex returns the last valid iteration', () => { From 9cbd6862d806a53c113ae065b1c71673f8d222d8 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 08:52:12 -0600 Subject: [PATCH 2/5] =?UTF-8?q?chore(deps):=20bump=20@tangle-network/agent?= =?UTF-8?q?-eval=20^0.54.0=20=E2=86=92=20^0.61.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consumes the published runProfileMatrix + token-capture release. 7-minor jump verified: typecheck + build + full suite (381) green. 
--- package.json | 2 +- pnpm-lock.yaml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/package.json b/package.json index a6c2aca..4565a5a 100644 --- a/package.json +++ b/package.json @@ -76,7 +76,7 @@ "typecheck": "tsc --noEmit" }, "dependencies": { - "@tangle-network/agent-eval": "^0.54.0" + "@tangle-network/agent-eval": "^0.61.0" }, "devDependencies": { "@biomejs/biome": "^2.4.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7e4087e..ee94426 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,8 +9,8 @@ importers: .: dependencies: '@tangle-network/agent-eval': - specifier: ^0.54.0 - version: 0.54.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) + specifier: ^0.61.0 + version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) '@tangle-network/agent-knowledge': specifier: '>=1.3.0 <2.0.0' version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) @@ -458,12 +458,12 @@ packages: engines: {node: '>=20'} hasBin: true - '@tangle-network/agent-eval@0.54.0': - resolution: {integrity: sha512-9dmCfXOBZHbmX//RrN/8iKUfmTB21hwjKEWD6qWFszwNK7/KoCzootKsYr6s1yt2vCoX1F54LjwE9qn1VNfUKw==} + '@tangle-network/agent-eval@0.61.0': + resolution: {integrity: sha512-yydVL47bNa2lNaapgFnDKjYRPJfpVTK8luFASCuLNyKtahibMM7bXF+JCScKhYdtEwCYiUijZI0F8VaoIvDi3g==} engines: {node: '>=20'} hasBin: true peerDependencies: - '@tangle-network/sandbox': '>=0.2.1 <0.4.0' + '@tangle-network/sandbox': '>=0.2.1 <0.5.0' peerDependenciesMeta: '@tangle-network/sandbox': optional: true @@ -1289,7 +1289,7 @@ snapshots: - typescript - utf-8-validate - '@tangle-network/agent-eval@0.54.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': + '@tangle-network/agent-eval@0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2) 
'@ax-llm/ax': 19.0.45(zod@4.4.2) From 01f3b2874ad05cdbd157687b696fa7b388da84aa Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 09:08:21 -0600 Subject: [PATCH 3/5] =?UTF-8?q?feat(loops):=20loopDispatch=20=E2=80=94=20f?= =?UTF-8?q?irst-class=20runLoop=E2=86=92campaign=20dispatch=20adapter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The seam critique found reportLoopUsage had one consumer (a test) and zero products: wiring runLoop into runProfileMatrix/runCampaign required hand-building ExecCtx, hand-adapting the campaign trace, and remembering to forward usage (forgetting the last yields a {0,0} stub cell). loopDispatch collapses all three into one typed call: const dispatch = loopDispatch({ sandboxClient, toLoopOptions }) await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha }) It builds the ExecCtx, forwards loop.* trace events into the campaign's scoped trace (campaignTraceToLoopEmitter), runs runLoop, reports cost+tokens via reportLoopUsage internally, and returns winner.output. loopCampaignDispatch is the runCampaign (no-profile) variant. AgentProfile imported from agent-eval (the eval-harness type ProfileDispatchFn keys on), NOT sandbox's — closes the name-collision footgun at this call site. Tests: returns winner artifact + reports exact usage + forwards trace spans; usage still flows on a validator-failing run (must not read as a stub). Full suite 383 green. 
--- src/loops/index.ts | 6 ++ src/loops/loop-dispatch.ts | 132 +++++++++++++++++++++++ tests/loops/loop-dispatch.test.ts | 168 ++++++++++++++++++++++++++++++ 3 files changed, 306 insertions(+) create mode 100644 src/loops/loop-dispatch.ts create mode 100644 tests/loops/loop-dispatch.test.ts diff --git a/src/loops/index.ts b/src/loops/index.ts index 7d74986..d854266 100644 --- a/src/loops/index.ts +++ b/src/loops/index.ts @@ -28,6 +28,12 @@ export { createRefineDriver, refineWinnerIndex } from './drivers/refine' export type { RunLoopOptions } from './run-loop' export { runLoop } from './run-loop' export { reportLoopUsage, type UsageSink } from './report-usage' +export { + loopCampaignDispatch, + loopDispatch, + type LoopDispatchOptions, + type LoopOptionsForDispatch, +} from './loop-dispatch' export type { AgentRunSpec, DefaultVerdict, diff --git a/src/loops/loop-dispatch.ts b/src/loops/loop-dispatch.ts new file mode 100644 index 0000000..e4c9a77 --- /dev/null +++ b/src/loops/loop-dispatch.ts @@ -0,0 +1,132 @@ +/** + * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch. + * + * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` / + * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a + * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a + * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to + * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a + * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns, + * the third silent. The fleet's products skipped (c) and fell back to a + * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists + * to kill. 
+ * + * `loopDispatch` collapses all three into one typed call: + * + * const dispatch = loopDispatch({ + * sandboxClient, + * toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }), + * }) + * await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha }) + * + * Usage is reported automatically; trace events are forwarded automatically; + * the ctx is built automatically. The seam becomes impossible to mis-wire. + * + * Typed structurally against the campaign `DispatchContext` (imported type-only + * from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an + * inversion. + */ + +// agent-eval's AgentProfile (the eval-harness unit of variation, `model: string`) +// — NOT sandbox's AgentProfile. ProfileDispatchFn is keyed on the former. +import type { AgentProfile } from '@tangle-network/agent-eval' +import type { + CampaignTraceWriter, + DispatchContext, + DispatchFn, + ProfileDispatchFn, + Scenario, +} from '@tangle-network/agent-eval/campaign' +import { reportLoopUsage } from './report-usage' +import { type RunLoopOptions, runLoop } from './run-loop' +import type { LoopResult, LoopSandboxClient, LoopTraceEmitter } from './types' + +/** runLoop options minus the `ctx` (loopDispatch builds the ctx). */ +export type LoopOptionsForDispatch = Omit< + RunLoopOptions, + 'ctx' +> + +export interface LoopDispatchOptions { + /** Sandbox client used for every cell's `runLoop`. Supplied once. */ + sandboxClient: LoopSandboxClient + /** Build the per-cell runLoop options from the scenario (+ profile, when + * used with `runProfileMatrix`). */ + toLoopOptions: ( + scenario: TScenario, + profile: AgentProfile, + ) => LoopOptionsForDispatch + /** Map the finished loop to the artifact the judges score. Default: + * `result.winner?.output`. A loop with no winner yields `undefined` (judges + * skip the cell) — but the loop's token usage is STILL reported, so the + * integrity guard sees real activity. 
*/ + toArtifact?: (result: LoopResult) => TArtifact + /** Forward `loop.*` trace events into the campaign's scoped trace so loop + * spans correlate with the cell. Default true. */ + forwardTrace?: boolean + /** Cost-meter source label for the loop's spend. Default `'loop'`. */ + costSource?: string +} + +/** Bridge a campaign `DispatchContext.trace` to a `LoopTraceEmitter` so every + * `loop.*` event lands as a span under the cell's scoped trace. */ +function campaignTraceToLoopEmitter(trace: CampaignTraceWriter): LoopTraceEmitter { + return { + emit(event) { + trace + .span(event.kind, { runId: event.runId, timestamp: event.timestamp, ...event.payload }) + .end() + }, + } +} + +async function runLoopForCell( + opts: LoopDispatchOptions, + scenario: TScenario, + profile: AgentProfile, + ctx: DispatchContext, +): Promise { + const loopOptions = opts.toLoopOptions(scenario, profile) + const result = await runLoop({ + ...loopOptions, + ctx: { + sandboxClient: opts.sandboxClient, + signal: ctx.signal, + traceEmitter: + opts.forwardTrace === false ? undefined : campaignTraceToLoopEmitter(ctx.trace), + }, + }) + reportLoopUsage(ctx.cost, result, opts.costSource ?? 'loop') + const toArtifact = + opts.toArtifact ?? ((r: LoopResult) => r.winner?.output as TArtifact) + return toArtifact(result) +} + +/** + * Adapter for `runProfileMatrix` (profile is an axis). Returns a + * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and + * reports usage automatically. + */ +export function loopDispatch( + opts: LoopDispatchOptions, +): ProfileDispatchFn { + return (profile, scenario, ctx) => runLoopForCell(opts, scenario, profile, ctx) +} + +/** + * Adapter for `runCampaign` (no profile axis). `toLoopOptions` receives only + * the scenario; the `profile` passed to the shared core is a stable sentinel + * so a single `runLoop` config is reused across cells. 
+ */ +export function loopCampaignDispatch( + opts: Omit, 'toLoopOptions'> & { + toLoopOptions: (scenario: TScenario) => LoopOptionsForDispatch + }, +): DispatchFn { + const profileSentinel = { id: 'loop-campaign', model: 'n/a@loop-campaign' } as AgentProfile + const profiled: LoopDispatchOptions = { + ...opts, + toLoopOptions: (scenario) => opts.toLoopOptions(scenario), + } + return (scenario, ctx) => runLoopForCell(profiled, scenario, profileSentinel, ctx) +} diff --git a/tests/loops/loop-dispatch.test.ts b/tests/loops/loop-dispatch.test.ts new file mode 100644 index 0000000..517a19c --- /dev/null +++ b/tests/loops/loop-dispatch.test.ts @@ -0,0 +1,168 @@ +import type { + AgentProfile as SandboxAgentProfile, + CreateSandboxOptions, + SandboxEvent, + SandboxInstance, +} from '@tangle-network/sandbox' +import type { DispatchContext } from '@tangle-network/agent-eval/campaign' +import { describe, expect, it } from 'vitest' +import { + type AgentRunSpec, + createRefineDriver, + loopDispatch, + type OutputAdapter, + type Validator, +} from '../../src/loops' + +interface Task { + goal: string +} +interface Output { + attempt: number +} +interface FakeScenario { + id: string + kind: string +} + +const sandboxProfile: SandboxAgentProfile = { name: 'stub' } + +function spec(): AgentRunSpec { + return { profile: sandboxProfile, name: 'agent', taskToPrompt: (t) => t.goal } +} + +const output: OutputAdapter = { + parse: (events) => { + const data = events.at(-1)?.data as { attempt?: number } | undefined + return { attempt: typeof data?.attempt === 'number' ? 
data.attempt : -1 } + }, +} + +const passAlways: Validator = { + async validate(out) { + return { valid: true, score: 1, scores: { attempt: out.attempt } } + }, +} + +function stubClient(events: SandboxEvent[]): { create(opts?: CreateSandboxOptions): Promise } { + return { + async create() { + return { + async *streamPrompt() { + for (const e of events) yield e + }, + } as unknown as SandboxInstance + }, + } +} + +/** Minimal campaign DispatchContext that records what the dispatch reports. */ +function fakeDispatchContext(): { + ctx: DispatchContext + observed: Array<{ usd: number; src: string }> + tokens: { input: number; output: number } + spans: string[] +} { + const observed: Array<{ usd: number; src: string }> = [] + const tokens = { input: 0, output: 0 } + const spans: string[] = [] + const ctx: DispatchContext = { + cellId: 'cell-0', + rep: 0, + seed: 1, + signal: new AbortController().signal, + trace: { + span(name: string) { + spans.push(name) + return { end() {}, setAttribute() {} } + }, + async flush() {}, + }, + artifacts: { + async write() { + return 'p' + }, + async writeJson() { + return 'p' + }, + }, + cost: { + observe(usd: number, src: string) { + observed.push({ usd, src }) + }, + observeTokens(u: { input: number; output: number }) { + tokens.input += u.input + tokens.output += u.output + }, + current() { + return 0 + }, + tokens() { + return tokens + }, + }, + } + return { ctx, observed, tokens, spans } +} + +describe('loopDispatch', () => { + it('bridges runLoop into a ProfileDispatchFn: returns the winner artifact, reports usage, forwards trace', async () => { + const sandboxClient = stubClient([ + { type: 'llm_call', data: { tokensIn: 150, tokensOut: 60, costUsd: 0.02, model: 'm' } }, + { type: 'result', data: { attempt: 2 } }, + ]) + const dispatch = loopDispatch({ + sandboxClient, + toLoopOptions: (scenario) => ({ + driver: createRefineDriver(), + agentRun: spec(), + output, + validator: passAlways, + task: { goal: scenario.id }, + 
maxIterations: 1, + }), + }) + + const fake = fakeDispatchContext() + const profile = { id: 'baseline', model: 'test-model@2025-01-01' } + const artifact = await dispatch(profile, { id: 's1', kind: 'task' }, fake.ctx) + + // Returns the loop's winner output. + expect(artifact).toEqual({ attempt: 2 }) + // Usage reported to the campaign cost meter — the integrity guard's input. + expect(fake.observed).toEqual([{ usd: 0.02, src: 'loop' }]) + expect(fake.tokens).toEqual({ input: 150, output: 60 }) + // Loop trace events forwarded into the campaign trace as spans. + expect(fake.spans).toContain('loop.started') + expect(fake.spans).toContain('loop.ended') + }) + + it('reports usage even when the run fails the validator (real activity must NOT read as a stub)', async () => { + const failAlways: Validator = { + async validate() { + return { valid: false, score: 0, scores: {}, notes: 'no' } + }, + } + const sandboxClient = stubClient([ + { type: 'llm_call', data: { tokensIn: 90, tokensOut: 20, costUsd: 0.01, model: 'm' } }, + { type: 'result', data: { attempt: 1 } }, + ]) + const dispatch = loopDispatch({ + sandboxClient, + toLoopOptions: (scenario) => ({ + driver: createRefineDriver(), + agentRun: spec(), + output, + validator: failAlways, + task: { goal: scenario.id }, + maxIterations: 1, + }), + }) + const fake = fakeDispatchContext() + await dispatch({ id: 'p', model: 'm@2025-01-01' }, { id: 's1', kind: 'task' }, fake.ctx) + // The validator failed, but real LLM activity happened — tokens + cost MUST + // still reach the cost meter, or the integrity guard would call it a stub. 
+ expect(fake.tokens).toEqual({ input: 90, output: 20 }) + expect(fake.observed).toEqual([{ usd: 0.01, src: 'loop' }]) + }) +}) From fad618ee7e7eba2fa0fb1c76c554eda658faa60c Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 09:25:05 -0600 Subject: [PATCH 4/5] chore(deps): declare agent-eval as a required peerDependency, not a hard dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Version-discipline fix (boundary critique, VERSIONING 3/10). agent-eval was the lone hard dependency while sandbox + agent-knowledge are already peers. A hard dep lets pnpm install a SECOND, divergent agent-eval tree with an incompatible RunRecord/DefaultVerdict; today only pnpm.overrides prevents it. As a peer (>=0.61.0 <1.0.0, required — not optional), a consumer running a stale or divergent substrate gets a loud unmet-peer warning instead of a silent split tree. agent-eval moves to devDependencies for agent-runtime's own build/test. Typecheck + full suite (383) green with the peer layout. 
--- package.json | 6 +++--- pnpm-lock.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/package.json b/package.json index 4565a5a..7835731 100644 --- a/package.json +++ b/package.json @@ -75,11 +75,10 @@ "lint:fix": "biome check --write src tests examples", "typecheck": "tsc --noEmit" }, - "dependencies": { - "@tangle-network/agent-eval": "^0.61.0" - }, + "dependencies": {}, "devDependencies": { "@biomejs/biome": "^2.4.0", + "@tangle-network/agent-eval": "^0.61.0", "@tangle-network/sandbox": "^0.4.0", "@types/node": "^25.6.0", "tsup": "^8.0.0", @@ -101,6 +100,7 @@ "license": "MIT", "packageManager": "pnpm@10.28.0", "peerDependencies": { + "@tangle-network/agent-eval": ">=0.61.0 <1.0.0", "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0", "@tangle-network/sandbox": ">=0.1.2 <0.5.0" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ee94426..45dc755 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,9 +8,6 @@ importers: .: dependencies: - '@tangle-network/agent-eval': - specifier: ^0.61.0 - version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) '@tangle-network/agent-knowledge': specifier: '>=1.3.0 <2.0.0' version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) @@ -18,6 +15,9 @@ importers: '@biomejs/biome': specifier: ^2.4.0 version: 2.4.15 + '@tangle-network/agent-eval': + specifier: ^0.61.0 + version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) '@tangle-network/sandbox': specifier: ^0.4.0 version: 0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) From ffc89ce123c22f797409b36cb7ac4d36817409e1 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 09:27:33 -0600 Subject: [PATCH 5/5] =?UTF-8?q?chore(release):=200.32.0=20=E2=80=94=20loop?= =?UTF-8?q?Dispatch=20adapter=20+=20tokenUsage=20seam=20+=20agent-eval=20p?= =?UTF-8?q?eer-dep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 7835731..a2f128b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.31.0", + "version": "0.32.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": {