From 570326b5ebe6bf065369c2ad405492196b48322b Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 08:47:22 -0600
Subject: [PATCH 1/5] feat(loops): surface aggregated tokenUsage on LoopResult
 + reportLoopUsage bridge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

runLoop tracked per-call tokensIn/tokensOut (extractLlmCallEvent) but only
aggregated costUsd — token counts were dropped before reaching Iteration or
LoopResult. A runProfileMatrix/runCampaign dispatch wrapping runLoop could
report cost but had no tokens to report, so agent-eval's backend-integrity
guard (assertRealBackend, which keys on tokenUsage) would misread a real run
as a stub and throw.

- Iteration + LoopResult gain tokenUsage: { input, output }, summed across
  every llm_call event (per iteration) and across iterations (LoopResult).
- reportLoopUsage(cost, result) forwards a finished loop's cost + tokens into
  a campaign cost meter in one call — the trivial consumption path for the new
  runProfileMatrix primitive. Typed structurally so loops stay free of an
  agent-eval import.

Extends the existing cost-aggregation test to assert token aggregation +
reportLoopUsage forwarding. Full suite 381 green.
---
 src/loops/index.ts         |  2 ++
 src/loops/report-usage.ts  | 41 ++++++++++++++++++++++++++++++++++++++
 src/loops/run-loop.ts      | 12 +++++++++++
 src/loops/types.ts         | 15 ++++++++++++++
 tests/loops/refine.test.ts | 22 ++++++++++++++++++++
 5 files changed, 92 insertions(+)
 create mode 100644 src/loops/report-usage.ts

diff --git a/src/loops/index.ts b/src/loops/index.ts
index ae9fad3..7d74986 100644
--- a/src/loops/index.ts
+++ b/src/loops/index.ts
@@ -27,6 +27,7 @@ export type { CreateRefineDriverOptions, RefineDecision } from './drivers/refine
 export { createRefineDriver, refineWinnerIndex } from './drivers/refine'
 export type { RunLoopOptions } from './run-loop'
 export { runLoop } from './run-loop'
+export { reportLoopUsage, type UsageSink } from './report-usage'
 export type {
   AgentRunSpec,
   DefaultVerdict,
@@ -42,6 +43,7 @@ export type {
   LoopSandboxClient,
   LoopSandboxPlacement,
   LoopStartedPayload,
+  LoopTokenUsage,
   LoopTraceEmitter,
   LoopTraceEvent,
   LoopWinner,
diff --git a/src/loops/report-usage.ts b/src/loops/report-usage.ts
new file mode 100644
index 0000000..30c9a6d
--- /dev/null
+++ b/src/loops/report-usage.ts
@@ -0,0 +1,41 @@
+/**
+ * Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix
+ * dispatch.
+ *
+ * `runProfileMatrix` (and `runCampaign`) run the backend-integrity guard over
+ * the token usage a dispatch reports through `ctx.cost`. A dispatch that wraps
+ * `runLoop` must forward the loop's cost AND token usage, or the guard reads
+ * the run as a stub and throws. `reportLoopUsage` is that one line:
+ *
+ *   const dispatch: ProfileDispatchFn<S, A> = async (profile, scenario, ctx) => {
+ *     const result = await runLoop({ ...optsFor(profile, scenario), ctx: loopCtx })
+ *     reportLoopUsage(ctx.cost, result)
+ *     return result.winner?.output as A
+ *   }
+ *
+ * Typed structurally against the campaign `DispatchContext.cost` so this module
+ * stays free of an agent-eval import — it works with any cost meter exposing
+ * `observe` + `observeTokens`.
+ */
+
+import type { LoopResult } from './types'
+
+/** The slice of an agent-eval campaign `DispatchContext.cost` this needs. */
+export interface UsageSink {
+  observe(amountUsd: number, source: string): void
+  observeTokens(usage: { input: number; output: number }): void
+}
+
+/**
+ * Forward a `LoopResult`'s aggregated cost + token usage into a campaign cost
+ * meter so the backend-integrity guard sees real LLM activity. `source`
+ * defaults to `'loop'`.
+ */
+export function reportLoopUsage<Task, Output, Decision>(
+  cost: UsageSink,
+  result: Pick<LoopResult<Task, Output, Decision>, 'costUsd' | 'tokenUsage'>,
+  source = 'loop',
+): void {
+  cost.observe(result.costUsd, source)
+  cost.observeTokens({ input: result.tokenUsage.input, output: result.tokenUsage.output })
+}
diff --git a/src/loops/run-loop.ts b/src/loops/run-loop.ts
index fafc52a..c7c8a77 100644
--- a/src/loops/run-loop.ts
+++ b/src/loops/run-loop.ts
@@ -149,6 +149,7 @@ export async function runLoop<Task, Output, Decision>(
           startedAt: now(),
           endedAt: 0,
           costUsd: 0,
+          tokenUsage: { input: 0, output: 0 },
         })
       }
 
@@ -288,6 +289,8 @@ async function executeIteration<Task, Output>(args: ExecuteIterationArgs<Task, O
       const llmCall = extractLlmCallEvent(event, slot.agentRunName)
       if (llmCall) {
         slot.costUsd += llmCall.costUsd ?? 0
+        slot.tokenUsage.input += llmCall.tokensIn ?? 0
+        slot.tokenUsage.output += llmCall.tokensOut ?? 0
         args.ctx.runHandle?.observe(llmCall)
       }
     }
@@ -405,12 +408,21 @@ function finalize<Task, Output, Decision>(
 ): LoopResult<Task, Output, Decision> {
   const winner = (args.options.selectWinner ?? defaultSelectWinner)(args.iterations)
   const costUsd = args.iterations.reduce((sum, iter) => sum + (iter.costUsd || 0), 0)
+  const tokenUsage = args.iterations.reduce(
+    (acc, iter) => {
+      acc.input += iter.tokenUsage?.input ?? 0
+      acc.output += iter.tokenUsage?.output ?? 0
+      return acc
+    },
+    { input: 0, output: 0 },
+  )
   const result: LoopResult<Task, Output, Decision> = {
     decision: args.decision,
     iterations: args.iterations,
     winner,
     durationMs: args.now() - args.startMs,
     costUsd,
+    tokenUsage,
   }
   void emitTrace(args.options.ctx.traceEmitter, {
     kind: 'loop.ended',
diff --git a/src/loops/types.ts b/src/loops/types.ts
index a28eae0..a183cd9 100644
--- a/src/loops/types.ts
+++ b/src/loops/types.ts
@@ -90,6 +90,15 @@ export interface OutputAdapter<Output> {
   parse(events: SandboxEvent[]): Output
 }
 
+/** LLM token usage. Structurally matches agent-eval's `RunTokenUsage` /
+ *  `CampaignTokenUsage` ({ input, output }) so a loop result maps straight
+ *  onto `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch — without
+ *  which the backend-integrity guard reads the run as a stub. */
+export interface LoopTokenUsage {
+  input: number
+  output: number
+}
+
 /** @experimental */
 export interface Iteration<Task, Output> {
   /** 0-based iteration index assigned by the kernel. */
@@ -105,6 +114,8 @@ export interface Iteration<Task, Output> {
   startedAt: number
   endedAt: number
   costUsd: number
+  /** Summed LLM token usage across every `llm_call` event in this iteration. */
+  tokenUsage: LoopTokenUsage
 }
 
 /** @experimental */
@@ -144,6 +155,10 @@ export interface LoopResult<Task, Output, Decision> {
   durationMs: number
   /** Sum of every iteration's `costUsd`. */
   costUsd: number
+  /** Sum of every iteration's token usage. Forward to
+   *  `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch so the
+   *  integrity guard sees real LLM activity. */
+  tokenUsage: LoopTokenUsage
 }
 
 /**
diff --git a/tests/loops/refine.test.ts b/tests/loops/refine.test.ts
index 50121de..a8b4915 100644
--- a/tests/loops/refine.test.ts
+++ b/tests/loops/refine.test.ts
@@ -11,6 +11,7 @@ import {
   type LoopTraceEvent,
   type OutputAdapter,
   refineWinnerIndex,
+  reportLoopUsage,
   runLoop,
   type Validator,
 } from '../../src/loops'
@@ -242,6 +243,27 @@ describe('runLoop + createRefineDriver', () => {
     expect(result.iterations[0]?.costUsd).toBeCloseTo(0.01, 9)
     expect(result.iterations[1]?.costUsd).toBeCloseTo(0.02, 9)
     expect(result.costUsd).toBeCloseTo(0.03, 9)
+    // Token usage must aggregate too — a runProfileMatrix dispatch forwards
+    // this to the backend-integrity guard; if it stayed 0/0 a real run would
+    // be misread as a stub.
+    expect(result.iterations[0]?.tokenUsage).toEqual({ input: 100, output: 50 })
+    expect(result.iterations[1]?.tokenUsage).toEqual({ input: 80, output: 30 })
+    expect(result.tokenUsage).toEqual({ input: 180, output: 80 })
+
+    // reportLoopUsage forwards both cost AND tokens into a campaign cost meter.
+    const observed: Array<{ usd: number; src: string }> = []
+    let tokens = { input: 0, output: 0 }
+    reportLoopUsage(
+      {
+        observe: (usd, src) => observed.push({ usd, src }),
+        observeTokens: (u) => {
+          tokens = u
+        },
+      },
+      result,
+    )
+    expect(observed).toEqual([{ usd: 0.03, src: 'loop' }])
+    expect(tokens).toEqual({ input: 180, output: 80 })
   })
 
   it('refineWinnerIndex returns the last valid iteration', () => {

From 9cbd6862d806a53c113ae065b1c71673f8d222d8 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 08:52:12 -0600
Subject: [PATCH 2/5] =?UTF-8?q?chore(deps):=20bump=20@tangle-network/agent?=
 =?UTF-8?q?-eval=20^0.54.0=20=E2=86=92=20^0.61.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consumes the published runProfileMatrix + token-capture release. 7-minor
jump verified: typecheck + build + full suite (381) green.
---
 package.json   |  2 +-
 pnpm-lock.yaml | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/package.json b/package.json
index a6c2aca..4565a5a 100644
--- a/package.json
+++ b/package.json
@@ -76,7 +76,7 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
-    "@tangle-network/agent-eval": "^0.54.0"
+    "@tangle-network/agent-eval": "^0.61.0"
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 7e4087e..ee94426 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -9,8 +9,8 @@ importers:
   .:
     dependencies:
       '@tangle-network/agent-eval':
-        specifier: ^0.54.0
-        version: 0.54.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
+        specifier: ^0.61.0
+        version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
       '@tangle-network/agent-knowledge':
         specifier: '>=1.3.0 <2.0.0'
         version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))
@@ -458,12 +458,12 @@ packages:
     engines: {node: '>=20'}
     hasBin: true
 
-  '@tangle-network/agent-eval@0.54.0':
-    resolution: {integrity: sha512-9dmCfXOBZHbmX//RrN/8iKUfmTB21hwjKEWD6qWFszwNK7/KoCzootKsYr6s1yt2vCoX1F54LjwE9qn1VNfUKw==}
+  '@tangle-network/agent-eval@0.61.0':
+    resolution: {integrity: sha512-yydVL47bNa2lNaapgFnDKjYRPJfpVTK8luFASCuLNyKtahibMM7bXF+JCScKhYdtEwCYiUijZI0F8VaoIvDi3g==}
     engines: {node: '>=20'}
     hasBin: true
     peerDependencies:
-      '@tangle-network/sandbox': '>=0.2.1 <0.4.0'
+      '@tangle-network/sandbox': '>=0.2.1 <0.5.0'
     peerDependenciesMeta:
       '@tangle-network/sandbox':
         optional: true
@@ -1289,7 +1289,7 @@ snapshots:
       - typescript
       - utf-8-validate
 
-  '@tangle-network/agent-eval@0.54.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)':
+  '@tangle-network/agent-eval@0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)':
     dependencies:
       '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2)
       '@ax-llm/ax': 19.0.45(zod@4.4.2)

From 01f3b2874ad05cdbd157687b696fa7b388da84aa Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 09:08:21 -0600
Subject: [PATCH 3/5] =?UTF-8?q?feat(loops):=20loopDispatch=20=E2=80=94=20f?=
 =?UTF-8?q?irst-class=20runLoop=E2=86=92campaign=20dispatch=20adapter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The seam critique found reportLoopUsage had one consumer (a test) and zero
products: wiring runLoop into runProfileMatrix/runCampaign required hand-building
ExecCtx, hand-adapting the campaign trace, and remembering to forward usage
(forgetting the last yields a {0,0} stub cell). loopDispatch collapses all three
into one typed call:

  const dispatch = loopDispatch({ sandboxClient, toLoopOptions })
  await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })

It builds the ExecCtx, forwards loop.* trace events into the campaign's scoped
trace (campaignTraceToLoopEmitter), runs runLoop, reports cost+tokens via
reportLoopUsage internally, and returns winner.output. loopCampaignDispatch is
the runCampaign (no-profile) variant. AgentProfile imported from agent-eval
(the eval-harness type ProfileDispatchFn keys on), NOT sandbox's — closes the
name-collision footgun at this call site.

Tests: returns winner artifact + reports exact usage + forwards trace spans;
usage still flows on a validator-failing run (must not read as a stub).
Full suite 383 green.
---
 src/loops/index.ts                |   6 ++
 src/loops/loop-dispatch.ts        | 132 +++++++++++++++++++++++
 tests/loops/loop-dispatch.test.ts | 168 ++++++++++++++++++++++++++++++
 3 files changed, 306 insertions(+)
 create mode 100644 src/loops/loop-dispatch.ts
 create mode 100644 tests/loops/loop-dispatch.test.ts

diff --git a/src/loops/index.ts b/src/loops/index.ts
index 7d74986..d854266 100644
--- a/src/loops/index.ts
+++ b/src/loops/index.ts
@@ -28,6 +28,12 @@ export { createRefineDriver, refineWinnerIndex } from './drivers/refine'
 export type { RunLoopOptions } from './run-loop'
 export { runLoop } from './run-loop'
 export { reportLoopUsage, type UsageSink } from './report-usage'
+export {
+  loopCampaignDispatch,
+  loopDispatch,
+  type LoopDispatchOptions,
+  type LoopOptionsForDispatch,
+} from './loop-dispatch'
 export type {
   AgentRunSpec,
   DefaultVerdict,
diff --git a/src/loops/loop-dispatch.ts b/src/loops/loop-dispatch.ts
new file mode 100644
index 0000000..e4c9a77
--- /dev/null
+++ b/src/loops/loop-dispatch.ts
@@ -0,0 +1,132 @@
+/**
+ * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
+ *
+ * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
+ * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
+ * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
+ * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
+ * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
+ * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
+ * the third silent. The fleet's products skipped (c) and fell back to a
+ * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
+ * to kill.
+ *
+ * `loopDispatch` collapses all three into one typed call:
+ *
+ *   const dispatch = loopDispatch({
+ *     sandboxClient,
+ *     toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
+ *   })
+ *   await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
+ *
+ * Usage is reported automatically; trace events are forwarded automatically;
+ * the ctx is built automatically. The seam becomes impossible to mis-wire.
+ *
+ * Typed against the campaign `DispatchContext` (imported type-only from
+ * `@tangle-network/agent-eval/campaign`, so nothing is pulled in at runtime)
+ * — a downward dependency, never an inversion.
+ */
+
+// agent-eval's AgentProfile (the eval-harness unit of variation, `model: string`)
+// — NOT sandbox's AgentProfile. ProfileDispatchFn is keyed on the former.
+import type { AgentProfile } from '@tangle-network/agent-eval'
+import type {
+  CampaignTraceWriter,
+  DispatchContext,
+  DispatchFn,
+  ProfileDispatchFn,
+  Scenario,
+} from '@tangle-network/agent-eval/campaign'
+import { reportLoopUsage } from './report-usage'
+import { type RunLoopOptions, runLoop } from './run-loop'
+import type { LoopResult, LoopSandboxClient, LoopTraceEmitter } from './types'
+
+/** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
+export type LoopOptionsForDispatch<Task, Output, Decision> = Omit<
+  RunLoopOptions<Task, Output, Decision>,
+  'ctx'
+>
+
+export interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
+  /** Sandbox client used for every cell's `runLoop`. Supplied once. */
+  sandboxClient: LoopSandboxClient
+  /** Build the per-cell runLoop options from the scenario (+ profile, when
+   *  used with `runProfileMatrix`). */
+  toLoopOptions: (
+    scenario: TScenario,
+    profile: AgentProfile,
+  ) => LoopOptionsForDispatch<Task, Output, Decision>
+  /** Map the finished loop to the artifact the judges score. Default:
+   *  `result.winner?.output`. A loop with no winner yields `undefined` (judges
+   *  skip the cell) — but the loop's token usage is STILL reported, so the
+   *  integrity guard sees real activity. */
+  toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact
+  /** Forward `loop.*` trace events into the campaign's scoped trace so loop
+   *  spans correlate with the cell. Default true. */
+  forwardTrace?: boolean
+  /** Cost-meter source label for the loop's spend. Default `'loop'`. */
+  costSource?: string
+}
+
+/** Bridge a campaign `DispatchContext.trace` to a `LoopTraceEmitter` so every
+ *  `loop.*` event lands as a span under the cell's scoped trace. */
+function campaignTraceToLoopEmitter(trace: CampaignTraceWriter): LoopTraceEmitter {
+  return {
+    emit(event) {
+      trace
+        .span(event.kind, { runId: event.runId, timestamp: event.timestamp, ...event.payload })
+        .end()
+    },
+  }
+}
+
+async function runLoopForCell<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
+  opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>,
+  scenario: TScenario,
+  profile: AgentProfile,
+  ctx: DispatchContext,
+): Promise<TArtifact> {
+  const loopOptions = opts.toLoopOptions(scenario, profile)
+  const result = await runLoop<Task, Output, Decision>({
+    ...loopOptions,
+    ctx: {
+      sandboxClient: opts.sandboxClient,
+      signal: ctx.signal,
+      traceEmitter:
+        opts.forwardTrace === false ? undefined : campaignTraceToLoopEmitter(ctx.trace),
+    },
+  })
+  reportLoopUsage(ctx.cost, result, opts.costSource ?? 'loop')
+  const toArtifact =
+    opts.toArtifact ?? ((r: LoopResult<Task, Output, Decision>) => r.winner?.output as TArtifact)
+  return toArtifact(result)
+}
+
+/**
+ * Adapter for `runProfileMatrix` (profile is an axis). Returns a
+ * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and
+ * reports usage automatically.
+ */
+export function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
+  opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>,
+): ProfileDispatchFn<TScenario, TArtifact> {
+  return (profile, scenario, ctx) => runLoopForCell(opts, scenario, profile, ctx)
+}
+
+/**
+ * Adapter for `runCampaign` (no profile axis). `toLoopOptions` receives only
+ * the scenario and is still invoked once per cell; the shared core is handed a
+ * fixed sentinel `profile` that is never forwarded to `toLoopOptions`.
+ */
+export function loopCampaignDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
+  opts: Omit<LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>, 'toLoopOptions'> & {
+    toLoopOptions: (scenario: TScenario) => LoopOptionsForDispatch<Task, Output, Decision>
+  },
+): DispatchFn<TScenario, TArtifact> {
+  const profileSentinel = { id: 'loop-campaign', model: 'n/a@loop-campaign' } as AgentProfile
+  const profiled: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact> = {
+    ...opts,
+    toLoopOptions: (scenario) => opts.toLoopOptions(scenario),
+  }
+  return (scenario, ctx) => runLoopForCell(profiled, scenario, profileSentinel, ctx)
+}
diff --git a/tests/loops/loop-dispatch.test.ts b/tests/loops/loop-dispatch.test.ts
new file mode 100644
index 0000000..517a19c
--- /dev/null
+++ b/tests/loops/loop-dispatch.test.ts
@@ -0,0 +1,168 @@
+import type {
+  AgentProfile as SandboxAgentProfile,
+  CreateSandboxOptions,
+  SandboxEvent,
+  SandboxInstance,
+} from '@tangle-network/sandbox'
+import type { DispatchContext } from '@tangle-network/agent-eval/campaign'
+import { describe, expect, it } from 'vitest'
+import {
+  type AgentRunSpec,
+  createRefineDriver,
+  loopDispatch,
+  type OutputAdapter,
+  type Validator,
+} from '../../src/loops'
+
+interface Task {
+  goal: string
+}
+interface Output {
+  attempt: number
+}
+interface FakeScenario {
+  id: string
+  kind: string
+}
+
+const sandboxProfile: SandboxAgentProfile = { name: 'stub' }
+
+function spec(): AgentRunSpec<Task> {
+  return { profile: sandboxProfile, name: 'agent', taskToPrompt: (t) => t.goal }
+}
+
+const output: OutputAdapter<Output> = {
+  parse: (events) => {
+    const data = events.at(-1)?.data as { attempt?: number } | undefined
+    return { attempt: typeof data?.attempt === 'number' ? data.attempt : -1 }
+  },
+}
+
+const passAlways: Validator<Output> = {
+  async validate(out) {
+    return { valid: true, score: 1, scores: { attempt: out.attempt } }
+  },
+}
+
+function stubClient(events: SandboxEvent[]): { create(opts?: CreateSandboxOptions): Promise<SandboxInstance> } {
+  return {
+    async create() {
+      return {
+        async *streamPrompt() {
+          for (const e of events) yield e
+        },
+      } as unknown as SandboxInstance
+    },
+  }
+}
+
+/** Minimal campaign DispatchContext that records what the dispatch reports. */
+function fakeDispatchContext(): {
+  ctx: DispatchContext
+  observed: Array<{ usd: number; src: string }>
+  tokens: { input: number; output: number }
+  spans: string[]
+} {
+  const observed: Array<{ usd: number; src: string }> = []
+  const tokens = { input: 0, output: 0 }
+  const spans: string[] = []
+  const ctx: DispatchContext = {
+    cellId: 'cell-0',
+    rep: 0,
+    seed: 1,
+    signal: new AbortController().signal,
+    trace: {
+      span(name: string) {
+        spans.push(name)
+        return { end() {}, setAttribute() {} }
+      },
+      async flush() {},
+    },
+    artifacts: {
+      async write() {
+        return 'p'
+      },
+      async writeJson() {
+        return 'p'
+      },
+    },
+    cost: {
+      observe(usd: number, src: string) {
+        observed.push({ usd, src })
+      },
+      observeTokens(u: { input: number; output: number }) {
+        tokens.input += u.input
+        tokens.output += u.output
+      },
+      current() {
+        return 0
+      },
+      tokens() {
+        return tokens
+      },
+    },
+  }
+  return { ctx, observed, tokens, spans }
+}
+
+describe('loopDispatch', () => {
+  it('bridges runLoop into a ProfileDispatchFn: returns the winner artifact, reports usage, forwards trace', async () => {
+    const sandboxClient = stubClient([
+      { type: 'llm_call', data: { tokensIn: 150, tokensOut: 60, costUsd: 0.02, model: 'm' } },
+      { type: 'result', data: { attempt: 2 } },
+    ])
+    const dispatch = loopDispatch<Task, Output, 'stop', FakeScenario, Output>({
+      sandboxClient,
+      toLoopOptions: (scenario) => ({
+        driver: createRefineDriver<Task, Output>(),
+        agentRun: spec(),
+        output,
+        validator: passAlways,
+        task: { goal: scenario.id },
+        maxIterations: 1,
+      }),
+    })
+
+    const fake = fakeDispatchContext()
+    const profile = { id: 'baseline', model: 'test-model@2025-01-01' }
+    const artifact = await dispatch(profile, { id: 's1', kind: 'task' }, fake.ctx)
+
+    // Returns the loop's winner output.
+    expect(artifact).toEqual({ attempt: 2 })
+    // Usage reported to the campaign cost meter — the integrity guard's input.
+    expect(fake.observed).toEqual([{ usd: 0.02, src: 'loop' }])
+    expect(fake.tokens).toEqual({ input: 150, output: 60 })
+    // Loop trace events forwarded into the campaign trace as spans.
+    expect(fake.spans).toContain('loop.started')
+    expect(fake.spans).toContain('loop.ended')
+  })
+
+  it('reports usage even when the run fails the validator (real activity must NOT read as a stub)', async () => {
+    const failAlways: Validator<Output> = {
+      async validate() {
+        return { valid: false, score: 0, scores: {}, notes: 'no' }
+      },
+    }
+    const sandboxClient = stubClient([
+      { type: 'llm_call', data: { tokensIn: 90, tokensOut: 20, costUsd: 0.01, model: 'm' } },
+      { type: 'result', data: { attempt: 1 } },
+    ])
+    const dispatch = loopDispatch<Task, Output, 'stop', FakeScenario, Output>({
+      sandboxClient,
+      toLoopOptions: (scenario) => ({
+        driver: createRefineDriver<Task, Output>(),
+        agentRun: spec(),
+        output,
+        validator: failAlways,
+        task: { goal: scenario.id },
+        maxIterations: 1,
+      }),
+    })
+    const fake = fakeDispatchContext()
+    await dispatch({ id: 'p', model: 'm@2025-01-01' }, { id: 's1', kind: 'task' }, fake.ctx)
+    // The validator failed, but real LLM activity happened — tokens + cost MUST
+    // still reach the cost meter, or the integrity guard would call it a stub.
+    expect(fake.tokens).toEqual({ input: 90, output: 20 })
+    expect(fake.observed).toEqual([{ usd: 0.01, src: 'loop' }])
+  })
+})

From fad618ee7e7eba2fa0fb1c76c554eda658faa60c Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 09:25:05 -0600
Subject: [PATCH 4/5] chore(deps): declare agent-eval as a required
 peerDependency, not a hard dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Version-discipline fix (boundary critique, VERSIONING 3/10). agent-eval was the
lone hard dependency while sandbox + agent-knowledge are already peers. A hard
dep lets pnpm install a SECOND, divergent agent-eval tree with an incompatible
RunRecord/DefaultVerdict; today only pnpm.overrides prevents it. As a peer
(>=0.61.0 <1.0.0, required — not optional), a consumer running a stale or
divergent substrate gets a loud unmet-peer warning instead of a silent split
tree. agent-eval moves to devDependencies for agent-runtime's own build/test.
Typecheck + full suite (383) green with the peer layout.
---
 package.json   | 6 +++---
 pnpm-lock.yaml | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/package.json b/package.json
index 4565a5a..7835731 100644
--- a/package.json
+++ b/package.json
@@ -75,11 +75,10 @@
     "lint:fix": "biome check --write src tests examples",
     "typecheck": "tsc --noEmit"
   },
-  "dependencies": {
-    "@tangle-network/agent-eval": "^0.61.0"
-  },
+  "dependencies": {},
   "devDependencies": {
     "@biomejs/biome": "^2.4.0",
+    "@tangle-network/agent-eval": "^0.61.0",
     "@tangle-network/sandbox": "^0.4.0",
     "@types/node": "^25.6.0",
     "tsup": "^8.0.0",
@@ -101,6 +100,7 @@
   "license": "MIT",
   "packageManager": "pnpm@10.28.0",
   "peerDependencies": {
+    "@tangle-network/agent-eval": ">=0.61.0 <1.0.0",
     "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0",
     "@tangle-network/sandbox": ">=0.1.2 <0.5.0"
   },
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index ee94426..45dc755 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -8,9 +8,6 @@ importers:
 
   .:
     dependencies:
-      '@tangle-network/agent-eval':
-        specifier: ^0.61.0
-        version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
       '@tangle-network/agent-knowledge':
         specifier: '>=1.3.0 <2.0.0'
         version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))
@@ -18,6 +15,9 @@ importers:
       '@biomejs/biome':
         specifier: ^2.4.0
         version: 2.4.15
+      '@tangle-network/agent-eval':
+        specifier: ^0.61.0
+        version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
       '@tangle-network/sandbox':
         specifier: ^0.4.0
         version: 0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))

From ffc89ce123c22f797409b36cb7ac4d36817409e1 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 09:27:33 -0600
Subject: [PATCH 5/5] =?UTF-8?q?chore(release):=200.32.0=20=E2=80=94=20loop?=
 =?UTF-8?q?Dispatch=20adapter=20+=20tokenUsage=20seam=20+=20agent-eval=20p?=
 =?UTF-8?q?eer-dep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index 7835731..a2f128b 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.31.0",
+  "version": "0.32.0",
   "description": "Reusable runtime lifecycle for domain-specific agents.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {