From 570326b5ebe6bf065369c2ad405492196b48322b Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 08:47:22 -0600
Subject: [PATCH 1/7] feat(loops): surface aggregated tokenUsage on LoopResult
 + reportLoopUsage bridge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

runLoop tracked per-call tokensIn/tokensOut (extractLlmCallEvent) but only
aggregated costUsd — token counts were dropped before reaching Iteration or
LoopResult. A runProfileMatrix/runCampaign dispatch wrapping runLoop could
report cost but had no tokens to report, so agent-eval's backend-integrity
guard (assertRealBackend, which keys on tokenUsage) would misread a real run
as a stub and throw.

- Iteration + LoopResult gain tokenUsage: { input, output }, summed across
  every llm_call event (per iteration) and across iterations (LoopResult).
- reportLoopUsage(cost, result) forwards a finished loop's cost + tokens into
  a campaign cost meter in one call — the trivial consumption path for the new
  runProfileMatrix primitive. Typed structurally so loops stay free of an
  agent-eval import.

Extends the existing cost-aggregation test to assert token aggregation +
reportLoopUsage forwarding. Full suite 381 green.
---
 src/loops/index.ts         |  2 ++
 src/loops/report-usage.ts  | 41 ++++++++++++++++++++++++++++++++++++++
 src/loops/run-loop.ts      | 12 +++++++++++
 src/loops/types.ts         | 15 ++++++++++++++
 tests/loops/refine.test.ts | 22 ++++++++++++++++++++
 5 files changed, 92 insertions(+)
 create mode 100644 src/loops/report-usage.ts

diff --git a/src/loops/index.ts b/src/loops/index.ts
index ae9fad3..7d74986 100644
--- a/src/loops/index.ts
+++ b/src/loops/index.ts
@@ -27,6 +27,7 @@ export type { CreateRefineDriverOptions, RefineDecision } from './drivers/refine
 export { createRefineDriver, refineWinnerIndex } from './drivers/refine'
 export type { RunLoopOptions } from './run-loop'
 export { runLoop } from './run-loop'
+export { reportLoopUsage, type UsageSink } from './report-usage'
 export type {
   AgentRunSpec,
   DefaultVerdict,
@@ -42,6 +43,7 @@ export type {
   LoopSandboxClient,
   LoopSandboxPlacement,
   LoopStartedPayload,
+  LoopTokenUsage,
   LoopTraceEmitter,
   LoopTraceEvent,
   LoopWinner,
diff --git a/src/loops/report-usage.ts b/src/loops/report-usage.ts
new file mode 100644
index 0000000..30c9a6d
--- /dev/null
+++ b/src/loops/report-usage.ts
@@ -0,0 +1,41 @@
+/**
+ * Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix
+ * dispatch.
+ *
+ * `runProfileMatrix` (and `runCampaign`) run the backend-integrity guard over
+ * the token usage a dispatch reports through `ctx.cost`. A dispatch that wraps
+ * `runLoop` must forward the loop's cost AND token usage, or the guard reads
+ * the run as a stub and throws. `reportLoopUsage` is that one line:
+ *
+ *   const dispatch: ProfileDispatchFn<S, A> = async (profile, scenario, ctx) => {
+ *     const result = await runLoop({ ...optsFor(profile, scenario), ctx: loopCtx })
+ *     reportLoopUsage(ctx.cost, result)
+ *     return result.winner?.output as A
+ *   }
+ *
+ * Typed structurally against the campaign `DispatchContext.cost` so this module
+ * stays free of an agent-eval import — it works with any cost meter exposing
+ * `observe` + `observeTokens`.
+ */
+
+import type { LoopResult } from './types'
+
+/** The slice of an agent-eval campaign `DispatchContext.cost` this needs. */
+export interface UsageSink {
+  observe(amountUsd: number, source: string): void
+  observeTokens(usage: { input: number; output: number }): void
+}
+
+/**
+ * Push a finished loop's aggregated `costUsd` (tagged with `source`, default
+ * `'loop'`) and a copy of its `{ input, output }` token totals into `cost`.
+ */
+export function reportLoopUsage<Task, Output, Decision>(
+  cost: UsageSink,
+  result: Pick<LoopResult<Task, Output, Decision>, 'costUsd' | 'tokenUsage'>,
+  source = 'loop',
+): void {
+  cost.observe(result.costUsd, source)
+  const { input, output } = result.tokenUsage
+  cost.observeTokens({ input, output })
+}
diff --git a/src/loops/run-loop.ts b/src/loops/run-loop.ts
index fafc52a..c7c8a77 100644
--- a/src/loops/run-loop.ts
+++ b/src/loops/run-loop.ts
@@ -149,6 +149,7 @@ export async function runLoop<Task, Output, Decision>(
           startedAt: now(),
           endedAt: 0,
           costUsd: 0,
+          tokenUsage: { input: 0, output: 0 },
         })
       }
 
@@ -288,6 +289,8 @@ async function executeIteration<Task, Output>(args: ExecuteIterationArgs<Task, O
       const llmCall = extractLlmCallEvent(event, slot.agentRunName)
       if (llmCall) {
         slot.costUsd += llmCall.costUsd ?? 0
+        slot.tokenUsage.input += llmCall.tokensIn ?? 0
+        slot.tokenUsage.output += llmCall.tokensOut ?? 0
         args.ctx.runHandle?.observe(llmCall)
       }
     }
@@ -405,12 +408,21 @@ function finalize<Task, Output, Decision>(
 ): LoopResult<Task, Output, Decision> {
   const winner = (args.options.selectWinner ?? defaultSelectWinner)(args.iterations)
   const costUsd = args.iterations.reduce((sum, iter) => sum + (iter.costUsd || 0), 0)
+  const tokenUsage = args.iterations.reduce(
+    (acc, iter) => {
+      acc.input += iter.tokenUsage?.input ?? 0
+      acc.output += iter.tokenUsage?.output ?? 0
+      return acc
+    },
+    { input: 0, output: 0 },
+  )
   const result: LoopResult<Task, Output, Decision> = {
     decision: args.decision,
     iterations: args.iterations,
     winner,
     durationMs: args.now() - args.startMs,
     costUsd,
+    tokenUsage,
   }
   void emitTrace(args.options.ctx.traceEmitter, {
     kind: 'loop.ended',
diff --git a/src/loops/types.ts b/src/loops/types.ts
index a28eae0..a183cd9 100644
--- a/src/loops/types.ts
+++ b/src/loops/types.ts
@@ -90,6 +90,15 @@ export interface OutputAdapter<Output> {
   parse(events: SandboxEvent[]): Output
 }
 
+/** LLM token usage. Structurally matches agent-eval's `RunTokenUsage` /
+ *  `CampaignTokenUsage` ({ input, output }) so a loop result maps straight
+ *  onto `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch — without
+ *  which the backend-integrity guard reads the run as a stub. */
+export interface LoopTokenUsage {
+  input: number
+  output: number
+}
+
 /** @experimental */
 export interface Iteration<Task, Output> {
   /** 0-based iteration index assigned by the kernel. */
@@ -105,6 +114,8 @@ export interface Iteration<Task, Output> {
   startedAt: number
   endedAt: number
   costUsd: number
+  /** Summed LLM token usage across every `llm_call` event in this iteration. */
+  tokenUsage: LoopTokenUsage
 }
 
 /** @experimental */
@@ -144,6 +155,10 @@ export interface LoopResult<Task, Output, Decision> {
   durationMs: number
   /** Sum of every iteration's `costUsd`. */
   costUsd: number
+  /** Sum of every iteration's token usage. Forward to
+   *  `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch so the
+   *  integrity guard sees real LLM activity. */
+  tokenUsage: LoopTokenUsage
 }
 
 /**
diff --git a/tests/loops/refine.test.ts b/tests/loops/refine.test.ts
index 50121de..a8b4915 100644
--- a/tests/loops/refine.test.ts
+++ b/tests/loops/refine.test.ts
@@ -11,6 +11,7 @@ import {
   type LoopTraceEvent,
   type OutputAdapter,
   refineWinnerIndex,
+  reportLoopUsage,
   runLoop,
   type Validator,
 } from '../../src/loops'
@@ -242,6 +243,27 @@ describe('runLoop + createRefineDriver', () => {
     expect(result.iterations[0]?.costUsd).toBeCloseTo(0.01, 9)
     expect(result.iterations[1]?.costUsd).toBeCloseTo(0.02, 9)
     expect(result.costUsd).toBeCloseTo(0.03, 9)
+    // Token usage must aggregate too — a runProfileMatrix dispatch forwards
+    // this to the backend-integrity guard; if it stayed 0/0 a real run would
+    // be misread as a stub.
+    expect(result.iterations[0]?.tokenUsage).toEqual({ input: 100, output: 50 })
+    expect(result.iterations[1]?.tokenUsage).toEqual({ input: 80, output: 30 })
+    expect(result.tokenUsage).toEqual({ input: 180, output: 80 })
+
+    // reportLoopUsage forwards both cost AND tokens into a campaign cost meter.
+    const observed: Array<{ usd: number; src: string }> = []
+    let tokens = { input: 0, output: 0 }
+    reportLoopUsage(
+      {
+        observe: (usd, src) => observed.push({ usd, src }),
+        observeTokens: (u) => {
+          tokens = u
+        },
+      },
+      result,
+    )
+    expect(observed).toEqual([{ usd: 0.03, src: 'loop' }])
+    expect(tokens).toEqual({ input: 180, output: 80 })
   })
 
   it('refineWinnerIndex returns the last valid iteration', () => {

From 9cbd6862d806a53c113ae065b1c71673f8d222d8 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 08:52:12 -0600
Subject: [PATCH 2/7] =?UTF-8?q?chore(deps):=20bump=20@tangle-network/agent?=
 =?UTF-8?q?-eval=20^0.54.0=20=E2=86=92=20^0.61.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consumes the published runProfileMatrix + token-capture release. 7-minor
jump verified: typecheck + build + full suite (381) green.
---
 package.json   |  2 +-
 pnpm-lock.yaml | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/package.json b/package.json
index a6c2aca..4565a5a 100644
--- a/package.json
+++ b/package.json
@@ -76,7 +76,7 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
-    "@tangle-network/agent-eval": "^0.54.0"
+    "@tangle-network/agent-eval": "^0.61.0"
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 7e4087e..ee94426 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -9,8 +9,8 @@ importers:
   .:
     dependencies:
       '@tangle-network/agent-eval':
-        specifier: ^0.54.0
-        version: 0.54.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
+        specifier: ^0.61.0
+        version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
       '@tangle-network/agent-knowledge':
         specifier: '>=1.3.0 <2.0.0'
         version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))
@@ -458,12 +458,12 @@ packages:
     engines: {node: '>=20'}
     hasBin: true
 
-  '@tangle-network/agent-eval@0.54.0':
-    resolution: {integrity: sha512-9dmCfXOBZHbmX//RrN/8iKUfmTB21hwjKEWD6qWFszwNK7/KoCzootKsYr6s1yt2vCoX1F54LjwE9qn1VNfUKw==}
+  '@tangle-network/agent-eval@0.61.0':
+    resolution: {integrity: sha512-yydVL47bNa2lNaapgFnDKjYRPJfpVTK8luFASCuLNyKtahibMM7bXF+JCScKhYdtEwCYiUijZI0F8VaoIvDi3g==}
     engines: {node: '>=20'}
     hasBin: true
     peerDependencies:
-      '@tangle-network/sandbox': '>=0.2.1 <0.4.0'
+      '@tangle-network/sandbox': '>=0.2.1 <0.5.0'
     peerDependenciesMeta:
       '@tangle-network/sandbox':
         optional: true
@@ -1289,7 +1289,7 @@ snapshots:
       - typescript
       - utf-8-validate
 
-  '@tangle-network/agent-eval@0.54.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)':
+  '@tangle-network/agent-eval@0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)':
     dependencies:
       '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2)
       '@ax-llm/ax': 19.0.45(zod@4.4.2)

From 01f3b2874ad05cdbd157687b696fa7b388da84aa Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 09:08:21 -0600
Subject: [PATCH 3/7] =?UTF-8?q?feat(loops):=20loopDispatch=20=E2=80=94=20f?=
 =?UTF-8?q?irst-class=20runLoop=E2=86=92campaign=20dispatch=20adapter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The seam critique found reportLoopUsage had one consumer (a test) and zero
products: wiring runLoop into runProfileMatrix/runCampaign required hand-building
ExecCtx, hand-adapting the campaign trace, and remembering to forward usage
(forgetting the last yields a {0,0} stub cell). loopDispatch collapses all three
into one typed call:

  const dispatch = loopDispatch({ sandboxClient, toLoopOptions })
  await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })

It builds the ExecCtx, forwards loop.* trace events into the campaign's scoped
trace (campaignTraceToLoopEmitter), runs runLoop, reports cost+tokens via
reportLoopUsage internally, and returns winner.output. loopCampaignDispatch is
the runCampaign (no-profile) variant. AgentProfile imported from agent-eval
(the eval-harness type ProfileDispatchFn keys on), NOT sandbox's — closes the
name-collision footgun at this call site.

Tests: returns winner artifact + reports exact usage + forwards trace spans;
usage still flows on a validator-failing run (must not read as a stub).
Full suite 383 green.
---
 src/loops/index.ts                |   6 ++
 src/loops/loop-dispatch.ts        | 132 +++++++++++++++++++++++
 tests/loops/loop-dispatch.test.ts | 168 ++++++++++++++++++++++++++++++
 3 files changed, 306 insertions(+)
 create mode 100644 src/loops/loop-dispatch.ts
 create mode 100644 tests/loops/loop-dispatch.test.ts

diff --git a/src/loops/index.ts b/src/loops/index.ts
index 7d74986..d854266 100644
--- a/src/loops/index.ts
+++ b/src/loops/index.ts
@@ -28,6 +28,12 @@ export { createRefineDriver, refineWinnerIndex } from './drivers/refine'
 export type { RunLoopOptions } from './run-loop'
 export { runLoop } from './run-loop'
 export { reportLoopUsage, type UsageSink } from './report-usage'
+export {
+  loopCampaignDispatch,
+  loopDispatch,
+  type LoopDispatchOptions,
+  type LoopOptionsForDispatch,
+} from './loop-dispatch'
 export type {
   AgentRunSpec,
   DefaultVerdict,
diff --git a/src/loops/loop-dispatch.ts b/src/loops/loop-dispatch.ts
new file mode 100644
index 0000000..e4c9a77
--- /dev/null
+++ b/src/loops/loop-dispatch.ts
@@ -0,0 +1,132 @@
+/**
+ * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
+ *
+ * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
+ * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
+ * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
+ * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
+ * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
+ * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
+ * the third silent. The fleet's products skipped (c) and fell back to a
+ * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
+ * to kill.
+ *
+ * `loopDispatch` collapses all three into one typed call:
+ *
+ *   const dispatch = loopDispatch({
+ *     sandboxClient,
+ *     toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
+ *   })
+ *   await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
+ *
+ * Usage is reported automatically; trace events are forwarded automatically;
+ * the ctx is built automatically. The seam becomes impossible to mis-wire.
+ *
+ * Typed against the campaign `DispatchContext` / `ProfileDispatchFn` types,
+ * imported type-only from `@tangle-network/agent-eval/campaign` — a downward
+ * dependency, never an inversion, and erased from the emitted JavaScript.
+ */
+
+// agent-eval's AgentProfile (the eval-harness unit of variation, `model: string`)
+// — NOT sandbox's AgentProfile. ProfileDispatchFn is keyed on the former.
+import type { AgentProfile } from '@tangle-network/agent-eval'
+import type {
+  CampaignTraceWriter,
+  DispatchContext,
+  DispatchFn,
+  ProfileDispatchFn,
+  Scenario,
+} from '@tangle-network/agent-eval/campaign'
+import { reportLoopUsage } from './report-usage'
+import { type RunLoopOptions, runLoop } from './run-loop'
+import type { LoopResult, LoopSandboxClient, LoopTraceEmitter } from './types'
+
+/** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
+export type LoopOptionsForDispatch<Task, Output, Decision> = Omit<
+  RunLoopOptions<Task, Output, Decision>,
+  'ctx'
+>
+
+export interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
+  /** Sandbox client used for every cell's `runLoop`. Supplied once. */
+  sandboxClient: LoopSandboxClient
+  /** Build the per-cell runLoop options from the scenario (+ profile, when
+   *  used with `runProfileMatrix`). */
+  toLoopOptions: (
+    scenario: TScenario,
+    profile: AgentProfile,
+  ) => LoopOptionsForDispatch<Task, Output, Decision>
+  /** Map the finished loop to the artifact the judges score. Default:
+   *  `result.winner?.output`. A loop with no winner yields `undefined` (judges
+   *  skip the cell) — but the loop's token usage is STILL reported, so the
+   *  integrity guard sees real activity. */
+  toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact
+  /** Forward `loop.*` trace events into the campaign's scoped trace so loop
+   *  spans correlate with the cell. Default true. */
+  forwardTrace?: boolean
+  /** Cost-meter source label for the loop's spend. Default `'loop'`. */
+  costSource?: string
+}
+
+/** Bridge a campaign `DispatchContext.trace` to a `LoopTraceEmitter`: each emitted
+ *  event opens and immediately ends a span named after `event.kind`. */
+function campaignTraceToLoopEmitter(trace: CampaignTraceWriter): LoopTraceEmitter {
+  return {
+    emit(event) {
+      // Envelope fields go last so a payload key named `runId`/`timestamp` cannot shadow them.
+      const attributes = { ...event.payload, runId: event.runId, timestamp: event.timestamp }
+      trace.span(event.kind, attributes).end()
+    },
+  }
+}
+
+async function runLoopForCell<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
+  opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>,
+  scenario: TScenario,
+  profile: AgentProfile,
+  ctx: DispatchContext,
+): Promise<TArtifact> {
+  const loopOptions = opts.toLoopOptions(scenario, profile)
+  // `forwardTrace` is opt-out: only an explicit `false` disables the trace bridge.
+  const traceEmitter =
+    opts.forwardTrace === false ? undefined : campaignTraceToLoopEmitter(ctx.trace)
+  const result = await runLoop<Task, Output, Decision>({
+    ...loopOptions,
+    ctx: { sandboxClient: opts.sandboxClient, signal: ctx.signal, traceEmitter },
+  })
+  // Report before mapping the artifact, so usage is recorded even when no winner exists.
+  reportLoopUsage(ctx.cost, result, opts.costSource ?? 'loop')
+  const mapArtifact = opts.toArtifact
+  // Default artifact is the winner's output (`undefined` when the loop produced no winner).
+  if (!mapArtifact) return result.winner?.output as TArtifact
+  return mapArtifact(result)
+}
+
+/**
+ * Adapter for `runProfileMatrix` (profile is an axis). The returned
+ * `ProfileDispatchFn` receives `(profile, scenario, ctx)` and delegates each cell
+ * to `runLoopForCell` (scenario first, then profile), which also reports usage.
+ */
+export function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
+  opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>,
+): ProfileDispatchFn<TScenario, TArtifact> {
+  return (profile, scenario, ctx) => runLoopForCell(opts, scenario, profile, ctx)
+}
+
+/**
+ * Adapter for `runCampaign` (no profile axis). `toLoopOptions` receives only
+ * the scenario and is still invoked once per cell; the fixed `profileSentinel`
+ * only fills the shared core's `profile` parameter and is never forwarded.
+ */
+export function loopCampaignDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
+  opts: Omit<LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>, 'toLoopOptions'> & {
+    toLoopOptions: (scenario: TScenario) => LoopOptionsForDispatch<Task, Output, Decision>
+  },
+): DispatchFn<TScenario, TArtifact> {
+  const profileSentinel = { id: 'loop-campaign', model: 'n/a@loop-campaign' } as AgentProfile
+  const profiled: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact> = {
+    ...opts,
+    toLoopOptions: (scenario) => opts.toLoopOptions(scenario),
+  }
+  return (scenario, ctx) => runLoopForCell(profiled, scenario, profileSentinel, ctx)
+}
diff --git a/tests/loops/loop-dispatch.test.ts b/tests/loops/loop-dispatch.test.ts
new file mode 100644
index 0000000..517a19c
--- /dev/null
+++ b/tests/loops/loop-dispatch.test.ts
@@ -0,0 +1,168 @@
+import type {
+  AgentProfile as SandboxAgentProfile,
+  CreateSandboxOptions,
+  SandboxEvent,
+  SandboxInstance,
+} from '@tangle-network/sandbox'
+import type { DispatchContext } from '@tangle-network/agent-eval/campaign'
+import { describe, expect, it } from 'vitest'
+import {
+  type AgentRunSpec,
+  createRefineDriver,
+  loopDispatch,
+  type OutputAdapter,
+  type Validator,
+} from '../../src/loops'
+
+interface Task {
+  goal: string
+}
+interface Output {
+  attempt: number
+}
+interface FakeScenario {
+  id: string
+  kind: string
+}
+
+const sandboxProfile: SandboxAgentProfile = { name: 'stub' }
+
+function spec(): AgentRunSpec<Task> {
+  return { profile: sandboxProfile, name: 'agent', taskToPrompt: (t) => t.goal }
+}
+
+const output: OutputAdapter<Output> = {
+  parse: (events) => {
+    const data = events.at(-1)?.data as { attempt?: number } | undefined
+    return { attempt: typeof data?.attempt === 'number' ? data.attempt : -1 }
+  },
+}
+
+const passAlways: Validator<Output> = {
+  async validate(out) {
+    return { valid: true, score: 1, scores: { attempt: out.attempt } }
+  },
+}
+
+function stubClient(events: SandboxEvent[]): { create(opts?: CreateSandboxOptions): Promise<SandboxInstance> } {
+  return {
+    async create() {
+      return {
+        async *streamPrompt() {
+          for (const e of events) yield e
+        },
+      } as unknown as SandboxInstance
+    },
+  }
+}
+
+/** Minimal campaign DispatchContext that records what the dispatch reports. */
+function fakeDispatchContext(): {
+  ctx: DispatchContext
+  observed: Array<{ usd: number; src: string }>
+  tokens: { input: number; output: number }
+  spans: string[]
+} {
+  const observed: Array<{ usd: number; src: string }> = []
+  const tokens = { input: 0, output: 0 }
+  const spans: string[] = []
+  const ctx: DispatchContext = {
+    cellId: 'cell-0',
+    rep: 0,
+    seed: 1,
+    signal: new AbortController().signal,
+    trace: {
+      span(name: string) {
+        spans.push(name)
+        return { end() {}, setAttribute() {} }
+      },
+      async flush() {},
+    },
+    artifacts: {
+      async write() {
+        return 'p'
+      },
+      async writeJson() {
+        return 'p'
+      },
+    },
+    cost: {
+      observe(usd: number, src: string) {
+        observed.push({ usd, src })
+      },
+      observeTokens(u: { input: number; output: number }) {
+        tokens.input += u.input
+        tokens.output += u.output
+      },
+      current() {
+        return 0
+      },
+      tokens() {
+        return tokens
+      },
+    },
+  }
+  return { ctx, observed, tokens, spans }
+}
+
+describe('loopDispatch', () => {
+  it('bridges runLoop into a ProfileDispatchFn: returns the winner artifact, reports usage, forwards trace', async () => {
+    const sandboxClient = stubClient([
+      { type: 'llm_call', data: { tokensIn: 150, tokensOut: 60, costUsd: 0.02, model: 'm' } },
+      { type: 'result', data: { attempt: 2 } },
+    ])
+    const dispatch = loopDispatch<Task, Output, 'stop', FakeScenario, Output>({
+      sandboxClient,
+      toLoopOptions: (scenario) => ({
+        driver: createRefineDriver<Task, Output>(),
+        agentRun: spec(),
+        output,
+        validator: passAlways,
+        task: { goal: scenario.id },
+        maxIterations: 1,
+      }),
+    })
+
+    const fake = fakeDispatchContext()
+    const profile = { id: 'baseline', model: 'test-model@2025-01-01' }
+    const artifact = await dispatch(profile, { id: 's1', kind: 'task' }, fake.ctx)
+
+    // Returns the loop's winner output.
+    expect(artifact).toEqual({ attempt: 2 })
+    // Usage reported to the campaign cost meter — the integrity guard's input.
+    expect(fake.observed).toEqual([{ usd: 0.02, src: 'loop' }])
+    expect(fake.tokens).toEqual({ input: 150, output: 60 })
+    // Loop trace events forwarded into the campaign trace as spans.
+    expect(fake.spans).toContain('loop.started')
+    expect(fake.spans).toContain('loop.ended')
+  })
+
+  it('reports usage even when the run fails the validator (real activity must NOT read as a stub)', async () => {
+    const failAlways: Validator<Output> = {
+      async validate() {
+        return { valid: false, score: 0, scores: {}, notes: 'no' }
+      },
+    }
+    const sandboxClient = stubClient([
+      { type: 'llm_call', data: { tokensIn: 90, tokensOut: 20, costUsd: 0.01, model: 'm' } },
+      { type: 'result', data: { attempt: 1 } },
+    ])
+    const dispatch = loopDispatch<Task, Output, 'stop', FakeScenario, Output>({
+      sandboxClient,
+      toLoopOptions: (scenario) => ({
+        driver: createRefineDriver<Task, Output>(),
+        agentRun: spec(),
+        output,
+        validator: failAlways,
+        task: { goal: scenario.id },
+        maxIterations: 1,
+      }),
+    })
+    const fake = fakeDispatchContext()
+    await dispatch({ id: 'p', model: 'm@2025-01-01' }, { id: 's1', kind: 'task' }, fake.ctx)
+    // The validator failed, but real LLM activity happened — tokens + cost MUST
+    // still reach the cost meter, or the integrity guard would call it a stub.
+    expect(fake.tokens).toEqual({ input: 90, output: 20 })
+    expect(fake.observed).toEqual([{ usd: 0.01, src: 'loop' }])
+  })
+})

From fad618ee7e7eba2fa0fb1c76c554eda658faa60c Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 09:25:05 -0600
Subject: [PATCH 4/7] chore(deps): declare agent-eval as a required
 peerDependency, not a hard dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Version-discipline fix (boundary critique, VERSIONING 3/10). agent-eval was the
lone hard dependency while sandbox + agent-knowledge are already peers. A hard
dep lets pnpm install a SECOND, divergent agent-eval tree with an incompatible
RunRecord/DefaultVerdict; today only pnpm.overrides prevents it. As a peer
(>=0.61.0 <1.0.0, required — not optional), a consumer running a stale or
divergent substrate gets a loud unmet-peer warning instead of a silent split
tree. agent-eval moves to devDependencies for agent-runtime's own build/test.
Typecheck + full suite (383) green with the peer layout.
---
 package.json   | 6 +++---
 pnpm-lock.yaml | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/package.json b/package.json
index 4565a5a..7835731 100644
--- a/package.json
+++ b/package.json
@@ -75,11 +75,10 @@
     "lint:fix": "biome check --write src tests examples",
     "typecheck": "tsc --noEmit"
   },
-  "dependencies": {
-    "@tangle-network/agent-eval": "^0.61.0"
-  },
+  "dependencies": {},
   "devDependencies": {
     "@biomejs/biome": "^2.4.0",
+    "@tangle-network/agent-eval": "^0.61.0",
     "@tangle-network/sandbox": "^0.4.0",
     "@types/node": "^25.6.0",
     "tsup": "^8.0.0",
@@ -101,6 +100,7 @@
   "license": "MIT",
   "packageManager": "pnpm@10.28.0",
   "peerDependencies": {
+    "@tangle-network/agent-eval": ">=0.61.0 <1.0.0",
     "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0",
     "@tangle-network/sandbox": ">=0.1.2 <0.5.0"
   },
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index ee94426..45dc755 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -8,9 +8,6 @@ importers:
 
   .:
     dependencies:
-      '@tangle-network/agent-eval':
-        specifier: ^0.61.0
-        version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
       '@tangle-network/agent-knowledge':
         specifier: '>=1.3.0 <2.0.0'
         version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))
@@ -18,6 +15,9 @@ importers:
       '@biomejs/biome':
         specifier: ^2.4.0
         version: 2.4.15
+      '@tangle-network/agent-eval':
+        specifier: ^0.61.0
+        version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)
       '@tangle-network/sandbox':
         specifier: ^0.4.0
         version: 0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))

From ffc89ce123c22f797409b36cb7ac4d36817409e1 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 09:27:33 -0600
Subject: [PATCH 5/7] =?UTF-8?q?chore(release):=200.32.0=20=E2=80=94=20loop?=
 =?UTF-8?q?Dispatch=20adapter=20+=20tokenUsage=20seam=20+=20agent-eval=20p?=
 =?UTF-8?q?eer-dep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index 7835731..a2f128b 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.31.0",
+  "version": "0.32.0",
   "description": "Reusable runtime lifecycle for domain-specific agents.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {

From 39ccd427188d5d9f9595a417290098795dc9e23a Mon Sep 17 00:00:00 2001
From: tangletools <hello@tangle.tools>
Date: Sat, 30 May 2026 18:58:36 -0600
Subject: [PATCH 6/7] feat(loops+improvement): dynamic loop driver +
 identity-gated optimizePrompt (#75)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(loops): dynamic driver — agent-authored loop topology

Third example driver alongside refine and fanout-vote, built on the
existing Driver seam with zero kernel changes. Where refine/fanout-vote
encode a fixed shape as a pure function of history, createDynamicDriver
delegates the per-round shape to an injected TopologyPlanner that emits
one TopologyMove (refine | fanout | stop) per round.

- createDynamicDriver: maps moves onto plan/decide, enforces the
  iteration + fanout caps, fails loud (PlannerError) on a malformed move.
  Planner invoked once per round in plan(); decide() reads the cached
  move so an LLM planner is never double-called. 'done' is already a
  kernel-terminal decision, so termination needs no kernel change.
- createSandboxPlanner: wires the planner to a sandbox profile (any
  harness) — streams a prompt carrying the history summary, decodes the
  move from a JSON envelope (structured result event or fenced block).
- summarizeHistory: bounded, planner-friendly view of iteration history.
- PlannerError added to the error taxonomy (carries 'validation').

Topology is orthogonal to harness: the planner never names a backend;
the kernel's agentRuns round-robin decides which harness runs a branch,
so one dynamic driver spans claude-code/codex/opencode/pi, including
fanning a single round across several at once.

11 tests through the real kernel (sandbox stubbed at the process
boundary): adaptive refine→refine→fanout→stop, explicit scripted
trajectory across two harnesses, maxIterations cap, maxFanout clamp,
empty-fanout + unknown-kind PlannerError, createSandboxPlanner
end-to-end + n-shorthand + fenced-delta parse + decodeTask rejection.

* feat(improvement): optimizePrompt — identity-gated optimization for any text prompt surface

The text-surface entry point onto agent-eval's runImprovementLoop, sibling
to improvementDriver (the code/worktree path). Defaults the driver to
agent-eval's gepaDriver (reflective text mutator) and the gate to
heldOutGate; runtime-agnostic via a single runWithPrompt seam.

Identity-gated by construction: the loop runs evals, collects per-scenario
signal, proposes candidates, and the held-out gate compares candidate vs
baseline. result.prompt is the baseline (identity) UNLESS the gate decided
'ship' — so registering a prompt for optimization can never regress it; it
only improves when held-out data earns it.

Generic over the surface's execution (sandbox streamPrompt, runLoop, direct
model call) — the optimizer never assumes how a prompt runs. Fails loud on
misconfig (no driver/reflection, empty scenarios/holdout) and on a non-string
CodeSurface (wrong entry point).

4 tests through the real runImprovementLoop, zero LLM (deterministic driver +
judge + runner, in-memory storage): identity holds when no candidate beats
baseline on holdout (returns the untouched baseline), promotes + returns the
improved prompt + rationale when a candidate wins, fail-loud on misconfig and
empty holdout.

---------

Co-authored-by: Drew Stone <drewstone329@gmail.com>
---
 src/errors.ts                        |  17 +
 src/improvement/index.ts             |  21 +-
 src/improvement/optimize-prompt.ts   | 242 ++++++++++++++
 src/index.ts                         |   1 +
 src/loops/drivers/dynamic.ts         | 217 ++++++++++++
 src/loops/drivers/sandbox-planner.ts | 254 ++++++++++++++
 src/loops/index.ts                   |  13 +
 tests/loops/dynamic.test.ts          | 474 +++++++++++++++++++++++++++
 tests/optimize-prompt.test.ts        | 136 ++++++++
 9 files changed, 1369 insertions(+), 6 deletions(-)
 create mode 100644 src/improvement/optimize-prompt.ts
 create mode 100644 src/loops/drivers/dynamic.ts
 create mode 100644 src/loops/drivers/sandbox-planner.ts
 create mode 100644 tests/loops/dynamic.test.ts
 create mode 100644 tests/optimize-prompt.test.ts

diff --git a/src/errors.ts b/src/errors.ts
index 40e4716..5b71a7e 100644
--- a/src/errors.ts
+++ b/src/errors.ts
@@ -99,3 +99,20 @@ export class RuntimeRunStateError extends AgentEvalError {
     super('validation', message, options)
   }
 }
+
+/**
+ * @stable
+ *
+ * The dynamic-loop planner returned an unusable topology move — the LLM emitted
+ * no parseable envelope, an unknown `kind`, or a structurally-invalid move
+ * (e.g. a fanout with zero tasks). This is a structural failure of the
+ * agent-authored topology, not a config mistake: the planner ran but its output
+ * cannot drive the kernel. Carries `validation` so cross-package handlers can
+ * pattern-match without importing the runtime. Fail loud — never substitute a
+ * default move, or the loop silently runs a topology nobody chose.
+ */
+export class PlannerError extends AgentEvalError {
+  constructor(message: string, options?: { cause?: unknown }) {
+    super('validation', message, options)
+  }
+}
diff --git a/src/improvement/index.ts b/src/improvement/index.ts
index 7a523d6..80eae77 100644
--- a/src/improvement/index.ts
+++ b/src/improvement/index.ts
@@ -1,11 +1,14 @@
 /**
- * `@tangle-network/agent-runtime` improvement drivers — implementations of
- * agent-eval's `ImprovementDriver` contract.
+ * `@tangle-network/agent-runtime` improvement — two entry points onto
+ * agent-eval's `runImprovementLoop`:
  *
- * ONE driver (`improvementDriver`) owns the candidate lifecycle; pluggable
- * `CandidateGenerator`s set the cost/capability dial:
- *   - `reflectiveGenerator` — cheap, no sandbox, applies pre-drafted patches
- *   - `agenticGenerator`     — full coding harness in the worktree, multi-shot
+ *   - `improvementDriver` (CODE surface) — owns the candidate lifecycle via a
+ *     pluggable `CandidateGenerator`:
+ *       - `reflectiveGenerator` — cheap, no sandbox, applies pre-drafted patches
+ *       - `agenticGenerator`     — full coding harness in the worktree, multi-shot
+ *   - `optimizePrompt` (TEXT surface) — identity-gated optimization of any
+ *     system / planner prompt. Defaults to agent-eval's `gepaDriver` +
+ *     `heldOutGate`; returns the baseline unless the held-out gate ships a win.
  */
 
 export { type AgenticGeneratorOptions, agenticGenerator } from './agentic-generator'
@@ -14,4 +17,10 @@ export {
   type ImprovementDriverOptions,
   improvementDriver,
 } from './improvement-driver'
+export {
+  type OptimizePromptOptions,
+  type OptimizePromptReflection,
+  type OptimizePromptResult,
+  optimizePrompt,
+} from './optimize-prompt'
 export { type ReflectiveGeneratorOptions, reflectiveGenerator } from './reflective-generator'
diff --git a/src/improvement/optimize-prompt.ts b/src/improvement/optimize-prompt.ts
new file mode 100644
index 0000000..1c822c8
--- /dev/null
+++ b/src/improvement/optimize-prompt.ts
@@ -0,0 +1,242 @@
+/**
+ * @experimental
+ *
+ * `optimizePrompt` — identity-gated optimization for any TEXT prompt surface
+ * (system prompt, planner prompt, judge rubric, skill doc).
+ *
+ * The text-surface sibling to this module's `improvementDriver` (the
+ * CODE-surface / worktree path). Both feed agent-eval's `runImprovementLoop`;
+ * this one defaults the driver to agent-eval's `gepaDriver` (reflective text
+ * mutator) and the gate to `heldOutGate`.
+ *
+ * IDENTITY-GATED BY CONSTRUCTION — the whole point. The loop runs evals,
+ * collects per-scenario signal, proposes candidates, and the gate compares
+ * candidate-vs-baseline ON THE HELDOUT. `result.prompt` is the baseline
+ * (identity) UNLESS the gate decided `'ship'`. So wiring a surface up is safe:
+ * a surface with no beneficial mutation simply keeps its baseline. You never
+ * regress by registering a prompt — you only ever improve when the held-out
+ * data earns it.
+ *
+ * Generic over the runtime: `runWithPrompt` is the only domain seam — given a
+ * candidate prompt + scenario, run it however the surface runs (sandbox
+ * `streamPrompt`, a `runLoop`, a direct model call) and return the artifact the
+ * judges score. The optimizer never assumes how a prompt is executed.
+ */
+
+import type { LlmClientOptions } from '@tangle-network/agent-eval'
+import type {
+  CampaignResult,
+  CampaignStorage,
+  DispatchContext,
+  Gate,
+  GateResult,
+  ImprovementDriver,
+  JudgeConfig,
+  RunImprovementLoopResult,
+  Scenario,
+} from '@tangle-network/agent-eval/campaign'
+import { gepaDriver, heldOutGate, runImprovementLoop } from '@tangle-network/agent-eval/campaign'
+import { ConfigError } from '../errors'
+
+/** Reflection config for the default `gepaDriver`. Omit when passing a custom
+ *  `driver`. */
+export interface OptimizePromptReflection {
+  /** Router transport for the reflection model. */
+  llm: LlmClientOptions
+  /** Model that performs the reflective rewrite. */
+  model: string
+  /** What is being optimized — orients the reflection prompt. Default
+   *  `'system prompt'`. */
+  target?: string
+  /** Surface-specific mutation levers offered to the reflector. */
+  mutationPrimitives?: string[]
+  /** H2 (`## Foo`) headings that MUST survive every candidate. gepaDriver's
+   *  only structural guard — load-bearing sections of the prompt should be
+   *  `##` headings so a rewrite cannot drop them. */
+  preserveSections?: string[]
+  /** Max sentence-level edits per candidate vs the parent (a textual learning
+   *  rate). Caps a rewrite from wiping prior rules in one generation. */
+  maxSentenceEdits?: number
+}
+
+/** @experimental */
+export interface OptimizePromptOptions<TScenario extends Scenario, TArtifact> {
+  /** The prompt being optimized — the identity baseline the gate protects. */
+  baselinePrompt: string
+  /** Domain seam: run a candidate prompt against a scenario → artifact the
+   *  judges score. The optimizer is agnostic to HOW the prompt runs. */
+  runWithPrompt: (prompt: string, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>
+  /** Training pool — scored each generation to rank candidates. */
+  scenarios: TScenario[]
+  /** Held out of training — scored ONLY for the gate's baseline-vs-winner
+   *  delta. Disjoint from `scenarios`; this is what makes promotion measure
+   *  generalization, not memorization. */
+  holdoutScenarios: TScenario[]
+  /** Scorers — deterministic checks or LLM judges. */
+  judges: JudgeConfig<TArtifact, TScenario>[]
+  /** Where artifacts + traces land (opaque key under in-memory storage). */
+  runDir: string
+  /** Default driver = `gepaDriver` built from this. Required UNLESS `driver`
+   *  is supplied. */
+  reflection?: OptimizePromptReflection
+  /** Override the improvement strategy (custom driver / deterministic tests). */
+  driver?: ImprovementDriver
+  /** Override the promotion gate. Default `heldOutGate` over `holdoutScenarios`
+   *  — zero extra LLM. Wrap `defaultProductionGate` for red-team/reward-hacking
+   *  hardening on production wiring. */
+  gate?: Gate<TArtifact, TScenario>
+  /** Minimum held-out composite lift to ship, forwarded to the default
+   *  `heldOutGate`. When omitted the gate uses its own default. */
+  deltaThreshold?: number
+  /** Candidates proposed per generation. Default 4. */
+  populationSize?: number
+  /** Generations to run. Default 3. */
+  maxGenerations?: number
+  /** Candidates carried to the next generation. Default 2. */
+  promoteTopK?: number
+  /** Storage backend. Pass `inMemoryCampaignStorage()` for filesystem-less /
+   *  test runs. Default: Node filesystem. */
+  storage?: CampaignStorage
+  /** Reproducibility seed. Default 42. */
+  seed?: number
+  /** Per-scenario replicates for CI bands. Default 1. */
+  reps?: number
+  /** Max concurrent cells. Default 2. */
+  maxConcurrency?: number
+  /** Test seam — override the wall clock. */
+  now?: () => Date
+  /** On a shipped gate: `'pr'` opens a PR, `'none'` just reports. Default
+   *  `'none'`. */
+  autoOnPromote?: 'pr' | 'none'
+  ghOwner?: string
+  ghRepo?: string
+}
+
+/** @experimental */
+export interface OptimizePromptResult<TArtifact, TScenario extends Scenario> {
+  /** The prompt to USE. Identity (the baseline) unless the gate shipped a
+   *  winner — so a caller can always assign `result.prompt` unconditionally. */
+  prompt: string
+  /** True only when the gate promoted a candidate over baseline on holdout. */
+  improved: boolean
+  /** The gate's verdict (`'ship' | 'hold' | 'need_more_work' | ...`). */
+  decision: GateResult['decision']
+  /** Human-readable reasons the gate gave. */
+  reasons: string[]
+  /** Mean held-out composite of the baseline. */
+  baselineComposite: number
+  /** Mean held-out composite of the winner candidate. */
+  winnerComposite: number
+  /** Held-out lift (winner − baseline); the gate's `delta` when it reported one. */
+  delta: number
+  /** Why the winner was proposed — present when a shipped winner carried a
+   *  driver rationale. */
+  rationale?: string
+  /** Unified baseline→winner diff (empty when the winner is the baseline). */
+  diff: string
+  /** The full loop result for callers that need generations / campaigns. */
+  raw: RunImprovementLoopResult<TArtifact, TScenario>
+}
+
+/** @experimental */
+export async function optimizePrompt<TScenario extends Scenario, TArtifact>(
+  opts: OptimizePromptOptions<TScenario, TArtifact>,
+): Promise<OptimizePromptResult<TArtifact, TScenario>> {
+  if (!opts.driver && !opts.reflection) {
+    throw new ConfigError(
+      'optimizePrompt: pass `reflection` (builds the default gepaDriver) or a custom `driver`',
+    )
+  }
+  if (opts.scenarios.length === 0) {
+    throw new ConfigError('optimizePrompt: `scenarios` must be non-empty')
+  }
+  if (opts.holdoutScenarios.length === 0) {
+    throw new ConfigError(
+      'optimizePrompt: `holdoutScenarios` must be non-empty (the gate needs it)',
+    )
+  }
+
+  const driver =
+    opts.driver ??
+    gepaDriver({
+      llm: opts.reflection!.llm,
+      model: opts.reflection!.model,
+      target: opts.reflection!.target ?? 'system prompt',
+      mutationPrimitives: opts.reflection!.mutationPrimitives,
+      constraints:
+        opts.reflection!.preserveSections || opts.reflection!.maxSentenceEdits !== undefined
+          ? {
+              preserveSections: opts.reflection!.preserveSections,
+              maxSentenceEdits: opts.reflection!.maxSentenceEdits,
+            }
+          : undefined,
+    })
+
+  const gate =
+    opts.gate ??
+    heldOutGate<TArtifact, TScenario>({
+      scenarios: opts.holdoutScenarios,
+      ...(opts.deltaThreshold !== undefined ? { deltaThreshold: opts.deltaThreshold } : {}),
+    })
+
+  const result = await runImprovementLoop<TScenario, TArtifact>({
+    baselineSurface: opts.baselinePrompt,
+    dispatchWithSurface: (surface, scenario, ctx) => {
+      if (typeof surface !== 'string') {
+        // optimizePrompt is the TEXT-surface entry point; a CodeSurface means
+        // the caller wired the wrong driver. Fail loud — don't silently run the
+        // baseline and report a phantom score.
+        throw new ConfigError(
+          'optimizePrompt: received a CodeSurface — this entry point optimizes string prompts only',
+        )
+      }
+      return opts.runWithPrompt(surface, scenario, ctx)
+    },
+    driver,
+    populationSize: opts.populationSize ?? 4,
+    maxGenerations: opts.maxGenerations ?? 3,
+    ...(opts.promoteTopK !== undefined ? { promoteTopK: opts.promoteTopK } : {}),
+    scenarios: opts.scenarios,
+    holdoutScenarios: opts.holdoutScenarios,
+    judges: opts.judges,
+    gate,
+    autoOnPromote: opts.autoOnPromote ?? 'none',
+    ...(opts.ghOwner !== undefined ? { ghOwner: opts.ghOwner } : {}),
+    ...(opts.ghRepo !== undefined ? { ghRepo: opts.ghRepo } : {}),
+    runDir: opts.runDir,
+    ...(opts.storage !== undefined ? { storage: opts.storage } : {}),
+    ...(opts.seed !== undefined ? { seed: opts.seed } : {}),
+    ...(opts.reps !== undefined ? { reps: opts.reps } : {}),
+    ...(opts.maxConcurrency !== undefined ? { maxConcurrency: opts.maxConcurrency } : {}),
+    ...(opts.now !== undefined ? { now: opts.now } : {}),
+  })
+
+  const improved = result.gateResult.decision === 'ship'
+  const winnerSurface =
+    typeof result.winnerSurface === 'string' ? result.winnerSurface : opts.baselinePrompt
+  const baselineComposite = meanComposite(result.baselineOnHoldout)
+  const winnerComposite = meanComposite(result.winnerOnHoldout)
+
+  return {
+    prompt: improved ? winnerSurface : opts.baselinePrompt,
+    improved,
+    decision: result.gateResult.decision,
+    reasons: result.gateResult.reasons,
+    baselineComposite,
+    winnerComposite,
+    delta: result.gateResult.delta ?? winnerComposite - baselineComposite,
+    ...(improved && result.winnerRationale ? { rationale: result.winnerRationale } : {}),
+    diff: result.promotedDiff,
+    raw: result,
+  }
+}
+
+/** Mean composite over a campaign's per-scenario aggregates. The held-out
+ *  campaigns score one surface across `holdoutScenarios`; averaging the
+ *  per-scenario means gives the single number the gate's delta is built from. */
+function meanComposite(campaign: CampaignResult<unknown, Scenario>): number {
+  const scenarios = Object.values(campaign.aggregates.byScenario)
+  if (scenarios.length === 0) return 0
+  const sum = scenarios.reduce((acc, s) => acc + s.meanComposite, 0)
+  return sum / scenarios.length
+}
diff --git a/src/index.ts b/src/index.ts
index 79845b1..9fc8836 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -101,6 +101,7 @@ export {
   ConfigError,
   JudgeError,
   NotFoundError,
+  PlannerError,
   RuntimeRunStateError,
   ValidationError,
 } from './errors'
diff --git a/src/loops/drivers/dynamic.ts b/src/loops/drivers/dynamic.ts
new file mode 100644
index 0000000..3a9ec39
--- /dev/null
+++ b/src/loops/drivers/dynamic.ts
@@ -0,0 +1,217 @@
+/**
+ * @experimental
+ *
+ * Dynamic driver — the agent authors the loop topology at runtime.
+ *
+ * Where `refine` and `fanout-vote` encode a fixed shape as a pure function of
+ * history, this driver delegates the per-round shape to an injected
+ * `TopologyPlanner`. Each round the planner inspects the task + iteration
+ * history and emits one `TopologyMove`:
+ *   - `refine` → one task next round (optionally rewritten from the prior attempt)
+ *   - `fanout` → N tasks next round (the kernel round-robins `agentRuns`, so a
+ *     2-harness fanout dispatches branch 0 to harness A and branch 1 to harness B)
+ *   - `stop`   → terminate; the kernel selects the winner across all iterations
+ *
+ * The planner is the brain; this driver is the structure. It maps moves onto
+ * the kernel's `plan`/`decide` contract, enforces the iteration + fanout caps,
+ * and fails loud on a malformed move. The planner is injected exactly like
+ * `refine`'s `refineTask` and `fanout-vote`'s `selector` — so a test can drive
+ * a deterministic policy through the real kernel, and production can wire it to
+ * an LLM via `createSandboxPlanner`.
+ *
+ * Topology is orthogonal to harness: the planner never names a backend. Which
+ * harness runs a branch is decided by the `AgentRunSpec` the kernel round-robins
+ * to, so one dynamic driver works across claude-code, codex, opencode, pi —
+ * including fanning a single round across several at once.
+ */
+
+import { PlannerError, ValidationError } from '../../errors'
+import type { Driver, Iteration } from '../types'
+
+/** Terminal once `decide` returns `'done'` (a kernel terminal decision). */
+export type DynamicDecision = 'continue' | 'done'
+
+/**
+ * One topology decision for the next round. `fanout` carries explicit tasks
+ * rather than a count so the planner can issue heterogeneous branches (a
+ * different sub-task per harness); pass N copies of one task for a homogeneous
+ * fanout that relies on `agentRuns` diversity instead.
+ *
+ * @experimental
+ */
+export type TopologyMove<Task> =
+  | { kind: 'refine'; task: Task; rationale?: string }
+  | { kind: 'fanout'; tasks: Task[]; rationale?: string }
+  | { kind: 'stop'; rationale?: string }
+
+/** @experimental */
+export interface PlannerContext<Task, Output> {
+  /** The root task the loop was invoked with — stable across rounds. */
+  task: Task
+  /** Every iteration so far, in dispatch order, with outputs + verdicts. */
+  history: ReadonlyArray<Iteration<Task, Output>>
+  /** `history.length` — iterations already spent. */
+  iterationsSpent: number
+  /** Iterations left before the driver's `maxIterations` cap forces a stop. */
+  iterationsRemaining: number
+}
+
+/**
+ * Chooses the next topology move from the task + history. Sync or async; an
+ * async planner is where an LLM call goes (see `createSandboxPlanner`).
+ *
+ * @experimental
+ */
+export type TopologyPlanner<Task, Output> = (
+  ctx: PlannerContext<Task, Output>,
+) => TopologyMove<Task> | Promise<TopologyMove<Task>>
+
+/** @experimental */
+export interface CreateDynamicDriverOptions<Task, Output> {
+  /** The agent-authored topology policy. Invoked once per round in `plan`. */
+  planner: TopologyPlanner<Task, Output>
+  /**
+   * Hard safety cap on total iterations. When reached, the driver stops before
+   * consulting the planner. Default 8. Set the kernel's `runLoop`
+   * `maxIterations >= ` this so the driver's cap governs and the loop closes on
+   * a clean `'done'` rather than a truncated `'continue'`.
+   */
+  maxIterations?: number
+  /** Max branches a single `fanout` move may dispatch. Default 4. */
+  maxFanout?: number
+  /** Stable identifier surfaced in trace events. Default `'dynamic'`. */
+  name?: string
+}
+
+/** @experimental */
+export function createDynamicDriver<Task, Output>(
+  options: CreateDynamicDriverOptions<Task, Output>,
+): Driver<Task, Output, DynamicDecision> {
+  if (typeof options.planner !== 'function') {
+    throw new ValidationError('createDynamicDriver: planner must be a function')
+  }
+  const maxIterations = options.maxIterations ?? 8
+  if (!Number.isFinite(maxIterations) || maxIterations <= 0) {
+    throw new ValidationError('createDynamicDriver: maxIterations must be > 0')
+  }
+  const maxFanout = options.maxFanout ?? 4
+  if (!Number.isFinite(maxFanout) || maxFanout < 1) {
+    throw new ValidationError('createDynamicDriver: maxFanout must be >= 1')
+  }
+
+  // The kernel calls plan(), runs the batch, then calls decide() — strictly
+  // sequential, one driver instance per loop. Caching the move the planner
+  // chose this round lets decide() report terminality without re-invoking the
+  // planner (which would double every LLM call).
+  let pending: TopologyMove<Task> | undefined
+
+  return {
+    name: options.name ?? 'dynamic',
+    async plan(task, history) {
+      if (history.length >= maxIterations) {
+        pending = { kind: 'stop', rationale: `maxIterations (${maxIterations}) reached` }
+        return []
+      }
+      const move = await options.planner({
+        task,
+        history,
+        iterationsSpent: history.length,
+        iterationsRemaining: maxIterations - history.length,
+      })
+      pending = validateMove(move, maxFanout)
+      switch (pending.kind) {
+        case 'refine':
+          return [pending.task]
+        case 'fanout':
+          return pending.tasks
+        case 'stop':
+          return []
+      }
+    },
+    decide() {
+      // pending is set by the plan() call that immediately precedes every
+      // decide(). Only a `stop` move terminates; refine/fanout keep looping so
+      // plan() — and thus the planner — runs again next round.
+      return pending?.kind === 'stop' ? 'done' : 'continue'
+    },
+  }
+}
+
+function validateMove<Task>(move: TopologyMove<Task>, maxFanout: number): TopologyMove<Task> {
+  if (!move || typeof move !== 'object' || typeof (move as { kind?: unknown }).kind !== 'string') {
+    throw new PlannerError(`dynamic planner returned a non-move value: ${describe(move)}`)
+  }
+  switch (move.kind) {
+    case 'refine':
+      return move
+    case 'stop':
+      return move
+    case 'fanout': {
+      if (!Array.isArray(move.tasks) || move.tasks.length === 0) {
+        throw new PlannerError('dynamic planner fanout move must carry a non-empty tasks[]')
+      }
+      if (move.tasks.length <= maxFanout) return move
+      // Clamp rather than reject — over-fanning is a budget concern, not a
+      // structural error. The clamp is recorded in the rationale for traces.
+      return {
+        kind: 'fanout',
+        tasks: move.tasks.slice(0, maxFanout),
+        rationale: `${move.rationale ?? ''} [clamped ${move.tasks.length}→${maxFanout}]`.trim(),
+      }
+    }
+    default:
+      throw new PlannerError(
+        `dynamic planner returned unknown move kind: ${describe((move as { kind: unknown }).kind)}`,
+      )
+  }
+}
+
+function describe(value: unknown): string {
+  try {
+    return JSON.stringify(value) ?? String(value)
+  } catch {
+    return String(value)
+  }
+}
+
+/**
+ * Compact, planner-friendly view of iteration history — what an LLM planner
+ * needs to choose the next move without the raw event streams. Each output is
+ * serialized and cut to `maxOutputChars` (default 600); rows are not capped.
+ *
+ * @experimental
+ */
+export function summarizeHistory<Task, Output>(
+  history: ReadonlyArray<Iteration<Task, Output>>,
+  opts: { maxOutputChars?: number } = {},
+): Array<{
+  index: number
+  agentRunName: string
+  valid?: boolean
+  score?: number
+  error?: string
+  output?: string
+}> {
+  const maxOutputChars = opts.maxOutputChars ?? 600
+  return history.map((iter) => {
+    const row: {
+      index: number
+      agentRunName: string
+      valid?: boolean
+      score?: number
+      error?: string
+      output?: string
+    } = { index: iter.index, agentRunName: iter.agentRunName }
+    if (iter.verdict) {
+      row.valid = iter.verdict.valid
+      if (typeof iter.verdict.score === 'number') row.score = iter.verdict.score
+    }
+    if (iter.error) row.error = iter.error.message
+    if (iter.output !== undefined) {
+      const serialized = describe(iter.output)
+      row.output =
+        serialized.length > maxOutputChars ? `${serialized.slice(0, maxOutputChars)}…` : serialized
+    }
+    return row
+  })
+}
diff --git a/src/loops/drivers/sandbox-planner.ts b/src/loops/drivers/sandbox-planner.ts
new file mode 100644
index 0000000..9389776
--- /dev/null
+++ b/src/loops/drivers/sandbox-planner.ts
@@ -0,0 +1,254 @@
+/**
+ * @experimental
+ *
+ * `createSandboxPlanner` — wire the dynamic driver's `TopologyPlanner` to a
+ * real agent. Each round it spins a sandbox on `profile`, streams a prompt that
+ * carries the history summary, and decodes the agent's chosen `TopologyMove`
+ * from a JSON envelope it emits. This is the "agent authors its own loop
+ * topology" path: the planner profile can be any harness (claude-code, codex,
+ * opencode, pi) — its only job is to read what happened and emit the next move.
+ *
+ * The planner profile is deliberately distinct from the worker `agentRuns`: a
+ * cheap fast model can steer topology while expensive workers do the labor, and
+ * the planner never names which harness runs a branch — the kernel's
+ * `agentRuns` round-robin decides that.
+ *
+ * Envelope contract the agent must emit (fenced ```json or a structured
+ * `result`/`final`/`planner.move` event payload):
+ *   { "kind": "refine" | "fanout" | "stop",
+ *     "tasks"?: [ <task>, ... ],   // decoded via `decodeTask`
+ *     "n"?: number,                // fanout shorthand: N copies of the root task
+ *     "rationale"?: string }
+ *
+ * A missing / unparseable / unknown-kind envelope throws `PlannerError` — the
+ * loop never silently runs a topology the agent did not choose.
+ */
+
+import type {
+  AgentProfile,
+  CreateSandboxOptions,
+  SandboxEvent,
+} from '@tangle-network/sandbox'
+import { PlannerError, ValidationError } from '../../errors'
+import type { AgentRunSpec, LoopSandboxClient } from '../types'
+import type { PlannerContext, TopologyMove, TopologyPlanner } from './dynamic'
+import { summarizeHistory } from './dynamic'
+
+/** Raw, pre-decode envelope an agent emits to choose the next move. */
+export interface TopologyMoveEnvelope {
+  kind: string
+  tasks?: unknown[]
+  n?: number
+  rationale?: string
+}
+
+/** @experimental */
+export interface CreateSandboxPlannerOptions<Task, Output> {
+  /** Sandbox client — the planner calls `.create()` once per round. */
+  client: LoopSandboxClient
+  /** The planner agent. Steers topology; does not run the work. */
+  profile: AgentProfile
+  /**
+   * Decode one raw task from the envelope's `tasks[]` into a domain `Task`.
+   * Required because `Task` is opaque to this module — only the caller knows
+   * its shape. Throw to reject a malformed task; the error surfaces as a
+   * `PlannerError`.
+   */
+  decodeTask: (raw: unknown, ctx: PlannerContext<Task, Output>) => Task
+  /** Override the default prompt (history summary + envelope contract). */
+  buildPrompt?: (ctx: PlannerContext<Task, Output>) => string
+  /** Override envelope extraction from the event stream. */
+  parseEnvelope?: (events: SandboxEvent[]) => TopologyMoveEnvelope | undefined
+  /** Sandbox overrides for the planner sandbox (timeouts, env, etc.). */
+  sandboxOverrides?: AgentRunSpec<Task>['sandboxOverrides']
+  /** Cancellation for the planner's own LLM call. */
+  signal?: AbortSignal
+}
+
+/** @experimental */
+export function createSandboxPlanner<Task, Output>(
+  opts: CreateSandboxPlannerOptions<Task, Output>,
+): TopologyPlanner<Task, Output> {
+  if (!opts.client || typeof opts.client.create !== 'function') {
+    throw new ValidationError('createSandboxPlanner: client.create is required')
+  }
+  if (typeof opts.decodeTask !== 'function') {
+    throw new ValidationError('createSandboxPlanner: decodeTask is required')
+  }
+  const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt
+  const parseEnvelope = opts.parseEnvelope ?? defaultParseEnvelope
+
+  return async (ctx) => {
+    const box = await opts.client.create(buildSandboxOptions(opts.profile, opts.sandboxOverrides))
+    const events: SandboxEvent[] = []
+    for await (const event of box.streamPrompt(buildPrompt(ctx), { signal: opts.signal })) {
+      events.push(event)
+    }
+    const envelope = parseEnvelope(events)
+    if (!envelope) {
+      throw new PlannerError('sandbox planner emitted no parseable topology-move envelope')
+    }
+    return envelopeToMove(envelope, ctx, opts.decodeTask)
+  }
+}
+
+function envelopeToMove<Task, Output>(
+  envelope: TopologyMoveEnvelope,
+  ctx: PlannerContext<Task, Output>,
+  decodeTask: (raw: unknown, ctx: PlannerContext<Task, Output>) => Task,
+): TopologyMove<Task> {
+  const kind = String(envelope.kind ?? '').toLowerCase()
+  const rationale = typeof envelope.rationale === 'string' ? envelope.rationale : undefined
+  if (kind === 'stop') {
+    return { kind: 'stop', rationale }
+  }
+  if (kind === 'refine') {
+    const raw = Array.isArray(envelope.tasks) ? envelope.tasks[0] : undefined
+    // No new task → replay the root task; the worker self-corrects from its
+    // own prior attempt in sandbox state, mirroring the refine driver default.
+    const task = raw === undefined ? ctx.task : decodeTaskGuarded(decodeTask, raw, ctx)
+    return { kind: 'refine', task, rationale }
+  }
+  if (kind === 'fanout') {
+    const tasks = resolveFanoutTasks(envelope, ctx, decodeTask)
+    return { kind: 'fanout', tasks, rationale }
+  }
+  throw new PlannerError(`sandbox planner emitted unknown move kind: ${JSON.stringify(envelope.kind)}`)
+}
+
+function resolveFanoutTasks<Task, Output>(
+  envelope: TopologyMoveEnvelope,
+  ctx: PlannerContext<Task, Output>,
+  decodeTask: (raw: unknown, ctx: PlannerContext<Task, Output>) => Task,
+): Task[] {
+  if (Array.isArray(envelope.tasks) && envelope.tasks.length > 0) {
+    return envelope.tasks.map((raw) => decodeTaskGuarded(decodeTask, raw, ctx))
+  }
+  // `n` shorthand: N copies of the root task, leaning on `agentRuns` diversity.
+  if (typeof envelope.n === 'number' && Number.isFinite(envelope.n) && envelope.n >= 1) {
+    return Array.from({ length: Math.floor(envelope.n) }, () => ctx.task)
+  }
+  throw new PlannerError('sandbox planner fanout envelope needs a non-empty tasks[] or n >= 1')
+}
+
+function decodeTaskGuarded<Task, Output>(
+  decodeTask: (raw: unknown, ctx: PlannerContext<Task, Output>) => Task,
+  raw: unknown,
+  ctx: PlannerContext<Task, Output>,
+): Task {
+  try {
+    return decodeTask(raw, ctx)
+  } catch (err) {
+    throw new PlannerError(`sandbox planner decodeTask rejected ${JSON.stringify(raw)}`, {
+      cause: err,
+    })
+  }
+}
+
+function buildSandboxOptions(
+  profile: AgentProfile,
+  overrides: AgentRunSpec<unknown>['sandboxOverrides'],
+): CreateSandboxOptions {
+  const base = overrides ?? {}
+  const overrideBackend = base.backend
+  const explicitType = profile.metadata?.backendType
+  type BackendType = NonNullable<CreateSandboxOptions['backend']>['type']
+  return {
+    ...base,
+    backend: {
+      type: (overrideBackend?.type ?? explicitType ?? 'opencode') as BackendType,
+      profile,
+      ...(overrideBackend?.model ? { model: overrideBackend.model } : {}),
+      ...(overrideBackend?.server ? { server: overrideBackend.server } : {}),
+    },
+  }
+}
+
+function defaultBuildPrompt<Task, Output>(ctx: PlannerContext<Task, Output>): string {
+  const summary = summarizeHistory(ctx.history)
+  return [
+    'You are the loop planner. You do not do the work — you decide the topology of the next round.',
+    '',
+    `Root task:\n${safeJson(ctx.task)}`,
+    '',
+    `Iterations spent: ${ctx.iterationsSpent}. Remaining before the hard cap: ${ctx.iterationsRemaining}.`,
+    '',
+    ctx.history.length === 0
+      ? 'No attempts yet.'
+      : `Attempts so far (index, agent, verdict, output):\n${safeJson(summary)}`,
+    '',
+    'Choose ONE move and emit it as a fenced JSON block:',
+    '  - {"kind":"refine","tasks":[<task>],"rationale":"..."} — one more attempt; omit tasks to replay the root task.',
+    '  - {"kind":"fanout","tasks":[<task>,<task>],"rationale":"..."} — N parallel branches (or "n": N for N copies of the root task).',
+    '  - {"kind":"stop","rationale":"..."} — a valid result exists or further attempts will not help.',
+    '',
+    'Stop as soon as an attempt is valid. Prefer refine when an attempt is close; fan out when attempts disagree or the approach is uncertain.',
+    'Emit ONLY the JSON block.',
+  ].join('\n')
+}
+
+function defaultParseEnvelope(events: SandboxEvent[]): TopologyMoveEnvelope | undefined {
+  // Structured payload on a terminal event wins — sandbox SDKs lift emitted
+  // JSON onto data.result / data.output / data of a result|final|planner.move event.
+  for (let i = events.length - 1; i >= 0; i -= 1) {
+    const event = events[i]
+    if (!event) continue
+    const type = String(event.type ?? '')
+    const data = isRecord(event.data) ? event.data : undefined
+    if (!data) continue
+    if (type === 'result' || type === 'final' || type === 'planner.move') {
+      const direct = coerceEnvelope(data.result ?? data.output ?? data)
+      if (direct) return direct
+    }
+  }
+  // Fall back to the newest text event whose fenced (or bare) JSON is an envelope.
+  for (let i = events.length - 1; i >= 0; i -= 1) {
+    const event = events[i]
+    if (!event) continue
+    const data = isRecord(event.data) ? event.data : undefined
+    if (!data) continue
+    const text = pickString(data.text) ?? pickString(data.delta) ?? pickString(data.content)
+    if (!text) continue
+    const fenced = extractFencedJson(text)
+    const coerced = coerceEnvelope(fenced)
+    if (coerced) return coerced
+  }
+  return undefined
+}
+
+function coerceEnvelope(value: unknown): TopologyMoveEnvelope | undefined {
+  if (!isRecord(value)) return undefined
+  if (typeof value.kind !== 'string' || value.kind.length === 0) return undefined
+  const out: TopologyMoveEnvelope = { kind: value.kind }
+  if (Array.isArray(value.tasks)) out.tasks = value.tasks
+  if (typeof value.n === 'number') out.n = value.n
+  if (typeof value.rationale === 'string') out.rationale = value.rationale
+  return out
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === 'object' && !Array.isArray(value)
+}
+
+function pickString(value: unknown): string | undefined {
+  return typeof value === 'string' && value.length > 0 ? value : undefined
+}
+
+function extractFencedJson(text: string): unknown | undefined {
+  const match = text.match(/```(?:json)?\s*([\s\S]*?)```/i)
+  const body = (match?.[1] ?? text).trim()
+  if (!body) return undefined
+  try {
+    return JSON.parse(body)
+  } catch {
+    return undefined
+  }
+}
+
+function safeJson(value: unknown): string {
+  try {
+    return JSON.stringify(value, null, 2) ?? String(value)
+  } catch {
+    return String(value)
+  }
+}
diff --git a/src/loops/index.ts b/src/loops/index.ts
index d854266..8bb4c39 100644
--- a/src/loops/index.ts
+++ b/src/loops/index.ts
@@ -17,6 +17,14 @@ export type {
   SandboxEvent,
   SandboxInstance,
 } from '@tangle-network/sandbox'
+export type {
+  CreateDynamicDriverOptions,
+  DynamicDecision,
+  PlannerContext,
+  TopologyMove,
+  TopologyPlanner,
+} from './drivers/dynamic'
+export { createDynamicDriver, summarizeHistory } from './drivers/dynamic'
 export type {
   CreateFanoutVoteDriverOptions,
   FanoutVoteDecision,
@@ -25,6 +33,11 @@ export type {
 export { createFanoutVoteDriver, scoreFanoutVoteIterations } from './drivers/fanout-vote'
 export type { CreateRefineDriverOptions, RefineDecision } from './drivers/refine'
 export { createRefineDriver, refineWinnerIndex } from './drivers/refine'
+export type {
+  CreateSandboxPlannerOptions,
+  TopologyMoveEnvelope,
+} from './drivers/sandbox-planner'
+export { createSandboxPlanner } from './drivers/sandbox-planner'
 export type { RunLoopOptions } from './run-loop'
 export { runLoop } from './run-loop'
 export { reportLoopUsage, type UsageSink } from './report-usage'
diff --git a/tests/loops/dynamic.test.ts b/tests/loops/dynamic.test.ts
new file mode 100644
index 0000000..48ea848
--- /dev/null
+++ b/tests/loops/dynamic.test.ts
@@ -0,0 +1,474 @@
+import type {
+  AgentProfile,
+  CreateSandboxOptions,
+  SandboxEvent,
+  SandboxInstance,
+} from '@tangle-network/sandbox'
+import { describe, expect, it } from 'vitest'
+import { PlannerError } from '../../src/errors'
+import {
+  type AgentRunSpec,
+  createDynamicDriver,
+  createSandboxPlanner,
+  type OutputAdapter,
+  runLoop,
+  type TopologyMove,
+  type TopologyPlanner,
+  type Validator,
+} from '../../src/loops'
+
+interface Task {
+  goal: string // carried in the JSON prompt; the mock workers never read it
+  strategy: string // approach label; the mock workers score a run from this via scoreFor
+}
+
+interface Out {
+  strategy: string // strategy the worker ran, echoed back from the task
+  harness: string // profile name of the sandbox that produced the result
+  score: number // compared against VALID_THRESHOLD by the validator
+}
+
+const VALID_THRESHOLD = 0.7 // the validator marks an output valid when score >= this
+
+// The score depends only on the strategy the planner picked: any `parallel*` strategy
+// scores 0.9 and clears the 0.7 validity bar, while `careful` (0.6) and everything else
+// (0.3) fall short. That gap is what lets a planner adapt — refine first, then fan out
+// once refinement stalls.
+function scoreFor(strategy: string): number {
+  if (strategy === 'careful') return 0.6
+  return strategy.startsWith('parallel') ? 0.9 : 0.3
+}
+
+const output: OutputAdapter<Out> = {
+  // Only the last event is read; a missing strategy/harness becomes '' and a
+  // non-numeric score becomes 0.
+  parse(events) {
+    const payload = events.at(-1)?.data as Partial<Out> | undefined
+    const strategy = payload?.strategy ?? ''
+    const harness = payload?.harness ?? ''
+    const score = typeof payload?.score === 'number' ? payload.score : 0
+    return { strategy, harness, score }
+  },
+}
+
+const validator: Validator<Out> = {
+  async validate({ score }) {
+    return { valid: score >= VALID_THRESHOLD, score } // valid once the score reaches the bar
+  },
+}
+
+function profile(name: string): AgentProfile {
+  return { name } satisfies AgentProfile // bare fixture: only `name` is set
+}
+
+function workerSpecs(names: string[]): AgentRunSpec<Task>[] {
+  const specs: AgentRunSpec<Task>[] = [] // one spec per harness name, in input order
+  for (const name of names) {
+    specs.push({ profile: profile(name), name, taskToPrompt: (task) => JSON.stringify(task) })
+  }
+  return specs
+}
+
+// Mock sandbox client for worker runs. Each sandbox takes its harness label from the
+// profile name handed to `create` ('unknown' when no profile object is given), parses
+// the prompt back into a Task, and emits a single `result` event whose score is
+// scoreFor(task.strategy). Every prompt is appended to `dispatched`, so tests can inspect
+// which harness ran which strategy.
+function workerClient() {
+  const dispatched: Array<{ harness: string; strategy: string }> = []
+  const client = {
+    async create(opts?: CreateSandboxOptions): Promise<SandboxInstance> {
+      const prof = opts?.backend?.profile
+      const harness = (prof && typeof prof === 'object' ? prof.name : undefined) ?? 'unknown'
+      // Only streamPrompt is implemented; the cast stands in for the rest of SandboxInstance.
+      return {
+        async *streamPrompt(message: string) {
+          const { strategy } = JSON.parse(message) as Task
+          // Logged when the stream is consumed, not when the sandbox is created.
+          dispatched.push({ harness, strategy })
+          yield {
+            type: 'result',
+            data: { strategy, harness, score: scoreFor(strategy) },
+          } satisfies SandboxEvent
+        },
+      } as unknown as SandboxInstance
+    },
+  }
+  return { dispatched, client }
+}
+
+describe('runLoop + createDynamicDriver', () => {
+  it('lets an adaptive planner choose refine→refine→fanout→stop from history', async () => {
+    const goal = 'ship the feature'
+    // The planner reads history and adapts: try cheap strategies first, escalate
+    // to a heterogeneous fanout when refinement stalls, stop once a branch wins.
+    const planner: TopologyPlanner<Task, Out> = ({ history }) => {
+      if (history.some((h) => h.verdict?.valid === true)) return { kind: 'stop' }
+      if (history.length === 0) return { kind: 'refine', task: { goal, strategy: 'naive' } }
+      if (history.length === 1) return { kind: 'refine', task: { goal, strategy: 'careful' } }
+      return {
+        kind: 'fanout',
+        tasks: [
+          { goal, strategy: 'parallel-a' },
+          { goal, strategy: 'parallel-b' },
+        ],
+      }
+    }
+
+    const { client, dispatched } = workerClient()
+    const result = await runLoop({
+      driver: createDynamicDriver<Task, Out>({ planner, maxIterations: 8 }),
+      agentRuns: workerSpecs(['worker-a', 'worker-b']),
+      output,
+      validator,
+      task: { goal, strategy: 'naive' },
+      ctx: { sandboxClient: client },
+      maxIterations: 10,
+    })
+
+    expect(result.decision).toBe('done')
+    expect(result.iterations).toHaveLength(4)
+    // Order is asserted on the iteration record, which is deterministic. The dispatch log
+    // is compared sorted: the two fanout branches run concurrently, so they can reach the
+    // mock in either order, but each strategy must still be dispatched exactly once.
+    const expected = ['naive', 'careful', 'parallel-a', 'parallel-b']
+    expect(result.iterations.map((i) => i.task.strategy)).toEqual(expected)
+    expect(dispatched.map((d) => d.strategy).sort()).toEqual([...expected].sort())
+    // The fanout round dispatched its two branches across two distinct harnesses.
+    expect(result.iterations[2]?.agentRunName).toBe('worker-a')
+    expect(result.iterations[3]?.agentRunName).toBe('worker-b')
+    // Winner is the highest-valid-score attempt (0.9), earliest index breaks the tie.
+    expect(result.winner?.verdict?.valid).toBe(true)
+    expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6)
+    expect(result.winner?.iterationIndex).toBe(2)
+  })
+
+  it('runs an explicit refine→fanout→stop script across two harnesses', async () => {
+    const goal = 'explicit'
+    const moves: TopologyMove<Task>[] = [
+      { kind: 'refine', task: { goal, strategy: 'careful' } },
+      {
+        kind: 'fanout',
+        tasks: [
+          { goal, strategy: 'parallel-a' },
+          { goal, strategy: 'parallel-b' },
+        ],
+      },
+      { kind: 'stop' },
+    ]
+    let round = 0 // index of the next scripted move; doubles as a planner-call counter
+    const planner: TopologyPlanner<Task, Out> = () => moves[round++]!
+
+    const { client } = workerClient()
+    const result = await runLoop({
+      driver: createDynamicDriver<Task, Out>({ planner }),
+      agentRuns: workerSpecs(['claude-code', 'codex']),
+      output,
+      validator,
+      task: { goal, strategy: 'careful' },
+      ctx: { sandboxClient: client },
+    })
+
+    expect(result.decision).toBe('done')
+    expect(round).toBe(3) // one planner call per scripted move, including the final stop
+    // Assert the ordered iteration record (deterministic) rather than dispatch
+    // order, which races across the concurrent fanout branches. The kernel maps
+    // iteration index N to agentRuns[N % len], so the fanout spans both harnesses.
+    expect(result.iterations.map((i) => [i.agentRunName, i.task.strategy])).toEqual([
+      ['claude-code', 'careful'],
+      ['codex', 'parallel-a'],
+      ['claude-code', 'parallel-b'],
+    ])
+    expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6)
+  })
+
+  it('terminates on the maxIterations cap even when the planner never stops', async () => {
+    const planner: TopologyPlanner<Task, Out> = () => ({
+      kind: 'refine',
+      task: { goal: 'forever', strategy: 'naive' }, // 'naive' scores 0.3, so no attempt is valid
+    })
+    const { client } = workerClient()
+    const result = await runLoop({
+      driver: createDynamicDriver<Task, Out>({ planner, maxIterations: 3 }),
+      agentRun: workerSpecs(['solo'])[0],
+      output,
+      validator,
+      task: { goal: 'forever', strategy: 'naive' },
+      ctx: { sandboxClient: client },
+      maxIterations: 10, // runLoop's own cap is looser than the driver's 3
+    })
+
+    expect(result.iterations).toHaveLength(3) // so the driver-level cap is what ended the run
+    expect(result.decision).toBe('done')
+  })
+
+  it('clamps a fanout move to maxFanout branches', async () => {
+    const moves: TopologyMove<Task>[] = [
+      {
+        kind: 'fanout',
+        tasks: Array.from({ length: 5 }, (_, i) => ({ goal: 'wide', strategy: `parallel-${i}` })),
+      },
+      { kind: 'stop' },
+    ]
+    let round = 0
+    const planner: TopologyPlanner<Task, Out> = () => moves[round++]!
+
+    const { client, dispatched } = workerClient() // dispatch order can race: compared sorted below
+    const result = await runLoop({
+      driver: createDynamicDriver<Task, Out>({ planner, maxFanout: 2 }),
+      agentRuns: workerSpecs(['a', 'b']),
+      output,
+      validator,
+      task: { goal: 'wide', strategy: 'parallel-0' },
+      ctx: { sandboxClient: client },
+    })
+
+    expect(result.iterations).toHaveLength(2) // 5 proposed branches, clamped to maxFanout
+    expect(dispatched.map((d) => d.strategy).sort()).toEqual(['parallel-0', 'parallel-1'])
+  })
+
+  it('fails loud on a fanout move with no tasks', async () => {
+    const planner: TopologyPlanner<Task, Out> = () => ({ kind: 'fanout', tasks: [] })
+    const { client } = workerClient()
+    await expect(
+      runLoop({
+        driver: createDynamicDriver<Task, Out>({ planner }),
+        agentRun: workerSpecs(['a'])[0],
+        output,
+        validator,
+        task: { goal: 'x', strategy: 'naive' },
+        ctx: { sandboxClient: client },
+      }),
+    ).rejects.toThrow(PlannerError) // an empty fanout must reject, not run zero branches
+  })
+
+  it('fails loud on an unknown move kind', async () => {
+    const planner = (() => ({ kind: 'teleport' })) as unknown as TopologyPlanner<Task, Out>
+    const { client } = workerClient()
+    await expect(
+      runLoop({
+        driver: createDynamicDriver<Task, Out>({ planner }),
+        agentRun: workerSpecs(['a'])[0],
+        output,
+        validator,
+        task: { goal: 'x', strategy: 'naive' },
+        ctx: { sandboxClient: client },
+      }),
+    ).rejects.toThrow(/unknown move kind/i) // 'teleport' only gets past the types via the cast
+  })
+})
+
+// One mock client that serves BOTH the planner agent and the workers, routed by profile
+// name: a profile named 'planner' gets the planner sandbox, anything else a worker sandbox.
+// The planner sandbox pulls "Iterations spent: N" out of the prompt the driver built and
+// replies with plannerMove(N) as a structured topology-move envelope — exercising the
+// real createSandboxPlanner → kernel → worker path.
+function plannerAndWorkerClient(plannerMove: (spent: number) => unknown) {
+  const dispatched: Array<{ harness: string; strategy: string }> = []
+  const plannerPrompts: string[] = []
+  // Planner sandbox: records the prompt, then emits the scripted move for the parsed count
+  // (0 when the prompt carries no "Iterations spent" marker).
+  const plannerSandbox = () => ({
+    async *streamPrompt(message: string) {
+      plannerPrompts.push(message)
+      const spent = Number(/Iterations spent: (\d+)/.exec(message)?.[1] ?? '0')
+      yield {
+        type: 'result',
+        data: { result: plannerMove(spent) },
+      } satisfies SandboxEvent
+    },
+  })
+  // Worker sandbox: logs the dispatch and scores the task from its strategy alone.
+  const workerSandbox = (harness: string) => ({
+    async *streamPrompt(message: string) {
+      const { strategy } = JSON.parse(message) as Task
+      dispatched.push({ harness, strategy })
+      yield {
+        type: 'result',
+        data: { strategy, harness, score: scoreFor(strategy) },
+      } satisfies SandboxEvent
+    },
+  })
+  // Route on the profile name; only streamPrompt is implemented, hence the cast.
+  const client = {
+    async create(opts?: CreateSandboxOptions): Promise<SandboxInstance> {
+      const prof = opts?.backend?.profile
+      const name = (prof && typeof prof === 'object' ? prof.name : undefined) ?? 'unknown'
+      const sandbox = name === 'planner' ? plannerSandbox() : workerSandbox(name)
+      return sandbox as unknown as SandboxInstance
+    },
+  }
+  return { dispatched, plannerPrompts, client }
+}
+
+describe('createSandboxPlanner', () => {
+  it('drives the loop end-to-end: planner agent authors refine→fanout→stop', async () => {
+    const goal = 'sandbox-planner'
+    const { client, plannerPrompts } = plannerAndWorkerClient((spent) => {
+      if (spent === 0) return { kind: 'refine', tasks: [{ goal, strategy: 'careful' }] }
+      if (spent === 1)
+        return {
+          kind: 'fanout',
+          tasks: [
+            { goal, strategy: 'parallel-a' },
+            { goal, strategy: 'parallel-b' },
+          ],
+        }
+      return { kind: 'stop' } // any later round ends the loop
+    })
+
+    const planner = createSandboxPlanner<Task, Out>({
+      client,
+      profile: profile('planner'), // the name the mock client routes on
+      decodeTask: (raw) => raw as Task,
+    })
+
+    const result = await runLoop({
+      driver: createDynamicDriver<Task, Out>({ planner }),
+      agentRuns: workerSpecs(['worker-a', 'worker-b']),
+      output,
+      validator,
+      task: { goal, strategy: 'naive' },
+      ctx: { sandboxClient: client },
+    })
+
+    expect(result.decision).toBe('done')
+    expect(result.iterations.map((i) => [i.agentRunName, i.task.strategy])).toEqual([
+      ['worker-a', 'careful'],
+      ['worker-b', 'parallel-a'],
+      ['worker-a', 'parallel-b'],
+    ])
+    expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6)
+    // The planner saw a growing history each round (its prompt carried the count).
+    expect(plannerPrompts).toHaveLength(3)
+    expect(plannerPrompts[0]).toMatch(/Iterations spent: 0/)
+    expect(plannerPrompts[2]).toMatch(/Iterations spent: 3/) // 1 refine + 2 fanout branches
+  })
+
+  it('expands the n shorthand into N copies of the root task', async () => {
+    const { client, dispatched } = plannerAndWorkerClient((spent) =>
+      spent === 0 ? { kind: 'fanout', n: 3 } : { kind: 'stop' },
+    )
+    const planner = createSandboxPlanner<Task, Out>({
+      client,
+      profile: profile('planner'),
+      decodeTask: (raw) => raw as Task,
+    })
+    const result = await runLoop({
+      driver: createDynamicDriver<Task, Out>({ planner, maxFanout: 4 }), // above n, so no clamping
+      agentRuns: workerSpecs(['a', 'b']),
+      output,
+      validator,
+      task: { goal: 'n-shorthand', strategy: 'parallel-root' },
+      ctx: { sandboxClient: client },
+    })
+    expect(dispatched).toHaveLength(3) // n: 3 → three worker dispatches
+    expect(dispatched.every((d) => d.strategy === 'parallel-root')).toBe(true)
+    expect(result.decision).toBe('done')
+  })
+
+  it('fails loud when the planner emits no parseable envelope', async () => {
+    const client = {
+      async create(): Promise<SandboxInstance> {
+        return {
+          async *streamPrompt() {
+            yield { type: 'message', data: { text: 'I think we should keep going!' } }
+          },
+        } as unknown as SandboxInstance
+      },
+    }
+    const planner = createSandboxPlanner<Task, Out>({
+      client,
+      profile: profile('planner'),
+      decodeTask: (raw) => raw as Task,
+    })
+    await expect(
+      runLoop({
+        driver: createDynamicDriver<Task, Out>({ planner }),
+        agentRun: workerSpecs(['a'])[0],
+        output,
+        validator,
+        task: { goal: 'x', strategy: 'naive' },
+        ctx: { sandboxClient: client },
+      }),
+    ).rejects.toThrow(/no parseable topology-move envelope/i) // the only event is plain prose
+  })
+
+  it('parses a fenced JSON envelope from a text delta', async () => {
+    let plannerRound = 0 // planner sandboxes created so far: the first refines, later ones stop
+    const client = {
+      async create(opts?: CreateSandboxOptions): Promise<SandboxInstance> {
+        const name =
+          (opts?.backend?.profile && typeof opts.backend.profile === 'object'
+            ? opts.backend.profile.name
+            : undefined) ?? 'unknown'
+        if (name === 'planner') {
+          const fenced =
+            plannerRound++ === 0
+              ? '```json\n{"kind":"refine","tasks":[{"goal":"g","strategy":"parallel-x"}]}\n```'
+              : '```json\n{"kind":"stop"}\n```'
+          return {
+            async *streamPrompt() {
+              yield { type: 'message.delta', data: { text: `here is my plan:\n${fenced}` } }
+            },
+          } as unknown as SandboxInstance
+        }
+        return {
+          async *streamPrompt(message: string) {
+            const task = JSON.parse(message) as Task
+            yield {
+              type: 'result',
+              data: { strategy: task.strategy, harness: name, score: scoreFor(task.strategy) },
+            } satisfies SandboxEvent
+          },
+        } as unknown as SandboxInstance
+      },
+    }
+    const planner = createSandboxPlanner<Task, Out>({
+      client,
+      profile: profile('planner'),
+      decodeTask: (raw) => raw as Task,
+    })
+    const result = await runLoop({
+      driver: createDynamicDriver<Task, Out>({ planner }),
+      agentRun: workerSpecs(['a'])[0],
+      output,
+      validator,
+      task: { goal: 'g', strategy: 'naive' },
+      ctx: { sandboxClient: client },
+    })
+    expect(result.decision).toBe('done')
+    expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6) // 'parallel-x' scores 0.9
+  })
+
+  it('surfaces a decodeTask rejection as a PlannerError', async () => {
+    const client = {
+      async create(): Promise<SandboxInstance> {
+        return {
+          async *streamPrompt() {
+            yield { type: 'result', data: { result: { kind: 'refine', tasks: [{ bad: true }] } } }
+          },
+        } as unknown as SandboxInstance
+      },
+    }
+    const planner = createSandboxPlanner<Task, Out>({
+      client,
+      profile: profile('planner'),
+      decodeTask: (raw) => {
+        const t = raw as Partial<Task>
+        if (typeof t.strategy !== 'string') throw new Error('missing strategy')
+        return t as Task
+      },
+    })
+    await expect(
+      runLoop({
+        driver: createDynamicDriver<Task, Out>({ planner }),
+        agentRun: workerSpecs(['a'])[0],
+        output,
+        validator,
+        task: { goal: 'x', strategy: 'naive' },
+        ctx: { sandboxClient: client },
+      }),
+    ).rejects.toThrow(PlannerError) // decodeTask threw a plain Error; a PlannerError must surface
+  })
+})
diff --git a/tests/optimize-prompt.test.ts b/tests/optimize-prompt.test.ts
new file mode 100644
index 0000000..2137468
--- /dev/null
+++ b/tests/optimize-prompt.test.ts
@@ -0,0 +1,136 @@
+import {
+  type ImprovementDriver,
+  inMemoryCampaignStorage,
+  type JudgeConfig,
+  type MutableSurface,
+  type ProposedCandidate,
+  type Scenario,
+} from '@tangle-network/agent-eval/campaign'
+import { describe, expect, it } from 'vitest'
+import { ConfigError } from '../src/errors'
+import { optimizePrompt } from '../src/improvement'
+
+interface SumScenario extends Scenario {
+  kind: 'sum' // the only scenario kind these tests use
+}
+
+interface SumArtifact {
+  text: string // the prompt that produced the artifact (runWithPrompt echoes it back)
+  quality: number // 0.9 or 0.4 from runWithPrompt; the judge scores on this alone
+}
+
+// The artifact's quality is a pure function of the prompt text: a prompt containing
+// "PRECISE" yields a high-quality artifact (0.9), any other prompt a low-quality one (0.4).
+// This is the measurable signal the gate steers on — a candidate only wins if it lifts
+// quality on the held-out scenarios.
+const runWithPrompt = async (prompt: string): Promise<SumArtifact> => {
+  const quality = prompt.includes('PRECISE') ? 0.9 : 0.4
+  return { text: prompt, quality }
+}
+
+const qualityJudge: JudgeConfig<SumArtifact, SumScenario> = {
+  name: 'quality',
+  dimensions: [{ key: 'quality', description: 'artifact quality 0..1' }],
+  score({ artifact: { quality } }) { // the sole dimension, reused as the composite
+    return { dimensions: { quality }, composite: quality, notes: '' }
+  },
+}
+
+const scenarios: SumScenario[] = [ // passed as `scenarios` in baseOpts
+  { id: 't1', kind: 'sum' },
+  { id: 't2', kind: 'sum' },
+]
+const holdoutScenarios: SumScenario[] = [ // passed as `holdoutScenarios`; ids differ from above
+  { id: 'h1', kind: 'sum' },
+  { id: 'h2', kind: 'sum' },
+]
+
+const BASELINE = 'Summarize the text.' // contains no "PRECISE", so runWithPrompt scores it 0.4
+
+/** Deterministic driver standing in for `gepaDriver`: every `propose()` call resolves to a
+ *  one-element list holding the candidate the test wants measured, so the optimization
+ *  loop runs with zero LLM. */
+function fixedDriver(candidate: MutableSurface | ProposedCandidate): ImprovementDriver {
+  const driver: ImprovementDriver = {
+    kind: 'test-fixed',
+    propose: async () => [candidate],
+  }
+  return driver
+}
+
+const baseOpts = { // shared by every test; each adds runDir, storage and (usually) a driver
+  runWithPrompt,
+  scenarios,
+  holdoutScenarios,
+  judges: [qualityJudge],
+  baselinePrompt: BASELINE,
+  populationSize: 1,
+  maxGenerations: 1,
+  promoteTopK: 1,
+  deltaThreshold: 0.1, // presumably the minimum holdout lift needed to ship — TODO confirm
+  seed: 7,
+}
+
+describe('optimizePrompt — identity-gated prompt optimization', () => {
+  it('keeps the baseline (identity) when no candidate beats it on holdout', async () => {
+    // Candidate is just as vague as the baseline → no held-out lift.
+    const result = await optimizePrompt<SumScenario, SumArtifact>({
+      ...baseOpts,
+      runDir: 'mem://optimize-identity',
+      storage: inMemoryCampaignStorage(),
+      driver: fixedDriver('Summarize the text concisely.'),
+    })
+
+    expect(result.improved).toBe(false)
+    expect(result.decision).not.toBe('ship')
+    expect(result.prompt).toBe(BASELINE)
+    expect(result.baselineComposite).toBeCloseTo(0.4, 6) // BASELINE has no "PRECISE" → 0.4
+    // No regression possible: the returned prompt is the untouched baseline.
+    expect(result.delta).toBeLessThan(0.1)
+  })
+
+  it('promotes a candidate that wins on holdout, returning the improved prompt', async () => {
+    const improvedPrompt = 'Summarize the text. Be PRECISE.' // "PRECISE" lifts quality to 0.9
+    const result = await optimizePrompt<SumScenario, SumArtifact>({
+      ...baseOpts,
+      runDir: 'mem://optimize-promote',
+      storage: inMemoryCampaignStorage(),
+      driver: fixedDriver({
+        surface: improvedPrompt,
+        label: 'add precision',
+        rationale: 'precision lifts quality',
+      }),
+    })
+
+    expect(result.improved).toBe(true)
+    expect(result.decision).toBe('ship')
+    expect(result.prompt).toBe(improvedPrompt)
+    expect(result.winnerComposite).toBeCloseTo(0.9, 6)
+    expect(result.baselineComposite).toBeCloseTo(0.4, 6)
+    expect(result.delta).toBeGreaterThanOrEqual(0.1) // same value as baseOpts.deltaThreshold
+    expect(result.rationale).toBe('precision lifts quality')
+    expect(result.diff).not.toBe('')
+  })
+
+  it('fails loud when neither reflection nor a driver is supplied', async () => {
+    await expect(
+      optimizePrompt<SumScenario, SumArtifact>({
+        ...baseOpts,
+        runDir: 'mem://optimize-misconfig',
+        storage: inMemoryCampaignStorage(), // baseOpts carries no driver or reflection either
+      }),
+    ).rejects.toThrow(ConfigError)
+  })
+
+  it('fails loud on an empty holdout set (the gate needs it)', async () => {
+    await expect(
+      optimizePrompt<SumScenario, SumArtifact>({
+        ...baseOpts,
+        holdoutScenarios: [], // overrides the two holdout scenarios from baseOpts
+        runDir: 'mem://optimize-noholdout',
+        storage: inMemoryCampaignStorage(),
+        driver: fixedDriver('whatever'),
+      }),
+    ).rejects.toThrow(/holdoutScenarios/)
+  })
+})

From d8c237ef62f28b6f394b6415440c33a1e70ad145 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 30 May 2026 19:03:03 -0600
Subject: [PATCH 7/7] =?UTF-8?q?chore(release):=200.33.0=20=E2=80=94=20dyna?=
 =?UTF-8?q?mic=20loop=20driver=20+=20identity-gated=20optimizePrompt=20(#7?=
 =?UTF-8?q?5)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index a2f128b..c190a6e 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.32.0",
+  "version": "0.33.0",
   "description": "Reusable runtime lifecycle for domain-specific agents.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {