From 570326b5ebe6bf065369c2ad405492196b48322b Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 08:47:22 -0600 Subject: [PATCH 1/7] feat(loops): surface aggregated tokenUsage on LoopResult + reportLoopUsage bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runLoop tracked per-call tokensIn/tokensOut (extractLlmCallEvent) but only aggregated costUsd — token counts were dropped before reaching Iteration or LoopResult. A runProfileMatrix/runCampaign dispatch wrapping runLoop could report cost but had no tokens to report, so agent-eval's backend-integrity guard (assertRealBackend, which keys on tokenUsage) would misread a real run as a stub and throw. - Iteration + LoopResult gain tokenUsage: { input, output }, summed across every llm_call event (per iteration) and across iterations (LoopResult). - reportLoopUsage(cost, result) forwards a finished loop's cost + tokens into a campaign cost meter in one call — the trivial consumption path for the new runProfileMatrix primitive. Typed structurally so loops stay free of an agent-eval import. Extends the existing cost-aggregation test to assert token aggregation + reportLoopUsage forwarding. Full suite 381 green. 
--- src/loops/index.ts | 2 ++ src/loops/report-usage.ts | 41 ++++++++++++++++++++++++++++++++++++++ src/loops/run-loop.ts | 12 +++++++++++ src/loops/types.ts | 15 ++++++++++++++ tests/loops/refine.test.ts | 22 ++++++++++++++++++++ 5 files changed, 92 insertions(+) create mode 100644 src/loops/report-usage.ts diff --git a/src/loops/index.ts b/src/loops/index.ts index ae9fad3..7d74986 100644 --- a/src/loops/index.ts +++ b/src/loops/index.ts @@ -27,6 +27,7 @@ export type { CreateRefineDriverOptions, RefineDecision } from './drivers/refine export { createRefineDriver, refineWinnerIndex } from './drivers/refine' export type { RunLoopOptions } from './run-loop' export { runLoop } from './run-loop' +export { reportLoopUsage, type UsageSink } from './report-usage' export type { AgentRunSpec, DefaultVerdict, @@ -42,6 +43,7 @@ export type { LoopSandboxClient, LoopSandboxPlacement, LoopStartedPayload, + LoopTokenUsage, LoopTraceEmitter, LoopTraceEvent, LoopWinner, diff --git a/src/loops/report-usage.ts b/src/loops/report-usage.ts new file mode 100644 index 0000000..30c9a6d --- /dev/null +++ b/src/loops/report-usage.ts @@ -0,0 +1,41 @@ +/** + * Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix + * dispatch. + * + * `runProfileMatrix` (and `runCampaign`) run the backend-integrity guard over + * the token usage a dispatch reports through `ctx.cost`. A dispatch that wraps + * `runLoop` must forward the loop's cost AND token usage, or the guard reads + * the run as a stub and throws. `reportLoopUsage` is that one line: + * + * const dispatch: ProfileDispatchFn = async (profile, scenario, ctx) => { + * const result = await runLoop({ ...optsFor(profile, scenario), ctx: loopCtx }) + * reportLoopUsage(ctx.cost, result) + * return result.winner?.output as A + * } + * + * Typed structurally against the campaign `DispatchContext.cost` so this module + * stays free of an agent-eval import — it works with any cost meter exposing + * `observe` + `observeTokens`. 
+ */ + +import type { LoopResult } from './types' + +/** The slice of an agent-eval campaign `DispatchContext.cost` this needs. */ +export interface UsageSink { + observe(amountUsd: number, source: string): void + observeTokens(usage: { input: number; output: number }): void +} + +/** + * Forward a `LoopResult`'s aggregated cost + token usage into a campaign cost + * meter so the backend-integrity guard sees real LLM activity. `source` + * defaults to `'loop'`. + */ +export function reportLoopUsage( + cost: UsageSink, + result: Pick, 'costUsd' | 'tokenUsage'>, + source = 'loop', +): void { + cost.observe(result.costUsd, source) + cost.observeTokens({ input: result.tokenUsage.input, output: result.tokenUsage.output }) +} diff --git a/src/loops/run-loop.ts b/src/loops/run-loop.ts index fafc52a..c7c8a77 100644 --- a/src/loops/run-loop.ts +++ b/src/loops/run-loop.ts @@ -149,6 +149,7 @@ export async function runLoop( startedAt: now(), endedAt: 0, costUsd: 0, + tokenUsage: { input: 0, output: 0 }, }) } @@ -288,6 +289,8 @@ async function executeIteration(args: ExecuteIterationArgs( ): LoopResult { const winner = (args.options.selectWinner ?? defaultSelectWinner)(args.iterations) const costUsd = args.iterations.reduce((sum, iter) => sum + (iter.costUsd || 0), 0) + const tokenUsage = args.iterations.reduce( + (acc, iter) => { + acc.input += iter.tokenUsage?.input ?? 0 + acc.output += iter.tokenUsage?.output ?? 0 + return acc + }, + { input: 0, output: 0 }, + ) const result: LoopResult = { decision: args.decision, iterations: args.iterations, winner, durationMs: args.now() - args.startMs, costUsd, + tokenUsage, } void emitTrace(args.options.ctx.traceEmitter, { kind: 'loop.ended', diff --git a/src/loops/types.ts b/src/loops/types.ts index a28eae0..a183cd9 100644 --- a/src/loops/types.ts +++ b/src/loops/types.ts @@ -90,6 +90,15 @@ export interface OutputAdapter { parse(events: SandboxEvent[]): Output } +/** LLM token usage. 
Structurally matches agent-eval's `RunTokenUsage` / + * `CampaignTokenUsage` ({ input, output }) so a loop result maps straight + * onto `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch — without + * which the backend-integrity guard reads the run as a stub. */ +export interface LoopTokenUsage { + input: number + output: number +} + /** @experimental */ export interface Iteration { /** 0-based iteration index assigned by the kernel. */ @@ -105,6 +114,8 @@ export interface Iteration { startedAt: number endedAt: number costUsd: number + /** Summed LLM token usage across every `llm_call` event in this iteration. */ + tokenUsage: LoopTokenUsage } /** @experimental */ @@ -144,6 +155,10 @@ export interface LoopResult { durationMs: number /** Sum of every iteration's `costUsd`. */ costUsd: number + /** Sum of every iteration's token usage. Forward to + * `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch so the + * integrity guard sees real LLM activity. */ + tokenUsage: LoopTokenUsage } /** diff --git a/tests/loops/refine.test.ts b/tests/loops/refine.test.ts index 50121de..a8b4915 100644 --- a/tests/loops/refine.test.ts +++ b/tests/loops/refine.test.ts @@ -11,6 +11,7 @@ import { type LoopTraceEvent, type OutputAdapter, refineWinnerIndex, + reportLoopUsage, runLoop, type Validator, } from '../../src/loops' @@ -242,6 +243,27 @@ describe('runLoop + createRefineDriver', () => { expect(result.iterations[0]?.costUsd).toBeCloseTo(0.01, 9) expect(result.iterations[1]?.costUsd).toBeCloseTo(0.02, 9) expect(result.costUsd).toBeCloseTo(0.03, 9) + // Token usage must aggregate too — a runProfileMatrix dispatch forwards + // this to the backend-integrity guard; if it stayed 0/0 a real run would + // be misread as a stub. 
+ expect(result.iterations[0]?.tokenUsage).toEqual({ input: 100, output: 50 }) + expect(result.iterations[1]?.tokenUsage).toEqual({ input: 80, output: 30 }) + expect(result.tokenUsage).toEqual({ input: 180, output: 80 }) + + // reportLoopUsage forwards both cost AND tokens into a campaign cost meter. + const observed: Array<{ usd: number; src: string }> = [] + let tokens = { input: 0, output: 0 } + reportLoopUsage( + { + observe: (usd, src) => observed.push({ usd, src }), + observeTokens: (u) => { + tokens = u + }, + }, + result, + ) + expect(observed).toEqual([{ usd: 0.03, src: 'loop' }]) + expect(tokens).toEqual({ input: 180, output: 80 }) }) it('refineWinnerIndex returns the last valid iteration', () => { From 9cbd6862d806a53c113ae065b1c71673f8d222d8 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 08:52:12 -0600 Subject: [PATCH 2/7] =?UTF-8?q?chore(deps):=20bump=20@tangle-network/agent?= =?UTF-8?q?-eval=20^0.54.0=20=E2=86=92=20^0.61.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consumes the published runProfileMatrix + token-capture release. 7-minor jump verified: typecheck + build + full suite (381) green. 
--- package.json | 2 +- pnpm-lock.yaml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/package.json b/package.json index a6c2aca..4565a5a 100644 --- a/package.json +++ b/package.json @@ -76,7 +76,7 @@ "typecheck": "tsc --noEmit" }, "dependencies": { - "@tangle-network/agent-eval": "^0.54.0" + "@tangle-network/agent-eval": "^0.61.0" }, "devDependencies": { "@biomejs/biome": "^2.4.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7e4087e..ee94426 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,8 +9,8 @@ importers: .: dependencies: '@tangle-network/agent-eval': - specifier: ^0.54.0 - version: 0.54.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) + specifier: ^0.61.0 + version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) '@tangle-network/agent-knowledge': specifier: '>=1.3.0 <2.0.0' version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) @@ -458,12 +458,12 @@ packages: engines: {node: '>=20'} hasBin: true - '@tangle-network/agent-eval@0.54.0': - resolution: {integrity: sha512-9dmCfXOBZHbmX//RrN/8iKUfmTB21hwjKEWD6qWFszwNK7/KoCzootKsYr6s1yt2vCoX1F54LjwE9qn1VNfUKw==} + '@tangle-network/agent-eval@0.61.0': + resolution: {integrity: sha512-yydVL47bNa2lNaapgFnDKjYRPJfpVTK8luFASCuLNyKtahibMM7bXF+JCScKhYdtEwCYiUijZI0F8VaoIvDi3g==} engines: {node: '>=20'} hasBin: true peerDependencies: - '@tangle-network/sandbox': '>=0.2.1 <0.4.0' + '@tangle-network/sandbox': '>=0.2.1 <0.5.0' peerDependenciesMeta: '@tangle-network/sandbox': optional: true @@ -1289,7 +1289,7 @@ snapshots: - typescript - utf-8-validate - '@tangle-network/agent-eval@0.54.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': + '@tangle-network/agent-eval@0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2) 
'@ax-llm/ax': 19.0.45(zod@4.4.2) From 01f3b2874ad05cdbd157687b696fa7b388da84aa Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 09:08:21 -0600 Subject: [PATCH 3/7] =?UTF-8?q?feat(loops):=20loopDispatch=20=E2=80=94=20f?= =?UTF-8?q?irst-class=20runLoop=E2=86=92campaign=20dispatch=20adapter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The seam critique found reportLoopUsage had one consumer (a test) and zero products: wiring runLoop into runProfileMatrix/runCampaign required hand-building ExecCtx, hand-adapting the campaign trace, and remembering to forward usage (forgetting the last yields a {0,0} stub cell). loopDispatch collapses all three into one typed call: const dispatch = loopDispatch({ sandboxClient, toLoopOptions }) await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha }) It builds the ExecCtx, forwards loop.* trace events into the campaign's scoped trace (campaignTraceToLoopEmitter), runs runLoop, reports cost+tokens via reportLoopUsage internally, and returns winner.output. loopCampaignDispatch is the runCampaign (no-profile) variant. AgentProfile imported from agent-eval (the eval-harness type ProfileDispatchFn keys on), NOT sandbox's — closes the name-collision footgun at this call site. Tests: returns winner artifact + reports exact usage + forwards trace spans; usage still flows on a validator-failing run (must not read as a stub). Full suite 383 green. 
--- src/loops/index.ts | 6 ++ src/loops/loop-dispatch.ts | 132 +++++++++++++++++++++++ tests/loops/loop-dispatch.test.ts | 168 ++++++++++++++++++++++++++++++ 3 files changed, 306 insertions(+) create mode 100644 src/loops/loop-dispatch.ts create mode 100644 tests/loops/loop-dispatch.test.ts diff --git a/src/loops/index.ts b/src/loops/index.ts index 7d74986..d854266 100644 --- a/src/loops/index.ts +++ b/src/loops/index.ts @@ -28,6 +28,12 @@ export { createRefineDriver, refineWinnerIndex } from './drivers/refine' export type { RunLoopOptions } from './run-loop' export { runLoop } from './run-loop' export { reportLoopUsage, type UsageSink } from './report-usage' +export { + loopCampaignDispatch, + loopDispatch, + type LoopDispatchOptions, + type LoopOptionsForDispatch, +} from './loop-dispatch' export type { AgentRunSpec, DefaultVerdict, diff --git a/src/loops/loop-dispatch.ts b/src/loops/loop-dispatch.ts new file mode 100644 index 0000000..e4c9a77 --- /dev/null +++ b/src/loops/loop-dispatch.ts @@ -0,0 +1,132 @@ +/** + * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch. + * + * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` / + * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a + * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a + * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to + * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a + * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns, + * the third silent. The fleet's products skipped (c) and fell back to a + * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists + * to kill. 
+ * + * `loopDispatch` collapses all three into one typed call: + * + * const dispatch = loopDispatch({ + * sandboxClient, + * toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }), + * }) + * await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha }) + * + * Usage is reported automatically; trace events are forwarded automatically; + * the ctx is built automatically. The seam becomes impossible to mis-wire. + * + * Typed structurally against the campaign `DispatchContext` (imported type-only + * from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an + * inversion. + */ + +// agent-eval's AgentProfile (the eval-harness unit of variation, `model: string`) +// — NOT sandbox's AgentProfile. ProfileDispatchFn is keyed on the former. +import type { AgentProfile } from '@tangle-network/agent-eval' +import type { + CampaignTraceWriter, + DispatchContext, + DispatchFn, + ProfileDispatchFn, + Scenario, +} from '@tangle-network/agent-eval/campaign' +import { reportLoopUsage } from './report-usage' +import { type RunLoopOptions, runLoop } from './run-loop' +import type { LoopResult, LoopSandboxClient, LoopTraceEmitter } from './types' + +/** runLoop options minus the `ctx` (loopDispatch builds the ctx). */ +export type LoopOptionsForDispatch = Omit< + RunLoopOptions, + 'ctx' +> + +export interface LoopDispatchOptions { + /** Sandbox client used for every cell's `runLoop`. Supplied once. */ + sandboxClient: LoopSandboxClient + /** Build the per-cell runLoop options from the scenario (+ profile, when + * used with `runProfileMatrix`). */ + toLoopOptions: ( + scenario: TScenario, + profile: AgentProfile, + ) => LoopOptionsForDispatch + /** Map the finished loop to the artifact the judges score. Default: + * `result.winner?.output`. A loop with no winner yields `undefined` (judges + * skip the cell) — but the loop's token usage is STILL reported, so the + * integrity guard sees real activity. 
*/ + toArtifact?: (result: LoopResult) => TArtifact + /** Forward `loop.*` trace events into the campaign's scoped trace so loop + * spans correlate with the cell. Default true. */ + forwardTrace?: boolean + /** Cost-meter source label for the loop's spend. Default `'loop'`. */ + costSource?: string +} + +/** Bridge a campaign `DispatchContext.trace` to a `LoopTraceEmitter` so every + * `loop.*` event lands as a span under the cell's scoped trace. */ +function campaignTraceToLoopEmitter(trace: CampaignTraceWriter): LoopTraceEmitter { + return { + emit(event) { + trace + .span(event.kind, { runId: event.runId, timestamp: event.timestamp, ...event.payload }) + .end() + }, + } +} + +async function runLoopForCell( + opts: LoopDispatchOptions, + scenario: TScenario, + profile: AgentProfile, + ctx: DispatchContext, +): Promise { + const loopOptions = opts.toLoopOptions(scenario, profile) + const result = await runLoop({ + ...loopOptions, + ctx: { + sandboxClient: opts.sandboxClient, + signal: ctx.signal, + traceEmitter: + opts.forwardTrace === false ? undefined : campaignTraceToLoopEmitter(ctx.trace), + }, + }) + reportLoopUsage(ctx.cost, result, opts.costSource ?? 'loop') + const toArtifact = + opts.toArtifact ?? ((r: LoopResult) => r.winner?.output as TArtifact) + return toArtifact(result) +} + +/** + * Adapter for `runProfileMatrix` (profile is an axis). Returns a + * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and + * reports usage automatically. + */ +export function loopDispatch( + opts: LoopDispatchOptions, +): ProfileDispatchFn { + return (profile, scenario, ctx) => runLoopForCell(opts, scenario, profile, ctx) +} + +/** + * Adapter for `runCampaign` (no profile axis). `toLoopOptions` receives only + * the scenario; the `profile` passed to the shared core is a stable sentinel + * so a single `runLoop` config is reused across cells. 
+ */ +export function loopCampaignDispatch( + opts: Omit, 'toLoopOptions'> & { + toLoopOptions: (scenario: TScenario) => LoopOptionsForDispatch + }, +): DispatchFn { + const profileSentinel = { id: 'loop-campaign', model: 'n/a@loop-campaign' } as AgentProfile + const profiled: LoopDispatchOptions = { + ...opts, + toLoopOptions: (scenario) => opts.toLoopOptions(scenario), + } + return (scenario, ctx) => runLoopForCell(profiled, scenario, profileSentinel, ctx) +} diff --git a/tests/loops/loop-dispatch.test.ts b/tests/loops/loop-dispatch.test.ts new file mode 100644 index 0000000..517a19c --- /dev/null +++ b/tests/loops/loop-dispatch.test.ts @@ -0,0 +1,168 @@ +import type { + AgentProfile as SandboxAgentProfile, + CreateSandboxOptions, + SandboxEvent, + SandboxInstance, +} from '@tangle-network/sandbox' +import type { DispatchContext } from '@tangle-network/agent-eval/campaign' +import { describe, expect, it } from 'vitest' +import { + type AgentRunSpec, + createRefineDriver, + loopDispatch, + type OutputAdapter, + type Validator, +} from '../../src/loops' + +interface Task { + goal: string +} +interface Output { + attempt: number +} +interface FakeScenario { + id: string + kind: string +} + +const sandboxProfile: SandboxAgentProfile = { name: 'stub' } + +function spec(): AgentRunSpec { + return { profile: sandboxProfile, name: 'agent', taskToPrompt: (t) => t.goal } +} + +const output: OutputAdapter = { + parse: (events) => { + const data = events.at(-1)?.data as { attempt?: number } | undefined + return { attempt: typeof data?.attempt === 'number' ? 
data.attempt : -1 } + }, +} + +const passAlways: Validator = { + async validate(out) { + return { valid: true, score: 1, scores: { attempt: out.attempt } } + }, +} + +function stubClient(events: SandboxEvent[]): { create(opts?: CreateSandboxOptions): Promise } { + return { + async create() { + return { + async *streamPrompt() { + for (const e of events) yield e + }, + } as unknown as SandboxInstance + }, + } +} + +/** Minimal campaign DispatchContext that records what the dispatch reports. */ +function fakeDispatchContext(): { + ctx: DispatchContext + observed: Array<{ usd: number; src: string }> + tokens: { input: number; output: number } + spans: string[] +} { + const observed: Array<{ usd: number; src: string }> = [] + const tokens = { input: 0, output: 0 } + const spans: string[] = [] + const ctx: DispatchContext = { + cellId: 'cell-0', + rep: 0, + seed: 1, + signal: new AbortController().signal, + trace: { + span(name: string) { + spans.push(name) + return { end() {}, setAttribute() {} } + }, + async flush() {}, + }, + artifacts: { + async write() { + return 'p' + }, + async writeJson() { + return 'p' + }, + }, + cost: { + observe(usd: number, src: string) { + observed.push({ usd, src }) + }, + observeTokens(u: { input: number; output: number }) { + tokens.input += u.input + tokens.output += u.output + }, + current() { + return 0 + }, + tokens() { + return tokens + }, + }, + } + return { ctx, observed, tokens, spans } +} + +describe('loopDispatch', () => { + it('bridges runLoop into a ProfileDispatchFn: returns the winner artifact, reports usage, forwards trace', async () => { + const sandboxClient = stubClient([ + { type: 'llm_call', data: { tokensIn: 150, tokensOut: 60, costUsd: 0.02, model: 'm' } }, + { type: 'result', data: { attempt: 2 } }, + ]) + const dispatch = loopDispatch({ + sandboxClient, + toLoopOptions: (scenario) => ({ + driver: createRefineDriver(), + agentRun: spec(), + output, + validator: passAlways, + task: { goal: scenario.id }, + 
maxIterations: 1, + }), + }) + + const fake = fakeDispatchContext() + const profile = { id: 'baseline', model: 'test-model@2025-01-01' } + const artifact = await dispatch(profile, { id: 's1', kind: 'task' }, fake.ctx) + + // Returns the loop's winner output. + expect(artifact).toEqual({ attempt: 2 }) + // Usage reported to the campaign cost meter — the integrity guard's input. + expect(fake.observed).toEqual([{ usd: 0.02, src: 'loop' }]) + expect(fake.tokens).toEqual({ input: 150, output: 60 }) + // Loop trace events forwarded into the campaign trace as spans. + expect(fake.spans).toContain('loop.started') + expect(fake.spans).toContain('loop.ended') + }) + + it('reports usage even when the run fails the validator (real activity must NOT read as a stub)', async () => { + const failAlways: Validator = { + async validate() { + return { valid: false, score: 0, scores: {}, notes: 'no' } + }, + } + const sandboxClient = stubClient([ + { type: 'llm_call', data: { tokensIn: 90, tokensOut: 20, costUsd: 0.01, model: 'm' } }, + { type: 'result', data: { attempt: 1 } }, + ]) + const dispatch = loopDispatch({ + sandboxClient, + toLoopOptions: (scenario) => ({ + driver: createRefineDriver(), + agentRun: spec(), + output, + validator: failAlways, + task: { goal: scenario.id }, + maxIterations: 1, + }), + }) + const fake = fakeDispatchContext() + await dispatch({ id: 'p', model: 'm@2025-01-01' }, { id: 's1', kind: 'task' }, fake.ctx) + // The validator failed, but real LLM activity happened — tokens + cost MUST + // still reach the cost meter, or the integrity guard would call it a stub. 
+ expect(fake.tokens).toEqual({ input: 90, output: 20 }) + expect(fake.observed).toEqual([{ usd: 0.01, src: 'loop' }]) + }) +}) From fad618ee7e7eba2fa0fb1c76c554eda658faa60c Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 09:25:05 -0600 Subject: [PATCH 4/7] chore(deps): declare agent-eval as a required peerDependency, not a hard dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Version-discipline fix (boundary critique, VERSIONING 3/10). agent-eval was the lone hard dependency while sandbox + agent-knowledge are already peers. A hard dep lets pnpm install a SECOND, divergent agent-eval tree with an incompatible RunRecord/DefaultVerdict; today only pnpm.overrides prevents it. As a peer (>=0.61.0 <1.0.0, required — not optional), a consumer running a stale or divergent substrate gets a loud unmet-peer warning instead of a silent split tree. agent-eval moves to devDependencies for agent-runtime's own build/test. Typecheck + full suite (383) green with the peer layout. 
--- package.json | 6 +++--- pnpm-lock.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/package.json b/package.json index 4565a5a..7835731 100644 --- a/package.json +++ b/package.json @@ -75,11 +75,10 @@ "lint:fix": "biome check --write src tests examples", "typecheck": "tsc --noEmit" }, - "dependencies": { - "@tangle-network/agent-eval": "^0.61.0" - }, + "dependencies": {}, "devDependencies": { "@biomejs/biome": "^2.4.0", + "@tangle-network/agent-eval": "^0.61.0", "@tangle-network/sandbox": "^0.4.0", "@types/node": "^25.6.0", "tsup": "^8.0.0", @@ -101,6 +100,7 @@ "license": "MIT", "packageManager": "pnpm@10.28.0", "peerDependencies": { + "@tangle-network/agent-eval": ">=0.61.0 <1.0.0", "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0", "@tangle-network/sandbox": ">=0.1.2 <0.5.0" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ee94426..45dc755 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,9 +8,6 @@ importers: .: dependencies: - '@tangle-network/agent-eval': - specifier: ^0.61.0 - version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) '@tangle-network/agent-knowledge': specifier: '>=1.3.0 <2.0.0' version: 1.4.0(typescript@5.9.3)(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) @@ -18,6 +15,9 @@ importers: '@biomejs/biome': specifier: ^2.4.0 version: 2.4.15 + '@tangle-network/agent-eval': + specifier: ^0.61.0 + version: 0.61.0(@tangle-network/sandbox@0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) '@tangle-network/sandbox': specifier: ^0.4.0 version: 0.4.0(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) From ffc89ce123c22f797409b36cb7ac4d36817409e1 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 09:27:33 -0600 Subject: [PATCH 5/7] =?UTF-8?q?chore(release):=200.32.0=20=E2=80=94=20loop?= =?UTF-8?q?Dispatch=20adapter=20+=20tokenUsage=20seam=20+=20agent-eval=20p?= =?UTF-8?q?eer-dep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 7835731..a2f128b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.31.0", + "version": "0.32.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { From 39ccd427188d5d9f9595a417290098795dc9e23a Mon Sep 17 00:00:00 2001 From: tangletools Date: Sat, 30 May 2026 18:58:36 -0600 Subject: [PATCH 6/7] feat(loops+improvement): dynamic loop driver + identity-gated optimizePrompt (#75) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(loops): dynamic driver — agent-authored loop topology Third example driver alongside refine and fanout-vote, built on the existing Driver seam with zero kernel changes. Where refine/fanout-vote encode a fixed shape as a pure function of history, createDynamicDriver delegates the per-round shape to an injected TopologyPlanner that emits one TopologyMove (refine | fanout | stop) per round. - createDynamicDriver: maps moves onto plan/decide, enforces the iteration + fanout caps, fails loud (PlannerError) on a malformed move. Planner invoked once per round in plan(); decide() reads the cached move so an LLM planner is never double-called. 'done' is already a kernel-terminal decision, so termination needs no kernel change. - createSandboxPlanner: wires the planner to a sandbox profile (any harness) — streams a prompt carrying the history summary, decodes the move from a JSON envelope (structured result event or fenced block). - summarizeHistory: bounded, planner-friendly view of iteration history. - PlannerError added to the error taxonomy (carries 'validation'). 
Topology is orthogonal to harness: the planner never names a backend; the kernel's agentRuns round-robin decides which harness runs a branch, so one dynamic driver spans claude-code/codex/opencode/pi, including fanning a single round across several at once. 11 tests through the real kernel (sandbox stubbed at the process boundary): adaptive refine→refine→fanout→stop, explicit scripted trajectory across two harnesses, maxIterations cap, maxFanout clamp, empty-fanout + unknown-kind PlannerError, createSandboxPlanner end-to-end + n-shorthand + fenced-delta parse + decodeTask rejection. * feat(improvement): optimizePrompt — identity-gated optimization for any text prompt surface The text-surface entry point onto agent-eval's runImprovementLoop, sibling to improvementDriver (the code/worktree path). Defaults the driver to agent-eval's gepaDriver (reflective text mutator) and the gate to heldOutGate; runtime-agnostic via a single runWithPrompt seam. Identity-gated by construction: the loop runs evals, collects per-scenario signal, proposes candidates, and the held-out gate compares candidate vs baseline. result.prompt is the baseline (identity) UNLESS the gate decided 'ship' — so registering a prompt for optimization can never regress it; it only improves when held-out data earns it. Generic over the surface's execution (sandbox streamPrompt, runLoop, direct model call) — the optimizer never assumes how a prompt runs. Fails loud on misconfig (no driver/reflection, empty scenarios/holdout) and on a non-string CodeSurface (wrong entry point). 4 tests through the real runImprovementLoop, zero LLM (deterministic driver + judge + runner, in-memory storage): identity holds when no candidate beats baseline on holdout (returns the untouched baseline), promotes + returns the improved prompt + rationale when a candidate wins, fail-loud on misconfig and empty holdout. 
--------- Co-authored-by: Drew Stone --- src/errors.ts | 17 + src/improvement/index.ts | 21 +- src/improvement/optimize-prompt.ts | 242 ++++++++++++++ src/index.ts | 1 + src/loops/drivers/dynamic.ts | 217 ++++++++++++ src/loops/drivers/sandbox-planner.ts | 254 ++++++++++++++ src/loops/index.ts | 13 + tests/loops/dynamic.test.ts | 474 +++++++++++++++++++++++++++ tests/optimize-prompt.test.ts | 136 ++++++++ 9 files changed, 1369 insertions(+), 6 deletions(-) create mode 100644 src/improvement/optimize-prompt.ts create mode 100644 src/loops/drivers/dynamic.ts create mode 100644 src/loops/drivers/sandbox-planner.ts create mode 100644 tests/loops/dynamic.test.ts create mode 100644 tests/optimize-prompt.test.ts diff --git a/src/errors.ts b/src/errors.ts index 40e4716..5b71a7e 100644 --- a/src/errors.ts +++ b/src/errors.ts @@ -99,3 +99,20 @@ export class RuntimeRunStateError extends AgentEvalError { super('validation', message, options) } } + +/** + * @stable + * + * The dynamic-loop planner returned an unusable topology move — the LLM emitted + * no parseable envelope, an unknown `kind`, or a structurally-invalid move + * (e.g. a fanout with zero tasks). This is a structural failure of the + * agent-authored topology, not a config mistake: the planner ran but its output + * cannot drive the kernel. Carries `validation` so cross-package handlers can + * pattern-match without importing the runtime. Fail loud — never substitute a + * default move, or the loop silently runs a topology nobody chose. + */ +export class PlannerError extends AgentEvalError { + constructor(message: string, options?: { cause?: unknown }) { + super('validation', message, options) + } +} diff --git a/src/improvement/index.ts b/src/improvement/index.ts index 7a523d6..80eae77 100644 --- a/src/improvement/index.ts +++ b/src/improvement/index.ts @@ -1,11 +1,14 @@ /** - * `@tangle-network/agent-runtime` improvement drivers — implementations of - * agent-eval's `ImprovementDriver` contract. 
+ * `@tangle-network/agent-runtime` improvement — two entry points onto + * agent-eval's `runImprovementLoop`: * - * ONE driver (`improvementDriver`) owns the candidate lifecycle; pluggable - * `CandidateGenerator`s set the cost/capability dial: - * - `reflectiveGenerator` — cheap, no sandbox, applies pre-drafted patches - * - `agenticGenerator` — full coding harness in the worktree, multi-shot + * - `improvementDriver` (CODE surface) — owns the candidate lifecycle via a + * pluggable `CandidateGenerator`: + * - `reflectiveGenerator` — cheap, no sandbox, applies pre-drafted patches + * - `agenticGenerator` — full coding harness in the worktree, multi-shot + * - `optimizePrompt` (TEXT surface) — identity-gated optimization of any + * system / planner prompt. Defaults to agent-eval's `gepaDriver` + + * `heldOutGate`; returns the baseline unless the held-out gate ships a win. */ export { type AgenticGeneratorOptions, agenticGenerator } from './agentic-generator' @@ -14,4 +17,10 @@ export { type ImprovementDriverOptions, improvementDriver, } from './improvement-driver' +export { + type OptimizePromptOptions, + type OptimizePromptReflection, + type OptimizePromptResult, + optimizePrompt, +} from './optimize-prompt' export { type ReflectiveGeneratorOptions, reflectiveGenerator } from './reflective-generator' diff --git a/src/improvement/optimize-prompt.ts b/src/improvement/optimize-prompt.ts new file mode 100644 index 0000000..1c822c8 --- /dev/null +++ b/src/improvement/optimize-prompt.ts @@ -0,0 +1,242 @@ +/** + * @experimental + * + * `optimizePrompt` — identity-gated optimization for any TEXT prompt surface + * (system prompt, planner prompt, judge rubric, skill doc). + * + * The text-surface sibling to this module's `improvementDriver` (the + * CODE-surface / worktree path). Both feed agent-eval's `runImprovementLoop`; + * this one defaults the driver to agent-eval's `gepaDriver` (reflective text + * mutator) and the gate to `heldOutGate`. 
+ * + * IDENTITY-GATED BY CONSTRUCTION — the whole point. The loop runs evals, + * collects per-scenario signal, proposes candidates, and the gate compares + * candidate-vs-baseline ON THE HELDOUT. `result.prompt` is the baseline + * (identity) UNLESS the gate decided `'ship'`. So wiring a surface up is safe: + * a surface with no beneficial mutation simply keeps its baseline. You never + * regress by registering a prompt — you only ever improve when the held-out + * data earns it. + * + * Generic over the runtime: `runWithPrompt` is the only domain seam — given a + * candidate prompt + scenario, run it however the surface runs (sandbox + * `streamPrompt`, a `runLoop`, a direct model call) and return the artifact the + * judges score. The optimizer never assumes how a prompt is executed. + */ + +import type { LlmClientOptions } from '@tangle-network/agent-eval' +import type { + CampaignResult, + CampaignStorage, + DispatchContext, + Gate, + GateResult, + ImprovementDriver, + JudgeConfig, + RunImprovementLoopResult, + Scenario, +} from '@tangle-network/agent-eval/campaign' +import { gepaDriver, heldOutGate, runImprovementLoop } from '@tangle-network/agent-eval/campaign' +import { ConfigError } from '../errors' + +/** Reflection config for the default `gepaDriver`. Omit when passing a custom + * `driver`. */ +export interface OptimizePromptReflection { + /** Router transport for the reflection model. */ + llm: LlmClientOptions + /** Model that performs the reflective rewrite. */ + model: string + /** What is being optimized — orients the reflection prompt. Default + * `'system prompt'`. */ + target?: string + /** Surface-specific mutation levers offered to the reflector. */ + mutationPrimitives?: string[] + /** H2 (`## Foo`) headings that MUST survive every candidate. gepaDriver's + * only structural guard — load-bearing sections of the prompt should be + * `##` headings so a rewrite cannot drop them. 
*/ + preserveSections?: string[] + /** Max sentence-level edits per candidate vs the parent (a textual learning + * rate). Caps a rewrite from wiping prior rules in one generation. */ + maxSentenceEdits?: number +} + +/** @experimental */ +export interface OptimizePromptOptions { + /** The prompt being optimized — the identity baseline the gate protects. */ + baselinePrompt: string + /** Domain seam: run a candidate prompt against a scenario → artifact the + * judges score. The optimizer is agnostic to HOW the prompt runs. */ + runWithPrompt: (prompt: string, scenario: TScenario, ctx: DispatchContext) => Promise + /** Training pool — scored each generation to rank candidates. */ + scenarios: TScenario[] + /** Held out of training — scored ONLY for the gate's baseline-vs-winner + * delta. Disjoint from `scenarios`; this is what makes promotion measure + * generalization, not memorization. */ + holdoutScenarios: TScenario[] + /** Scorers — deterministic checks or LLM judges. */ + judges: JudgeConfig[] + /** Where artifacts + traces land (opaque key under in-memory storage). */ + runDir: string + /** Default driver = `gepaDriver` built from this. Required UNLESS `driver` + * is supplied. */ + reflection?: OptimizePromptReflection + /** Override the improvement strategy (custom driver / deterministic tests). */ + driver?: ImprovementDriver + /** Override the promotion gate. Default `heldOutGate` over `holdoutScenarios` + * — zero extra LLM. Wrap `defaultProductionGate` for red-team/reward-hacking + * hardening on production wiring. */ + gate?: Gate + /** Minimum held-out composite lift to ship, forwarded to the default + * `heldOutGate`. When omitted the gate uses its own default. */ + deltaThreshold?: number + /** Candidates proposed per generation. Default 4. */ + populationSize?: number + /** Generations to run. Default 3. */ + maxGenerations?: number + /** Candidates carried to the next generation. Default 2. */ + promoteTopK?: number + /** Storage backend. 
Pass `inMemoryCampaignStorage()` for filesystem-less / + * test runs. Default: Node filesystem. */ + storage?: CampaignStorage + /** Reproducibility seed. Default 42. */ + seed?: number + /** Per-scenario replicates for CI bands. Default 1. */ + reps?: number + /** Max concurrent cells. Default 2. */ + maxConcurrency?: number + /** Test seam — override the wall clock. */ + now?: () => Date + /** On a shipped gate: `'pr'` opens a PR, `'none'` just reports. Default + * `'none'`. */ + autoOnPromote?: 'pr' | 'none' + ghOwner?: string + ghRepo?: string +} + +/** @experimental */ +export interface OptimizePromptResult { + /** The prompt to USE. Identity (the baseline) unless the gate shipped a + * winner — so a caller can always assign `result.prompt` unconditionally. */ + prompt: string + /** True only when the gate promoted a candidate over baseline on holdout. */ + improved: boolean + /** The gate's verdict (`'ship' | 'hold' | 'need_more_work' | ...`). */ + decision: GateResult['decision'] + /** Human-readable reasons the gate gave. */ + reasons: string[] + /** Mean held-out composite of the baseline. */ + baselineComposite: number + /** Mean held-out composite of the winner candidate. */ + winnerComposite: number + /** Held-out lift (winner − baseline); the gate's `delta` when it reported one. */ + delta: number + /** Why the winner was proposed — present when a shipped winner carried a + * driver rationale. */ + rationale?: string + /** Unified baseline→winner diff (empty when the winner is the baseline). */ + diff: string + /** The full loop result for callers that need generations / campaigns. 
*/ + raw: RunImprovementLoopResult +} + +/** @experimental */ +export async function optimizePrompt( + opts: OptimizePromptOptions, +): Promise> { + if (!opts.driver && !opts.reflection) { + throw new ConfigError( + 'optimizePrompt: pass `reflection` (builds the default gepaDriver) or a custom `driver`', + ) + } + if (opts.scenarios.length === 0) { + throw new ConfigError('optimizePrompt: `scenarios` must be non-empty') + } + if (opts.holdoutScenarios.length === 0) { + throw new ConfigError( + 'optimizePrompt: `holdoutScenarios` must be non-empty (the gate needs it)', + ) + } + + const driver = + opts.driver ?? + gepaDriver({ + llm: opts.reflection!.llm, + model: opts.reflection!.model, + target: opts.reflection!.target ?? 'system prompt', + mutationPrimitives: opts.reflection!.mutationPrimitives, + constraints: + opts.reflection!.preserveSections || opts.reflection!.maxSentenceEdits !== undefined + ? { + preserveSections: opts.reflection!.preserveSections, + maxSentenceEdits: opts.reflection!.maxSentenceEdits, + } + : undefined, + }) + + const gate = + opts.gate ?? + heldOutGate({ + scenarios: opts.holdoutScenarios, + ...(opts.deltaThreshold !== undefined ? { deltaThreshold: opts.deltaThreshold } : {}), + }) + + const result = await runImprovementLoop({ + baselineSurface: opts.baselinePrompt, + dispatchWithSurface: (surface, scenario, ctx) => { + if (typeof surface !== 'string') { + // optimizePrompt is the TEXT-surface entry point; a CodeSurface means + // the caller wired the wrong driver. Fail loud — don't silently run the + // baseline and report a phantom score. + throw new ConfigError( + 'optimizePrompt: received a CodeSurface — this entry point optimizes string prompts only', + ) + } + return opts.runWithPrompt(surface, scenario, ctx) + }, + driver, + populationSize: opts.populationSize ?? 4, + maxGenerations: opts.maxGenerations ?? 3, + ...(opts.promoteTopK !== undefined ? 
{ promoteTopK: opts.promoteTopK } : {}), + scenarios: opts.scenarios, + holdoutScenarios: opts.holdoutScenarios, + judges: opts.judges, + gate, + autoOnPromote: opts.autoOnPromote ?? 'none', + ...(opts.ghOwner !== undefined ? { ghOwner: opts.ghOwner } : {}), + ...(opts.ghRepo !== undefined ? { ghRepo: opts.ghRepo } : {}), + runDir: opts.runDir, + ...(opts.storage !== undefined ? { storage: opts.storage } : {}), + ...(opts.seed !== undefined ? { seed: opts.seed } : {}), + ...(opts.reps !== undefined ? { reps: opts.reps } : {}), + ...(opts.maxConcurrency !== undefined ? { maxConcurrency: opts.maxConcurrency } : {}), + ...(opts.now !== undefined ? { now: opts.now } : {}), + }) + + const improved = result.gateResult.decision === 'ship' + const winnerSurface = + typeof result.winnerSurface === 'string' ? result.winnerSurface : opts.baselinePrompt + const baselineComposite = meanComposite(result.baselineOnHoldout) + const winnerComposite = meanComposite(result.winnerOnHoldout) + + return { + prompt: improved ? winnerSurface : opts.baselinePrompt, + improved, + decision: result.gateResult.decision, + reasons: result.gateResult.reasons, + baselineComposite, + winnerComposite, + delta: result.gateResult.delta ?? winnerComposite - baselineComposite, + ...(improved && result.winnerRationale ? { rationale: result.winnerRationale } : {}), + diff: result.promotedDiff, + raw: result, + } +} + +/** Mean composite over a campaign's per-scenario aggregates. The held-out + * campaigns score one surface across `holdoutScenarios`; averaging the + * per-scenario means gives the single number the gate's delta is built from. 
+ */ +function meanComposite(campaign: CampaignResult): number { + const scenarios = Object.values(campaign.aggregates.byScenario) + if (scenarios.length === 0) return 0 + const sum = scenarios.reduce((acc, s) => acc + s.meanComposite, 0) + return sum / scenarios.length +} diff --git a/src/index.ts b/src/index.ts index 79845b1..9fc8836 100644 --- a/src/index.ts +++ b/src/index.ts @@ -101,6 +101,7 @@ export { ConfigError, JudgeError, NotFoundError, + PlannerError, RuntimeRunStateError, ValidationError, } from './errors' diff --git a/src/loops/drivers/dynamic.ts b/src/loops/drivers/dynamic.ts new file mode 100644 index 0000000..3a9ec39 --- /dev/null +++ b/src/loops/drivers/dynamic.ts @@ -0,0 +1,217 @@ +/** + * @experimental + * + * Dynamic driver — the agent authors the loop topology at runtime. + * + * Where `refine` and `fanout-vote` encode a fixed shape as a pure function of + * history, this driver delegates the per-round shape to an injected + * `TopologyPlanner`. Each round the planner inspects the task + iteration + * history and emits one `TopologyMove`: + * - `refine` → one task next round (optionally rewritten from the prior attempt) + * - `fanout` → N tasks next round (the kernel round-robins `agentRuns` by iteration + * index, so a two-branch fanout over two harnesses lands one branch on each) + * - `stop` → terminate; the kernel selects the winner across all iterations + * + * The planner is the brain; this driver is the structure. It maps moves onto + * the kernel's `plan`/`decide` contract, enforces the iteration + fanout caps, + * and fails loud on a malformed move. The planner is injected exactly like + * `refine`'s `refineTask` and `fanout-vote`'s `selector` — so a test can drive + * a deterministic policy through the real kernel, and production can wire it to + * an LLM via `createSandboxPlanner`. + * + * Topology is orthogonal to harness: the planner never names a backend. 
Which + * harness runs a branch is decided by the `AgentRunSpec` the kernel round-robins + * to, so one dynamic driver works across claude-code, codex, opencode, pi — + * including fanning a single round across several at once. + */ + +import { PlannerError, ValidationError } from '../../errors' +import type { Driver, Iteration } from '../types' + +/** Terminal once `decide` returns `'done'` (a kernel terminal decision). */ +export type DynamicDecision = 'continue' | 'done' + +/** + * One topology decision for the next round. `fanout` carries explicit tasks + * rather than a count so the planner can issue heterogeneous branches (a + * different sub-task per harness); pass N copies of one task for a homogeneous + * fanout that relies on `agentRuns` diversity instead. + * + * @experimental + */ +export type TopologyMove = + | { kind: 'refine'; task: Task; rationale?: string } + | { kind: 'fanout'; tasks: Task[]; rationale?: string } + | { kind: 'stop'; rationale?: string } + +/** @experimental */ +export interface PlannerContext { + /** The root task the loop was invoked with — stable across rounds. */ + task: Task + /** Every iteration so far, in dispatch order, with outputs + verdicts. */ + history: ReadonlyArray> + /** `history.length` — iterations already spent. */ + iterationsSpent: number + /** Iterations left before the driver's `maxIterations` cap forces a stop. */ + iterationsRemaining: number +} + +/** + * Chooses the next topology move from the task + history. Sync or async; an + * async planner is where an LLM call goes (see `createSandboxPlanner`). + * + * @experimental + */ +export type TopologyPlanner = ( + ctx: PlannerContext, +) => TopologyMove | Promise> + +/** @experimental */ +export interface CreateDynamicDriverOptions { + /** The agent-authored topology policy. Invoked once per round in `plan`. */ + planner: TopologyPlanner + /** + * Hard safety cap on total iterations. When reached, the driver stops before + * consulting the planner. Default 8. 
Set the kernel's `runLoop` + * `maxIterations` to at least this value so the driver's cap governs and the loop closes on + * a clean `'done'` rather than a truncated `'continue'`. + */ + maxIterations?: number + /** Max branches a single `fanout` move may dispatch. Default 4. */ + maxFanout?: number + /** Stable identifier surfaced in trace events. Default `'dynamic'`. */ + name?: string +} + +/** @experimental */ +export function createDynamicDriver( + options: CreateDynamicDriverOptions, +): Driver { + if (typeof options.planner !== 'function') { + throw new ValidationError('createDynamicDriver: planner must be a function') + } + const maxIterations = options.maxIterations ?? 8 + if (!Number.isFinite(maxIterations) || maxIterations <= 0) { + throw new ValidationError('createDynamicDriver: maxIterations must be > 0') + } + const maxFanout = options.maxFanout ?? 4 + if (!Number.isFinite(maxFanout) || maxFanout < 1) { + throw new ValidationError('createDynamicDriver: maxFanout must be >= 1') + } + + // The kernel calls plan(), runs the batch, then calls decide() — strictly + // sequential, one driver instance per loop. Caching the move the planner + // chose this round lets decide() report terminality without re-invoking the + // planner (which would double every LLM call). + let pending: TopologyMove | undefined + + return { + name: options.name ?? 'dynamic', + async plan(task, history) { + if (history.length >= maxIterations) { + pending = { kind: 'stop', rationale: `maxIterations (${maxIterations}) reached` } + return [] + } + const move = await options.planner({ + task, + history, + iterationsSpent: history.length, + iterationsRemaining: maxIterations - history.length, + }) + pending = validateMove(move, maxFanout) + switch (pending.kind) { + case 'refine': + return [pending.task] + case 'fanout': + return pending.tasks + case 'stop': + return [] + } + }, + decide() { + // pending is set by the plan() call that immediately precedes every + // decide(). 
Only a `stop` move terminates; refine/fanout keep looping so + // plan() — and thus the planner — runs again next round. + return pending?.kind === 'stop' ? 'done' : 'continue' + }, + } +} + +function validateMove(move: TopologyMove, maxFanout: number): TopologyMove { + if (!move || typeof move !== 'object' || typeof (move as { kind?: unknown }).kind !== 'string') { + throw new PlannerError(`dynamic planner returned a non-move value: ${describe(move)}`) + } + switch (move.kind) { + case 'refine': + return move + case 'stop': + return move + case 'fanout': { + if (!Array.isArray(move.tasks) || move.tasks.length === 0) { + throw new PlannerError('dynamic planner fanout move must carry a non-empty tasks[]') + } + if (move.tasks.length <= maxFanout) return move + // Clamp rather than reject — over-fanning is a budget concern, not a + // structural error. The clamp is recorded in the rationale for traces. + return { + kind: 'fanout', + tasks: move.tasks.slice(0, maxFanout), + rationale: `${move.rationale ?? ''} [clamped ${move.tasks.length}→${maxFanout}]`.trim(), + } + } + default: + throw new PlannerError( + `dynamic planner returned unknown move kind: ${describe((move as { kind: unknown }).kind)}`, + ) + } +} + +function describe(value: unknown): string { + try { + return JSON.stringify(value) ?? String(value) + } catch { + return String(value) + } +} + +/** + * Compact, planner-friendly view of iteration history — what an LLM planner + * needs to choose the next move without the raw event streams. Output is + * truncated so a long run's prompt stays bounded. + * + * @experimental + */ +export function summarizeHistory( + history: ReadonlyArray>, + opts: { maxOutputChars?: number } = {}, +): Array<{ + index: number + agentRunName: string + valid?: boolean + score?: number + error?: string + output?: string +}> { + const maxOutputChars = opts.maxOutputChars ?? 
600 + return history.map((iter) => { + const row: { + index: number + agentRunName: string + valid?: boolean + score?: number + error?: string + output?: string + } = { index: iter.index, agentRunName: iter.agentRunName } + if (iter.verdict) { + row.valid = iter.verdict.valid + if (typeof iter.verdict.score === 'number') row.score = iter.verdict.score + } + if (iter.error) row.error = iter.error.message + if (iter.output !== undefined) { + const serialized = describe(iter.output) + row.output = + serialized.length > maxOutputChars ? `${serialized.slice(0, maxOutputChars)}…` : serialized + } + return row + }) +} diff --git a/src/loops/drivers/sandbox-planner.ts b/src/loops/drivers/sandbox-planner.ts new file mode 100644 index 0000000..9389776 --- /dev/null +++ b/src/loops/drivers/sandbox-planner.ts @@ -0,0 +1,254 @@ +/** + * @experimental + * + * `createSandboxPlanner` — wire the dynamic driver's `TopologyPlanner` to a + * real agent. Each round it spins a sandbox on `profile`, streams a prompt that + * carries the history summary, and decodes the agent's chosen `TopologyMove` + * from a JSON envelope it emits. This is the "agent authors its own loop + * topology" path: the planner profile can be any harness (claude-code, codex, + * opencode, pi) — its only job is to read what happened and emit the next move. + * + * The planner profile is deliberately distinct from the worker `agentRuns`: a + * cheap fast model can steer topology while expensive workers do the labor, and + * the planner never names which harness runs a branch — the kernel's + * `agentRuns` round-robin decides that. + * + * Envelope contract the agent must emit (fenced ```json or a structured + * `result`/`final` event payload): + * { "kind": "refine" | "fanout" | "stop", + * "tasks"?: [ , ... 
], // decoded via `decodeTask` + * "n"?: number, // fanout shorthand: N copies of the root task + * "rationale"?: string } + * + * A missing / unparseable / unknown-kind envelope throws `PlannerError` — the + * loop never silently runs a topology the agent did not choose. + */ + +import type { + AgentProfile, + CreateSandboxOptions, + SandboxEvent, +} from '@tangle-network/sandbox' +import { PlannerError, ValidationError } from '../../errors' +import type { AgentRunSpec, LoopSandboxClient } from '../types' +import type { PlannerContext, TopologyMove, TopologyPlanner } from './dynamic' +import { summarizeHistory } from './dynamic' + +/** Raw, pre-decode envelope an agent emits to choose the next move. */ +export interface TopologyMoveEnvelope { + kind: string + tasks?: unknown[] + n?: number + rationale?: string +} + +/** @experimental */ +export interface CreateSandboxPlannerOptions { + /** Sandbox client — the planner calls `.create()` once per round. */ + client: LoopSandboxClient + /** The planner agent. Steers topology; does not run the work. */ + profile: AgentProfile + /** + * Decode one raw task from the envelope's `tasks[]` into a domain `Task`. + * Required because `Task` is opaque to this module — only the caller knows + * its shape. Throw to reject a malformed task; the error surfaces as a + * `PlannerError`. + */ + decodeTask: (raw: unknown, ctx: PlannerContext) => Task + /** Override the default prompt (history summary + envelope contract). */ + buildPrompt?: (ctx: PlannerContext) => string + /** Override envelope extraction from the event stream. */ + parseEnvelope?: (events: SandboxEvent[]) => TopologyMoveEnvelope | undefined + /** Sandbox overrides for the planner sandbox (timeouts, env, etc.). */ + sandboxOverrides?: AgentRunSpec['sandboxOverrides'] + /** Cancellation for the planner's own LLM call. 
*/ + signal?: AbortSignal +} + +/** @experimental */ +export function createSandboxPlanner( + opts: CreateSandboxPlannerOptions, +): TopologyPlanner { + if (!opts.client || typeof opts.client.create !== 'function') { + throw new ValidationError('createSandboxPlanner: client.create is required') + } + if (typeof opts.decodeTask !== 'function') { + throw new ValidationError('createSandboxPlanner: decodeTask is required') + } + const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt + const parseEnvelope = opts.parseEnvelope ?? defaultParseEnvelope + + return async (ctx) => { + const box = await opts.client.create(buildSandboxOptions(opts.profile, opts.sandboxOverrides)) + const events: SandboxEvent[] = [] + for await (const event of box.streamPrompt(buildPrompt(ctx), { signal: opts.signal })) { + events.push(event) + } + const envelope = parseEnvelope(events) + if (!envelope) { + throw new PlannerError('sandbox planner emitted no parseable topology-move envelope') + } + return envelopeToMove(envelope, ctx, opts.decodeTask) + } +} + +function envelopeToMove( + envelope: TopologyMoveEnvelope, + ctx: PlannerContext, + decodeTask: (raw: unknown, ctx: PlannerContext) => Task, +): TopologyMove { + const kind = String(envelope.kind ?? '').toLowerCase() + const rationale = typeof envelope.rationale === 'string' ? envelope.rationale : undefined + if (kind === 'stop') { + return { kind: 'stop', rationale } + } + if (kind === 'refine') { + const raw = Array.isArray(envelope.tasks) ? envelope.tasks[0] : undefined + // No new task → replay the root task; the worker self-corrects from its + // own prior attempt in sandbox state, mirroring the refine driver default. + const task = raw === undefined ? 
ctx.task : decodeTaskGuarded(decodeTask, raw, ctx) + return { kind: 'refine', task, rationale } + } + if (kind === 'fanout') { + const tasks = resolveFanoutTasks(envelope, ctx, decodeTask) + return { kind: 'fanout', tasks, rationale } + } + throw new PlannerError(`sandbox planner emitted unknown move kind: ${JSON.stringify(envelope.kind)}`) +} + +function resolveFanoutTasks( + envelope: TopologyMoveEnvelope, + ctx: PlannerContext, + decodeTask: (raw: unknown, ctx: PlannerContext) => Task, +): Task[] { + if (Array.isArray(envelope.tasks) && envelope.tasks.length > 0) { + return envelope.tasks.map((raw) => decodeTaskGuarded(decodeTask, raw, ctx)) + } + // `n` shorthand: N copies of the root task, leaning on `agentRuns` diversity. + if (typeof envelope.n === 'number' && Number.isFinite(envelope.n) && envelope.n >= 1) { + return Array.from({ length: Math.floor(envelope.n) }, () => ctx.task) + } + throw new PlannerError('sandbox planner fanout envelope needs a non-empty tasks[] or n >= 1') +} + +function decodeTaskGuarded( + decodeTask: (raw: unknown, ctx: PlannerContext) => Task, + raw: unknown, + ctx: PlannerContext, +): Task { + try { + return decodeTask(raw, ctx) + } catch (err) { + throw new PlannerError(`sandbox planner decodeTask rejected ${JSON.stringify(raw)}`, { + cause: err, + }) + } +} + +function buildSandboxOptions( + profile: AgentProfile, + overrides: AgentRunSpec['sandboxOverrides'], +): CreateSandboxOptions { + const base = overrides ?? {} + const overrideBackend = base.backend + const explicitType = profile.metadata?.backendType + type BackendType = NonNullable['type'] + return { + ...base, + backend: { + type: (overrideBackend?.type ?? explicitType ?? 'opencode') as BackendType, + profile, + ...(overrideBackend?.model ? { model: overrideBackend.model } : {}), + ...(overrideBackend?.server ? 
{ server: overrideBackend.server } : {}), + }, + } +} + +function defaultBuildPrompt(ctx: PlannerContext): string { + const summary = summarizeHistory(ctx.history) + return [ + 'You are the loop planner. You do not do the work — you decide the topology of the next round.', + '', + `Root task:\n${safeJson(ctx.task)}`, + '', + `Iterations spent: ${ctx.iterationsSpent}. Remaining before the hard cap: ${ctx.iterationsRemaining}.`, + '', + ctx.history.length === 0 + ? 'No attempts yet.' + : `Attempts so far (index, agent, verdict, output):\n${safeJson(summary)}`, + '', + 'Choose ONE move and emit it as a fenced JSON block:', + ' - {"kind":"refine","tasks":[],"rationale":"..."} — one more attempt; omit tasks to replay the root task.', + ' - {"kind":"fanout","tasks":[,],"rationale":"..."} — N parallel branches (or "n": N for N copies of the root task).', + ' - {"kind":"stop","rationale":"..."} — a valid result exists or further attempts will not help.', + '', + 'Stop as soon as an attempt is valid. Prefer refine when an attempt is close; fan out when attempts disagree or the approach is uncertain.', + 'Emit ONLY the JSON block.', + ].join('\n') +} + +function defaultParseEnvelope(events: SandboxEvent[]): TopologyMoveEnvelope | undefined { + // Structured payload on a terminal event wins — sandbox SDKs lift emitted + // JSON onto data.result / data.output / data of a result|final|planner.move event. + for (let i = events.length - 1; i >= 0; i -= 1) { + const event = events[i] + if (!event) continue + const type = String(event.type ?? '') + const data = isRecord(event.data) ? event.data : undefined + if (!data) continue + if (type === 'result' || type === 'final' || type === 'planner.move') { + const direct = coerceEnvelope(data.result ?? data.output ?? data) + if (direct) return direct + } + } + // Fall back to the most recent text event that yields a JSON envelope (fenced or bare). 
+ for (let i = events.length - 1; i >= 0; i -= 1) { + const event = events[i] + if (!event) continue + const data = isRecord(event.data) ? event.data : undefined + if (!data) continue + const text = pickString(data.text) ?? pickString(data.delta) ?? pickString(data.content) + if (!text) continue + const fenced = extractFencedJson(text) + const coerced = coerceEnvelope(fenced) + if (coerced) return coerced + } + return undefined +} + +function coerceEnvelope(value: unknown): TopologyMoveEnvelope | undefined { + if (!isRecord(value)) return undefined + if (typeof value.kind !== 'string' || value.kind.length === 0) return undefined + const out: TopologyMoveEnvelope = { kind: value.kind } + if (Array.isArray(value.tasks)) out.tasks = value.tasks + if (typeof value.n === 'number') out.n = value.n + if (typeof value.rationale === 'string') out.rationale = value.rationale + return out +} + +function isRecord(value: unknown): value is Record { + return value !== null && typeof value === 'object' && !Array.isArray(value) +} + +function pickString(value: unknown): string | undefined { + return typeof value === 'string' && value.length > 0 ? value : undefined +} + +function extractFencedJson(text: string): unknown | undefined { + const match = text.match(/```(?:json)?\s*([\s\S]*?)```/i) + const body = (match?.[1] ?? text).trim() + if (!body) return undefined + try { + return JSON.parse(body) + } catch { + return undefined + } +} + +function safeJson(value: unknown): string { + try { + return JSON.stringify(value, null, 2) ?? 
String(value) + } catch { + return String(value) + } +} diff --git a/src/loops/index.ts b/src/loops/index.ts index d854266..8bb4c39 100644 --- a/src/loops/index.ts +++ b/src/loops/index.ts @@ -17,6 +17,14 @@ export type { SandboxEvent, SandboxInstance, } from '@tangle-network/sandbox' +export type { + CreateDynamicDriverOptions, + DynamicDecision, + PlannerContext, + TopologyMove, + TopologyPlanner, +} from './drivers/dynamic' +export { createDynamicDriver, summarizeHistory } from './drivers/dynamic' export type { CreateFanoutVoteDriverOptions, FanoutVoteDecision, @@ -25,6 +33,11 @@ export type { export { createFanoutVoteDriver, scoreFanoutVoteIterations } from './drivers/fanout-vote' export type { CreateRefineDriverOptions, RefineDecision } from './drivers/refine' export { createRefineDriver, refineWinnerIndex } from './drivers/refine' +export type { + CreateSandboxPlannerOptions, + TopologyMoveEnvelope, +} from './drivers/sandbox-planner' +export { createSandboxPlanner } from './drivers/sandbox-planner' export type { RunLoopOptions } from './run-loop' export { runLoop } from './run-loop' export { reportLoopUsage, type UsageSink } from './report-usage' diff --git a/tests/loops/dynamic.test.ts b/tests/loops/dynamic.test.ts new file mode 100644 index 0000000..48ea848 --- /dev/null +++ b/tests/loops/dynamic.test.ts @@ -0,0 +1,474 @@ +import type { + AgentProfile, + CreateSandboxOptions, + SandboxEvent, + SandboxInstance, +} from '@tangle-network/sandbox' +import { describe, expect, it } from 'vitest' +import { PlannerError } from '../../src/errors' +import { + type AgentRunSpec, + createDynamicDriver, + createSandboxPlanner, + type OutputAdapter, + runLoop, + type TopologyMove, + type TopologyPlanner, + type Validator, +} from '../../src/loops' + +interface Task { + goal: string + strategy: string +} + +interface Out { + strategy: string + harness: string + score: number +} + +const VALID_THRESHOLD = 0.7 + +// Score is a pure function of the strategy the planner 
chose — so a stronger +// strategy (parallel-*) clears the bar while naive/careful do not. This lets a +// planner adapt: refine the strategy, then fan out when refinement stalls. +function scoreFor(strategy: string): number { + if (strategy.startsWith('parallel')) return 0.9 + if (strategy === 'careful') return 0.6 + return 0.3 +} + +const output: OutputAdapter = { + parse(events) { + const last = events.at(-1) + const data = last?.data as Partial | undefined + return { + strategy: data?.strategy ?? '', + harness: data?.harness ?? '', + score: typeof data?.score === 'number' ? data.score : 0, + } + }, +} + +const validator: Validator = { + async validate(out) { + return { valid: out.score >= VALID_THRESHOLD, score: out.score } + }, +} + +function profile(name: string): AgentProfile { + return { name } +} + +function workerSpecs(names: string[]): AgentRunSpec[] { + return names.map((name) => ({ + profile: profile(name), + name, + taskToPrompt: (t) => JSON.stringify(t), + })) +} + +// Worker client: each iteration's score derives from the task strategy carried +// in the prompt; the harness is read from the profile the kernel round-robined +// to. Records dispatch order so tests can assert topology + harness rotation. +function workerClient() { + const dispatched: Array<{ harness: string; strategy: string }> = [] + return { + dispatched, + client: { + async create(opts?: CreateSandboxOptions): Promise { + const harness = + (opts?.backend?.profile && typeof opts.backend.profile === 'object' + ? opts.backend.profile.name + : undefined) ?? 
'unknown' + return { + async *streamPrompt(message: string) { + const task = JSON.parse(message) as Task + dispatched.push({ harness, strategy: task.strategy }) + yield { + type: 'result', + data: { strategy: task.strategy, harness, score: scoreFor(task.strategy) }, + } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + }, + } +} + +describe('runLoop + createDynamicDriver', () => { + it('lets an adaptive planner choose refine→refine→fanout→stop from history', async () => { + const goal = 'ship the feature' + // The planner reads history and adapts: try cheap strategies first, escalate + // to a heterogeneous fanout when refinement stalls, stop once a branch wins. + const planner: TopologyPlanner = ({ history }) => { + if (history.some((h) => h.verdict?.valid === true)) return { kind: 'stop' } + if (history.length === 0) return { kind: 'refine', task: { goal, strategy: 'naive' } } + if (history.length === 1) return { kind: 'refine', task: { goal, strategy: 'careful' } } + return { + kind: 'fanout', + tasks: [ + { goal, strategy: 'parallel-a' }, + { goal, strategy: 'parallel-b' }, + ], + } + } + + const { client, dispatched } = workerClient() + const result = await runLoop({ + driver: createDynamicDriver({ planner, maxIterations: 8 }), + agentRuns: workerSpecs(['worker-a', 'worker-b']), + output, + validator, + task: { goal, strategy: 'naive' }, + ctx: { sandboxClient: client }, + maxIterations: 10, + }) + + expect(result.decision).toBe('done') + expect(result.iterations).toHaveLength(4) + expect(dispatched.map((d) => d.strategy)).toEqual([ + 'naive', + 'careful', + 'parallel-a', + 'parallel-b', + ]) + // The fanout round dispatched its two branches across two distinct harnesses. + expect(result.iterations[2]?.agentRunName).toBe('worker-a') + expect(result.iterations[3]?.agentRunName).toBe('worker-b') + // Winner is the highest-valid-score attempt (0.9), earliest index breaks the tie. 
+ expect(result.winner?.verdict?.valid).toBe(true) + expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6) + expect(result.winner?.iterationIndex).toBe(2) + }) + + it('runs an explicit refine→fanout→stop script across two harnesses', async () => { + const goal = 'explicit' + const moves: TopologyMove[] = [ + { kind: 'refine', task: { goal, strategy: 'careful' } }, + { + kind: 'fanout', + tasks: [ + { goal, strategy: 'parallel-a' }, + { goal, strategy: 'parallel-b' }, + ], + }, + { kind: 'stop' }, + ] + let round = 0 + const planner: TopologyPlanner = () => moves[round++]! + + const { client } = workerClient() + const result = await runLoop({ + driver: createDynamicDriver({ planner }), + agentRuns: workerSpecs(['claude-code', 'codex']), + output, + validator, + task: { goal, strategy: 'careful' }, + ctx: { sandboxClient: client }, + }) + + expect(result.decision).toBe('done') + expect(round).toBe(3) + // Assert the ordered iteration record (deterministic) rather than dispatch + // order, which races across the concurrent fanout branches. The kernel maps + // iteration index N to agentRuns[N % len], so the fanout spans both harnesses. 
+ expect(result.iterations.map((i) => [i.agentRunName, i.task.strategy])).toEqual([ + ['claude-code', 'careful'], + ['codex', 'parallel-a'], + ['claude-code', 'parallel-b'], + ]) + expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6) + }) + + it('terminates on the maxIterations cap even when the planner never stops', async () => { + const planner: TopologyPlanner = () => ({ + kind: 'refine', + task: { goal: 'forever', strategy: 'naive' }, + }) + const { client } = workerClient() + const result = await runLoop({ + driver: createDynamicDriver({ planner, maxIterations: 3 }), + agentRun: workerSpecs(['solo'])[0], + output, + validator, + task: { goal: 'forever', strategy: 'naive' }, + ctx: { sandboxClient: client }, + maxIterations: 10, + }) + + expect(result.iterations).toHaveLength(3) + expect(result.decision).toBe('done') + }) + + it('clamps a fanout move to maxFanout branches', async () => { + const moves: TopologyMove[] = [ + { + kind: 'fanout', + tasks: Array.from({ length: 5 }, (_, i) => ({ goal: 'wide', strategy: `parallel-${i}` })), + }, + { kind: 'stop' }, + ] + let round = 0 + const planner: TopologyPlanner = () => moves[round++]! 
+ + const { client, dispatched } = workerClient() + const result = await runLoop({ + driver: createDynamicDriver({ planner, maxFanout: 2 }), + agentRuns: workerSpecs(['a', 'b']), + output, + validator, + task: { goal: 'wide', strategy: 'parallel-0' }, + ctx: { sandboxClient: client }, + }) + + expect(result.iterations).toHaveLength(2) + expect(dispatched.map((d) => d.strategy)).toEqual(['parallel-0', 'parallel-1']) + }) + + it('fails loud on a fanout move with no tasks', async () => { + const planner: TopologyPlanner = () => ({ kind: 'fanout', tasks: [] }) + const { client } = workerClient() + await expect( + runLoop({ + driver: createDynamicDriver({ planner }), + agentRun: workerSpecs(['a'])[0], + output, + validator, + task: { goal: 'x', strategy: 'naive' }, + ctx: { sandboxClient: client }, + }), + ).rejects.toThrow(PlannerError) + }) + + it('fails loud on an unknown move kind', async () => { + const planner = (() => ({ kind: 'teleport' })) as unknown as TopologyPlanner + const { client } = workerClient() + await expect( + runLoop({ + driver: createDynamicDriver({ planner }), + agentRun: workerSpecs(['a'])[0], + output, + validator, + task: { goal: 'x', strategy: 'naive' }, + ctx: { sandboxClient: client }, + }), + ).rejects.toThrow(/unknown move kind/i) + }) +}) + +// A single client serving BOTH the planner agent and the workers, routed by +// profile name. The planner sandbox reads "Iterations spent: N" out of the +// prompt the driver built and emits a structured topology-move envelope — +// exercising the real createSandboxPlanner → kernel → worker path. +function plannerAndWorkerClient(plannerMove: (spent: number) => unknown) { + const dispatched: Array<{ harness: string; strategy: string }> = [] + const plannerPrompts: string[] = [] + return { + dispatched, + plannerPrompts, + client: { + async create(opts?: CreateSandboxOptions): Promise { + const name = + (opts?.backend?.profile && typeof opts.backend.profile === 'object' + ? 
opts.backend.profile.name + : undefined) ?? 'unknown' + if (name === 'planner') { + return { + async *streamPrompt(message: string) { + plannerPrompts.push(message) + const spent = Number(message.match(/Iterations spent: (\d+)/)?.[1] ?? '0') + yield { + type: 'result', + data: { result: plannerMove(spent) }, + } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + } + return { + async *streamPrompt(message: string) { + const task = JSON.parse(message) as Task + dispatched.push({ harness: name, strategy: task.strategy }) + yield { + type: 'result', + data: { strategy: task.strategy, harness: name, score: scoreFor(task.strategy) }, + } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + }, + } +} + +describe('createSandboxPlanner', () => { + it('drives the loop end-to-end: planner agent authors refine→fanout→stop', async () => { + const goal = 'sandbox-planner' + const { client, plannerPrompts } = plannerAndWorkerClient((spent) => { + if (spent === 0) return { kind: 'refine', tasks: [{ goal, strategy: 'careful' }] } + if (spent === 1) + return { + kind: 'fanout', + tasks: [ + { goal, strategy: 'parallel-a' }, + { goal, strategy: 'parallel-b' }, + ], + } + return { kind: 'stop' } + }) + + const planner = createSandboxPlanner({ + client, + profile: profile('planner'), + decodeTask: (raw) => raw as Task, + }) + + const result = await runLoop({ + driver: createDynamicDriver({ planner }), + agentRuns: workerSpecs(['worker-a', 'worker-b']), + output, + validator, + task: { goal, strategy: 'naive' }, + ctx: { sandboxClient: client }, + }) + + expect(result.decision).toBe('done') + expect(result.iterations.map((i) => [i.agentRunName, i.task.strategy])).toEqual([ + ['worker-a', 'careful'], + ['worker-b', 'parallel-a'], + ['worker-a', 'parallel-b'], + ]) + expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6) + // The planner saw a growing history each round (its prompt carried the count). 
+ expect(plannerPrompts).toHaveLength(3) + expect(plannerPrompts[0]).toMatch(/Iterations spent: 0/) + expect(plannerPrompts[2]).toMatch(/Iterations spent: 3/) + }) + + it('expands the n shorthand into N copies of the root task', async () => { + const { client, dispatched } = plannerAndWorkerClient((spent) => + spent === 0 ? { kind: 'fanout', n: 3 } : { kind: 'stop' }, + ) + const planner = createSandboxPlanner({ + client, + profile: profile('planner'), + decodeTask: (raw) => raw as Task, + }) + const result = await runLoop({ + driver: createDynamicDriver({ planner, maxFanout: 4 }), + agentRuns: workerSpecs(['a', 'b']), + output, + validator, + task: { goal: 'n-shorthand', strategy: 'parallel-root' }, + ctx: { sandboxClient: client }, + }) + expect(dispatched).toHaveLength(3) + expect(dispatched.every((d) => d.strategy === 'parallel-root')).toBe(true) + expect(result.decision).toBe('done') + }) + + it('fails loud when the planner emits no parseable envelope', async () => { + const client = { + async create(): Promise { + return { + async *streamPrompt() { + yield { type: 'message', data: { text: 'I think we should keep going!' } } + }, + } as unknown as SandboxInstance + }, + } + const planner = createSandboxPlanner({ + client, + profile: profile('planner'), + decodeTask: (raw) => raw as Task, + }) + await expect( + runLoop({ + driver: createDynamicDriver({ planner }), + agentRun: workerSpecs(['a'])[0], + output, + validator, + task: { goal: 'x', strategy: 'naive' }, + ctx: { sandboxClient: client }, + }), + ).rejects.toThrow(/no parseable topology-move envelope/i) + }) + + it('parses a fenced JSON envelope from a text delta', async () => { + let plannerRound = 0 + const client = { + async create(opts?: CreateSandboxOptions): Promise { + const name = + (opts?.backend?.profile && typeof opts.backend.profile === 'object' + ? opts.backend.profile.name + : undefined) ?? 'unknown' + if (name === 'planner') { + const fenced = + plannerRound++ === 0 + ? 
'```json\n{"kind":"refine","tasks":[{"goal":"g","strategy":"parallel-x"}]}\n```' + : '```json\n{"kind":"stop"}\n```' + return { + async *streamPrompt() { + yield { type: 'message.delta', data: { text: `here is my plan:\n${fenced}` } } + }, + } as unknown as SandboxInstance + } + return { + async *streamPrompt(message: string) { + const task = JSON.parse(message) as Task + yield { + type: 'result', + data: { strategy: task.strategy, harness: name, score: scoreFor(task.strategy) }, + } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + } + const planner = createSandboxPlanner({ + client, + profile: profile('planner'), + decodeTask: (raw) => raw as Task, + }) + const result = await runLoop({ + driver: createDynamicDriver({ planner }), + agentRun: workerSpecs(['a'])[0], + output, + validator, + task: { goal: 'g', strategy: 'naive' }, + ctx: { sandboxClient: client }, + }) + expect(result.decision).toBe('done') + expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6) + }) + + it('surfaces a decodeTask rejection as a PlannerError', async () => { + const client = { + async create(): Promise { + return { + async *streamPrompt() { + yield { type: 'result', data: { result: { kind: 'refine', tasks: [{ bad: true }] } } } + }, + } as unknown as SandboxInstance + }, + } + const planner = createSandboxPlanner({ + client, + profile: profile('planner'), + decodeTask: (raw) => { + const t = raw as Partial + if (typeof t.strategy !== 'string') throw new Error('missing strategy') + return t as Task + }, + }) + await expect( + runLoop({ + driver: createDynamicDriver({ planner }), + agentRun: workerSpecs(['a'])[0], + output, + validator, + task: { goal: 'x', strategy: 'naive' }, + ctx: { sandboxClient: client }, + }), + ).rejects.toThrow(PlannerError) + }) +}) diff --git a/tests/optimize-prompt.test.ts b/tests/optimize-prompt.test.ts new file mode 100644 index 0000000..2137468 --- /dev/null +++ b/tests/optimize-prompt.test.ts @@ -0,0 +1,136 @@ +import { + type 
ImprovementDriver, + inMemoryCampaignStorage, + type JudgeConfig, + type MutableSurface, + type ProposedCandidate, + type Scenario, +} from '@tangle-network/agent-eval/campaign' +import { describe, expect, it } from 'vitest' +import { ConfigError } from '../src/errors' +import { optimizePrompt } from '../src/improvement' + +interface SumScenario extends Scenario { + kind: 'sum' +} + +interface SumArtifact { + text: string + quality: number +} + +// Artifact quality is a pure function of the prompt: a prompt that says +// "PRECISE" produces a high-quality artifact, a vague one does not. This is the +// measurable signal the gate steers on — a candidate only wins if it lifts +// quality on the held-out scenarios. +const runWithPrompt = async (prompt: string): Promise => ({ + text: prompt, + quality: /PRECISE/.test(prompt) ? 0.9 : 0.4, +}) + +const qualityJudge: JudgeConfig = { + name: 'quality', + dimensions: [{ key: 'quality', description: 'artifact quality 0..1' }], + score({ artifact }) { + return { dimensions: { quality: artifact.quality }, composite: artifact.quality, notes: '' } + }, +} + +const scenarios: SumScenario[] = [ + { id: 't1', kind: 'sum' }, + { id: 't2', kind: 'sum' }, +] +const holdoutScenarios: SumScenario[] = [ + { id: 'h1', kind: 'sum' }, + { id: 'h2', kind: 'sum' }, +] + +const BASELINE = 'Summarize the text.' + +/** Deterministic driver — proposes exactly the candidate the test wants to + * measure, once. Stands in for `gepaDriver` so the loop runs with zero LLM. 
*/ +function fixedDriver(candidate: MutableSurface | ProposedCandidate): ImprovementDriver { + return { + kind: 'test-fixed', + async propose() { + return [candidate] + }, + } +} + +const baseOpts = { + runWithPrompt, + scenarios, + holdoutScenarios, + judges: [qualityJudge], + baselinePrompt: BASELINE, + populationSize: 1, + maxGenerations: 1, + promoteTopK: 1, + deltaThreshold: 0.1, + seed: 7, +} + +describe('optimizePrompt — identity-gated prompt optimization', () => { + it('keeps the baseline (identity) when no candidate beats it on holdout', async () => { + // Candidate is just as vague as the baseline → no held-out lift. + const result = await optimizePrompt({ + ...baseOpts, + runDir: 'mem://optimize-identity', + storage: inMemoryCampaignStorage(), + driver: fixedDriver('Summarize the text concisely.'), + }) + + expect(result.improved).toBe(false) + expect(result.decision).not.toBe('ship') + expect(result.prompt).toBe(BASELINE) + expect(result.baselineComposite).toBeCloseTo(0.4, 6) + // No regression possible: the returned prompt is the untouched baseline. + expect(result.delta).toBeLessThan(0.1) + }) + + it('promotes a candidate that wins on holdout, returning the improved prompt', async () => { + const improvedPrompt = 'Summarize the text. Be PRECISE.' 
+ const result = await optimizePrompt({ + ...baseOpts, + runDir: 'mem://optimize-promote', + storage: inMemoryCampaignStorage(), + driver: fixedDriver({ + surface: improvedPrompt, + label: 'add precision', + rationale: 'precision lifts quality', + }), + }) + + expect(result.improved).toBe(true) + expect(result.decision).toBe('ship') + expect(result.prompt).toBe(improvedPrompt) + expect(result.winnerComposite).toBeCloseTo(0.9, 6) + expect(result.baselineComposite).toBeCloseTo(0.4, 6) + expect(result.delta).toBeGreaterThanOrEqual(0.1) + expect(result.rationale).toBe('precision lifts quality') + expect(result.diff).not.toBe('') + }) + + it('fails loud when neither reflection nor a driver is supplied', async () => { + await expect( + optimizePrompt({ + ...baseOpts, + runDir: 'mem://optimize-misconfig', + storage: inMemoryCampaignStorage(), + }), + ).rejects.toThrow(ConfigError) + }) + + it('fails loud on an empty holdout set (the gate needs it)', async () => { + await expect( + optimizePrompt({ + ...baseOpts, + holdoutScenarios: [], + runDir: 'mem://optimize-noholdout', + storage: inMemoryCampaignStorage(), + driver: fixedDriver('whatever'), + }), + ).rejects.toThrow(/holdoutScenarios/) + }) +}) From d8c237ef62f28b6f394b6415440c33a1e70ad145 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 30 May 2026 19:03:03 -0600 Subject: [PATCH 7/7] =?UTF-8?q?chore(release):=200.33.0=20=E2=80=94=20dyna?= =?UTF-8?q?mic=20loop=20driver=20+=20identity-gated=20optimizePrompt=20(#7?= =?UTF-8?q?5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index a2f128b..c190a6e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.32.0", + "version": "0.33.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": 
"https://github.com/tangle-network/agent-runtime#readme", "repository": {