Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-runtime",
"version": "0.31.0",
"version": "0.32.0",
"description": "Reusable runtime lifecycle for domain-specific agents.",
"homepage": "https://github.com/tangle-network/agent-runtime#readme",
"repository": {
Expand Down Expand Up @@ -75,11 +75,10 @@
"lint:fix": "biome check --write src tests examples",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"@tangle-network/agent-eval": "^0.54.0"
},
"dependencies": {},
"devDependencies": {
"@biomejs/biome": "^2.4.0",
"@tangle-network/agent-eval": "^0.61.0",
"@tangle-network/sandbox": "^0.4.0",
"@types/node": "^25.6.0",
"tsup": "^8.0.0",
Expand All @@ -101,6 +100,7 @@
"license": "MIT",
"packageManager": "pnpm@10.28.0",
"peerDependencies": {
"@tangle-network/agent-eval": ">=0.61.0 <1.0.0",
"@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0",
"@tangle-network/sandbox": ">=0.1.2 <0.5.0"
},
Expand Down
14 changes: 7 additions & 7 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions src/loops/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ export type { CreateRefineDriverOptions, RefineDecision } from './drivers/refine
export { createRefineDriver, refineWinnerIndex } from './drivers/refine'
export type { RunLoopOptions } from './run-loop'
export { runLoop } from './run-loop'
export { reportLoopUsage, type UsageSink } from './report-usage'
export {
loopCampaignDispatch,
loopDispatch,
type LoopDispatchOptions,
type LoopOptionsForDispatch,
} from './loop-dispatch'
export type {
AgentRunSpec,
DefaultVerdict,
Expand All @@ -42,6 +49,7 @@ export type {
LoopSandboxClient,
LoopSandboxPlacement,
LoopStartedPayload,
LoopTokenUsage,
LoopTraceEmitter,
LoopTraceEvent,
LoopWinner,
Expand Down
132 changes: 132 additions & 0 deletions src/loops/loop-dispatch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/**
* `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
*
* Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
* `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
* sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
* `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
* forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
* `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
* the third silent. The fleet's products skipped (c) and fell back to a
* `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
* to kill.
*
* `loopDispatch` collapses all three into one typed call:
*
* const dispatch = loopDispatch({
* sandboxClient,
* toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
* })
* await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
*
* Usage is reported automatically; trace events are forwarded automatically;
* the ctx is built automatically. The seam becomes impossible to mis-wire.
*
* Typed against the campaign `DispatchContext` (imported type-only from
* `@tangle-network/agent-eval/campaign`, so the import is erased at compile
* time) — a downward dependency, never an inversion.
*/

// agent-eval's AgentProfile (the eval-harness unit of variation, `model: string`)
// — NOT sandbox's AgentProfile. ProfileDispatchFn is keyed on the former.
import type { AgentProfile } from '@tangle-network/agent-eval'
import type {
CampaignTraceWriter,
DispatchContext,
DispatchFn,
ProfileDispatchFn,
Scenario,
} from '@tangle-network/agent-eval/campaign'
import { reportLoopUsage } from './report-usage'
import { type RunLoopOptions, runLoop } from './run-loop'
import type { LoopResult, LoopSandboxClient, LoopTraceEmitter } from './types'

/**
* `RunLoopOptions` with the `ctx` field removed.
*
* `runLoopForCell` builds `ctx` itself for every cell (sandbox client, abort
* signal, and optionally a trace emitter), so `toLoopOptions` callbacks return
* everything `runLoop` needs except that field.
*/
export type LoopOptionsForDispatch<Task, Output, Decision> = Omit<
RunLoopOptions<Task, Output, Decision>,
'ctx'
>

/**
* Configuration for `loopDispatch` (and, with a scenario-only `toLoopOptions`,
* for `loopCampaignDispatch`).
*
* `Task` / `Output` / `Decision` are the `runLoop` type parameters; `TScenario`
* is the campaign scenario type and `TArtifact` is what the dispatch returns
* for each cell.
*/
export interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
/** Sandbox client used for every cell's `runLoop`. Supplied once. */
sandboxClient: LoopSandboxClient
/** Build the per-cell runLoop options from the scenario (+ profile, when
* used with `runProfileMatrix`). Called once per cell, before `runLoop`. */
toLoopOptions: (
scenario: TScenario,
profile: AgentProfile,
) => LoopOptionsForDispatch<Task, Output, Decision>
/** Map the finished loop to the artifact the judges score. Default:
* `result.winner?.output`. A loop with no winner yields `undefined` (judges
* skip the cell) — but the loop's token usage is STILL reported, so the
* integrity guard sees real activity. Usage is reported before this mapper
* runs. */
toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact
/** Forward `loop.*` trace events into the campaign's scoped trace so loop
* spans correlate with the cell. Default true: only an explicit `false`
* disables forwarding (the loop then gets no trace emitter). */
forwardTrace?: boolean
/** Cost-meter source label for the loop's spend. Default `'loop'`. Passed as
* the `source` argument of `reportLoopUsage`. */
costSource?: string
}

/**
 * Adapt a campaign trace writer to the emitter shape `runLoop` consumes.
 *
 * Every emitted loop event becomes one span named after the event's `kind`.
 * The span's attributes are the event's `runId` and `timestamp` followed by
 * the payload fields (payload keys are spread last, so they take precedence on
 * a name clash). The span is ended immediately after it is opened.
 */
function campaignTraceToLoopEmitter(trace: CampaignTraceWriter): LoopTraceEmitter {
  return {
    emit: (event) => {
      const attributes = { runId: event.runId, timestamp: event.timestamp, ...event.payload }
      const span = trace.span(event.kind, attributes)
      span.end()
    },
  }
}

/**
 * Shared core of `loopDispatch` and `loopCampaignDispatch`: run one `runLoop`
 * for a single campaign cell and return the artifact for that cell.
 *
 * Steps, in order:
 *  1. ask `opts.toLoopOptions` for the cell's loop options,
 *  2. run the loop with a ctx built from `opts.sandboxClient`, the dispatch
 *     context's abort `signal`, and (unless `forwardTrace` is `false`) a trace
 *     emitter bridged from `ctx.trace`,
 *  3. report the loop's cost + token usage to `ctx.cost`,
 *  4. map the result through `opts.toArtifact`, defaulting to the winner's
 *     output.
 *
 * Usage is reported before the artifact is mapped, so it is recorded even when
 * the loop has no winner. A rejection from `runLoop` propagates to the caller
 * and nothing is reported for that cell.
 */
async function runLoopForCell<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
  opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>,
  scenario: TScenario,
  profile: AgentProfile,
  ctx: DispatchContext,
): Promise<TArtifact> {
  const cellLoopOptions = opts.toLoopOptions(scenario, profile)

  // Forwarding is on by default; only an explicit `false` switches it off.
  const traceEmitter =
    opts.forwardTrace !== false ? campaignTraceToLoopEmitter(ctx.trace) : undefined

  const result = await runLoop<Task, Output, Decision>({
    ...cellLoopOptions,
    ctx: { sandboxClient: opts.sandboxClient, signal: ctx.signal, traceEmitter },
  })

  reportLoopUsage(ctx.cost, result, opts.costSource ?? 'loop')

  const winnerOutput = (r: LoopResult<Task, Output, Decision>) => r.winner?.output as TArtifact
  const mapArtifact = opts.toArtifact ?? winnerOutput
  return mapArtifact(result)
}

/**
 * Build a `ProfileDispatchFn` for `runProfileMatrix`, where the profile is an
 * axis of the matrix.
 *
 * The returned function runs one `runLoop` per (profile, scenario) cell via
 * `runLoopForCell`, which also reports the loop's cost and token usage to the
 * cell's cost meter.
 *
 * @param opts Sandbox client, per-cell option builder, and optional artifact
 *   mapper / trace-forwarding / cost-source settings.
 * @returns A dispatch function resolving to the cell's artifact.
 */
export function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
  opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>,
): ProfileDispatchFn<TScenario, TArtifact> {
  return (agentProfile, cellScenario, dispatchCtx) => {
    // The dispatch signature is (profile, scenario); the shared core takes (scenario, profile).
    return runLoopForCell(opts, cellScenario, agentProfile, dispatchCtx)
  }
}

/**
 * Build a `DispatchFn` for `runCampaign`, which has no profile axis.
 *
 * The caller's `toLoopOptions` takes only the scenario and is invoked once per
 * cell. The shared core still requires a profile argument, so a fixed sentinel
 * profile (`id: 'loop-campaign'`) is passed for every cell; it is never
 * forwarded to the caller's `toLoopOptions`.
 *
 * @param opts Same settings as `loopDispatch`, except `toLoopOptions` receives
 *   the scenario alone.
 * @returns A dispatch function resolving to the cell's artifact.
 */
export function loopCampaignDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(
  opts: Omit<LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>, 'toLoopOptions'> & {
    toLoopOptions: (scenario: TScenario) => LoopOptionsForDispatch<Task, Output, Decision>
  },
): DispatchFn<TScenario, TArtifact> {
  // Re-shape the options for the shared core: same settings, but with a
  // two-argument `toLoopOptions` that ignores the profile it is handed.
  const coreOptions: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact> = {
    ...opts,
    toLoopOptions: (cellScenario) => opts.toLoopOptions(cellScenario),
  }
  // Placeholder profile, created once and shared by every cell.
  const placeholderProfile = { id: 'loop-campaign', model: 'n/a@loop-campaign' } as AgentProfile

  return (cellScenario, dispatchCtx) => {
    return runLoopForCell(coreOptions, cellScenario, placeholderProfile, dispatchCtx)
  }
}
41 changes: 41 additions & 0 deletions src/loops/report-usage.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/**
* Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix
* dispatch.
*
* `runProfileMatrix` (and `runCampaign`) run the backend-integrity guard over
* the token usage a dispatch reports through `ctx.cost`. A dispatch that wraps
* `runLoop` must forward the loop's cost AND token usage, or the guard reads
* the run as a stub and throws. `reportLoopUsage` is that one line:
*
* const dispatch: ProfileDispatchFn<S, A> = async (profile, scenario, ctx) => {
* const result = await runLoop({ ...optsFor(profile, scenario), ctx: loopCtx })
* reportLoopUsage(ctx.cost, result)
* return result.winner?.output as A
* }
*
* Typed structurally against the campaign `DispatchContext.cost` so this module
* stays free of an agent-eval import — it works with any cost meter exposing
* `observe` + `observeTokens`.
*/

import type { LoopResult } from './types'

/**
* The slice of an agent-eval campaign `DispatchContext.cost` this needs.
*
* Declared locally so this module needs no agent-eval import; any object with
* these two methods can be passed to `reportLoopUsage`.
*/
export interface UsageSink {
/** Called by `reportLoopUsage` with the loop's `costUsd` and the source label. */
observe(amountUsd: number, source: string): void
/** Called by `reportLoopUsage` with the loop's summed `input` / `output` token counts. */
observeTokens(usage: { input: number; output: number }): void
}

/**
 * Push a finished loop's totals into a campaign cost meter.
 *
 * Makes exactly two calls, in this order: `cost.observe(costUsd, source)` and
 * then `cost.observeTokens({ input, output })`. Only the `input` and `output`
 * counts are forwarded; a fresh object is passed, not the result's own
 * `tokenUsage` reference.
 *
 * @param cost Cost meter to report into (e.g. a dispatch context's `cost`).
 * @param result The loop result, or any object carrying its `costUsd` and
 *   `tokenUsage`.
 * @param source Label attached to the spend. Defaults to `'loop'`.
 */
export function reportLoopUsage<Task, Output, Decision>(
  cost: UsageSink,
  result: Pick<LoopResult<Task, Output, Decision>, 'costUsd' | 'tokenUsage'>,
  source = 'loop',
): void {
  const { costUsd, tokenUsage } = result
  cost.observe(costUsd, source)

  const { input, output } = tokenUsage
  cost.observeTokens({ input, output })
}
12 changes: 12 additions & 0 deletions src/loops/run-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ export async function runLoop<Task, Output, Decision>(
startedAt: now(),
endedAt: 0,
costUsd: 0,
tokenUsage: { input: 0, output: 0 },
})
}

Expand Down Expand Up @@ -288,6 +289,8 @@ async function executeIteration<Task, Output>(args: ExecuteIterationArgs<Task, O
const llmCall = extractLlmCallEvent(event, slot.agentRunName)
if (llmCall) {
slot.costUsd += llmCall.costUsd ?? 0
slot.tokenUsage.input += llmCall.tokensIn ?? 0
slot.tokenUsage.output += llmCall.tokensOut ?? 0
args.ctx.runHandle?.observe(llmCall)
}
}
Expand Down Expand Up @@ -405,12 +408,21 @@ function finalize<Task, Output, Decision>(
): LoopResult<Task, Output, Decision> {
const winner = (args.options.selectWinner ?? defaultSelectWinner)(args.iterations)
const costUsd = args.iterations.reduce((sum, iter) => sum + (iter.costUsd || 0), 0)
const tokenUsage = args.iterations.reduce(
(acc, iter) => {
acc.input += iter.tokenUsage?.input ?? 0
acc.output += iter.tokenUsage?.output ?? 0
return acc
},
{ input: 0, output: 0 },
)
const result: LoopResult<Task, Output, Decision> = {
decision: args.decision,
iterations: args.iterations,
winner,
durationMs: args.now() - args.startMs,
costUsd,
tokenUsage,
}
void emitTrace(args.options.ctx.traceEmitter, {
kind: 'loop.ended',
Expand Down
15 changes: 15 additions & 0 deletions src/loops/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,15 @@ export interface OutputAdapter<Output> {
parse(events: SandboxEvent[]): Output
}

/** LLM token usage. Structurally matches agent-eval's `RunTokenUsage` /
* `CampaignTokenUsage` ({ input, output }) so a loop result maps straight
* onto `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch — without
* which the backend-integrity guard reads the run as a stub. */
export interface LoopTokenUsage {
/** Running total of `tokensIn` from the `llm_call` events observed (a missing value counts as 0). */
input: number
/** Running total of `tokensOut` from the `llm_call` events observed (a missing value counts as 0). */
output: number
}

/** @experimental */
export interface Iteration<Task, Output> {
/** 0-based iteration index assigned by the kernel. */
Expand All @@ -105,6 +114,8 @@ export interface Iteration<Task, Output> {
startedAt: number
endedAt: number
costUsd: number
/** Summed LLM token usage across every `llm_call` event in this iteration. */
tokenUsage: LoopTokenUsage
}

/** @experimental */
Expand Down Expand Up @@ -144,6 +155,10 @@ export interface LoopResult<Task, Output, Decision> {
durationMs: number
/** Sum of every iteration's `costUsd`. */
costUsd: number
/** Sum of every iteration's token usage. Forward to
* `ctx.cost.observeTokens` in a `runProfileMatrix` dispatch so the
* integrity guard sees real LLM activity. */
tokenUsage: LoopTokenUsage
}

/**
Expand Down
Loading
Loading