Skip to content

Commit 0ff9c32

Browse files
christso and Copilot authored
feat(cli): add --budget-usd run-level cost cap (#1118)
* feat(cli): add --budget-usd run-level cost cap (#1113)

  Add a `--budget-usd` flag to `agentv run` that caps total cost across all eval files in a single invocation. When the cumulative cost exceeds the cap, remaining eval files are skipped with `budget_exceeded` results.

  Implementation:
  - New `RunBudgetTracker` class in packages/core for reusable budget tracking
  - CLI flag with validation (must be positive number)
  - Integrated into sequential file loop: costs accumulated after each file, budget checked before dispatching the next file
  - Per-suite `execution.budget_usd` still enforced within files by orchestrator
  - Exit code 1 when run-level budget is exceeded
  - Summary output shows cap and actual spend when exceeded

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix(cli): enforce run budget during orchestration

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 61725a1 commit 0ff9c32

9 files changed

Lines changed: 290 additions & 13 deletions

File tree

apps/cli/src/commands/eval/commands/run.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,12 @@ export const evalRunCommand = command({
209209
description:
210210
'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value',
211211
}),
212+
budgetUsd: option({
213+
type: optional(number),
214+
long: 'budget-usd',
215+
description:
216+
'Maximum total cost in USD across all eval files in this run. Stops dispatching new cases when exceeded.',
217+
}),
212218
tag: multioption({
213219
type: array(string),
214220
long: 'tag',
@@ -235,6 +241,10 @@ export const evalRunCommand = command({
235241
}
236242

237243
const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd());
244+
if (args.budgetUsd !== undefined && args.budgetUsd <= 0) {
245+
console.error('Error: --budget-usd must be a positive number.');
246+
process.exit(2);
247+
}
238248
const rawOptions: Record<string, unknown> = {
239249
target: args.target,
240250
targets: args.targets,
@@ -273,6 +283,7 @@ export const evalRunCommand = command({
273283
model: args.model,
274284
outputMessages: args.outputMessages,
275285
threshold: args.threshold,
286+
budgetUsd: args.budgetUsd,
276287
tag: args.tag,
277288
excludeTag: args.excludeTag,
278289
transcript: args.transcript,
@@ -281,6 +292,9 @@ export const evalRunCommand = command({
281292
if (result?.allExecutionErrors) {
282293
process.exit(2);
283294
}
295+
if (result?.budgetExceeded) {
296+
process.exit(1);
297+
}
284298
if (result?.thresholdFailed) {
285299
process.exit(1);
286300
}

apps/cli/src/commands/eval/run-eval.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
type OtelTraceExporter as OtelTraceExporterType,
1414
type ResolvedTarget,
1515
ResponseCache,
16+
RunBudgetTracker,
1617
type TrialsConfig,
1718
runEvaluation as defaultRunEvaluation,
1819
deriveCategory,
@@ -119,6 +120,7 @@ interface NormalizedOptions {
119120
readonly excludeTags: readonly string[];
120121
readonly transcript?: string;
121122
readonly experiment?: string;
123+
readonly budgetUsd?: number;
122124
}
123125

124126
function normalizeBoolean(value: unknown): boolean {
@@ -394,6 +396,7 @@ function normalizeOptions(
394396
excludeTags: normalizeStringArray(rawOptions.excludeTag),
395397
transcript: normalizeString(rawOptions.transcript),
396398
experiment: normalizeString(rawOptions.experiment),
399+
budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
397400
} satisfies NormalizedOptions;
398401
}
399402

@@ -734,6 +737,7 @@ async function runSingleEvalFile(params: {
734737
readonly trialsConfig?: TrialsConfig;
735738
readonly matrixMode?: boolean;
736739
readonly budgetUsd?: number;
740+
readonly runBudgetTracker?: RunBudgetTracker;
737741
readonly failOnError?: FailOnError;
738742
readonly threshold?: number;
739743
readonly providerFactory?: (
@@ -760,6 +764,7 @@ async function runSingleEvalFile(params: {
760764
trialsConfig,
761765
matrixMode,
762766
budgetUsd,
767+
runBudgetTracker,
763768
failOnError,
764769
providerFactory,
765770
} = params;
@@ -856,6 +861,7 @@ async function runSingleEvalFile(params: {
856861
keepWorkspaces: options.keepWorkspaces,
857862
trials: trialsConfig,
858863
budgetUsd,
864+
runBudgetTracker,
859865
failOnError,
860866
graderTarget: options.graderTarget,
861867
model: options.model,
@@ -940,6 +946,8 @@ export interface RunEvalResult {
940946
readonly thresholdFailed?: boolean;
941947
/** True when all tests had execution errors and no evaluation was performed */
942948
readonly allExecutionErrors?: boolean;
949+
/** True when --budget-usd was set and the run-level budget was exceeded */
950+
readonly budgetExceeded?: boolean;
943951
}
944952

945953
interface RemoteEvalSummaryInput {
@@ -1203,6 +1211,12 @@ export async function runEvalCommand(
12031211
const seenTestCases = new Set<string>();
12041212
const displayIdTracker = createDisplayIdTracker();
12051213

1214+
// Run-level budget tracker: caps total cost across all eval files in this run.
1215+
const runBudgetTracker = options.budgetUsd ? new RunBudgetTracker(options.budgetUsd) : undefined;
1216+
if (runBudgetTracker) {
1217+
console.log(`Run budget cap: $${runBudgetTracker.budgetCapUsd.toFixed(2)}`);
1218+
}
1219+
12061220
// Each file gets the full worker budget — no splitting across files
12071221
const perFileWorkers = options.workers;
12081222
const fileMetadata = new Map<
@@ -1420,6 +1434,35 @@ export async function runEvalCommand(
14201434
// workspace races without any grouping complexity.
14211435
try {
14221436
for (const testFilePath of activeTestFiles) {
1437+
// Run-level budget check: skip remaining files if budget exceeded
1438+
if (runBudgetTracker?.isExceeded()) {
1439+
const targetPrep = fileMetadata.get(testFilePath);
1440+
if (!targetPrep) continue;
1441+
const budgetMsg = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`;
1442+
console.log(`\n⚠ ${budgetMsg} — skipping ${path.basename(testFilePath)}`);
1443+
for (const { selection } of targetPrep.selections) {
1444+
const skippedResults: EvaluationResult[] = targetPrep.testCases.map((testCase) => ({
1445+
timestamp: new Date().toISOString(),
1446+
testId: testCase.id,
1447+
score: 0,
1448+
assertions: [],
1449+
output: [],
1450+
error: budgetMsg,
1451+
budgetExceeded: true,
1452+
executionStatus: 'execution_error' as const,
1453+
failureStage: 'setup' as const,
1454+
failureReasonCode: 'budget_exceeded' as const,
1455+
executionError: { message: budgetMsg, stage: 'setup' as const },
1456+
target: selection.targetName,
1457+
}));
1458+
for (const r of skippedResults) {
1459+
await outputWriter.append(r);
1460+
}
1461+
allResults.push(...skippedResults);
1462+
}
1463+
continue;
1464+
}
1465+
14231466
const targetPrep = fileMetadata.get(testFilePath);
14241467
if (!targetPrep) {
14251468
throw new Error(`Missing metadata for ${testFilePath}`);
@@ -1472,6 +1515,7 @@ export async function runEvalCommand(
14721515
trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
14731516
matrixMode: targetPrep.selections.length > 1,
14741517
budgetUsd: targetPrep.budgetUsd,
1518+
runBudgetTracker,
14751519
failOnError: targetPrep.failOnError,
14761520
threshold: resolvedThreshold,
14771521
providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory,
@@ -1690,13 +1734,22 @@ export async function runEvalCommand(
16901734
);
16911735
}
16921736

1737+
// Print run-level budget summary when exceeded
1738+
const runBudgetExceeded = runBudgetTracker?.isExceeded() ?? false;
1739+
if (runBudgetExceeded) {
1740+
console.log(
1741+
`\n⚠ Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`,
1742+
);
1743+
}
1744+
16931745
return {
16941746
executionErrorCount: summary.executionErrorCount,
16951747
outputPath,
16961748
testFiles: activeTestFiles,
16971749
target: options.target,
16981750
thresholdFailed,
16991751
allExecutionErrors,
1752+
budgetExceeded: runBudgetExceeded || undefined,
17001753
};
17011754
} finally {
17021755
unsubscribeCodexLogs();

apps/cli/test/eval.integration.test.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,4 +270,20 @@ describe('agentv eval CLI', () => {
270270
await rm(fixture.baseDir, { recursive: true, force: true });
271271
}
272272
});
273+
274+
it('passes run-level budget tracking through to the evaluator', async () => {
275+
const fixture = await createFixture();
276+
try {
277+
await runCli(fixture, ['eval', fixture.testFilePath, '--budget-usd', '0.5']);
278+
279+
const diagnostics = await readDiagnostics(fixture);
280+
expect(diagnostics).toMatchObject({
281+
budgetUsd: null,
282+
hasRunBudgetTracker: true,
283+
runBudgetCapUsd: 0.5,
284+
});
285+
} finally {
286+
await rm(fixture.baseDir, { recursive: true, force: true });
287+
}
288+
});
273289
});

apps/cli/test/fixtures/mock-run-evaluation.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ interface RunEvaluationOptionsLike {
1818
readonly filter?: string | readonly string[];
1919
readonly evalCases?: ReadonlyArray<unknown>;
2020
readonly verbose?: boolean;
21+
readonly budgetUsd?: number;
22+
readonly runBudgetTracker?: {
23+
readonly budgetCapUsd?: number;
24+
};
2125
readonly onResult?: (result: EvaluationResultLike) => Promise<void> | void;
2226
}
2327

@@ -82,6 +86,9 @@ async function maybeWriteDiagnostics(
8286
envSample: process.env.CLI_ENV_SAMPLE ?? null,
8387
envRootOnly: process.env.CLI_ENV_ROOT_ONLY ?? null,
8488
envLocalOnly: process.env.CLI_ENV_LOCAL_ONLY ?? null,
89+
budgetUsd: options.budgetUsd ?? null,
90+
hasRunBudgetTracker: options.runBudgetTracker !== undefined,
91+
runBudgetCapUsd: options.runBudgetTracker?.budgetCapUsd ?? null,
8592
evalCaseIds: Array.isArray(options.evalCases)
8693
? options.evalCases
8794
.map((evalCase) =>

packages/core/src/evaluation/orchestrator.ts

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import {
4242
isAgentProvider,
4343
} from './providers/types.js';
4444
import { createBuiltinRegistry, discoverAssertions, discoverGraders } from './registry/index.js';
45+
import type { RunBudgetTracker } from './run-budget-tracker.js';
4546
import {
4647
type TokenUsage,
4748
type TraceSummary,
@@ -414,6 +415,8 @@ export interface RunEvaluationOptions {
414415
readonly streamCallbacks?: ProviderStreamCallbacks;
415416
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
416417
readonly budgetUsd?: number;
418+
/** Run-level total cost tracker shared across multiple eval files/targets in one CLI invocation */
419+
readonly runBudgetTracker?: RunBudgetTracker;
417420
/** Execution error tolerance: true halts on first error */
418421
readonly failOnError?: FailOnError;
419422
/** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
@@ -467,6 +470,7 @@ export async function runEvaluation(
467470
trials,
468471
streamCallbacks,
469472
budgetUsd,
473+
runBudgetTracker,
470474
failOnError,
471475
poolWorkspaces,
472476
poolMaxSlots: configPoolMaxSlots,
@@ -1153,6 +1157,14 @@ export async function runEvaluation(
11531157
return { ok: allPassed, depResults };
11541158
}
11551159

/**
 * Returns the total cost in USD for one evaluated case, or `undefined` when
 * no usable cost was reported.
 *
 * - When the result carries trials, the cost is the sum of the per-trial
 *   `costUsd` values; a sum that is not greater than zero is reported as
 *   `undefined`.
 * - Otherwise the result's own `costUsd` is used.
 *
 * Non-finite values (`NaN`, `Infinity`) are treated as "no cost reported".
 * Budget accounting adds this value to running totals and compares them with
 * `>=`; a single `NaN` would turn the total into `NaN`, make every later
 * comparison false, and silently disable the budget.
 *
 * @param result - The completed evaluation result to inspect.
 * @returns The case cost in USD, or `undefined` if none is available.
 */
function extractEvaluationCostUsd(result: EvaluationResult): number | undefined {
  if (result.trials && result.trials.length > 0) {
    const trialCostSum = result.trials.reduce((sum, trial) => {
      const trialCost = trial.costUsd;
      // Skip missing or non-finite trial costs so one bad trial does not
      // discard the cost of the others.
      return typeof trialCost === 'number' && Number.isFinite(trialCost) ? sum + trialCost : sum;
    }, 0);
    return trialCostSum > 0 ? trialCostSum : undefined;
  }

  const caseCost = result.costUsd;
  return typeof caseCost === 'number' && Number.isFinite(caseCost) ? caseCost : undefined;
}
1167+
11561168
// Worker function: dispatches a single eval case with dependency context
11571169
async function dispatchTest(
11581170
evalCase: EvalTest,
@@ -1161,6 +1173,47 @@ export async function runEvaluation(
11611173
const workerId = nextWorkerId++;
11621174
workerIdByEvalId.set(evalCase.id, workerId);
11631175

1176+
// Check run-level budget before dispatching. This shared tracker spans all
1177+
// eval files/targets in the current CLI invocation, so queued cases stop once
1178+
// cumulative spend reaches the cap while already-running cases are allowed to finish.
1179+
if (runBudgetTracker?.isExceeded()) {
1180+
const budgetResult: EvaluationResult = {
1181+
timestamp: (now ?? (() => new Date()))().toISOString(),
1182+
testId: evalCase.id,
1183+
suite: evalCase.suite,
1184+
category: evalCase.category,
1185+
score: 0,
1186+
assertions: [],
1187+
output: [],
1188+
target: target.name,
1189+
error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
1190+
budgetExceeded: true,
1191+
executionStatus: 'execution_error',
1192+
failureStage: 'setup',
1193+
failureReasonCode: 'budget_exceeded',
1194+
executionError: {
1195+
message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
1196+
stage: 'setup',
1197+
},
1198+
};
1199+
1200+
if (onProgress) {
1201+
await onProgress({
1202+
workerId,
1203+
testId: evalCase.id,
1204+
status: 'failed',
1205+
completedAt: Date.now(),
1206+
error: budgetResult.error,
1207+
score: budgetResult.score,
1208+
executionStatus: budgetResult.executionStatus,
1209+
});
1210+
}
1211+
if (onResult) {
1212+
await onResult(budgetResult);
1213+
}
1214+
return budgetResult;
1215+
}
1216+
11641217
// Check suite-level budget before dispatching
11651218
if (budgetUsd !== undefined && budgetExhausted) {
11661219
const budgetResult: EvaluationResult = {
@@ -1291,24 +1344,17 @@ export async function runEvaluation(
12911344
? await runEvalCaseWithTrials(runCaseOptions, trials)
12921345
: await runEvalCase(runCaseOptions);
12931346

1294-
// Track suite-level budget
1295-
if (budgetUsd !== undefined) {
1296-
// Sum all trial costs when trials are used, otherwise use trace cost
1297-
let caseCost: number | undefined;
1298-
if (result.trials && result.trials.length > 0) {
1299-
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
1300-
if (trialCostSum > 0) {
1301-
caseCost = trialCostSum;
1302-
}
1303-
} else {
1304-
caseCost = result.costUsd;
1305-
}
1306-
if (caseCost !== undefined) {
1347+
const caseCost = extractEvaluationCostUsd(result);
1348+
if (caseCost !== undefined) {
1349+
if (budgetUsd !== undefined) {
13071350
cumulativeBudgetCost += caseCost;
13081351
if (cumulativeBudgetCost >= budgetUsd) {
13091352
budgetExhausted = true;
13101353
}
13111354
}
1355+
if (runBudgetTracker) {
1356+
runBudgetTracker.add(caseCost);
1357+
}
13121358
}
13131359

13141360
// Track fail_on_error
packages/core/src/evaluation/run-budget-tracker.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
/**
 * Tracks cumulative cost across all eval files in a single CLI run.
 *
 * The per-suite budget (`execution.budget_usd` in YAML) is enforced by the orchestrator
 * and caps spend within one eval file. This tracker provides a **run-level** cap that
 * spans all files in a single `agentv run` invocation.
 *
 * Usage:
 *   1. Instantiate with the cap from `--budget-usd`.
 *   2. Share the tracker with each orchestrator running in the invocation.
 *   3. After each completed case, call `add()` with that case's total cost.
 *   4. Before dispatching the next case or file, check `isExceeded()`.
 *
 * The cap is stored exactly as given; this class does not validate it, so callers
 * should pass a positive, finite number.
 *
 * Thread-safety note: AgentV mutates this tracker from async orchestration code, but all
 * updates occur on the JavaScript event loop. There is no shared-memory mutation across
 * threads, so simple cumulative accounting is sufficient here.
 */
export class RunBudgetTracker {
  private cumulative = 0;

  /**
   * @param capUsd - Run-level spending cap in USD.
   */
  constructor(private readonly capUsd: number) {}

  /**
   * Accumulate cost from a completed test or file.
   *
   * Non-finite (`NaN`, `Infinity`) and non-positive amounts are ignored. Adding
   * `NaN` would make the running total `NaN`, after which `isExceeded()` could
   * never return true again; a negative amount would reduce recorded spend.
   *
   * @param costUsd - Cost in USD to add to the running total.
   */
  add(costUsd: number): void {
    if (!Number.isFinite(costUsd) || costUsd <= 0) {
      return;
    }
    this.cumulative += costUsd;
  }

  /** True when cumulative cost meets or exceeds the cap. */
  isExceeded(): boolean {
    return this.cumulative >= this.capUsd;
  }

  /** Current accumulated cost. */
  get currentCostUsd(): number {
    return this.cumulative;
  }

  /** The configured cap. */
  get budgetCapUsd(): number {
    return this.capUsd;
  }
}

packages/core/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ export {
128128
type AssertionResult,
129129
} from './evaluation/graders/assertions.js';
130130
export { discoverGraders } from './evaluation/registry/grader-discovery.js';
131+
export { RunBudgetTracker } from './evaluation/run-budget-tracker.js';
131132

132133
// Import pipeline
133134
export * from './import/index.js';

0 commit comments

Comments
 (0)