feat(cli): add --budget-usd run-level cost cap (#1118)

christso · Copilot · web-flow · commit 0ff9c32d6dbc · 2026-04-16T14:21:06.000+10:00
* feat(cli): add --budget-usd run-level cost cap (#1113) Add a `--budget-usd` flag to `agentv run` that caps total cost across all eval files in a single invocation. When the cumulative cost exceeds the cap, remaining eval files are skipped with `budget_exceeded` results. Implementation: - New `RunBudgetTracker` class in packages/core for reusable budget tracking - CLI flag with validation (must be positive number) - Integrated into sequential file loop: costs accumulated after each file, budget checked before dispatching the next file - Per-suite `execution.budget_usd` still enforced within files by orchestrator - Exit code 1 when run-level budget is exceeded - Summary output shows cap and actual spend when exceeded Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * fix(cli): enforce run budget during orchestration Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -209,6 +209,12 @@ export const evalRunCommand = command({
       description:
         'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value',
     }),
+    budgetUsd: option({
+      type: optional(number),
+      long: 'budget-usd',
+      description:
+        'Maximum total cost in USD across all eval files in this run. Stops dispatching new cases when exceeded.',
+    }),
     tag: multioption({
       type: array(string),
       long: 'tag',
@@ -235,6 +241,10 @@ export const evalRunCommand = command({
     }
 
     const resolvedPaths = await resolveEvalPaths(args.evalPaths, process.cwd());
+    if (args.budgetUsd !== undefined && args.budgetUsd <= 0) {
+      console.error('Error: --budget-usd must be a positive number.');
+      process.exit(2);
+    }
     const rawOptions: Record<string, unknown> = {
       target: args.target,
       targets: args.targets,
@@ -273,6 +283,7 @@ export const evalRunCommand = command({
       model: args.model,
       outputMessages: args.outputMessages,
       threshold: args.threshold,
+      budgetUsd: args.budgetUsd,
       tag: args.tag,
       excludeTag: args.excludeTag,
       transcript: args.transcript,
@@ -281,6 +292,9 @@ export const evalRunCommand = command({
     if (result?.allExecutionErrors) {
       process.exit(2);
     }
+    if (result?.budgetExceeded) {
+      process.exit(1);
+    }
     if (result?.thresholdFailed) {
       process.exit(1);
     }
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -13,6 +13,7 @@ import {
   type OtelTraceExporter as OtelTraceExporterType,
   type ResolvedTarget,
   ResponseCache,
+  RunBudgetTracker,
   type TrialsConfig,
   runEvaluation as defaultRunEvaluation,
   deriveCategory,
@@ -119,6 +120,7 @@ interface NormalizedOptions {
   readonly excludeTags: readonly string[];
   readonly transcript?: string;
   readonly experiment?: string;
+  readonly budgetUsd?: number;
 }
 
 function normalizeBoolean(value: unknown): boolean {
@@ -394,6 +396,7 @@ function normalizeOptions(
     excludeTags: normalizeStringArray(rawOptions.excludeTag),
     transcript: normalizeString(rawOptions.transcript),
     experiment: normalizeString(rawOptions.experiment),
+    budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
   } satisfies NormalizedOptions;
 }
 
@@ -734,6 +737,7 @@ async function runSingleEvalFile(params: {
   readonly trialsConfig?: TrialsConfig;
   readonly matrixMode?: boolean;
   readonly budgetUsd?: number;
+  readonly runBudgetTracker?: RunBudgetTracker;
   readonly failOnError?: FailOnError;
   readonly threshold?: number;
   readonly providerFactory?: (
@@ -760,6 +764,7 @@ async function runSingleEvalFile(params: {
     trialsConfig,
     matrixMode,
     budgetUsd,
+    runBudgetTracker,
     failOnError,
     providerFactory,
   } = params;
@@ -856,6 +861,7 @@ async function runSingleEvalFile(params: {
     keepWorkspaces: options.keepWorkspaces,
     trials: trialsConfig,
     budgetUsd,
+    runBudgetTracker,
     failOnError,
     graderTarget: options.graderTarget,
     model: options.model,
@@ -940,6 +946,8 @@ export interface RunEvalResult {
   readonly thresholdFailed?: boolean;
   /** True when all tests had execution errors and no evaluation was performed */
   readonly allExecutionErrors?: boolean;
+  /** True when --budget-usd was set and the run-level budget was exceeded */
+  readonly budgetExceeded?: boolean;
 }
 
 interface RemoteEvalSummaryInput {
@@ -1203,6 +1211,12 @@ export async function runEvalCommand(
   const seenTestCases = new Set<string>();
   const displayIdTracker = createDisplayIdTracker();
 
+  // Run-level budget tracker: caps total cost across all eval files in this run.
+  const runBudgetTracker = options.budgetUsd ? new RunBudgetTracker(options.budgetUsd) : undefined;
+  if (runBudgetTracker) {
+    console.log(`Run budget cap: $${runBudgetTracker.budgetCapUsd.toFixed(2)}`);
+  }
+
   // Each file gets the full worker budget — no splitting across files
   const perFileWorkers = options.workers;
   const fileMetadata = new Map<
@@ -1420,6 +1434,35 @@ export async function runEvalCommand(
   // workspace races without any grouping complexity.
   try {
     for (const testFilePath of activeTestFiles) {
+      // Run-level budget check: skip remaining files if budget exceeded
+      if (runBudgetTracker?.isExceeded()) {
+        const targetPrep = fileMetadata.get(testFilePath);
+        if (!targetPrep) continue;
+        const budgetMsg = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`;
+        console.log(`\n⚠ ${budgetMsg} — skipping ${path.basename(testFilePath)}`);
+        for (const { selection } of targetPrep.selections) {
+          const skippedResults: EvaluationResult[] = targetPrep.testCases.map((testCase) => ({
+            timestamp: new Date().toISOString(),
+            testId: testCase.id,
+            score: 0,
+            assertions: [],
+            output: [],
+            error: budgetMsg,
+            budgetExceeded: true,
+            executionStatus: 'execution_error' as const,
+            failureStage: 'setup' as const,
+            failureReasonCode: 'budget_exceeded' as const,
+            executionError: { message: budgetMsg, stage: 'setup' as const },
+            target: selection.targetName,
+          }));
+          for (const r of skippedResults) {
+            await outputWriter.append(r);
+          }
+          allResults.push(...skippedResults);
+        }
+        continue;
+      }
+
       const targetPrep = fileMetadata.get(testFilePath);
       if (!targetPrep) {
         throw new Error(`Missing metadata for ${testFilePath}`);
@@ -1472,6 +1515,7 @@ export async function runEvalCommand(
               trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
               matrixMode: targetPrep.selections.length > 1,
               budgetUsd: targetPrep.budgetUsd,
+              runBudgetTracker,
               failOnError: targetPrep.failOnError,
               threshold: resolvedThreshold,
               providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory,
@@ -1690,13 +1734,22 @@ export async function runEvalCommand(
       );
     }
 
+    // Print run-level budget summary when exceeded
+    const runBudgetExceeded = runBudgetTracker?.isExceeded() ?? false;
+    if (runBudgetExceeded) {
+      console.log(
+        `\n⚠ Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`,
+      );
+    }
+
     return {
       executionErrorCount: summary.executionErrorCount,
       outputPath,
       testFiles: activeTestFiles,
       target: options.target,
       thresholdFailed,
       allExecutionErrors,
+      budgetExceeded: runBudgetExceeded || undefined,
     };
   } finally {
     unsubscribeCodexLogs();
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
@@ -270,4 +270,20 @@ describe('agentv eval CLI', () => {
       await rm(fixture.baseDir, { recursive: true, force: true });
     }
   });
+
+  // --budget-usd should arrive as a run-level tracker (cap 0.5); per-suite budgetUsd stays unset.
+  it('passes run-level budget tracking through to the evaluator', async () => {
+    const fx = await createFixture();
+    const expected = {
+      budgetUsd: null,
+      hasRunBudgetTracker: true,
+      runBudgetCapUsd: 0.5,
+    };
+    try {
+      await runCli(fx, ['eval', fx.testFilePath, '--budget-usd', '0.5']);
+      expect(await readDiagnostics(fx)).toMatchObject(expected);
+    } finally {
+      await rm(fx.baseDir, { recursive: true, force: true });
+    }
+  });
 });
diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts
@@ -18,6 +18,10 @@ interface RunEvaluationOptionsLike {
   readonly filter?: string | readonly string[];
   readonly evalCases?: ReadonlyArray<unknown>;
   readonly verbose?: boolean;
+  readonly budgetUsd?: number;
+  readonly runBudgetTracker?: {
+    readonly budgetCapUsd?: number;
+  };
   readonly onResult?: (result: EvaluationResultLike) => Promise<void> | void;
 }
 
@@ -82,6 +86,9 @@ async function maybeWriteDiagnostics(
     envSample: process.env.CLI_ENV_SAMPLE ?? null,
     envRootOnly: process.env.CLI_ENV_ROOT_ONLY ?? null,
     envLocalOnly: process.env.CLI_ENV_LOCAL_ONLY ?? null,
+    budgetUsd: options.budgetUsd ?? null,
+    hasRunBudgetTracker: options.runBudgetTracker !== undefined,
+    runBudgetCapUsd: options.runBudgetTracker?.budgetCapUsd ?? null,
     evalCaseIds: Array.isArray(options.evalCases)
       ? options.evalCases
           .map((evalCase) =>
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
@@ -42,6 +42,7 @@ import {
   isAgentProvider,
 } from './providers/types.js';
 import { createBuiltinRegistry, discoverAssertions, discoverGraders } from './registry/index.js';
+import type { RunBudgetTracker } from './run-budget-tracker.js';
 import {
   type TokenUsage,
   type TraceSummary,
@@ -414,6 +415,8 @@ export interface RunEvaluationOptions {
   readonly streamCallbacks?: ProviderStreamCallbacks;
   /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
   readonly budgetUsd?: number;
+  /** Run-level total cost tracker shared across multiple eval files/targets in one CLI invocation */
+  readonly runBudgetTracker?: RunBudgetTracker;
   /** Execution error tolerance: true halts on first error */
   readonly failOnError?: FailOnError;
   /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
@@ -467,6 +470,7 @@ export async function runEvaluation(
     trials,
     streamCallbacks,
     budgetUsd,
+    runBudgetTracker,
     failOnError,
     poolWorkspaces,
     poolMaxSlots: configPoolMaxSlots,
@@ -1153,6 +1157,14 @@ export async function runEvaluation(
       return { ok: allPassed, depResults };
     }
 
+    /** Case cost in USD: summed trial costs (only when > 0) if trials exist, else `result.costUsd`. */
+    function extractEvaluationCostUsd(result: EvaluationResult): number | undefined {
+      const trialRuns = result.trials ?? [];
+      if (trialRuns.length === 0) return result.costUsd;
+      const summedUsd = trialRuns.reduce((acc, trial) => acc + (trial.costUsd ?? 0), 0);
+      return summedUsd > 0 ? summedUsd : undefined;
+    }
+
     // Worker function: dispatches a single eval case with dependency context
     async function dispatchTest(
       evalCase: EvalTest,
@@ -1161,6 +1173,47 @@ export async function runEvaluation(
       const workerId = nextWorkerId++;
       workerIdByEvalId.set(evalCase.id, workerId);
 
+      // Check run-level budget before dispatching. This shared tracker spans all
+      // eval files/targets in the current CLI invocation, so queued cases stop once
+      // cumulative spend reaches the cap while already-running cases are allowed to finish.
+      if (runBudgetTracker?.isExceeded()) {
+        const budgetResult: EvaluationResult = {
+          timestamp: (now ?? (() => new Date()))().toISOString(),
+          testId: evalCase.id,
+          suite: evalCase.suite,
+          category: evalCase.category,
+          score: 0,
+          assertions: [],
+          output: [],
+          target: target.name,
+          error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
+          budgetExceeded: true,
+          executionStatus: 'execution_error',
+          failureStage: 'setup',
+          failureReasonCode: 'budget_exceeded',
+          executionError: {
+            message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
+            stage: 'setup',
+          },
+        };
+
+        if (onProgress) {
+          await onProgress({
+            workerId,
+            testId: evalCase.id,
+            status: 'failed',
+            completedAt: Date.now(),
+            error: budgetResult.error,
+            score: budgetResult.score,
+            executionStatus: budgetResult.executionStatus,
+          });
+        }
+        if (onResult) {
+          await onResult(budgetResult);
+        }
+        return budgetResult;
+      }
+
       // Check suite-level budget before dispatching
       if (budgetUsd !== undefined && budgetExhausted) {
         const budgetResult: EvaluationResult = {
@@ -1291,24 +1344,17 @@ export async function runEvaluation(
             ? await runEvalCaseWithTrials(runCaseOptions, trials)
             : await runEvalCase(runCaseOptions);
 
-        // Track suite-level budget
-        if (budgetUsd !== undefined) {
-          // Sum all trial costs when trials are used, otherwise use trace cost
-          let caseCost: number | undefined;
-          if (result.trials && result.trials.length > 0) {
-            const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
-            if (trialCostSum > 0) {
-              caseCost = trialCostSum;
-            }
-          } else {
-            caseCost = result.costUsd;
-          }
-          if (caseCost !== undefined) {
+        const caseCost = extractEvaluationCostUsd(result);
+        if (caseCost !== undefined) {
+          if (budgetUsd !== undefined) {
             cumulativeBudgetCost += caseCost;
             if (cumulativeBudgetCost >= budgetUsd) {
               budgetExhausted = true;
             }
           }
+          if (runBudgetTracker) {
+            runBudgetTracker.add(caseCost);
+          }
         }
 
         // Track fail_on_error
diff --git a/packages/core/src/evaluation/run-budget-tracker.ts b/packages/core/src/evaluation/run-budget-tracker.ts
@@ -0,0 +1,61 @@
+/**
+ * Tracks cumulative cost across all eval files in a single CLI run.
+ *
+ * The per-suite budget (`execution.budget_usd` in YAML) is enforced by the orchestrator
+ * and caps spend within one eval file. This tracker provides a **run-level** cap that
+ * spans all files in a single `agentv run` invocation.
+ *
+ * Usage:
+ * 1. Instantiate with the cap from `--budget-usd`.
+ * 2. Share the tracker with each orchestrator running in the invocation.
+ * 3. After each completed case, call `add()` with that case's total cost.
+ * 4. Before dispatching the next case or file, check `isExceeded()`.
+ *
+ * Invalid costs (NaN, infinite, or non-positive) passed to `add()` are ignored so that a
+ * single malformed cost report cannot corrupt the running total and silently disable the cap.
+ *
+ * Thread-safety note: AgentV mutates this tracker from async orchestration code, but all
+ * updates occur on the JavaScript event loop. There is no shared-memory mutation across
+ * threads, so simple cumulative accounting is sufficient here.
+ */
+export class RunBudgetTracker {
+  /** Sum of every valid cost recorded via `add()`, in USD. */
+  private cumulative = 0;
+
+  /**
+   * @param capUsd Run-level spend ceiling in USD. `isExceeded()` becomes true as soon as
+   *   the cumulative cost reaches this value.
+   */
+  constructor(private readonly capUsd: number) {}
+
+  /**
+   * Accumulate cost from a completed test or file.
+   *
+   * Non-finite and non-positive values are skipped: `cumulative + NaN` is NaN, and
+   * `NaN >= capUsd` is always false, so one bad value would otherwise turn the cap off for
+   * the rest of the run. Negative values are skipped so recorded spend can never decrease.
+   *
+   * @param costUsd Cost of the completed unit of work in USD.
+   */
+  add(costUsd: number): void {
+    if (!Number.isFinite(costUsd) || costUsd <= 0) {
+      return;
+    }
+    this.cumulative += costUsd;
+  }
+
+  /** True when cumulative cost meets or exceeds the cap. */
+  isExceeded(): boolean {
+    return this.cumulative >= this.capUsd;
+  }
+
+  /** Current accumulated cost. */
+  get currentCostUsd(): number {
+    return this.cumulative;
+  }
+
+  /** The configured cap. */
+  get budgetCapUsd(): number {
+    return this.capUsd;
+  }
+}
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
@@ -128,6 +128,7 @@ export {
   type AssertionResult,
 } from './evaluation/graders/assertions.js';
 export { discoverGraders } from './evaluation/registry/grader-discovery.js';
+export { RunBudgetTracker } from './evaluation/run-budget-tracker.js';
 
 // Import pipeline
 export * from './import/index.js';
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
diff --git a/packages/core/test/evaluation/run-budget-tracker.test.ts b/packages/core/test/evaluation/run-budget-tracker.test.ts