@@ -13,6 +13,7 @@ import {
1313 type OtelTraceExporter as OtelTraceExporterType ,
1414 type ResolvedTarget ,
1515 ResponseCache ,
16+ RunBudgetTracker ,
1617 type TrialsConfig ,
1718 runEvaluation as defaultRunEvaluation ,
1819 deriveCategory ,
@@ -119,6 +120,7 @@ interface NormalizedOptions {
119120 readonly excludeTags : readonly string [ ] ;
120121 readonly transcript ?: string ;
121122 readonly experiment ?: string ;
123+ readonly budgetUsd ?: number ;
122124}
123125
124126function normalizeBoolean ( value : unknown ) : boolean {
@@ -394,6 +396,7 @@ function normalizeOptions(
394396 excludeTags : normalizeStringArray ( rawOptions . excludeTag ) ,
395397 transcript : normalizeString ( rawOptions . transcript ) ,
396398 experiment : normalizeString ( rawOptions . experiment ) ,
399+ budgetUsd : normalizeOptionalNumber ( rawOptions . budgetUsd ) ,
397400 } satisfies NormalizedOptions ;
398401}
399402
@@ -734,6 +737,7 @@ async function runSingleEvalFile(params: {
734737 readonly trialsConfig ?: TrialsConfig ;
735738 readonly matrixMode ?: boolean ;
736739 readonly budgetUsd ?: number ;
740+ readonly runBudgetTracker ?: RunBudgetTracker ;
737741 readonly failOnError ?: FailOnError ;
738742 readonly threshold ?: number ;
739743 readonly providerFactory ?: (
@@ -760,6 +764,7 @@ async function runSingleEvalFile(params: {
760764 trialsConfig,
761765 matrixMode,
762766 budgetUsd,
767+ runBudgetTracker,
763768 failOnError,
764769 providerFactory,
765770 } = params ;
@@ -856,6 +861,7 @@ async function runSingleEvalFile(params: {
856861 keepWorkspaces : options . keepWorkspaces ,
857862 trials : trialsConfig ,
858863 budgetUsd,
864+ runBudgetTracker,
859865 failOnError,
860866 graderTarget : options . graderTarget ,
861867 model : options . model ,
@@ -940,6 +946,8 @@ export interface RunEvalResult {
940946 readonly thresholdFailed ?: boolean ;
941947 /** True when all tests had execution errors and no evaluation was performed */
942948 readonly allExecutionErrors ?: boolean ;
949+ /** True when --budget-usd was set and the run-level budget was exceeded */
950+ readonly budgetExceeded ?: boolean ;
943951}
944952
945953interface RemoteEvalSummaryInput {
@@ -1203,6 +1211,12 @@ export async function runEvalCommand(
12031211 const seenTestCases = new Set < string > ( ) ;
12041212 const displayIdTracker = createDisplayIdTracker ( ) ;
12051213
1214+ // Run-level budget tracker: caps total cost across all eval files in this run.
1215+ const runBudgetTracker = options . budgetUsd ? new RunBudgetTracker ( options . budgetUsd ) : undefined ;
1216+ if ( runBudgetTracker ) {
1217+ console . log ( `Run budget cap: $${ runBudgetTracker . budgetCapUsd . toFixed ( 2 ) } ` ) ;
1218+ }
1219+
12061220 // Each file gets the full worker budget — no splitting across files
12071221 const perFileWorkers = options . workers ;
12081222 const fileMetadata = new Map <
@@ -1420,6 +1434,35 @@ export async function runEvalCommand(
14201434 // workspace races without any grouping complexity.
14211435 try {
14221436 for ( const testFilePath of activeTestFiles ) {
1437+ // Run-level budget check: skip remaining files if budget exceeded
1438+ if ( runBudgetTracker ?. isExceeded ( ) ) {
1439+ const targetPrep = fileMetadata . get ( testFilePath ) ;
1440+ if ( ! targetPrep ) continue ;
1441+ const budgetMsg = `Run budget exceeded ($${ runBudgetTracker . currentCostUsd . toFixed ( 4 ) } / $${ runBudgetTracker . budgetCapUsd . toFixed ( 4 ) } )` ;
1442+ console . log ( `\n⚠ ${ budgetMsg } — skipping ${ path . basename ( testFilePath ) } ` ) ;
1443+ for ( const { selection } of targetPrep . selections ) {
1444+ const skippedResults : EvaluationResult [ ] = targetPrep . testCases . map ( ( testCase ) => ( {
1445+ timestamp : new Date ( ) . toISOString ( ) ,
1446+ testId : testCase . id ,
1447+ score : 0 ,
1448+ assertions : [ ] ,
1449+ output : [ ] ,
1450+ error : budgetMsg ,
1451+ budgetExceeded : true ,
1452+ executionStatus : 'execution_error' as const ,
1453+ failureStage : 'setup' as const ,
1454+ failureReasonCode : 'budget_exceeded' as const ,
1455+ executionError : { message : budgetMsg , stage : 'setup' as const } ,
1456+ target : selection . targetName ,
1457+ } ) ) ;
1458+ for ( const r of skippedResults ) {
1459+ await outputWriter . append ( r ) ;
1460+ }
1461+ allResults . push ( ...skippedResults ) ;
1462+ }
1463+ continue ;
1464+ }
1465+
14231466 const targetPrep = fileMetadata . get ( testFilePath ) ;
14241467 if ( ! targetPrep ) {
14251468 throw new Error ( `Missing metadata for ${ testFilePath } ` ) ;
@@ -1472,6 +1515,7 @@ export async function runEvalCommand(
14721515 trialsConfig : options . transcript ? undefined : targetPrep . trialsConfig ,
14731516 matrixMode : targetPrep . selections . length > 1 ,
14741517 budgetUsd : targetPrep . budgetUsd ,
1518+ runBudgetTracker,
14751519 failOnError : targetPrep . failOnError ,
14761520 threshold : resolvedThreshold ,
14771521 providerFactory : transcriptProviderFactory ?? targetPrep . providerFactory ,
@@ -1690,13 +1734,22 @@ export async function runEvalCommand(
16901734 ) ;
16911735 }
16921736
1737+ // Print run-level budget summary when exceeded
1738+ const runBudgetExceeded = runBudgetTracker ?. isExceeded ( ) ?? false ;
1739+ if ( runBudgetExceeded ) {
1740+ console . log (
1741+ `\n⚠ Run budget exceeded: $${ runBudgetTracker ?. currentCostUsd . toFixed ( 4 ) } spent of $${ runBudgetTracker ?. budgetCapUsd . toFixed ( 4 ) } cap` ,
1742+ ) ;
1743+ }
1744+
16931745 return {
16941746 executionErrorCount : summary . executionErrorCount ,
16951747 outputPath,
16961748 testFiles : activeTestFiles ,
16971749 target : options . target ,
16981750 thresholdFailed,
16991751 allExecutionErrors,
1752+ budgetExceeded : runBudgetExceeded || undefined ,
17001753 } ;
17011754 } finally {
17021755 unsubscribeCodexLogs ( ) ;
0 commit comments