Skip to content

Commit 491066f

Browse files
christso and Copilot authored
feat(core): rename total_budget_usd to budget_usd (#1114) (#1117)
Rename execution.total_budget_usd → execution.budget_usd (and totalBudgetUsd → budgetUsd) across schema, config-loader, yaml-parser, orchestrator, CLI, tests, and JSON schema reference.

The old key, in either its snake_case (total_budget_usd) or camelCase (totalBudgetUsd) spelling, is rejected at load time with a clear error message: "execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML."

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 28bd1b6 commit 491066f

9 files changed

Lines changed: 74 additions & 52 deletions

File tree

apps/cli/src/commands/eval/artifact-writer.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,9 @@ export function buildTestTargetKey(testId?: string, target?: string): string {
1515
}
1616

1717
// Deduplication helper — keeps the last entry per (test_id, target) pair.
18-
export function deduplicateByTestIdTarget(results: readonly EvaluationResult[]): EvaluationResult[] {
18+
export function deduplicateByTestIdTarget(
19+
results: readonly EvaluationResult[],
20+
): EvaluationResult[] {
1921
const seen = new Map<string, number>();
2022
for (let i = 0; i < results.length; i++) {
2123
seen.set(buildTestTargetKey(results[i].testId, results[i].target), i);

apps/cli/src/commands/eval/run-eval.ts

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -527,7 +527,7 @@ async function prepareFileMetadata(params: {
527527
readonly yamlWorkers?: number;
528528
readonly yamlCache?: boolean;
529529
readonly yamlCachePath?: string;
530-
readonly totalBudgetUsd?: number;
530+
readonly budgetUsd?: number;
531531
readonly failOnError?: FailOnError;
532532
readonly threshold?: number;
533533
readonly tags?: readonly string[];
@@ -654,7 +654,7 @@ async function prepareFileMetadata(params: {
654654
yamlWorkers: suite.workers,
655655
yamlCache: suite.cacheConfig?.enabled,
656656
yamlCachePath: suite.cacheConfig?.cachePath,
657-
totalBudgetUsd: suite.totalBudgetUsd,
657+
budgetUsd: suite.budgetUsd,
658658
failOnError: suite.failOnError,
659659
threshold: suite.threshold,
660660
tags: suite.metadata?.tags,
@@ -680,7 +680,7 @@ async function runSingleEvalFile(params: {
680680
readonly testCases: readonly EvalTest[];
681681
readonly trialsConfig?: TrialsConfig;
682682
readonly matrixMode?: boolean;
683-
readonly totalBudgetUsd?: number;
683+
readonly budgetUsd?: number;
684684
readonly failOnError?: FailOnError;
685685
readonly threshold?: number;
686686
readonly providerFactory?: (
@@ -706,7 +706,7 @@ async function runSingleEvalFile(params: {
706706
testCases,
707707
trialsConfig,
708708
matrixMode,
709-
totalBudgetUsd,
709+
budgetUsd,
710710
failOnError,
711711
providerFactory,
712712
} = params;
@@ -802,7 +802,7 @@ async function runSingleEvalFile(params: {
802802
workspacePath: options.workspacePath,
803803
keepWorkspaces: options.keepWorkspaces,
804804
trials: trialsConfig,
805-
totalBudgetUsd,
805+
budgetUsd,
806806
failOnError,
807807
graderTarget: options.graderTarget,
808808
model: options.model,
@@ -1166,7 +1166,7 @@ export async function runEvalCommand(
11661166
readonly yamlWorkers?: number;
11671167
readonly yamlCache?: boolean;
11681168
readonly yamlCachePath?: string;
1169-
readonly totalBudgetUsd?: number;
1169+
readonly budgetUsd?: number;
11701170
readonly failOnError?: FailOnError;
11711171
readonly threshold?: number;
11721172
readonly tags?: readonly string[];
@@ -1439,7 +1439,7 @@ export async function runEvalCommand(
14391439
testCases: filteredTestCases,
14401440
trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
14411441
matrixMode: targetPrep.selections.length > 1,
1442-
totalBudgetUsd: targetPrep.totalBudgetUsd,
1442+
budgetUsd: targetPrep.budgetUsd,
14431443
failOnError: targetPrep.failOnError,
14441444
threshold: resolvedThreshold,
14451445
providerFactory: transcriptProviderFactory,

packages/core/src/evaluation/loaders/config-loader.ts

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -394,14 +394,22 @@ export function extractCacheConfig(suite: JsonObject): CacheConfig | undefined {
394394
* Extract suite-level total budget from parsed eval suite's execution block.
395395
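 * Returns undefined when not specified or when the value is not a valid positive number; throws if the renamed key (total_budget_usd / totalBudgetUsd) is present.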
396396
*/
397-
export function extractTotalBudgetUsd(suite: JsonObject): number | undefined {
397+
export function extractBudgetUsd(suite: JsonObject): number | undefined {
398398
const execution = suite.execution;
399399
if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
400400
return undefined;
401401
}
402402

403403
const executionObj = execution as Record<string, unknown>;
404-
const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
404+
405+
// Reject the old key with a clear error
406+
if ('total_budget_usd' in executionObj || 'totalBudgetUsd' in executionObj) {
407+
throw new Error(
408+
'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.',
409+
);
410+
}
411+
412+
const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd;
405413

406414
if (rawBudget === undefined || rawBudget === null) {
407415
return undefined;
@@ -411,9 +419,7 @@ export function extractTotalBudgetUsd(suite: JsonObject): number | undefined {
411419
return rawBudget;
412420
}
413421

414-
logWarning(
415-
`Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`,
416-
);
422+
logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`);
417423
return undefined;
418424
}
419425

packages/core/src/evaluation/orchestrator.ts

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -413,7 +413,7 @@ export interface RunEvaluationOptions {
413413
/** Real-time observability callbacks passed to the provider */
414414
readonly streamCallbacks?: ProviderStreamCallbacks;
415415
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
416-
readonly totalBudgetUsd?: number;
416+
readonly budgetUsd?: number;
417417
/** Execution error tolerance: true halts on first error */
418418
readonly failOnError?: FailOnError;
419419
/** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
@@ -466,7 +466,7 @@ export async function runEvaluation(
466466
cleanupWorkspaces,
467467
trials,
468468
streamCallbacks,
469-
totalBudgetUsd,
469+
budgetUsd,
470470
failOnError,
471471
poolWorkspaces,
472472
poolMaxSlots: configPoolMaxSlots,
@@ -1162,7 +1162,7 @@ export async function runEvaluation(
11621162
workerIdByEvalId.set(evalCase.id, workerId);
11631163

11641164
// Check suite-level budget before dispatching
1165-
if (totalBudgetUsd !== undefined && budgetExhausted) {
1165+
if (budgetUsd !== undefined && budgetExhausted) {
11661166
const budgetResult: EvaluationResult = {
11671167
timestamp: (now ?? (() => new Date()))().toISOString(),
11681168
testId: evalCase.id,
@@ -1172,13 +1172,13 @@ export async function runEvaluation(
11721172
assertions: [],
11731173
output: [],
11741174
target: target.name,
1175-
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
1175+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
11761176
budgetExceeded: true,
11771177
executionStatus: 'execution_error',
11781178
failureStage: 'setup',
11791179
failureReasonCode: 'budget_exceeded',
11801180
executionError: {
1181-
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
1181+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
11821182
stage: 'setup',
11831183
},
11841184
};
@@ -1292,7 +1292,7 @@ export async function runEvaluation(
12921292
: await runEvalCase(runCaseOptions);
12931293

12941294
// Track suite-level budget
1295-
if (totalBudgetUsd !== undefined) {
1295+
if (budgetUsd !== undefined) {
12961296
// Sum all trial costs when trials are used, otherwise use trace cost
12971297
let caseCost: number | undefined;
12981298
if (result.trials && result.trials.length > 0) {
@@ -1305,7 +1305,7 @@ export async function runEvaluation(
13051305
}
13061306
if (caseCost !== undefined) {
13071307
cumulativeBudgetCost += caseCost;
1308-
if (cumulativeBudgetCost >= totalBudgetUsd) {
1308+
if (cumulativeBudgetCost >= budgetUsd) {
13091309
budgetExhausted = true;
13101310
}
13111311
}

packages/core/src/evaluation/validation/eval-file.schema.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -366,8 +366,8 @@ const ExecutionSchema = z.object({
366366
skip_defaults: z.boolean().optional(),
367367
cache: z.boolean().optional(),
368368
trials: TrialsSchema.optional(),
369-
total_budget_usd: z.number().min(0).optional(),
370-
totalBudgetUsd: z.number().min(0).optional(),
369+
budget_usd: z.number().min(0).optional(),
370+
budgetUsd: z.number().min(0).optional(),
371371
fail_on_error: FailOnErrorSchema.optional(),
372372
failOnError: FailOnErrorSchema.optional(),
373373
threshold: z.number().min(0).max(1).optional(),

packages/core/src/evaluation/yaml-parser.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,14 +8,14 @@ import { interpolateEnv } from './interpolation.js';
88
import { loadTestsFromAgentSkills } from './loaders/agent-skills-parser.js';
99
import { expandFileReferences, loadCasesFromFile } from './loaders/case-file-loader.js';
1010
import {
11+
extractBudgetUsd,
1112
extractCacheConfig,
1213
extractFailOnError,
1314
extractTargetFromSuite,
1415
extractTargetRefsFromSuite,
1516
extractTargetsFromSuite,
1617
extractTargetsFromTestCase,
1718
extractThreshold,
18-
extractTotalBudgetUsd,
1919
extractTrialsConfig,
2020
extractWorkersFromSuite,
2121
loadConfig,
@@ -203,7 +203,7 @@ export type EvalSuiteResult = {
203203
/** Suite-level metadata (name, description, version, etc.) */
204204
readonly metadata?: import('./metadata.js').EvalMetadata;
205205
/** Suite-level total cost budget in USD */
206-
readonly totalBudgetUsd?: number;
206+
readonly budgetUsd?: number;
207207
/** Execution error tolerance: true or false */
208208
readonly failOnError?: import('./types.js').FailOnError;
209209
/** Suite-level quality threshold (0-1) — suite fails if mean score is below */
@@ -243,7 +243,7 @@ export async function loadTestSuite(
243243
targetRefs: extractTargetRefsFromSuite(parsed),
244244
workers: extractWorkersFromSuite(parsed),
245245
cacheConfig: extractCacheConfig(parsed),
246-
totalBudgetUsd: extractTotalBudgetUsd(parsed),
246+
budgetUsd: extractBudgetUsd(parsed),
247247
...(metadata !== undefined && { metadata }),
248248
...(failOnError !== undefined && { failOnError }),
249249
...(threshold !== undefined && { threshold }),

packages/core/test/evaluation/loaders/config-loader.test.ts

Lines changed: 31 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,13 @@
11
import { describe, expect, it } from 'bun:test';
22

33
import {
4+
extractBudgetUsd,
45
extractFailOnError,
56
extractTargetFromSuite,
67
extractTargetRefsFromSuite,
78
extractTargetsFromSuite,
89
extractTargetsFromTestCase,
910
extractThreshold,
10-
extractTotalBudgetUsd,
1111
extractTrialsConfig,
1212
parseExecutionDefaults,
1313
parseResultsConfig,
@@ -380,40 +380,54 @@ describe('extractTargetsFromTestCase', () => {
380380
});
381381
});
382382

383-
describe('extractTotalBudgetUsd', () => {
383+
describe('extractBudgetUsd', () => {
384384
it('returns undefined when no execution block', () => {
385385
const suite: JsonObject = { tests: [] };
386-
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
386+
expect(extractBudgetUsd(suite)).toBeUndefined();
387387
});
388388

389-
it('returns undefined when no total_budget_usd in execution', () => {
389+
it('returns undefined when no budget_usd in execution', () => {
390390
const suite: JsonObject = { execution: { target: 'default' } };
391-
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
391+
expect(extractBudgetUsd(suite)).toBeUndefined();
392392
});
393393

394-
it('parses valid total_budget_usd (snake_case)', () => {
395-
const suite: JsonObject = { execution: { total_budget_usd: 10.0 } };
396-
expect(extractTotalBudgetUsd(suite)).toBe(10.0);
394+
it('parses valid budget_usd (snake_case)', () => {
395+
const suite: JsonObject = { execution: { budget_usd: 10.0 } };
396+
expect(extractBudgetUsd(suite)).toBe(10.0);
397397
});
398398

399-
it('parses valid totalBudgetUsd (camelCase)', () => {
400-
const suite: JsonObject = { execution: { totalBudgetUsd: 5.5 } };
401-
expect(extractTotalBudgetUsd(suite)).toBe(5.5);
399+
it('parses valid budgetUsd (camelCase)', () => {
400+
const suite: JsonObject = { execution: { budgetUsd: 5.5 } };
401+
expect(extractBudgetUsd(suite)).toBe(5.5);
402402
});
403403

404404
it('returns undefined for zero budget', () => {
405-
const suite: JsonObject = { execution: { total_budget_usd: 0 } };
406-
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
405+
const suite: JsonObject = { execution: { budget_usd: 0 } };
406+
expect(extractBudgetUsd(suite)).toBeUndefined();
407407
});
408408

409409
it('returns undefined for negative budget', () => {
410-
const suite: JsonObject = { execution: { total_budget_usd: -1 } };
411-
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
410+
const suite: JsonObject = { execution: { budget_usd: -1 } };
411+
expect(extractBudgetUsd(suite)).toBeUndefined();
412412
});
413413

414414
it('returns undefined for non-number budget', () => {
415-
const suite: JsonObject = { execution: { total_budget_usd: 'ten' } };
416-
expect(extractTotalBudgetUsd(suite)).toBeUndefined();
415+
const suite: JsonObject = { execution: { budget_usd: 'ten' } };
416+
expect(extractBudgetUsd(suite)).toBeUndefined();
417+
});
418+
419+
it('rejects old key total_budget_usd with a clear error', () => {
420+
const suite: JsonObject = { execution: { total_budget_usd: 10.0 } };
421+
expect(() => extractBudgetUsd(suite)).toThrow(
422+
'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.',
423+
);
424+
});
425+
426+
it('rejects old key totalBudgetUsd with a clear error', () => {
427+
const suite: JsonObject = { execution: { totalBudgetUsd: 10.0 } };
428+
expect(() => extractBudgetUsd(suite)).toThrow(
429+
'execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML.',
430+
);
417431
});
418432
});
419433

packages/core/test/evaluation/orchestrator.test.ts

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2507,7 +2507,7 @@ describe('workspace.template .code-workspace resolution', () => {
25072507
});
25082508

25092509
describe('suite-level total budget guardrail', () => {
2510-
it('completes normally when totalBudgetUsd is not set', async () => {
2510+
it('completes normally when budgetUsd is not set', async () => {
25112511
const provider: Provider = {
25122512
id: 'budget:mock',
25132513
kind: 'mock' as const,
@@ -2564,7 +2564,7 @@ describe('suite-level total budget guardrail', () => {
25642564
providerFactory: () => provider,
25652565
evaluators: evaluatorRegistry,
25662566
evalCases,
2567-
totalBudgetUsd: 10.0,
2567+
budgetUsd: 10.0,
25682568
});
25692569

25702570
expect(results).toHaveLength(2);
@@ -2598,7 +2598,7 @@ describe('suite-level total budget guardrail', () => {
25982598
providerFactory: () => provider,
25992599
evaluators: evaluatorRegistry,
26002600
evalCases,
2601-
totalBudgetUsd: 5.0,
2601+
budgetUsd: 5.0,
26022602
maxConcurrency: 1,
26032603
});
26042604

@@ -2647,7 +2647,7 @@ describe('suite-level total budget guardrail', () => {
26472647
providerFactory: () => provider,
26482648
evaluators: evaluatorRegistry,
26492649
evalCases,
2650-
totalBudgetUsd: 5.0,
2650+
budgetUsd: 5.0,
26512651
maxConcurrency: 1,
26522652
trials: { count: 2, strategy: 'pass_at_k' },
26532653
});

plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5015,11 +5015,11 @@
50155015
"required": ["count"],
50165016
"additionalProperties": false
50175017
},
5018-
"total_budget_usd": {
5018+
"budget_usd": {
50195019
"type": "number",
50205020
"minimum": 0
50215021
},
5022-
"totalBudgetUsd": {
5022+
"budgetUsd": {
50235023
"type": "number",
50245024
"minimum": 0
50255025
},
@@ -11543,11 +11543,11 @@
1154311543
"required": ["count"],
1154411544
"additionalProperties": false
1154511545
},
11546-
"total_budget_usd": {
11546+
"budget_usd": {
1154711547
"type": "number",
1154811548
"minimum": 0
1154911549
},
11550-
"totalBudgetUsd": {
11550+
"budgetUsd": {
1155111551
"type": "number",
1155211552
"minimum": 0
1155311553
},
@@ -15682,11 +15682,11 @@
1568215682
"required": ["count"],
1568315683
"additionalProperties": false
1568415684
},
15685-
"total_budget_usd": {
15685+
"budget_usd": {
1568615686
"type": "number",
1568715687
"minimum": 0
1568815688
},
15689-
"totalBudgetUsd": {
15689+
"budgetUsd": {
1569015690
"type": "number",
1569115691
"minimum": 0
1569215692
},

0 commit comments

Comments
 (0)