Skip to content

Commit 0ee2e93

Browse files
christso and Copilot authored
feat(core): add beforeAll, budgetUsd, turns, aggregation to programmatic API (#1115) (#1119)
Close the programmatic TS API gap by adding four YAML-first features to the public SDK types:

- EvalConfig.beforeAll: string | string[] — suite-level setup command
- EvalConfig.budgetUsd: number — cost cap passed to orchestrator
- EvalTestInput.turns: ConversationTurnInput[] — multi-turn conversations
- EvalTestInput.aggregation: ConversationAggregation — score strategy
- EvalTestInput.mode: "conversation" — inferred automatically from turns[]

New ConversationTurnInput type mirrors the YAML turn structure with camelCase naming. The input field on EvalTestInput is now optional (omit it when using turns[]). Includes 10 new tests, an advanced SDK example, and a full lint/build pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent a70ceac commit 0ee2e93

6 files changed

Lines changed: 440 additions & 27 deletions

File tree

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# SDK Programmatic API — Advanced

Demonstrates the advanced programmatic API features added in [#1115](https://github.com/anthropics/agentv/issues/1115):

- **`beforeAll`** — run setup commands before the suite starts
- **`budgetUsd`** — cap total LLM spend
- **`turns`** — multi-turn conversation evaluation
- **`aggregation`** — control how turn scores combine (`mean`, `min`, `max`)

## Run

```bash
bun run evaluate.ts
```

See also: [`sdk-programmatic-api`](../sdk-programmatic-api/) for the basic API.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/**
2+
* Advanced Programmatic API Example
3+
*
4+
* Demonstrates evaluate() with beforeAll, budgetUsd, multi-turn conversations,
5+
* and aggregation — all defined in TypeScript with full type safety.
6+
*
7+
* Run: bun run evaluate.ts
8+
*/
9+
import { evaluate } from '@agentv/core';
10+
11+
const { results, summary } = await evaluate({
12+
// Run a setup command before the suite starts
13+
beforeAll: 'echo "Setting up test environment"',
14+
15+
// Cap total LLM spend at $5
16+
budgetUsd: 5.0,
17+
18+
tests: [
19+
// Standard single-turn test (unchanged from basic API)
20+
{
21+
id: 'greeting',
22+
input: 'Say hello.',
23+
assert: [{ type: 'contains', value: 'Hello' }],
24+
},
25+
26+
// Multi-turn conversation test
27+
{
28+
id: 'multi-turn-memory',
29+
mode: 'conversation',
30+
turns: [
31+
{
32+
input: 'Hi, my name is Alice.',
33+
assert: [{ type: 'contains', value: 'Alice' }],
34+
},
35+
{
36+
input: 'What is my name?',
37+
expectedOutput: 'Your name is Alice.',
38+
assert: [{ type: 'contains', value: 'Alice' }],
39+
},
40+
],
41+
// Use weakest-link scoring: final score = lowest turn score
42+
aggregation: 'min',
43+
},
44+
],
45+
46+
onResult: (result) => {
47+
console.log(` ${result.testId}: score=${result.score.toFixed(2)}`);
48+
},
49+
});
50+
51+
console.log('\n--- Summary ---');
52+
console.log(`Total: ${summary.total}`);
53+
console.log(`Passed: ${summary.passed}`);
54+
console.log(`Failed: ${summary.failed}`);
55+
console.log(`Mean score: ${summary.meanScore.toFixed(2)}`);
56+
console.log(`Duration: ${summary.durationMs}ms`);
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"name": "agentv-example-sdk-programmatic-api-advanced",
3+
"private": true,
4+
"type": "module",
5+
"dependencies": {
6+
"@agentv/core": "file:../../../packages/core"
7+
}
8+
}

packages/core/src/evaluation/evaluate.ts

Lines changed: 118 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,13 @@ import { type ResolvedTarget, resolveTargetDefinition } from './providers/target
6969
import type { TargetDefinition } from './providers/types.js';
7070
import { INLINE_ASSERT_FN } from './registry/builtin-graders.js';
7171
import type {
72+
ConversationAggregation,
73+
ConversationTurn,
7274
EvalTest,
7375
EvaluationResult,
7476
GraderConfig,
7577
InlineAssertEvaluatorConfig,
78+
WorkspaceHookConfig,
7679
} from './types.js';
7780
import { loadTests } from './yaml-parser.js';
7881

@@ -85,8 +88,8 @@ export interface EvalTestInput {
8588
readonly id: string;
8689
/** What the response should accomplish */
8790
readonly criteria?: string;
88-
/** Input to the agent (string or message array) */
89-
readonly input: string | readonly { role: string; content: string }[];
91+
/** Input to the agent (string or message array). Omit when using turns[]. */
92+
readonly input?: string | readonly { role: string; content: string }[];
9093
/** Expected reference output (camelCase preferred) */
9194
readonly expectedOutput?: string;
9295
/** @deprecated Use `expectedOutput` instead */
@@ -95,6 +98,27 @@ export interface EvalTestInput {
9598
readonly assert?: readonly AssertEntry[];
9699
/** Arbitrary metadata */
97100
readonly metadata?: Record<string, unknown>;
101+
/** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */
102+
readonly mode?: 'conversation';
103+
/** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
104+
readonly turns?: readonly ConversationTurnInput[];
105+
/** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */
106+
readonly aggregation?: ConversationAggregation;
107+
}
108+
109+
/**
110+
* A single turn in a multi-turn conversation evaluation (programmatic API).
111+
* Mirrors the YAML `turns` structure with camelCase naming.
112+
*/
113+
export interface ConversationTurnInput {
114+
/** Input for this turn (string or message array) */
115+
readonly input: string | readonly { role: string; content: string }[];
116+
/** Expected reference output for this turn */
117+
readonly expectedOutput?: string;
118+
/** @deprecated Use `expectedOutput` instead */
119+
readonly expected_output?: string;
120+
/** Per-turn assertions (string criteria or grader config) */
121+
readonly assert?: readonly AssertEntry[];
98122
}
99123

100124
/**
@@ -162,6 +186,10 @@ export interface EvalConfig {
162186
readonly onResult?: (result: EvaluationResult) => void;
163187
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
164188
readonly threshold?: number;
189+
/** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */
190+
readonly beforeAll?: string | readonly string[];
191+
/** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */
192+
readonly budgetUsd?: number;
165193
}
166194

167195
/**
@@ -279,17 +307,27 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
279307
filter: config.filter,
280308
});
281309
} else {
310+
// Build workspace config with before_all hook if beforeAll is provided
311+
const suiteWorkspace = config.beforeAll
312+
? { hooks: { before_all: toBeforeAllHook(config.beforeAll) } }
313+
: undefined;
314+
282315
// Inline mode: convert EvalTestInput[] to EvalTest[]
283316
evalCases = (config.tests ?? []).map((test): EvalTest => {
284-
const input =
285-
typeof test.input === 'string'
286-
? ([{ role: 'user' as const, content: test.input }] as EvalTest['input'])
287-
: (test.input as unknown as EvalTest['input']);
317+
// Conversation mode: use turns[] for input/question derivation
318+
const isConversation = test.mode === 'conversation' || (test.turns && test.turns.length > 0);
319+
320+
if (!isConversation && !test.input) {
321+
throw new Error(`Test '${test.id}': input is required for non-conversation tests`);
322+
}
323+
324+
const input = isConversation
325+
? toMessageArray(test.turns?.[0]?.input ?? '')
326+
: toMessageArray(test.input ?? '');
288327

289-
const question =
290-
typeof test.input === 'string'
291-
? test.input
292-
: (test.input.find((m) => m.role === 'user')?.content ?? '');
328+
const question = isConversation
329+
? extractQuestion(test.turns?.[0]?.input ?? '')
330+
: extractQuestion(test.input ?? '');
293331

294332
const expectedOutputValue = test.expectedOutput ?? test.expected_output;
295333
const expectedOutput = expectedOutputValue
@@ -300,24 +338,19 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
300338

301339
// Convert inline assertions to evaluator config format
302340
const allAssertions = [...(test.assert ?? []), ...(config.assert ?? [])];
303-
const assertConfigs = allAssertions.map((entry, i) => {
304-
if (typeof entry === 'function') {
305-
// Wrap AssertFn as InlineAssertEvaluatorConfig with function attached via Symbol
306-
const base: InlineAssertEvaluatorConfig = {
307-
type: 'inline-assert',
308-
name: `inline-assert-${i}`,
309-
};
310-
return Object.assign(base, {
311-
[INLINE_ASSERT_FN]: entry as AssertFn,
312-
}) as unknown as GraderConfig;
313-
}
314-
const a = entry as EvalAssertionInput;
315-
const { type: rawType, ...rest } = a;
341+
const assertConfigs = convertAssertions(allAssertions);
342+
343+
// Convert conversation turns if present — keep input/expected_output as
344+
// TestMessageContent (matching YAML parser behavior), not wrapped in message arrays.
345+
const turns: ConversationTurn[] | undefined = test.turns?.map((turn) => {
346+
const turnExpected = turn.expectedOutput ?? turn.expected_output;
316347
return {
317-
...rest,
318-
name: a.name ?? `${rawType}_${i}`,
319-
type: mapAssertionType(rawType),
320-
} as unknown as GraderConfig;
348+
input: turn.input as ConversationTurn['input'],
349+
...(turnExpected !== undefined && {
350+
expected_output: turnExpected as ConversationTurn['expected_output'],
351+
}),
352+
assertions: turn.assert ? convertAssertions([...turn.assert]) : undefined,
353+
};
321354
});
322355

323356
return {
@@ -330,6 +363,10 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
330363
file_paths: [],
331364
assertions: assertConfigs.length > 0 ? assertConfigs : undefined,
332365
metadata: test.metadata,
366+
...(suiteWorkspace && { workspace: suiteWorkspace }),
367+
...(isConversation && { mode: 'conversation' as const }),
368+
...(turns && { turns }),
369+
...(test.aggregation && { aggregation: test.aggregation }),
333370
};
334371
});
335372
}
@@ -348,6 +385,7 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
348385
filter: config.filter,
349386
threshold: config.threshold,
350387
evalCases,
388+
...(config.budgetUsd !== undefined && { budgetUsd: config.budgetUsd }),
351389
onResult: async (result) => {
352390
collectedResults.push(result);
353391
config.onResult?.(result);
@@ -363,6 +401,59 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
363401
};
364402
}
365403

404+
/**
405+
* Convert a flexible input (string or message array) to the internal TestMessage[] format.
406+
*/
407+
function toMessageArray(
408+
input: string | readonly { role: string; content: string }[],
409+
): EvalTest['input'] {
410+
if (typeof input === 'string') {
411+
return [{ role: 'user' as const, content: input }] as EvalTest['input'];
412+
}
413+
return input as unknown as EvalTest['input'];
414+
}
415+
416+
/**
417+
* Extract the user-facing question string from a flexible input.
418+
*/
419+
function extractQuestion(input: string | readonly { role: string; content: string }[]): string {
420+
if (typeof input === 'string') return input;
421+
return input.find((m) => m.role === 'user')?.content ?? '';
422+
}
423+
424+
/**
425+
* Convert programmatic API beforeAll (string | string[]) to internal WorkspaceHookConfig.
426+
* Accepts a shell command string or an array of command tokens.
427+
*/
428+
function toBeforeAllHook(beforeAll: string | readonly string[]): WorkspaceHookConfig {
429+
const command = typeof beforeAll === 'string' ? ['sh', '-c', beforeAll] : [...beforeAll];
430+
return { command };
431+
}
432+
433+
/**
434+
* Convert an array of assert entries (inline functions or config objects) to GraderConfig[].
435+
*/
436+
function convertAssertions(entries: readonly AssertEntry[]): GraderConfig[] {
437+
return entries.map((entry, i) => {
438+
if (typeof entry === 'function') {
439+
const base: InlineAssertEvaluatorConfig = {
440+
type: 'inline-assert',
441+
name: `inline-assert-${i}`,
442+
};
443+
return Object.assign(base, {
444+
[INLINE_ASSERT_FN]: entry as AssertFn,
445+
}) as unknown as GraderConfig;
446+
}
447+
const a = entry as EvalAssertionInput;
448+
const { type: rawType, ...rest } = a;
449+
return {
450+
...rest,
451+
name: a.name ?? `${rawType}_${i}`,
452+
type: mapAssertionType(rawType),
453+
} as unknown as GraderConfig;
454+
});
455+
}
456+
366457
/**
367458
* Map user-facing assertion type names to internal grader type names.
368459
* Handles snake_case to kebab-case normalization (e.g., 'llm_grader' -> 'llm-grader').

packages/core/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ export * from './evaluation/orchestrator.js';
2828
export {
2929
evaluate,
3030
type AssertEntry,
31+
type ConversationTurnInput,
3132
type EvalConfig,
3233
type EvalTestInput,
3334
type EvalAssertionInput,

0 commit comments

Comments
 (0)