Skip to content

Commit 0ee2e93

Browse files
christso and Copilot authored
feat(core): add beforeAll, budgetUsd, turns, aggregation to programmatic API (#1115) (#1119)
Close the programmatic TS API gap by adding four YAML-first features to the public SDK types:

- EvalConfig.beforeAll: string | string[] — suite-level setup command
- EvalConfig.budgetUsd: number — cost cap passed to orchestrator
- EvalTestInput.turns: ConversationTurnInput[] — multi-turn conversations
- EvalTestInput.aggregation: ConversationAggregation — score strategy
- EvalTestInput.mode: "conversation" — inferred automatically from turns[]

New ConversationTurnInput type mirrors the YAML turn structure with camelCase naming. The input field on EvalTestInput is now optional (omit it when using turns[]). Includes 10 new tests, an advanced SDK example, and a full lint/build pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent a70ceac commit 0ee2e93

6 files changed

Lines changed: 440 additions & 27 deletions

File tree

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# SDK Programmatic API — Advanced

Demonstrates the advanced programmatic API features added in [#1115](https://github.com/anthropics/agentv/issues/1115):

- **`beforeAll`** — run setup commands before the suite starts
- **`budgetUsd`** — cap total LLM spend
- **`turns`** — multi-turn conversation evaluation
- **`aggregation`** — control how turn scores combine (`mean`, `min`, `max`)

## Run

```bash
bun run evaluate.ts
```

See also: [`sdk-programmatic-api`](../sdk-programmatic-api/) for the basic API.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/**
2+
* Advanced Programmatic API Example
3+
*
4+
* Demonstrates evaluate() with beforeAll, budgetUsd, multi-turn conversations,
5+
* and aggregation — all defined in TypeScript with full type safety.
6+
*
7+
* Run: bun run evaluate.ts
8+
*/
9+
import { evaluate } from '@agentv/core';
10+
11+
const { results, summary } = await evaluate({
12+
// Run a setup command before the suite starts
13+
beforeAll: 'echo "Setting up test environment"',
14+
15+
// Cap total LLM spend at $5
16+
budgetUsd: 5.0,
17+
18+
tests: [
19+
// Standard single-turn test (unchanged from basic API)
20+
{
21+
id: 'greeting',
22+
input: 'Say hello.',
23+
assert: [{ type: 'contains', value: 'Hello' }],
24+
},
25+
26+
// Multi-turn conversation test
27+
{
28+
id: 'multi-turn-memory',
29+
mode: 'conversation',
30+
turns: [
31+
{
32+
input: 'Hi, my name is Alice.',
33+
assert: [{ type: 'contains', value: 'Alice' }],
34+
},
35+
{
36+
input: 'What is my name?',
37+
expectedOutput: 'Your name is Alice.',
38+
assert: [{ type: 'contains', value: 'Alice' }],
39+
},
40+
],
41+
// Use weakest-link scoring: final score = lowest turn score
42+
aggregation: 'min',
43+
},
44+
],
45+
46+
onResult: (result) => {
47+
console.log(` ${result.testId}: score=${result.score.toFixed(2)}`);
48+
},
49+
});
50+
51+
console.log('\n--- Summary ---');
52+
console.log(`Total: ${summary.total}`);
53+
console.log(`Passed: ${summary.passed}`);
54+
console.log(`Failed: ${summary.failed}`);
55+
console.log(`Mean score: ${summary.meanScore.toFixed(2)}`);
56+
console.log(`Duration: ${summary.durationMs}ms`);
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"name": "agentv-example-sdk-programmatic-api-advanced",
3+
"private": true,
4+
"type": "module",
5+
"dependencies": {
6+
"@agentv/core": "file:../../../packages/core"
7+
}
8+
}

packages/core/src/evaluation/evaluate.ts

Lines changed: 118 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,13 @@ import { type ResolvedTarget, resolveTargetDefinition } from './providers/target
6969
import type { TargetDefinition } from './providers/types.js';
7070
import { INLINE_ASSERT_FN } from './registry/builtin-graders.js';
7171
import type {
72+
ConversationAggregation,
73+
ConversationTurn,
7274
EvalTest,
7375
EvaluationResult,
7476
GraderConfig,
7577
InlineAssertEvaluatorConfig,
78+
WorkspaceHookConfig,
7679
} from './types.js';
7780
import { loadTests } from './yaml-parser.js';
7881

@@ -85,8 +88,8 @@ export interface EvalTestInput {
8588
readonly id: string;
8689
/** What the response should accomplish */
8790
readonly criteria?: string;
88-
/** Input to the agent (string or message array) */
89-
readonly input: string | readonly { role: string; content: string }[];
91+
/** Input to the agent (string or message array). Omit when using turns[]. */
92+
readonly input?: string | readonly { role: string; content: string }[];
9093
/** Expected reference output (camelCase preferred) */
9194
readonly expectedOutput?: string;
9295
/** @deprecated Use `expectedOutput` instead */
@@ -95,6 +98,27 @@ export interface EvalTestInput {
9598
readonly assert?: readonly AssertEntry[];
9699
/** Arbitrary metadata */
97100
readonly metadata?: Record<string, unknown>;
101+
/** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */
102+
readonly mode?: 'conversation';
103+
/** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
104+
readonly turns?: readonly ConversationTurnInput[];
105+
/** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */
106+
readonly aggregation?: ConversationAggregation;
107+
}
108+
109+
/**
110+
* A single turn in a multi-turn conversation evaluation (programmatic API).
111+
* Mirrors the YAML `turns` structure with camelCase naming.
112+
*/
113+
export interface ConversationTurnInput {
114+
/** Input for this turn (string or message array) */
115+
readonly input: string | readonly { role: string; content: string }[];
116+
/** Expected reference output for this turn */
117+
readonly expectedOutput?: string;
118+
/** @deprecated Use `expectedOutput` instead */
119+
readonly expected_output?: string;
120+
/** Per-turn assertions (string criteria or grader config) */
121+
readonly assert?: readonly AssertEntry[];
98122
}
99123

100124
/**
@@ -162,6 +186,10 @@ export interface EvalConfig {
162186
readonly onResult?: (result: EvaluationResult) => void;
163187
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
164188
readonly threshold?: number;
189+
/** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */
190+
readonly beforeAll?: string | readonly string[];
191+
/** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */
192+
readonly budgetUsd?: number;
165193
}
166194

167195
/**
@@ -279,17 +307,27 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
279307
filter: config.filter,
280308
});
281309
} else {
310+
// Build workspace config with before_all hook if beforeAll is provided
311+
const suiteWorkspace = config.beforeAll
312+
? { hooks: { before_all: toBeforeAllHook(config.beforeAll) } }
313+
: undefined;
314+
282315
// Inline mode: convert EvalTestInput[] to EvalTest[]
283316
evalCases = (config.tests ?? []).map((test): EvalTest => {
284-
const input =
285-
typeof test.input === 'string'
286-
? ([{ role: 'user' as const, content: test.input }] as EvalTest['input'])
287-
: (test.input as unknown as EvalTest['input']);
317+
// Conversation mode: use turns[] for input/question derivation
318+
const isConversation = test.mode === 'conversation' || (test.turns && test.turns.length > 0);
319+
320+
if (!isConversation && !test.input) {
321+
throw new Error(`Test '${test.id}': input is required for non-conversation tests`);
322+
}
323+
324+
const input = isConversation
325+
? toMessageArray(test.turns?.[0]?.input ?? '')
326+
: toMessageArray(test.input ?? '');
288327

289-
const question =
290-
typeof test.input === 'string'
291-
? test.input
292-
: (test.input.find((m) => m.role === 'user')?.content ?? '');
328+
const question = isConversation
329+
? extractQuestion(test.turns?.[0]?.input ?? '')
330+
: extractQuestion(test.input ?? '');
293331

294332
const expectedOutputValue = test.expectedOutput ?? test.expected_output;
295333
const expectedOutput = expectedOutputValue
@@ -300,24 +338,19 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
300338

301339
// Convert inline assertions to evaluator config format
302340
const allAssertions = [...(test.assert ?? []), ...(config.assert ?? [])];
303-
const assertConfigs = allAssertions.map((entry, i) => {
304-
if (typeof entry === 'function') {
305-
// Wrap AssertFn as InlineAssertEvaluatorConfig with function attached via Symbol
306-
const base: InlineAssertEvaluatorConfig = {
307-
type: 'inline-assert',
308-
name: `inline-assert-${i}`,
309-
};
310-
return Object.assign(base, {
311-
[INLINE_ASSERT_FN]: entry as AssertFn,
312-
}) as unknown as GraderConfig;
313-
}
314-
const a = entry as EvalAssertionInput;
315-
const { type: rawType, ...rest } = a;
341+
const assertConfigs = convertAssertions(allAssertions);
342+
343+
// Convert conversation turns if present — keep input/expected_output as
344+
// TestMessageContent (matching YAML parser behavior), not wrapped in message arrays.
345+
const turns: ConversationTurn[] | undefined = test.turns?.map((turn) => {
346+
const turnExpected = turn.expectedOutput ?? turn.expected_output;
316347
return {
317-
...rest,
318-
name: a.name ?? `${rawType}_${i}`,
319-
type: mapAssertionType(rawType),
320-
} as unknown as GraderConfig;
348+
input: turn.input as ConversationTurn['input'],
349+
...(turnExpected !== undefined && {
350+
expected_output: turnExpected as ConversationTurn['expected_output'],
351+
}),
352+
assertions: turn.assert ? convertAssertions([...turn.assert]) : undefined,
353+
};
321354
});
322355

323356
return {
@@ -330,6 +363,10 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
330363
file_paths: [],
331364
assertions: assertConfigs.length > 0 ? assertConfigs : undefined,
332365
metadata: test.metadata,
366+
...(suiteWorkspace && { workspace: suiteWorkspace }),
367+
...(isConversation && { mode: 'conversation' as const }),
368+
...(turns && { turns }),
369+
...(test.aggregation && { aggregation: test.aggregation }),
333370
};
334371
});
335372
}
@@ -348,6 +385,7 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
348385
filter: config.filter,
349386
threshold: config.threshold,
350387
evalCases,
388+
...(config.budgetUsd !== undefined && { budgetUsd: config.budgetUsd }),
351389
onResult: async (result) => {
352390
collectedResults.push(result);
353391
config.onResult?.(result);
@@ -363,6 +401,59 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
363401
};
364402
}
365403

404+
/**
405+
* Convert a flexible input (string or message array) to the internal TestMessage[] format.
406+
*/
407+
function toMessageArray(
408+
input: string | readonly { role: string; content: string }[],
409+
): EvalTest['input'] {
410+
if (typeof input === 'string') {
411+
return [{ role: 'user' as const, content: input }] as EvalTest['input'];
412+
}
413+
return input as unknown as EvalTest['input'];
414+
}
415+
416+
/**
417+
* Extract the user-facing question string from a flexible input.
418+
*/
419+
function extractQuestion(input: string | readonly { role: string; content: string }[]): string {
420+
if (typeof input === 'string') return input;
421+
return input.find((m) => m.role === 'user')?.content ?? '';
422+
}
423+
424+
/**
425+
* Convert programmatic API beforeAll (string | string[]) to internal WorkspaceHookConfig.
426+
* Accepts a shell command string or an array of command tokens.
427+
*/
428+
function toBeforeAllHook(beforeAll: string | readonly string[]): WorkspaceHookConfig {
429+
const command = typeof beforeAll === 'string' ? ['sh', '-c', beforeAll] : [...beforeAll];
430+
return { command };
431+
}
432+
433+
/**
434+
* Convert an array of assert entries (inline functions or config objects) to GraderConfig[].
435+
*/
436+
function convertAssertions(entries: readonly AssertEntry[]): GraderConfig[] {
437+
return entries.map((entry, i) => {
438+
if (typeof entry === 'function') {
439+
const base: InlineAssertEvaluatorConfig = {
440+
type: 'inline-assert',
441+
name: `inline-assert-${i}`,
442+
};
443+
return Object.assign(base, {
444+
[INLINE_ASSERT_FN]: entry as AssertFn,
445+
}) as unknown as GraderConfig;
446+
}
447+
const a = entry as EvalAssertionInput;
448+
const { type: rawType, ...rest } = a;
449+
return {
450+
...rest,
451+
name: a.name ?? `${rawType}_${i}`,
452+
type: mapAssertionType(rawType),
453+
} as unknown as GraderConfig;
454+
});
455+
}
456+
366457
/**
367458
* Map user-facing assertion type names to internal grader type names.
368459
* Handles snake_case to kebab-case normalization (e.g., 'llm_grader' -> 'llm-grader').

packages/core/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ export * from './evaluation/orchestrator.js';
2828
export {
2929
evaluate,
3030
type AssertEntry,
31+
type ConversationTurnInput,
3132
type EvalConfig,
3233
type EvalTestInput,
3334
type EvalAssertionInput,

0 commit comments

Comments
 (0)