@@ -69,10 +69,13 @@ import { type ResolvedTarget, resolveTargetDefinition } from './providers/target
6969import type { TargetDefinition } from './providers/types.js' ;
7070import { INLINE_ASSERT_FN } from './registry/builtin-graders.js' ;
7171import type {
72+ ConversationAggregation ,
73+ ConversationTurn ,
7274 EvalTest ,
7375 EvaluationResult ,
7476 GraderConfig ,
7577 InlineAssertEvaluatorConfig ,
78+ WorkspaceHookConfig ,
7679} from './types.js' ;
7780import { loadTests } from './yaml-parser.js' ;
7881
@@ -85,8 +88,8 @@ export interface EvalTestInput {
8588 readonly id : string ;
8689 /** What the response should accomplish */
8790 readonly criteria ?: string ;
88- /** Input to the agent (string or message array) */
89- readonly input : string | readonly { role : string ; content : string } [ ] ;
91+ /** Input to the agent (string or message array). Omit when using turns[]. */
92+ readonly input ? : string | readonly { role : string ; content : string } [ ] ;
9093 /** Expected reference output (camelCase preferred) */
9194 readonly expectedOutput ?: string ;
9295 /** @deprecated Use `expectedOutput` instead */
@@ -95,6 +98,27 @@ export interface EvalTestInput {
9598 readonly assert ?: readonly AssertEntry [ ] ;
9699 /** Arbitrary metadata */
97100 readonly metadata ?: Record < string , unknown > ;
101+ /** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */
102+ readonly mode ?: 'conversation' ;
103+ /** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
104+ readonly turns ?: readonly ConversationTurnInput [ ] ;
105+ /** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */
106+ readonly aggregation ?: ConversationAggregation ;
107+ }
108+
/**
 * Programmatic-API shape of one turn in a multi-turn conversation evaluation.
 * Carries the fields of the YAML `turns` structure under camelCase names.
 */
export interface ConversationTurnInput {
  /** What is sent for this turn: a plain string or a list of role/content messages. */
  readonly input: string | ReadonlyArray<{ role: string; content: string }>;
  /** Reference output expected for this turn. */
  readonly expectedOutput?: string;
  /** @deprecated Use `expectedOutput` instead */
  readonly expected_output?: string;
  /** Assertions attached to this specific turn. */
  readonly assert?: ReadonlyArray<AssertEntry>;
}
99123
100124/**
@@ -162,6 +186,10 @@ export interface EvalConfig {
162186 readonly onResult ?: ( result : EvaluationResult ) => void ;
163187 /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
164188 readonly threshold ?: number ;
189+ /** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */
190+ readonly beforeAll ?: string | readonly string [ ] ;
191+ /** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */
192+ readonly budgetUsd ?: number ;
165193}
166194
167195/**
@@ -279,17 +307,27 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
279307 filter : config . filter ,
280308 } ) ;
281309 } else {
310+ // Build workspace config with before_all hook if beforeAll is provided
311+ const suiteWorkspace = config . beforeAll
312+ ? { hooks : { before_all : toBeforeAllHook ( config . beforeAll ) } }
313+ : undefined ;
314+
282315 // Inline mode: convert EvalTestInput[] to EvalTest[]
283316 evalCases = ( config . tests ?? [ ] ) . map ( ( test ) : EvalTest => {
284- const input =
285- typeof test . input === 'string'
286- ? ( [ { role : 'user' as const , content : test . input } ] as EvalTest [ 'input' ] )
287- : ( test . input as unknown as EvalTest [ 'input' ] ) ;
317+ // Conversation mode: use turns[] for input/question derivation
318+ const isConversation = test . mode === 'conversation' || ( test . turns && test . turns . length > 0 ) ;
319+
320+ if ( ! isConversation && ! test . input ) {
321+ throw new Error ( `Test '${ test . id } ': input is required for non-conversation tests` ) ;
322+ }
323+
324+ const input = isConversation
325+ ? toMessageArray ( test . turns ?. [ 0 ] ?. input ?? '' )
326+ : toMessageArray ( test . input ?? '' ) ;
288327
289- const question =
290- typeof test . input === 'string'
291- ? test . input
292- : ( test . input . find ( ( m ) => m . role === 'user' ) ?. content ?? '' ) ;
328+ const question = isConversation
329+ ? extractQuestion ( test . turns ?. [ 0 ] ?. input ?? '' )
330+ : extractQuestion ( test . input ?? '' ) ;
293331
294332 const expectedOutputValue = test . expectedOutput ?? test . expected_output ;
295333 const expectedOutput = expectedOutputValue
@@ -300,24 +338,19 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
300338
301339 // Convert inline assertions to evaluator config format
302340 const allAssertions = [ ...( test . assert ?? [ ] ) , ...( config . assert ?? [ ] ) ] ;
303- const assertConfigs = allAssertions . map ( ( entry , i ) => {
304- if ( typeof entry === 'function' ) {
305- // Wrap AssertFn as InlineAssertEvaluatorConfig with function attached via Symbol
306- const base : InlineAssertEvaluatorConfig = {
307- type : 'inline-assert' ,
308- name : `inline-assert-${ i } ` ,
309- } ;
310- return Object . assign ( base , {
311- [ INLINE_ASSERT_FN ] : entry as AssertFn ,
312- } ) as unknown as GraderConfig ;
313- }
314- const a = entry as EvalAssertionInput ;
315- const { type : rawType , ...rest } = a ;
341+ const assertConfigs = convertAssertions ( allAssertions ) ;
342+
343+ // Convert conversation turns if present — keep input/expected_output as
344+ // TestMessageContent (matching YAML parser behavior), not wrapped in message arrays.
345+ const turns : ConversationTurn [ ] | undefined = test . turns ?. map ( ( turn ) => {
346+ const turnExpected = turn . expectedOutput ?? turn . expected_output ;
316347 return {
317- ...rest ,
318- name : a . name ?? `${ rawType } _${ i } ` ,
319- type : mapAssertionType ( rawType ) ,
320- } as unknown as GraderConfig ;
348+ input : turn . input as ConversationTurn [ 'input' ] ,
349+ ...( turnExpected !== undefined && {
350+ expected_output : turnExpected as ConversationTurn [ 'expected_output' ] ,
351+ } ) ,
352+ assertions : turn . assert ? convertAssertions ( [ ...turn . assert ] ) : undefined ,
353+ } ;
321354 } ) ;
322355
323356 return {
@@ -330,6 +363,10 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
330363 file_paths : [ ] ,
331364 assertions : assertConfigs . length > 0 ? assertConfigs : undefined ,
332365 metadata : test . metadata ,
366+ ...( suiteWorkspace && { workspace : suiteWorkspace } ) ,
367+ ...( isConversation && { mode : 'conversation' as const } ) ,
368+ ...( turns && { turns } ) ,
369+ ...( test . aggregation && { aggregation : test . aggregation } ) ,
333370 } ;
334371 } ) ;
335372 }
@@ -348,6 +385,7 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
348385 filter : config . filter ,
349386 threshold : config . threshold ,
350387 evalCases,
388+ ...( config . budgetUsd !== undefined && { budgetUsd : config . budgetUsd } ) ,
351389 onResult : async ( result ) => {
352390 collectedResults . push ( result ) ;
353391 config . onResult ?.( result ) ;
@@ -363,6 +401,59 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
363401 } ;
364402}
365403
/**
 * Normalizes a flexible test input into the message-array shape of `EvalTest['input']`.
 *
 * @param input - A bare prompt string, or an existing list of role/content messages.
 * @returns A one-element array holding a `user` message when given a string;
 *   otherwise the very same array that was passed in, only re-typed (no copy is made).
 */
function toMessageArray(
  input: string | readonly { role: string; content: string }[],
): EvalTest['input'] {
  if (typeof input !== 'string') {
    // Already a message list: hand it back untouched, only the static type changes.
    return input as unknown as EvalTest['input'];
  }

  const userMessage = { role: 'user' as const, content: input };
  return [userMessage] as EvalTest['input'];
}
415+
/**
 * Derives the question text from a flexible test input.
 *
 * @param input - A bare prompt string, or a list of role/content messages.
 * @returns The string itself when given a string; otherwise the content of the
 *   first message whose role is `user`, or `''` when there is no such message.
 */
function extractQuestion(input: string | readonly { role: string; content: string }[]): string {
  if (typeof input !== 'string') {
    for (const message of input) {
      if (message.role === 'user') {
        // Only the first user message counts; later ones are ignored.
        return message.content ?? '';
      }
    }
    return '';
  }

  return input;
}
423+
/**
 * Builds the internal `WorkspaceHookConfig` for a programmatic `beforeAll` value.
 *
 * @param beforeAll - A shell command line as a single string, or an array of command tokens.
 * @returns A hook whose `command` is `['sh', '-c', <string>]` for string input,
 *   or a fresh copy of the token array for array input.
 */
function toBeforeAllHook(beforeAll: string | readonly string[]): WorkspaceHookConfig {
  if (typeof beforeAll === 'string') {
    // A plain string is treated as a shell command line and run through `sh -c`.
    return { command: ['sh', '-c', beforeAll] };
  }

  // Copy the tokens so the hook does not share the caller's array.
  return { command: Array.from(beforeAll) };
}
432+
/**
 * Turns a list of assert entries into `GraderConfig` objects, one per entry, in order.
 *
 * - A function entry becomes an `inline-assert` config named `inline-assert-<index>`,
 *   with the function itself stored under the `INLINE_ASSERT_FN` symbol key.
 * - Any other entry is treated as an `EvalAssertionInput`: its fields are copied,
 *   `name` falls back to `<type>_<index>` when absent, and `type` is passed
 *   through `mapAssertionType`.
 *
 * @param entries - Inline assert functions and/or assertion config objects.
 * @returns The converted grader configs; indices used in generated names are the
 *   positions within `entries`.
 */
function convertAssertions(entries: readonly AssertEntry[]): GraderConfig[] {
  // Wraps an inline function as an evaluator config carrying the function via Symbol.
  const wrapInlineFn = (fn: AssertFn, position: number): GraderConfig => {
    const inlineConfig: InlineAssertEvaluatorConfig = {
      type: 'inline-assert',
      name: `inline-assert-${position}`,
    };
    return Object.assign(inlineConfig, {
      [INLINE_ASSERT_FN]: fn,
    }) as unknown as GraderConfig;
  };

  // Copies a config-object entry, filling in a default name and mapping its type.
  const normalizeConfig = (assertion: EvalAssertionInput, position: number): GraderConfig => {
    const { type: userType, ...remaining } = assertion;
    return {
      ...remaining,
      name: assertion.name ?? `${userType}_${position}`,
      type: mapAssertionType(userType),
    } as unknown as GraderConfig;
  };

  return entries.map((entry, position) =>
    typeof entry === 'function'
      ? wrapInlineFn(entry as AssertFn, position)
      : normalizeConfig(entry as EvalAssertionInput, position),
  );
}
456+
366457/**
367458 * Map user-facing assertion type names to internal grader type names.
368459 * Handles snake_case to kebab-case normalization (e.g., 'llm_grader' -> 'llm-grader').
0 commit comments