Skip to content

Commit fcf6c30

Browse files
christso and claude authored
feat(core): add fallback targets for provider errors (#905)
* feat(core): add fallback targets for provider errors

  When a primary target fails with retryable errors after exhausting retries, the runner now tries fallback_targets in order. The result JSONL records which target actually served the response via target_used.

  Closes #899

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor(core): address review feedback on fallback targets

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 34e8e83 commit fcf6c30

5 files changed

Lines changed: 398 additions & 18 deletions

File tree

packages/core/src/evaluation/orchestrator.ts

Lines changed: 37 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1593,6 +1593,8 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
15931593
let attempt = 0;
15941594
let providerResponse: ProviderResponse | undefined = cachedResponse;
15951595
let lastError: unknown;
1596+
/** Set when a fallback target actually served the response. */
1597+
let targetUsed: string | undefined;
15961598

15971599
while (!providerResponse && attempt < attemptBudget) {
15981600
try {
@@ -1616,25 +1618,36 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
16161618
attempt += 1;
16171619
continue;
16181620
}
1619-
// On error, keep workspace for debugging (unless forceCleanup is set)
1620-
const errorResult = buildErrorResult(
1621-
evalCase,
1622-
target.name,
1623-
nowFn(),
1624-
error,
1625-
promptInputs,
1626-
provider,
1627-
'agent',
1628-
'provider_error',
1629-
verbose,
1630-
);
1631-
if (workspacePath) {
1632-
if (forceCleanup) {
1633-
await cleanupWorkspace(workspacePath).catch(() => {});
1634-
}
1635-
return { ...errorResult, workspacePath };
1621+
break; // Exhausted retries on primary — try fallback targets below
1622+
}
1623+
}
1624+
1625+
// Try fallback targets in order after exhausting retries on the primary
1626+
if (!providerResponse && target.fallbackTargets?.length && targetResolver) {
1627+
for (const fallbackName of target.fallbackTargets) {
1628+
const fallbackProvider = targetResolver(fallbackName);
1629+
if (!fallbackProvider) {
1630+
continue;
1631+
}
1632+
try {
1633+
providerResponse = await invokeProvider(fallbackProvider, {
1634+
evalCase: evalCase,
1635+
target,
1636+
promptInputs,
1637+
attempt: 0,
1638+
agentTimeoutMs,
1639+
signal,
1640+
cwd: workspacePath,
1641+
workspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
1642+
captureFileChanges: !!baselineCommit,
1643+
streamCallbacks: options.streamCallbacks,
1644+
});
1645+
targetUsed = fallbackName;
1646+
break; // Fallback succeeded
1647+
} catch (error) {
1648+
lastError = error;
1649+
// Continue to next fallback
16361650
}
1637-
return errorResult;
16381651
}
16391652
}
16401653

@@ -1812,9 +1825,13 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
18121825
? 'execution_error'
18131826
: classifyQualityStatus(result.score, caseThreshold);
18141827

1828+
// Include targetUsed only when a fallback target served the response
1829+
const targetUsedField = targetUsed ? { targetUsed } : {};
1830+
18151831
const finalResult = providerError
18161832
? {
18171833
...result,
1834+
...targetUsedField,
18181835
evalRun,
18191836
error: providerError,
18201837
executionStatus,
@@ -1828,6 +1845,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
18281845
: skippedEvaluatorError
18291846
? {
18301847
...result,
1848+
...targetUsedField,
18311849
score: 0,
18321850
evalRun,
18331851
error: skippedEvaluatorError,
@@ -1841,6 +1859,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
18411859
}
18421860
: {
18431861
...result,
1862+
...targetUsedField,
18441863
evalRun,
18451864
executionStatus,
18461865
beforeAllOutput,

packages/core/src/evaluation/providers/targets.ts

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -592,6 +592,11 @@ interface ResolvedTargetBase {
592592
* to force CLI invocation even in subagent mode.
593593
*/
594594
readonly subagentModeAllowed?: boolean;
595+
/**
596+
* Ordered list of target names to try when the primary target fails after
597+
* exhausting retries. Each fallback is attempted in order.
598+
*/
599+
readonly fallbackTargets?: readonly string[];
595600
}
596601

597602
export type ResolvedTarget =
@@ -642,6 +647,8 @@ export const COMMON_TARGET_SETTINGS = [
642647
'providerBatching',
643648
'subagent_mode_allowed',
644649
'subagentModeAllowed',
650+
'fallback_targets',
651+
'fallbackTargets',
645652
] as const;
646653

647654
const BASE_TARGET_SCHEMA = z
@@ -654,6 +661,8 @@ const BASE_TARGET_SCHEMA = z
654661
workspace_template: z.string().optional(),
655662
workspaceTemplate: z.string().optional(),
656663
subagent_mode_allowed: z.boolean().optional(),
664+
fallback_targets: z.array(z.string().min(1)).optional(),
665+
fallbackTargets: z.array(z.string().min(1)).optional(),
657666
})
658667
.passthrough();
659668

@@ -741,12 +750,14 @@ export function resolveTargetDefinition(
741750
);
742751

743752
// Shared base fields for all resolved targets
753+
const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
744754
const base = {
745755
name: parsed.name,
746756
graderTarget: parsed.grader_target ?? parsed.judge_target,
747757
workers: parsed.workers,
748758
providerBatching,
749759
subagentModeAllowed,
760+
...(fallbackTargets ? { fallbackTargets } : {}),
750761
} as const;
751762

752763
switch (provider) {

packages/core/src/evaluation/providers/types.ts

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -382,4 +382,7 @@ export interface TargetDefinition {
382382
readonly retryBackoffFactor?: number | unknown | undefined;
383383
readonly retry_status_codes?: unknown | undefined;
384384
readonly retryStatusCodes?: unknown | undefined;
385+
// Fallback targets for provider errors
386+
readonly fallback_targets?: readonly string[] | unknown | undefined;
387+
readonly fallbackTargets?: readonly string[] | unknown | undefined;
385388
}

packages/core/src/evaluation/types.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -900,6 +900,11 @@ export interface EvaluationResult {
900900
readonly score: number;
901901
readonly assertions: readonly AssertionEntry[];
902902
readonly target: string;
903+
/**
904+
* The target that actually served the response, when different from the
905+
* primary target. Present only when a fallback target was used.
906+
*/
907+
readonly targetUsed?: string;
903908
/** Token usage metrics from provider (optional) */
904909
readonly tokenUsage?: TokenUsage;
905910
/** Total cost in USD (optional, from provider) */

0 commit comments

Comments
 (0)