Skip to content

Commit 28bd1b6

Browse files
christso and Copilot authored
feat(cli): incremental eval runs — resume, append, and aggregate (#1110)
* feat(cli): incremental eval runs — resume, append, and aggregate

  Add three related capabilities for incremental eval runs:

  1. `agentv eval aggregate <runDir>` subcommand
     - Reads index.jsonl, deduplicates by (test_id, target) keeping last entry
     - Recomputes benchmark.json and timing.json
     - Prints summary to stdout
  2. `--resume` flag on `agentv eval run`
     - Skips already-completed (non-error) tests
     - Appends new results to existing index.jsonl
     - Aggregates with deduplication at the end
  3. `--rerun-failed` flag on `agentv eval run`
     - Like --resume but only skips tests with execution_status "ok"
     - Reruns execution_error and quality_failure tests
     - New results replace old ones via last-entry-wins deduplication

  Key changes:
  - artifact-writer.ts: Add deduplicateByTestIdTarget(), aggregateRunDir(), writePerTestArtifacts()
  - jsonl-writer.ts: Support append mode (flags: "a")
  - output-writer.ts: Pass append option through
  - commands/aggregate.ts: New subcommand
  - commands/run.ts: Add --resume and --rerun-failed flags
  - run-eval.ts: Resume/rerun skip logic, append writer, aggregate after run

  Closes #1071

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix(cli): register aggregate in EVAL_SUBCOMMANDS for argv preprocessing

  Without this, `agentv eval aggregate <dir>` was rewritten to `agentv eval run aggregate <dir>` by preprocessArgv(), causing aggregate to be treated as an eval file path.

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix(cli): flush writer before summary & use full results for matrix display

  - Close outputWriter before reading index.jsonl for summary computation to avoid race condition with unflushed stream data
  - Use summaryResults (all deduplicated) instead of allResults (new only) for matrix summary in resume mode

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* refactor(cli): extract eval resume key helpers

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 04206fc commit 28bd1b6

9 files changed

Lines changed: 453 additions & 35 deletions

File tree

apps/cli/src/commands/eval/artifact-writer.ts

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,47 @@ import {
1010
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
1111
import { RESULT_INDEX_FILENAME } from './result-layout.js';
1212

13+
/**
 * Builds the composite key that identifies one (test id, target) pair.
 *
 * A missing test id or target is replaced by the literal `unknown`; the two
 * parts are joined with `::`.
 *
 * @param testId - Test identifier, if known.
 * @param target - Target name, if known.
 * @returns The key in the form `<testId>::<target>`.
 */
export function buildTestTargetKey(testId?: string, target?: string): string {
  const idPart = testId ?? 'unknown';
  const targetPart = target ?? 'unknown';
  return [idPart, targetPart].join('::');
}
16+
17+
/**
 * Deduplication helper — keeps the last entry per (test_id, target) pair.
 *
 * Entries are keyed with `buildTestTargetKey`. The surviving entries stay in
 * the order in which they appear in `results`; the input array is not mutated.
 *
 * @param results - Results in the order they were recorded.
 * @returns A new array holding only the final entry for each key.
 */
export function deduplicateByTestIdTarget(results: readonly EvaluationResult[]): EvaluationResult[] {
  // First pass: later entries overwrite earlier ones, so each key ends up
  // mapped to the index of its last occurrence.
  const lastIndexByKey = new Map<string, number>();
  results.forEach((entry, position) => {
    lastIndexByKey.set(buildTestTargetKey(entry.testId, entry.target), position);
  });

  // Second pass: keep an entry only when it sits at that last index.
  return results.filter(
    (entry, position) =>
      lastIndexByKey.get(buildTestTargetKey(entry.testId, entry.target)) === position,
  );
}
32+
33+
/**
 * Rebuilds the aggregate artifacts of a run directory from its result index.
 *
 * Reads the index file (`RESULT_INDEX_FILENAME`) inside `runDir`, parses it
 * with `parseJsonlResults`, keeps only the last entry per (test id, target)
 * pair, and then writes `timing.json` followed by `benchmark.json` into
 * `runDir`.
 *
 * @param runDir - Directory that contains the result index.
 * @param options - Optional `evalFile` and `experiment` values, forwarded to
 *   `buildBenchmarkArtifact`.
 * @returns The paths of the two written files, the number of deduplicated
 *   results, and the number of distinct targets (a missing target is counted
 *   as `unknown`).
 */
export async function aggregateRunDir(
  runDir: string,
  options?: { evalFile?: string; experiment?: string },
): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
  // Both artifacts are serialized the same way: 2-space JSON plus a trailing newline.
  const toJsonText = (value: unknown): string => `${JSON.stringify(value, null, 2)}\n`;

  const rawIndex = await readFile(path.join(runDir, RESULT_INDEX_FILENAME), 'utf8');
  const results = deduplicateByTestIdTarget(parseJsonlResults(rawIndex));

  const timingPath = path.join(runDir, 'timing.json');
  await writeFile(timingPath, toJsonText(buildTimingArtifact(results)), 'utf8');

  const benchmarkPath = path.join(runDir, 'benchmark.json');
  await writeFile(
    benchmarkPath,
    toJsonText(buildBenchmarkArtifact(results, options?.evalFile, options?.experiment)),
    'utf8',
  );

  const distinctTargets = new Set<string>();
  for (const entry of results) {
    distinctTargets.add(entry.target ?? 'unknown');
  }

  return {
    benchmarkPath,
    timingPath,
    testCount: results.length,
    targetCount: distinctTargets.size,
  };
}
53+
1354
// ---------------------------------------------------------------------------
1455
// Artifact interfaces (snake_case to match skill-creator conventions)
1556
// ---------------------------------------------------------------------------
@@ -737,6 +778,45 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri
737778
return lines.length > 0 ? `${lines.join('\n')}\n` : '';
738779
}
739780

781+
/**
 * Writes the per-test artifact files for every result under `outputDir`.
 *
 * For each result a subdirectory (named by `buildArtifactSubdir`) is created
 * and filled with:
 * - `grading.json` and `timing.json`, always;
 * - `input.md`, when `extractInput` returns a truthy value;
 * - `outputs/response.md`, when the result has a non-empty `output`.
 *
 * Results are processed one at a time, in array order.
 *
 * @param results - Results to write artifacts for.
 * @param outputDir - Root directory; created (recursively) if missing.
 * @param options - Accepted for the caller's convenience; not read by this function.
 */
export async function writePerTestArtifacts(
  results: readonly EvaluationResult[],
  outputDir: string,
  options?: { experiment?: string },
): Promise<void> {
  // JSON artifacts are written as 2-space JSON plus a trailing newline.
  const toJsonText = (value: unknown): string => `${JSON.stringify(value, null, 2)}\n`;

  await mkdir(outputDir, { recursive: true });

  for (const entry of results) {
    const gradingArtifact = buildGradingArtifact(entry);
    const timingArtifact = buildTimingArtifact([entry]);
    const subdirName = buildArtifactSubdir(entry);

    const testDir = path.join(outputDir, subdirName);
    await mkdir(testDir, { recursive: true });
    await writeFile(path.join(testDir, 'grading.json'), toJsonText(gradingArtifact), 'utf8');
    await writeFile(path.join(testDir, 'timing.json'), toJsonText(timingArtifact), 'utf8');

    const inputText = extractInput(entry);
    if (inputText) {
      await writeFile(path.join(testDir, 'input.md'), inputText, 'utf8');
    }

    const responseOutput = entry.output;
    if (responseOutput && responseOutput.length > 0) {
      const outputsDir = path.join(testDir, 'outputs');
      await mkdir(outputsDir, { recursive: true });
      const responsePath = path.join(outputsDir, 'response.md');
      await writeFile(responsePath, formatOutputMarkdown(responseOutput), 'utf8');
    }
  }
}
819+
740820
export async function writeArtifactsFromResults(
741821
results: readonly EvaluationResult[],
742822
outputDir: string,
apps/cli/src/commands/eval/commands/aggregate.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import path from 'node:path';
2+
import { command, positional, string } from 'cmd-ts';
3+
4+
import { aggregateRunDir } from '../artifact-writer.js';
5+
6+
/**
 * `aggregate` subcommand definition.
 *
 * Takes one positional argument (the run directory), resolves it to an
 * absolute path, runs `aggregateRunDir` on it, and prints a three-line
 * summary: the result/target counts and the paths of the two files reported
 * by `aggregateRunDir`.
 */
export const evalAggregateCommand = command({
  name: 'aggregate',
  description:
    'Recompute benchmark.json and timing.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.',
  args: {
    runDir: positional({
      type: string,
      displayName: 'run-dir',
      description: 'Path to a run directory containing index.jsonl',
    }),
  },
  handler: async (args) => {
    const summary = await aggregateRunDir(path.resolve(args.runDir));

    const reportLines = [
      `Aggregated ${summary.testCount} test result(s) across ${summary.targetCount} target(s)`,
      `  Benchmark: ${summary.benchmarkPath}`,
      `  Timing:    ${summary.timingPath}`,
    ];
    // One console.log call per line, as before.
    for (const line of reportLines) {
      console.log(line);
    }
  },
});

apps/cli/src/commands/eval/commands/run.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,16 @@ export const evalRunCommand = command({
160160
description:
161161
'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases',
162162
}),
163+
resume: flag({
164+
long: 'resume',
165+
description:
166+
'Resume an interrupted run: skip already-completed tests and append new results to --output dir',
167+
}),
168+
rerunFailed: flag({
169+
long: 'rerun-failed',
170+
description:
171+
'Rerun failed/errored tests while keeping passing results. Implies --resume semantics',
172+
}),
163173
strict: flag({
164174
long: 'strict',
165175
description: 'Exit with error on version mismatch (instead of warning)',
@@ -254,6 +264,8 @@ export const evalRunCommand = command({
254264
otelCaptureContent: args.otelCaptureContent,
255265
otelGroupTurns: args.otelGroupTurns,
256266
retryErrors: args.retryErrors,
267+
resume: args.resume,
268+
rerunFailed: args.rerunFailed,
257269
strict: args.strict,
258270
benchmarkJson: args.benchmarkJson,
259271
artifacts: args.artifacts,

apps/cli/src/commands/eval/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { subcommands } from 'cmd-ts';
22

3+
import { evalAggregateCommand } from './commands/aggregate.js';
34
import { evalAssertCommand } from './commands/assert.js';
45
import { evalRunCommand } from './commands/run.js';
56

@@ -9,5 +10,6 @@ export const evalCommand = subcommands({
910
cmds: {
1011
run: evalRunCommand,
1112
assert: evalAssertCommand,
13+
aggregate: evalAggregateCommand,
1214
},
1315
});

apps/cli/src/commands/eval/jsonl-writer.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@ export class JsonlWriter {
1515
this.stream = stream;
1616
}
1717

18-
/**
 * Opens a writer on `filePath`, creating the parent directory first.
 *
 * @param filePath - Destination file.
 * @param options - When `append` is truthy the stream is opened with flag
 *   `'a'`; otherwise it is opened with flag `'w'`.
 * @returns A `JsonlWriter` wrapping the newly created write stream.
 */
static async open(filePath: string, options?: { append?: boolean }): Promise<JsonlWriter> {
  const parentDir = path.dirname(filePath);
  await mkdir(parentDir, { recursive: true });

  const writeStream = createWriteStream(filePath, {
    flags: options?.append ? 'a' : 'w',
    encoding: 'utf8',
  });
  return new JsonlWriter(writeStream);
}
2324

apps/cli/src/commands/eval/output-writer.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ export interface WriterOptions {
2222
export async function createOutputWriter(
2323
filePath: string,
2424
format: OutputFormat,
25+
options?: { append?: boolean },
2526
): Promise<OutputWriter> {
2627
switch (format) {
2728
case 'jsonl':
28-
return JsonlWriter.open(filePath);
29+
return JsonlWriter.open(filePath, { append: options?.append });
2930
case 'yaml':
3031
return YamlWriter.open(filePath);
3132
case 'html':

0 commit comments

Comments
 (0)