feat(cli): incremental eval runs — resume, append, and aggregate (#1110)

christso · Copilot · web-flow · commit 28bd1b665054 · 2026-04-16T08:05:03.000+10:00
* feat(cli): incremental eval runs — resume, append, and aggregate

  Add three related capabilities for incremental eval runs:

  1. `agentv eval aggregate <runDir>` subcommand
     - Reads index.jsonl, deduplicates by (test_id, target) keeping last entry
     - Recomputes benchmark.json and timing.json
     - Prints summary to stdout
  2. `--resume` flag on `agentv eval run`
     - Skips already-completed (non-error) tests
     - Appends new results to existing index.jsonl
     - Aggregates with deduplication at the end
  3. `--rerun-failed` flag on `agentv eval run`
     - Like --resume but only skips tests with execution_status "ok"
     - Reruns execution_error and quality_failure tests
     - New results replace old ones via last-entry-wins deduplication

  Key changes:
  - artifact-writer.ts: Add deduplicateByTestIdTarget(), aggregateRunDir(), writePerTestArtifacts()
  - jsonl-writer.ts: Support append mode (flags: "a")
  - output-writer.ts: Pass append option through
  - commands/aggregate.ts: New subcommand
  - commands/run.ts: Add --resume and --rerun-failed flags
  - run-eval.ts: Resume/rerun skip logic, append writer, aggregate after run

  Closes #1071

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix(cli): register aggregate in EVAL_SUBCOMMANDS for argv preprocessing

  Without this, `agentv eval aggregate <dir>` was rewritten to `agentv eval run aggregate <dir>` by preprocessArgv(), causing aggregate to be treated as an eval file path.

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* fix(cli): flush writer before summary & use full results for matrix display

  - Close outputWriter before reading index.jsonl for summary computation to avoid race condition with unflushed stream data
  - Use summaryResults (all deduplicated) instead of allResults (new only) for matrix summary in resume mode

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* refactor(cli): extract eval resume key helpers

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -10,6 +10,47 @@ import {
 import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
 import { RESULT_INDEX_FILENAME } from './result-layout.js';
 
+/**
+ * Builds the composite lookup key for a (test id, target) pair.
+ *
+ * Either part falls back to the literal `'unknown'` when it is `undefined`
+ * (or `null`), and the two parts are joined with `::`.
+ */
+export function buildTestTargetKey(testId?: string, target?: string): string {
+  const idPart = testId ?? 'unknown';
+  const targetPart = target ?? 'unknown';
+  return [idPart, targetPart].join('::');
+}
+
+/**
+ * Collapses duplicate results so that each (test_id, target) pair appears once.
+ *
+ * When a pair occurs several times, only its last occurrence is kept, and the
+ * surviving entries stay in the order they had in `results`.
+ *
+ * @param results - Results to deduplicate; the input array is not modified.
+ * @returns A new array holding the last entry for every (test_id, target) key.
+ */
+export function deduplicateByTestIdTarget(results: readonly EvaluationResult[]): EvaluationResult[] {
+  const keyOf = (entry: EvaluationResult): string => buildTestTargetKey(entry.testId, entry.target);
+
+  // First pass: remember the highest index at which each key occurs.
+  const lastIndexByKey = new Map<string, number>();
+  results.forEach((entry, index) => {
+    lastIndexByKey.set(keyOf(entry), index);
+  });
+
+  // Second pass: keep an entry only if it sits at its key's last index.
+  return results.filter((entry, index) => lastIndexByKey.get(keyOf(entry)) === index);
+}
+
+/**
+ * Rebuilds the run-level aggregate artifacts from a run directory's result index.
+ *
+ * Reads the index file (`RESULT_INDEX_FILENAME`) inside `runDir`, deduplicates
+ * its entries by (test_id, target) keeping the last one, then writes
+ * `timing.json` followed by `benchmark.json` into the same directory.
+ *
+ * @param runDir - Directory that contains the result index.
+ * @param options - Optional `evalFile` and `experiment` values forwarded to
+ *   `buildBenchmarkArtifact`.
+ * @returns Paths of the two written files, the number of deduplicated results
+ *   (`testCount`, i.e. one per (test_id, target) pair) and the number of
+ *   distinct targets (`targetCount`, where a missing target counts as
+ *   `'unknown'`).
+ * @throws Read and write failures are not caught here and reach the caller.
+ */
+export async function aggregateRunDir(
+  runDir: string,
+  options?: { evalFile?: string; experiment?: string },
+): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
+  // Both artifacts share the same on-disk shape: 2-space JSON plus a trailing newline.
+  const toPrettyJson = (value: unknown): string => `${JSON.stringify(value, null, 2)}\n`;
+
+  const rawIndex = await readFile(path.join(runDir, RESULT_INDEX_FILENAME), 'utf8');
+  const dedupedResults = deduplicateByTestIdTarget(parseJsonlResults(rawIndex));
+
+  const timingArtifact = buildTimingArtifact(dedupedResults);
+  const timingPath = path.join(runDir, 'timing.json');
+  await writeFile(timingPath, toPrettyJson(timingArtifact), 'utf8');
+
+  const benchmarkArtifact = buildBenchmarkArtifact(
+    dedupedResults,
+    options?.evalFile,
+    options?.experiment,
+  );
+  const benchmarkPath = path.join(runDir, 'benchmark.json');
+  await writeFile(benchmarkPath, toPrettyJson(benchmarkArtifact), 'utf8');
+
+  const distinctTargets = new Set<string>();
+  for (const entry of dedupedResults) {
+    distinctTargets.add(entry.target ?? 'unknown');
+  }
+
+  return {
+    benchmarkPath,
+    timingPath,
+    testCount: dedupedResults.length,
+    targetCount: distinctTargets.size,
+  };
+}
+
 // ---------------------------------------------------------------------------
 // Artifact interfaces (snake_case to match skill-creator conventions)
 // ---------------------------------------------------------------------------
@@ -737,6 +778,45 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri
   return lines.length > 0 ? `${lines.join('\n')}\n` : '';
 }
 
+/**
+ * Writes one artifact folder per result underneath `outputDir`.
+ *
+ * For every result, in order, the folder named by `buildArtifactSubdir(result)`
+ * is created and filled with:
+ * - `grading.json` and `timing.json` (2-space JSON, newline-terminated);
+ * - `input.md`, only when `extractInput(result)` yields a truthy value;
+ * - `outputs/response.md`, only when `result.output` is present and non-empty.
+ *
+ * Files are written one after another; nothing is written in parallel.
+ *
+ * @param results - Results to materialise on disk.
+ * @param outputDir - Root directory; created recursively before any result is processed.
+ * @param options - Currently not read by this function.
+ */
+export async function writePerTestArtifacts(
+  results: readonly EvaluationResult[],
+  outputDir: string,
+  options?: { experiment?: string },
+): Promise<void> {
+  const writeJson = async (filePath: string, payload: unknown): Promise<void> => {
+    await writeFile(filePath, `${JSON.stringify(payload, null, 2)}\n`, 'utf8');
+  };
+
+  await mkdir(outputDir, { recursive: true });
+
+  for (const entry of results) {
+    const gradingArtifact = buildGradingArtifact(entry);
+    const timingArtifact = buildTimingArtifact([entry]);
+    const testDir = path.join(outputDir, buildArtifactSubdir(entry));
+    await mkdir(testDir, { recursive: true });
+
+    await writeJson(path.join(testDir, 'grading.json'), gradingArtifact);
+    await writeJson(path.join(testDir, 'timing.json'), timingArtifact);
+
+    const inputMarkdown = extractInput(entry);
+    if (inputMarkdown) {
+      await writeFile(path.join(testDir, 'input.md'), inputMarkdown, 'utf8');
+    }
+
+    const response = entry.output;
+    if (!response || response.length === 0) {
+      continue;
+    }
+    const outputsDir = path.join(testDir, 'outputs');
+    await mkdir(outputsDir, { recursive: true });
+    await writeFile(path.join(outputsDir, 'response.md'), formatOutputMarkdown(response), 'utf8');
+  }
+}
+
 export async function writeArtifactsFromResults(
   results: readonly EvaluationResult[],
   outputDir: string,
diff --git a/apps/cli/src/commands/eval/commands/aggregate.ts b/apps/cli/src/commands/eval/commands/aggregate.ts
@@ -0,0 +1,24 @@
+import path from 'node:path';
+import { command, positional, string } from 'cmd-ts';
+
+import { aggregateRunDir } from '../artifact-writer.js';
+
+/**
+ * `aggregate` subcommand: recomputes the aggregate artifacts of a run directory.
+ *
+ * Resolves the positional `run-dir` argument to an absolute path, delegates to
+ * `aggregateRunDir`, and prints the result counts plus the two output paths.
+ */
+export const evalAggregateCommand = command({
+  name: 'aggregate',
+  description:
+    'Recompute benchmark.json and timing.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.',
+  args: {
+    runDir: positional({
+      type: string,
+      displayName: 'run-dir',
+      description: 'Path to a run directory containing index.jsonl',
+    }),
+  },
+  handler: async ({ runDir: runDirArg }) => {
+    const absoluteRunDir = path.resolve(runDirArg);
+    const summary = await aggregateRunDir(absoluteRunDir);
+    console.log(
+      `Aggregated ${summary.testCount} test result(s) across ${summary.targetCount} target(s)`,
+    );
+    console.log(`  Benchmark: ${summary.benchmarkPath}`);
+    console.log(`  Timing:    ${summary.timingPath}`);
+  },
+});
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -160,6 +160,16 @@ export const evalRunCommand = command({
       description:
         'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases',
     }),
+    resume: flag({
+      long: 'resume',
+      description:
+        'Resume an interrupted run: skip already-completed tests and append new results to --output dir',
+    }),
+    rerunFailed: flag({
+      long: 'rerun-failed',
+      description:
+        'Rerun failed/errored tests while keeping passing results. Implies --resume semantics',
+    }),
     strict: flag({
       long: 'strict',
       description: 'Exit with error on version mismatch (instead of warning)',
@@ -254,6 +264,8 @@ export const evalRunCommand = command({
       otelCaptureContent: args.otelCaptureContent,
       otelGroupTurns: args.otelGroupTurns,
       retryErrors: args.retryErrors,
+      resume: args.resume,
+      rerunFailed: args.rerunFailed,
       strict: args.strict,
       benchmarkJson: args.benchmarkJson,
       artifacts: args.artifacts,
diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts
@@ -1,5 +1,6 @@
 import { subcommands } from 'cmd-ts';
 
+import { evalAggregateCommand } from './commands/aggregate.js';
 import { evalAssertCommand } from './commands/assert.js';
 import { evalRunCommand } from './commands/run.js';
 
@@ -9,5 +10,6 @@ export const evalCommand = subcommands({
   cmds: {
     run: evalRunCommand,
     assert: evalAssertCommand,
+    aggregate: evalAggregateCommand,
   },
 });
diff --git a/apps/cli/src/commands/eval/jsonl-writer.ts b/apps/cli/src/commands/eval/jsonl-writer.ts
@@ -15,9 +15,10 @@ export class JsonlWriter {
     this.stream = stream;
   }
 
-  static async open(filePath: string): Promise<JsonlWriter> {
+  /**
+   * Opens a writer on `filePath`, creating the parent directory first.
+   *
+   * @param filePath - Destination file.
+   * @param options - When `options.append` is truthy the stream is opened with
+   *   flag `'a'`; otherwise flag `'w'` is used, as before this change.
+   */
+  static async open(filePath: string, options?: { append?: boolean }): Promise<JsonlWriter> {
     await mkdir(path.dirname(filePath), { recursive: true });
-    const stream = createWriteStream(filePath, { flags: 'w', encoding: 'utf8' });
+    const flags = options?.append ? 'a' : 'w';
+    const stream = createWriteStream(filePath, { flags, encoding: 'utf8' });
     return new JsonlWriter(stream);
   }
 
diff --git a/apps/cli/src/commands/eval/output-writer.ts b/apps/cli/src/commands/eval/output-writer.ts
@@ -22,10 +22,11 @@ export interface WriterOptions {
 export async function createOutputWriter(
   filePath: string,
   format: OutputFormat,
+  options?: { append?: boolean },
 ): Promise<OutputWriter> {
   switch (format) {
     case 'jsonl':
-      return JsonlWriter.open(filePath);
+      return JsonlWriter.open(filePath, { append: options?.append });
     case 'yaml':
       return YamlWriter.open(filePath);
     case 'html':
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts
diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts

Original file line number	Diff line number	Diff line change
`@@ -15,9 +15,10 @@ export class JsonlWriter {`
`15`	`15`	`this.stream = stream;`
`16`	`16`	`}`
`17`	`17`
`18`		`- static async open(filePath: string): Promise<JsonlWriter> {`
	`18`	`+ static async open(filePath: string, options?: { append?: boolean }): Promise<JsonlWriter> {`
`19`	`19`	`await mkdir(path.dirname(filePath), { recursive: true });`
`20`		`- const stream = createWriteStream(filePath, { flags: 'w', encoding: 'utf8' });`
	`20`	`+ const flags = options?.append ? 'a' : 'w';`
	`21`	`+ const stream = createWriteStream(filePath, { flags, encoding: 'utf8' });`
`21`	`22`	`return new JsonlWriter(stream);`
`22`	`23`	`}`
`23`	`24`