feat(cli): add --tag / --exclude-tag filtering for eval runs (#904)

christso · claude · web-flow · commit 34e8e8340225 · 2026-04-02T08:09:47.000+11:00
* feat(cli): add --tag / --exclude-tag filtering for eval runs Add --tag and --exclude-tag CLI options to `agentv eval run` that filter eval files by their metadata tags before execution. Tags use AND logic: all --tag values must be present and no --exclude-tag values may be present. Closes #900 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(cli): use filtered file list for downstream artifact and retry logic Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -181,6 +181,16 @@ export const evalRunCommand = command({
       description:
         'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value',
     }),
+    tag: multioption({
+      type: array(string),
+      long: 'tag',
+      description: 'Only run eval files that have this tag (repeatable, AND logic)',
+    }),
+    excludeTag: multioption({
+      type: array(string),
+      long: 'exclude-tag',
+      description: 'Skip eval files that have this tag (repeatable, file skipped if any match)',
+    }),
   },
   handler: async (args) => {
     // Launch interactive wizard when no eval paths and stdin is a TTY
@@ -224,6 +234,8 @@ export const evalRunCommand = command({
       model: args.model,
       outputMessages: args.outputMessages,
       threshold: args.threshold,
+      tag: args.tag,
+      excludeTag: args.excludeTag,
     };
     const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
     if (result?.thresholdFailed) {
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -88,6 +88,8 @@ interface NormalizedOptions {
   readonly model?: string;
   readonly outputMessages: number | 'all';
   readonly threshold?: number;
+  readonly tags: readonly string[];
+  readonly excludeTags: readonly string[];
 }
 
 function normalizeBoolean(value: unknown): boolean {
@@ -140,6 +142,43 @@ function normalizeWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' |
   return value === 'pooled' || value === 'temp' || value === 'static' ? value : undefined;
 }
 
+/**
+ * Coerce a raw CLI option value into a clean list of strings.
+ *
+ * Non-array input yields an empty list. Non-string entries are dropped, each
+ * remaining entry is trimmed, and entries that are empty after trimming are
+ * discarded.
+ */
+function normalizeStringArray(value: unknown): readonly string[] {
+  if (!Array.isArray(value)) {
+    return [];
+  }
+  return value
+    .filter((v): v is string => typeof v === 'string')
+    // Trim before the emptiness check so the returned values are the trimmed
+    // ones; otherwise ' agent ' would be kept verbatim and never equal 'agent'.
+    .map((v) => v.trim())
+    .filter((v) => v.length > 0);
+}
+
+/**
+ * Decide whether an eval file passes the --tag / --exclude-tag filters.
+ *
+ * - Include filter: every tag in `includeTags` must appear in `fileTags`.
+ *   An empty `includeTags` list imposes no requirement.
+ * - Exclude filter: the file is rejected as soon as any tag in `excludeTags`
+ *   appears in `fileTags`.
+ * - Both filters must pass for the file to be accepted.
+ * - A file with no tags (`undefined` or empty) is rejected whenever
+ *   `includeTags` is non-empty, and accepted when only `excludeTags` is given.
+ *
+ * Tags are compared by exact string equality.
+ *
+ * @param fileTags - Tags declared by the eval file, if any.
+ * @param includeTags - Tags that must all be present.
+ * @param excludeTags - Tags of which none may be present.
+ * @returns `true` when the file should run, `false` when it should be skipped.
+ */
+export function matchesTagFilters(
+  fileTags: readonly string[] | undefined,
+  includeTags: readonly string[],
+  excludeTags: readonly string[],
+): boolean {
+  const present = new Set(fileTags ?? []);
+
+  // `every` is vacuously true for an empty include list, so no length guard is needed.
+  const hasEveryRequired = includeTags.every((tag) => present.has(tag));
+  const hasAnyExcluded = excludeTags.some((tag) => present.has(tag));
+
+  return hasEveryRequired && !hasAnyExcluded;
+}
+
 /**
  * Normalize --output-messages value. Accepts a number (>= 1) or "all".
  * Defaults to 1 (last assistant message only).
@@ -304,6 +343,8 @@ function normalizeOptions(
     model: normalizeString(rawOptions.model),
     outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
     threshold: normalizeOptionalNumber(rawOptions.threshold),
+    tags: normalizeStringArray(rawOptions.tag),
+    excludeTags: normalizeStringArray(rawOptions.excludeTag),
   } satisfies NormalizedOptions;
 }
 
@@ -434,6 +475,7 @@ async function prepareFileMetadata(params: {
   readonly totalBudgetUsd?: number;
   readonly failOnError?: FailOnError;
   readonly threshold?: number;
+  readonly tags?: readonly string[];
 }> {
   const { testFilePath, repoRoot, cwd, options } = params;
 
@@ -524,6 +566,7 @@ async function prepareFileMetadata(params: {
     totalBudgetUsd: suite.totalBudgetUsd,
     failOnError: suite.failOnError,
     threshold: suite.threshold,
+    tags: suite.metadata?.tags,
   };
 }
 
@@ -970,6 +1013,7 @@ export async function runEvalCommand(
       readonly totalBudgetUsd?: number;
       readonly failOnError?: FailOnError;
       readonly threshold?: number;
+      readonly tags?: readonly string[];
     }
   >();
   // Separate TypeScript/JS eval files from YAML files.
@@ -1006,6 +1050,27 @@ export async function runEvalCommand(
     fileMetadata.set(testFilePath, meta);
   }
 
+  // Apply --tag / --exclude-tag filtering at the eval-file level
+  const hasTagFilters = options.tags.length > 0 || options.excludeTags.length > 0;
+  if (hasTagFilters) {
+    const skippedFiles: string[] = [];
+    for (const [testFilePath, meta] of fileMetadata.entries()) {
+      if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
+        fileMetadata.delete(testFilePath);
+        skippedFiles.push(path.relative(cwd, testFilePath));
+      }
+    }
+    if (skippedFiles.length > 0 && options.verbose) {
+      console.log(
+        `Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
+      );
+    }
+    if (fileMetadata.size === 0) {
+      console.log('No eval files matched the tag filters. Nothing to run.');
+      return;
+    }
+  }
+
   // Resolve cache: combine CLI flags with YAML config
   // Use first file's YAML config for cache settings (consistent across a run)
   const firstMeta = fileMetadata.values().next().value;
@@ -1116,8 +1181,11 @@ export async function runEvalCommand(
     }
   }
 
+  // Use only files that survived tag filtering (fileMetadata keys)
+  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
+
   try {
-    await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => {
+    await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
       const targetPrep = fileMetadata.get(testFilePath);
       if (!targetPrep) {
         throw new Error(`Missing metadata for ${testFilePath}`);
@@ -1208,7 +1276,7 @@ export async function runEvalCommand(
     }
 
     if (usesDefaultArtifactWorkspace) {
-      const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : '';
+      const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
       const workspaceDir = path.dirname(outputPath);
       const {
         testArtifactDir,
@@ -1230,7 +1298,7 @@ export async function runEvalCommand(
     // Write companion artifacts (grading, timing, benchmark) if requested
     if (options.artifacts) {
       const artifactsDir = path.resolve(options.artifacts);
-      const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : '';
+      const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
       const {
         testArtifactDir,
         indexPath,
@@ -1275,7 +1343,7 @@ export async function runEvalCommand(
 
     // Suggest retry-errors command when execution errors are detected
     if (summary.executionErrorCount > 0 && !options.retryErrors) {
-      const evalFileArgs = resolvedTestFiles.map((f) => path.relative(cwd, f)).join(' ');
+      const evalFileArgs = activeTestFiles.map((f) => path.relative(cwd, f)).join(' ');
       const targetFlag = options.target ? ` --target ${options.target}` : '';
       const relativeOutputPath = path.relative(cwd, outputPath);
       console.log(
@@ -1287,7 +1355,7 @@ export async function runEvalCommand(
     return {
       executionErrorCount: summary.executionErrorCount,
       outputPath,
-      testFiles: resolvedTestFiles,
+      testFiles: activeTestFiles,
       target: options.target,
       thresholdFailed,
     };
diff --git a/apps/cli/test/commands/eval/tag-filtering.test.ts b/apps/cli/test/commands/eval/tag-filtering.test.ts
@@ -0,0 +1,73 @@
+import { describe, expect, it } from 'bun:test';
+
+import { matchesTagFilters } from '../../../src/commands/eval/run-eval.js';
+
+// Unit tests for matchesTagFilters: include (all-of) and exclude (any-of) tag filtering.
+describe('matchesTagFilters', () => {
+  describe('no filters', () => {
+    it('accepts files with tags', () => {
+      expect(matchesTagFilters(['agent', 'slow'], [], [])).toBe(true);
+    });
+
+    it('accepts files without tags', () => {
+      expect(matchesTagFilters(undefined, [], [])).toBe(true);
+    });
+
+    it('accepts files with empty tags', () => {
+      expect(matchesTagFilters([], [], [])).toBe(true);
+    });
+  });
+
+  describe('--tag (include)', () => {
+    it('accepts file with matching tag', () => {
+      expect(matchesTagFilters(['agent', 'fast'], ['agent'], [])).toBe(true);
+    });
+
+    it('rejects file without matching tag', () => {
+      expect(matchesTagFilters(['slow', 'multi-provider'], ['agent'], [])).toBe(false);
+    });
+
+    it('requires all specified tags (AND logic)', () => {
+      expect(matchesTagFilters(['agent', 'fast'], ['agent', 'fast'], [])).toBe(true);
+      expect(matchesTagFilters(['agent'], ['agent', 'fast'], [])).toBe(false);
+    });
+
+    it('rejects files with no tags when --tag is specified', () => {
+      expect(matchesTagFilters(undefined, ['agent'], [])).toBe(false);
+      expect(matchesTagFilters([], ['agent'], [])).toBe(false);
+    });
+  });
+
+  describe('--exclude-tag', () => {
+    it('accepts file without excluded tag', () => {
+      expect(matchesTagFilters(['agent', 'fast'], [], ['slow'])).toBe(true);
+    });
+
+    it('rejects file with excluded tag', () => {
+      expect(matchesTagFilters(['agent', 'slow'], [], ['slow'])).toBe(false);
+    });
+
+    it('rejects file if any excluded tag is present (any-match logic)', () => {
+      expect(matchesTagFilters(['agent', 'slow'], [], ['slow', 'flaky'])).toBe(false);
+      expect(matchesTagFilters(['agent', 'flaky'], [], ['slow', 'flaky'])).toBe(false);
+    });
+
+    it('accepts files with no tags when only --exclude-tag is specified', () => {
+      expect(matchesTagFilters(undefined, [], ['slow'])).toBe(true);
+      expect(matchesTagFilters([], [], ['slow'])).toBe(true);
+    });
+  });
+
+  describe('combined --tag and --exclude-tag', () => {
+    it('accepts file matching include and not matching exclude', () => {
+      expect(matchesTagFilters(['agent', 'fast'], ['agent'], ['slow'])).toBe(true);
+    });
+
+    it('rejects file matching include but also matching exclude', () => {
+      expect(matchesTagFilters(['agent', 'slow'], ['agent'], ['slow'])).toBe(false);
+    });
+
+    it('rejects file not matching include even if not matching exclude', () => {
+      expect(matchesTagFilters(['fast'], ['agent'], ['slow'])).toBe(false);
+    });
+  });
+});