Skip to content

Commit 34e8e83

Browse files
christso and claude authored
feat(cli): add --tag / --exclude-tag filtering for eval runs (#904)
* feat(cli): add --tag / --exclude-tag filtering for eval runs

  Add --tag and --exclude-tag CLI options to `agentv eval run` that filter eval files by their metadata tags before execution. Tags use AND logic: all --tag values must be present and no --exclude-tag values may be present.

  Closes #900

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(cli): use filtered file list for downstream artifact and retry logic

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0ef8cce commit 34e8e83

3 files changed

Lines changed: 158 additions & 5 deletions

File tree

apps/cli/src/commands/eval/commands/run.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,16 @@ export const evalRunCommand = command({
181181
description:
182182
'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value',
183183
}),
184+
tag: multioption({
185+
type: array(string),
186+
long: 'tag',
187+
description: 'Only run eval files that have this tag (repeatable, AND logic)',
188+
}),
189+
excludeTag: multioption({
190+
type: array(string),
191+
long: 'exclude-tag',
192+
description: 'Skip eval files that have this tag (repeatable, file skipped if any match)',
193+
}),
184194
},
185195
handler: async (args) => {
186196
// Launch interactive wizard when no eval paths and stdin is a TTY
@@ -224,6 +234,8 @@ export const evalRunCommand = command({
224234
model: args.model,
225235
outputMessages: args.outputMessages,
226236
threshold: args.threshold,
237+
tag: args.tag,
238+
excludeTag: args.excludeTag,
227239
};
228240
const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
229241
if (result?.thresholdFailed) {

apps/cli/src/commands/eval/run-eval.ts

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ interface NormalizedOptions {
8888
readonly model?: string;
8989
readonly outputMessages: number | 'all';
9090
readonly threshold?: number;
91+
readonly tags: readonly string[];
92+
readonly excludeTags: readonly string[];
9193
}
9294

9395
function normalizeBoolean(value: unknown): boolean {
@@ -140,6 +142,43 @@ function normalizeWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' |
140142
return value === 'pooled' || value === 'temp' || value === 'static' ? value : undefined;
141143
}
142144

145+
/**
 * Coerce a raw option value into a clean list of strings.
 *
 * - Anything that is not an array yields an empty list.
 * - Non-string entries and entries that are empty or whitespace-only are dropped.
 * - Surviving entries are trimmed, so a value such as `" agent "` compares equal
 *   to the tag `agent` when the list is later used for exact-match filtering.
 *
 * @param value - Raw, unvalidated option value.
 * @returns The trimmed, non-empty string entries in their original order.
 */
function normalizeStringArray(value: unknown): readonly string[] {
  if (!Array.isArray(value)) {
    return [];
  }

  const cleaned: string[] = [];
  for (const entry of value) {
    if (typeof entry !== 'string') continue;
    // Trim once and keep the trimmed form; the emptiness check alone would let
    // padded values through unchanged and they would never match a real tag.
    const trimmed = entry.trim();
    if (trimmed.length > 0) {
      cleaned.push(trimmed);
    }
  }
  return cleaned;
}
151+
152+
/**
 * Check whether an eval file's tags satisfy the --tag / --exclude-tag filters.
 *
 * - `--tag X`: the file must carry tag X. With several `--tag` values, every one
 *   of them must be present (AND).
 * - `--exclude-tag X`: the file must not carry tag X. With several
 *   `--exclude-tag` values, the file is rejected as soon as any one of them is
 *   present.
 * - When both kinds of filter are given, both conditions must hold.
 * - A file without tags is rejected whenever `--tag` is specified, and accepted
 *   when only `--exclude-tag` is specified.
 *
 * Matching is exact string equality; no trimming or case folding is applied here.
 *
 * @param fileTags - Tags declared by the eval file; `undefined` is treated as no tags.
 * @param includeTags - Tags that must all be present.
 * @param excludeTags - Tags of which none may be present.
 * @returns `true` when the file passes both filters.
 */
export function matchesTagFilters(
  fileTags: readonly string[] | undefined,
  includeTags: readonly string[],
  excludeTags: readonly string[],
): boolean {
  const tags = new Set(fileTags ?? []);

  // `every` is vacuously true for an empty include list, so no length guard is needed.
  if (!includeTags.every((required) => tags.has(required))) {
    return false;
  }

  // `some` is false for an empty exclude list, so an absent filter never rejects.
  return !excludeTags.some((excluded) => tags.has(excluded));
}
181+
143182
/**
144183
* Normalize --output-messages value. Accepts a number (>= 1) or "all".
145184
* Defaults to 1 (last assistant message only).
@@ -304,6 +343,8 @@ function normalizeOptions(
304343
model: normalizeString(rawOptions.model),
305344
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
306345
threshold: normalizeOptionalNumber(rawOptions.threshold),
346+
tags: normalizeStringArray(rawOptions.tag),
347+
excludeTags: normalizeStringArray(rawOptions.excludeTag),
307348
} satisfies NormalizedOptions;
308349
}
309350

@@ -434,6 +475,7 @@ async function prepareFileMetadata(params: {
434475
readonly totalBudgetUsd?: number;
435476
readonly failOnError?: FailOnError;
436477
readonly threshold?: number;
478+
readonly tags?: readonly string[];
437479
}> {
438480
const { testFilePath, repoRoot, cwd, options } = params;
439481

@@ -524,6 +566,7 @@ async function prepareFileMetadata(params: {
524566
totalBudgetUsd: suite.totalBudgetUsd,
525567
failOnError: suite.failOnError,
526568
threshold: suite.threshold,
569+
tags: suite.metadata?.tags,
527570
};
528571
}
529572

@@ -970,6 +1013,7 @@ export async function runEvalCommand(
9701013
readonly totalBudgetUsd?: number;
9711014
readonly failOnError?: FailOnError;
9721015
readonly threshold?: number;
1016+
readonly tags?: readonly string[];
9731017
}
9741018
>();
9751019
// Separate TypeScript/JS eval files from YAML files.
@@ -1006,6 +1050,27 @@ export async function runEvalCommand(
10061050
fileMetadata.set(testFilePath, meta);
10071051
}
10081052

1053+
// Apply --tag / --exclude-tag filtering at the eval-file level
1054+
const hasTagFilters = options.tags.length > 0 || options.excludeTags.length > 0;
1055+
if (hasTagFilters) {
1056+
const skippedFiles: string[] = [];
1057+
for (const [testFilePath, meta] of fileMetadata.entries()) {
1058+
if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
1059+
fileMetadata.delete(testFilePath);
1060+
skippedFiles.push(path.relative(cwd, testFilePath));
1061+
}
1062+
}
1063+
if (skippedFiles.length > 0 && options.verbose) {
1064+
console.log(
1065+
`Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
1066+
);
1067+
}
1068+
if (fileMetadata.size === 0) {
1069+
console.log('No eval files matched the tag filters. Nothing to run.');
1070+
return;
1071+
}
1072+
}
1073+
10091074
// Resolve cache: combine CLI flags with YAML config
10101075
// Use first file's YAML config for cache settings (consistent across a run)
10111076
const firstMeta = fileMetadata.values().next().value;
@@ -1116,8 +1181,11 @@ export async function runEvalCommand(
11161181
}
11171182
}
11181183

1184+
// Use only files that survived tag filtering (fileMetadata keys)
1185+
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
1186+
11191187
try {
1120-
await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => {
1188+
await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
11211189
const targetPrep = fileMetadata.get(testFilePath);
11221190
if (!targetPrep) {
11231191
throw new Error(`Missing metadata for ${testFilePath}`);
@@ -1208,7 +1276,7 @@ export async function runEvalCommand(
12081276
}
12091277

12101278
if (usesDefaultArtifactWorkspace) {
1211-
const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : '';
1279+
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
12121280
const workspaceDir = path.dirname(outputPath);
12131281
const {
12141282
testArtifactDir,
@@ -1230,7 +1298,7 @@ export async function runEvalCommand(
12301298
// Write companion artifacts (grading, timing, benchmark) if requested
12311299
if (options.artifacts) {
12321300
const artifactsDir = path.resolve(options.artifacts);
1233-
const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : '';
1301+
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
12341302
const {
12351303
testArtifactDir,
12361304
indexPath,
@@ -1275,7 +1343,7 @@ export async function runEvalCommand(
12751343

12761344
// Suggest retry-errors command when execution errors are detected
12771345
if (summary.executionErrorCount > 0 && !options.retryErrors) {
1278-
const evalFileArgs = resolvedTestFiles.map((f) => path.relative(cwd, f)).join(' ');
1346+
const evalFileArgs = activeTestFiles.map((f) => path.relative(cwd, f)).join(' ');
12791347
const targetFlag = options.target ? ` --target ${options.target}` : '';
12801348
const relativeOutputPath = path.relative(cwd, outputPath);
12811349
console.log(
@@ -1287,7 +1355,7 @@ export async function runEvalCommand(
12871355
return {
12881356
executionErrorCount: summary.executionErrorCount,
12891357
outputPath,
1290-
testFiles: resolvedTestFiles,
1358+
testFiles: activeTestFiles,
12911359
target: options.target,
12921360
thresholdFailed,
12931361
};
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import { describe, expect, it } from 'bun:test';
2+
3+
import { matchesTagFilters } from '../../../src/commands/eval/run-eval.js';
4+
5+
// Unit tests for matchesTagFilters(fileTags, includeTags, excludeTags).
// Include filters require every listed tag; exclude filters reject on any listed tag.
describe('matchesTagFilters', () => {
  describe('no filters', () => {
    it('accepts files with tags', () => {
      expect(matchesTagFilters(['agent', 'slow'], [], [])).toBe(true);
    });

    it('accepts files without tags', () => {
      expect(matchesTagFilters(undefined, [], [])).toBe(true);
    });

    it('accepts files with empty tags', () => {
      expect(matchesTagFilters([], [], [])).toBe(true);
    });
  });

  describe('--tag (include)', () => {
    it('accepts file with matching tag', () => {
      expect(matchesTagFilters(['agent', 'fast'], ['agent'], [])).toBe(true);
    });

    it('rejects file without matching tag', () => {
      expect(matchesTagFilters(['slow', 'multi-provider'], ['agent'], [])).toBe(false);
    });

    it('requires all specified tags (AND logic)', () => {
      expect(matchesTagFilters(['agent', 'fast'], ['agent', 'fast'], [])).toBe(true);
      expect(matchesTagFilters(['agent'], ['agent', 'fast'], [])).toBe(false);
    });

    it('rejects files with no tags when --tag is specified', () => {
      expect(matchesTagFilters(undefined, ['agent'], [])).toBe(false);
      expect(matchesTagFilters([], ['agent'], [])).toBe(false);
    });

    it('matches tags case-sensitively', () => {
      expect(matchesTagFilters(['Agent'], ['agent'], [])).toBe(false);
    });
  });

  describe('--exclude-tag', () => {
    it('accepts file without excluded tag', () => {
      expect(matchesTagFilters(['agent', 'fast'], [], ['slow'])).toBe(true);
    });

    it('rejects file with excluded tag', () => {
      expect(matchesTagFilters(['agent', 'slow'], [], ['slow'])).toBe(false);
    });

    // A single matching excluded tag is enough to reject the file.
    it('rejects file if any excluded tag is present', () => {
      expect(matchesTagFilters(['agent', 'slow'], [], ['slow', 'flaky'])).toBe(false);
      expect(matchesTagFilters(['agent', 'flaky'], [], ['slow', 'flaky'])).toBe(false);
    });

    it('accepts files with no tags when only --exclude-tag is specified', () => {
      expect(matchesTagFilters(undefined, [], ['slow'])).toBe(true);
      expect(matchesTagFilters([], [], ['slow'])).toBe(true);
    });
  });

  describe('combined --tag and --exclude-tag', () => {
    it('accepts file matching include and not matching exclude', () => {
      expect(matchesTagFilters(['agent', 'fast'], ['agent'], ['slow'])).toBe(true);
    });

    it('rejects file matching include but also matching exclude', () => {
      expect(matchesTagFilters(['agent', 'slow'], ['agent'], ['slow'])).toBe(false);
    });

    it('rejects file not matching include even if not matching exclude', () => {
      expect(matchesTagFilters(['fast'], ['agent'], ['slow'])).toBe(false);
    });

    it('handles several include and exclude values together', () => {
      expect(matchesTagFilters(['agent', 'fast'], ['agent', 'fast'], ['slow', 'flaky'])).toBe(true);
      expect(
        matchesTagFilters(['agent', 'fast', 'flaky'], ['agent', 'fast'], ['slow', 'flaky']),
      ).toBe(false);
    });
  });
});

0 commit comments

Comments
 (0)