diff --git a/datasets/claude-code-tools/example_run_config.yaml b/datasets/claude-code-tools/example_run_config.yaml index 46333403..486cc77f 100644 --- a/datasets/claude-code-tools/example_run_config.yaml +++ b/datasets/claude-code-tools/example_run_config.yaml @@ -20,7 +20,13 @@ runners: ### Scorer Related Configs ############################################################ scorers: + # By default trajectory_matcher drops native/harness-internal tools + # (Read, Bash, Edit, ToolSearch, ...) from both expected and actual + # trajectories before scoring. Uncomment the block below to keep them + # and score native-tool usage too. trajectory_matcher: {} + # trajectory_matcher: + # filter_native_tools: false goal_completion: model_config: datasets/model_configs/gemini_2.5_pro_model.yaml behavioral_metrics: diff --git a/datasets/codex-cli-tools/example_run_config.yaml b/datasets/codex-cli-tools/example_run_config.yaml index 27035c5a..ac9c1bb1 100644 --- a/datasets/codex-cli-tools/example_run_config.yaml +++ b/datasets/codex-cli-tools/example_run_config.yaml @@ -18,7 +18,13 @@ runners: ### Scorer Related Configs ############################################################ scorers: + # By default trajectory_matcher drops native/harness-internal tools + # (shell, file ops, ...) from both expected and actual trajectories + # before scoring. Uncomment the block below to keep them and score + # native-tool usage too. 
trajectory_matcher: {} + # trajectory_matcher: + # filter_native_tools: false goal_completion: model_config: datasets/model_configs/gemini_2.5_pro_model.yaml behavioral_metrics: diff --git a/datasets/gemini-cli-tools/example_run_config.yaml b/datasets/gemini-cli-tools/example_run_config.yaml index 7fea123e..8ee43816 100644 --- a/datasets/gemini-cli-tools/example_run_config.yaml +++ b/datasets/gemini-cli-tools/example_run_config.yaml @@ -13,7 +13,13 @@ simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml ### Scorer Related Configs ############################################################ scorers: + # By default trajectory_matcher drops native/harness-internal tools + # (run_shell_command, write_file, update_topic, ...) from both expected + # and actual trajectories before scoring. Uncomment the block below to + # keep them and score native-tool usage too. trajectory_matcher: {} + # trajectory_matcher: + # filter_native_tools: false goal_completion: model_config: datasets/model_configs/gemini_2.5_pro_model.yaml behavioral_metrics: diff --git a/docs/claude_code_agent_testing.md b/docs/claude_code_agent_testing.md index 02d60982..eb5992be 100644 --- a/docs/claude_code_agent_testing.md +++ b/docs/claude_code_agent_testing.md @@ -347,7 +347,7 @@ Quick reference: | Scorer | Type | Description | |---|---|---| -| `trajectory_matcher` | Deterministic | Jaccard or Levenshtein match between expected and actual tool trajectory | +| `trajectory_matcher` | Deterministic | Jaccard or Levenshtein match between expected and actual tool trajectory. Native Claude Code tools (`Read`, `Bash`, `Edit`, `ToolSearch`, ...) are dropped from both sides by default — set `filter_native_tools: false` to score them too. | | `goal_completion` | LLM | Did the agent accomplish the conversation plan? 
| | `behavioral_metrics` | LLM | Hallucination rate + clarification rate | | `parameter_analysis` | LLM | Qualitative feedback on tool parameters | diff --git a/docs/codex_cli_agent_testing.md b/docs/codex_cli_agent_testing.md index 7e9dd9e0..ca7b6f4d 100644 --- a/docs/codex_cli_agent_testing.md +++ b/docs/codex_cli_agent_testing.md @@ -454,7 +454,7 @@ Quick reference: | Scorer | Type | Description | |---|---|---| -| `trajectory_matcher` | Deterministic | Jaccard or Levenshtein match between expected and actual tool trajectory | +| `trajectory_matcher` | Deterministic | Jaccard or Levenshtein match between expected and actual tool trajectory. Native Codex tools (`shell`, file ops, ...) are dropped from both sides by default — set `filter_native_tools: false` to score them too. | | `goal_completion` | LLM | Did the agent accomplish the conversation plan? | | `behavioral_metrics` | LLM | Hallucination rate + clarification rate | | `parameter_analysis` | LLM | Qualitative feedback on tool parameters | diff --git a/docs/gemini_cli_agent_testing.md b/docs/gemini_cli_agent_testing.md index 62c4782b..79ee50b3 100644 --- a/docs/gemini_cli_agent_testing.md +++ b/docs/gemini_cli_agent_testing.md @@ -256,7 +256,9 @@ The evalset JSON file defines the test scenarios. Each scenario represents an ag #### Tool name format -Entries in `expected_trajectory` use the canonical form `<server>__<tool>` (double-underscore separator) for MCP tools, and the bare name for native harness tools (e.g. `Read`, `Bash`). Each harness adapter normalizes its raw tool-call event into this form at the boundary, so the same evalset can score runs from Codex, Claude Code, and Gemini CLI without modification. The `<server>` segment comes from the MCP server key in your model config and is case-sensitive — e.g. `cloud-sql` or `bigtable`. See `evalbench/generators/models/tool_naming.py` for the canonicalization helper. 
+Entries in `expected_trajectory` use the canonical form `<server>__<tool>` (double-underscore separator) for MCP tools, and the bare name for native harness tools (e.g. `Read`, `Bash`, `run_shell_command`). Each harness adapter normalizes its raw tool-call event into this form at the boundary, so the same evalset can score runs from Codex, Claude Code, and Gemini CLI without modification. The `<server>` segment comes from the MCP server key in your model config and is case-sensitive — e.g. `cloud-sql` or `bigtable`. See `evalbench/generators/models/tool_naming.py` for the canonicalization helper. + +By default the `trajectory_matcher` scorer drops native/harness-internal tools (anything that is **not** in canonical `<server>__<tool>` form) from both the expected and actual lists before scoring, so authors can keep `expected_trajectory` focused on user-visible MCP intent without the score being dragged down by harness-internal calls like `update_topic` or `Bash`. Set `filter_native_tools: false` on the scorer if you intentionally want to score native-tool usage — see the [scorer configuration example](#scorer-configuration-example). #### Writing Good Conversation Plans @@ -616,7 +618,7 @@ These require no additional model: | Scorer | Score Range | Description | |--------|------------|-------------| -| `trajectory_matcher` | 0–100 | Compares expected vs. actual tool usage. Uses **Jaccard Similarity** by default (set-based, order-insensitive). Set `enforce_order: true` for **Levenshtein distance** (order-sensitive). | +| `trajectory_matcher` | 0–100 | Compares expected vs. actual tool usage. Uses **Jaccard Similarity** by default (set-based, order-insensitive). Set `enforce_order: true` for **Levenshtein distance** (order-sensitive). Native/harness-internal tools are dropped from both sides before scoring by default; set `filter_native_tools: false` to keep them. See [Tool name format](#tool-name-format) for the canonical-name rule the filter uses. 
| | `turn_count` | Count | Reports the number of conversation turns the agent took. Lower is generally better. | | `end_to_end_latency` | Milliseconds | Total latency = model API latency + tool execution latency. | | `tool_call_latency` | Milliseconds | Sum of all tool execution durations across all turns. | @@ -628,9 +630,13 @@ These require no additional model: ```yaml scorers: # Deterministic scorers (no model needed) + # trajectory_matcher drops native/harness-internal tools (Read, Bash, + # run_shell_command, ...) by default — uncomment the expanded block + # below to opt out, or to enforce ordered matching. trajectory_matcher: {} # trajectory_matcher: - # enforce_order: true # Use Levenshtein for ordered matching + # filter_native_tools: false # keep native tools in the score + # enforce_order: true # use Levenshtein for ordered matching turn_count: {} end_to_end_latency: {} tool_call_latency: {} diff --git a/evalbench/generators/models/claude_code.py b/evalbench/generators/models/claude_code.py index 18dae250..dab52a01 100644 --- a/evalbench/generators/models/claude_code.py +++ b/evalbench/generators/models/claude_code.py @@ -11,12 +11,6 @@ from util.context import rpc_id_var -# Infrastructure tools that the Claude Code harness invokes for its own -# bookkeeping (e.g. enumerating available MCP tools) and that should not -# count toward the user-visible trajectory. -_CLAUDE_INFRA_TOOLS = frozenset({"ToolSearch"}) - - class CLICommand: def __init__(self, cli, prompt, env=None, resume=False, session_id=None, allowedTools=None, cwd=None): self.cli = cli @@ -709,10 +703,11 @@ def parse_response(self, stdout: str) -> dict: def extract_tools(self, stdout: str) -> list[str]: """Extracts the list of tools used from the CLI output. - Filters out infrastructure tools (see ``_CLAUDE_INFRA_TOOLS``) so - that trajectory comparisons reflect user-visible behavior only. 
- Tool names are already in canonical ``<server>__<tool>`` form for - MCP tools (set when the ``tool_use`` block was recorded). + Returns every tool the harness recorded -- MCP calls in canonical + ``<server>__<tool>`` form alongside native tools (``Read``, + ``Bash``, ``Edit``, ``ToolSearch``, ...). The trajectory scorer + is responsible for filtering native/harness-internal tools when + ``filter_native_tools`` is enabled. """ output_json = self.parse_response(stdout) if ( @@ -720,11 +715,7 @@ def extract_tools(self, stdout: str) -> list[str]: and "tools" in output_json["stats"] and "byName" in output_json["stats"]["tools"] ): - return [ - name - for name in output_json["stats"]["tools"]["byName"].keys() - if name not in _CLAUDE_INFRA_TOOLS - ] + return list(output_json["stats"]["tools"]["byName"].keys()) return [] def _get_installed_skills(self) -> set[str]: diff --git a/evalbench/generators/models/codex_cli.py b/evalbench/generators/models/codex_cli.py index f88f405e..77390c58 100644 --- a/evalbench/generators/models/codex_cli.py +++ b/evalbench/generators/models/codex_cli.py @@ -1017,6 +1017,13 @@ def parse_response(self, stdout: str) -> dict: return {} def extract_tools(self, stdout: str) -> list[str]: + """Extracts the list of tools used from the CLI output. + + Returns every tool the harness recorded -- MCP calls in canonical + ``<server>__<tool>`` form alongside native Codex tools (``shell``, + file ops, ...). The trajectory scorer is responsible for filtering + native tools when ``filter_native_tools`` is enabled. 
+ """ output_json = self.parse_response(stdout) if ( "stats" in output_json diff --git a/evalbench/generators/models/gemini_cli.py b/evalbench/generators/models/gemini_cli.py index dd6846b5..782a90ab 100644 --- a/evalbench/generators/models/gemini_cli.py +++ b/evalbench/generators/models/gemini_cli.py @@ -1035,7 +1035,14 @@ def parse_response(self, stdout: str) -> dict: return {} def extract_tools(self, stdout: str) -> list[str]: - """Extracts the list of tools used from the CLI output.""" + """Extracts the list of tools used from the CLI output. + + Returns every tool the harness recorded -- MCP calls in canonical + ``<server>__<tool>`` form alongside native Gemini tools + (``update_topic``, ``run_shell_command``, ``write_file``, ...). + The trajectory scorer is responsible for filtering native tools + when ``filter_native_tools`` is enabled. + """ output_json = self.parse_response(stdout) if ( "stats" in output_json diff --git a/evalbench/generators/models/tool_naming.py b/evalbench/generators/models/tool_naming.py index 0e08607a..47d44695 100644 --- a/evalbench/generators/models/tool_naming.py +++ b/evalbench/generators/models/tool_naming.py @@ -126,3 +126,19 @@ def canonicalize_gemini_tool_name(name: str) -> str: return name server, tool = parsed return canonical_tool_name(server, tool) + + +def looks_like_canonical_mcp_name(name: str) -> bool: + """Return True iff ``name`` *looks like* canonical MCP form (``<server>__<tool>``). + + This is a structural check only -- any ``x__y`` with non-empty + segments passes; there is no registry of real MCP servers to + validate against. Native/built-in harness tools (Read, Bash, + update_topic, run_shell_command, etc.) never contain the canonical + separator, so the predicate is still good enough to distinguish MCP + calls from harness-internal ones after canonicalization. 
+ """ + if not name: + return False + server, sep, tool = name.partition(CANONICAL_SEPARATOR) + return bool(sep and server and tool) diff --git a/evalbench/scorers/trajectorymatcher.py b/evalbench/scorers/trajectorymatcher.py index 292b4e2a..85d91133 100644 --- a/evalbench/scorers/trajectorymatcher.py +++ b/evalbench/scorers/trajectorymatcher.py @@ -8,10 +8,19 @@ harness adapter performs that normalization at the boundary (see ``generators/models/tool_naming.py``), so this scorer can stay generator-agnostic and do a plain string comparison. + +By default the matcher drops native/harness-internal tools (``Read``, +``Bash``, ``update_topic``, ``run_shell_command``, ``ToolSearch``, ...) +from both expected and actual trajectories before scoring, so dataset +authors can focus ``expected_trajectory`` on user-visible MCP intent. +Set ``filter_native_tools: false`` in the scorer config to compare raw +trajectories instead -- useful when an evalset cares about how often the +agent reaches for a native tool. """ from typing import Tuple, Any, List from scorers import comparator +from generators.models.tool_naming import looks_like_canonical_mcp_name class TrajectoryMatcher(comparator.Comparator): @@ -26,6 +35,7 @@ def __init__(self, config: dict): self.name = "trajectory_matcher" self.config = config self.enforce_order = config.get("enforce_order", False) + self.filter_native_tools = config.get("filter_native_tools", True) def _levenshtein_distance(self, seq1: List[str], seq2: List[str]) -> int: n, m = len(seq1), len(seq2) @@ -90,8 +100,23 @@ def compare( if not isinstance(expected, list) or not isinstance(actual, list): return 0.0, "Trajectory data must be lists." 
+ filter_note = "" + if self.filter_native_tools: + filtered_expected = [t for t in expected if looks_like_canonical_mcp_name(t)] + filtered_actual = [t for t in actual if looks_like_canonical_mcp_name(t)] + dropped_expected = len(expected) - len(filtered_expected) + dropped_actual = len(actual) - len(filtered_actual) + if dropped_expected or dropped_actual: + filter_note = ( + f" (filter_native_tools=True dropped " + f"{dropped_expected} expected, {dropped_actual} actual)" + ) + expected, actual = filtered_expected, filtered_actual + if not expected and not actual: - return 100.0, "Both expected and actual trajectories are empty." + return 100.0, ( + "Both expected and actual trajectories are empty." + filter_note + ) score = 0.0 explanation = "" @@ -105,12 +130,20 @@ def compare( normalized_score = max( 0.0, 1.0 - (distance / max_len)) if max_len > 0 else 1.0 score = normalized_score * 100.0 - explanation = f"Sequence Alignment Score: {score:.2f} (Distance: {distance}, Max Length: {max_len}). Expected: {expected}, Actual: {actual}" + explanation = ( + f"Sequence Alignment Score: {score:.2f} (Distance: {distance}, " + f"Max Length: {max_len}). Expected: {expected}, Actual: {actual}" + + filter_note + ) else: # Flexible ordering (Jaccard Similarity) similarity = self._jaccard_similarity(set(expected), set(actual)) score = similarity * 100.0 - explanation = f"Jaccard Similarity Score: {score:.2f} (Intersection over Union). Expected Set: {set(expected)}, Actual Set: {set(actual)}" + explanation = ( + f"Jaccard Similarity Score: {score:.2f} (Intersection over Union). 
" + f"Expected Set: {set(expected)}, Actual Set: {set(actual)}" + + filter_note + ) return score, explanation diff --git a/evalbench/test/tool_naming_test.py b/evalbench/test/tool_naming_test.py index c2e7d852..6908f895 100644 --- a/evalbench/test/tool_naming_test.py +++ b/evalbench/test/tool_naming_test.py @@ -11,6 +11,7 @@ canonical_tool_name, canonicalize_claude_tool_name, canonicalize_gemini_tool_name, + looks_like_canonical_mcp_name, parse_claude_mcp_tool_name, parse_gemini_mcp_tool_name, ) @@ -123,5 +124,22 @@ def test_gemini_malformed_mcp_returned_as_is(self): ) +class LooksLikeCanonicalMcpNameTest(unittest.TestCase): + + def test_mcp_form_is_detected(self): + self.assertTrue(looks_like_canonical_mcp_name("cloud-sql__list_instances")) + self.assertTrue(looks_like_canonical_mcp_name("alloydb__create_user")) + + def test_native_tools_rejected(self): + self.assertFalse(looks_like_canonical_mcp_name("Read")) + self.assertFalse(looks_like_canonical_mcp_name("update_topic")) + self.assertFalse(looks_like_canonical_mcp_name("run_shell_command")) + + def test_empty_segments_rejected(self): + self.assertFalse(looks_like_canonical_mcp_name("")) + self.assertFalse(looks_like_canonical_mcp_name("__tool")) + self.assertFalse(looks_like_canonical_mcp_name("server__")) + + if __name__ == "__main__": unittest.main() diff --git a/evalbench/test/trajectory_matcher_test.py b/evalbench/test/trajectory_matcher_test.py index e064fb34..48e575ed 100644 --- a/evalbench/test/trajectory_matcher_test.py +++ b/evalbench/test/trajectory_matcher_test.py @@ -66,8 +66,9 @@ def test_server_qualifier_distinguishes_same_tool_across_servers(self): score, _ = _compare(matcher, expected, actual) self.assertEqual(score, 0.0) - def test_native_tools_pass_through(self): - matcher = TrajectoryMatcher({}) + def test_native_tools_pass_through_when_filter_disabled(self): + # With filtering off, native tool names are compared verbatim. 
+ matcher = TrajectoryMatcher({"filter_native_tools": False}) expected = ["Read", "Bash"] actual = ["Read", "Bash"] @@ -75,6 +76,50 @@ def test_native_tools_pass_through(self): score, _ = _compare(matcher, expected, actual) self.assertEqual(score, 100.0) + def test_filter_native_tools_drops_native_on_actual_by_default(self): + # Default-on filter: native tools in actual must not drag Jaccard down + # when expected contains only MCP intent. + matcher = TrajectoryMatcher({}) + + expected = ["cloud-sql__list_instances"] + actual = ["cloud-sql__list_instances", "Read", "Bash", "update_topic"] + + score, explanation = _compare(matcher, expected, actual) + self.assertEqual(score, 100.0) + self.assertIn("filter_native_tools=True", explanation) + + def test_filter_disabled_keeps_native_tools(self): + # With filtering off, the same native leakage drags Jaccard down. + matcher = TrajectoryMatcher({"filter_native_tools": False}) + + expected = ["cloud-sql__list_instances"] + actual = ["cloud-sql__list_instances", "Read", "Bash", "update_topic"] + + score, _ = _compare(matcher, expected, actual) + self.assertLess(score, 100.0) + + def test_filter_applies_to_expected_too(self): + # Symmetric filtering: native tools in expected are also dropped so + # an evalset author can't accidentally pin behavior on a native tool + # while filtering is on. + matcher = TrajectoryMatcher({}) + + expected = ["cloud-sql__list_instances", "Bash"] + actual = ["cloud-sql__list_instances"] + + score, _ = _compare(matcher, expected, actual) + self.assertEqual(score, 100.0) + + def test_filter_removes_all_tools_scores_empty(self): + # If filtering wipes both sides clean, the matcher should report the + # standard "both empty" success rather than divide-by-zero. 
+ matcher = TrajectoryMatcher({}) + + score, explanation = _compare(matcher, ["Read"], ["Bash"]) + self.assertEqual(score, 100.0) + self.assertIn("empty", explanation) + self.assertIn("filter_native_tools=True", explanation) + def test_both_empty_is_full_score(self): matcher = TrajectoryMatcher({}) score, explanation = _compare(matcher, [], [])