diff --git a/AGENTS.md b/AGENTS.md index 513d7d6d..f603be4c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -207,7 +207,7 @@ Contains the test cases. "id": "list-instances-01", "starting_prompt": "List all Cloud SQL instances in project my-evaluation-project", "conversation_plan": "Ensure the agent accurately calls list_instances. Verify the output is returned correctly.", - "expected_trajectory": ["list_instances"], + "expected_trajectory": ["cloud-sql__list_instances"], "env": { "GOOGLE_CLOUD_PROJECT": "my-evaluation-project" }, "max_turns": 4 } diff --git a/datasets/claude-code-tools/claude-code-fake.evalset.json b/datasets/claude-code-tools/claude-code-fake.evalset.json index 1d9f584b..f02e8424 100644 --- a/datasets/claude-code-tools/claude-code-fake.evalset.json +++ b/datasets/claude-code-tools/claude-code-fake.evalset.json @@ -5,7 +5,7 @@ "starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.", "conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.", "expected_trajectory": [ - "create_instance" + "cloud-sql__create_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -18,7 +18,7 @@ "starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.", "conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. 
The agent should explain that the instance could not be found based on the error.", "expected_trajectory": [ - "get_instance" + "cloud-sql__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" diff --git a/datasets/claude-code-tools/claude-code.evalset.json b/datasets/claude-code-tools/claude-code.evalset.json index 29b79df0..9ce302ce 100644 --- a/datasets/claude-code-tools/claude-code.evalset.json +++ b/datasets/claude-code-tools/claude-code.evalset.json @@ -5,8 +5,8 @@ "starting_prompt": "list all Cloud SQL instances in project astana-evaluation", "conversation_plan": "Ask the agent to list instances in project astana-evaluation. Once all instances are listed if nl2code exist get its state and validate its RUNNABLE", "expected_trajectory": [ - "list_instances", - "get_instance" + "cloud-sql__list_instances", + "cloud-sql__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -19,9 +19,9 @@ "starting_prompt": "I need a database.", "conversation_plan": "The user starts with a vague request. You want to CREATE a NEW Cloud SQL instance named 'my-pg-app'. If the agent offers to create one, say YES. When asked for details, provide 'my-pg-app' as the instance name and 'user_data' as the database name. Never claim to have an existing instance. The goal is for the agent to eventually create the database 'user_data' inside 'my-pg-app' in astana-evaluation project.", "expected_trajectory": [ - "list_instances", - "create_instance", - "create_database" + "cloud-sql__list_instances", + "cloud-sql__create_instance", + "cloud-sql__create_database" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -34,8 +34,8 @@ "starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.", "conversation_plan": "The user asks to interact with an instance named 'non-existent-db-123' in astana-evaluation project that doesn't exist. 
The agent should try to get the instance details or update it directly, fail to find it, and inform the user. The user will then ask to list instances to find the correct name.", "expected_trajectory": [ - "update_instance", - "list_instances" + "cloud-sql__update_instance", + "cloud-sql__list_instances" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" diff --git a/datasets/codex-cli-tools/codex-cli-fake.evalset.json b/datasets/codex-cli-tools/codex-cli-fake.evalset.json index 1d9f584b..f02e8424 100644 --- a/datasets/codex-cli-tools/codex-cli-fake.evalset.json +++ b/datasets/codex-cli-tools/codex-cli-fake.evalset.json @@ -5,7 +5,7 @@ "starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.", "conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.", "expected_trajectory": [ - "create_instance" + "cloud-sql__create_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -18,7 +18,7 @@ "starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.", "conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. 
The agent should explain that the instance could not be found based on the error.", "expected_trajectory": [ - "get_instance" + "cloud-sql__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" diff --git a/datasets/codex-cli-tools/codex-cli.evalset.json b/datasets/codex-cli-tools/codex-cli.evalset.json index 29b79df0..9ce302ce 100644 --- a/datasets/codex-cli-tools/codex-cli.evalset.json +++ b/datasets/codex-cli-tools/codex-cli.evalset.json @@ -5,8 +5,8 @@ "starting_prompt": "list all Cloud SQL instances in project astana-evaluation", "conversation_plan": "Ask the agent to list instances in project astana-evaluation. Once all instances are listed if nl2code exist get its state and validate its RUNNABLE", "expected_trajectory": [ - "list_instances", - "get_instance" + "cloud-sql__list_instances", + "cloud-sql__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -19,9 +19,9 @@ "starting_prompt": "I need a database.", "conversation_plan": "The user starts with a vague request. You want to CREATE a NEW Cloud SQL instance named 'my-pg-app'. If the agent offers to create one, say YES. When asked for details, provide 'my-pg-app' as the instance name and 'user_data' as the database name. Never claim to have an existing instance. The goal is for the agent to eventually create the database 'user_data' inside 'my-pg-app' in astana-evaluation project.", "expected_trajectory": [ - "list_instances", - "create_instance", - "create_database" + "cloud-sql__list_instances", + "cloud-sql__create_instance", + "cloud-sql__create_database" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -34,8 +34,8 @@ "starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.", "conversation_plan": "The user asks to interact with an instance named 'non-existent-db-123' in astana-evaluation project that doesn't exist. The agent should try to get the instance details or update it directly, fail to find it, and inform the user. 
The user will then ask to list instances to find the correct name.", "expected_trajectory": [ - "update_instance", - "list_instances" + "cloud-sql__update_instance", + "cloud-sql__list_instances" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" diff --git a/datasets/gemini-cli-tools/bigtable/gemini-cli-bigtable.evalset.json b/datasets/gemini-cli-tools/bigtable/gemini-cli-bigtable.evalset.json index 9fa8b67d..9d3ccf81 100644 --- a/datasets/gemini-cli-tools/bigtable/gemini-cli-bigtable.evalset.json +++ b/datasets/gemini-cli-tools/bigtable/gemini-cli-bigtable.evalset.json @@ -5,10 +5,10 @@ "starting_prompt": "I need to initialize my Bigtable environment for evaluation if they don't exist already. Please create the following instances in project cloud-bigtable-spacewalk: 'evalbench-prod-data', 'evalbench-legacy-archive', 'evalbench-staging-db', 'evalbench-customers-v2', 'evalbench-dev-db', and 'evalbench-test-instance-123'. All clusters should be in us-central1-a. Also, in 'evalbench-legacy-archive' create a table 'evalbench-v1-data' and in 'evalbench-staging-db' create a table 'evalbench-temporary-data' if they don't exist already.", "conversation_plan": "The user is seeding the environment. The agent should sequentially or in parallel create all requested instances and tables. Confirm when everything is ready.", "expected_trajectory": [ - "mcp_Bigtable_list_instances", - "mcp_Bigtable_create_instance", - "mcp_Bigtable_list_tables", - "mcp_Bigtable_create_table" + "bigtable__list_instances", + "bigtable__create_instance", + "bigtable__list_tables", + "bigtable__create_table" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -21,8 +21,8 @@ "starting_prompt": "What Bigtable instances are running in project cloud-bigtable-spacewalk?", "conversation_plan": "The user starts with a discovery question. The agent should list instances. 
The user then asks for the configuration details of the 'evalbench-prod-data' instance to check its clusters.", "expected_trajectory": [ - "mcp_Bigtable_list_instances", - "mcp_Bigtable_get_instance" + "bigtable__list_instances", + "bigtable__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -35,9 +35,9 @@ "starting_prompt": "Set up a new Bigtable instance 'evalbench-metrics-db' in us-east1-c and add a 'evalbench-raw-metrics' table with a 'data' column family. If the instance creation is pending, please wait or check until it's ready before creating the table.", "conversation_plan": "The agent must create the instance. If the table creation fails because the instance isn't ready, the agent should poll 'get_instance' or wait and try again. The user expects both to be completed.", "expected_trajectory": [ - "mcp_Bigtable_create_instance", - "mcp_Bigtable_get_instance", - "mcp_Bigtable_create_table" + "bigtable__create_instance", + "bigtable__get_instance", + "bigtable__create_table" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -50,7 +50,7 @@ "starting_prompt": "I need to ensure the Bigtable instance 'evalbench-prod-data' exists. If it's already there, just let me know its current state. If not, create it in us-central1-a.", "conversation_plan": "The agent should first check for the instance's existence using 'get_instance'. If it exists, it should report the state. If it returns a 404/not found, it should proceed to 'create_instance'.", "expected_trajectory": [ - "mcp_Bigtable_get_instance" + "bigtable__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -63,8 +63,8 @@ "starting_prompt": "I need to find the 'evalbench-audit' table in my 'evalbench-prod-data'. Can you get its metadata?", "conversation_plan": "The user wants to find a specific table. 
The agent should list tables in the instance to verify existence and then get the table details.", "expected_trajectory": [ - "mcp_Bigtable_list_tables", - "mcp_Bigtable_get_table" + "bigtable__list_tables", + "bigtable__get_table" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -77,7 +77,7 @@ "starting_prompt": "Delete the 'evalbench-temporary-data' table in the 'evalbench-staging-db' instance.", "conversation_plan": "A direct administrative action. The agent must call delete_table. Since the tool requires confirmation, the agent should ask and the user will say 'yes'.", "expected_trajectory": [ - "mcp_Bigtable_delete_table" + "bigtable__delete_table" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -90,9 +90,9 @@ "starting_prompt": "Get details for the 'evalbench-customer-db' instance.", "conversation_plan": "The user refers to an instance by a name that doesn't exist. The agent's 'get_instance' call will fail. The agent should then list instances, and the user will say 'Oh, I meant evalbench-customers-v2, get that one'.", "expected_trajectory": [ - "mcp_Bigtable_get_instance", - "mcp_Bigtable_list_instances", - "mcp_Bigtable_get_instance" + "bigtable__get_instance", + "bigtable__list_instances", + "bigtable__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -105,8 +105,8 @@ "starting_prompt": "Add a new table to my instance.", "conversation_plan": "A vague request. The agent should ask which instance. The user provides 'evalbench-dev-db'. The agent then asks for table name and families. The user provides name 'evalbench-app-logs' and family 'metadata'.", "expected_trajectory": [ - "mcp_Bigtable_list_instances", - "mcp_Bigtable_create_table" + "bigtable__list_instances", + "bigtable__create_table" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -119,7 +119,7 @@ "starting_prompt": "I no longer need the 'evalbench-test-instance-123'. 
Please delete it.", "conversation_plan": "The user wants to delete a specific instance. The agent must call delete_instance. The tool requires explicit user confirmation, so the agent should ask and the user will confirm with 'yes'.", "expected_trajectory": [ - "mcp_Bigtable_delete_instance" + "bigtable__delete_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" @@ -132,7 +132,7 @@ "starting_prompt": "Evaluation is complete. Please delete all the Bigtable instances we used in project cloud-bigtable-spacewalk: 'evalbench-prod-data', 'evalbench-legacy-archive', 'evalbench-staging-db', 'evalbench-customers-v2', 'evalbench-dev-db', and 'evalbench-metrics-db'. Confirm each deletion when asked.", "conversation_plan": "The user is cleaning up. The agent should call delete_instance for each instance. The user will provide 'yes' for each confirmation request.", "expected_trajectory": [ - "mcp_Bigtable_delete_instance" + "bigtable__delete_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk" diff --git a/datasets/gemini-cli-tools/gemini-cli-fake.evalset.json b/datasets/gemini-cli-tools/gemini-cli-fake.evalset.json index 1d9f584b..f02e8424 100644 --- a/datasets/gemini-cli-tools/gemini-cli-fake.evalset.json +++ b/datasets/gemini-cli-tools/gemini-cli-fake.evalset.json @@ -5,7 +5,7 @@ "starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.", "conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. 
The agent should call create_instance and report the success message back.", "expected_trajectory": [ - "create_instance" + "cloud-sql__create_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -18,7 +18,7 @@ "starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.", "conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. The agent should explain that the instance could not be found based on the error.", "expected_trajectory": [ - "get_instance" + "cloud-sql__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" diff --git a/datasets/gemini-cli-tools/gemini-cli.evalset.json b/datasets/gemini-cli-tools/gemini-cli.evalset.json index 050e4640..25ec2b14 100644 --- a/datasets/gemini-cli-tools/gemini-cli.evalset.json +++ b/datasets/gemini-cli-tools/gemini-cli.evalset.json @@ -5,8 +5,8 @@ "starting_prompt": "list all instances in project astana-evaluation", "conversation_plan": "Ask the agent to list instances in project astana-evaluation. Once all instances are listed if nl2code exist get its state and validate its RUNNABLE", "expected_trajectory": [ - "list_instances", - "get_instance" + "cloud-sql__list_instances", + "cloud-sql__get_instance" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -19,9 +19,9 @@ "starting_prompt": "I need a database.", "conversation_plan": "The user starts with a vague request. You want to CREATE a NEW Cloud SQL instance named 'my-pg-app'. If the agent offers to create one, say YES. When asked for details, provide 'my-pg-app' as the instance name and 'user_data' as the database name. Never claim to have an existing instance. 
The goal is for the agent to eventually create the database 'user_data' inside 'my-pg-app' in astana-evaluation project.", "expected_trajectory": [ - "list_instances", - "create_instance", - "create_database" + "cloud-sql__list_instances", + "cloud-sql__create_instance", + "cloud-sql__create_database" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation", @@ -35,9 +35,9 @@ "starting_prompt": "I want to clone the 'production-db' instance.", "conversation_plan": "The user wants to create a clone of the instance 'nl2code'. If the agent asks for a new instance name, provide 'staging-nl2code'. If asked to confirm the project, say 'astana-evaluation'.", "expected_trajectory": [ - "get_instance", - "clone_instance", - "get_operation" + "cloud-sql__get_instance", + "cloud-sql__clone_instance", + "cloud-sql__get_operation" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -50,9 +50,9 @@ "starting_prompt": "Scale up my database instance.", "conversation_plan": "The user wants to increase the resources for 'nl2code' in astana-evaluation project. When asked for specifics, state you want to change the tier/machine type to 'db-custom-4-15360' (4 vCPUs, 15GB RAM). Confirm the update when asked. Verify the operation status.", "expected_trajectory": [ - "get_instance", - "update_instance", - "get_operation" + "cloud-sql__get_instance", + "cloud-sql__update_instance", + "cloud-sql__get_operation" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -65,8 +65,8 @@ "starting_prompt": "I need to rotate the password for a user on the 'magic' instance, but I forgot their exact username.", "conversation_plan": "First, the user asks to list all users on the 'magic' instance. The agent will show the users. The user then identifies 'webapp_service_account' and asks to update its password to 'NewStrictPassword99!'. Confirm the password update. 
The project name is astana-evaluation", "expected_trajectory": [ - "list_users", - "update_user" + "cloud-sql__list_users", + "cloud-sql__update_user" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -79,8 +79,8 @@ "starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.", "conversation_plan": "The user asks to interact with an instance named 'non-existent-db-123' in astana-evaluation project that doesn't exist. The agent should try to get the instance details or update it directly, fail to find it, and inform the user. The user will then ask to list instances to find the correct name.", "expected_trajectory": [ - "update_instance", - "list_instances" + "cloud-sql__update_instance", + "cloud-sql__list_instances" ], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" @@ -89,4 +89,4 @@ "max_turns": 4 } ] -} \ No newline at end of file +} diff --git a/docs/claude_code_agent_testing.md b/docs/claude_code_agent_testing.md index 246bb0f1..02d60982 100644 --- a/docs/claude_code_agent_testing.md +++ b/docs/claude_code_agent_testing.md @@ -213,7 +213,7 @@ The model config defines the Claude Code CLI version, model, auth, environment, ### 3. Evaluation Dataset (Evalset) -**Identical schema** to the Gemini CLI evalset. See [Gemini CLI doc — Evalset](./gemini_cli_agent_testing.md#3-evaluation-dataset-evalset) for details. +**Identical schema** to the Gemini CLI evalset. See [Gemini CLI doc — Evalset](./gemini_cli_agent_testing.md#3-evaluation-dataset-evalset) for details, including the canonical [tool name format](./gemini_cli_agent_testing.md#tool-name-format) used in `expected_trajectory`. Minimal example: @@ -224,7 +224,7 @@ Minimal example: "id": "cloud-sql-list-instances-01", "starting_prompt": "list all Cloud SQL instances in project astana-evaluation", "conversation_plan": "Ask the agent to list instances. 
If nl2code exists, get its state and verify it is RUNNABLE.", - "expected_trajectory": ["list_instances", "get_instance"], + "expected_trajectory": ["cloud-sql__list_instances", "cloud-sql__get_instance"], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" }, "kind": "tools", "max_turns": 3 diff --git a/docs/codex_cli_agent_testing.md b/docs/codex_cli_agent_testing.md index f3df5809..7e9dd9e0 100644 --- a/docs/codex_cli_agent_testing.md +++ b/docs/codex_cli_agent_testing.md @@ -248,7 +248,7 @@ setup: ### 3. Evaluation Dataset (Evalset) -**Identical schema** to the Gemini CLI / Claude Code evalset. See [Gemini CLI doc — Evalset](./gemini_cli_agent_testing.md#3-evaluation-dataset-evalset) for full details. +**Identical schema** to the Gemini CLI / Claude Code evalset. See [Gemini CLI doc — Evalset](./gemini_cli_agent_testing.md#3-evaluation-dataset-evalset) for full details, including the canonical [tool name format](./gemini_cli_agent_testing.md#tool-name-format) used in `expected_trajectory`. Minimal example ([codex-cli.evalset.json](../datasets/codex-cli-tools/codex-cli.evalset.json)): @@ -259,7 +259,7 @@ Minimal example ([codex-cli.evalset.json](../datasets/codex-cli-tools/codex-cli. "id": "cloud-sql-list-instances-01", "starting_prompt": "list all Cloud SQL instances in project astana-evaluation", "conversation_plan": "Ask the agent to list instances in project astana-evaluation. Once all instances are listed if nl2code exists get its state and validate it is RUNNABLE.", - "expected_trajectory": ["list_instances", "get_instance"], + "expected_trajectory": ["cloud-sql__list_instances", "cloud-sql__get_instance"], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" }, "kind": "tools", "max_turns": 3 diff --git a/docs/gemini_cli_agent_testing.md b/docs/gemini_cli_agent_testing.md index 0e53518d..62c4782b 100644 --- a/docs/gemini_cli_agent_testing.md +++ b/docs/gemini_cli_agent_testing.md @@ -249,11 +249,15 @@ The evalset JSON file defines the test scenarios. 
Each scenario represents an ag | `id` | Yes | Unique identifier for the scenario | | `starting_prompt` | Yes | The first user message sent to Gemini CLI | | `conversation_plan` | Yes | Natural language instructions that guide the simulated user's behavior across turns. This defines the goals, expected information to provide, and how to react to agent responses. | -| `expected_trajectory` | Yes | Ordered list of tool names the agent is expected to call. Used by `trajectory_matcher` scorer. | +| `expected_trajectory` | Yes | Ordered list of tool names the agent is expected to call. Used by `trajectory_matcher` scorer. See [Tool name format](#tool-name-format) below. | | `env` | Optional | Per-scenario environment variables (merged with model config env) | | `kind` | Optional | Category label (e.g., `"tools"`) | | `max_turns` | Yes | Maximum number of conversation turns before the evaluation stops | +#### Tool name format + +Entries in `expected_trajectory` use the canonical form `<server>__<tool>` (double-underscore separator) for MCP tools, and the bare name for native harness tools (e.g. `Read`, `Bash`). Each harness adapter normalizes its raw tool-call event into this form at the boundary, so the same evalset can score runs from Codex, Claude Code, and Gemini CLI without modification. The `<server>` segment comes from the MCP server key in your model config and is case-sensitive — e.g. `cloud-sql` or `bigtable`. See `evalbench/generators/models/tool_naming.py` for the canonicalization helper. + #### Writing Good Conversation Plans The `conversation_plan` is a critical part of each scenario. It instructs the simulated user LLM how to behave. Best practices: @@ -270,7 +274,7 @@ The `conversation_plan` is a critical part of each scenario. It instructs the si "id": "csql-create-ambiguous-multiturn-01", "starting_prompt": "I need a database.", "conversation_plan": "The user starts with a vague request. You want to CREATE a NEW Cloud SQL instance named 'my-pg-app'. 
If the agent offers to create one, say YES. When asked for details, provide 'my-pg-app' as the instance name and 'user_data' as the database name. Never claim to have an existing instance. The goal is for the agent to eventually create the database 'user_data' inside 'my-pg-app' in astana-evaluation project.", - "expected_trajectory": ["list_instances", "create_instance", "create_database"], + "expected_trajectory": ["cloud-sql__list_instances", "cloud-sql__create_instance", "cloud-sql__create_database"], "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" }, @@ -790,7 +794,7 @@ reporting: "id": "list-and-inspect-01", "starting_prompt": "list all instances in project my-project", "conversation_plan": "Ask the agent to list instances. Once listed, get details of the 'prod-db' instance and verify it is RUNNABLE.", - "expected_trajectory": ["list_instances", "get_instance"], + "expected_trajectory": ["cloud-sql__list_instances", "cloud-sql__get_instance"], "env": { "GOOGLE_CLOUD_PROJECT": "my-project" }, "kind": "tools", "max_turns": 3 @@ -866,7 +870,7 @@ reporting: "id": "fake-create-success", "starting_prompt": "Create a new Cloud SQL instance named 'test-db' in project 'my-project'.", "conversation_plan": "All details are in the prompt. 
The agent should call create_instance and report success.", - "expected_trajectory": ["create_instance"], + "expected_trajectory": ["cloud-sql__create_instance"], "env": { "GOOGLE_CLOUD_PROJECT": "my-project" }, "kind": "tools", "max_turns": 3 diff --git a/evalbench/generators/models/claude_code.py b/evalbench/generators/models/claude_code.py index 05c8669c..064c3149 100644 --- a/evalbench/generators/models/claude_code.py +++ b/evalbench/generators/models/claude_code.py @@ -1,4 +1,5 @@ from .generator import QueryGenerator +from .tool_naming import canonicalize_claude_tool_name import subprocess import os import json @@ -10,6 +11,12 @@ from util.context import rpc_id_var +# Infrastructure tools that the Claude Code harness invokes for its own +# bookkeeping (e.g. enumerating available MCP tools) and that should not +# count toward the user-visible trajectory. +_CLAUDE_INFRA_TOOLS = frozenset({"ToolSearch"}) + + class CLICommand: def __init__(self, cli, prompt, env=None, resume=False, session_id=None, allowedTools=None): self.cli = cli @@ -507,8 +514,14 @@ def _parse_stream_json(self, stream_output: str) -> str: final_obj["response"] += block.get("text", "") elif block_type == "tool_use": tool_id = block.get("id", "") + # Claude Code emits MCP tools as + # ``mcp__<server>__<tool>``. Normalize to the + # canonical ``<server>__<tool>`` form so the + # trajectory matcher can compare across + # harnesses without per-generator logic. + raw_name = block.get("name", "unknown") tool_uses[tool_id] = { - "tool_name": block.get("name", "unknown"), + "tool_name": canonicalize_claude_tool_name(raw_name), "parameters": block.get("input", {}), } @@ -692,14 +705,24 @@ def parse_response(self, stdout: str) -> dict: return {} def extract_tools(self, stdout: str) -> list[str]: - """Extracts the list of tools used from the CLI output.""" + """Extracts the list of tools used from the CLI output. 
+ + Filters out infrastructure tools (see ``_CLAUDE_INFRA_TOOLS``) so + that trajectory comparisons reflect user-visible behavior only. + Tool names are already in canonical ``<server>__<tool>`` form for + MCP tools (set when the ``tool_use`` block was recorded). + """ output_json = self.parse_response(stdout) if ( "stats" in output_json and "tools" in output_json["stats"] and "byName" in output_json["stats"]["tools"] ): - return list(output_json["stats"]["tools"]["byName"].keys()) + return [ + name + for name in output_json["stats"]["tools"]["byName"].keys() + if name not in _CLAUDE_INFRA_TOOLS + ] return [] def _get_installed_skills(self) -> set[str]: diff --git a/evalbench/generators/models/codex_cli.py b/evalbench/generators/models/codex_cli.py index 657de038..f88f405e 100644 --- a/evalbench/generators/models/codex_cli.py +++ b/evalbench/generators/models/codex_cli.py @@ -1,4 +1,5 @@ from .generator import QueryGenerator +from .tool_naming import canonical_tool_name import subprocess import os import json @@ -846,10 +847,15 @@ def item_payload(item: dict) -> dict: final_obj["response"] += text elif kind == "mcp_tool_call": - # Record on first sight; refresh on completion. + # Codex's mcp_tool_call payload exposes the MCP server name + # and tool name as separate fields. Combine them into the + # canonical ``<server>__<tool>`` form so downstream scorers + # can compare across harnesses without per-generator logic. 
+ server = payload.get("server", "") + tool = payload.get("tool", "unknown") tool_uses[item_id] = { - "tool_name": payload.get("tool", "unknown"), - "server": payload.get("server", ""), + "tool_name": canonical_tool_name(server, tool), + "server": server, "parameters": self._coerce_json(payload.get("arguments", {})), } if event_type == self._EV_ITEM_COMPLETED: diff --git a/evalbench/generators/models/gemini_cli.py b/evalbench/generators/models/gemini_cli.py index b12eba35..dd6846b5 100644 --- a/evalbench/generators/models/gemini_cli.py +++ b/evalbench/generators/models/gemini_cli.py @@ -1,4 +1,5 @@ from .generator import QueryGenerator +from .tool_naming import canonicalize_gemini_tool_name import subprocess import os import json @@ -965,7 +966,16 @@ def _parse_stream_json(self, stream_output: str) -> str: } for tid, tu in tool_uses.items(): - tname = tu.get("tool_name", "unknown") + # Gemini CLI reports MCP tools as + # ``mcp_<server>_<tool>`` (single-underscore + # separators); native tools use their bare names. + # Normalize MCP tools to the canonical + # ``<server>__<tool>`` form so the trajectory + # matcher can compare across harnesses without + # per-generator logic. + tname = canonicalize_gemini_tool_name( + tu.get("tool_name", "unknown") + ) if tname not in tools_stats["byName"]: tools_stats["byName"][tname] = { "count": 0, diff --git a/evalbench/generators/models/tool_naming.py b/evalbench/generators/models/tool_naming.py new file mode 100644 index 00000000..0e08607a --- /dev/null +++ b/evalbench/generators/models/tool_naming.py @@ -0,0 +1,128 @@ +"""Canonical naming for MCP tool calls across harness adapters. + +Each harness (Codex, Claude Code, Gemini CLI) reports MCP tool calls in a +different format: + +- Codex emits the server name and tool name as separate fields on the + ``mcp_tool_call`` payload. +- Claude Code emits ``mcp__<server>__<tool>`` (double-underscore separators). + Server names may contain single underscores. +- Gemini CLI emits ``mcp_<server>_<tool>`` (single-underscore separators). 
+ Upstream forbids underscores in the server name -- see + ``packages/core/src/tools/mcp-tool.ts`` in google-gemini/gemini-cli, where + the parser uses ``^([^_]+)_(.+)$`` -- so the format is unambiguous. + +This module converts each format into a single canonical string: + + <server>__<tool> for MCP tools + <tool> for native/built-in tools (no server) + +The double-underscore separator preserves server identity (so +``cloud-sql__list_instances`` and ``alloydb__list_instances`` stay distinct) +and matches the convention Claude Code already uses, while never colliding +with Gemini's single-underscore separator. Datasets store golden +``expected_trajectory`` entries in this same canonical form, allowing the +trajectory matcher to perform a plain string comparison without per-harness +special cases. +""" + +from __future__ import annotations + +import re +from typing import Optional, Tuple + + +CANONICAL_SEPARATOR = "__" + +_CLAUDE_MCP_PREFIX = "mcp__" + +# Matches ``mcp_<server>_<tool>`` where the server segment contains no +# underscores. Mirrors the contract enforced by gemini-cli upstream so we +# parse the same way it formats. +_GEMINI_MCP_PATTERN = re.compile(r"^mcp_([^_]+)_(.+)$") + + +def canonical_tool_name(server: Optional[str], tool: str) -> str: + """Return the canonical name for a tool call. + + Args: + server: MCP server name, or an empty string / None for native tools. + tool: Bare tool name as exposed by the server (or the native tool). + + Returns: + ``<server>__<tool>`` when ``server`` is non-empty, otherwise ``tool``. + """ + if not tool: + return tool + if server: + return f"{server}{CANONICAL_SEPARATOR}{tool}" + return tool + + +def parse_claude_mcp_tool_name(name: str) -> Optional[Tuple[str, str]]: + """Parse a Claude Code MCP tool name into ``(server, tool)``. + + Claude Code's SDK reports MCP tools as ``mcp__<server>__<tool>``. Both + the server and tool segments may contain single underscores; only the + double-underscore acts as a separator. 
The first ``__`` after the + ``mcp__`` prefix is treated as the server/tool boundary so the tool + segment may itself contain ``__``. + + Returns: + ``(server, tool)`` if ``name`` matches the expected format, + otherwise ``None``. + """ + if not name.startswith(_CLAUDE_MCP_PREFIX): + return None + remainder = name[len(_CLAUDE_MCP_PREFIX):] + server, sep, tool = remainder.partition(CANONICAL_SEPARATOR) + if not sep or not server or not tool: + return None + return server, tool + + +def parse_gemini_mcp_tool_name(name: str) -> Optional[Tuple[str, str]]: + """Parse a Gemini CLI MCP tool name into ``(server, tool)``. + + Gemini CLI reports MCP tools as ``mcp_<server>_<tool>`` using a single + underscore separator. The upstream parser requires the server segment + to contain no underscores, which makes the split unambiguous even when + the tool name itself contains underscores. + + Returns: + ``(server, tool)`` if ``name`` matches the expected format, + otherwise ``None``. + """ + match = _GEMINI_MCP_PATTERN.match(name) + if not match: + return None + return match.group(1), match.group(2) + + +def canonicalize_claude_tool_name(name: str) -> str: + """Convert a Claude Code tool name to canonical form. + + MCP tools are reformatted; native tools (e.g. ``Read``, ``Bash``) pass + through unchanged. If a name starts with ``mcp__`` but does not match + the expected structure, it is returned as-is so the caller can still see + and debug the raw value. + """ + parsed = parse_claude_mcp_tool_name(name) + if parsed is None: + return name + server, tool = parsed + return canonical_tool_name(server, tool) + + +def canonicalize_gemini_tool_name(name: str) -> str: + """Convert a Gemini CLI tool name to canonical form. + + MCP tools are reformatted; native tools pass through unchanged. If a + name starts with ``mcp_`` but does not match the expected structure, it + is returned as-is.
+ """ + parsed = parse_gemini_mcp_tool_name(name) + if parsed is None: + return name + server, tool = parsed + return canonical_tool_name(server, tool) diff --git a/evalbench/scorers/prompt/parameteranalysis.py b/evalbench/scorers/prompt/parameteranalysis.py index 36603cad..53c134f5 100644 --- a/evalbench/scorers/prompt/parameteranalysis.py +++ b/evalbench/scorers/prompt/parameteranalysis.py @@ -9,6 +9,7 @@ ### Best Practices for Tool Design When providing your suggestions, consider the following best practices for tool building: - **Tool Names**: Use `snake_case` (`_`) and avoid product-specific prefixes. + Note: tool names you see here may carry a `<server>__` prefix (e.g. `cloud-sql__list_instances`) added by the evaluation framework to disambiguate identically-named tools across MCP servers. Ignore this prefix when evaluating tool design — critique only the portion after `__`. - **Focus**: Tools should be focused on a specific task. Aim for tools comprehensive enough to complete a task in one go, but avoid bundling unrelated actions. - **Idempotency**: Whenever possible, tools should be idempotent (e.g., returning success if a resource to be created already exists). - **Actionable Error Messages**: Errors should be clear and actionable, explaining what went wrong, why, and how to fix it instead of generic errors. diff --git a/evalbench/scorers/trajectorymatcher.py b/evalbench/scorers/trajectorymatcher.py index 1c7e49a3..292b4e2a 100644 --- a/evalbench/scorers/trajectorymatcher.py +++ b/evalbench/scorers/trajectorymatcher.py @@ -1,7 +1,13 @@ """ TrajectoryMatcher -It compares the expected tool usage trajectory with the actual executed tools. +Compares the expected tool usage trajectory with the actual executed tools. + +Tool names on both sides are expected to already be in canonical form -- MCP +tools as ``<server>__<tool>`` and native tools as their bare names.
Each +harness adapter performs that normalization at the boundary (see +``generators/models/tool_naming.py``), so this scorer can stay +generator-agnostic and do a plain string comparison. """ from typing import Tuple, Any, List @@ -20,7 +26,6 @@ def __init__(self, config: dict): self.name = "trajectory_matcher" self.config = config self.enforce_order = config.get("enforce_order", False) - self.generator = config.get("generator", "") def _levenshtein_distance(self, seq1: List[str], seq2: List[str]) -> int: n, m = len(seq1), len(seq2) @@ -53,30 +58,6 @@ def _jaccard_similarity(self, set1: set, set2: set) -> float: return 1.0 # Both are empty return intersection / union - def _normalize_trajectory(self, trajectory: List[str]) -> List[str]: - if not trajectory: - return [] - - normalized = [] - for tool in trajectory: - if self.generator == "claude_code": - if tool == "ToolSearch": - continue - if tool.startswith("mcp__"): - # Drop the prefix "mcp____" - # Assuming the format is mcp__server_name__tool_name - parts = tool.split("__", 2) - if len(parts) == 3: - normalized.append(parts[2]) - else: - # If it doesn't match expected parts, just strip the prefix - normalized.append(tool.replace("mcp__", "", 1)) - else: - normalized.append(tool) - else: - normalized.append(tool) - return normalized - def compare( self, nl_prompt: str, @@ -106,9 +87,6 @@ def compare( expected = golden_execution_result or [] actual = generated_execution_result or [] - expected = self._normalize_trajectory(expected) - actual = self._normalize_trajectory(actual) - if not isinstance(expected, list) or not isinstance(actual, list): return 0.0, "Trajectory data must be lists." 
diff --git a/evalbench/test/tool_naming_test.py b/evalbench/test/tool_naming_test.py new file mode 100644 index 00000000..c2e7d852 --- /dev/null +++ b/evalbench/test/tool_naming_test.py @@ -0,0 +1,127 @@ +"""Unit tests for the canonical MCP tool-naming helper.""" + +import os +import sys +import unittest + +# Make the ``generators`` package importable when the test is run directly. +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from generators.models.tool_naming import ( + canonical_tool_name, + canonicalize_claude_tool_name, + canonicalize_gemini_tool_name, + parse_claude_mcp_tool_name, + parse_gemini_mcp_tool_name, +) + + +class CanonicalToolNameTest(unittest.TestCase): + + def test_joins_server_and_tool(self): + self.assertEqual( + canonical_tool_name("cloud-sql", "list_instances"), + "cloud-sql__list_instances", + ) + + def test_returns_bare_tool_when_no_server(self): + self.assertEqual(canonical_tool_name("", "Read"), "Read") + self.assertEqual(canonical_tool_name(None, "Read"), "Read") + + def test_empty_tool_returns_empty(self): + self.assertEqual(canonical_tool_name("cloud-sql", ""), "") + + +class ParseClaudeMcpToolNameTest(unittest.TestCase): + + def test_basic(self): + self.assertEqual( + parse_claude_mcp_tool_name("mcp__cloud-sql__list_instances"), + ("cloud-sql", "list_instances"), + ) + + def test_server_with_underscore(self): + # Claude allows underscores in server names; only the first ``__`` + # separates server from tool. 
+ self.assertEqual( + parse_claude_mcp_tool_name("mcp__my_server__do_thing"), + ("my_server", "do_thing"), + ) + + def test_tool_with_double_underscore_preserved(self): + self.assertEqual( + parse_claude_mcp_tool_name("mcp__srv__odd__tool"), + ("srv", "odd__tool"), + ) + + def test_rejects_missing_prefix(self): + self.assertIsNone(parse_claude_mcp_tool_name("list_instances")) + + def test_rejects_empty_server(self): + self.assertIsNone(parse_claude_mcp_tool_name("mcp____tool")) + + def test_rejects_no_tool(self): + self.assertIsNone(parse_claude_mcp_tool_name("mcp__server__")) + + +class ParseGeminiMcpToolNameTest(unittest.TestCase): + + def test_basic(self): + self.assertEqual( + parse_gemini_mcp_tool_name("mcp_cloud-sql_list_instances"), + ("cloud-sql", "list_instances"), + ) + + def test_tool_with_underscores(self): + # Server has no underscore by upstream contract; tool may contain + # several. + self.assertEqual( + parse_gemini_mcp_tool_name("mcp_alloydb_create_user_password"), + ("alloydb", "create_user_password"), + ) + + def test_rejects_missing_prefix(self): + self.assertIsNone(parse_gemini_mcp_tool_name("list_instances")) + + def test_rejects_server_only(self): + self.assertIsNone(parse_gemini_mcp_tool_name("mcp_cloudsql")) + + +class CanonicalizeAdapterFormsTest(unittest.TestCase): + + def test_claude_mcp_becomes_canonical(self): + self.assertEqual( + canonicalize_claude_tool_name("mcp__cloud-sql__list_instances"), + "cloud-sql__list_instances", + ) + + def test_claude_native_tool_passthrough(self): + self.assertEqual(canonicalize_claude_tool_name("Read"), "Read") + self.assertEqual(canonicalize_claude_tool_name("Bash"), "Bash") + + def test_claude_malformed_mcp_returned_as_is(self): + # Falls back to the raw name so callers can debug unexpected inputs + # instead of silently producing a misleading canonical form. 
+ self.assertEqual( + canonicalize_claude_tool_name("mcp__only-server"), + "mcp__only-server", + ) + + def test_gemini_mcp_becomes_canonical(self): + self.assertEqual( + canonicalize_gemini_tool_name("mcp_cloud-sql_list_instances"), + "cloud-sql__list_instances", + ) + + def test_gemini_native_tool_passthrough(self): + self.assertEqual(canonicalize_gemini_tool_name("write_file"), "write_file") + + def test_gemini_malformed_mcp_returned_as_is(self): + self.assertEqual( + canonicalize_gemini_tool_name("mcp_lonely"), + "mcp_lonely", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/evalbench/test/trajectory_matcher_test.py b/evalbench/test/trajectory_matcher_test.py index fe8eed94..e064fb34 100644 --- a/evalbench/test/trajectory_matcher_test.py +++ b/evalbench/test/trajectory_matcher_test.py @@ -1,85 +1,94 @@ -import unittest -import sys +"""Unit tests for the trajectory matcher. + +The matcher is generator-agnostic: it expects tool names on both sides to +already be in canonical ``<server>__<tool>`` form (or bare for native +tools). Per-harness normalization lives in the adapters; see +``generators/models/tool_naming.py`` for the canonical-naming helper and +``test/tool_naming_test.py`` for its tests. +""" + import os -from unittest.mock import patch, mock_open +import sys +import unittest -# Add the parent directory to sys.path to find scorers +# Make the ``scorers`` package importable when the test is run directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from scorers.trajectorymatcher import TrajectoryMatcher -class TestTrajectoryMatcher(unittest.TestCase): +def _compare(matcher, expected, actual): + """Convenience wrapper around the matcher's positional ``compare`` API.""" + return matcher.compare( + None, None, None, expected, None, None, None, actual, None, None + ) + - def test_default_behavior_no_normalization(self): - config = {} - matcher = TrajectoryMatcher(config) +class TrajectoryMatcherTest(unittest.TestCase): - expected = ["ToolA", "ToolB", "ToolSearch"] - actual = ["ToolA", "ToolB", "ToolSearch"] + def test_exact_match_returns_full_score(self): + matcher = TrajectoryMatcher({}) - score, explanation = matcher.compare(None, None, None, expected, None, None, None, actual, None, None) + expected = ["cloud-sql__list_instances", "cloud-sql__get_instance"] + actual = ["cloud-sql__list_instances", "cloud-sql__get_instance"] + + score, explanation = _compare(matcher, expected, actual) self.assertEqual(score, 100.0) self.assertIn("Jaccard Similarity Score: 100.00", explanation) - def test_claude_behavior_remove_toolsearch(self): - config = {"generator": "claude_code"} - matcher = TrajectoryMatcher(config) + def test_jaccard_ignores_order_by_default(self): + matcher = TrajectoryMatcher({}) - expected = ["ToolA", "ToolSearch", "ToolB"] - actual = ["ToolA", "ToolB"] + expected = ["cloud-sql__list_instances", "cloud-sql__get_instance"] + actual = ["cloud-sql__get_instance", "cloud-sql__list_instances"] - # After normalization, expected should become ["ToolA", "ToolB"] - # So it should match actual exactly. 
- score, explanation = matcher.compare(None, None, None, expected, None, None, None, actual, None, None) + score, _ = _compare(matcher, expected, actual) self.assertEqual(score, 100.0) - def test_claude_behavior_strip_mcp_prefix(self): - config = {"generator": "claude_code"} - matcher = TrajectoryMatcher(config) + def test_strict_ordering_penalizes_swaps(self): + matcher = TrajectoryMatcher({"enforce_order": True}) - expected = ["mcp__server__toolA", "ToolB"] - actual = ["toolA", "ToolB"] + expected = ["cloud-sql__list_instances", "cloud-sql__get_instance"] + actual = ["cloud-sql__get_instance", "cloud-sql__list_instances"] - # After normalization, expected should become ["toolA", "ToolB"] - score, explanation = matcher.compare(None, None, None, expected, None, None, None, actual, None, None) - self.assertEqual(score, 100.0) + score, _ = _compare(matcher, expected, actual) + self.assertLess(score, 100.0) - def test_claude_behavior_combined(self): - config = {"generator": "claude_code"} - matcher = TrajectoryMatcher(config) + def test_server_qualifier_distinguishes_same_tool_across_servers(self): + # Without the server prefix, both calls would collide with the + # expected ``list_instances``. The canonical form keeps them + # distinct: alloydb's call should not satisfy a cloud-sql expectation. 
+ matcher = TrajectoryMatcher({}) - expected = ["mcp__server__toolA", "ToolSearch", "ToolB"] - actual = ["toolA", "ToolB"] + expected = ["cloud-sql__list_instances"] + actual = ["alloydb__list_instances"] - score, explanation = matcher.compare(None, None, None, expected, None, None, None, actual, None, None) - self.assertEqual(score, 100.0) + score, _ = _compare(matcher, expected, actual) + self.assertEqual(score, 0.0) - def test_flexible_ordering(self): - config = {"generator": "claude_code"} - matcher = TrajectoryMatcher(config) + def test_native_tools_pass_through(self): + matcher = TrajectoryMatcher({}) - expected = ["mcp__server__toolA", "ToolB"] - actual = ["ToolB", "toolA"] + expected = ["Read", "Bash"] + actual = ["Read", "Bash"] - # Jaccard similarity should ignore order - score, explanation = matcher.compare(None, None, None, expected, None, None, None, actual, None, None) + score, _ = _compare(matcher, expected, actual) self.assertEqual(score, 100.0) - def test_strict_ordering(self): - config = {"generator": "claude_code", "enforce_order": True} - matcher = TrajectoryMatcher(config) - - expected = ["mcp__server__toolA", "ToolB"] - actual = ["ToolB", "toolA"] + def test_both_empty_is_full_score(self): + matcher = TrajectoryMatcher({}) + score, explanation = _compare(matcher, [], []) + self.assertEqual(score, 100.0) + self.assertIn("empty", explanation) - # Levenshtein distance will consider order. - # expected normalized: ["toolA", "ToolB"] - # actual normalized: ["ToolB", "toolA"] - # They are different. 
- score, explanation = matcher.compare(None, None, None, expected, None, None, None, actual, None, None) - self.assertLess(score, 100.0) + def test_generation_error_returns_zero(self): + matcher = TrajectoryMatcher({}) + score, explanation = matcher.compare( + None, None, None, ["x"], None, None, None, ["x"], None, "boom" + ) + self.assertEqual(score, 0.0) + self.assertIn("boom", explanation) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main()