GoogleCloudPlatform · IsmailMehdi · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
@@ -207,7 +207,7 @@ Contains the test cases.
       "id": "list-instances-01",
       "starting_prompt": "List all Cloud SQL instances in project my-evaluation-project",
       "conversation_plan": "Ensure the agent accurately calls list_instances. Verify the output is returned correctly.",
-      "expected_trajectory": ["list_instances"],
+      "expected_trajectory": ["cloud-sql__list_instances"],
       "env": { "GOOGLE_CLOUD_PROJECT": "my-evaluation-project" },
       "max_turns": 4
     }

@@ -5,7 +5,7 @@
       "starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.",
       "conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.",
       "expected_trajectory": [
-        "create_instance"
+        "cloud-sql__create_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -18,7 +18,7 @@
       "starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.",
       "conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. The agent should explain that the instance could not be found based on the error.",
       "expected_trajectory": [
-        "get_instance"
+        "cloud-sql__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"

@@ -5,8 +5,8 @@
       "starting_prompt": "list all Cloud SQL instances in project astana-evaluation",
       "conversation_plan": "Ask the agent to list instances in project astana-evaluation. Once all instances are listed, if nl2code exists, get its state and validate that it is RUNNABLE.",
       "expected_trajectory": [
-        "list_instances",
-        "get_instance"
+        "cloud-sql__list_instances",
+        "cloud-sql__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -19,9 +19,9 @@
       "starting_prompt": "I need a database.",
       "conversation_plan": "The user starts with a vague request. You want to CREATE a NEW Cloud SQL instance named 'my-pg-app'. If the agent offers to create one, say YES. When asked for details, provide 'my-pg-app' as the instance name and 'user_data' as the database name. Never claim to have an existing instance. The goal is for the agent to eventually create the database 'user_data' inside 'my-pg-app' in astana-evaluation project.",
       "expected_trajectory": [
-        "list_instances",
-        "create_instance",
-        "create_database"
+        "cloud-sql__list_instances",
+        "cloud-sql__create_instance",
+        "cloud-sql__create_database"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -34,8 +34,8 @@
       "starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.",
       "conversation_plan": "The user asks to interact with an instance named 'non-existent-db-123' in astana-evaluation project that doesn't exist. The agent should try to get the instance details or update it directly, fail to find it, and inform the user. The user will then ask to list instances to find the correct name.",
       "expected_trajectory": [
-        "update_instance",
-        "list_instances"
+        "cloud-sql__update_instance",
+        "cloud-sql__list_instances"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"

@@ -5,7 +5,7 @@
       "starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.",
       "conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.",
       "expected_trajectory": [
-        "create_instance"
+        "cloud-sql__create_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -18,7 +18,7 @@
       "starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.",
       "conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. The agent should explain that the instance could not be found based on the error.",
       "expected_trajectory": [
-        "get_instance"
+        "cloud-sql__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"

@@ -5,8 +5,8 @@
       "starting_prompt": "list all Cloud SQL instances in project astana-evaluation",
       "conversation_plan": "Ask the agent to list instances in project astana-evaluation. Once all instances are listed, if nl2code exists, get its state and validate that it is RUNNABLE.",
       "expected_trajectory": [
-        "list_instances",
-        "get_instance"
+        "cloud-sql__list_instances",
+        "cloud-sql__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -19,9 +19,9 @@
       "starting_prompt": "I need a database.",
       "conversation_plan": "The user starts with a vague request. You want to CREATE a NEW Cloud SQL instance named 'my-pg-app'. If the agent offers to create one, say YES. When asked for details, provide 'my-pg-app' as the instance name and 'user_data' as the database name. Never claim to have an existing instance. The goal is for the agent to eventually create the database 'user_data' inside 'my-pg-app' in astana-evaluation project.",
       "expected_trajectory": [
-        "list_instances",
-        "create_instance",
-        "create_database"
+        "cloud-sql__list_instances",
+        "cloud-sql__create_instance",
+        "cloud-sql__create_database"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -34,8 +34,8 @@
       "starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.",
       "conversation_plan": "The user asks to interact with an instance named 'non-existent-db-123' in astana-evaluation project that doesn't exist. The agent should try to get the instance details or update it directly, fail to find it, and inform the user. The user will then ask to list instances to find the correct name.",
       "expected_trajectory": [
-        "update_instance",
-        "list_instances"
+        "cloud-sql__update_instance",
+        "cloud-sql__list_instances"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"

@@ -5,10 +5,10 @@
       "starting_prompt": "I need to initialize my Bigtable environment for evaluation if they don't exist already. Please create the following instances in project cloud-bigtable-spacewalk: 'evalbench-prod-data', 'evalbench-legacy-archive', 'evalbench-staging-db', 'evalbench-customers-v2', 'evalbench-dev-db', and 'evalbench-test-instance-123'. All clusters should be in us-central1-a. Also, in 'evalbench-legacy-archive' create a table 'evalbench-v1-data' and in 'evalbench-staging-db' create a table 'evalbench-temporary-data' if they don't exist already.",
       "conversation_plan": "The user is seeding the environment. The agent should sequentially or in parallel create all requested instances and tables. Confirm when everything is ready.",
       "expected_trajectory": [
-        "mcp_Bigtable_list_instances",
-        "mcp_Bigtable_create_instance",
-        "mcp_Bigtable_list_tables",
-        "mcp_Bigtable_create_table"
+        "bigtable__list_instances",
+        "bigtable__create_instance",
+        "bigtable__list_tables",
+        "bigtable__create_table"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -21,8 +21,8 @@
       "starting_prompt": "What Bigtable instances are running in project cloud-bigtable-spacewalk?",
       "conversation_plan": "The user starts with a discovery question. The agent should list instances. The user then asks for the configuration details of the 'evalbench-prod-data' instance to check its clusters.",
       "expected_trajectory": [
-        "mcp_Bigtable_list_instances",
-        "mcp_Bigtable_get_instance"
+        "bigtable__list_instances",
+        "bigtable__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -35,9 +35,9 @@
       "starting_prompt": "Set up a new Bigtable instance 'evalbench-metrics-db' in us-east1-c and add a 'evalbench-raw-metrics' table with a 'data' column family. If the instance creation is pending, please wait or check until it's ready before creating the table.",
       "conversation_plan": "The agent must create the instance. If the table creation fails because the instance isn't ready, the agent should poll 'get_instance' or wait and try again. The user expects both to be completed.",
       "expected_trajectory": [
-        "mcp_Bigtable_create_instance",
-        "mcp_Bigtable_get_instance",
-        "mcp_Bigtable_create_table"
+        "bigtable__create_instance",
+        "bigtable__get_instance",
+        "bigtable__create_table"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -50,7 +50,7 @@
       "starting_prompt": "I need to ensure the Bigtable instance 'evalbench-prod-data' exists. If it's already there, just let me know its current state. If not, create it in us-central1-a.",
       "conversation_plan": "The agent should first check for the instance's existence using 'get_instance'. If it exists, it should report the state. If it returns a 404/not found, it should proceed to 'create_instance'.",
       "expected_trajectory": [
-        "mcp_Bigtable_get_instance"
+        "bigtable__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -63,8 +63,8 @@
       "starting_prompt": "I need to find the 'evalbench-audit' table in my 'evalbench-prod-data'. Can you get its metadata?",
       "conversation_plan": "The user wants to find a specific table. The agent should list tables in the instance to verify existence and then get the table details.",
       "expected_trajectory": [
-        "mcp_Bigtable_list_tables",
-        "mcp_Bigtable_get_table"
+        "bigtable__list_tables",
+        "bigtable__get_table"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -77,7 +77,7 @@
       "starting_prompt": "Delete the 'evalbench-temporary-data' table in the 'evalbench-staging-db' instance.",
       "conversation_plan": "A direct administrative action. The agent must call delete_table. Since the tool requires confirmation, the agent should ask and the user will say 'yes'.",
       "expected_trajectory": [
-        "mcp_Bigtable_delete_table"
+        "bigtable__delete_table"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -90,9 +90,9 @@
       "starting_prompt": "Get details for the 'evalbench-customer-db' instance.",
       "conversation_plan": "The user refers to an instance by a name that doesn't exist. The agent's 'get_instance' call will fail. The agent should then list instances, and the user will say 'Oh, I meant evalbench-customers-v2, get that one'.",
       "expected_trajectory": [
-        "mcp_Bigtable_get_instance",
-        "mcp_Bigtable_list_instances",
-        "mcp_Bigtable_get_instance"
+        "bigtable__get_instance",
+        "bigtable__list_instances",
+        "bigtable__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -105,8 +105,8 @@
       "starting_prompt": "Add a new table to my instance.",
       "conversation_plan": "A vague request. The agent should ask which instance. The user provides 'evalbench-dev-db'. The agent then asks for table name and families. The user provides name 'evalbench-app-logs' and family 'metadata'.",
       "expected_trajectory": [
-        "mcp_Bigtable_list_instances",
-        "mcp_Bigtable_create_table"
+        "bigtable__list_instances",
+        "bigtable__create_table"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -119,7 +119,7 @@
       "starting_prompt": "I no longer need the 'evalbench-test-instance-123'. Please delete it.",
       "conversation_plan": "The user wants to delete a specific instance. The agent must call delete_instance. The tool requires explicit user confirmation, so the agent should ask and the user will confirm with 'yes'.",
       "expected_trajectory": [
-        "mcp_Bigtable_delete_instance"
+        "bigtable__delete_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"
@@ -132,7 +132,7 @@
       "starting_prompt": "Evaluation is complete. Please delete all the Bigtable instances we used in project cloud-bigtable-spacewalk: 'evalbench-prod-data', 'evalbench-legacy-archive', 'evalbench-staging-db', 'evalbench-customers-v2', 'evalbench-dev-db', and 'evalbench-metrics-db'. Confirm each deletion when asked.",
       "conversation_plan": "The user is cleaning up. The agent should call delete_instance for each instance. The user will provide 'yes' for each confirmation request.",
       "expected_trajectory": [
-        "mcp_Bigtable_delete_instance"
+        "bigtable__delete_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "cloud-bigtable-spacewalk"

@@ -5,7 +5,7 @@
       "starting_prompt": "Create a new Cloud SQL instance named 'my-fake-db' in project 'astana-evaluation'. Use PostgreSQL 17, and set the password to 'password123'. Also use the 'Development' edition preset.",
       "conversation_plan": "The user wants to create a database. All required parameters are in the starting prompt. The agent should call create_instance and report the success message back.",
       "expected_trajectory": [
-        "create_instance"
+        "cloud-sql__create_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -18,7 +18,7 @@
       "starting_prompt": "Get the details for the Cloud SQL instance named 'missing-db' in project 'astana-evaluation'.",
       "conversation_plan": "The user wants to get instance details. The agent should call get_instance, which is hardcoded to fail with an error 'Instance not found or permission denied'. The agent should explain that the instance could not be found based on the error.",
       "expected_trajectory": [
-        "get_instance"
+        "cloud-sql__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"

@@ -5,8 +5,8 @@
       "starting_prompt": "list all instances in project astana-evaluation",
       "conversation_plan": "Ask the agent to list instances in project astana-evaluation. Once all instances are listed, if nl2code exists, get its state and validate that it is RUNNABLE.",
       "expected_trajectory": [
-        "list_instances",
-        "get_instance"
+        "cloud-sql__list_instances",
+        "cloud-sql__get_instance"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -19,9 +19,9 @@
       "starting_prompt": "I need a database.",
       "conversation_plan": "The user starts with a vague request. You want to CREATE a NEW Cloud SQL instance named 'my-pg-app'. If the agent offers to create one, say YES. When asked for details, provide 'my-pg-app' as the instance name and 'user_data' as the database name. Never claim to have an existing instance. The goal is for the agent to eventually create the database 'user_data' inside 'my-pg-app' in astana-evaluation project.",
       "expected_trajectory": [
-        "list_instances",
-        "create_instance",
-        "create_database"
+        "cloud-sql__list_instances",
+        "cloud-sql__create_instance",
+        "cloud-sql__create_database"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation",
@@ -35,9 +35,9 @@
       "starting_prompt": "I want to clone the 'production-db' instance.",
       "conversation_plan": "The user wants to create a clone of the instance 'nl2code'. If the agent asks for a new instance name, provide 'staging-nl2code'. If asked to confirm the project, say 'astana-evaluation'.",
       "expected_trajectory": [
-        "get_instance",
-        "clone_instance",
-        "get_operation"
+        "cloud-sql__get_instance",
+        "cloud-sql__clone_instance",
+        "cloud-sql__get_operation"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -50,9 +50,9 @@
       "starting_prompt": "Scale up my database instance.",
       "conversation_plan": "The user wants to increase the resources for 'nl2code' in astana-evaluation project. When asked for specifics, state you want to change the tier/machine type to 'db-custom-4-15360' (4 vCPUs, 15GB RAM). Confirm the update when asked. Verify the operation status.",
       "expected_trajectory": [
-        "get_instance",
-        "update_instance",
-        "get_operation"
+        "cloud-sql__get_instance",
+        "cloud-sql__update_instance",
+        "cloud-sql__get_operation"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -65,8 +65,8 @@
       "starting_prompt": "I need to rotate the password for a user on the 'magic' instance, but I forgot their exact username.",
       "conversation_plan": "First, the user asks to list all users on the 'magic' instance. The agent will show the users. The user then identifies 'webapp_service_account' and asks to update its password to 'NewStrictPassword99!'. Confirm the password update. The project name is astana-evaluation.",
       "expected_trajectory": [
-        "list_users",
-        "update_user"
+        "cloud-sql__list_users",
+        "cloud-sql__update_user"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -79,8 +79,8 @@
       "starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.",
       "conversation_plan": "The user asks to interact with an instance named 'non-existent-db-123' in astana-evaluation project that doesn't exist. The agent should try to get the instance details or update it directly, fail to find it, and inform the user. The user will then ask to list instances to find the correct name.",
       "expected_trajectory": [
-        "update_instance",
-        "list_instances"
+        "cloud-sql__update_instance",
+        "cloud-sql__list_instances"
       ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "astana-evaluation"
@@ -89,4 +89,4 @@
       "max_turns": 4
     }
   ]
-}
+}
@@ -213,7 +213,7 @@ The model config defines the Claude Code CLI version, model, auth, environment,
 
 ### 3. Evaluation Dataset (Evalset)
 
-**Identical schema** to the Gemini CLI evalset. See [Gemini CLI doc — Evalset](./gemini_cli_agent_testing.md#3-evaluation-dataset-evalset) for details.
+**Identical schema** to the Gemini CLI evalset. See [Gemini CLI doc — Evalset](./gemini_cli_agent_testing.md#3-evaluation-dataset-evalset) for details, including the canonical [tool name format](./gemini_cli_agent_testing.md#tool-name-format) used in `expected_trajectory`.
 
 Minimal example:
 
@@ -224,7 +224,7 @@ Minimal example:
       "id": "cloud-sql-list-instances-01",
       "starting_prompt": "list all Cloud SQL instances in project astana-evaluation",
       "conversation_plan": "Ask the agent to list instances. If nl2code exists, get its state and verify it is RUNNABLE.",
-      "expected_trajectory": ["list_instances", "get_instance"],
+      "expected_trajectory": ["cloud-sql__list_instances", "cloud-sql__get_instance"],
       "env": { "GOOGLE_CLOUD_PROJECT": "astana-evaluation" },
       "kind": "tools",
       "max_turns": 3