From 3d5ba096628e575fc473058d9210ce094828cc1d Mon Sep 17 00:00:00 2001 From: Les Orchard Date: Wed, 13 May 2026 13:39:05 -0700 Subject: [PATCH] chore(core): hygiene cleanup from issue #431 Bundle the items from #431 that still apply to current main: - A. Warn + emit SYSTEM_DEBUG_TOOL_DROP when providers return more than one tool call in a single turn, so dropped extras are observable rather than silently lost. - B. Replace the string-match `isSetupError` check with `PlanningError` and `NoStartingUrlError` subclasses (re-exported from the public API), so setup-error detection survives refactors of the underlying message text. - D. Bump the `wait` tool's upper bound from 30s to 120s and rewrite its `execute` to sleep directly with abort-signal polling instead of going through `page.waitForTimeout` (which is abort-blind), so a 120s wait can be interrupted by a user abort within ~500ms. - E. Drop the unused `actionLoopSystemPrompt` export; refactor the prompts tests to call `buildActionLoopSystemPrompt(false, false)` directly. Item C (harden image-strip detection) was a no-op: the fallback the issue refers to is not present in the current main lineage, so there is nothing to harden. Item F (persona prompt scroll guidance) is deferred to a follow-up PR that adds a real `scroll` tool, so the prompt text becomes accurate rather than being stripped. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/core/schemas/webagent-event.json | 52 ++++++++ packages/core/src/core.ts | 8 +- packages/core/src/errors.ts | 30 +++++ packages/core/src/events.ts | 18 +++ packages/core/src/prompts.ts | 5 +- packages/core/src/tools/webActionTools.ts | 62 ++++++++-- packages/core/src/webAgent.ts | 47 +++++-- packages/core/test/events.test.ts | 1 + packages/core/test/prompts.test.ts | 5 +- .../core/test/tools/webActionTools.test.ts | 38 +++++- packages/core/test/webAgent.test.ts | 117 ++++++++++++++++++ 11 files changed, 347 insertions(+), 36 deletions(-) diff --git a/packages/core/schemas/webagent-event.json b/packages/core/schemas/webagent-event.json index 55b28f20..296a72d6 100644 --- a/packages/core/schemas/webagent-event.json +++ b/packages/core/schemas/webagent-event.json @@ -4059,6 +4059,41 @@ ], "type": "object" }, + "ToolDropDebugEventData": { + "additionalProperties": false, + "description": "Event data for tool-drop diagnostics: emitted when a provider returns more than one tool call in a single turn and the extras are dropped. The system prompt instructs the model to call exactly one tool per turn, but some providers occasionally return multiple — this event surfaces those cases so they can be observed instead of silently lost.", + "properties": { + "droppedCount": { + "description": "Number of tool calls that were dropped (returnedCount - 1).", + "type": "number" + }, + "droppedTools": { + "description": "Names of the dropped tools (in original order, excluding the first).", + "items": { + "type": "string" + }, + "type": "array" + }, + "iterationId": { + "type": "string" + }, + "keptTool": { + "description": "Name of the tool that was kept (first in the provider's response).", + "type": "string" + }, + "timestamp": { + "type": "number" + } + }, + "required": [ + "droppedCount", + "droppedTools", + "iterationId", + "keptTool", + "timestamp" + ], + "type": "object" + }, "ValidationErrorEventData": { "additionalProperties": false, "description": "Event data for validation errors during action response processing", @@ -4520,6 +4555,23 @@ ], "type": "object" }, + { + "additionalProperties": false, + "properties": { + "data": { + "$ref": "#/definitions/ToolDropDebugEventData" + }, + "type": { + "const": "system:debug_tool_drop", + "type": "string" + } + }, + "required": [ + "type", + "data" + ], + "type": "object" + }, { "additionalProperties": false, "properties": { diff --git a/packages/core/src/core.ts b/packages/core/src/core.ts index e70ddf3c..75e4f2c7 100644 --- a/packages/core/src/core.ts +++ b/packages/core/src/core.ts @@ -49,7 +49,13 @@ export type { Logger } from "./loggers/types.js"; export type { Action, TaskValidationResult } from "./schemas.js"; // Error types -export { RecoverableError, BrowserException, NavigationTimeoutException } from "./errors.js"; +export { + RecoverableError, + BrowserException, + NavigationTimeoutException, + PlanningError, + NoStartingUrlError, +} from "./errors.js"; // Navigation retry configuration export type { NavigationRetryConfig } from "./browser/navigationRetry.js"; diff --git a/packages/core/src/errors.ts b/packages/core/src/errors.ts index c25fc736..1a8a0fd9 100644 --- a/packages/core/src/errors.ts +++ b/packages/core/src/errors.ts @@ -96,6 +96,36 @@ export class ToolExecutionError extends RecoverableError { } } +/** + * Thrown when task planning fails (model never returns a usable plan after retries). + * + * Setup error: extends Error rather than RecoverableError because the agent's + * execute loop should propagate it to the caller rather than treat it as a + * retryable mid-task error. + */ +export class PlanningError extends Error { + constructor(message: string) { + super(message); + this.name = "PlanningError"; + } +} + +/** + * Defensive guard: thrown by `navigateToStart` when `this.url` is unexpectedly + * unset. Under normal flow `planTask` defaults `this.url` to `about:blank`, so + * this should not fire — it exists to fail loudly rather than silently navigate + * nowhere if that invariant is ever broken. + * + * Setup error: extends Error rather than RecoverableError for the same reason + * as PlanningError. + */ +export class NoStartingUrlError extends Error { + constructor(message = "No starting URL determined") { + super(message); + this.name = "NoStartingUrlError"; + } +} + /** * Thrown when navigation times out after all retry attempts. * diff --git a/packages/core/src/events.ts b/packages/core/src/events.ts index 442a0d59..31b2f205 100644 --- a/packages/core/src/events.ts +++ b/packages/core/src/events.ts @@ -40,6 +40,7 @@ export enum WebAgentEventType { // System/Debug SYSTEM_DEBUG_COMPRESSION = "system:debug_compression", SYSTEM_DEBUG_MESSAGE = "system:debug_message", + SYSTEM_DEBUG_TOOL_DROP = "system:debug_tool_drop", // CDP endpoint failover CDP_ENDPOINT_CONNECTED = "cdp:endpoint_connected", @@ -270,6 +271,22 @@ export interface MessagesDebugEventData extends WebAgentEventData { messages: any[]; } +/** + * Event data for tool-drop diagnostics: emitted when a provider returns more + * than one tool call in a single turn and the extras are dropped. The system + * prompt instructs the model to call exactly one tool per turn, but some + * providers occasionally return multiple — this event surfaces those cases + * so they can be observed instead of silently lost. + */ +export interface ToolDropDebugEventData extends WebAgentEventData { + /** Number of tool calls that were dropped (returnedCount - 1). */ + droppedCount: number; + /** Names of the dropped tools (in original order, excluding the first). */ + droppedTools: string[]; + /** Name of the tool that was kept (first in the provider's response). */ + keptTool: string; +} + /** * Event data for waiting notifications */ @@ -377,6 +394,7 @@ export type WebAgentEvent = } | { type: WebAgentEventType.SYSTEM_DEBUG_COMPRESSION; data: CompressionDebugEventData } | { type: WebAgentEventType.SYSTEM_DEBUG_MESSAGE; data: MessagesDebugEventData } + | { type: WebAgentEventType.SYSTEM_DEBUG_TOOL_DROP; data: ToolDropDebugEventData } | { type: WebAgentEventType.CDP_ENDPOINT_CONNECTED; data: CdpEndpointConnectedEventData } | { type: WebAgentEventType.CDP_ENDPOINT_CYCLE; data: CdpEndpointCycleEventData } | { type: WebAgentEventType.BROWSER_RECONNECTED; data: BrowserReconnectedEventData } diff --git a/packages/core/src/prompts.ts b/packages/core/src/prompts.ts index 2c9b374a..9401ae27 100644 --- a/packages/core/src/prompts.ts +++ b/packages/core/src/prompts.ts @@ -43,8 +43,8 @@ export const TOOL_STRINGS = { description: "Press Enter key on an element (useful for form submission)", }, wait: { - description: "Wait for a specified number of seconds", - seconds: "Number of seconds to wait (0-30)", + description: "Wait for a specified number of seconds (up to 120 for slow-loading pages)", + seconds: "Number of seconds to wait (0-120)", }, goto: { description: "Navigate to a URL that was previously seen in the conversation", @@ -422,7 +422,6 @@ const buildActionLoopSystemPrompt = ( currentDate: getCurrentFormattedDate(), }); -export const actionLoopSystemPrompt = buildActionLoopSystemPrompt(false, false); export { buildActionLoopSystemPrompt }; /** diff --git a/packages/core/src/tools/webActionTools.ts b/packages/core/src/tools/webActionTools.ts index 8d91f928..7d379677 100644 --- a/packages/core/src/tools/webActionTools.ts +++ b/packages/core/src/tools/webActionTools.ts @@ -225,22 +225,58 @@ export function createWebActionTools(context: WebActionContext) { wait: tool({ description: TOOL_STRINGS.webActions.wait.description, inputSchema: z.object({ - seconds: z.number().min(0).max(30).describe(TOOL_STRINGS.webActions.wait.seconds), + seconds: z.number().min(0).max(120).describe(TOOL_STRINGS.webActions.wait.seconds), }), execute: async ({ seconds }) => { - // Wait uses browser.performAction which expects value as string - const result = await performActionWithValidation( - PageAction.Wait, - context, - undefined, - String(seconds), + // Sleep here rather than going through browser.performAction (which + // calls page.waitForTimeout — a fixed, abort-blind timeout). Polling + // the abort signal keeps user aborts responsive to within ~500ms even + // at the full 120s cap. + return withSpan( + SpanName.BROWSER_ACTION, + { attributes: { "pilo.browser.action_type": "wait" } }, + async (span) => { + context.eventEmitter.emit(WebAgentEventType.AGENT_ACTION, { + action: "wait", + value: String(seconds), + }); + context.eventEmitter.emit(WebAgentEventType.BROWSER_ACTION_STARTED, { + action: "wait", + value: String(seconds), + }); + + const ABORT_POLL_MS = 500; + const deadline = Date.now() + seconds * 1000; + while (Date.now() < deadline) { + if (context.abortSignal?.aborted) { + const error = new Error("Wait cancelled by abort signal"); + context.eventEmitter.emit(WebAgentEventType.BROWSER_ACTION_COMPLETED, { + success: false, + action: "wait", + error: error.message, + }); + span.setAttribute("pilo.browser.success", false); + throw error; + } + await new Promise((resolve) => + setTimeout(resolve, Math.min(ABORT_POLL_MS, deadline - Date.now())), + ); + } + + context.eventEmitter.emit(WebAgentEventType.BROWSER_ACTION_COMPLETED, { + success: true, + action: "wait", + }); + context.eventEmitter.emit(WebAgentEventType.AGENT_WAITING, { seconds }); + span.setAttribute("pilo.browser.success", true); + + return { + success: true, + action: "wait", + value: String(seconds), + }; + }, ); - - if (result.success) { - context.eventEmitter.emit(WebAgentEventType.AGENT_WAITING, { seconds }); - } - - return result; }, }), diff --git a/packages/core/src/webAgent.ts b/packages/core/src/webAgent.ts index 3f12a73d..c0aee042 100644 --- a/packages/core/src/webAgent.ts +++ b/packages/core/src/webAgent.ts @@ -20,7 +20,13 @@ import { import { SnapshotCompressor } from "./snapshotCompressor.js"; import { Logger } from "./loggers/types.js"; import { ConsoleLogger } from "./loggers/console.js"; -import { BrowserDisconnectedError, RecoverableError, ToolExecutionError } from "./errors.js"; +import { + BrowserDisconnectedError, + NoStartingUrlError, + PlanningError, + RecoverableError, + ToolExecutionError, +} from "./errors.js"; import { generateTextWithRetry } from "./utils/retry.js"; import type { AwaitedProperties } from "./utils/types.js"; import { @@ -1016,6 +1022,25 @@ export class WebAgent { ); } + // The system prompt instructs the model to call exactly one tool per turn, + // but providers occasionally return more (especially on retries or with + // certain models). Warn + emit a debug event so the drop is observable + // instead of silently lost. + if (aiResponse.toolResults.length > 1) { + const keptTool = aiResponse.toolResults[0].toolName; + const droppedTools = aiResponse.toolResults.slice(1).map((r: any) => r.toolName); + console.warn( + `[WebAgent] Provider returned ${aiResponse.toolResults.length} tool calls in one turn; ` + + `keeping '${keptTool}', dropping: ${droppedTools.join(", ")}`, + ); + this.emit(WebAgentEventType.SYSTEM_DEBUG_TOOL_DROP, { + iterationId: this.currentIterationId, + droppedCount: droppedTools.length, + droppedTools, + keptTool, + }); + } + const toolResult = aiResponse.toolResults[0]; const actionOutput = toolResult.output as any; @@ -1465,12 +1490,11 @@ export class WebAgent { }); recordSanitizedException(span, error); - // Check if the error message already contains "Failed to generate plan" to avoid double-wrapping - if (errorMsg.includes("Failed to generate plan")) { - throw new Error(errorMsg); - } else { - throw new Error(`Failed to generate plan: ${errorMsg}`); + // Avoid double-wrapping if we already produced a PlanningError up-stack. + if (error instanceof PlanningError) { + throw error; } + throw new PlanningError(`Failed to generate plan: ${errorMsg}`); } }, ); @@ -1516,14 +1540,11 @@ export class WebAgent { } /** - * Check if error is a setup/planning error that should be re-thrown + * Check if error is a setup/planning error that should be re-thrown to the + * caller rather than converted into a TASK_FAILED result. */ private isSetupError(error: unknown): boolean { - return ( - error instanceof Error && - (error.message.includes("Failed to generate plan") || - error.message.includes("No starting URL")) - ); + return error instanceof PlanningError || error instanceof NoStartingUrlError; } /** @@ -1620,7 +1641,7 @@ export class WebAgent { private async navigateToStart(task: string): Promise { if (!this.url) { - throw new Error("No starting URL determined"); + throw new NoStartingUrlError(); } if (this.url !== "about:blank") { diff --git a/packages/core/test/events.test.ts b/packages/core/test/events.test.ts index 574c2c6a..f8f1b271 100644 --- a/packages/core/test/events.test.ts +++ b/packages/core/test/events.test.ts @@ -135,6 +135,7 @@ describe("WebAgentEventEmitter", () => { "browser:screenshot_captured_image", "system:debug_compression", "system:debug_message", + "system:debug_tool_drop", "cdp:endpoint_connected", "cdp:endpoint_cycle", "browser:reconnected", diff --git a/packages/core/test/prompts.test.ts b/packages/core/test/prompts.test.ts index 71c19603..6ded1a1d 100644 --- a/packages/core/test/prompts.test.ts +++ b/packages/core/test/prompts.test.ts @@ -1,7 +1,6 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; import { buildPlanPrompt, - actionLoopSystemPrompt, buildActionLoopSystemPrompt, buildTaskAndPlanPrompt, buildPageSnapshotPrompt, @@ -10,6 +9,10 @@ import { buildExtractionPrompt, } from "../src/prompts.js"; +// Default action-loop prompt used by the tests below. Mirrors the historical +// `actionLoopSystemPrompt` export (built with both gating flags off). +const actionLoopSystemPrompt = buildActionLoopSystemPrompt(false, false); + // Mock Date for consistent test results const mockDate = new Date("2024-01-15T10:00:00Z"); diff --git a/packages/core/test/tools/webActionTools.test.ts b/packages/core/test/tools/webActionTools.test.ts index c2bc2e96..9b2693a5 100644 --- a/packages/core/test/tools/webActionTools.test.ts +++ b/packages/core/test/tools/webActionTools.test.ts @@ -145,7 +145,9 @@ describe("Web Action Tools", () => { expect(tools.enter.description).toBe( "Press Enter key on an element (useful for form submission)", ); - expect(tools.wait.description).toBe("Wait for a specified number of seconds"); + expect(tools.wait.description).toBe( + "Wait for a specified number of seconds (up to 120 for slow-loading pages)", + ); expect(tools.goto.description).toBe( "Navigate to a URL that was previously seen in the conversation", ); @@ -392,20 +394,43 @@ describe("Web Action Tools", () => { describe("Wait Action", () => { it("should execute wait action successfully", async () => { + // wait no longer goes through browser.performAction — it sleeps directly + // in the tool so the abort signal can interrupt it. const performActionSpy = vi.spyOn(mockBrowser, "performAction"); const emitSpy = vi.spyOn(eventEmitter, "emit"); - const result = await tools.wait.execute({ seconds: 2 }); + const promise = tools.wait.execute({ seconds: 2 }); + await vi.advanceTimersByTimeAsync(2000); + const result = await promise; - expect(performActionSpy).toHaveBeenCalledWith("", PageAction.Wait, "2"); + expect(performActionSpy).not.toHaveBeenCalled(); expect(emitSpy).toHaveBeenCalledWith(WebAgentEventType.AGENT_WAITING, { seconds: 2 }); expect(result).toEqual({ success: true, action: "wait", - value: "2", // performActionWithValidation adds value field + value: "2", }); }); + it("should abort wait when abort signal fires", async () => { + const controller = new AbortController(); + context.abortSignal = controller.signal; + tools = createWebActionTools(context); + + const promise = tools.wait.execute({ seconds: 60 }); + // Attach a rejection handler before triggering abort so vitest's + // fake-timer scheduler doesn't see an unhandled rejection. + const settled = promise.catch((err: unknown) => err); + + controller.abort(); + // Advance past one poll interval (500ms) so the loop notices the abort. + await vi.advanceTimersByTimeAsync(500); + + const err = await settled; + expect(err).toBeInstanceOf(Error); + expect((err as Error).message).toContain("Wait cancelled by abort signal"); + }); + it("should validate wait time constraints", () => { const schema = tools.wait.inputSchema; @@ -415,7 +440,10 @@ describe("Web Action Tools", () => { const tooShort = schema.safeParse({ seconds: -1 }); expect(tooShort.success).toBe(false); - const tooLong = schema.safeParse({ seconds: 31 }); + const validLong = schema.safeParse({ seconds: 120 }); + expect(validLong.success).toBe(true); + + const tooLong = schema.safeParse({ seconds: 121 }); expect(tooLong.success).toBe(false); }); }); diff --git a/packages/core/test/webAgent.test.ts b/packages/core/test/webAgent.test.ts index 100bfabd..65e1918c 100644 --- a/packages/core/test/webAgent.test.ts +++ b/packages/core/test/webAgent.test.ts @@ -5,6 +5,7 @@ import { WebAgentEventEmitter, WebAgentEventType } from "../src/events.js"; import { LanguageModel, streamText } from "ai"; import { Logger } from "../src/loggers/types.js"; import { generateTextWithRetry } from "../src/utils/retry.js"; +import { PlanningError } from "../src/errors.js"; // Mock the AI module vi.mock("ai", () => ({ @@ -649,6 +650,17 @@ describe("WebAgent", () => { ).rejects.toThrow(/Failed to generate plan/); }); + it("should propagate planning failures as PlanningError instances", async () => { + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "No tool result", + toolResults: [], + } as any); + + await expect( + webAgent.execute("Test task", { startingUrl: "https://example.com" }), + ).rejects.toBeInstanceOf(PlanningError); + }); + it("should start on about:blank when planner returns no URL", async () => { // Planner returns plan without url (e.g., webSearch-enabled scenario) mockGenerateTextWithRetry.mockResolvedValueOnce({ @@ -1114,6 +1126,111 @@ describe("WebAgent", () => { expect(result.success).toBe(true); }); + + it("should warn and emit SYSTEM_DEBUG_TOOL_DROP when provider returns multiple tool calls", async () => { + const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}); + + // Provider returns 3 tool calls in one turn; only the first ('click') + // should be processed, and the extras should be reported. + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Multi", + toolResults: [ + { + type: "tool-result", + toolCallId: "click_1", + toolName: "click", + input: { ref: "btn1" }, + output: { action: "click", ref: "btn1", isTerminal: false }, + }, + { + type: "tool-result", + toolCallId: "fill_1", + toolName: "fill", + input: { ref: "input1", value: "x" }, + output: { action: "fill", ref: "input1", isTerminal: false }, + }, + { + type: "tool-result", + toolCallId: "done_1", + toolName: "done", + input: { result: "x" }, + output: { action: "done", result: "x", isTerminal: true }, + }, + ], + response: { + messages: [ + { role: "assistant", content: "Multi" }, + { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "click_1", + toolName: "click", + output: { action: "click", ref: "btn1" }, + }, + ], + }, + ], + }, + }) as any, + ); + + // Second turn: wrap up with a clean done. + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Done", + toolResults: [ + { + type: "tool-result", + toolCallId: "done_2", + toolName: "done", + input: { result: "Clicked" }, + output: { action: "done", result: "Clicked", isTerminal: true }, + }, + ], + response: { + messages: [ + { role: "assistant", content: "Done" }, + { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "done_2", + toolName: "done", + output: { action: "done", result: "Clicked", isTerminal: true }, + }, + ], + }, + ], + }, + }) as any, + ); + + mockGenerateTextWithRetry.mockResolvedValueOnce(mockValidationResponse("complete")); + + const result = await webAgent.execute("Multi-tool", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(true); + + const dropEvent = mockLogger.events.find( + (e) => e.type === WebAgentEventType.SYSTEM_DEBUG_TOOL_DROP, + ); + expect(dropEvent).toBeDefined(); + expect(dropEvent?.data.droppedCount).toBe(2); + expect(dropEvent?.data.droppedTools).toEqual(["fill", "done"]); + expect(dropEvent?.data.keptTool).toBe("click"); + + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining("Provider returned 3 tool calls in one turn"), + ); + + warnSpy.mockRestore(); + }); }); describe("error handling", () => {