From c7f1923b7e2bd3a61e8d776393b003d0a538a9ee Mon Sep 17 00:00:00 2001 From: Les Orchard Date: Wed, 13 May 2026 14:46:53 -0700 Subject: [PATCH 1/2] feat(core): add scroll tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `scroll({direction})` as a first-class browser action so the agent has a way to reveal content on dynamic / infinite-scroll pages instead of being stuck when the accessibility tree appears incomplete. - `PageAction.Scroll` in `ariaBrowser.ts`, with `ScrollDirection` type and a `SCROLL_DIRECTIONS` constant the Zod schema reuses. - PlaywrightBrowser: scroll case using `page.evaluate` → `window.scrollBy` / `window.scrollTo` so it works in remote-CDP setups. - ExtensionBrowser: matching scroll case via `browser.scripting.executeScript` so the extension runtime has parity with the Playwright implementation. - `scroll` tool exposes four directions: `up` / `down` (one viewport each) and `top` / `bottom` (jump to start / end of document). - Updates the persona prompt's "if expected data isn't visible" guidance to point at the new `scroll()` action instead of describing a missing capability. Split out of the hygiene cleanup PR #444 (item F): the issue said to either land scroll and keep the guidance, or strip the guidance until scroll lands. This PR takes the "land scroll" path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/core/src/browser/ariaBrowser.ts | 9 ++++++ .../core/src/browser/playwrightBrowser.ts | 30 +++++++++++++++++ packages/core/src/prompts.ts | 10 ++++-- packages/core/src/tools/webActionTools.ts | 12 ++++++- packages/core/test/ariaBrowser.test.ts | 8 ++++- packages/core/test/playwrightBrowser.test.ts | 21 ++++++++++++ packages/core/test/prompts.test.ts | 1 + .../core/test/tools/webActionTools.test.ts | 31 ++++++++++++++++++ .../src/background/ExtensionBrowser.ts | 32 +++++++++++++++++++ 9 files changed, 150 insertions(+), 4 deletions(-) diff --git a/packages/core/src/browser/ariaBrowser.ts b/packages/core/src/browser/ariaBrowser.ts index f3e39c91..ca1abacd 100644 --- a/packages/core/src/browser/ariaBrowser.ts +++ b/packages/core/src/browser/ariaBrowser.ts @@ -21,11 +21,20 @@ export enum PageAction { Goto = "goto", Back = "back", Forward = "forward", + Scroll = "scroll", Extract = "extract", Done = "done", Abort = "abort", } +/** + * Scroll directions supported by {@link PageAction.Scroll}. 
+ * - `up` / `down`: scroll one viewport in that direction + * - `top` / `bottom`: scroll all the way to the top/bottom of the document + */ +export type ScrollDirection = "up" | "down" | "top" | "bottom"; +export const SCROLL_DIRECTIONS: ScrollDirection[] = ["up", "down", "top", "bottom"]; + /** * Page load states to wait for */ diff --git a/packages/core/src/browser/playwrightBrowser.ts b/packages/core/src/browser/playwrightBrowser.ts index 7d95f5fe..a2a6af73 100644 --- a/packages/core/src/browser/playwrightBrowser.ts +++ b/packages/core/src/browser/playwrightBrowser.ts @@ -893,6 +893,36 @@ export class PlaywrightBrowser implements AriaBrowser { // Note: goForward already calls ensureOptimizedPageLoad internally break; + case PageAction.Scroll: + if (!value) + throw new BrowserActionException( + "scroll", + "Direction required for scroll action", + ); + await this.page!.evaluate((direction: string) => { + switch (direction) { + case "down": + window.scrollBy({ left: 0, top: window.innerHeight, behavior: "instant" }); + return; + case "up": + window.scrollBy({ left: 0, top: -window.innerHeight, behavior: "instant" }); + return; + case "top": + window.scrollTo({ left: 0, top: 0, behavior: "instant" }); + return; + case "bottom": + window.scrollTo({ + left: 0, + top: document.documentElement.scrollHeight, + behavior: "instant", + }); + return; + default: + throw new Error(`Unsupported scroll direction: ${direction}`); + } + }, value); + break; + case PageAction.Extract: // Extract is handled at a higher level in the automation flow // The browser implementation doesn't need to do anything diff --git a/packages/core/src/prompts.ts b/packages/core/src/prompts.ts index 2c9b374a..d38105d0 100644 --- a/packages/core/src/prompts.ts +++ b/packages/core/src/prompts.ts @@ -56,6 +56,11 @@ export const TOOL_STRINGS = { forward: { description: "Go forward to the next page", }, + scroll: { + description: + "Scroll the page. 
'up'/'down' scroll one viewport in that direction; 'top'/'bottom' jump to the start/end of the document. Use after the accessibility tree appears incomplete on dynamic / infinite-scroll pages.", + direction: "Direction to scroll: 'up', 'down', 'top', or 'bottom'", + }, extract: { description: "Extract specific data from the current page for later reference", dataDescription: @@ -155,7 +160,7 @@ You adapt to situations and find creative ways to complete tasks without getting IMPORTANT: - You can see the entire page content through the accessibility tree snapshot. -- The accessibility tree shows all currently loaded page elements. On dynamic pages, some content may only appear after scrolling or interaction — if expected data isn't visible, try scrolling or interacting to trigger loading. +- The accessibility tree shows all currently loaded page elements. On dynamic pages, some content may only appear after scrolling or interaction — if expected data isn't visible, try scroll() or wait() to trigger loading. - Focus on the elements you need to interact with directly. 
`.trim(); @@ -178,6 +183,7 @@ function buildToolExamples( `- goto({"url": "https://example.com"}) - ${TOOL_STRINGS.webActions.goto.description}`, `- back() - ${TOOL_STRINGS.webActions.back.description}`, `- forward() - ${TOOL_STRINGS.webActions.forward.description}`, + `- scroll({"direction": "down"}) - ${TOOL_STRINGS.webActions.scroll.description}`, `- extract({"description": "data to extract"}) - ${TOOL_STRINGS.webActions.extract.description}`, ]; @@ -336,7 +342,7 @@ Analyze the current page state and determine your next action based on previous - extract() if you need more information **Best Practices:** -- The accessibility tree shows currently loaded elements; dynamic pages may load more content on scroll +- The accessibility tree shows currently loaded elements; if expected content is missing, use scroll() to reveal more of the page or wait() to let dynamic content load - Clear obstructing modals/popups first - Prefer click() over goto() for page navigation - Submit forms via enter() or submit button after filling diff --git a/packages/core/src/tools/webActionTools.ts b/packages/core/src/tools/webActionTools.ts index 8d91f928..3c4d6518 100644 --- a/packages/core/src/tools/webActionTools.ts +++ b/packages/core/src/tools/webActionTools.ts @@ -7,7 +7,7 @@ import { tool } from "ai"; import { z } from "zod"; -import { AriaBrowser, PageAction } from "../browser/ariaBrowser.js"; +import { AriaBrowser, PageAction, SCROLL_DIRECTIONS } from "../browser/ariaBrowser.js"; import { WebAgentEventEmitter, WebAgentEventType } from "../events.js"; import { buildExtractionPrompt, TOOL_STRINGS } from "../prompts.js"; import type { ProviderConfig } from "../provider.js"; @@ -244,6 +244,16 @@ export function createWebActionTools(context: WebActionContext) { }, }), + scroll: tool({ + description: TOOL_STRINGS.webActions.scroll.description, + inputSchema: z.object({ + direction: z.enum(SCROLL_DIRECTIONS).describe(TOOL_STRINGS.webActions.scroll.direction), + }), + execute: async ({ 
direction }) => { + return await performActionWithValidation(PageAction.Scroll, context, undefined, direction); + }, + }), + goto: tool({ description: TOOL_STRINGS.webActions.goto.description, inputSchema: z.object({ diff --git a/packages/core/test/ariaBrowser.test.ts b/packages/core/test/ariaBrowser.test.ts index 0c1f3f6a..216d90bc 100644 --- a/packages/core/test/ariaBrowser.test.ts +++ b/packages/core/test/ariaBrowser.test.ts @@ -17,6 +17,7 @@ describe("AriaBrowser interface", () => { "goto", "back", "forward", + "scroll", "extract", "done", "abort", @@ -63,7 +64,12 @@ describe("AriaBrowser interface", () => { ]; // Navigation actions - const navigationActions = [PageAction.Goto, PageAction.Back, PageAction.Forward]; + const navigationActions = [ + PageAction.Goto, + PageAction.Back, + PageAction.Forward, + PageAction.Scroll, + ]; // Control actions const controlActions = [ diff --git a/packages/core/test/playwrightBrowser.test.ts b/packages/core/test/playwrightBrowser.test.ts index f61b3dc2..c6e3feba 100644 --- a/packages/core/test/playwrightBrowser.test.ts +++ b/packages/core/test/playwrightBrowser.test.ts @@ -676,6 +676,7 @@ describe("PlaywrightBrowser", () => { PageAction.Goto, PageAction.Back, PageAction.Forward, + PageAction.Scroll, PageAction.Done, ]; @@ -683,6 +684,26 @@ describe("PlaywrightBrowser", () => { expect((browser as any).actionRequiresElement(action)).toBe(false); }); }); + + it("should run scroll via page.evaluate with the given direction", async () => { + const evaluateSpy = vi.fn().mockResolvedValue(undefined); + (browser as any).page = { evaluate: evaluateSpy }; + + await browser.performAction("", PageAction.Scroll, "down"); + + expect(evaluateSpy).toHaveBeenCalledWith(expect.any(Function), "down"); + }); + + it("should throw BrowserActionException for missing direction on scroll", async () => { + (browser as any).page = { evaluate: vi.fn() }; + + await expect(browser.performAction("", PageAction.Scroll)).rejects.toThrow( + 
BrowserActionException, + ); + await expect(browser.performAction("", PageAction.Scroll)).rejects.toThrow( + "Direction required for scroll action", + ); + }); }); describe("performAction error handling", () => { diff --git a/packages/core/test/prompts.test.ts b/packages/core/test/prompts.test.ts index 71c19603..199e8bb7 100644 --- a/packages/core/test/prompts.test.ts +++ b/packages/core/test/prompts.test.ts @@ -209,6 +209,7 @@ describe("prompts", () => { "goto", "back", "forward", + "scroll", "done", "abort", ]; diff --git a/packages/core/test/tools/webActionTools.test.ts b/packages/core/test/tools/webActionTools.test.ts index c2bc2e96..f1e40703 100644 --- a/packages/core/test/tools/webActionTools.test.ts +++ b/packages/core/test/tools/webActionTools.test.ts @@ -129,6 +129,7 @@ describe("Web Action Tools", () => { expect(tools.goto).toBeDefined(); expect(tools.back).toBeDefined(); expect(tools.forward).toBeDefined(); + expect(tools.scroll).toBeDefined(); expect(tools.extract).toBeDefined(); expect(tools.done).toBeDefined(); expect(tools.abort).toBeDefined(); @@ -151,6 +152,7 @@ describe("Web Action Tools", () => { ); expect(tools.back.description).toBe("Go back to the previous page"); expect(tools.forward.description).toBe("Go forward to the next page"); + expect(tools.scroll.description).toContain("Scroll the page"); expect(tools.extract.description).toBe( "Extract specific data from the current page for later reference", ); @@ -420,6 +422,35 @@ describe("Web Action Tools", () => { }); }); + describe("Scroll Action", () => { + it("should dispatch scroll action with the requested direction", async () => { + const performActionSpy = vi.spyOn(mockBrowser, "performAction"); + + const result = await tools.scroll.execute({ direction: "down" }); + + expect(performActionSpy).toHaveBeenCalledWith("", PageAction.Scroll, "down"); + expect(result).toEqual({ + success: true, + action: "scroll", + value: "down", + }); + }); + + it("should accept all four directions", () => { + 
const schema = tools.scroll.inputSchema; + for (const direction of ["up", "down", "top", "bottom"]) { + expect(schema.safeParse({ direction }).success).toBe(true); + } + }); + + it("should reject unknown directions", () => { + const schema = tools.scroll.inputSchema; + expect(schema.safeParse({ direction: "left" }).success).toBe(false); + expect(schema.safeParse({ direction: "" }).success).toBe(false); + expect(schema.safeParse({}).success).toBe(false); + }); + }); + describe("Navigation Actions", () => { it("should execute goto action successfully", async () => { const performActionSpy = vi.spyOn(mockBrowser, "performAction"); diff --git a/packages/extension/src/background/ExtensionBrowser.ts b/packages/extension/src/background/ExtensionBrowser.ts index 6619d4c6..c2d0b6e3 100644 --- a/packages/extension/src/background/ExtensionBrowser.ts +++ b/packages/extension/src/background/ExtensionBrowser.ts @@ -335,6 +335,38 @@ export class ExtensionBrowser implements AriaBrowser { // Note: goForward already calls ensureOptimizedPageLoad internally return; + case PageAction.Scroll: { + if (!value) throw new Error("Direction required for scroll action"); + const tab = await this.getActiveTab(); + await browser.scripting.executeScript({ + target: { tabId: tab.id! 
}, + func: (direction: string) => { + switch (direction) { + case "down": + window.scrollBy({ left: 0, top: window.innerHeight, behavior: "instant" }); + return; + case "up": + window.scrollBy({ left: 0, top: -window.innerHeight, behavior: "instant" }); + return; + case "top": + window.scrollTo({ left: 0, top: 0, behavior: "instant" }); + return; + case "bottom": + window.scrollTo({ + left: 0, + top: document.documentElement.scrollHeight, + behavior: "instant", + }); + return; + default: + throw new Error(`Unsupported scroll direction: ${direction}`); + } + }, + args: [value], + }); + return; + } + case PageAction.Done: // This is a no-op in the browser implementation // It's handled at a higher level in the automation flow From e57c9edfda2f45c805b463213e8b110caf30870f Mon Sep 17 00:00:00 2001 From: Les Orchard Date: Fri, 15 May 2026 13:17:06 -0700 Subject: [PATCH 2/2] feat(core): settle window after scroll so lazy-loaded content renders before next snapshot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scroll is most useful on dynamic pages where IntersectionObserver-driven loads append content as the viewport moves. Without a settle, the next aria-tree snapshot can capture the page mid-load and miss exactly what the scroll was supposed to reveal — leaving the agent to scroll again without ever seeing the new content. PlaywrightBrowser waits up to 500ms for networkidle (caught so the timeout doesn't fail the action). ExtensionBrowser has no networkidle equivalent, so it uses a fixed 300ms wait as the closest mirror. Both timeouts are tight so scroll stays cheap on quiet pages. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../core/src/browser/playwrightBrowser.ts | 6 +++++ packages/core/test/playwrightBrowser.test.ts | 26 +++++++++++++++++-- .../src/background/ExtensionBrowser.ts | 4 +++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/packages/core/src/browser/playwrightBrowser.ts b/packages/core/src/browser/playwrightBrowser.ts index a2a6af73..510f2a71 100644 --- a/packages/core/src/browser/playwrightBrowser.ts +++ b/packages/core/src/browser/playwrightBrowser.ts @@ -921,6 +921,12 @@ export class PlaywrightBrowser implements AriaBrowser { throw new Error(`Unsupported scroll direction: ${direction}`); } }, value); + // Settle window for lazy-loaded content: scroll commonly + // triggers IntersectionObserver-driven loads that hadn't + // resolved before our next aria snapshot. Tight timeout so + // we don't burn turns on background telemetry traffic; the + // catch makes this best-effort. + await this.page!.waitForLoadState("networkidle", { timeout: 500 }).catch(() => {}); break; case PageAction.Extract: diff --git a/packages/core/test/playwrightBrowser.test.ts b/packages/core/test/playwrightBrowser.test.ts index c6e3feba..ec49f9d6 100644 --- a/packages/core/test/playwrightBrowser.test.ts +++ b/packages/core/test/playwrightBrowser.test.ts @@ -687,15 +687,37 @@ describe("PlaywrightBrowser", () => { it("should run scroll via page.evaluate with the given direction", async () => { const evaluateSpy = vi.fn().mockResolvedValue(undefined); - (browser as any).page = { evaluate: evaluateSpy }; + const waitForLoadStateSpy = vi.fn().mockResolvedValue(undefined); + (browser as any).page = { + evaluate: evaluateSpy, + waitForLoadState: waitForLoadStateSpy, + }; await browser.performAction("", PageAction.Scroll, "down"); expect(evaluateSpy).toHaveBeenCalledWith(expect.any(Function), "down"); + // Scroll should also settle briefly so lazy-loaded content can render + // before the next aria-tree snapshot. 
+ expect(waitForLoadStateSpy).toHaveBeenCalledWith("networkidle", { timeout: 500 }); + }); + + it("should not fail scroll if the settle timeout fires", async () => { + // The settle is best-effort — a thrown timeout from waitForLoadState + // must not bubble up and fail the scroll action. + const evaluateSpy = vi.fn().mockResolvedValue(undefined); + const waitForLoadStateSpy = vi + .fn() + .mockRejectedValue(new Error("Timeout 500ms exceeded waiting for networkidle")); + (browser as any).page = { + evaluate: evaluateSpy, + waitForLoadState: waitForLoadStateSpy, + }; + + await expect(browser.performAction("", PageAction.Scroll, "down")).resolves.not.toThrow(); }); it("should throw BrowserActionException for missing direction on scroll", async () => { - (browser as any).page = { evaluate: vi.fn() }; + (browser as any).page = { evaluate: vi.fn(), waitForLoadState: vi.fn() }; await expect(browser.performAction("", PageAction.Scroll)).rejects.toThrow( BrowserActionException, diff --git a/packages/extension/src/background/ExtensionBrowser.ts b/packages/extension/src/background/ExtensionBrowser.ts index c2d0b6e3..a3ad81c5 100644 --- a/packages/extension/src/background/ExtensionBrowser.ts +++ b/packages/extension/src/background/ExtensionBrowser.ts @@ -364,6 +364,10 @@ export class ExtensionBrowser implements AriaBrowser { }, args: [value], }); + // Settle window for lazy-loaded content. Extension runtime has no + // networkidle equivalent, so a short fixed wait is the simplest + // mirror of the Playwright implementation's settle. + await new Promise((resolve) => setTimeout(resolve, 300)); return; }