diff --git a/packages/core/src/browser/ariaBrowser.ts b/packages/core/src/browser/ariaBrowser.ts index f3e39c91..ca1abacd 100644 --- a/packages/core/src/browser/ariaBrowser.ts +++ b/packages/core/src/browser/ariaBrowser.ts @@ -21,11 +21,20 @@ export enum PageAction { Goto = "goto", Back = "back", Forward = "forward", + Scroll = "scroll", Extract = "extract", Done = "done", Abort = "abort", } +/** + * Scroll directions supported by {@link PageAction.Scroll}; the single source for the {@link ScrollDirection} union and for runtime validation. + * - `up` / `down`: scroll one viewport in that direction + * - `top` / `bottom`: scroll all the way to the top/bottom of the document + */ +export const SCROLL_DIRECTIONS = ["up", "down", "top", "bottom"] as const; +export type ScrollDirection = (typeof SCROLL_DIRECTIONS)[number]; + /** * Page load states to wait for */ diff --git a/packages/core/src/browser/playwrightBrowser.ts b/packages/core/src/browser/playwrightBrowser.ts index 7d95f5fe..510f2a71 100644 --- a/packages/core/src/browser/playwrightBrowser.ts +++ b/packages/core/src/browser/playwrightBrowser.ts @@ -893,6 +893,42 @@ export class PlaywrightBrowser implements AriaBrowser { // Note: goForward already calls ensureOptimizedPageLoad internally break; + case PageAction.Scroll: + if (!value) + throw new BrowserActionException( + "scroll", + "Direction required for scroll action", + ); + await this.page!.evaluate((direction: string) => { + switch (direction) { + case "down": + window.scrollBy({ left: 0, top: window.innerHeight, behavior: "instant" }); + return; + case "up": + window.scrollBy({ left: 0, top: -window.innerHeight, behavior: "instant" }); + return; + case "top": + window.scrollTo({ left: 0, top: 0, behavior: "instant" }); + return; + case "bottom": + window.scrollTo({ + left: 0, + top: document.documentElement.scrollHeight, + behavior: "instant", + }); + return; + default: + throw new Error(`Unsupported scroll direction: ${direction}`); + } + }, value); + // Settle window for lazy-loaded content: scroll commonly + // triggers 
IntersectionObserver-driven loads that may not have + // resolved before the next aria snapshot. Tight timeout so + // we don't burn turns on background telemetry traffic; the + // catch makes this best-effort. + await this.page!.waitForLoadState("networkidle", { timeout: 500 }).catch(() => {}); + break; + case PageAction.Extract: // Extract is handled at a higher level in the automation flow // The browser implementation doesn't need to do anything diff --git a/packages/core/src/prompts.ts b/packages/core/src/prompts.ts index 2c9b374a..d38105d0 100644 --- a/packages/core/src/prompts.ts +++ b/packages/core/src/prompts.ts @@ -56,6 +56,11 @@ export const TOOL_STRINGS = { forward: { description: "Go forward to the next page", }, + scroll: { + description: + "Scroll the page. 'up'/'down' scroll one viewport in that direction; 'top'/'bottom' jump to the start/end of the document. Use after the accessibility tree appears incomplete on dynamic / infinite-scroll pages.", + direction: "Direction to scroll: 'up', 'down', 'top', or 'bottom'", + }, extract: { description: "Extract specific data from the current page for later reference", dataDescription: @@ -155,7 +160,7 @@ You adapt to situations and find creative ways to complete tasks without getting IMPORTANT: - You can see the entire page content through the accessibility tree snapshot. -- The accessibility tree shows all currently loaded page elements. On dynamic pages, some content may only appear after scrolling or interaction — if expected data isn't visible, try scrolling or interacting to trigger loading. +- The accessibility tree shows all currently loaded page elements. On dynamic pages, some content may only appear after scrolling or interaction — if expected data isn't visible, try scroll() or wait() to trigger loading. - Focus on the elements you need to interact with directly. 
`.trim(); @@ -178,6 +183,7 @@ function buildToolExamples( `- goto({"url": "https://example.com"}) - ${TOOL_STRINGS.webActions.goto.description}`, `- back() - ${TOOL_STRINGS.webActions.back.description}`, `- forward() - ${TOOL_STRINGS.webActions.forward.description}`, + `- scroll({"direction": "down"}) - ${TOOL_STRINGS.webActions.scroll.description}`, `- extract({"description": "data to extract"}) - ${TOOL_STRINGS.webActions.extract.description}`, ]; @@ -336,7 +342,7 @@ Analyze the current page state and determine your next action based on previous - extract() if you need more information **Best Practices:** -- The accessibility tree shows currently loaded elements; dynamic pages may load more content on scroll +- The accessibility tree shows currently loaded elements; if expected content is missing, use scroll() to reveal more of the page or wait() to let dynamic content load - Clear obstructing modals/popups first - Prefer click() over goto() for page navigation - Submit forms via enter() or submit button after filling diff --git a/packages/core/src/tools/webActionTools.ts b/packages/core/src/tools/webActionTools.ts index 8d91f928..3c4d6518 100644 --- a/packages/core/src/tools/webActionTools.ts +++ b/packages/core/src/tools/webActionTools.ts @@ -7,7 +7,7 @@ import { tool } from "ai"; import { z } from "zod"; -import { AriaBrowser, PageAction } from "../browser/ariaBrowser.js"; +import { AriaBrowser, PageAction, SCROLL_DIRECTIONS } from "../browser/ariaBrowser.js"; import { WebAgentEventEmitter, WebAgentEventType } from "../events.js"; import { buildExtractionPrompt, TOOL_STRINGS } from "../prompts.js"; import type { ProviderConfig } from "../provider.js"; @@ -244,6 +244,16 @@ export function createWebActionTools(context: WebActionContext) { }, }), + scroll: tool({ + description: TOOL_STRINGS.webActions.scroll.description, + inputSchema: z.object({ + direction: z.enum(SCROLL_DIRECTIONS).describe(TOOL_STRINGS.webActions.scroll.direction), + }), + execute: async ({ 
direction }) => { + return await performActionWithValidation(PageAction.Scroll, context, undefined, direction); + }, + }), + goto: tool({ description: TOOL_STRINGS.webActions.goto.description, inputSchema: z.object({ diff --git a/packages/core/test/ariaBrowser.test.ts b/packages/core/test/ariaBrowser.test.ts index 0c1f3f6a..216d90bc 100644 --- a/packages/core/test/ariaBrowser.test.ts +++ b/packages/core/test/ariaBrowser.test.ts @@ -17,6 +17,7 @@ describe("AriaBrowser interface", () => { "goto", "back", "forward", + "scroll", "extract", "done", "abort", @@ -63,7 +64,12 @@ describe("AriaBrowser interface", () => { ]; // Navigation actions - const navigationActions = [PageAction.Goto, PageAction.Back, PageAction.Forward]; + const navigationActions = [ + PageAction.Goto, + PageAction.Back, + PageAction.Forward, + PageAction.Scroll, + ]; // Control actions const controlActions = [ diff --git a/packages/core/test/playwrightBrowser.test.ts b/packages/core/test/playwrightBrowser.test.ts index f61b3dc2..ec49f9d6 100644 --- a/packages/core/test/playwrightBrowser.test.ts +++ b/packages/core/test/playwrightBrowser.test.ts @@ -676,6 +676,7 @@ describe("PlaywrightBrowser", () => { PageAction.Goto, PageAction.Back, PageAction.Forward, + PageAction.Scroll, PageAction.Done, ]; @@ -683,6 +684,48 @@ describe("PlaywrightBrowser", () => { expect((browser as any).actionRequiresElement(action)).toBe(false); }); }); + + it("should run scroll via page.evaluate with the given direction", async () => { + const evaluateSpy = vi.fn().mockResolvedValue(undefined); + const waitForLoadStateSpy = vi.fn().mockResolvedValue(undefined); + (browser as any).page = { + evaluate: evaluateSpy, + waitForLoadState: waitForLoadStateSpy, + }; + + await browser.performAction("", PageAction.Scroll, "down"); + + expect(evaluateSpy).toHaveBeenCalledWith(expect.any(Function), "down"); + // Scroll should also settle briefly so lazy-loaded content can render + // before the next aria-tree snapshot. 
+ expect(waitForLoadStateSpy).toHaveBeenCalledWith("networkidle", { timeout: 500 }); + }); + + it("should not fail scroll if the settle timeout fires", async () => { + // The settle is best-effort — a thrown timeout from waitForLoadState + // must not bubble up and fail the scroll action. + const evaluateSpy = vi.fn().mockResolvedValue(undefined); + const waitForLoadStateSpy = vi + .fn() + .mockRejectedValue(new Error("Timeout 500ms exceeded waiting for networkidle")); + (browser as any).page = { + evaluate: evaluateSpy, + waitForLoadState: waitForLoadStateSpy, + }; + + await expect(browser.performAction("", PageAction.Scroll, "down")).resolves.not.toThrow(); + }); + + it("should throw BrowserActionException for missing direction on scroll", async () => { + (browser as any).page = { evaluate: vi.fn(), waitForLoadState: vi.fn() }; + + await expect(browser.performAction("", PageAction.Scroll)).rejects.toThrow( + BrowserActionException, + ); + await expect(browser.performAction("", PageAction.Scroll)).rejects.toThrow( + "Direction required for scroll action", + ); + }); }); describe("performAction error handling", () => { diff --git a/packages/core/test/prompts.test.ts b/packages/core/test/prompts.test.ts index 71c19603..199e8bb7 100644 --- a/packages/core/test/prompts.test.ts +++ b/packages/core/test/prompts.test.ts @@ -209,6 +209,7 @@ describe("prompts", () => { "goto", "back", "forward", + "scroll", "done", "abort", ]; diff --git a/packages/core/test/tools/webActionTools.test.ts b/packages/core/test/tools/webActionTools.test.ts index c2bc2e96..f1e40703 100644 --- a/packages/core/test/tools/webActionTools.test.ts +++ b/packages/core/test/tools/webActionTools.test.ts @@ -129,6 +129,7 @@ describe("Web Action Tools", () => { expect(tools.goto).toBeDefined(); expect(tools.back).toBeDefined(); expect(tools.forward).toBeDefined(); + expect(tools.scroll).toBeDefined(); expect(tools.extract).toBeDefined(); expect(tools.done).toBeDefined(); expect(tools.abort).toBeDefined(); 
@@ -151,6 +152,7 @@ describe("Web Action Tools", () => { ); expect(tools.back.description).toBe("Go back to the previous page"); expect(tools.forward.description).toBe("Go forward to the next page"); + expect(tools.scroll.description).toContain("Scroll the page"); expect(tools.extract.description).toBe( "Extract specific data from the current page for later reference", ); @@ -420,6 +422,35 @@ describe("Web Action Tools", () => { }); }); + describe("Scroll Action", () => { + it("should dispatch scroll action with the requested direction", async () => { + const performActionSpy = vi.spyOn(mockBrowser, "performAction"); + + const result = await tools.scroll.execute({ direction: "down" }); + + expect(performActionSpy).toHaveBeenCalledWith("", PageAction.Scroll, "down"); + expect(result).toEqual({ + success: true, + action: "scroll", + value: "down", + }); + }); + + it("should accept all four directions", () => { + const schema = tools.scroll.inputSchema; + for (const direction of ["up", "down", "top", "bottom"]) { + expect(schema.safeParse({ direction }).success).toBe(true); + } + }); + + it("should reject unknown directions", () => { + const schema = tools.scroll.inputSchema; + expect(schema.safeParse({ direction: "left" }).success).toBe(false); + expect(schema.safeParse({ direction: "" }).success).toBe(false); + expect(schema.safeParse({}).success).toBe(false); + }); + }); + describe("Navigation Actions", () => { it("should execute goto action successfully", async () => { const performActionSpy = vi.spyOn(mockBrowser, "performAction"); diff --git a/packages/extension/src/background/ExtensionBrowser.ts b/packages/extension/src/background/ExtensionBrowser.ts index 6619d4c6..a3ad81c5 100644 --- a/packages/extension/src/background/ExtensionBrowser.ts +++ b/packages/extension/src/background/ExtensionBrowser.ts @@ -335,6 +335,42 @@ export class ExtensionBrowser implements AriaBrowser { // Note: goForward already calls ensureOptimizedPageLoad internally return; + case 
PageAction.Scroll: { + if (!value) throw new Error("Direction required for scroll action"); + const tab = await this.getActiveTab(); + await browser.scripting.executeScript({ + target: { tabId: tab.id! }, + func: (direction: string) => { + switch (direction) { + case "down": + window.scrollBy({ left: 0, top: window.innerHeight, behavior: "instant" }); + return; + case "up": + window.scrollBy({ left: 0, top: -window.innerHeight, behavior: "instant" }); + return; + case "top": + window.scrollTo({ left: 0, top: 0, behavior: "instant" }); + return; + case "bottom": + window.scrollTo({ + left: 0, + top: document.documentElement.scrollHeight, + behavior: "instant", + }); + return; + default: + throw new Error(`Unsupported scroll direction: ${direction}`); + } + }, + args: [value], + }); + // Settle window for lazy-loaded content. The extension runtime has no + // networkidle equivalent, so a fixed 300 ms pause approximates the + // Playwright implementation's post-scroll settle. + await new Promise((resolve) => setTimeout(resolve, 300)); + return; + } + case PageAction.Done: // This is a no-op in the browser implementation // It's handled at a higher level in the automation flow