Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions packages/core/src/browser/ariaBrowser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,20 @@ export enum PageAction {
Goto = "goto",
Back = "back",
Forward = "forward",
Scroll = "scroll",
Extract = "extract",
Done = "done",
Abort = "abort",
}

/**
 * Scroll directions supported by {@link PageAction.Scroll}.
 * - `up` / `down`: scroll one viewport in that direction
 * - `top` / `bottom`: scroll all the way to the top/bottom of the document
 *
 * Declared `as const` so the value is a readonly tuple of string literals.
 * This keeps the literal types available to schema builders that need a
 * non-empty tuple (for example `z.enum(SCROLL_DIRECTIONS)`) and makes this
 * list the single source of truth for {@link ScrollDirection}.
 */
export const SCROLL_DIRECTIONS = ["up", "down", "top", "bottom"] as const;

/** Union of the literal values listed in {@link SCROLL_DIRECTIONS}. */
export type ScrollDirection = (typeof SCROLL_DIRECTIONS)[number];

/**
* Page load states to wait for
*/
Expand Down
36 changes: 36 additions & 0 deletions packages/core/src/browser/playwrightBrowser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -893,6 +893,42 @@ export class PlaywrightBrowser implements AriaBrowser {
// Note: goForward already calls ensureOptimizedPageLoad internally
break;

case PageAction.Scroll:
// Scroll the window in the direction carried by `value`
// ("up" | "down" | "top" | "bottom"), then pause briefly before returning.
// A missing/empty direction is rejected up front with a BrowserActionException.
if (!value)
throw new BrowserActionException(
"scroll",
"Direction required for scroll action",
);
// The callback is handed to page.evaluate together with `value`, so it
// receives the direction as its only argument and uses only `window` and
// `document`.
await this.page!.evaluate((direction: string) => {
switch (direction) {
case "down":
// Advance by exactly one viewport height.
window.scrollBy({ left: 0, top: window.innerHeight, behavior: "instant" });
return;
case "up":
// Go back by exactly one viewport height.
window.scrollBy({ left: 0, top: -window.innerHeight, behavior: "instant" });
return;
case "top":
window.scrollTo({ left: 0, top: 0, behavior: "instant" });
return;
case "bottom":
// scrollHeight is read at call time; content appended afterwards
// is not accounted for by this single jump.
window.scrollTo({
left: 0,
top: document.documentElement.scrollHeight,
behavior: "instant",
});
return;
default:
// Thrown inside the evaluated callback; the rejection of
// page.evaluate is not wrapped in a BrowserActionException here.
throw new Error(`Unsupported scroll direction: ${direction}`);
}
}, value);
// Settle window for lazy-loaded content: scroll commonly
// triggers IntersectionObserver-driven loads that hadn't
// resolved before our next aria snapshot. Tight timeout so
// we don't burn turns on background telemetry traffic; the
// catch makes this best-effort.
// NOTE(review): if Playwright treats "networkidle" as already reached
// for the current document, this call may resolve immediately and give
// no settle time after the scroll — confirm against the Playwright docs.
await this.page!.waitForLoadState("networkidle", { timeout: 500 }).catch(() => {});
break;

case PageAction.Extract:
// Extract is handled at a higher level in the automation flow
// The browser implementation doesn't need to do anything
Expand Down
10 changes: 8 additions & 2 deletions packages/core/src/prompts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ export const TOOL_STRINGS = {
forward: {
description: "Go forward to the next page",
},
scroll: {
description:
"Scroll the page. 'up'/'down' scroll one viewport in that direction; 'top'/'bottom' jump to the start/end of the document. Use after the accessibility tree appears incomplete on dynamic / infinite-scroll pages.",
direction: "Direction to scroll: 'up', 'down', 'top', or 'bottom'",
},
extract: {
description: "Extract specific data from the current page for later reference",
dataDescription:
Expand Down Expand Up @@ -155,7 +160,7 @@ You adapt to situations and find creative ways to complete tasks without getting

IMPORTANT:
- You can see the entire page content through the accessibility tree snapshot.
- The accessibility tree shows all currently loaded page elements. On dynamic pages, some content may only appear after scrolling or interaction — if expected data isn't visible, try scrolling or interacting to trigger loading.
- The accessibility tree shows all currently loaded page elements. On dynamic pages, some content may only appear after scrolling or interaction — if expected data isn't visible, try scroll() or wait() to trigger loading.
- Focus on the elements you need to interact with directly.
`.trim();

Expand All @@ -178,6 +183,7 @@ function buildToolExamples(
`- goto({"url": "https://example.com"}) - ${TOOL_STRINGS.webActions.goto.description}`,
`- back() - ${TOOL_STRINGS.webActions.back.description}`,
`- forward() - ${TOOL_STRINGS.webActions.forward.description}`,
`- scroll({"direction": "down"}) - ${TOOL_STRINGS.webActions.scroll.description}`,
`- extract({"description": "data to extract"}) - ${TOOL_STRINGS.webActions.extract.description}`,
];

Expand Down Expand Up @@ -336,7 +342,7 @@ Analyze the current page state and determine your next action based on previous
- extract() if you need more information

**Best Practices:**
- The accessibility tree shows currently loaded elements; dynamic pages may load more content on scroll
- The accessibility tree shows currently loaded elements; if expected content is missing, use scroll() to reveal more of the page or wait() to let dynamic content load
- Clear obstructing modals/popups first
- Prefer click() over goto() for page navigation
- Submit forms via enter() or submit button after filling
Expand Down
12 changes: 11 additions & 1 deletion packages/core/src/tools/webActionTools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import { tool } from "ai";
import { z } from "zod";
import { AriaBrowser, PageAction } from "../browser/ariaBrowser.js";
import { AriaBrowser, PageAction, SCROLL_DIRECTIONS } from "../browser/ariaBrowser.js";
import { WebAgentEventEmitter, WebAgentEventType } from "../events.js";
import { buildExtractionPrompt, TOOL_STRINGS } from "../prompts.js";
import type { ProviderConfig } from "../provider.js";
Expand Down Expand Up @@ -244,6 +244,16 @@ export function createWebActionTools(context: WebActionContext) {
},
}),

// Scroll tool: exposes PageAction.Scroll to the model. The input schema
// restricts `direction` to the values listed in SCROLL_DIRECTIONS, so any
// other direction is rejected by schema validation.
scroll: tool({
description: TOOL_STRINGS.webActions.scroll.description,
inputSchema: z.object({
direction: z.enum(SCROLL_DIRECTIONS).describe(TOOL_STRINGS.webActions.scroll.direction),
}),
execute: async ({ direction }) => {
// Scroll targets the page rather than an element, so no element ref is
// passed (third argument is undefined); the direction travels as the value.
return await performActionWithValidation(PageAction.Scroll, context, undefined, direction);
},
}),

goto: tool({
description: TOOL_STRINGS.webActions.goto.description,
inputSchema: z.object({
Expand Down
8 changes: 7 additions & 1 deletion packages/core/test/ariaBrowser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ describe("AriaBrowser interface", () => {
"goto",
"back",
"forward",
"scroll",
"extract",
"done",
"abort",
Expand Down Expand Up @@ -63,7 +64,12 @@ describe("AriaBrowser interface", () => {
];

// Navigation actions
const navigationActions = [PageAction.Goto, PageAction.Back, PageAction.Forward];
const navigationActions = [
PageAction.Goto,
PageAction.Back,
PageAction.Forward,
PageAction.Scroll,
];

// Control actions
const controlActions = [
Expand Down
43 changes: 43 additions & 0 deletions packages/core/test/playwrightBrowser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -676,13 +676,56 @@ describe("PlaywrightBrowser", () => {
PageAction.Goto,
PageAction.Back,
PageAction.Forward,
PageAction.Scroll,
PageAction.Done,
];

nonElementActions.forEach((action) => {
expect((browser as any).actionRequiresElement(action)).toBe(false);
});
});

// Happy path: a scroll action forwards the direction to page.evaluate and
// then performs the short networkidle settle.
it("should run scroll via page.evaluate with the given direction", async () => {
const evaluateSpy = vi.fn().mockResolvedValue(undefined);
const waitForLoadStateSpy = vi.fn().mockResolvedValue(undefined);
// Swap in a stub page exposing only evaluate and waitForLoadState.
(browser as any).page = {
evaluate: evaluateSpy,
waitForLoadState: waitForLoadStateSpy,
};

// Empty ref: scroll is not tied to a specific element.
await browser.performAction("", PageAction.Scroll, "down");

expect(evaluateSpy).toHaveBeenCalledWith(expect.any(Function), "down");
// Scroll should also settle briefly so lazy-loaded content can render
// before the next aria-tree snapshot.
expect(waitForLoadStateSpy).toHaveBeenCalledWith("networkidle", { timeout: 500 });
});

// Settle failure path: waitForLoadState rejects, yet the scroll action as a
// whole must still resolve.
it("should not fail scroll if the settle timeout fires", async () => {
// The settle is best-effort — a thrown timeout from waitForLoadState
// must not bubble up and fail the scroll action.
const evaluateSpy = vi.fn().mockResolvedValue(undefined);
// Simulate the timeout error by rejecting the mocked waitForLoadState.
const waitForLoadStateSpy = vi
.fn()
.mockRejectedValue(new Error("Timeout 500ms exceeded waiting for networkidle"));
(browser as any).page = {
evaluate: evaluateSpy,
waitForLoadState: waitForLoadStateSpy,
};

await expect(browser.performAction("", PageAction.Scroll, "down")).resolves.not.toThrow();
});

// Validation path: calling scroll without a direction must reject with a
// BrowserActionException carrying the "Direction required" message.
it("should throw BrowserActionException for missing direction on scroll", async () => {
(browser as any).page = { evaluate: vi.fn(), waitForLoadState: vi.fn() };

// The action is invoked twice so the exception class and the message text
// are each asserted against a fresh rejection.
await expect(browser.performAction("", PageAction.Scroll)).rejects.toThrow(
BrowserActionException,
);
await expect(browser.performAction("", PageAction.Scroll)).rejects.toThrow(
"Direction required for scroll action",
);
});
});

describe("performAction error handling", () => {
Expand Down
1 change: 1 addition & 0 deletions packages/core/test/prompts.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ describe("prompts", () => {
"goto",
"back",
"forward",
"scroll",
"done",
"abort",
];
Expand Down
31 changes: 31 additions & 0 deletions packages/core/test/tools/webActionTools.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ describe("Web Action Tools", () => {
expect(tools.goto).toBeDefined();
expect(tools.back).toBeDefined();
expect(tools.forward).toBeDefined();
expect(tools.scroll).toBeDefined();
expect(tools.extract).toBeDefined();
expect(tools.done).toBeDefined();
expect(tools.abort).toBeDefined();
Expand All @@ -151,6 +152,7 @@ describe("Web Action Tools", () => {
);
expect(tools.back.description).toBe("Go back to the previous page");
expect(tools.forward.description).toBe("Go forward to the next page");
expect(tools.scroll.description).toContain("Scroll the page");
expect(tools.extract.description).toBe(
"Extract specific data from the current page for later reference",
);
Expand Down Expand Up @@ -420,6 +422,35 @@ describe("Web Action Tools", () => {
});
});

// Tests for the `scroll` tool: dispatch to the browser and input-schema
// validation of the `direction` field.
describe("Scroll Action", () => {
it("should dispatch scroll action with the requested direction", async () => {
const performActionSpy = vi.spyOn(mockBrowser, "performAction");

const result = await tools.scroll.execute({ direction: "down" });

// The tool passes no element ref; the browser is expected to receive an
// empty-string ref alongside the direction value.
expect(performActionSpy).toHaveBeenCalledWith("", PageAction.Scroll, "down");
expect(result).toEqual({
success: true,
action: "scroll",
value: "down",
});
});

it("should accept all four directions", () => {
const schema = tools.scroll.inputSchema;
for (const direction of ["up", "down", "top", "bottom"]) {
expect(schema.safeParse({ direction }).success).toBe(true);
}
});

it("should reject unknown directions", () => {
const schema = tools.scroll.inputSchema;
// Covers an unsupported value, an empty string, and a missing field.
expect(schema.safeParse({ direction: "left" }).success).toBe(false);
expect(schema.safeParse({ direction: "" }).success).toBe(false);
expect(schema.safeParse({}).success).toBe(false);
});
});

describe("Navigation Actions", () => {
it("should execute goto action successfully", async () => {
const performActionSpy = vi.spyOn(mockBrowser, "performAction");
Expand Down
36 changes: 36 additions & 0 deletions packages/extension/src/background/ExtensionBrowser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,42 @@ export class ExtensionBrowser implements AriaBrowser {
// Note: goForward already calls ensureOptimizedPageLoad internally
return;

case PageAction.Scroll: {
// Scroll the active tab's window in the direction carried by `value`
// ("up" | "down" | "top" | "bottom"), then wait a fixed 300 ms.
// A missing/empty direction throws a plain Error here.
if (!value) throw new Error("Direction required for scroll action");
const tab = await this.getActiveTab();
// `func` is injected into the tab and receives `value` through `args`,
// so it uses only its `direction` parameter plus `window` and `document`.
await browser.scripting.executeScript({
target: { tabId: tab.id! },
func: (direction: string) => {
switch (direction) {
case "down":
// Advance by exactly one viewport height.
window.scrollBy({ left: 0, top: window.innerHeight, behavior: "instant" });
return;
case "up":
// Go back by exactly one viewport height.
window.scrollBy({ left: 0, top: -window.innerHeight, behavior: "instant" });
return;
case "top":
window.scrollTo({ left: 0, top: 0, behavior: "instant" });
return;
case "bottom":
// scrollHeight is read at call time; content appended afterwards
// is not accounted for by this single jump.
window.scrollTo({
left: 0,
top: document.documentElement.scrollHeight,
behavior: "instant",
});
return;
default:
// NOTE(review): an error thrown inside the injected function may
// not reject executeScript on every browser — confirm that an
// unsupported direction actually surfaces to the caller.
throw new Error(`Unsupported scroll direction: ${direction}`);
}
},
args: [value],
});
// Settle window for lazy-loaded content. Extension runtime has no
// networkidle equivalent, so a short fixed wait is the simplest
// mirror of the Playwright implementation's settle.
await new Promise((resolve) => setTimeout(resolve, 300));
return;
}

case PageAction.Done:
// This is a no-op in the browser implementation
// It's handled at a higher level in the automation flow
Expand Down
Loading