Predicate agent

SentienceDEV · SentienceDEV · commit 3e9f00be8ef9 · 2026-02-14T20:48:12.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,90 @@ All notable changes to `@predicatelabs/sdk` will be documented in this file.
 
 ## Unreleased
 
+### 2026-02-15
+
+#### PredicateBrowserAgent (snapshot-first, verification-first)
+
+`PredicateBrowserAgent` is a new high-level agent wrapper that gives you a **browser-use-like** `step()` / `run()` surface, but keeps Predicate’s core philosophy:
+
+- **Snapshot-first perception** (structured DOM snapshot is the default)
+- **Verification-first control plane** (you can gate progress with deterministic checks)
+- Optional **vision fallback** (bounded) when snapshots aren’t sufficient
+
+It’s built on top of `AgentRuntime` + `RuntimeAgent`.
+
+##### Quickstart (single step)
+
+```ts
+import {
+  AgentRuntime,
+  PredicateBrowserAgent,
+  type RuntimeStep,
+  LocalLLMProvider, // or OpenAIProvider / AnthropicProvider / DeepInfraProvider
+} from '@predicatelabs/sdk';
+
+const runtime = new AgentRuntime(browserLike, page, tracer);
+const llm = new LocalLLMProvider({ model: 'qwen2.5:7b', baseUrl: 'http://localhost:11434/v1' });
+
+const agent = new PredicateBrowserAgent({
+  runtime,
+  executor: llm,
+  config: {
+    // Token control: include last N step summaries in the prompt (0 disables history).
+    historyLastN: 2,
+  },
+});
+
+// step() resolves to a StepOutcome object: { stepGoal, ok }.
+const { ok } = await agent.step({
+  taskGoal: 'Find pricing and verify checkout button exists',
+  step: { goal: 'Open pricing page' } satisfies RuntimeStep,
+});
+```
+
+##### Customize the compact prompt (advanced)
+
+```ts
+const agent = new PredicateBrowserAgent({
+  runtime,
+  executor: llm,
+  config: {
+    compactPromptBuilder: (_taskGoal, _stepGoal, domContext, _snap, historySummary) => ({
+      systemPrompt:
+        'You are a web automation agent. Return ONLY one action: CLICK(id) | TYPE(id,"text") | PRESS("key") | FINISH()',
+      userPrompt: `RECENT:\n${historySummary}\n\nELEMENTS:\n${domContext}\n\nReturn the single best action:`,
+    }),
+  },
+});
+```
+
+##### CAPTCHA handling (interface-only; no solver shipped)
+
+If you set `captcha.policy` to `'callback'`, you must also provide `captcha.handler`; otherwise the `PredicateBrowserAgent` constructor throws. The SDK does **not** include a public CAPTCHA solver.
+
+```ts
+import { HumanHandoffSolver } from '@predicatelabs/sdk';
+
+const agent = new PredicateBrowserAgent({
+  runtime,
+  executor: llm,
+  config: {
+    captcha: {
+      policy: 'callback',
+      // Manual solve in the live session; SDK waits until it clears:
+      handler: HumanHandoffSolver({ timeoutMs: 10 * 60_000, pollMs: 1_000 }),
+    },
+  },
+});
+```
+
+#### RuntimeAgent: structured prompt override hooks
+
+`RuntimeAgent` now supports optional hooks used by `PredicateBrowserAgent`:
+
+- `structuredPromptBuilder(...)`
+- `domContextPostprocessor(...)`
+- `historySummaryProvider(...)`
+
 ### 2026-02-13
 
 #### Expanded deterministic verifications (adaptive resnapshotting)
diff --git a/src/agents/browser-agent.ts b/src/agents/browser-agent.ts
@@ -0,0 +1,207 @@
+import type { Snapshot, StepHookContext } from '../types';
+import type { PermissionPolicy } from '../browser';
+import type { AgentRuntime } from '../agent-runtime';
+import type { LLMProvider } from '../llm-provider';
+import { RuntimeAgent } from '../runtime-agent';
+import type { RuntimeStep } from '../runtime-agent';
+import type { CaptchaOptions } from '../captcha/types';
+import type { CaptchaHandler } from '../captcha/types';
+
+/**
+ * Optional settings for permission recovery, supplied through
+ * `PredicateBrowserAgentConfig.permissionRecovery`.
+ *
+ * NOTE(review): nothing in this file reads these fields; their exact semantics
+ * are presumably defined by the runtime/browser layer — confirm there before
+ * relying on them.
+ */
+export interface PermissionRecoveryConfig {
+  enabled?: boolean;
+  maxRestarts?: number;
+  autoGrant?: string[];
+  geolocation?: Record<string, any> | null;
+  origin?: string | null;
+}
+
+/**
+ * Settings for the optional vision fallback.
+ *
+ * `PredicateBrowserAgent.step()` reads `enabled` and `maxVisionCalls`: when
+ * `enabled` is true, `maxVisionCalls` is greater than zero, and the agent's
+ * internal vision-call counter has reached `maxVisionCalls`, the step is run
+ * with the vision executor switched off.
+ *
+ * NOTE(review): the `trigger*` flags are not read anywhere in this file —
+ * presumably they are reserved for trigger heuristics; confirm before use.
+ */
+export interface VisionFallbackConfig {
+  enabled?: boolean;
+  maxVisionCalls?: number;
+  triggerRequiresVision?: boolean;
+  triggerRepeatedNoop?: boolean;
+  triggerCanvasOrLowActionables?: boolean;
+}
+
+/**
+ * Agent-level CAPTCHA settings; translated into the runtime's `CaptchaOptions`
+ * when a `PredicateBrowserAgent` is constructed.
+ */
+export interface CaptchaConfig {
+  // Defaults to 'abort' when omitted.
+  policy?: 'abort' | 'callback';
+  // Interface-only: SDK does not ship captcha solvers. Users provide a handler/callback.
+  // Required when policy is 'callback'; a missing/null handler makes construction throw.
+  handler?: CaptchaHandler | null;
+  // Only forwarded for the 'callback' policy; null/undefined falls back to 120_000.
+  timeoutMs?: number | null;
+  // Only forwarded for the 'callback' policy; null/undefined falls back to 1_000.
+  pollMs?: number | null;
+  // Forwarded for both policies; defaults to 0.7 when omitted.
+  minConfidence?: number;
+}
+
+/**
+ * Configuration accepted by `PredicateBrowserAgent`.
+ *
+ * The constructor merges this object over its defaults with a shallow spread,
+ * so a supplied `vision` or `captcha` object replaces the default object as a
+ * whole rather than being merged field by field.
+ */
+export interface PredicateBrowserAgentConfig {
+  // Permissions
+  // Both default to null. NOTE(review): neither is read elsewhere in this file.
+  permissionStartup?: PermissionPolicy | null;
+  permissionRecovery?: PermissionRecoveryConfig | null;
+
+  // Vision fallback
+  // Defaults to { enabled: false, maxVisionCalls: 0 }.
+  vision?: VisionFallbackConfig;
+
+  // CAPTCHA handling
+  // Defaults to { policy: 'abort', handler: null }.
+  captcha?: CaptchaConfig;
+
+  // Prompt / token controls
+  // Defaults to 0.
+  historyLastN?: number; // 0 disables LLM-facing step history
+
+  // Compact prompt customization
+  // builder(taskGoal, stepGoal, domContext, snapshot, historySummary) -> {systemPrompt, userPrompt}
+  // Forwarded to RuntimeAgent as `structuredPromptBuilder`.
+  compactPromptBuilder?: (
+    taskGoal: string,
+    stepGoal: string,
+    domContext: string,
+    snap: Snapshot,
+    historySummary: string
+  ) => { systemPrompt: string; userPrompt: string };
+
+  // Forwarded to RuntimeAgent as `domContextPostprocessor`.
+  compactPromptPostprocessor?: (domContext: string) => string;
+}
+
+/**
+ * Renders history entries as a dash-prefixed list, one entry per line.
+ * An empty input produces an empty string.
+ */
+function historySummary(items: string[]): string {
+  const bullets: string[] = [];
+  for (const entry of items) {
+    bullets.push(`- ${entry}`);
+  }
+  // Joining zero bullets yields '', matching the empty-input contract.
+  return bullets.join('\n');
+}
+
+/**
+ * Translates the agent-level `CaptchaConfig` into `CaptchaOptions` and installs
+ * them on the runtime through `runtime.setCaptchaOptions`.
+ *
+ * - No config: the runtime is left untouched.
+ * - `policy` omitted or 'abort' (compared case-insensitively): only `policy`
+ *   and `minConfidence` (default 0.7) are set.
+ * - 'callback': a handler is mandatory; `timeoutMs`, `pollMs` and
+ *   `minConfidence` fall back to 120_000, 1_000 and 0.7.
+ *
+ * @param runtime runtime that receives the captcha options
+ * @param cfg agent-level captcha settings, or undefined to do nothing
+ * @throws Error if `policy` is neither 'abort' nor 'callback', or if `policy`
+ *   is 'callback' and no handler is provided
+ */
+function applyCaptchaConfigToRuntime(runtime: AgentRuntime, cfg: CaptchaConfig | undefined): void {
+  if (!cfg) return;
+
+  // Lower-cased so untyped callers passing e.g. 'ABORT' keep working.
+  const policy: string = (cfg.policy ?? 'abort').toLowerCase();
+  const minConfidence = cfg.minConfidence ?? 0.7;
+
+  if (policy === 'abort') {
+    runtime.setCaptchaOptions({
+      policy: 'abort',
+      minConfidence,
+    } satisfies CaptchaOptions);
+    return;
+  }
+
+  // Any value other than 'abort' used to fall through to the callback path, so a
+  // typo either enabled callback mode silently or produced a misleading
+  // "handler is required" error. Reject unknown policies explicitly instead.
+  if (policy !== 'callback') {
+    throw new Error(`Unsupported captcha.policy "${policy}". Expected "abort" or "callback".`);
+  }
+
+  const handler = cfg.handler ?? null;
+  if (!handler) {
+    throw new Error(
+      'captcha.handler is required when captcha.policy="callback". ' +
+        'Provide a handler callback (e.g. human handoff or your external system).'
+    );
+  }
+
+  runtime.setCaptchaOptions({
+    policy: 'callback',
+    handler,
+    timeoutMs: cfg.timeoutMs ?? 120_000,
+    pollMs: cfg.pollMs ?? 1_000,
+    minConfidence,
+  } satisfies CaptchaOptions);
+}
+
+/** Result of `PredicateBrowserAgent.step()`: the goal of the executed step and whether it succeeded. */
+export type StepOutcome = { stepGoal: string; ok: boolean };
+
+/**
+ * High-level agent that drives a `RuntimeAgent` one step at a time while keeping
+ * a short, bounded history of step results for the LLM prompt.
+ *
+ * Construction applies `config.captcha` to the runtime (which throws when the
+ * policy is "callback" and no handler is given) and wires the prompt hooks
+ * (`compactPromptBuilder`, `compactPromptPostprocessor`, history summary) into
+ * the underlying `RuntimeAgent`.
+ */
+export class PredicateBrowserAgent {
+  readonly runtime: AgentRuntime;
+  readonly executor: LLMProvider;
+  readonly visionExecutor?: LLMProvider;
+  readonly visionVerifier?: LLMProvider;
+  readonly config: PredicateBrowserAgentConfig;
+
+  // Most recent step results, formatted as "<goal> -> ok|fail".
+  private history: string[] = [];
+  // NOTE(review): nothing in this class increments this counter, so the
+  // `maxVisionCalls` budget checked in step() can never be exhausted as written.
+  // Presumably RuntimeAgent has to report vision usage — TODO confirm and wire up.
+  private visionCallsUsed = 0;
+  private runner: RuntimeAgent;
+
+  /**
+   * @param opts.runtime runtime the agent acts through
+   * @param opts.executor LLM used for structured (snapshot-based) actions
+   * @param opts.visionExecutor optional LLM forwarded to `RuntimeAgent`
+   * @param opts.visionVerifier optional LLM forwarded to `RuntimeAgent`
+   * @param opts.config overrides merged (shallowly) over the defaults below
+   * @throws Error when the captcha configuration is invalid
+   */
+  constructor(opts: {
+    runtime: AgentRuntime;
+    executor: LLMProvider;
+    visionExecutor?: LLMProvider;
+    visionVerifier?: LLMProvider;
+    config?: PredicateBrowserAgentConfig;
+  }) {
+    this.runtime = opts.runtime;
+    this.executor = opts.executor;
+    this.visionExecutor = opts.visionExecutor;
+    this.visionVerifier = opts.visionVerifier;
+    // Shallow merge: a caller-supplied `vision` / `captcha` object replaces the
+    // default object entirely.
+    this.config = {
+      permissionStartup: null,
+      permissionRecovery: null,
+      vision: { enabled: false, maxVisionCalls: 0 },
+      captcha: { policy: 'abort', handler: null },
+      historyLastN: 0,
+      ...(opts.config ?? {}),
+    };
+
+    applyCaptchaConfigToRuntime(this.runtime, this.config.captcha);
+
+    // NOTE(review): the `as any` cast hides any mismatch between these options
+    // and RuntimeAgent's constructor type; presumably it works around optional-
+    // property strictness — TODO confirm and drop the cast if possible.
+    this.runner = new RuntimeAgent({
+      runtime: this.runtime,
+      executor: this.executor,
+      visionExecutor: this.visionExecutor,
+      visionVerifier: this.visionVerifier,
+      structuredPromptBuilder: this.config.compactPromptBuilder,
+      domContextPostprocessor: this.config.compactPromptPostprocessor,
+      historySummaryProvider: () => {
+        const limit = this.historyLimit();
+        if (limit <= 0) return '';
+        const recent = this.history.slice(Math.max(0, this.history.length - limit));
+        return historySummary(recent);
+      },
+    } as any);
+  }
+
+  /**
+   * Number of history entries to keep and expose to the prompt.
+   *
+   * A NaN `historyLastN` is treated as 0: NaN is neither `<= 0` nor comparable
+   * with the history length, so without this guard history would grow without
+   * bound and be sent to the LLM in full.
+   */
+  private historyLimit(): number {
+    const configured = this.config.historyLastN ?? 0;
+    if (Number.isNaN(configured)) return 0;
+    return Math.max(0, configured);
+  }
+
+  /** Appends "<goal> -> ok|fail" and trims the list to the history limit (no-op when the limit is 0). */
+  private recordHistory(stepGoal: string, ok: boolean): void {
+    const limit = this.historyLimit();
+    if (limit <= 0) return;
+    this.history.push(`${stepGoal} -> ${ok ? 'ok' : 'fail'}`);
+    if (this.history.length > limit) {
+      this.history = this.history.slice(this.history.length - limit);
+    }
+  }
+
+  /**
+   * Runs a single step through the underlying `RuntimeAgent` and records its result.
+   *
+   * When vision is enabled with a positive `maxVisionCalls` and the vision-call
+   * counter has reached it, the step is executed with the vision executor
+   * switched off.
+   *
+   * @returns the goal of the executed step together with the boolean result of `runStep`
+   */
+  async step(opts: {
+    taskGoal: string;
+    step: RuntimeStep;
+    onStepStart?: (ctx: StepHookContext) => void | Promise<void>;
+    onStepEnd?: (ctx: StepHookContext) => void | Promise<void>;
+  }): Promise<StepOutcome> {
+    let step = opts.step;
+
+    const maxVisionCalls = Math.max(0, this.config.vision?.maxVisionCalls ?? 0);
+    if (
+      this.config.vision?.enabled &&
+      maxVisionCalls > 0 &&
+      this.visionCallsUsed >= maxVisionCalls
+    ) {
+      // Budget exhausted: copy the step so the caller's object is not mutated.
+      step = { ...step, visionExecutorEnabled: false, maxVisionExecutorAttempts: 0 };
+    }
+
+    const ok = await this.runner.runStep({
+      taskGoal: opts.taskGoal,
+      step,
+      onStepStart: opts.onStepStart,
+      onStepEnd: opts.onStepEnd,
+    });
+
+    this.recordHistory(step.goal, ok);
+    return { stepGoal: step.goal, ok };
+  }
+
+  /**
+   * Runs the given steps in order via `step()`.
+   *
+   * @returns `false` as soon as a step fails while `stopOnFailure` is true (the
+   *   default); otherwise `true` once every step has been attempted — including
+   *   when `stopOnFailure` is false and some steps failed.
+   */
+  async run(opts: {
+    taskGoal: string;
+    steps: RuntimeStep[];
+    onStepStart?: (ctx: StepHookContext) => void | Promise<void>;
+    onStepEnd?: (ctx: StepHookContext) => void | Promise<void>;
+    stopOnFailure?: boolean;
+  }): Promise<boolean> {
+    const stopOnFailure = opts.stopOnFailure ?? true;
+    for (const step of opts.steps) {
+      const out = await this.step({
+        taskGoal: opts.taskGoal,
+        step,
+        onStepStart: opts.onStepStart,
+        onStepEnd: opts.onStepEnd,
+      });
+      if (stopOnFailure && !out.ok) return false;
+    }
+    return true;
+  }
+}
diff --git a/src/index.ts b/src/index.ts
@@ -91,6 +91,13 @@ export { SentienceDebugger, PredicateDebugger } from './debugger';
 export { RuntimeAgent } from './runtime-agent';
 export type { RuntimeStep, StepVerification } from './runtime-agent';
 export { parseVisionExecutorAction, executeVisionExecutorAction } from './vision-executor';
+export {
+  PredicateBrowserAgent,
+  type PredicateBrowserAgentConfig,
+  type PermissionRecoveryConfig,
+  type VisionFallbackConfig,
+  type CaptchaConfig,
+  // Return type of `PredicateBrowserAgent.step()`; exported so callers can name it.
+  type StepOutcome,
+} from './agents/browser-agent';
 export * from './captcha/types';
 export * from './captcha/strategies';
 export * from './tools';
diff --git a/src/runtime-agent.ts b/src/runtime-agent.ts
@@ -58,20 +58,43 @@ export class RuntimeAgent {
   readonly shortCircuitCanvas: boolean;
 
   private structuredLLM: LLMInteractionHandler;
+  // Optional override for prompt construction. When set, its system/user prompts
+  // are sent straight to `executor.generate` (temperature 0.0) instead of going
+  // through `structuredLLM.queryLLM`.
+  private structuredPromptBuilder?:
+    | ((
+        taskGoal: string,
+        stepGoal: string,
+        domContext: string,
+        snap: Snapshot,
+        historySummary: string
+      ) => { systemPrompt: string; userPrompt: string })
+    | undefined;
+  // Optional transform applied to the DOM context string right after it is built.
+  private domContextPostprocessor?: ((domContext: string) => string) | undefined;
+  // Optional supplier of a recent-steps summary; its result is trimmed before use.
+  private historySummaryProvider?: (() => string) | undefined;
 
+  /**
+   * Stores the runtime and LLM providers, creates the structured LLM handler
+   * for `executor`, and keeps the optional prompt hooks as given (each stays
+   * undefined when omitted). `shortCircuitCanvas` defaults to true.
+   */
   constructor(opts: {
     runtime: AgentRuntime;
     executor: LLMProvider;
     visionExecutor?: LLMProvider;
     visionVerifier?: LLMProvider;
     shortCircuitCanvas?: boolean;
+    structuredPromptBuilder?: (
+      taskGoal: string,
+      stepGoal: string,
+      domContext: string,
+      snap: Snapshot,
+      historySummary: string
+    ) => { systemPrompt: string; userPrompt: string };
+    domContextPostprocessor?: (domContext: string) => string;
+    historySummaryProvider?: () => string;
   }) {
     this.runtime = opts.runtime;
     this.executor = opts.executor;
     this.visionExecutor = opts.visionExecutor;
     this.visionVerifier = opts.visionVerifier;
     this.shortCircuitCanvas = opts.shortCircuitCanvas ?? true;
     this.structuredLLM = new LLMInteractionHandler(this.executor, false);
+    this.structuredPromptBuilder = opts.structuredPromptBuilder;
+    this.domContextPostprocessor = opts.domContextPostprocessor;
+    this.historySummaryProvider = opts.historySummaryProvider;
   }
 
   async runStep(opts: {
@@ -210,8 +233,31 @@ export class RuntimeAgent {
     snap: Snapshot;
   }): Promise<string> {
     const { taskGoal, step, snap } = opts;
-    const domContext = this.structuredLLM.buildContext(snap, step.goal);
-    const combinedGoal = `${taskGoal}\n\nSTEP: ${step.goal}`;
+    let domContext = this.structuredLLM.buildContext(snap, step.goal);
+    if (this.domContextPostprocessor) {
+      domContext = this.domContextPostprocessor(domContext);
+    }
+
+    const historySummary = (this.historySummaryProvider?.() ?? '').trim();
+
+    if (this.structuredPromptBuilder) {
+      const { systemPrompt, userPrompt } = this.structuredPromptBuilder(
+        taskGoal,
+        step.goal,
+        domContext,
+        snap,
+        historySummary
+      );
+      const resp = await this.executor.generate(systemPrompt, userPrompt, { temperature: 0.0 });
+      return this.extractActionFromText(resp.content);
+    }
+
+    let combinedGoal = taskGoal;
+    if (historySummary) {
+      combinedGoal = `${taskGoal}\n\nRECENT STEPS:\n${historySummary}`;
+    }
+    combinedGoal = `${combinedGoal}\n\nSTEP: ${step.goal}`;
+
     const resp = await this.structuredLLM.queryLLM(domContext, combinedGoal);
     return this.extractActionFromText(resp.content);
   }
diff --git a/tests/predicate-browser-agent.test.ts b/tests/predicate-browser-agent.test.ts