token usage & video recording

SentienceDEV · SentienceDEV · commit c5541b0b07b7 · 2026-02-14T21:34:16.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -88,6 +88,31 @@ const agent = new PredicateBrowserAgent({
 - `domContextPostprocessor(...)`
 - `historySummaryProvider(...)`
 
+#### PredicateBrowserAgent: opt-in token usage accounting (best-effort)
+
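+To measure token spend, set `tokenUsageEnabled: true`. Accounting is best-effort: it depends on the provider reporting token counts, and a response that carries no token counts still increments `calls` but adds zero tokens: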
+
+```ts
+const agent = new PredicateBrowserAgent({
+  runtime,
+  executor: llm,
+  config: {
+    tokenUsageEnabled: true,
+  },
+});
+
+const usage = agent.getTokenUsage();
+agent.resetTokenUsage();
+```
+
+#### RuntimeAgent: actOnce without step lifecycle (orchestrators)
+
+`RuntimeAgent` now exposes `actOnce(...)` helpers that execute exactly one action **without** calling `runtime.beginStep()` / `runtime.emitStepEnd()`. They are intended for external orchestrators (e.g. WebBench) that already own the step lifecycle and only need the SDK’s snapshot-first propose-and-execute block.
+
+- `await agent.actOnce(...) -> string`
+- `await agent.actOnceWithSnapshot(...) -> { action, snap }`
+- `await agent.actOnceResult(...) -> { action, snap, usedVision }`
+
 ### 2026-02-13
 
 #### Expanded deterministic verifications (adaptive resnapshotting)
diff --git a/examples/agent/README.md b/examples/agent/README.md
@@ -2,4 +2,5 @@ Predicate agent examples.
 
 - `predicate-browser-agent-minimal.ts`: minimal `PredicateBrowserAgent` usage.
 - `predicate-browser-agent-custom-prompt.ts`: customize the compact prompt builder.
+- `predicate-browser-agent-video-recording-playwright.ts`: enable Playwright video recording through the browser context's `recordVideo` option (the recommended approach).
 
diff --git a/examples/agent/predicate-browser-agent-video-recording-playwright.ts b/examples/agent/predicate-browser-agent-video-recording-playwright.ts
@@ -0,0 +1,101 @@
+/**
+ * Example: PredicateBrowserAgent + Playwright video recording (recommended approach).
+ *
+ * Video recording is a Playwright context feature (recordVideo), not an agent constructor flag.
+ * This example shows how to:
+ * 1) create a Playwright context with recordVideo enabled
+ * 2) wrap the existing page with SentienceBrowser.fromPage(...)
+ * 3) use AgentRuntime + PredicateBrowserAgent normally
+ *
+ * Usage:
+ *   ts-node examples/agent/predicate-browser-agent-video-recording-playwright.ts
+ */
+
+import { chromium } from 'playwright';
+import * as fs from 'fs';
+import * as path from 'path';
+
+import {
+  AgentRuntime,
+  PredicateBrowserAgent,
+  type PredicateBrowserAgentConfig,
+  SentienceBrowser,
+  type RuntimeStep,
+} from '../../src';
+import { createTracer } from '../../src/tracing/tracer-factory';
+import { LLMProvider, type LLMResponse } from '../../src/llm-provider';
+import type { Snapshot } from '../../src/types';
+
+function createBrowserAdapter(browser: SentienceBrowser) {
+  return {
+    snapshot: async (_page: any, options?: Record<string, any>): Promise<Snapshot> => {
+      return await browser.snapshot(options);
+    },
+  };
+}
+
+class FixedActionProvider extends LLMProvider {
+  constructor(private action: string) {
+    super();
+  }
+  get modelName(): string {
+    return 'fixed-action';
+  }
+  supportsJsonMode(): boolean {
+    return false;
+  }
+  async generate(_system: string, _user: string, _opts: Record<string, any> = {}): Promise<LLMResponse> {
+    return { content: this.action, modelName: this.modelName };
+  }
+}
+
+async function main() {
+  const apiKey = (process.env.PREDICATE_API_KEY ||
+    process.env.SENTIENCE_API_KEY) as string | undefined;
+
+  const recordingsDir = path.join(process.cwd(), 'recordings');
+  if (!fs.existsSync(recordingsDir)) fs.mkdirSync(recordingsDir, { recursive: true });
+
+  const pw = await chromium.launch({ headless: false });
+  const context = await pw.newContext({
+    recordVideo: { dir: recordingsDir, size: { width: 1280, height: 720 } },
+  });
+  const page = await context.newPage();
+
+  const runId = 'predicate-browser-agent-video-recording';
+  const tracer = await createTracer({ apiKey, runId, uploadTrace: false });
+
+  // Wrap existing Playwright page.
+  const sentienceBrowser = SentienceBrowser.fromPage(page, apiKey);
+
+  try {
+    await page.goto('https://example.com');
+    await page.waitForLoadState('networkidle');
+
+    const runtime = new AgentRuntime(createBrowserAdapter(sentienceBrowser), page as any, tracer);
+    const config: PredicateBrowserAgentConfig = { historyLastN: 0 };
+
+    const agent = new PredicateBrowserAgent({
+      runtime,
+      executor: new FixedActionProvider('FINISH()'),
+      config,
+    });
+
+    const out = await agent.step({
+      taskGoal: 'Open example.com',
+      step: { goal: 'Finish immediately' } satisfies RuntimeStep,
+    });
+    console.log(`step ok: ${out.ok}`);
+    console.log(`videos will be saved under: ${recordingsDir}`);
+  } finally {
+    await tracer.close(true);
+    await context.close(); // flush video
+    await pw.close();
+  }
+}
+
+main().catch(err => {
+  console.error(err);
+  process.exit(1);
+});
+
diff --git a/src/agents/browser-agent.ts b/src/agents/browser-agent.ts
@@ -1,7 +1,7 @@
 import type { Snapshot, StepHookContext } from '../types';
 import type { PermissionPolicy } from '../browser';
 import type { AgentRuntime } from '../agent-runtime';
-import type { LLMProvider } from '../llm-provider';
+import { LLMProvider } from '../llm-provider';
 import { RuntimeAgent } from '../runtime-agent';
 import type { RuntimeStep } from '../runtime-agent';
 import type { CaptchaOptions } from '../captcha/types';
@@ -46,6 +46,9 @@ export interface PredicateBrowserAgentConfig {
   // Prompt / token controls
   historyLastN?: number; // 0 disables LLM-facing step history
 
+  // Opt-in: track token usage from LLM provider responses (best-effort).
+  tokenUsageEnabled?: boolean;
+
   // Compact prompt customization
   // builder(taskGoal, stepGoal, domContext, snapshot, historySummary) -> {systemPrompt, userPrompt}
   compactPromptBuilder?: (
@@ -97,6 +100,113 @@ function applyCaptchaConfigToRuntime(runtime: AgentRuntime, cfg: CaptchaConfig |
   } satisfies CaptchaOptions);
 }
 
+type TokenUsageTotals = {
+  calls: number;
+  promptTokens: number;
+  completionTokens: number;
+  totalTokens: number;
+};
+
+class TokenUsageCollector {
+  private byRole: Record<string, TokenUsageTotals> = {};
+  private byModel: Record<string, TokenUsageTotals> = {};
+
+  record(role: string, resp: any): void {
+    const pt = typeof resp?.promptTokens === 'number' ? resp.promptTokens : 0;
+    const ct = typeof resp?.completionTokens === 'number' ? resp.completionTokens : 0;
+    const tt = typeof resp?.totalTokens === 'number' ? resp.totalTokens : pt + ct;
+    const model = String(resp?.modelName ?? 'unknown') || 'unknown';
+
+    const bump = (dst: Record<string, TokenUsageTotals>, key: string) => {
+      const cur =
+        dst[key] ??
+        ({ calls: 0, promptTokens: 0, completionTokens: 0, totalTokens: 0 } as TokenUsageTotals);
+      cur.calls += 1;
+      cur.promptTokens += Math.max(0, pt);
+      cur.completionTokens += Math.max(0, ct);
+      cur.totalTokens += Math.max(0, tt);
+      dst[key] = cur;
+    };
+
+    bump(this.byRole, role);
+    bump(this.byModel, model);
+  }
+
+  reset(): void {
+    this.byRole = {};
+    this.byModel = {};
+  }
+
+  summary(): {
+    total: TokenUsageTotals;
+    byRole: Record<string, TokenUsageTotals>;
+    byModel: Record<string, TokenUsageTotals>;
+  } {
+    const sum = (src: Record<string, TokenUsageTotals>): TokenUsageTotals => {
+      return Object.values(src).reduce(
+        (acc, v) => ({
+          calls: acc.calls + v.calls,
+          promptTokens: acc.promptTokens + v.promptTokens,
+          completionTokens: acc.completionTokens + v.completionTokens,
+          totalTokens: acc.totalTokens + v.totalTokens,
+        }),
+        { calls: 0, promptTokens: 0, completionTokens: 0, totalTokens: 0 }
+      );
+    };
+    return { total: sum(this.byRole), byRole: this.byRole, byModel: this.byModel };
+  }
+}
+
+class TokenAccountingProvider extends LLMProvider {
+  constructor(
+    private inner: LLMProvider,
+    private collector: TokenUsageCollector,
+    private role: string
+  ) {
+    super();
+  }
+  get modelName(): string {
+    return this.inner.modelName;
+  }
+  supportsJsonMode(): boolean {
+    return this.inner.supportsJsonMode();
+  }
+  supportsVision(): boolean {
+    return this.inner.supportsVision?.() ?? false;
+  }
+  async generate(
+    systemPrompt: string,
+    userPrompt: string,
+    options: Record<string, any> = {}
+  ): Promise<any> {
+    const resp = await this.inner.generate(systemPrompt, userPrompt, options);
+    try {
+      this.collector.record(this.role, resp);
+    } catch {
+      // best-effort
+    }
+    return resp;
+  }
+  async generateWithImage(
+    systemPrompt: string,
+    userPrompt: string,
+    imageBase64: string,
+    options: Record<string, any> = {}
+  ): Promise<any> {
+    const fn = (this.inner as any).generateWithImage;
+    if (typeof fn !== 'function') {
+      throw new Error('Inner provider does not implement generateWithImage');
+    }
+    const resp = await fn.call(this.inner, systemPrompt, userPrompt, imageBase64, options);
+    try {
+      this.collector.record(this.role, resp);
+    } catch {
+      // best-effort
+    }
+    return resp;
+  }
+}
+
 export type StepOutcome = { stepGoal: string; ok: boolean };
 
 export class PredicateBrowserAgent {
@@ -109,6 +219,7 @@ export class PredicateBrowserAgent {
   private history: string[] = [];
   private visionCallsUsed = 0;
   private runner: RuntimeAgent;
+  private tokenUsage: TokenUsageCollector | null = null;
 
   constructor(opts: {
     runtime: AgentRuntime;
@@ -117,10 +228,22 @@ export class PredicateBrowserAgent {
     visionVerifier?: LLMProvider;
     config?: PredicateBrowserAgentConfig;
   }) {
+    const tokenUsageEnabled = Boolean(opts.config?.tokenUsageEnabled);
+    const collector = tokenUsageEnabled ? new TokenUsageCollector() : null;
+
     this.runtime = opts.runtime;
-    this.executor = opts.executor;
-    this.visionExecutor = opts.visionExecutor;
-    this.visionVerifier = opts.visionVerifier;
+    this.tokenUsage = collector;
+    this.executor = collector
+      ? new TokenAccountingProvider(opts.executor, collector, 'executor')
+      : opts.executor;
+    this.visionExecutor =
+      collector && opts.visionExecutor
+        ? new TokenAccountingProvider(opts.visionExecutor, collector, 'vision_executor')
+        : opts.visionExecutor;
+    this.visionVerifier =
+      collector && opts.visionVerifier
+        ? new TokenAccountingProvider(opts.visionVerifier, collector, 'vision_verifier')
+        : opts.visionVerifier;
     this.config = {
       permissionStartup: null,
       permissionRecovery: null,
@@ -148,6 +271,17 @@ export class PredicateBrowserAgent {
     } as any);
   }
 
+  getTokenUsage(): any {
+    if (!this.tokenUsage) {
+      return { enabled: false, reason: 'tokenUsageEnabled is false' };
+    }
+    return { enabled: true, ...this.tokenUsage.summary() };
+  }
+
+  resetTokenUsage(): void {
+    this.tokenUsage?.reset();
+  }
+
   private recordHistory(stepGoal: string, ok: boolean) {
     const n = Math.max(0, this.config.historyLastN ?? 0);
     if (n <= 0) return;
diff --git a/tests/predicate-browser-agent.test.ts b/tests/predicate-browser-agent.test.ts
@@ -44,7 +44,13 @@ class ProviderStub extends LLMProvider {
   ): Promise<LLMResponse> {
     this.calls.push({ system: systemPrompt, user: userPrompt, options });
     const content = this.responses.length ? (this.responses.shift() as string) : 'FINISH()';
-    return { content, modelName: this.modelName };
+    return {
+      content,
+      modelName: this.modelName,
+      promptTokens: 11,
+      completionTokens: 7,
+      totalTokens: 18,
+    };
   }
 }
 
@@ -103,4 +109,47 @@ describe('PredicateBrowserAgent', () => {
     expect(executor.calls[0].system).toContain('SYSTEM_CUSTOM');
     expect(executor.calls[0].user).toBe('USER_CUSTOM');
   });
+
+  it('tracks token usage when opt-in enabled', async () => {
+    const sink = new MockSink();
+    const tracer = new Tracer('run', sink);
+    const page = new MockPage('https://example.com/start') as any;
+
+    const snapshots: Snapshot[] = [
+      {
+        status: 'success',
+        url: 'https://example.com/start',
+        elements: [makeClickableElement(1)],
+        timestamp: 't1',
+      },
+    ];
+
+    const browserLike = {
+      snapshot: async () => snapshots.shift() as Snapshot,
+    };
+
+    const runtime = new AgentRuntime(browserLike as any, page as any, tracer);
+    const executor = new ProviderStub(['FINISH()']);
+
+    const agent = new PredicateBrowserAgent({
+      runtime,
+      executor,
+      config: { tokenUsageEnabled: true, captcha: { policy: 'abort' } },
+    });
+
+    const out = await agent.step({
+      taskGoal: 'test',
+      step: { goal: 'No-op', maxSnapshotAttempts: 1 },
+    });
+    expect(out.ok).toBe(true);
+
+    const usage = agent.getTokenUsage();
+    expect(usage.enabled).toBe(true);
+    expect(usage.total.totalTokens).toBeGreaterThanOrEqual(18);
+    expect(usage.byRole.executor.calls).toBeGreaterThanOrEqual(1);
+
+    agent.resetTokenUsage();
+    const usage2 = agent.getTokenUsage();
+    expect(usage2.total.totalTokens).toBe(0);
+  });
 });

Original file line number	Diff line number	Diff line change
`@@ -2,4 +2,5 @@ Predicate agent examples.`
`2`	`2`
`3`	`3`	- `predicate-browser-agent-minimal.ts`: minimal `PredicateBrowserAgent` usage.
`4`	`4`	- `predicate-browser-agent-custom-prompt.ts`: customize the compact prompt builder.
	`5`	+- `predicate-browser-agent-video-recording-playwright.ts`: enable Playwright video recording via context options (recommended).
`5`	`6`