
Commit 4ba8992

andrii-harbour, tupizz, and caio-pizzol authored
feat: refactor MCP, fix Codex errors, reorganize AI agents documentation (#2711)
* feat: refactor MCP, fix Codex errors, reorganize AI agents documentation and add new content

  - Removed outdated GDPval benchmark command from evals section in AGENTS.md.
  - Updated the structure of the docs.json file to categorize AI agents under "MCP" and "Agents" groups, adding new pages for MCP and skills.
  - Introduced new documentation files for best practices, debugging, eval results, integrations, and skills, providing comprehensive guidance on using SuperDoc tools with LLMs.
  - Added detailed instructions on how to use the MCP server and its debugging features, enhancing the overall documentation for better user experience.

* feat(evals): Level 3 DOCX agent benchmark suite (#2664)

* feat(evals): add extractDocxText utility for benchmark text extraction

* feat(evals): add benchmarkMetrics assertion for Level 3 benchmark

* feat(evals): add Claude Code benchmark provider for Level 3

* feat(evals): add Codex benchmark provider for Level 3

* feat(evals): add 18 benchmark tasks for Level 3 agent comparison

* feat(evals): add benchmark report generator for Level 3

* feat(evals): add Level 3 benchmark Promptfoo config with 10 conditions

* fix(evals): fix providers and assertions for Level 3 benchmark

  - Fix cwd ENOENT: create stateDir before passing to SDK query()
  - Fix Claude Code provider: clean up, remove pathToClaudeCodeExecutable hacks
  - Fix Codex provider: match real SDK API (command_execution items, approvalPolicy)
  - Fix test assertions: match actual fixture content
    - contract.docx -> report-with-formatting.docx for heading tasks
    - [Employee Name] -> [Candidate Name] for employment-offer.docx
    - Fix $150M collateral check (XML extraction splits as "1 50")
  - Upgrade @anthropic-ai/claude-agent-sdk to ^0.2.87

* fix(evals): fix sandbox writes, add useClaudeSettings, MCP support

  - Copy fixture into stateDir so agents can write within their sandbox
  - Add stateDir fallback for output file detection
  - Add useClaudeSettings option to inherit local Claude Code config (MCP servers, skills, CLAUDE.md) via settingSources
  - Add CC-local condition for testing with user's own Claude Code setup
  - Wire superdocMcp config to attach SuperDoc MCP server via mcpServers
  - Add preeval:benchmark script to build MCP server before runs
  - Add model, maxTurns, systemPrompt config options

* test(evals): add e2e smoke test for Level 3 benchmark providers

  Standalone test script that verifies both providers end-to-end:
  - Claude baseline read/edit (without SuperDoc)
  - Claude superdoc-skill with MCP (superdoc_open → get_content → close)
  - Claude local with useClaudeSettings
  - Codex baseline read/edit (without SuperDoc)
  - Codex with SuperDoc MCP

  Run: node evals/scripts/smoke-test-benchmark.mjs --claude --codex

* feat(evals): enforce SuperDoc MCP usage via system prompt and AGENTS.md

  - Add system prompt for superdoc conditions instructing agents to use SuperDoc MCP tools exclusively, not raw unzip/XML
  - Write AGENTS.md in working directory reinforcing SuperDoc tool usage
  - Restrict CC-superdoc-skill allowedTools to Read/Glob/Grep (no Bash) so agents cannot fall back to raw DOCX manipulation
  - Add prompt reinforcement for Codex superdoc conditions
  - Verified: Claude superdoc-skill read + edit both use MCP exclusively (superdoc_open → search → edit → save → close, zero Bash calls)

* fix(evals): pass OPENAI_API_KEY to Codex SDK, update smoke tests

  - Pass process.env.OPENAI_API_KEY to new Codex({ apiKey }) so the SDK uses API key auth instead of relying on codex login session
  - Add Claude edit + MCP tests to smoke test script
  - Verified: Codex baseline read + edit pass with API key auth
  - Known: Codex MCP calls fail due to rmcp protocol incompatibility in the Codex CLI (serde error on tool calls, Transport closed)

* fix(mcp,evals): fix stdout corruption killing Codex MCP transport

  Root cause: console.debug('[super-editor] Telemetry: enabled') in Editor.ts writes to stdout when superdoc_open initializes the editor. The Codex CLI's Rust MCP client (rmcp) parses stdout as JSON-RPC and dies with "serde error expected value at line 1 column 2" on the non-JSON line, closing the transport.

  Fixes:
  - Redirect all console methods (log/info/debug/warn) to stderr in the MCP server entry point, before any imports run
  - Add mcp_auto_approve config for Codex to auto-approve MCP tool calls (approval_policy=never only covers shell commands, not MCP)
  - Add stdio wrapper script for transport debugging (logs raw bytes)
  - Use runStreamed() in Codex provider to capture full MCP event lifecycle
  - Pass minimal env to prevent other stdout pollution from deps
  - Add preflight check for MCP server build artifact

* refactor(evals): trim benchmark to 6 compact tasks for v1

  Reduce from 18 to 6 tasks (3 reading + 3 editing) for faster iteration. Full suite: 12 runs in 3 minutes, 100% pass rate on Codex baseline + superdoc-skill conditions.

  Tasks: extract headings, extract entities, extract financials, replace entity, insert section, fill placeholders.

* fix(evals): fix report generator to extract metrics from parsed output

* feat(evals): improve benchmark report with full AC metrics

  - Add per-task detail table with every metric per condition
  - Add input/output token breakdown (not just total)
  - Add p95 latency alongside median
  - Add estimated cost per task (based on model token pricing)
  - Add comprehensive recommendation with latency, token, cost, steps, and collateral comparisons between conditions
  - Fix task description extraction from vars.task fallback

* feat(evals): split benchmark metrics into individual Promptfoo columns

  Replace single benchmarkMetrics assertion with separate per-metric assertions (steps, latency, tokens, path), each with its own metric tag. Promptfoo displays these as individual columns with actual numeric values instead of a single "efficiency 1.00" score.

  Columns visible in UI: correctness, collateral, steps, latency, tokens, path

* fix(evals): create superdoc CLI wrapper on PATH for superdoc-cli condition

  The superdocOnPath flag was a no-op because the SuperDoc CLI was never installed as a binary on PATH. Now creates a shell wrapper script in the stateDir's bin/ that delegates to apps/cli/dist/index.js, and prepends it to the agent's PATH.

  Finding: even with superdoc on PATH, Codex doesn't discover or use it without explicit instruction. All superdoc-cli runs fall back to raw unzip/XML. This is valid benchmark data.

* feat(evals): enforce SuperDoc usage and fail when agents don't use it

  - benchmarkPath assertion now FAILS when superdoc-skill or superdoc-cli conditions don't use SuperDoc (was always passing before)
  - Add AGENTS.md + prompt hint for superdoc-cli condition telling agents the CLI exists on PATH with common commands
  - Split MCP and CLI AGENTS.md templates in both providers
  - Verified: all 3 Codex conditions use correct path (baseline=raw, superdoc-skill=MCP, superdoc-cli=CLI)

* feat(evals): add _summary field for readable Promptfoo cell previews

  Add a _summary line at the top of provider JSON output showing path | steps | latency | tokens at a glance. Promptfoo renders the start of the output in each table cell, so this gives immediate visibility without clicking into the detail view.

* feat(evals): add derivedMetrics and weight:0 for info-only metrics

  - Add derivedMetrics: avg_latency, avg_steps, avg_tokens, superdoc_usage_pct - computed per provider after evaluation
  - Set weight: 0 on steps/latency/tokens assertions so they report values without affecting pass/fail score
  - Only correctness, collateral, and path drive pass/fail
  - Click "Show Charts" in Promptfoo UI for visual comparison

* feat(evals): add unit labels to metric names for self-documenting UI

* revert(evals): restore original metric names

* feat(evals): add Anthropic vendor DOCX skill to benchmark matrix

  Add the Anthropic DOCX skill (from anthropics/skills repo) as the vendor condition. When vendorSkill: true, the skill is installed as AGENTS.md in the working directory, teaching agents to use unzip/XML for reading and docx-js for creation.

  This completes the benchmark matrix:
  - baseline: no skill, agent figures it out
  - vendor: Anthropic's DOCX skill (unzip + docx-js)
  - superdoc-skill: SuperDoc MCP server
  - superdoc-cli: SuperDoc CLI on PATH
  - choice: all available, agent picks

* refactor(evals): clean up benchmark config to 4 conditions × 2 agents

* fix(evals): use CLAUDE.md instead of AGENTS.md for Claude Code provider

  Claude Agent SDK reads CLAUDE.md (not AGENTS.md) for project context. Write vendor skill and CLI instructions as CLAUDE.md in the stateDir, and enable settingSources: ['project'] so the SDK loads it.

* feat(docs): document Level 3 DOCX agent benchmark in CLAUDE.md

* docs(evals): add guide for reading Level 3 benchmark results

* docs(evals): add PRD for benchmark v2 document fidelity scoring

* Revert "docs(evals): add PRD for benchmark v2 document fidelity scoring"

  This reverts commit 85108ac.

* feat(evals): add DOCX fidelity checker utility

* feat(evals): add v2 fixture documents with rich formatting

  Creates 4 DOCX fixtures designed to be fragile under raw XML edits:
  - consulting-agreement.docx: bold defined terms, italic refs, 6 heading sections, $250k indemnification cap, net 45 payment terms
  - pricing-proposal.docx: 4-row pricing table with shaded header, right-aligned prices, US Letter page size
  - contract-redlines.docx: 3 tracked insertions + 2 deletions by Jane Editor, 2 reviewer comments by Bob Reviewer
  - policy-manual.docx: 3-level nested numbered list (1./1.1/a)), header/footer with page numbers, page breaks between sections

  Adds create-v2-fixtures.mjs generator script and docx@9.6.1 dev dependency.

* feat(evals): add benchmarkFidelity and benchmarkDiff assertions

* feat(evals): add 6 fidelity-sensitive v2 benchmark tasks

* feat(evals): add benchmark v2 with document fidelity scoring

  New capabilities:
  - docx-fidelity.mjs: OOXML structural checker (formatting, styles, numbering, tracked changes, comments, tables, XML diff)
  - benchmarkFidelity assertion: runs fidelity checks on output DOCX
  - benchmarkDiff assertion: measures XML change ratio (surgical vs rewrite)

  New fixtures (all synthetic names):
  - consulting-agreement.docx: bold terms, italic refs, numbered sections
  - pricing-proposal.docx: table with alignment and styled header
  - contract-redlines.docx: existing tracked changes and comments
  - policy-manual.docx: 3-level nested numbered lists

  6 new fidelity tasks (CEO examples):
  - Mixed formatting replace (bold preservation)
  - Table cell edit (structure preservation)
  - Tracked changes edit (annotation survival)
  - Nested list insert (numbering continuation)
  - Multi-step workflow (heading style check)
  - Edit with existing annotations (comment survival)

  92 tests total: 69 checks.cjs + 23 docx-fidelity

* fix(evals): fix 3 fidelity assertion bugs found in first v2 run

  1. outputFile pointed to unedited fixture copy instead of localDocPath (the file the agent actually edits in stateDir)
  2. Comment IDs in fidelity checks used "0","1" but fixture has "1","2"
  3. Table cell text used exact match instead of includes
  4. Remove overly strict paragraphStyle check on multi-step task

* feat(evals): redesign v2 tasks around proven SuperDoc advantages

  Category A — Structural creation (SuperDoc proven):
  - Create heading with Heading1 style
  - Create table with borders and data rows

  Category B — Formatting (SuperDoc proven):
  - Make specific text bold
  - Replace text preserving formatting

  Category C — Complex edits (track improvement):
  - Tracked change replacement
  - Add comment to clause

* fix(evals): stop loading user MCP servers, reduce token cost 30%

  Remove settingSources which loaded ALL user MCP servers (43 Linear, 5 Excalidraw, Gmail, etc.) adding ~4000 tokens per turn. Pass CLAUDE.md content as systemPrompt instead.

  Result: 30% cost reduction ($0.97 -> $0.68 for NDA creation).

* docs(evals): add benchmark findings and next steps document

* fix(evals): set settingSources: [] for SDK isolation mode

* docs(evals): add MCP efficiency analysis with prioritized fixes

* refactor(evals): update provider labels in benchmark configuration for clarity

  Changed labels for several providers in the promptfooconfig.benchmark.yaml file to better reflect their functionality, including renaming 'CC-vendor' to 'CC-with-docx-skill', 'CC-superdoc-skill' to 'CC-superdoc-mcp', and others for consistency and improved understanding.

* feat(evals): update agent conditions and documentation for SuperDoc MCP usage

* feat(sdk): optimize tool definitions and prompts for efficient MCP workflows (#2722)

* feat(sdk): update tool definitions for efficient multi-block workflows

  - superdoc_edit: emphasize markdown insert for multi-section creation
  - superdoc_create: direct to markdown/mutations for multiple items
  - superdoc_mutations: document create steps and batch format pattern
  - superdoc_format: direct to mutations for multi-item formatting
  - superdoc_search: clarify ref lifecycle within vs across batches
  - system-prompt: add efficient document creation workflow

* feat(evals,sdk): add efficient workflow patterns to all agent touchpoints

  - Update provider SUPERDOC_SYSTEM_PROMPT with markdown insert and mutations batch examples (what CC actually reads as system prompt)
  - Update Codex AGENTS.md with same efficient patterns
  - Update MCP header prompt with "when to use which tool" guide
  - Increase CC maxTurns from 20 to 35 (both CC failures were at 21)
  - Regenerate SDK artifacts and rebuild MCP server

* feat(evals): enable tool search to reduce token overhead

* docs(ai): add markdown insert pattern and formatting guidance

* docs(ai): add efficient patterns to MCP how-to-use guide

* fix(evals): remove debug console.log that dumped every SDK message

* feat(document-api): add alignment field to StyleApplyStep and StyleApplyInput types

* fix(document-api): keep inline required on StyleApplyInput, guard optional inline in step executors

* feat(document-api): add alignment to format.apply step JSON schema

* feat(super-editor): support alignment in format.apply mutation step

* docs(sdk): update tool descriptions to show alignment inside format.apply step

* feat(document-api): add scope: block to format.apply for full-paragraph formatting

* feat(document-api): allow placement and BlockNodeAddress target for markdown inserts

* chore: regenerate SDK artifacts and docs from updated contract

* feat(evals): add new NDA documents and implement interactive DOCX output reviewer

* fix: address PR review — minProperties, RichContentInsertInput type, deduplicate alignment constant

* Revert "fix: address PR review — minProperties, RichContentInsertInput type, deduplicate alignment constant"

  This reverts commit 4c04ebd.

* fix(document-api): add minProperties, type export, shared alignment constant

* docs(sdk): require fontSize on headings after markdown insert

* docs(sdk): context-driven formatting guidance for markdown inserts

* docs(sdk): only set properties explicitly present in document blocks

* feat(super-editor): resolve default fontSize in get_content blocks response

* fix(super-editor): fallback to 10pt default when styles omit fontSize

* fix(super-editor): resolve fontSize per-block via style chain in get_content

* test(super-editor): add fontSize style chain resolution tests for blocks.list

* docs(sdk): guide agents to match uppercase title conventions

* feat(document-api): update JSON schema and documentation for mutations and system prompts

* refactor: enhance evaluation suite with new configurations and documents

  - Updated .gitignore to include new artifacts and temporary files.
  - Refactored package.json scripts for improved evaluation commands and added a clean script.
  - Introduced new configuration files for benchmark and execution tests, enhancing the evaluation framework.
  - Added detailed documentation on efficiency analysis and findings from the Level 3 benchmark.

* refactor(evals): update entity names in documentation and tasks

* feat(docs): add AI documentation and enhance getting started guide

* fix(evals): update execution promptfoo configuration and remove obsolete documents

  - Added a blank line in the execution promptfoo configuration for clarity.
  - Deleted outdated efficiency analysis, findings, and how-to-read-results documents to streamline the documentation.

* chore(evals): update README and remove obsolete DOCX files

* feat: improve agent redline targeting and validation (#2764)

* fix: refresh lockfile for evals deps

* fix: update documentation and address comments

  - Added new routing in docs.json for the getting started AI overview.
  - Updated links in best-practices.mdx, debugging.mdx, and integrations.mdx to reflect new paths.
  - Adjusted eval-results.mdx to correct the number of models tested and updated references to LLM tools.
  - Removed outdated getting-started/ai.mdx and system-prompt-mcp.md files.
  - Enhanced error handling in mcp-stdio-wrapper.mjs and updated paths in various scripts and configurations.
  - Refactored benchmark scripts and configurations to improve clarity and functionality.

* refactor: consolidate shared logic for benchmark providers

  - Introduced a new `agent-harness.mjs` file to centralize common functionality for Claude Code and Codex benchmark providers.
  - Refactored existing code in `claude-code-agent.mjs` and `codex-agent.mjs` to utilize shared methods for setup, preflight checks, and skill/CLI installation.
  - Updated paths and removed redundant code to enhance clarity and maintainability.
  - Adjusted test fixtures path in `docx-fidelity.test.mjs` for consistency.

* docs: update LLM tools documentation with action details

* feat(session-manager): add telemetry metadata for document editing source

* docs: add new AI getting started page with redirect to overview

* chore: update pnpm-lock.yaml with new dependencies and version updates

  - Updated `@inquirer/checkbox` and `@inquirer/confirm` dependencies to use the latest types.
  - Cleaned up optional dependencies and ensured compatibility with existing packages.

* docs(sd-2451): align AI doc voice with brand guidelines (#2802)

  Voice pass against brand.md for the new AI/MCP docs. No content changes — just phrasing that matched brand rules more directly.

  - skills.mdx: drop "COMING SOON" tag and rephrase "coming soon" to "we haven't shipped skills yet" per brand voice rule preferring "we haven't built that yet" over roadmap language
  - llm-tools.mdx Note: rewrite "more tools are being added" to lead with what works today and point to custom tools
  - llm-tools.mdx: simplify "enforces mutual exclusivity constraints" to "checks that arguments are compatible"
  - eval-results.mdx: simplify "any scenario where latency is not the primary constraint" to "any case where speed doesn't matter"
  - best-practices.mdx: split semicolon heading to use a dash

---------

Co-authored-by: Tadeu Tupinambá <tadeu.tupiz@gmail.com>
Co-authored-by: Caio Pizzol <97641911+caio-pizzol@users.noreply.github.com>
1 parent 47777f5 commit 4ba8992

110 files changed

Lines changed: 100005 additions & 1578 deletions

AGENTS.md

Lines changed: 51 additions & 3 deletions
@@ -118,22 +118,70 @@ Many packages use `.js` files with JSDoc `@typedef` for type definitions (e.g.,

 ## AI Eval Suite

-The `evals/` directory contains a Promptfoo-based evaluation suite for validating AI tool call quality.
+The `evals/` directory contains a Promptfoo-based evaluation suite with three levels of evaluation.
+
+### Level 1: Deterministic Evals (tool selection + argument accuracy)

 | Command | What it does | Cost |
 |---------|-------------|------|
 | `pnpm --filter @superdoc-testing/evals run eval` | Run deterministic evals (reading + argument tests) | ~$0.30 |
 | `pnpm --filter @superdoc-testing/evals run eval:reading` | Run reading tool tests only | ~$0.15 |
-| `pnpm --filter @superdoc-testing/evals run eval:gdpval` | Run GDPval benchmark (Model+SuperDoc vs Model-Only) | ~$1-2 |
 | `pnpm --filter @superdoc-testing/evals run eval:view` | Open Promptfoo web UI with results | Free |
 | `pnpm --filter @superdoc-testing/evals run baseline:save <label>` | Save versioned results snapshot | Free |

 Tool definitions are extracted from `packages/sdk/tools/` via `evals/tools/extract.mjs`. Run `pnpm run generate:all` first if SDK artifacts are missing.

-Test files are YAML in `evals/tests/`. Each test has a `vars.task` prompt and JavaScript assertions that check tool call structure (Level 1: tool selection + argument accuracy, not execution).
+Test files are YAML in `evals/tests/`. Each test has a `vars.task` prompt and JavaScript assertions that check tool call structure (tool selection + argument accuracy, not execution).

 The system prompt at `evals/prompts/agent.txt` is a copy of the proven prompt from `examples/eval-demo/lib/agent.ts`. Update both when changing the prompt.

+### Level 2: GDPval Benchmark (Model+SuperDoc vs Model-Only)
+
+| Command | What it does | Cost |
+|---------|-------------|------|
+| `pnpm --filter @superdoc-testing/evals run eval:gdpval` | Run GDPval benchmark | ~$1-2 |
+
+### Level 3: DOCX Agent Benchmark (real agents, real documents)
+
+Runs actual Claude Code and Codex CLIs against DOCX tasks, comparing their performance with and without SuperDoc tools. 4 conditions x 2 agents x N tasks.
+
+**Conditions:**
+
+| Condition | What the agent gets |
+|-----------|-------------------|
+| baseline | No skill, agent figures out DOCX on its own |
+| baseline-with-docx-skill | Anthropic's DOCX skill (unzip + XML editing) |
+| superdoc-mcp | SuperDoc MCP server (`superdoc_open`, `superdoc_get_content`, etc.) |
+| superdoc-cli | SuperDoc CLI on PATH |
+
+**Tasks:** 3 reading (extract headings, entity names, financial figures) + 3 editing (replace entity name, insert section, fill placeholders).
+
+**Metrics per task:** correctness (pass/fail), collateral (no unintended changes), steps (agent turn count), latency (seconds), tokens (input + output), path (which DOCX approach was used).
+
+| Command | What it does | Cost |
+|---------|-------------|------|
+| `pnpm --filter @superdoc-testing/evals run eval:benchmark` | Run full benchmark | ~15 min |
+| `pnpm --filter @superdoc-testing/evals run eval:benchmark:codex` | Run Codex conditions only | ~8 min |
+| `pnpm --filter @superdoc-testing/evals run eval:benchmark:claude` | Run Claude Code conditions only | ~8 min |
+| `pnpm --filter @superdoc-testing/evals run eval:benchmark:report` | Generate comparison report (Markdown + CSV) | Free |
+
+**Prerequisites:**
+- `OPENAI_API_KEY` in `evals/.env` (for Codex; use `codex login --with-api-key` for API key auth)
+- Claude Code installed locally (uses local auth, no API key needed in `.env`)
+- MCP server built: `cd apps/mcp && pnpm run build`
+- CLI built: check `apps/cli/dist/index.js` exists
+
+**Key files:**
+
+| File | Purpose |
+|------|---------|
+| `evals/config/benchmark.promptfoo.yaml` | Level 3 Promptfoo config (8 providers) |
+| `evals/suites/benchmark/tests/agent-benchmark-v2.yaml` | Benchmark tasks with assertions |
+| `evals/providers/claude-code-agent.mjs` | Claude Agent SDK provider |
+| `evals/providers/codex-agent.mjs` | Codex SDK provider |
+| `evals/suites/benchmark/reports/benchmark-report.mjs` | Markdown + CSV report generator |
+| `evals/fixtures/vendor/vendor-docx-skill.md` | Anthropic's DOCX skill for baseline-with-docx-skill condition |

 ## Generated Artifacts

 These directories are produced by `pnpm run generate:all`:
apps/cli/src/__tests__/lib/validate-type-spec.test.ts

Lines changed: 36 additions & 0 deletions
@@ -72,6 +72,42 @@ describe('validateValueAgainstTypeSpec – oneOf with mixed schemas', () => {
   });
 });

+describe('validateValueAgainstTypeSpec – repeated actionable oneOf errors', () => {
+  const repeatedUnknownKeySchema: CliTypeSpec = {
+    oneOf: [
+      {
+        type: 'object',
+        properties: {
+          id: { type: 'string' },
+          op: { const: 'text.rewrite' },
+        },
+        required: ['id', 'op'],
+      },
+      {
+        type: 'object',
+        properties: {
+          id: { type: 'string' },
+          op: { const: 'text.insert' },
+        },
+        required: ['id', 'op'],
+      },
+    ],
+  };
+
+  test('surfaces the shared nested schema error instead of the generic oneOf message', () => {
+    try {
+      validateValueAgainstTypeSpec({ id: 'r1', op: 'text.rewrite', '},{': ':' }, repeatedUnknownKeySchema, 'steps[0]');
+      throw new Error('Expected CliError to be thrown');
+    } catch (error) {
+      const cliError = error as CliError;
+      expect(cliError.message).toBe('steps[0].},{ is not allowed by schema.');
+      expect((cliError.details as { selectedError?: string }).selectedError).toBe(
+        'steps[0].},{ is not allowed by schema.',
+      );
+    }
+  });
+});
+
 describe('validateValueAgainstTypeSpec – enum branch', () => {
   const enumSchema: CliTypeSpec = {
     type: 'string',
apps/cli/src/lib/operation-args.ts

Lines changed: 39 additions & 3 deletions
@@ -115,6 +115,37 @@ function extractConstValues(variants: CliTypeSpec[]): string[] {
   return values;
 }

+function isNestedValidationMessage(path: string, message: string): boolean {
+  return message.startsWith(`${path}.`) || message.startsWith(`${path}[`);
+}
+
+function selectRepeatedActionableOneOfError(path: string, errors: string[]): string | null {
+  const counts = new Map<string, number>();
+  for (const error of errors) {
+    counts.set(error, (counts.get(error) ?? 0) + 1);
+  }
+
+  let bestMessage: string | null = null;
+  let bestScore = 0;
+
+  for (const [message, count] of counts.entries()) {
+    if (count < 2) continue;
+
+    const nested = isNestedValidationMessage(path, message);
+    const isShapeError = message.includes(' is not allowed by schema.') || message.includes(' is required.');
+
+    if (!nested && !isShapeError) continue;
+
+    const score = count * 10 + (nested ? 2 : 0) + (isShapeError ? 1 : 0);
+    if (score > bestScore) {
+      bestScore = score;
+      bestMessage = message;
+    }
+  }
+
+  return bestMessage;
+}
+
 export function validateValueAgainstTypeSpec(value: unknown, schema: CliTypeSpec, path: string): void {
   if ('const' in schema) {
     if (value !== schema.const) {
@@ -136,11 +167,12 @@ export function validateValueAgainstTypeSpec(value: unknown, schema: CliTypeSpec
   }

   const allowedValues = extractConstValues(variants);
+  const selectedError = selectRepeatedActionableOneOfError(path, errors);
   const message =
     allowedValues.length > 0
       ? `${path} must be one of: ${allowedValues.join(', ')}.`
-      : `${path} must match one of the allowed schema variants.`;
-  throw new CliError('VALIDATION_ERROR', message, { errors });
+      : (selectedError ?? `${path} must match one of the allowed schema variants.`);
+  throw new CliError('VALIDATION_ERROR', message, { errors, selectedError });
 }

   if (schema.type === 'json') return;
@@ -236,7 +268,11 @@ function validateResponseValueAgainstTypeSpec(value: unknown, schema: CliTypeSpec
       errors.push(error instanceof Error ? error.message : String(error));
     }
   }
-  throw new CliError('VALIDATION_ERROR', `${path} must match one of the allowed schema variants.`, { errors });
+  const selectedError = selectRepeatedActionableOneOfError(path, errors);
+  throw new CliError('VALIDATION_ERROR', selectedError ?? `${path} must match one of the allowed schema variants.`, {
+    errors,
+    selectedError,
+  });
 }

   if (schema.type === 'json') return;
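To make the selection heuristic above concrete, a hedged sketch with hypothetical inputs:

```typescript
// Hypothetical inputs: both oneOf branches reject the same unknown key,
// so the repeated nested shape error outscores the generic variant message.
const errors = [
  "steps[0].},{ is not allowed by schema.", // from the text.rewrite branch
  "steps[0].},{ is not allowed by schema.", // from the text.insert branch
];
const picked = selectRepeatedActionableOneOfError('steps[0]', errors);
// picked === "steps[0].},{ is not allowed by schema."
// (count 2 → score 20, +2 nested, +1 shape error = 23; a message seen only once is never picked)
```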
Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
---
title: Best practices
sidebarTitle: Best practices
description: Get better results from LLM document editing — prompting, tool call patterns, and workflow tips
keywords: "llm best practices, ai document editing, prompt engineering, superdoc tools, tool calling, document automation"
---

These patterns help your LLM agent produce reliable, efficient document edits.

## Use the bundled system prompt

`getSystemPrompt()` returns a tested prompt that teaches the model how to use SuperDoc tools — targeting, workflow order, and multi-action tools. Load it once and pass it as the system message.

```typescript
import { getSystemPrompt } from '@superdoc-dev/sdk';

const systemPrompt = await getSystemPrompt();
// Pass as the system message in your LLM call
```

You can extend it with task-specific instructions. Append your own rules after the bundled prompt:

```typescript
const systemPrompt = await getSystemPrompt();
const fullPrompt = `${systemPrompt}\n\n## Additional rules\n- Use tracked changes for all edits.\n- Always search before editing.`;
```

Or start from scratch with something like this:

````markdown
You edit `.docx` files using SuperDoc intent tools. Be efficient and minimize tool calls.

## Workflow

1. **Read** — Use `superdoc_get_content` to understand the document.
2. **Search** — Use `superdoc_search` to find stable handles or block addresses.
3. **Edit** — Use the focused tool that matches the job:
   - `superdoc_edit` for insert, replace, delete, undo, redo
   - `superdoc_format` for inline or paragraph formatting
   - `superdoc_create` for paragraphs and headings
   - `superdoc_comment` for comment threads
   - `superdoc_track_changes` for review decisions
4. **Batch only when useful** — Use `superdoc_mutations` for preview/apply or atomic multi-step edits.

## Rules

- Search before mutating so targets come from fresh results.
- Use focused intent tools for normal edits.
- Use `superdoc_mutations` when you need an atomic batch or preview/apply flow.
- Set `changeMode: "tracked"` when edits need human review.
- Feed tool errors back so you can recover.
````

## Read first, search, then edit

A typical edit takes 3-5 tool calls:

1. `superdoc_get_content` — understand what's in the document
2. `superdoc_search` — find the exact location (returns stable handles/addresses)
3. Edit tool (`superdoc_edit`, `superdoc_format`, etc.) — apply the change using targets from search

This matters because handles from search results point to the exact right location. If the model guesses a block address instead of searching for it, edits land in the wrong place.
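A minimal sketch of this three-call flow, assuming the `dispatchSuperDocTool` dispatcher shown under "Feed errors back" below (the tool names are real; the argument and result field names here are illustrative, not the exact schema):

```typescript
// 1. Read: get an overview so we know what to search for
const overview = await dispatchSuperDocTool(doc, 'superdoc_get_content', {});

// 2. Search: get a stable handle for the target text
const hits = await dispatchSuperDocTool(doc, 'superdoc_search', {
  query: 'termination clause', // query field name is an assumption
});

// 3. Edit: use the fresh handle from search, never a guessed block address
await dispatchSuperDocTool(doc, 'superdoc_edit', {
  action: 'replace',
  target: hits.results[0].ref, // result shape is an assumption
  value: 'Either party may terminate this Agreement upon 30 days written notice.',
});
```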
## Minimize tool calls

Instruct the LLM to plan all edits before calling tools. A well-structured prompt like "Find the termination clause and rewrite it to allow 30-day notice" should take 3-5 calls, not 15.

Batch multiple changes only when atomic execution is genuinely helpful — use `superdoc_mutations` for that.
## Prefer markdown insert for multi-block creation

When you need to create multiple headings and paragraphs in one operation, use `superdoc_edit` with `type: "markdown"` instead of calling `superdoc_create` once per block. A single markdown insert produces the entire structure in one call.

```json
{
  "action": "insert",
  "type": "markdown",
  "value": "## Executive Summary\n\nThis agreement governs the terms of service.\n\n## Key Provisions\n\nThe following provisions apply to all parties."
}
```

After inserting, apply formatting in a single `superdoc_mutations` batch using `format.apply` steps — one step per block or range. This reduces a workflow that might otherwise take 40+ calls down to 4: read, search, insert, format.
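A hedged sketch of that follow-up batch. The `id`/`op` pair and the `format.apply` step name come from the mutations schema; the target placeholders and exact `inline` field names are assumptions:

```typescript
// Refs would come from a superdoc_search call after the markdown insert.
const [summaryRef, provisionsRef] = ['<ref:executive-summary>', '<ref:key-provisions>']; // placeholders

await dispatchSuperDocTool(doc, 'superdoc_mutations', {
  steps: [
    // One format.apply step per inserted heading, applied atomically as a batch
    { id: 's1', op: 'format.apply', target: summaryRef, inline: { bold: true, fontSize: 14 } },
    { id: 's2', op: 'format.apply', target: provisionsRef, inline: { bold: true, fontSize: 14 } },
  ],
});
```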
## Use focused tools — `superdoc_mutations` is an escape hatch

For straightforward edits, use the focused intent tools (`superdoc_edit`, `superdoc_format`, `superdoc_create`, `superdoc_list`, `superdoc_comment`). They validate arguments, give clear errors, and are easier for models to call correctly.

Reach for `superdoc_mutations` only when you need:

- Preview/apply semantics (show what will change before committing)
- Atomic multi-step edits (all-or-nothing batch)
- A workflow that would otherwise require refreshing targets between steps
## Feed errors back

`dispatchSuperDocTool` returns structured errors. Pass them back as tool results — most models self-correct on the next turn.

```typescript
try {
  const result = await dispatchSuperDocTool(doc, toolCall.function.name, JSON.parse(toolCall.function.arguments));
  messages.push({ role: 'tool', tool_call_id: toolCall.id, content: JSON.stringify(result) });
} catch (err: any) {
  // Return the error as a tool result — the model will see it and adjust
  messages.push({ role: 'tool', tool_call_id: toolCall.id, content: JSON.stringify({ error: err.message }) });
}
```
## Choose formatting values from the document

Don't hardcode formatting values. Read them from the document's existing content and match what's already there.

**Body text:** Read `fontFamily`, `fontSize`, and `color` from non-empty paragraphs with `alignment: "justify"` or `alignment: "left"`. Set `bold: false` for body paragraphs.

Many DOCX documents report `underline: true` on all blocks due to style inheritance. This is a DOCX artifact — not intentional formatting. Do not carry it forward when inserting new paragraphs.

**Headings:** Read from existing heading blocks in the document. Scale `fontSize` up relative to body text. Headings are typically bold and sometimes centered — confirm against what's already in the document rather than assuming.

```typescript
// Get content first, find a representative body paragraph
const content = await superdoc.getContent();
const bodyParagraph = content.blocks.find(
  (b) => b.type === 'paragraph' && (b.text?.trim().length ?? 0) > 0,
);
const { fontFamily, fontSize, color } = bodyParagraph?.formatting ?? {};

// Use those values when formatting inserted content
```
## Add examples for repeatable workflows

If the same kind of edit runs across many documents (e.g., always rewriting a specific clause, always adding a comment to a section), include a concrete tool call example in your system prompt, as in the sketch below. Models that see a working example of the exact tool invocation produce correct calls more reliably than models that only see the schema.
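A hypothetical few-shot appendix for a recurring clause-rewrite workflow; the embedded tool call is illustrative, not an exact schema:

```typescript
// Hypothetical few-shot block appended to the bundled prompt.
const fewShotExample = [
  '## Example',
  'Task: "Rewrite the termination clause to require 30-day notice."',
  'First call superdoc_search with {"query": "termination"}, then superdoc_edit with:',
  '{"action": "replace", "target": "<ref from search>", "value": "<new clause>", "changeMode": "tracked"}',
].join('\n');

const systemPrompt = `${await getSystemPrompt()}\n\n${fewShotExample}`;
```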
## Use tracked changes for review workflows

Add `changeMode: "tracked"` to edit tool calls, or instruct the model via the system prompt:

```
Use tracked changes for all edits so a human can review them.
```

This way every AI edit appears as a tracked change that users can accept or reject in SuperDoc or Microsoft Word.
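For example, a minimal tracked replace. `changeMode` is the documented flag; the other argument names are illustrative, and the target would come from a fresh `superdoc_search`:

```typescript
await dispatchSuperDocTool(doc, 'superdoc_edit', {
  action: 'replace',
  target: '<ref from superdoc_search>', // placeholder
  value: 'Either party may terminate this Agreement upon 30 days written notice.',
  changeMode: 'tracked', // the edit lands as a tracked change a human can accept or reject
});
```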
## Pin your model version

Use a specific model ID (e.g., `gpt-4.1` or `claude-sonnet-4-6`) rather than an alias like `gpt-4o`. Aliases can change behavior between releases and break working tool call patterns.
## Cache tools and prompts

Tools and the system prompt don't change between requests. Load them once at startup and reuse across all conversations.

```typescript
let cachedTools: any[] | null = null;
let cachedSystemPrompt: string | null = null;

async function ensureToolsLoaded() {
  if (!cachedTools) {
    const result = await chooseTools({ provider: 'openai' });
    cachedTools = result.tools;
  }
  if (!cachedSystemPrompt) {
    cachedSystemPrompt = await getSystemPrompt();
  }
  return { tools: cachedTools, systemPrompt: cachedSystemPrompt };
}
```
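Putting the last two tips together, a sketch assuming the OpenAI Node SDK and the `ensureToolsLoaded()` helper above:

```typescript
import OpenAI from 'openai';

const client = new OpenAI(); // reads OPENAI_API_KEY from the environment

const { tools, systemPrompt } = await ensureToolsLoaded();
const response = await client.chat.completions.create({
  model: 'gpt-4.1', // a pinned, specific model ID rather than a floating alias
  messages: [
    { role: 'system', content: systemPrompt },
    { role: 'user', content: 'Make the document title bold.' },
  ],
  tools,
});
```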
## Prompt examples

These prompts have been tested against the SuperDoc tool set. Use them as inspiration for your own workflows, or include them as few-shot examples in your system prompt.

### Document review

- "Find the termination clause and rewrite it to require 30-day written notice. Use tracked changes."
- "Apply yellow highlight to every sentence that contains an indemnification obligation."
- "Replace all references to 'Contractor' with 'Service Provider' and make each replacement italic with tracked changes enabled."
- "Underline every sentence that references payment terms or late fees."
- "Insert CONFIDENTIAL — DO NOT DISTRIBUTE at the very top of the document and make it bold, red, 14pt."
- "Scan the document for inconsistent capitalization of defined terms and fix them with tracked changes enabled."

### Formatting and structure

- "Format the entire document in Times New Roman, 12-point."
- "Make all Heading 2 paragraphs bold and set them to 14-point font."
- "Keep each section heading with the paragraph that follows it so they don't split across pages."
- "Remove all extra blank paragraphs and convert all double spaces after periods to single spaces."
- "Right-align all section headings."

### Content generation and editing

- "Add a new heading 'Learning Objectives' at the top, followed by a bullet list with 3 key takeaways from the document content."
- "Read the document and add a heading 'Executive Summary' at the end, followed by a one-paragraph summary and a bullet list of the 5 key provisions."
- "Find the governing law section and insert a new paragraph after it: 'Any disputes arising under this Agreement shall be resolved through binding arbitration.'"
- "Find all paragraphs that mention 'personally identifiable information' and add a comment: 'Verify PII handling complies with current data retention policy.'"
- "Convert the list of references at the end into a numbered list and restart numbering at 1."

### Search and replace

- "Rewrite all dates in this document in the format January 1, 2026."
- "Replace every occurrence of 'FY2024' with 'FY2025' throughout the document."
- "Add the § symbol before every section number reference."

## Related

- [LLM tools](/ai/agents/llm-tools) — tool catalog and SDK functions
- [How to use](/ai/agents/integrations) — step-by-step integration guide
- [Debugging](/ai/agents/debugging) — troubleshoot tool call failures
- [Document API](/document-api/overview) — the operation set behind the tools