Skip to content

Commit 1e4e294

Browse files
committed
feat(evals): add new NDA documents and implement interactive DOCX output reviewer
1 parent c3395d6 commit 1e4e294

6 files changed

Lines changed: 174 additions & 0 deletions

File tree

14.2 KB
Binary file not shown.
13.9 KB
Binary file not shown.
14 KB
Binary file not shown.

evals/providers/claude-code-agent.mjs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ export default class ClaudeCodeBenchmarkProvider {
249249
prompt: fullPrompt,
250250
options: queryOptions,
251251
})) {
252+
console.log(message);
252253

253254
if (message.type === 'assistant' && message.message?.content) {
254255
for (const block of message.message.content) {
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#!/usr/bin/env node
2+
/**
3+
* Interactive DOCX output reviewer.
4+
*
5+
* Opens each benchmark output file one at a time in Word/Preview.
6+
* Shows an AppleScript dialog explaining what to look for.
7+
* Waits for you to close the dialog before opening the next file.
8+
*
9+
* Usage: node evals/scripts/review-docx-outputs.mjs
10+
*/
11+
12+
import { existsSync, readFileSync } from 'node:fs';
13+
import { execFileSync, execSync } from 'node:child_process';
14+
import { dirname, resolve } from 'node:path';
15+
import { fileURLToPath } from 'node:url';
16+
17+
// Resolve the results file relative to this script's own location rather
// than the current working directory.
const __dirname = dirname(fileURLToPath(import.meta.url));
const RESULTS = resolve(__dirname, '../results/benchmark/latest.json');

// Nothing to review without a results file: report the path and stop.
const resultsMissing = !existsSync(RESULTS);
if (resultsMissing) {
  console.error('No results at', RESULTS);
  process.exit(1);
}

// Parsed contents of the results file; read synchronously as UTF-8.
const d = JSON.parse(readFileSync(RESULTS, 'utf8'));
26+
27+
// Collect output files with context
28+
/**
 * Parse a result's raw `response.output` into an object.
 *
 * Missing, malformed, or non-object output yields `{}` so that one bad result
 * entry is skipped (it has no `outputFile`) instead of throwing and aborting
 * the whole review session.
 *
 * @param {unknown} raw - Value of `response.output` for one result entry.
 * @returns {object} The parsed output, or `{}` when it cannot be used.
 */
function parseProviderOutput(raw) {
  if (!raw) return {};
  // Already-parsed output: use it as-is rather than re-parsing.
  if (typeof raw === 'object') return raw;
  try {
    const parsed = JSON.parse(raw);
    // JSON such as `null`, `42`, or `"text"` is valid but carries no fields.
    return parsed !== null && typeof parsed === 'object' ? parsed : {};
  } catch {
    return {};
  }
}

// One entry per reviewable output file, with the context shown in the dialog.
const files = [];
for (const r of d.results.results) {
  const o = parseProviderOutput(r.response?.output);

  // Only review outputs that still exist on disk and were marked to be kept.
  if (!o.outputFile || !existsSync(o.outputFile)) continue;
  if (!r.vars?.keepFile) continue;

  const provider = r.provider?.label || '?';
  const task = r.vars?.task || '?';
  // Prefer the test description for display; cap it at 50 characters.
  const taskShort = (r.test?.description || task).substring(0, 50);
  const status = r.error ? 'ERROR' : r.success ? 'PASS' : 'FAIL';
  const steps = o.stepCount || 0;
  const path = o.pathUsed || '?';

  files.push({ provider, task, taskShort, status, steps, path, file: o.outputFile });
}
43+
44+
// Group by task for better review flow
45+
// Bucket the collected files by the first 40 characters of their task text,
// so all outputs for the same task are reviewed back to back.
const byTask = {};
files.forEach((entry) => {
  const groupKey = entry.task.substring(0, 40);
  const bucket = byTask[groupKey] || (byTask[groupKey] = []);
  bucket.push(entry);
});
51+
52+
// What to look for per task type
53+
// Review checklists keyed by a keyword that is matched (case-insensitively)
// against the task text. Key order matters: the first matching key wins.
const reviewGuide = {
  // Find-and-replace of a party name.
  Replace: [
    'Is "SuperDoc Inc" (or "Apex Holdings") present throughout?',
    'Is the old name ("Iqidis") completely gone?',
    'Is "TechVentures" still intact (collateral)?',
    'Is the formatting preserved? (bold, font, size unchanged)',
    'Are headings, numbering, and styles intact?',
  ],

  // Appending a new section.
  section: [
    'Is the "Force Majeure" heading present at the end?',
    'Does it have Heading 1 style (large, bold)?',
    'Is the body paragraph below the heading?',
    'Is the rest of the document unchanged?',
  ],

  // Filling in a template placeholder.
  placeholder: [
    'Is "Jane Smith" present where "[Candidate Name]" was?',
    'Are ALL occurrences replaced (check header, body, signature)?',
    'Is formatting preserved around the replacement?',
  ],

  // Adding a table.
  table: [
    'Is there a summary table at the end?',
    'Does it have the correct columns and data?',
    'Does it have borders and proper alignment?',
    'Is the rest of the document unchanged?',
  ],

  // Applying bold formatting.
  bold: [
    'Is "TechVentures LLC" displayed in bold?',
    'Are ALL occurrences bold (not just the first)?',
    'Is the text content unchanged (no extra/missing words)?',
  ],

  // Tracked-change edits.
  tracked: [
    'Is there a tracked change showing $500,000 → $750,000?',
    'Is it shown as a suggestion (not a direct edit)?',
    'Enable "Track Changes" view in Word to verify.',
    'Is the rest of the document unchanged?',
  ],

  // Adding a comment.
  comment: [
    'Is there a comment on the indemnification clause?',
    'Does the comment text mention "legal review" or similar?',
    'Is the comment anchored to the right text?',
    'Open the Comments pane in Word to verify.',
  ],

  // Adding a heading.
  heading: [
    'Is there a properly styled heading (Heading 1)?',
    'Does it appear at the right position in the document?',
    'Is the heading text correct?',
  ],
};
101+
102+
/**
 * Pick the review checklist for a task.
 *
 * The task text is compared case-insensitively against each `reviewGuide`
 * key, in the object's key order; the first key contained in the task wins.
 *
 * @param {string} task - Task text to classify.
 * @returns {string[]} The matching checklist, or a generic two-item checklist
 *   when no key matches.
 */
function getGuide(task) {
  const haystack = task.toLowerCase();
  const matchedKey = Object.keys(reviewGuide).find((key) => haystack.includes(key.toLowerCase()));

  if (matchedKey !== undefined) {
    return reviewGuide[matchedKey];
  }
  return ['Check that the edit was applied correctly.', 'Check that unrelated content is unchanged.'];
}
108+
109+
function showDialog(title, message) {
110+
const escaped = message.replace(/"/g, '\\"').replace(/\n/g, '\\n');
111+
try {
112+
const result = execSync(`osascript -e 'button returned of (display dialog "${escaped}" with title "${title}" buttons {"Skip Rest", "Next"} default button "Next")'`, { encoding: 'utf8' }).trim();
113+
return result === 'Skip Rest' ? 'skip' : 'next';
114+
} catch {
115+
return 'skip';
116+
}
117+
}
118+
119+
/**
 * Open a file with the `open` command.
 *
 * The path is passed as a single argument without going through a shell, so
 * spaces, quotes, `$`, or backticks in the path are handled literally.
 *
 * @param {string} path - Path of the file to open.
 * @throws {Error} If the `open` command cannot be run or exits non-zero.
 */
function openFile(path) {
  execFileSync('open', [path]);
}
122+
123+
// Review flow
124+
// Display order of outputs within a task, keyed by the `path` recorded for
// each output; any path not listed here sorts after the listed ones.
const PATH_ORDER = { 'superdoc-skill': 0, 'superdoc-cli': 1, 'raw': 2, 'vendor-skill': 3 };
const UNKNOWN_PATH_RANK = 4;

const taskEntries = Object.entries(byTask);
let fileIndex = 0;
const total = files.length;

console.log(`Found ${total} output files across ${taskEntries.length} tasks.`);
console.log('Opening files one at a time with review guidance.');
console.log('');

for (const [, taskFiles] of taskEntries) {
  // Same order for every task so outputs are easy to compare side by side.
  taskFiles.sort(
    (a, b) => (PATH_ORDER[a.path] ?? UNKNOWN_PATH_RANK) - (PATH_ORDER[b.path] ?? UNKNOWN_PATH_RANK),
  );

  for (const f of taskFiles) {
    fileIndex++;
    const numberedGuide = getGuide(f.task).map((g, i) => `${i + 1}. ${g}`);

    const title = `[${fileIndex}/${total}] ${f.status} ${f.provider}`;
    // Lines are joined with a literal backslash-n, which showDialog/AppleScript
    // turns into a line break inside the dialog.
    const message = [
      `Task: ${f.taskShort}`,
      `Provider: ${f.provider}`,
      `Status: ${f.status} | Path: ${f.path} | Steps: ${f.steps}`,
      ``,
      `What to look for:`,
      ...numberedGuide,
      ``,
      `File: ${f.file.split('/').pop()}`,
    ].join('\\n');

    console.log(`[${fileIndex}/${total}] ${f.status} ${f.provider.padEnd(22)} ${f.taskShort}`);

    // Open the document first, then block on the dialog until the reviewer
    // chooses to continue or stop.
    openFile(f.file);
    const action = showDialog(title, message);

    if (action === 'skip') {
      console.log('Skipping remaining files.');
      process.exit(0);
    }
  }
}

console.log('');
console.log('Review complete!');

examples/collaboration/ai-node-sdk/client/src/components/chat/suggestion-chips.tsx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ const SUGGESTIONS = [
2525
label: 'Find dates',
2626
prompt: 'Find all mentions of dates and highlight them',
2727
},
28+
{
29+
label: 'Executive summary',
30+
prompt: 'Create an executive summary at the beginning of the document',
31+
},
2832
];
2933

3034
interface SuggestionChipsProps {

0 commit comments

Comments
 (0)