Skip to content

Commit aa9b168

Browse files
committed
handle markdown read
1 parent 7769b8c commit aa9b168

4 files changed

Lines changed: 205 additions & 0 deletions

File tree

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,36 @@ npm run build
9797
- `.toHaveText(text)`
9898
- `.toHaveCount(n)`
9999

100+
### Content Reading
101+
- `read(browser, options)` - Read page content
102+
- **Default format: `"raw"`** - Returns HTML suitable for Turndown
103+
- `format: "raw"` - Get cleaned HTML
104+
- `format: "markdown"` - Get high-quality markdown (uses Turndown internally)
105+
- `format: "text"` - Get plain text
106+
107+
**Examples:**
108+
```typescript
109+
import { read } from './src';
110+
111+
// Get raw HTML (default)
112+
const result = await read(browser);
113+
const html = result.content;
114+
115+
// Get high-quality markdown (uses Turndown automatically)
116+
const markdownResult = await read(browser, { format: 'markdown' });
117+
const markdown = markdownResult.content;
118+
```
119+
120+
See `examples/read-markdown.ts` for complete examples.
121+
100122
## Examples
101123

102124
See `examples/` directory:
103125
- `hello.ts` - Extension bridge verification
104126
- `basic-agent.ts` - Basic snapshot
105127
- `query-demo.ts` - Query engine
106128
- `wait-and-click.ts` - Wait and actions
129+
- `read-markdown.ts` - Reading page content and converting to markdown
107130

108131
## Testing
109132

examples/read-markdown.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/**
2+
* Example: Reading page content and converting to markdown
3+
*
4+
* This example shows how to use the read() function to get page content
5+
* and convert it to high-quality markdown using Turndown.
6+
*/
7+
8+
import { SentienceBrowser, read } from '../src';
9+
import TurndownService from 'turndown';
10+
11+
/**
 * Demo entry point: opens https://example.com and prints the page content
 * three ways - raw HTML converted locally with Turndown, markdown produced
 * by read() itself, and plain text. The browser is closed even when a step
 * fails.
 */
async function main() {
  const browser = new SentienceBrowser();
  await browser.start();

  // First 500 characters of a string, used for every printed excerpt.
  const preview = (text: string) => text.substring(0, 500);

  try {
    // Load the page and wait until network activity settles.
    await browser.getPage().goto('https://example.com');
    await browser.getPage().waitForLoadState('networkidle');

    // --- Method 1: fetch raw HTML (the default format), convert it here ---
    console.log('=== Method 1: Raw HTML + Turndown (Recommended) ===');
    const rawRead = await read(browser);
    const pageHtml = rawRead.content;

    const converter = new TurndownService({
      headingStyle: 'atx', // "#"-style headings
      bulletListMarker: '-', // "-" list bullets
      codeBlockStyle: 'fenced', // ``` code fences
    });

    // Render struck-through text with the ~~text~~ notation.
    converter.addRule('strikethrough', {
      filter: ['del', 's', 'strike'] as any,
      replacement: (content: string) => `~~${content}~~`,
    });

    // Drop elements that should not appear in the markdown output.
    converter.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);

    const localMarkdown = converter.turndown(pageHtml);
    console.log(`Markdown length: ${localMarkdown.length} characters`);
    console.log(preview(localMarkdown));
    console.log('\n');

    // --- Method 2: let read() do the markdown conversion ---
    console.log('=== Method 2: Direct markdown (High-quality via Turndown) ===');
    const markdownRead = await read(browser, { format: 'markdown' });
    const directMarkdown = markdownRead.content;
    console.log(`Markdown length: ${directMarkdown.length} characters`);
    console.log(preview(directMarkdown));
    console.log('\n');

    // --- Method 3: plain text ---
    console.log('=== Method 3: Plain text ===');
    const textRead = await read(browser, { format: 'text' });
    const plainText = textRead.content;
    console.log(`Text length: ${plainText.length} characters`);
    console.log(preview(plainText));
  } finally {
    await browser.close();
  }
}

main().catch(console.error);
67+

src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,6 @@ export { expect, Expectation } from './expect';
1111
export { Inspector, inspect } from './inspector';
1212
export { Recorder, Trace, TraceStep, record } from './recorder';
1313
export { ScriptGenerator, generate } from './generator';
14+
export { read, ReadOptions, ReadResult } from './read';
1415
export * from './types';
1516

src/read.ts

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/**
2+
* Read page content - supports raw HTML, text, and markdown formats
3+
*/
4+
5+
import { SentienceBrowser } from './browser';
6+
import TurndownService from 'turndown';
7+
8+
/**
 * Output formats understood by read(). Declared once so the option and the
 * result cannot drift apart.
 */
export type ReadFormat = 'raw' | 'text' | 'markdown';

/** Options accepted by read(). */
export interface ReadOptions {
  /** Desired output format; read() falls back to 'raw' when omitted. */
  format?: ReadFormat;
}

/** Outcome of a read() call. */
export interface ReadResult {
  /** 'success' when content was produced, 'error' otherwise. */
  status: 'success' | 'error';
  /** URL reported alongside the content. */
  url: string;
  /** Format of `content`. */
  format: ReadFormat;
  /** The page content in the given format. */
  content: string;
  /** Length reported for `content`. */
  length: number;
  /** Failure description; optional, set on the 'error' results read() builds itself. */
  error?: string;
}
20+
21+
/**
 * Convert an HTML string to markdown with Turndown.
 *
 * Uses ATX ("#") headings, "-" list bullets and fenced code blocks, renders
 * del/s/strike elements as ~~text~~, and removes script, style, nav, footer,
 * header and noscript elements from the output.
 */
function htmlToMarkdown(html: string): string {
  const converter = new TurndownService({
    headingStyle: 'atx',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced',
  });

  converter.addRule('strikethrough', {
    // NOTE(review): cast kept from the original; presumably needed because the
    // deprecated 'strike' tag is not in Turndown's typed tag list - confirm.
    filter: ['del', 's', 'strike'] as any,
    replacement: (content: string) => `~~${content}~~`,
  });

  converter.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']);

  return converter.turndown(html);
}

/**
 * Read page content as raw HTML, text, or markdown.
 *
 * The content is obtained by calling `window.sentience.read` inside the page.
 * Markdown is never requested from the page: raw HTML is fetched and then
 * converted here with Turndown.
 *
 * @param browser - SentienceBrowser instance whose current page is read
 * @param options - Read options; `format` defaults to 'raw'
 * @returns The result reported by the page for 'raw' and 'text'. For
 *   'markdown', a result holding the converted content on success; if the
 *   page reports a non-success status that result is returned unchanged, and
 *   if the conversion throws, an 'error' result with an empty content.
 * @throws Propagates any rejection of the underlying `page.evaluate` call.
 *
 * @example
 * // Get raw HTML (default)
 * const result = await read(browser);
 * const htmlContent = result.content;
 *
 * @example
 * // Get high-quality markdown (uses Turndown internally)
 * const result = await read(browser, { format: 'markdown' });
 * const markdown = result.content;
 *
 * @example
 * // Get plain text
 * const result = await read(browser, { format: 'text' });
 * const text = result.content;
 */
export async function read(
  browser: SentienceBrowser,
  options: ReadOptions = {}
): Promise<ReadResult> {
  const format = options.format ?? 'raw';

  // Markdown is derived locally, so the page is asked for raw HTML instead.
  const extensionFormat = format === 'markdown' ? 'raw' : format;

  const pageResult = (await browser.getPage().evaluate(
    (opts: { format: string }) => (window as any).sentience.read(opts),
    { format: extensionFormat }
  )) as ReadResult;

  // 'raw' and 'text' results, and any failed read, are handed back untouched.
  if (format !== 'markdown' || pageResult.status !== 'success') {
    return pageResult;
  }

  try {
    const markdown = htmlToMarkdown(pageResult.content);
    return {
      status: 'success',
      url: pageResult.url,
      format: 'markdown',
      content: markdown,
      length: markdown.length,
    };
  } catch (e: unknown) {
    // Conversion problems are reported through the result, not thrown.
    return {
      status: 'error',
      url: pageResult.url,
      format: 'markdown',
      content: '',
      length: 0,
      error: `Markdown conversion failed: ${String(e)}`,
    };
  }
}

0 commit comments

Comments
 (0)