@@ -7,6 +7,7 @@ import TurndownService from 'turndown';
77
88export interface ReadOptions { // Optional settings; presumably the type of read()'s `options` argument (its signature is not visible in this hunk — confirm)
99 format ?: 'raw' | 'text' | 'markdown' ; // requested output format; read() evaluates `options.format || 'raw'`, so omitting it yields 'raw'
10+ enhanceMarkdown ?: boolean ; // read() checks `!== false`, so leaving it unset counts as true; together with format 'markdown' it selects the Turndown conversion branch
1011}
1112
1213export interface ReadResult {
@@ -46,69 +47,64 @@ export async function read(
4647) : Promise < ReadResult > {
4748 const page = browser . getPage ( ) ;
4849 const format = options . format || 'raw' ; // Default to 'raw' for Turndown compatibility
50+ const enhanceMarkdown = options . enhanceMarkdown !== false ; // Default to true
4951
50- // For markdown format, get raw HTML first, then convert with Turndown
51- if ( format === 'markdown' ) {
52- // Get raw HTML from extension
53- const rawResult = ( await page . evaluate (
52+ if ( format === 'markdown' && enhanceMarkdown ) {
53+ // Get raw HTML from the extension first
54+ const rawHtmlResult = ( await page . evaluate (
5455 ( opts ) => {
5556 return ( window as any ) . sentience . read ( opts ) ;
5657 } ,
5758 { format : 'raw' }
5859 ) ) as ReadResult ;
5960
60- if ( rawResult . status !== 'success' ) {
61- return rawResult ;
62- }
63-
64- // Convert to markdown using Turndown
65- try {
66- const turndownService = new TurndownService ( {
67- headingStyle : 'atx' , // Use # for headings
68- bulletListMarker : '-' , // Use - for lists
69- codeBlockStyle : 'fenced' , // Use ``` for code blocks
70- } ) ;
71-
72- // Add custom rules for better conversion
73- turndownService . addRule ( 'strikethrough' , {
74- filter : [ 'del' , 's' , 'strike' ] as any ,
75- replacement : ( content : string ) => `~~${ content } ~~` ,
76- } ) ;
61+ if ( rawHtmlResult . status === 'success' ) {
62+ const htmlContent = rawHtmlResult . content ;
63+ try {
64+ const turndownService = new TurndownService ( {
65+ headingStyle : 'atx' ,
66+ hr : '---' ,
67+ bulletListMarker : '-' ,
68+ codeBlockStyle : 'fenced' ,
69+ emDelimiter : '*' ,
70+ } ) ;
7771
78- // Strip unwanted tags
79- turndownService . remove ( [ 'script' , 'style' , 'nav' , 'footer' , 'header' , 'noscript' ] ) ;
72+ // Add custom rules for better markdown
73+ turndownService . addRule ( 'strikethrough' , {
74+ filter : ( node ) => [ 's' , 'del' , 'strike' ] . includes ( node . nodeName . toLowerCase ( ) ) ,
75+ replacement : function ( content ) {
76+ return '~~' + content + '~~' ;
77+ } ,
78+ } ) ;
8079
81- const htmlContent = rawResult . content ;
82- const markdownContent = turndownService . turndown ( htmlContent ) ;
80+ // Optionally strip certain tags entirely
81+ turndownService . remove ( [ 'script' , 'style' , 'noscript' , 'iframe' ] as any ) ;
8382
84- // Return result with markdown content
85- return {
86- status : 'success' ,
87- url : rawResult . url ,
88- format : 'markdown' ,
89- content : markdownContent ,
90- length : markdownContent . length ,
91- } ;
92- } catch ( e ) {
93- // If conversion fails, return error
94- return {
95- status : 'error' ,
96- url : rawResult . url ,
97- format : 'markdown' ,
98- content : '' ,
99- length : 0 ,
100- error : `Markdown conversion failed: ${ e } ` ,
101- } ;
83+ const markdownContent = turndownService . turndown ( htmlContent ) ;
84+ return {
85+ status : 'success' ,
86+ url : rawHtmlResult . url ,
87+ format : 'markdown' ,
88+ content : markdownContent ,
89+ length : markdownContent . length ,
90+ } ;
91+ } catch ( e : any ) {
92+ console . warn ( `Turndown conversion failed: ${ e . message } , falling back to extension's markdown.` ) ;
93+ // Fallback to extension's markdown if Turndown fails
94+ }
95+ } else {
96+ console . warn ( `Failed to get raw HTML from extension: ${ rawHtmlResult . error } , falling back to extension's markdown.` ) ;
97+ // Fallback to extension's markdown if getting raw HTML fails
10298 }
103- } else {
104- // For "raw" or "text", call extension directly
105- const result = ( await page . evaluate (
106- ( opts ) => {
107- return ( window as any ) . sentience . read ( opts ) ;
108- } ,
109- { format }
110- ) ) as ReadResult ;
111-
112- return result ;
11399 }
100+
101+ // If not enhanced markdown, or fallback, call extension with requested format
102+ const result = ( await page . evaluate (
103+ ( opts ) => {
104+ return ( window as any ) . sentience . read ( opts ) ;
105+ } ,
106+ { format }
107+ ) ) as ReadResult ;
108+
109+ return result ;
114110}
0 commit comments