Skip to content

Commit 611c498

Browse files
committed
Replace mr-pdf with Puppeteer-based PDF crawler
Remove the mr-pdf dependency and rework the PDF generator to use a single Puppeteer browser instance. gen-pdf.mjs now: launches one browser, renders a cover to PDF, crawls documentation pages following the "next" pagination link while injecting CSS and header/footer HTML, prints each page to PDF buffers, and merges all buffers with pdf-lib into the final PDF. Also updates metadata producer, simplifies temp file handling, and ensures the browser is closed on success or error. package.json no longer includes mr-pdf.
1 parent 84ff070 commit 611c498

2 files changed

Lines changed: 114 additions & 95 deletions

File tree

package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
"@docusaurus/tsconfig": "^3.9.2",
4444
"@docusaurus/types": "^3.7.0",
4545
"@mermaid-js/mermaid-cli": "^11.12.0",
46-
"mr-pdf": "^1.1.0",
4746
"pdf-lib": "^1.17.1",
4847
"typescript": "^5.9.3",
4948
"wait-on": "^9.0.4"

scripts/gen-pdf.mjs

Lines changed: 114 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,17 @@
22
/**
33
* Turing ES Documentation — PDF Generator
44
*
5-
* Orchestrates three stages:
6-
* 1. Render the branded cover (HTML → PDF via Puppeteer)
7-
* 2. Generate documentation pages (mr-pdf)
8-
* 3. Merge cover + docs into a single PDF (pdf-lib)
5+
* Uses a single Puppeteer browser instance to:
6+
* 1. Render the branded cover (HTML → PDF)
7+
* 2. Crawl every documentation page following "next" links, printing each to PDF
8+
* 3. Merge everything into a single PDF with pdf-lib
99
*
1010
* Prerequisites:
11-
* - Docusaurus dev/serve running on localhost:3000
12-
* - npm dependencies installed (puppeteer, mr-pdf, pdf-lib, wait-on)
11+
* - Docusaurus serve running on localhost:3000
12+
* - npm dependencies installed (puppeteer, pdf-lib)
1313
*/
1414

15-
import { execFileSync } from 'node:child_process';
16-
import { readFileSync, writeFileSync, unlinkSync } from 'node:fs';
15+
import { readFileSync, writeFileSync } from 'node:fs';
1716
import { fileURLToPath } from 'node:url';
1817
import { dirname, join } from 'node:path';
1918
import { PDFDocument } from 'pdf-lib';
@@ -22,133 +21,149 @@ const __filename = fileURLToPath(import.meta.url);
2221
const __dirname = dirname(__filename);
2322
const ROOT = join(__dirname, '..');
2423

25-
const COVER_HTML = join(__dirname, 'pdf-cover.html');
26-
const STYLE_CSS = join(__dirname, 'pdf-style.css');
27-
const COVER_PDF = join(ROOT, '.cover-tmp.pdf');
28-
const DOCS_PDF = join(ROOT, '.docs-tmp.pdf');
29-
const OUTPUT_PDF = join(ROOT, 'turing-es-2026.1-documentation.pdf');
30-
const BASE_URL = process.env.PDF_BASE_URL || 'http://localhost:3000';
31-
const ENTRY_PATH = '/turing/getting-started/intro';
24+
const COVER_HTML = join(__dirname, 'pdf-cover.html');
25+
const STYLE_CSS = join(__dirname, 'pdf-style.css');
26+
const OUTPUT_PDF = join(ROOT, 'turing-es-2026.1-documentation.pdf');
27+
const BASE_URL = process.env.PDF_BASE_URL || 'http://localhost:3000';
28+
const ENTRY_PATH = '/turing/getting-started/intro';
29+
30+
const HEADER_HTML = [
31+
'<div style="width:100%;padding:0 15mm;font-family:system-ui,sans-serif;',
32+
'font-size:8px;display:flex;justify-content:space-between;align-items:center;',
33+
'border-bottom:0.5px solid #FED7AA;padding-bottom:3px;margin-bottom:4px;">',
34+
'<span style="color:#C2410C;font-weight:700;letter-spacing:0.08em;">TURING ES</span>',
35+
'<span style="color:#94a3b8;">Documentation</span>',
36+
'</div>',
37+
].join('');
38+
39+
const FOOTER_HTML = [
40+
'<div style="width:100%;padding:0 15mm;font-family:system-ui,sans-serif;',
41+
'font-size:7px;display:flex;justify-content:space-between;align-items:center;',
42+
'border-top:0.5px solid #e2e8f0;padding-top:3px;margin-top:4px;">',
43+
'<span style="color:#94a3b8;">viglet.com</span>',
44+
'<span style="color:#64748b;">',
45+
'<span class="pageNumber"></span> / <span class="totalPages"></span>',
46+
'</span></div>',
47+
].join('');
48+
49+
let browser;
50+
51+
async function launchBrowser() {
52+
const puppeteer = await import('puppeteer');
53+
browser = await puppeteer.launch({
54+
headless: 'new',
55+
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
56+
});
57+
}
3258

3359
/* ──────────────────────────────────────────────────────
34-
Stage 1 — Cover (Puppeteer)
60+
Stage 1 — Cover (HTML → PDF)
3561
────────────────────────────────────────────────────── */
3662
async function generateCover() {
3763
console.log(' [1/3] Rendering cover pages …');
3864

39-
// Dynamic import so the script still parses even if puppeteer
40-
// is only available through the mermaid-cli transitive dep.
41-
const puppeteer = await import('puppeteer');
42-
const browser = await puppeteer.launch({
43-
headless: 'new',
44-
args: ['--no-sandbox', '--disable-setuid-sandbox'],
45-
});
46-
4765
const page = await browser.newPage();
4866
const html = readFileSync(COVER_HTML, 'utf-8');
4967

5068
await page.setContent(html, { waitUntil: 'networkidle0' });
51-
52-
// Wait for Google Fonts to load
5369
await page.evaluateHandle('document.fonts.ready');
5470

55-
await page.pdf({
56-
path: COVER_PDF,
71+
const pdfBuffer = await page.pdf({
5772
format: 'A4',
5873
printBackground: true,
5974
preferCSSPageSize: false,
6075
});
6176

62-
await browser.close();
77+
await page.close();
6378
console.log(' Cover rendered ✓');
79+
return pdfBuffer;
6480
}
6581

6682
/* ──────────────────────────────────────────────────────
67-
Stage 2 — Documentation pages (mr-pdf)
83+
Stage 2 — Crawl doc pages & print each to PDF
6884
────────────────────────────────────────────────────── */
69-
function generateDocs() {
85+
async function generateDocs() {
7086
console.log(' [2/3] Generating documentation pages …');
7187

7288
const cssContent = readFileSync(STYLE_CSS, 'utf-8');
73-
74-
const headerHtml = [
75-
'<div style="width:100%;padding:0 15mm;font-family:system-ui,sans-serif;',
76-
'font-size:8px;display:flex;justify-content:space-between;align-items:center;',
77-
'border-bottom:0.5px solid #FED7AA;padding-bottom:3px;margin-bottom:4px;">',
78-
'<span style="color:#C2410C;font-weight:700;letter-spacing:0.08em;">TURING ES</span>',
79-
'<span style="color:#94a3b8;">Documentation</span>',
80-
'</div>',
81-
].join('');
82-
83-
const footerHtml = [
84-
'<div style="width:100%;padding:0 15mm;font-family:system-ui,sans-serif;',
85-
'font-size:7px;display:flex;justify-content:space-between;align-items:center;',
86-
'border-top:0.5px solid #e2e8f0;padding-top:3px;margin-top:4px;">',
87-
'<span style="color:#94a3b8;">viglet.com</span>',
88-
'<span style="color:#64748b;">',
89-
'<span class="pageNumber"></span> / <span class="totalPages"></span>',
90-
'</span></div>',
91-
].join('');
92-
93-
const args = [
94-
'mr-pdf',
95-
'--initialDocURLs', `${BASE_URL}${ENTRY_PATH}`,
96-
'--contentSelector', 'article',
97-
'--paginationSelector','a.pagination-nav__link--next',
98-
'--pdfFormat', 'A4',
99-
'--pdfMargin', '25,15,20,15',
100-
'--cssStyle', cssContent,
101-
'--headerTemplate', headerHtml,
102-
'--footerTemplate', footerHtml,
103-
'--disableTOC',
104-
'--outputPDFFilename', DOCS_PDF,
105-
];
106-
107-
execFileSync('npx', args, {
108-
cwd: ROOT,
109-
stdio: 'inherit',
110-
timeout: 300_000, // 5 min max
111-
});
112-
113-
console.log(' Documentation pages generated ✓');
89+
const page = await browser.newPage();
90+
const pagePDFs = [];
91+
let url = `${BASE_URL}${ENTRY_PATH}`;
92+
const visited = new Set();
93+
94+
while (url) {
95+
// Normalise to avoid revisiting with trailing slashes etc.
96+
const normalized = url.replace(/\/+$/, '');
97+
if (visited.has(normalized)) break;
98+
visited.add(normalized);
99+
100+
const pageNum = visited.size;
101+
process.stdout.write(` [${pageNum}] ${normalized.replace(BASE_URL, '')} … `);
102+
103+
await page.goto(url, { waitUntil: 'networkidle0', timeout: 30_000 });
104+
105+
// Inject our PDF stylesheet
106+
await page.addStyleTag({ content: cssContent });
107+
108+
// Small delay for styles to apply
109+
await new Promise((r) => setTimeout(r, 300));
110+
111+
// Print this page to PDF
112+
const pdfBuf = await page.pdf({
113+
format: 'A4',
114+
printBackground: true,
115+
displayHeaderFooter: true,
116+
headerTemplate: HEADER_HTML,
117+
footerTemplate: FOOTER_HTML,
118+
margin: { top: '25mm', bottom: '20mm', left: '15mm', right: '15mm' },
119+
});
120+
121+
pagePDFs.push(pdfBuf);
122+
console.log('✓');
123+
124+
// Find the "next" pagination link
125+
url = await page.evaluate(() => {
126+
const next = document.querySelector('a.pagination-nav__link--next');
127+
return next ? next.href : null;
128+
});
129+
}
130+
131+
await page.close();
132+
console.log(` ${pagePDFs.length} pages generated ✓`);
133+
return pagePDFs;
114134
}
115135

116136
/* ──────────────────────────────────────────────────────
117-
Stage 3 — Merge (pdf-lib)
137+
Stage 3 — Merge all PDFs (pdf-lib)
118138
────────────────────────────────────────────────────── */
119-
async function mergePDFs() {
120-
console.log(' [3/3] Merging cover + documentation …');
121-
122-
const coverBytes = readFileSync(COVER_PDF);
123-
const docsBytes = readFileSync(DOCS_PDF);
139+
async function mergePDFs(coverBuf, docBuffers) {
140+
console.log(' [3/3] Merging PDFs …');
124141

125-
const merged = await PDFDocument.create();
142+
const merged = await PDFDocument.create();
126143

127-
// Copy cover pages (cover + inner title)
128-
const coverDoc = await PDFDocument.load(coverBytes);
144+
// Cover pages
145+
const coverDoc = await PDFDocument.load(coverBuf);
129146
const coverPages = await merged.copyPages(coverDoc, coverDoc.getPageIndices());
130147
coverPages.forEach((p) => merged.addPage(p));
131148

132-
// Copy documentation pages
133-
const docsDoc = await PDFDocument.load(docsBytes);
134-
const docsPages = await merged.copyPages(docsDoc, docsDoc.getPageIndices());
135-
docsPages.forEach((p) => merged.addPage(p));
149+
// Documentation pages
150+
for (const buf of docBuffers) {
151+
const doc = await PDFDocument.load(buf);
152+
const pages = await merged.copyPages(doc, doc.getPageIndices());
153+
pages.forEach((p) => merged.addPage(p));
154+
}
136155

137156
// Metadata
138157
merged.setTitle('Turing ES Documentation');
139158
merged.setSubject('Enterprise Search Platform — v2026.1');
140159
merged.setAuthor('Viglet');
141160
merged.setCreator('Viglet PDF Generator');
142-
merged.setProducer('pdf-lib + Puppeteer + mr-pdf');
161+
merged.setProducer('pdf-lib + Puppeteer');
143162
merged.setCreationDate(new Date());
144163

145164
const mergedBytes = await merged.save();
146165
writeFileSync(OUTPUT_PDF, mergedBytes);
147166

148-
// Clean up temp files
149-
try { unlinkSync(COVER_PDF); } catch { /* ignore */ }
150-
try { unlinkSync(DOCS_PDF); } catch { /* ignore */ }
151-
152167
const sizeMB = (mergedBytes.length / 1_048_576).toFixed(2);
153168
console.log(` Merged PDF saved (${sizeMB} MB) ✓`);
154169
}
@@ -163,17 +178,22 @@ async function main() {
163178
console.log(' β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•');
164179
console.log();
165180

166-
await generateCover();
167-
generateDocs();
168-
await mergePDFs();
181+
await launchBrowser();
182+
183+
const coverBuf = await generateCover();
184+
const docBufs = await generateDocs();
185+
await mergePDFs(coverBuf, docBufs);
186+
187+
await browser.close();
169188

170189
console.log();
171190
console.log(` ✅ ${OUTPUT_PDF}`);
172191
console.log();
173192
}
174193

175-
main().catch((err) => {
194+
main().catch(async (err) => {
176195
console.error('\n ❌ PDF generation failed:\n');
177196
console.error(err);
197+
if (browser) await browser.close().catch(() => {});
178198
process.exit(1);
179199
});

0 commit comments

Comments
(0)