|
7 | 7 | // served pages actually carry that header and that it never leaked into |
8 | 8 | // `llms-full.txt`. |
9 | 9 | // |
10 | | -// It runs against a live server over HTTP (in CI: Nginx on :8080), so it tests |
11 | | -// the real served bytes, not the files on disk. |
| 10 | +// The per-page checks run against a live server over HTTP (in CI: Nginx on |
| 11 | +// :8080), so they test the real served bytes. The llms-full.txt leak guard |
| 12 | +// instead reads the build artifact off disk - see readLlmsFull() for why. |
12 | 13 | // |
13 | 14 | // Usage: |
14 | 15 | // node scripts/checkNavHeaders.mjs [baseUrl] |
15 | 16 | // baseUrl defaults to $NAV_HEADERS_BASE_URL or http://localhost:8080 |
16 | 17 |
|
| 18 | +import { execFile } from 'node:child_process'; |
| 19 | +import { readFile } from 'node:fs/promises'; |
| 20 | +import { fileURLToPath } from 'node:url'; |
| 21 | +import { promisify } from 'node:util'; |
| 22 | + |
| 23 | +const execFileAsync = promisify(execFile); |
| 24 | + |
| 25 | +// Resolved relative to this script, so the disk read works regardless of cwd. |
| 26 | +const LLMS_FULL_FILE = fileURLToPath(new URL('../build/llms-full.txt', import.meta.url)); |
| 27 | + |
17 | 28 | const BASE = (process.argv[2] || process.env.NAV_HEADERS_BASE_URL || 'http://localhost:8080').replace(/\/$/, ''); |
18 | 29 | const SITE_URL = 'https://docs.apify.com'; |
19 | 30 |
|
@@ -58,20 +69,40 @@ const PAGES = [ |
58 | 69 | const failures = []; |
59 | 70 | const fail = (page, msg) => failures.push(`${page} → ${msg}`); |
60 | 71 |
|
61 | | -async function fetchText(url, attempts = 3) { |
62 | | - for (let i = 1; ; i++) { |
63 | | - try { |
64 | | - const res = await fetch(url, { headers: { Accept: 'text/markdown' } }); |
65 | | - if (res.ok) return await res.text(); |
66 | | - throw new Error(`HTTP ${res.status} for ${url}`); |
67 | | - } catch (err) { |
68 | | - // A 4xx is deterministic (e.g. a genuinely missing page) - fail fast. |
69 | | - // Anything else is transient: a 5xx, or undici throwing "terminated" |
70 | | - // when a pooled keep-alive socket goes stale mid-read (seen on the |
71 | | - // ~42MB llms-full.txt with the server's Keep-Alive: timeout=5). Retry. |
72 | | - if (i >= attempts || /^HTTP 4\d\d /.test(err.message)) throw err; |
73 | | - await new Promise((resolve) => setTimeout(resolve, 250 * i)); |
74 | | - } |
| 72 | +// Fetch with curl (as every other assertion in this CI job does) rather than |
| 73 | +// Node's global fetch (undici). Neither reads the ~42MB llms-full.txt reliably |
| 74 | +// through the CI Nginx proxy; readLlmsFull() reads that file off disk instead. |
| 75 | +// Flags: -f makes an HTTP >=400 a non-zero exit; -sS stays quiet but |
| 76 | +// still reports real errors; --retry covers genuinely transient blips. |
| 77 | +async function fetchText(url) { |
| 78 | + try { |
| 79 | + const { stdout } = await execFileAsync( |
| 80 | + 'curl', |
| 81 | + ['-fsS', '--retry', '2', '--retry-delay', '1', '-H', 'Accept: text/markdown', url], |
| 82 | + // llms-full.txt is ~42MB; the default 1MB maxBuffer would truncate it. |
| 83 | + { maxBuffer: 256 * 1024 * 1024 }, |
| 84 | + ); |
| 85 | + return stdout; |
| 86 | + } catch (err) { |
| 87 | + throw new Error((err.stderr || err.message || '').toString().trim() || `curl failed for ${url}`); |
| 88 | + } |
| 89 | +} |
| 90 | + |
| 91 | +// Read llms-full.txt for the leak guard. This is deliberately NOT an HTTP fetch: |
| 92 | +// the file is ~42MB and the CI server truncates the response mid-body (undici |
| 93 | +// reports "terminated", curl reports "transfer closed with N bytes remaining"), |
| 94 | +// so no client can read it reliably over the wire. Anyway, the leak guard is a |
| 95 | +// build-artifact invariant - the nav header must never be concatenated into |
| 96 | +// llms-full.txt (issue #2557), which is about what addNavHeaders.mjs leaves in |
| 97 | +// build/, not how it's served - so read the file straight off disk. Fall back to |
| 98 | +// HTTP only when there is no local build (e.g. checking a remote deployment). |
| 99 | +async function readLlmsFull() { |
| 100 | + try { |
| 101 | + return { source: LLMS_FULL_FILE, text: await readFile(LLMS_FULL_FILE, 'utf8') }; |
| 102 | + } catch (err) { |
| 103 | + if (err.code !== 'ENOENT') throw err; |
| 104 | + const url = `${BASE}/llms-full.txt`; |
| 105 | + return { source: url, text: await fetchText(url) }; |
75 | 106 | } |
76 | 107 | } |
77 | 108 |
|
@@ -164,16 +195,16 @@ for (const { path, keys } of PAGES) { |
164 | 195 | // synthetic root breadcrumb appears only in the header, so its presence here |
165 | 196 | // means a leak. |
166 | 197 | try { |
167 | | - const llmsFull = await fetchText(`${BASE}/llms-full.txt`); |
168 | | - if (llmsFull.includes(ROOT_PARENT)) { |
169 | | - fail('/llms-full.txt', 'nav header leaked into llms-full.txt (found the root breadcrumb)'); |
170 | | - console.log('❌ /llms-full.txt (nav header leaked in)'); |
| 198 | + const { source, text } = await readLlmsFull(); |
| 199 | + if (text.includes(ROOT_PARENT)) { |
| 200 | + fail('llms-full.txt', `nav header leaked into ${source} (found the root breadcrumb)`); |
| 201 | + console.log('❌ llms-full.txt (nav header leaked in)'); |
171 | 202 | } else { |
172 | | - console.log('✅ /llms-full.txt (no nav header leaked in)'); |
| 203 | + console.log(`✅ llms-full.txt (no nav header leaked in; read ${source})`); |
173 | 204 | } |
174 | 205 | } catch (err) { |
175 | | - fail('/llms-full.txt', err.message); |
176 | | - console.log('❌ /llms-full.txt (fetch failed)'); |
| 206 | + fail('llms-full.txt', err.message); |
| 207 | + console.log('❌ llms-full.txt (read failed)'); |
177 | 208 | } |
178 | 209 |
|
179 | 210 | if (failures.length > 0) { |
|
0 commit comments