Skip to content

Commit 8193e85

Browse files
marekh19 and claude authored
fix: make nav-header check pass by reading llms-full.txt from disk (#2609)
The nav-header check was failing CI on the ~42MB `llms-full.txt`: the server truncates the body mid-stream through the CI proxy, so no HTTP client can read it cleanly (undici reports `terminated`, curl reports `transfer closed with N bytes remaining`). The existing `assert_header` only ever "passed" because it greps the header and ignores curl's exit code.

The leak guard is a build-artifact invariant anyway - the nav header must never be concatenated into `llms-full.txt` (issue #2557) - so it now reads the file straight off `build/` instead of over HTTP. Per-page checks stay over HTTP via curl (small, reliable).

Verified locally: 14 pages + leak guard pass, and the guard still fires if the header is present.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent bea95d3 commit 8193e85

1 file changed

Lines changed: 54 additions & 23 deletions

File tree

scripts/checkNavHeaders.mjs

Lines changed: 54 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,24 @@
77
// served pages actually carry that header and that it never leaked into
88
// `llms-full.txt`.
99
//
10-
// It runs against a live server over HTTP (in CI: Nginx on :8080), so it tests
11-
// the real served bytes, not the files on disk.
10+
// The per-page checks run against a live server over HTTP (in CI: Nginx on
11+
// :8080), so they test the real served bytes. The llms-full.txt leak guard
12+
// instead reads the build artifact off disk - see readLlmsFull() for why.
1213
//
1314
// Usage:
1415
// node scripts/checkNavHeaders.mjs [baseUrl]
1516
// baseUrl defaults to $NAV_HEADERS_BASE_URL or http://localhost:8080
1617

18+
import { execFile } from 'node:child_process';
19+
import { readFile } from 'node:fs/promises';
20+
import { fileURLToPath } from 'node:url';
21+
import { promisify } from 'node:util';
22+
23+
const execFileAsync = promisify(execFile);
24+
25+
// Resolved relative to this script, so the disk read works regardless of cwd.
26+
const LLMS_FULL_FILE = fileURLToPath(new URL('../build/llms-full.txt', import.meta.url));
27+
1728
const BASE = (process.argv[2] || process.env.NAV_HEADERS_BASE_URL || 'http://localhost:8080').replace(/\/$/, '');
1829
const SITE_URL = 'https://docs.apify.com';
1930

@@ -58,20 +69,40 @@ const PAGES = [
5869
const failures = [];
5970
const fail = (page, msg) => failures.push(`${page}${msg}`);
6071

61-
async function fetchText(url, attempts = 3) {
62-
for (let i = 1; ; i++) {
63-
try {
64-
const res = await fetch(url, { headers: { Accept: 'text/markdown' } });
65-
if (res.ok) return await res.text();
66-
throw new Error(`HTTP ${res.status} for ${url}`);
67-
} catch (err) {
68-
// A 4xx is deterministic (e.g. a genuinely missing page) - fail fast.
69-
// Anything else is transient: a 5xx, or undici throwing "terminated"
70-
// when a pooled keep-alive socket goes stale mid-read (seen on the
71-
// ~42MB llms-full.txt with the server's Keep-Alive: timeout=5). Retry.
72-
if (i >= attempts || /^HTTP 4\d\d /.test(err.message)) throw err;
73-
await new Promise((resolve) => setTimeout(resolve, 250 * i));
74-
}
72+
// Fetch with curl rather than Node's global fetch (undici), matching every other
73+
// assertion in this CI job. This serves the small per-page checks and the
74+
// llms-full.txt HTTP fallback; neither undici nor curl reads the ~42MB
75+
// llms-full.txt reliably through the CI proxy. Flags: -f makes an HTTP >=400 a non-zero exit; -sS stays quiet but
76+
// still reports real errors; --retry covers genuinely transient blips.
77+
async function fetchText(url) {
78+
try {
79+
const { stdout } = await execFileAsync(
80+
'curl',
81+
['-fsS', '--retry', '2', '--retry-delay', '1', '-H', 'Accept: text/markdown', url],
82+
// llms-full.txt is ~42MB; the default 1MB maxBuffer would truncate it.
83+
{ maxBuffer: 256 * 1024 * 1024 },
84+
);
85+
return stdout;
86+
} catch (err) {
87+
throw new Error((err.stderr || err.message || '').toString().trim() || `curl failed for ${url}`);
88+
}
89+
}
90+
91+
// Read llms-full.txt for the leak guard. This is deliberately NOT an HTTP fetch:
92+
// the file is ~42MB and the CI server truncates the response mid-body (undici
93+
// reports "terminated", curl reports "transfer closed with N bytes remaining"),
94+
// so no client can read it reliably over the wire. The leak guard is anyway a
95+
// build-artifact invariant - the nav header must never be concatenated into
96+
// llms-full.txt (issue #2557), which is about what addNavHeaders.mjs leaves in
97+
// build/, not how it's served - so read the file straight off disk. Fall back to
98+
// HTTP only when there is no local build (e.g. checking a remote deployment).
99+
async function readLlmsFull() {
100+
try {
101+
return { source: LLMS_FULL_FILE, text: await readFile(LLMS_FULL_FILE, 'utf8') };
102+
} catch (err) {
103+
if (err.code !== 'ENOENT') throw err;
104+
const url = `${BASE}/llms-full.txt`;
105+
return { source: url, text: await fetchText(url) };
75106
}
76107
}
77108

@@ -164,16 +195,16 @@ for (const { path, keys } of PAGES) {
164195
// synthetic root breadcrumb appears only in the header, so its presence here
165196
// means a leak.
166197
try {
167-
const llmsFull = await fetchText(`${BASE}/llms-full.txt`);
168-
if (llmsFull.includes(ROOT_PARENT)) {
169-
fail('/llms-full.txt', 'nav header leaked into llms-full.txt (found the root breadcrumb)');
170-
console.log('❌ /llms-full.txt (nav header leaked in)');
198+
const { source, text } = await readLlmsFull();
199+
if (text.includes(ROOT_PARENT)) {
200+
fail('llms-full.txt', `nav header leaked into ${source} (found the root breadcrumb)`);
201+
console.log('❌ llms-full.txt (nav header leaked in)');
171202
} else {
172-
console.log('✅ /llms-full.txt (no nav header leaked in)');
203+
console.log(`✅ llms-full.txt (no nav header leaked in; read ${source})`);
173204
}
174205
} catch (err) {
175-
fail('/llms-full.txt', err.message);
176-
console.log('❌ /llms-full.txt (fetch failed)');
206+
fail('llms-full.txt', err.message);
207+
console.log('❌ llms-full.txt (read failed)');
177208
}
178209

179210
if (failures.length > 0) {

0 commit comments

Comments
 (0)