Skip to content

Commit 9c05a6f

Browse files
authored
feat(corpus): seed worktree pulls from the primary repo via hardlink (#2867)
Pulling the full corpus (~450 .docx files, ~100 MB) on every new worktree is wasteful: the bytes are identical to the primary repo already sitting on disk. Detect when we're in a git worktree via `git rev-parse --git-common-dir`, then hardlink any files the primary has into the worktree's corpus dir before hitting R2. Only files the primary is missing go over the network. --force still re-downloads from R2 (we unlink the destination before writing so the primary's inodes never get clobbered). --no-seed opts out for users who want the old behaviour.
1 parent a9caa8e commit 9c05a6f

3 files changed

Lines changed: 123 additions & 4 deletions

File tree

scripts/corpus/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,20 @@ pnpm corpus:update-registry
2424

2525
`pnpm corpus:pull` now tolerates missing keys and prunes stale `registry.json` entries automatically.
2626
`pnpm corpus:pull` does not remove local files that no longer exist in R2; use `pnpm corpus:delete` when you want the shared corpus and local copy removed together.
27+
28+
### Worktrees: seeding from the primary repo
29+
30+
When `pnpm corpus:pull` runs inside a git worktree it first hardlinks anything
31+
the primary repo already has on disk, then only goes to R2 for the rest. The
32+
primary's bytes are already local so this is effectively instant for a fresh
33+
worktree.
34+
35+
- Hardlinks (not copies) — zero disk overhead; the primary repo and the worktree share the same
36+
inode. Falls back to a copy automatically when the two checkouts live on
37+
different filesystems.
38+
- `--force` still re-downloads from R2. Destinations are unlinked before each
39+
R2 write, so the primary's files are never clobbered.
40+
- Pass `--no-seed` to skip the hardlink step and go straight to R2.
2741
`pnpm corpus:push` runs `superdoc-benchmark baseline <uploaded-key> --force` by default after upload.
2842
Set `SUPERDOC_CORPUS_SKIP_WORD_BASELINE=1` (or pass `--no-word-baseline`) to disable this behavior.
2943

scripts/corpus/pull.mjs

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import {
1717
normalizePath,
1818
printCorpusEnvHint,
1919
saveRegistry,
20+
seedCorpusFromPrimary,
2021
sortRegistryDocs,
2122
writeProgressBar,
2223
} from './shared.mjs';
@@ -48,6 +49,8 @@ Options:
4849
--match <text> Substring filter (repeatable)
4950
--exclude <prefix> Exclude filter (repeatable)
5051
--force Re-download files even if they already exist
52+
--no-seed In a git worktree, skip hardlinking from the primary
53+
repo's corpus before pulling from R2
5154
--link-visual Point tests/visual/test-data at --dest via symlink
5255
--dry-run Print actions without downloading
5356
--quiet Suppress verbose logs; show only progress and summary
@@ -65,6 +68,7 @@ function parseArgs(argv) {
6568
linkVisual: false,
6669
dryRun: false,
6770
quiet: false,
71+
seedFromPrimary: true,
6872
};
6973

7074
for (let i = 0; i < argv.length; i += 1) {
@@ -99,6 +103,10 @@ function parseArgs(argv) {
99103
args.force = true;
100104
continue;
101105
}
106+
if (arg === '--no-seed') {
107+
args.seedFromPrimary = false;
108+
continue;
109+
}
102110
if (arg === '--link-visual') {
103111
args.linkVisual = true;
104112
continue;
@@ -247,13 +255,21 @@ async function main() {
247255

248256
let downloaded = 0;
249257
let skipped = 0;
258+
let seeded = 0;
250259

251260
if (!args.quiet) {
252261
console.log(`[corpus] Source: ${corpus.source}`);
253262
console.log(`[corpus] Destination: ${destinationRoot}`);
254263
console.log(`[corpus] Corpus size: ${selectedDocs.length} documents`);
255264
}
256265

266+
// Fast path for git worktrees: hardlink from the primary repo's corpus
267+
// before reaching for R2. Seeding is skipped for --force and --dry-run, and
268+
// the downloads below still fetch anything the primary is missing.
269+
if (args.seedFromPrimary && !args.force && !args.dryRun) {
270+
seeded = seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet: args.quiet });
271+
}
272+
257273
if (corpus.source === REGISTRY_KEY && corpus.registry) {
258274
const allObjectKeys = await client.listObjects('');
259275
const objectKeySet = new Set(allObjectKeys.map((key) => normalizePath(key).toLowerCase()));
@@ -316,6 +332,14 @@ async function main() {
316332
const { relativePath, objectKey, destinationPath } = toDownload[idx];
317333

318334
try {
335+
// Unlink before writing. If the destination is a hardlink to the
336+
// primary repo's corpus (seeded above), wrangler's r2 object get
337+
// would otherwise write through and mutate the primary's file.
338+
try {
339+
fs.rmSync(destinationPath, { force: true });
340+
} catch {
341+
// swallow — the write below will surface any real permission issue
342+
}
319343
await client.getObjectToFile(objectKey, destinationPath);
320344
downloaded += 1;
321345
} catch (error) {
@@ -379,12 +403,16 @@ async function main() {
379403

380404
const elapsed = Date.now() - startedAt;
381405
if (args.quiet) {
382-
if (downloaded > 0) {
383-
console.log(`[corpus] Synced ${downloaded} new document(s) in ${formatDurationMs(elapsed)}`);
406+
if (downloaded > 0 || seeded > 0) {
407+
const parts = [];
408+
if (seeded > 0) parts.push(`${seeded} seeded`);
409+
if (downloaded > 0) parts.push(`${downloaded} downloaded`);
410+
console.log(`[corpus] Synced ${parts.join(' + ')} in ${formatDurationMs(elapsed)}`);
384411
}
385412
} else {
413+
const seedPart = seeded > 0 ? `, Seeded: ${seeded}` : '';
386414
console.log(
387-
`[corpus] Done. Downloaded: ${downloaded}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`,
415+
`[corpus] Done. Downloaded: ${downloaded}${seedPart}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`,
388416
);
389417
}
390418
} finally {

scripts/corpus/shared.mjs

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import os from 'node:os';
33
import path from 'node:path';
44
import crypto from 'node:crypto';
55
import process from 'node:process';
6-
import { execFile as execFileCb } from 'node:child_process';
6+
import { execFile as execFileCb, execFileSync } from 'node:child_process';
77
import { fileURLToPath, pathToFileURL } from 'node:url';
88
import { promisify } from 'node:util';
99
import { createRequire } from 'node:module';
@@ -468,6 +468,83 @@ export function ensureVisualTestDataSymlink(corpusRoot) {
468468
return { linked: true, changed: true, backupPath: null };
469469
}
470470

471+
/**
 * If the current checkout is a linked git worktree, return the root directory
 * of the primary checkout; otherwise return null.
 *
 * Detection asks git for both `--git-dir` and `--git-common-dir`. In a linked
 * worktree the git dir is `<primary>/.git/worktrees/<name>` while the common
 * dir is `<primary>/.git`, so the two differ. In every other layout (ordinary
 * checkout, submodule, `--separate-git-dir`, REPO_ROOT below the top level)
 * both resolve to the same directory and we report "not a worktree".
 *
 * The primary root is derived as the parent of the common dir. That is only
 * meaningful when the common dir is literally named `.git`; worktrees of bare
 * repositories or submodules have no primary checkout next to their common
 * dir, so null is returned for them.
 *
 * @returns {string|null} Absolute path of the primary checkout, or null when
 *   not in a linked worktree, not in git, or git is unavailable.
 */
export function findPrimaryRepoRoot() {
  try {
    const output = execFileSync('git', ['rev-parse', '--git-dir', '--git-common-dir'], {
      cwd: REPO_ROOT,
      stdio: ['ignore', 'pipe', 'ignore'],
      encoding: 'utf8',
    });

    // rev-parse prints one line per requested option, in order.
    const [gitDir, commonDir] = output
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter(Boolean);
    if (!gitDir || !commonDir) return null;

    // git may print either path relative to cwd; anchor both at REPO_ROOT.
    const absoluteGitDir = path.resolve(REPO_ROOT, gitDir);
    const absoluteCommonDir = path.resolve(REPO_ROOT, commonDir);

    // Compare physical paths so a mix of relative and absolute output (or a
    // symlinked path component) cannot make identical directories look different.
    if (fs.realpathSync(absoluteGitDir) === fs.realpathSync(absoluteCommonDir)) return null;

    // Linked worktree, but the shared git dir is not `<checkout>/.git`
    // (bare repo, submodule, separate git dir): no primary checkout to seed from.
    if (path.basename(absoluteCommonDir) !== '.git') return null;

    const primaryRoot = path.dirname(absoluteCommonDir);
    if (primaryRoot === path.resolve(REPO_ROOT)) return null;
    return primaryRoot;
  } catch {
    // git missing, not a repository, or the reported dirs are unreadable.
    return null;
  }
}
499+
500+
/**
 * Place one primary-repo file at `destinationPath`, preferring a hardlink and
 * falling back to a copy. Never overwrites an existing destination and never
 * leaves a partially written file behind.
 *
 * @param {string} primaryPath - Source file inside the primary repo's corpus.
 * @param {string} destinationPath - Target path inside the worktree's corpus.
 * @returns {'linked'|'copied'|'exists'|'failed'|'absent'} What happened.
 */
function linkOrCopyCorpusFile(primaryPath, destinationPath) {
  let sourceStat;
  try {
    sourceStat = fs.statSync(primaryPath);
  } catch {
    return 'absent';
  }
  // Directories and special files cannot be hardlinked or copied as documents.
  if (!sourceStat.isFile()) return 'absent';

  try {
    fs.mkdirSync(path.dirname(destinationPath), { recursive: true });
  } catch {
    return 'failed';
  }

  try {
    fs.linkSync(primaryPath, destinationPath);
    return 'linked';
  } catch (linkError) {
    // Destination appeared since the caller's exists-check: leave it alone.
    if (linkError?.code === 'EEXIST') return 'exists';
  }

  // Hardlink fails across filesystems (EXDEV) or where links are unsupported — copy instead.
  try {
    fs.copyFileSync(primaryPath, destinationPath, fs.constants.COPYFILE_EXCL);
    return 'copied';
  } catch (copyError) {
    if (copyError?.code === 'EEXIST') return 'exists';
    // A failed copy may leave a truncated file that a later exists-check
    // would mistake for a complete download; remove it.
    try {
      fs.rmSync(destinationPath, { force: true });
    } catch {
      // nothing more we can do; the caller reports the failure
    }
    return 'failed';
  }
}

/**
 * Seed the worktree's corpus from the primary repo using hardlinks (falling
 * back to copy when a hardlink cannot be created). Skips files that already
 * exist at the destination and files the primary doesn't have.
 *
 * Seeding is a best-effort optimisation: a file that cannot be linked or
 * copied is skipped (and reported unless `quiet`) rather than aborting, so the
 * caller can still fetch it from R2.
 *
 * Only does anything when `findPrimaryRepoRoot()` reports a primary repo AND
 * that repo has a corpus directory at the default location.
 *
 * @param {string} destinationRoot - Corpus directory of the current checkout.
 * @param {Array<{relative_path: string}>} selectedDocs - Documents to seed.
 * @param {{quiet?: boolean}} [options] - `quiet` suppresses the summary logs.
 * @returns {number} Number of files seeded (hardlinked or copied).
 */
export function seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet = false } = {}) {
  const primaryRoot = findPrimaryRepoRoot();
  if (!primaryRoot) return 0;

  // Follow the primary's default corpus layout. This is intentionally a fixed
  // path — users with custom --dest in the primary can still fall back to R2.
  const primaryCorpus = path.join(primaryRoot, path.basename(DEFAULT_CORPUS_ROOT));
  if (!fs.existsSync(primaryCorpus)) return 0;

  let seeded = 0;
  let copyFallback = 0;
  let failed = 0;

  for (const doc of selectedDocs) {
    const relativePath = normalizePath(doc.relative_path);
    if (!relativePath) continue;

    const primaryPath = path.join(primaryCorpus, relativePath);
    const destinationPath = path.join(destinationRoot, relativePath);

    if (fs.existsSync(destinationPath)) continue;

    const outcome = linkOrCopyCorpusFile(primaryPath, destinationPath);
    if (outcome === 'linked') {
      seeded += 1;
    } else if (outcome === 'copied') {
      seeded += 1;
      copyFallback += 1;
    } else if (outcome === 'failed') {
      failed += 1;
    }
  }

  if (seeded > 0 && !quiet) {
    const relPrimary = path.relative(REPO_ROOT, primaryRoot) || primaryRoot;
    const method = copyFallback === seeded ? 'copied' : copyFallback > 0 ? 'hardlinked (with copy fallback)' : 'hardlinked';
    console.log(`[corpus] Seeded ${seeded} file(s) ${method} from primary repo: ${relPrimary}`);
  }
  if (failed > 0 && !quiet) {
    console.log(`[corpus] Could not seed ${failed} file(s) from primary repo; they will be pulled from R2`);
  }
  return seeded;
}
547+
471548
export function applyPathFilters(paths, { filters = [], matches = [], excludes = [] } = {}) {
472549
const normalizedFilters = filters.map((value) => String(value).toLowerCase()).filter(Boolean);
473550
const normalizedMatches = matches.map((value) => String(value).toLowerCase()).filter(Boolean);

0 commit comments

Comments
 (0)