From ac6127d1b8c56a4d808a21557897eaf9f437ad37 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 27 May 2026 09:59:43 +0300 Subject: [PATCH 1/3] =?UTF-8?q?feat(scripts):=20ShmuggingFace=20preview=20?= =?UTF-8?q?site=20builder=20=E2=80=94=20v1.0.2,=20hardened=20(PR=208.4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces scripts/build_shmuggingface_site.py to main with all review fixes applied (PR 8.4 supersedes the unmerged PR 80 which had merge conflicts and open review issues). Fixes vs. the PR 80 branch: - Remove TIER_USABILITY / TIER_MEDAL (fabricated Kaggle scores that ShmuggingFaceCore ignores; removes silent misinformation) - Raise on missing manifest/metrics fields (_require() helper) instead of silently defaulting to plausible-but-false values like n_leads=5000 - Per-tier dataset_card.md as each tier's description HTML instead of the global release/README.md embedded three times - --branch preview default for wrangler deploy; add --production flag required to push to the production slot (prevents accidental clobbers) - Fix _rewrite_links to handle bare relative links like [LICENSE](LICENSE) that would 404 on the static host - Bump to ShmuggingFaceCore v1.0.2 (implements all fixes from the recent review round per upstream release notes) - Regenerate package-lock.json via HTTPS tarball (old lockfile used git+ssh:// requiring SSH keys; new one uses the public tarball URL) - wrangler as devDependency in package.json - split column in feature_dictionary.csv: write_flat_csv prepends a split_metadata row so the column spec covers every column in lead_scoring.csv; load_tier normalises to avoid double-prepend - 22 smoke tests covering: no fabricated constants, _require raises, per-tier card vs global README, split column exactly once at index 0, all three tiers produce valid configs, deploy_site branch flag, _rewrite_links coverage Co-Authored-By: Claude Sonnet 4.6 --- .agent-plan.md | 2 +- package-lock.json 
| 1629 +++++++++++++++++ package.json | 15 + scripts/build_public_release.py | 33 + scripts/build_shmuggingface_site.py | 606 ++++++ .../scripts/test_build_shmuggingface_site.py | 332 ++++ 6 files changed, 2616 insertions(+), 1 deletion(-) create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 scripts/build_shmuggingface_site.py create mode 100644 tests/scripts/test_build_shmuggingface_site.py diff --git a/.agent-plan.md b/.agent-plan.md index f54a6f9..37b4099 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -113,7 +113,7 @@ _Source: `docs/external_review/summaries/v1_release_review_synthesis.md` — cro - Labels: `type: test`, `layer: validation`, `layer: render` - Size: S (~450 lines, mostly tests) -- [ ] **PR 8.4** — `feat(scripts): integration script + preview hardening` +- [x] **PR 8.4** — `feat(scripts): integration script + preview hardening` - **Regenerate lockfile + bump to v1.0.1** (HIGH): delete `package-lock.json`, update `package.json` pin to `github:ShmuggingFace/ShmuggingFaceCore#v1.0.1`, regenerate via HTTPS. Fixes SSH lockfile and gets the socks/laundry copy fix in one step. - **Remove fabricated Kaggle usability scores and medals** (HIGH): delete `TIER_USABILITY`, `TIER_MEDAL` constants from `build_shmuggingface_site.py`. These are dead config today (the framework ignores them) but latent misinformation. - **Remaining preview-generator cleanup** (HIGH): `build_shmuggingface_site.py` no longer exists on `main`; if a ShmuggingFaceCore build path is reintroduced, it must consume or validate against the canonical lint gate from SMF-PR5 rather than hard-code task, tags, license, splits, or schema. 
diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..7029e93 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,1629 @@ +{ + "name": "leadforge-shmuggingface-tooling", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "leadforge-shmuggingface-tooling", + "dependencies": { + "@shmuggingface/core": "https://github.com/ShmuggingFace/ShmuggingFaceCore/archive/refs/tags/v1.0.2.tar.gz" + }, + "devDependencies": { + "wrangler": "latest" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/@cloudflare/kv-asset-handler": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/@cloudflare/kv-asset-handler/-/kv-asset-handler-0.5.0.tgz", + "integrity": "sha512-jxQYkj8dSIzc0cD6cMMNdOc1UVjqSqu8BZdor5s8cGjW2I8BjODt/kWPVdY+u9zj3ms75Q5qaZgnxUad83+eAg==", + "dev": true, + "license": "MIT OR Apache-2.0", + "engines": { + "node": ">=22.0.0" + } + }, + "node_modules/@cloudflare/unenv-preset": { + "version": "2.16.1", + "resolved": "https://registry.npmjs.org/@cloudflare/unenv-preset/-/unenv-preset-2.16.1.tgz", + "integrity": "sha512-ECxObrMfyTl5bhQf/lZCXwo5G6xX9IAUo+nDMKK4SZ8m4Jvvxp52vilxyySSWh2YTZz8+HQ07qGH/2rEom1vDw==", + "dev": true, + "license": "MIT OR Apache-2.0", + "peerDependencies": { + "unenv": "2.0.0-rc.24", + "workerd": ">1.20260305.0 <2.0.0-0" + }, + "peerDependenciesMeta": { + "workerd": { + "optional": true + } + } + }, + "node_modules/@cloudflare/workerd-darwin-64": { + "version": "1.20260526.1", + "resolved": "https://registry.npmjs.org/@cloudflare/workerd-darwin-64/-/workerd-darwin-64-1.20260526.1.tgz", + "integrity": "sha512-/pR3GH3gfv0PUp7DjI8v0aAIDOqFwibq4bg5xT7TZgcVdBV/cJQWckdXCMqiRtHiawLwogUX00EIOINkYJ1Zqg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=16" + } + }, + "node_modules/@cloudflare/workerd-darwin-arm64": { + "version": "1.20260526.1", + "resolved": 
"https://registry.npmjs.org/@cloudflare/workerd-darwin-arm64/-/workerd-darwin-arm64-1.20260526.1.tgz", + "integrity": "sha512-rcyu0iANYfaiezKh3Mcao1O4IIgVfQldxduiL5TZT1sP0NIeRY4YReSTrzPxNnXxSYaIqaqRHMcHbUM/ic4knA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=16" + } + }, + "node_modules/@cloudflare/workerd-linux-64": { + "version": "1.20260526.1", + "resolved": "https://registry.npmjs.org/@cloudflare/workerd-linux-64/-/workerd-linux-64-1.20260526.1.tgz", + "integrity": "sha512-5EZAEnlLwa9oGJRo8Nd3iY5Wcd9ROGNNG90xNIGp8MEjj8v2jTn42NC47fCZKFdnLj3+S+vWEhu1x0GVJnALjA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=16" + } + }, + "node_modules/@cloudflare/workerd-linux-arm64": { + "version": "1.20260526.1", + "resolved": "https://registry.npmjs.org/@cloudflare/workerd-linux-arm64/-/workerd-linux-arm64-1.20260526.1.tgz", + "integrity": "sha512-X/YBQXeXFeCN7QTStoWrATEBc9WKl7PIqkw/dQkjyJ72gh3rkLe0+Xkzp3wO7gtxTDQMa7NPGy1W4+sdMf8q1g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=16" + } + }, + "node_modules/@cloudflare/workerd-windows-64": { + "version": "1.20260526.1", + "resolved": "https://registry.npmjs.org/@cloudflare/workerd-windows-64/-/workerd-windows-64-1.20260526.1.tgz", + "integrity": "sha512-R+tqpFFdcfZIljx8fIW9rj9fRTtDgfoA2yonsfAGa6e8snrmr+38mdFHtkRC0D3UyZpn/hOtmXiUBfdX2gMR7Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "Apache-2.0", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=16" + } + }, + "node_modules/@cspotcode/source-map-support": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", + "integrity": 
"sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/trace-mapping": "0.3.9" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/@emnapi/runtime": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz", + "integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.3.tgz", + "integrity": "sha512-9fJMTNFTWZMh5qwrBItuziu834eOCUcEqymSH7pY+zoMVEZg3gcPuBNxH1EvfVYe9h0x/Ptw8KBzv7qxb7l8dg==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.3.tgz", + "integrity": "sha512-i5D1hPY7GIQmXlXhs2w8AWHhenb00+GxjxRncS2ZM7YNVGNfaMxgzSGuO8o8SJzRc/oZwU2bcScvVERk03QhzA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.3.tgz", + "integrity": "sha512-YdghPYUmj/FX2SYKJ0OZxf+iaKgMsKHVPF1MAq/P8WirnSpCStzKJFjOjzsW0QQ7oIAiccHdcqjbHmJxRb/dmg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.3.tgz", + 
"integrity": "sha512-IN/0BNTkHtk8lkOM8JWAYFg4ORxBkZQf9zXiEOfERX/CzxW3Vg1ewAhU7QSWQpVIzTW+b8Xy+lGzdYXV6UZObQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.3.tgz", + "integrity": "sha512-Re491k7ByTVRy0t3EKWajdLIr0gz2kKKfzafkth4Q8A5n1xTHrkqZgLLjFEHVD+AXdUGgQMq+Godfq45mGpCKg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.3.tgz", + "integrity": "sha512-vHk/hA7/1AckjGzRqi6wbo+jaShzRowYip6rt6q7VYEDX4LEy1pZfDpdxCBnGtl+A5zq8iXDcyuxwtv3hNtHFg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.3.tgz", + "integrity": "sha512-ipTYM2fjt3kQAYOvo6vcxJx3nBYAzPjgTCk7QEgZG8AUO3ydUhvelmhrbOheMnGOlaSFUoHXB6un+A7q4ygY9w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.3.tgz", + "integrity": "sha512-dDk0X87T7mI6U3K9VjWtHOXqwAMJBNN2r7bejDsc+j03SEjtD9HrOl8gVFByeM0aJksoUuUVU9TBaZa2rgj0oA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.27.3", + 
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.3.tgz", + "integrity": "sha512-s6nPv2QkSupJwLYyfS+gwdirm0ukyTFNl3KTgZEAiJDd+iHZcbTPPcWCcRYH+WlNbwChgH2QkE9NSlNrMT8Gfw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.3.tgz", + "integrity": "sha512-sZOuFz/xWnZ4KH3YfFrKCf1WyPZHakVzTiqji3WDc0BCl2kBwiJLCXpzLzUBLgmp4veFZdvN5ChW4Eq/8Fc2Fg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.3.tgz", + "integrity": "sha512-yGlQYjdxtLdh0a3jHjuwOrxQjOZYD/C9PfdbgJJF3TIZWnm/tMd/RcNiLngiu4iwcBAOezdnSLAwQDPqTmtTYg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.3.tgz", + "integrity": "sha512-WO60Sn8ly3gtzhyjATDgieJNet/KqsDlX5nRC5Y3oTFcS1l0KWba+SEa9Ja1GfDqSF1z6hif/SkpQJbL63cgOA==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.3.tgz", + "integrity": "sha512-APsymYA6sGcZ4pD6k+UxbDjOFSvPWyZhjaiPyl/f79xKxwTnrn5QUnXR5prvetuaSMsb4jgeHewIDCIWljrSxw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + 
"node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.3.tgz", + "integrity": "sha512-eizBnTeBefojtDb9nSh4vvVQ3V9Qf9Df01PfawPcRzJH4gFSgrObw+LveUyDoKU3kxi5+9RJTCWlj4FjYXVPEA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.3.tgz", + "integrity": "sha512-3Emwh0r5wmfm3ssTWRQSyVhbOHvqegUDRd0WhmXKX2mkHJe1SFCMJhagUleMq+Uci34wLSipf8Lagt4LlpRFWQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.3.tgz", + "integrity": "sha512-pBHUx9LzXWBc7MFIEEL0yD/ZVtNgLytvx60gES28GcWMqil8ElCYR4kvbV2BDqsHOvVDRrOxGySBM9Fcv744hw==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.3.tgz", + "integrity": "sha512-Czi8yzXUWIQYAtL/2y6vogER8pvcsOsk5cpwL4Gk5nJqH5UZiVByIY8Eorm5R13gq+DQKYg0+JyQoytLQas4dA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.3.tgz", + "integrity": "sha512-sDpk0RgmTCR/5HguIZa9n9u+HVKf40fbEUt+iTzSnCaGvY9kFP0YKBWZtJaraonFnqef5SlJ8/TiPAxzyS+UoA==", + "cpu": [ + "arm64" + ], + "dev": true, + 
"license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.3.tgz", + "integrity": "sha512-P14lFKJl/DdaE00LItAukUdZO5iqNH7+PjoBm+fLQjtxfcfFE20Xf5CrLsmZdq5LFFZzb5JMZ9grUwvtVYzjiA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.3.tgz", + "integrity": "sha512-AIcMP77AvirGbRl/UZFTq5hjXK+2wC7qFRGoHSDrZ5v5b8DK/GYpXW3CPRL53NkvDqb9D+alBiC/dV0Fb7eJcw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.3.tgz", + "integrity": "sha512-DnW2sRrBzA+YnE70LKqnM3P+z8vehfJWHXECbwBmH/CU51z6FiqTQTHFenPlHmo3a8UgpLyH3PT+87OViOh1AQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.3.tgz", + "integrity": "sha512-NinAEgr/etERPTsZJ7aEZQvvg/A6IsZG/LgZy+81wON2huV7SrK3e63dU0XhyZP4RKGyTm7aOgmQk0bGp0fy2g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.3.tgz", + "integrity": 
"sha512-PanZ+nEz+eWoBJ8/f8HKxTTD172SKwdXebZ0ndd953gt1HRBbhMsaNqjTyYLGLPdoWHy4zLU7bDVJztF5f3BHA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.3.tgz", + "integrity": "sha512-B2t59lWWYrbRDw/tjiWOuzSsFh1Y/E95ofKz7rIVYSQkUYBjfSgf6oeYPNWHToFRr2zx52JKApIcAS/D5TUBnA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.3.tgz", + "integrity": "sha512-QLKSFeXNS8+tHW7tZpMtjlNb7HKau0QDpwm49u0vUp9y1WOF+PEzkU84y9GqYaAVW8aH8f3GcBck26jh54cX4Q==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.3.tgz", + "integrity": "sha512-4uJGhsxuptu3OcpVAzli+/gWusVGwZZHTlS63hh++ehExkVT8SgiEf7/uC/PclrPPkLhZqGgCTjd0VWLo6xMqA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@img/colour": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.1.0.tgz", + "integrity": "sha512-Td76q7j57o/tLVdgS746cYARfSyxk8iEfRxewL9h4OMzYhbW4TAcppl0mT4eyqXddh6L/jwoM75mo7ixa/pCeQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + } + }, + "node_modules/@img/sharp-darwin-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz", + "integrity": 
"sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-darwin-arm64": "1.2.4" + } + }, + "node_modules/@img/sharp-darwin-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz", + "integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-darwin-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-libvips-darwin-arm64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz", + "integrity": "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "darwin" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-darwin-x64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz", + "integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "darwin" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + 
}, + "node_modules/@img/sharp-libvips-linux-arm": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz", + "integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==", + "cpu": [ + "arm" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-arm64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz", + "integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==", + "cpu": [ + "arm64" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-ppc64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.4.tgz", + "integrity": "sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-riscv64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-riscv64/-/sharp-libvips-linux-riscv64-1.2.4.tgz", + "integrity": "sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + 
], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-s390x": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.4.tgz", + "integrity": "sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-x64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz", + "integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==", + "cpu": [ + "x64" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linuxmusl-arm64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz", + "integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==", + "cpu": [ + "arm64" + ], + "dev": true, + "libc": [ + "musl" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linuxmusl-x64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz", + "integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==", + "cpu": [ + "x64" + ], + "dev": true, + "libc": [ + "musl" + ], + 
"license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-linux-arm": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz", + "integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==", + "cpu": [ + "arm" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-arm": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz", + "integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==", + "cpu": [ + "arm64" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-arm64": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-ppc64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.5.tgz", + "integrity": "sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + 
"@img/sharp-libvips-linux-ppc64": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-riscv64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-riscv64/-/sharp-linux-riscv64-0.34.5.tgz", + "integrity": "sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw==", + "cpu": [ + "riscv64" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-riscv64": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-s390x": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.5.tgz", + "integrity": "sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg==", + "cpu": [ + "s390x" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-s390x": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz", + "integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==", + "cpu": [ + "x64" + ], + "dev": true, + "libc": [ + "glibc" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-linuxmusl-arm64": { + "version": 
"0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz", + "integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==", + "cpu": [ + "arm64" + ], + "dev": true, + "libc": [ + "musl" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" + } + }, + "node_modules/@img/sharp-linuxmusl-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz", + "integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==", + "cpu": [ + "x64" + ], + "dev": true, + "libc": [ + "musl" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linuxmusl-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-wasm32": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.5.tgz", + "integrity": "sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw==", + "cpu": [ + "wasm32" + ], + "dev": true, + "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT", + "optional": true, + "dependencies": { + "@emnapi/runtime": "^1.7.0" + }, + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-win32-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz", + "integrity": 
"sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-win32-ia32": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.5.tgz", + "integrity": "sha512-FV9m/7NmeCmSHDD5j4+4pNI8Cp3aW+JvLoXcTUo0IqyjSfAZJ8dIUmijx1qaJsIiU+Hosw6xM5KijAWRJCSgNg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-win32-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz", + "integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": 
"sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "dev": true, + "license": "MIT" + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.9", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz", + "integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.0.3", + "@jridgewell/sourcemap-codec": "^1.4.10" + } + }, + "node_modules/@poppinss/colors": { + "version": "4.1.6", + "resolved": "https://registry.npmjs.org/@poppinss/colors/-/colors-4.1.6.tgz", + "integrity": "sha512-H9xkIdFswbS8n1d6vmRd8+c10t2Qe+rZITbbDHHkQixH5+2x1FDGmi/0K+WgWiqQFKPSlIYB7jlH6Kpfn6Fleg==", + "dev": true, + "license": "MIT", + "dependencies": { + "kleur": "^4.1.5" + } + }, + "node_modules/@poppinss/dumper": { + "version": "0.6.5", + "resolved": "https://registry.npmjs.org/@poppinss/dumper/-/dumper-0.6.5.tgz", + "integrity": "sha512-NBdYIb90J7LfOI32dOewKI1r7wnkiH6m920puQ3qHUeZkxNkQiFnXVWoE6YtFSv6QOiPPf7ys6i+HWWecDz7sw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@poppinss/colors": "^4.1.5", + "@sindresorhus/is": "^7.0.2", + "supports-color": "^10.0.0" + } + }, + "node_modules/@poppinss/exception": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/@poppinss/exception/-/exception-1.2.3.tgz", + "integrity": "sha512-dCED+QRChTVatE9ibtoaxc+WkdzOSjYTKi/+uacHWIsfodVfpsueo3+DKpgU5Px8qXjgmXkSvhXvSCz3fnP9lw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@shmuggingface/core": { + "version": "1.0.2", + "resolved": "https://github.com/ShmuggingFace/ShmuggingFaceCore/archive/refs/tags/v1.0.2.tar.gz", + "integrity": "sha512-xXvRORLF6SLeRvknOrJdL8E2faY8hjk2Droq0AsAam5uPziD5YoKY0M4+6yjZZM9TI2MWixJERjMDAk1Jp0Cdw==", + "license": "MIT", + "bin": { + "shmuggingface": "bin/shmuggingface.mjs" + }, + "engines": { + "node": ">=20" + 
} + }, + "node_modules/@sindresorhus/is": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-7.2.0.tgz", + "integrity": "sha512-P1Cz1dWaFfR4IR+U13mqqiGsLFf1KbayybWwdd2vfctdV6hDpUkgCY0nKOLLTMSoRd/jJNjtbqzf13K8DCCXQw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sindresorhus/is?sponsor=1" + } + }, + "node_modules/@speed-highlight/core": { + "version": "1.2.15", + "resolved": "https://registry.npmjs.org/@speed-highlight/core/-/core-1.2.15.tgz", + "integrity": "sha512-BMq1K3DsElxDWawkX6eLg9+CKJrTVGCBAWVuHXVUV2u0s2711qiChLSId6ikYPfxhdYocLNt3wWwSvDiTvFabw==", + "dev": true, + "license": "CC0-1.0" + }, + "node_modules/blake3-wasm": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/blake3-wasm/-/blake3-wasm-2.1.5.tgz", + "integrity": "sha512-F1+K8EbfOZE49dtoPtmxUQrpXaBIl3ICvasLh+nJta0xkz+9kF/7uet9fLnwKqhDrmj6g+6K3Tw9yQPUg2ka5g==", + "dev": true, + "license": "MIT" + }, + "node_modules/cookie": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-1.1.1.tgz", + "integrity": "sha512-ei8Aos7ja0weRpFzJnEA9UHJ/7XQmqglbRwnf2ATjcB9Wq874VKH9kfjjirM6UhU2/E5fFYadylyhFldcqSidQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/detect-libc": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", + "integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=8" + } + }, + "node_modules/error-stack-parser-es": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/error-stack-parser-es/-/error-stack-parser-es-1.0.5.tgz", + "integrity": 
"sha512-5qucVt2XcuGMcEGgWI7i+yZpmpByQ8J1lHhcL7PwqCwu9FPP3VUXzT4ltHe5i2z9dePwEHcDVOAfSnHsOlCXRA==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/esbuild": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.3.tgz", + "integrity": "sha512-8VwMnyGCONIs6cWue2IdpHxHnAjzxnw2Zr7MkVxB2vjmQ2ivqGFb4LEG3SMnv0Gb2F/G/2yA8zUaiL1gywDCCg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.27.3", + "@esbuild/android-arm": "0.27.3", + "@esbuild/android-arm64": "0.27.3", + "@esbuild/android-x64": "0.27.3", + "@esbuild/darwin-arm64": "0.27.3", + "@esbuild/darwin-x64": "0.27.3", + "@esbuild/freebsd-arm64": "0.27.3", + "@esbuild/freebsd-x64": "0.27.3", + "@esbuild/linux-arm": "0.27.3", + "@esbuild/linux-arm64": "0.27.3", + "@esbuild/linux-ia32": "0.27.3", + "@esbuild/linux-loong64": "0.27.3", + "@esbuild/linux-mips64el": "0.27.3", + "@esbuild/linux-ppc64": "0.27.3", + "@esbuild/linux-riscv64": "0.27.3", + "@esbuild/linux-s390x": "0.27.3", + "@esbuild/linux-x64": "0.27.3", + "@esbuild/netbsd-arm64": "0.27.3", + "@esbuild/netbsd-x64": "0.27.3", + "@esbuild/openbsd-arm64": "0.27.3", + "@esbuild/openbsd-x64": "0.27.3", + "@esbuild/openharmony-arm64": "0.27.3", + "@esbuild/sunos-x64": "0.27.3", + "@esbuild/win32-arm64": "0.27.3", + "@esbuild/win32-ia32": "0.27.3", + "@esbuild/win32-x64": "0.27.3" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + 
"node_modules/kleur": { + "version": "4.1.5", + "resolved": "https://registry.npmjs.org/kleur/-/kleur-4.1.5.tgz", + "integrity": "sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/miniflare": { + "version": "4.20260526.0", + "resolved": "https://registry.npmjs.org/miniflare/-/miniflare-4.20260526.0.tgz", + "integrity": "sha512-JYQ7jPZZWoaaj9jWHb8Ucp6Cu2SbDVqIsAJhumqdzzLkkfq0pYkDeino/sZfW1ixJWPjv/C44zjm9gVJC2izCA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@cspotcode/source-map-support": "0.8.1", + "sharp": "^0.34.5", + "undici": "7.24.8", + "workerd": "1.20260526.1", + "ws": "8.20.1", + "youch": "4.1.0-beta.10" + }, + "bin": { + "miniflare": "bootstrap.js" + }, + "engines": { + "node": ">=22.0.0" + } + }, + "node_modules/path-to-regexp": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-6.3.0.tgz", + "integrity": "sha512-Yhpw4T9C6hPpgPeA28us07OJeqZ5EzQTkbfwuhsUg0c237RomFoETJgmp2sa3F/41gfLE6G5cqcYwznmeEeOlQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/pathe": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/pathe/-/pathe-2.0.3.tgz", + "integrity": "sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==", + "dev": true, + "license": "MIT" + }, + "node_modules/rosie-skills": { + "version": "0.6.4", + "resolved": "https://registry.npmjs.org/rosie-skills/-/rosie-skills-0.6.4.tgz", + "integrity": "sha512-ojfhSiQRdZ2QyWbmKAHOSAUbaLYrTc5zIH7mS1jKoP8KCFSQddwVhMyFqldckTeybTfW3zNcsZzyOTzGTN1SBA==", + "dev": true, + "license": "BSD-3-Clause", + "bin": { + "rosie-skills": "dist/bin.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "rosie-skills-darwin-arm64": "0.6.4", + "rosie-skills-freebsd-x64": "0.6.4", + "rosie-skills-linux-x64": "0.6.4" + } + }, + 
"node_modules/rosie-skills-darwin-arm64": { + "version": "0.6.4", + "resolved": "https://registry.npmjs.org/rosie-skills-darwin-arm64/-/rosie-skills-darwin-arm64-0.6.4.tgz", + "integrity": "sha512-rn1s5hqFKcxeiDEWWoFa1hdGPshR8TkwHLzy/cBavb9XJNAaUxbe3oQ78W9sQkRHAgRyzJYyk9tw68Qrdnizgg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "BSD-3-Clause", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/rosie-skills-freebsd-x64": { + "version": "0.6.4", + "resolved": "https://registry.npmjs.org/rosie-skills-freebsd-x64/-/rosie-skills-freebsd-x64-0.6.4.tgz", + "integrity": "sha512-SxCRduPBMtfjkQ+q56Yw9OLA3PyaqoALzt7kER7IDKuUVfM2O/1w8sa5xhTDiCvWkZJixnH5d5Ya6KT+/Mwcng==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "BSD-3-Clause", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/rosie-skills-linux-x64": { + "version": "0.6.4", + "resolved": "https://registry.npmjs.org/rosie-skills-linux-x64/-/rosie-skills-linux-x64-0.6.4.tgz", + "integrity": "sha512-D9Y9mfu7goB0s0X59uU3hcFeUTef3VbpCIDwFMzyvJrAq3XhRACWBDMHQsHlyWdHxTXPX/ILyW65RXyrJlgqng==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "BSD-3-Clause", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/semver": { + "version": "7.8.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.8.1.tgz", + "integrity": "sha512-rkVq3IXh+4FDGch+KwzX3aV9W3kO54GyEgpvBzSyctDA6Xtd7RJQV1xmXbeQp5v7+VzLOfVqiutSE6GICgPFvg==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/sharp": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz", + "integrity": "sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==", + "dev": true, + "hasInstallScript": true, + "license": "Apache-2.0", + "dependencies": { + "@img/colour": "^1.0.0", + "detect-libc": "^2.1.2", + "semver": "^7.7.3" + }, + "engines": { + "node": 
"^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-darwin-arm64": "0.34.5", + "@img/sharp-darwin-x64": "0.34.5", + "@img/sharp-libvips-darwin-arm64": "1.2.4", + "@img/sharp-libvips-darwin-x64": "1.2.4", + "@img/sharp-libvips-linux-arm": "1.2.4", + "@img/sharp-libvips-linux-arm64": "1.2.4", + "@img/sharp-libvips-linux-ppc64": "1.2.4", + "@img/sharp-libvips-linux-riscv64": "1.2.4", + "@img/sharp-libvips-linux-s390x": "1.2.4", + "@img/sharp-libvips-linux-x64": "1.2.4", + "@img/sharp-libvips-linuxmusl-arm64": "1.2.4", + "@img/sharp-libvips-linuxmusl-x64": "1.2.4", + "@img/sharp-linux-arm": "0.34.5", + "@img/sharp-linux-arm64": "0.34.5", + "@img/sharp-linux-ppc64": "0.34.5", + "@img/sharp-linux-riscv64": "0.34.5", + "@img/sharp-linux-s390x": "0.34.5", + "@img/sharp-linux-x64": "0.34.5", + "@img/sharp-linuxmusl-arm64": "0.34.5", + "@img/sharp-linuxmusl-x64": "0.34.5", + "@img/sharp-wasm32": "0.34.5", + "@img/sharp-win32-arm64": "0.34.5", + "@img/sharp-win32-ia32": "0.34.5", + "@img/sharp-win32-x64": "0.34.5" + } + }, + "node_modules/supports-color": { + "version": "10.2.2", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-10.2.2.tgz", + "integrity": "sha512-SS+jx45GF1QjgEXQx4NJZV9ImqmO2NPz5FNsIHrsDjh2YsHnawpan7SNQ1o8NuhrbHZy9AZhIoCUiCeaW/C80g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/chalk/supports-color?sponsor=1" + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "dev": true, + "license": "0BSD", + "optional": true + }, + "node_modules/undici": { + "version": "7.24.8", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.24.8.tgz", + "integrity": 
"sha512-6KQ/+QxK49Z/p3HO6E5ZCZWNnCasyZLa5ExaVYyvPxUwKtbCPMKELJOqh7EqOle0t9cH/7d2TaaTRRa6Nhs4YQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=20.18.1" + } + }, + "node_modules/unenv": { + "version": "2.0.0-rc.24", + "resolved": "https://registry.npmjs.org/unenv/-/unenv-2.0.0-rc.24.tgz", + "integrity": "sha512-i7qRCmY42zmCwnYlh9H2SvLEypEFGye5iRmEMKjcGi7zk9UquigRjFtTLz0TYqr0ZGLZhaMHl/foy1bZR+Cwlw==", + "dev": true, + "license": "MIT", + "dependencies": { + "pathe": "^2.0.3" + } + }, + "node_modules/workerd": { + "version": "1.20260526.1", + "resolved": "https://registry.npmjs.org/workerd/-/workerd-1.20260526.1.tgz", + "integrity": "sha512-IHzymht98p10JH1zzwdCpbViAqw97HrwKl7+KfZeASFMsYSrIsAULWdPn0LRC5FTUzBpamLNyKCCKxbgXHgRHQ==", + "dev": true, + "hasInstallScript": true, + "license": "Apache-2.0", + "bin": { + "workerd": "bin/workerd" + }, + "engines": { + "node": ">=16" + }, + "optionalDependencies": { + "@cloudflare/workerd-darwin-64": "1.20260526.1", + "@cloudflare/workerd-darwin-arm64": "1.20260526.1", + "@cloudflare/workerd-linux-64": "1.20260526.1", + "@cloudflare/workerd-linux-arm64": "1.20260526.1", + "@cloudflare/workerd-windows-64": "1.20260526.1" + } + }, + "node_modules/wrangler": { + "version": "4.95.0", + "resolved": "https://registry.npmjs.org/wrangler/-/wrangler-4.95.0.tgz", + "integrity": "sha512-vgXzFVSCdUbeCadgVXvu8fK5tzNm8T9W+7lriyGWZMx0B1+CAdr4d8JTlZszHfgjypRAHmAxb49etZGIRD9pgg==", + "dev": true, + "license": "MIT OR Apache-2.0", + "dependencies": { + "@cloudflare/kv-asset-handler": "0.5.0", + "@cloudflare/unenv-preset": "2.16.1", + "blake3-wasm": "2.1.5", + "esbuild": "0.27.3", + "miniflare": "4.20260526.0", + "path-to-regexp": "6.3.0", + "rosie-skills": "^0.6.3", + "unenv": "2.0.0-rc.24", + "workerd": "1.20260526.1" + }, + "bin": { + "wrangler": "bin/wrangler.js", + "wrangler2": "bin/wrangler.js" + }, + "engines": { + "node": ">=22.0.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + }, + "peerDependencies": { + 
"@cloudflare/workers-types": "^4.20260526.1" + }, + "peerDependenciesMeta": { + "@cloudflare/workers-types": { + "optional": true + } + } + }, + "node_modules/ws": { + "version": "8.20.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz", + "integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, + "node_modules/youch": { + "version": "4.1.0-beta.10", + "resolved": "https://registry.npmjs.org/youch/-/youch-4.1.0-beta.10.tgz", + "integrity": "sha512-rLfVLB4FgQneDr0dv1oddCVZmKjcJ6yX6mS4pU82Mq/Dt9a3cLZQ62pDBL4AUO+uVrCvtWz3ZFUL2HFAFJ/BXQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@poppinss/colors": "^4.1.5", + "@poppinss/dumper": "^0.6.4", + "@speed-highlight/core": "^1.2.7", + "cookie": "^1.0.2", + "youch-core": "^0.3.3" + } + }, + "node_modules/youch-core": { + "version": "0.3.3", + "resolved": "https://registry.npmjs.org/youch-core/-/youch-core-0.3.3.tgz", + "integrity": "sha512-ho7XuGjLaJ2hWHoK8yFnsUGy2Y5uDpqSTq1FkHLK4/oqKtyUU1AFbOOxY4IpC9f0fTLjwYbslUz0Po5BpD1wrA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@poppinss/exception": "^1.2.2", + "error-stack-parser-es": "^1.0.5" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..b8599c0 --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "leadforge-shmuggingface-tooling", + "private": true, + "description": "Node tooling for the LeadForge ShmuggingFace review site (not published to npm).", + "type": "module", + "dependencies": { + "@shmuggingface/core": "https://github.com/ShmuggingFace/ShmuggingFaceCore/archive/refs/tags/v1.0.2.tar.gz" + }, + "devDependencies": { + "wrangler": "latest" 
+ }, + "engines": { + "node": ">=20" + } +} diff --git a/scripts/build_public_release.py b/scripts/build_public_release.py index 5ccb152..ab86a7a 100644 --- a/scripts/build_public_release.py +++ b/scripts/build_public_release.py @@ -72,6 +72,14 @@ def write_flat_csv(bundle_dir: Path) -> Path: leakage-risk columns from student_public task splits before they hit disk. The flat CSV is built only for student_public bundles (see ``main()``) and inherits that redaction transitively. + + Also prepends a ``split`` row to ``feature_dictionary.csv`` so the + column spec covers every column in ``lead_scoring.csv``. The + ``split`` column is added here (not in the core bundle writer) + because it only exists in the flat convenience CSV — the Parquet + task splits do not carry it. ``feature_dictionary.csv`` is not + hashed in ``manifest.json``, so this edit does not invalidate the + bundle integrity hashes. """ task_dir = bundle_dir / "tasks" / "converted_within_90_days" frames = [] @@ -84,6 +92,31 @@ def write_flat_csv(bundle_dir: Path) -> Path: merged = pd.concat(frames, ignore_index=True) csv_path = bundle_dir / "lead_scoring.csv" merged.to_csv(csv_path, index=False) + + # Prepend split row to feature_dictionary.csv. + fd_path = bundle_dir / "feature_dictionary.csv" + if fd_path.exists(): + fd = pd.read_csv(fd_path) + if "split" not in fd["name"].values: + split_row = pd.DataFrame( + [ + { + "name": "split", + "dtype": "string", + "description": ( + "Partition label: 'train', 'valid', or 'test'. " + "Present in lead_scoring.csv only; the Parquet task " + "splits are already partitioned by filename." 
+ ), + "category": "split_metadata", + "is_target": False, + "leakage_risk": False, + } + ] + ) + fd = pd.concat([split_row, fd], ignore_index=True) + fd.to_csv(fd_path, index=False) + return csv_path diff --git a/scripts/build_shmuggingface_site.py b/scripts/build_shmuggingface_site.py new file mode 100644 index 0000000..c597582 --- /dev/null +++ b/scripts/build_shmuggingface_site.py @@ -0,0 +1,606 @@ +#!/usr/bin/env python3 +"""Build a ShmuggingFace review minisite from leadforge release artifacts. + +Reads the three public release tiers (intro / intermediate / advanced), +renders each tier's ``dataset_card.md`` to HTML, and generates a static +site via ShmuggingFaceCore that mirrors how the dataset will look on +Kaggle and Hugging Face. The site can then be deployed to Cloudflare +Pages. + +Usage:: + + python scripts/build_shmuggingface_site.py [OPTIONS] + +Options +------- +--release-dir PATH + Root of the release directory. Default: ``release/``. +--out-dir PATH + Output directory for the generated static site. + Default: ``release/_shmuggingface/dist``. +--smf-core PATH + Path to a local ShmuggingFaceCore checkout. Overrides the default, + which is the npm-installed package at ``node_modules/@shmuggingface/core`` + (pinned to v1.0.2 via ``package.json``). Run ``npm install`` first. +--deploy + Deploy the built site to Cloudflare Pages after building. +--production + With ``--deploy``: push to the production slot (``--branch main``). + Default (without this flag) is a branch preview + (``--branch preview``). Using ``--production`` intentionally + requires a separate flag so a local run never clobbers the live + site by accident. +--cf-env PATH + Cloudflare env file to source before wrangler. + Default: ``~/.config/adanim/cloudflare_api_token.env``. +--project-name NAME + Cloudflare Pages project name. + Default: ``leadforge-lead-scoring-v1-preview``.
+""" + +from __future__ import annotations + +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +try: + from markdown_it import MarkdownIt +except ImportError: + sys.exit("markdown-it-py is required: pip install -e '.[publish]'") + +import pandas as pd + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +TIERS = ["intro", "intermediate", "advanced"] +TASK = "converted_within_90_days" + +GITHUB_BLOB_BASE = "https://github.com/leadforge-dev/leadforge/blob/main" +GITHUB_BLOB_RELEASE = f"{GITHUB_BLOB_BASE}/release" +# Pinned via package.json → package-lock.json; `npm install` resolves it. +SMF_CORE_NPM = Path(__file__).parent.parent / "node_modules/@shmuggingface/core" +DEFAULT_CF_ENV = Path.home() / ".config/adanim/cloudflare_api_token.env" +DEFAULT_PROJECT = "leadforge-lead-scoring-v1-preview" + +TIER_LABEL = {"intro": "Intro", "intermediate": "Intermediate", "advanced": "Advanced"} + +DISCUSSIONS = [ + ( + "What is `snapshot_day = 30` and how does it affect which features are valid" + " at inference time?" + ), + "Is `total_touches_all` a safe feature or a time-window leakage trap?", + "LR and GBM AUCs are very close across tiers — does relational feature engineering help?", + "How would you set a probability threshold for a team that can only work 50 leads per week?", + "What happens to AUC when you evaluate on a chronological hold-out instead of a random split?", +] + +# --------------------------------------------------------------------------- +# Markdown rendering +# --------------------------------------------------------------------------- + +# Rewrite patterns for markdown links that would 404 on the static host. +# Relative links of the form ``[text](../foo)`` stay relative to the +# release tree root on GitHub. 
Plain ``[LICENSE](LICENSE)`` and other +# bare-name links need an explicit GitHub blob URL. +_PARENT_LINK_RE = re.compile(r"\]\(\.\./([^)]+)\)") +_VALIDATION_LINK_RE = re.compile(r"\]\(validation/validation_report\.md\)") +# Bare relative links: match ``](word.ext)`` or ``](word)`` NOT starting +# with http/https/# (those are already absolute or anchors). +_BARE_RELATIVE_LINK_RE = re.compile(r"\]\((?!https?://|#)([^/][^)]*)\)") + + +def _rewrite_links(text: str, github_base: str) -> str: + """Rewrite relative markdown links to absolute GitHub blob URLs. + + Three classes handled: + 1. ``[text](../foo)`` → ``[text]({GITHUB_BLOB_BASE}/foo)`` (parent-dir links) + 2. ``[text](validation/validation_report.md)`` → absolute blob URL + 3. ``[text](bare-name)`` → ``[text]({github_base}/bare-name)`` + (bare relative links like ``[LICENSE](LICENSE)`` that would 404) + """ + text = _PARENT_LINK_RE.sub(rf"]({GITHUB_BLOB_BASE}/\1)", text) + text = _VALIDATION_LINK_RE.sub( + f"]({GITHUB_BLOB_BASE}/release/validation/validation_report.md)", text + ) + text = _BARE_RELATIVE_LINK_RE.sub(rf"]({github_base}/\1)", text) + return text + + +def _render_md(text: str) -> str: + """Render ``text`` (markdown) to HTML.""" + return MarkdownIt("gfm-like").disable("linkify").render(text) + + +def render_tier_html(tier_dir: Path) -> str: + """Render a tier's ``dataset_card.md`` to HTML with link rewriting. + + Each tier ships its own ``dataset_card.md`` (auto-generated by + ``leadforge/narrative/dataset_card.py``). Using the per-tier card + rather than the global ``release/README.md`` means the description + shown for each tier is specific to that tier — not a copy of the + global README embedded three times. + + Relative links inside the card are prefixed with the GitHub blob + URL for the tier's directory so they resolve correctly on the + static preview host.
+ """ + card_path = tier_dir / "dataset_card.md" + text = card_path.read_text(encoding="utf-8") + tier_name = tier_dir.name # "intro" / "intermediate" / "advanced" + github_tier_base = f"{GITHUB_BLOB_RELEASE}/{tier_name}" + text = _rewrite_links(text, github_tier_base) + return _render_md(text) + + +# --------------------------------------------------------------------------- +# Tier metadata loading +# --------------------------------------------------------------------------- + + +def _require(d: dict, key: str, context: str) -> object: + """Return ``d[key]``, raising ``KeyError`` with context on miss. + + Silent ``dict.get()`` defaults produce plausible-but-false preview + pages when a required manifest / metrics field is absent or + renamed. Raising here catches schema drift at build time rather + than silently misrepresenting the dataset. + """ + if key not in d: + raise KeyError( + f"Required field {key!r} missing from {context}. " + "Was the bundle regenerated with a different schema version?" + ) + return d[key] + + +def load_tier(release_dir: Path, tier: str) -> dict: + """Load manifest, metrics, feature dictionary, and sample rows for one tier.""" + tier_dir = release_dir / tier + + manifest_raw = (tier_dir / "manifest.json").read_text() + manifest = json.loads(manifest_raw) + ctx_manifest = f"{tier}/manifest.json" + + metrics_raw = (tier_dir / "metrics.json").read_text() + metrics = json.loads(metrics_raw) + + fd = pd.read_csv(tier_dir / "feature_dictionary.csv") + # ``split`` is the first column in ``lead_scoring.csv`` (added by + # ``build_public_release.py``). Older bundles built before PR 8.4 + # won't have it in their feature_dictionary.csv; newer ones will. + # Normalise here: always put ``split`` exactly once at the front. + fd_names = list(fd["name"]) + other_cols = [c for c in fd_names if c != "split"] + columns = ["split"] + other_cols + + df = pd.read_csv(tier_dir / "lead_scoring.csv") + # Stringify every cell so JSON serialisation is clean. 
+ sample_rows = [ + {k: ("" if str(v) in ("nan", "None") else str(v)) for k, v in row.items()} + for row in df.head(8).to_dict("records") + ] + + return { + "tier": tier, + "tier_dir": tier_dir, + "task_dir": tier_dir / "tasks" / TASK, + "manifest": manifest, + "ctx_manifest": ctx_manifest, + "metrics": metrics, + "columns": columns, + "sample_rows": sample_rows, + "n_rows": int(df.shape[0]), + } + + +# --------------------------------------------------------------------------- +# Config building +# --------------------------------------------------------------------------- + + +def _rel(path: Path, from_dir: Path) -> str: + """Relative POSIX path from from_dir to path.""" + return os.path.relpath(path, from_dir).replace(os.sep, "/") + + +def make_dataset_config(tier_data: dict, config_dir: Path) -> dict: + """Build a ShmuggingFace dataset config dict for one tier. + + Each tier page shows its own ``dataset_card.md`` as the description + body (rendered to HTML here), keeping the per-tier copy in sync with + the published card without duplicating the global README three times. + """ + tier = tier_data["tier"] + tier_dir = tier_data["tier_dir"] + task_dir = tier_data["task_dir"] + manifest = tier_data["manifest"] + ctx_manifest = tier_data["ctx_manifest"] + metrics = tier_data["metrics"] + label = TIER_LABEL[tier] + medians = metrics.get("medians", {}) + + cr = medians.get("conversion_rate_test", 0.0) + lr_auc = medians.get("lr_auc", 0.0) + # These fields are required — raise immediately on schema drift rather + # than silently defaulting to plausible-but-false values. + n_leads = int(_require(manifest, "n_leads", ctx_manifest)) + snapshot_day = int(_require(manifest, "snapshot_day", ctx_manifest)) + + task_info_all = _require(manifest, "tasks", ctx_manifest) + if not isinstance(task_info_all, dict) or TASK not in task_info_all: + raise KeyError( + f"Task {TASK!r} not found in {ctx_manifest}['tasks']. " + "Bundle may have been generated with a different task name." 
+ ) + task_info = task_info_all[TASK] + train_rows = int(_require(task_info, "train_rows", f"{ctx_manifest}[tasks][{TASK}]")) + valid_rows = int(_require(task_info, "valid_rows", f"{ctx_manifest}[tasks][{TASK}]")) + test_rows = int(_require(task_info, "test_rows", f"{ctx_manifest}[tasks][{TASK}]")) + + def kb(path: Path) -> str: + return f"{max(1, path.stat().st_size // 1024)} KB" + + files = [ + { + "path": "lead_scoring.csv", + "size": kb(tier_dir / "lead_scoring.csv"), + "kind": "CSV", + "sourcePath": _rel(tier_dir / "lead_scoring.csv", config_dir), + "about": ( + f"Flat ML-ready snapshot CSV: {n_leads:,} leads × " + f"{len(tier_data['columns'])} columns (including 'split'), " + f"snapshot day {snapshot_day}. The 'split' column " + f"(train / valid / test) lets conventional ML workflows load " + f"a single file." + ), + }, + { + "path": "feature_dictionary.csv", + "size": kb(tier_dir / "feature_dictionary.csv"), + "kind": "CSV", + "sourcePath": _rel(tier_dir / "feature_dictionary.csv", config_dir), + "about": ( + "Per-column documentation: dtype, analytical category, " + "leakage-risk flag, and plain-language description." + ), + }, + { + "path": "tasks/converted_within_90_days/train.parquet", + "size": kb(task_dir / "train.parquet"), + "kind": "Parquet", + "sourcePath": _rel(task_dir / "train.parquet", config_dir), + "about": ( + f"Training split — {train_rows:,} leads, " + f"stratified by conversion rate. Target column: " + f"`converted_within_90_days` (bool)." 
+ ), + }, + { + "path": "tasks/converted_within_90_days/valid.parquet", + "size": kb(task_dir / "valid.parquet"), + "kind": "Parquet", + "sourcePath": _rel(task_dir / "valid.parquet", config_dir), + "about": f"Validation split — {valid_rows:,} leads.", + }, + { + "path": "tasks/converted_within_90_days/test.parquet", + "size": kb(task_dir / "test.parquet"), + "kind": "Parquet", + "sourcePath": _rel(task_dir / "test.parquet", config_dir), + "about": (f"Test split — {test_rows:,} leads, held out for final evaluation only."), + }, + { + "path": "dataset_card.md", + "size": kb(tier_dir / "dataset_card.md"), + "kind": "Dataset card", + "sourcePath": _rel(tier_dir / "dataset_card.md", config_dir), + "about": "Auto-generated tier-specific dataset card.", + }, + ] + + cover_rel = _rel(tier_dir.parent / "dataset-cover-image.png", config_dir) + + # Per-tier description: use the tier's own dataset_card.md so each + # tier page is self-contained and matches what's published per-tier. + description_html = render_tier_html(tier_dir) + + return { + "slug": f"leadforge-lead-scoring-v1-{tier}", + "title": f"LeadForge Lead Scoring v1 — {label}", + "owner": "leadforge-dev", + "subtitle": ( + f"{label} difficulty · {n_leads:,} leads · ~{cr:.0%} conversion rate · " + f"LR AUC {lr_auc:.3f} (5-seed median)" + ), + "license": "MIT", + "task": "tabular-classification", + "language": "English", + "rowCount": n_leads, + "splits": ["train", "valid", "test"], + "subsets": [f"leadforge-lead-scoring-v1-{tier}"], + "coverImage": cover_rel, + "descriptionHtml": description_html, + "tags": [ + "tabular", + "lead-scoring", + "synthetic-data", + "crm", + "b2b", + "datasets", + "pandas", + tier, + ], + "columns": tier_data["columns"], + "rows": tier_data["sample_rows"], + "files": files, + "discussions": DISCUSSIONS, + "downloads": "0", + "likes": "0", + } + + +# --------------------------------------------------------------------------- +# Config file writing +# 
--------------------------------------------------------------------------- + + +def write_config(site_config: dict, datasets: list[dict], config_path: Path) -> None: + """Write shmuggingface.config.mjs.""" + full_config = {"site": site_config, "datasets": datasets} + config_json = json.dumps(full_config, indent=2, ensure_ascii=False) + config_path.write_text(f"export default {config_json};\n", encoding="utf-8") + print(f" Config → {config_path}", file=sys.stderr) + + +# --------------------------------------------------------------------------- +# ShmuggingFaceCore management +# --------------------------------------------------------------------------- + + +def ensure_smf_core(smf_core: Path | None) -> Path: + """Return path to a working ShmuggingFaceCore installation. + + Resolution order: + 1. ``--smf-core PATH`` override (for local dev / CI with a custom checkout). + 2. npm-installed package at ``node_modules/@shmuggingface/core`` — the + canonical path when ``npm install`` has been run from the repo root + (pinned to v1.0.2 via ``package.json`` / ``package-lock.json``). + + Exits with an informative error if neither source is available. + """ + if smf_core is not None: + entry = smf_core / "bin/shmuggingface.mjs" + if not entry.exists(): + sys.exit(f"ShmuggingFaceCore entry point not found at {entry}") + return smf_core + + entry = SMF_CORE_NPM / "bin/shmuggingface.mjs" + if entry.exists(): + pkg = SMF_CORE_NPM / "package.json" + version = json.loads(pkg.read_text()).get("version", "unknown") + print(f" Using npm-installed @shmuggingface/core v{version}", file=sys.stderr) + return SMF_CORE_NPM + + sys.exit( + "ShmuggingFaceCore not found.\n" + f" Expected npm installation at: {SMF_CORE_NPM}\n" + " Run `npm install` from the repo root to install the pinned v1.0.1 release,\n" + " or pass --smf-core PATH to a local checkout."
+ ) + + +# --------------------------------------------------------------------------- +# Build and deploy +# --------------------------------------------------------------------------- + + +def build_site(config_path: Path, out_dir: Path, smf_core: Path) -> None: + """Run the ShmuggingFaceCore generator.""" + out_dir.mkdir(parents=True, exist_ok=True) + print(f" Building static site → {out_dir}", file=sys.stderr) + subprocess.run( # noqa: S603 + [ # noqa: S607 — trusted local tool, path from npm install + "node", + str(smf_core / "bin/shmuggingface.mjs"), + "build", + "--config", + str(config_path), + "--out", + str(out_dir), + ], + check=True, + ) + + +def _load_cf_env(cf_env_path: Path) -> dict: + """Parse a shell env file and return a dict of variable overrides.""" + env = os.environ.copy() + for raw_line in cf_env_path.read_text().splitlines(): + line = raw_line.strip() + if line.startswith("#") or not line: + continue + if line.startswith("export "): + line = line[len("export ") :] + if "=" in line: + key, _, val = line.partition("=") + env[key.strip()] = val.strip().strip("'\"") + return env + + +def deploy_site( + out_dir: Path, + project_name: str, + cf_env_path: Path, + *, + production: bool = False, +) -> None: + """Deploy the built site to Cloudflare Pages via wrangler. + + By default deploys to the ``preview`` branch slot so a routine + local run never clobbers the live production URL. Pass + ``production=True`` (``--production`` on the CLI) to push to the + ``main`` branch (the Cloudflare Pages production slot). 
+ """ + if not cf_env_path.exists(): + sys.exit( + f"Cloudflare env file not found: {cf_env_path}\n" + f"Expected format:\n" + f" export CLOUDFLARE_ACCOUNT_ID='...'\n" + f" export CLOUDFLARE_API_TOKEN='...'" + ) + + env = _load_cf_env(cf_env_path) + account_id = env.get("CLOUDFLARE_ACCOUNT_ID", "(not set)") + branch = "main" if production else "preview" + print( + f" Deploying to Cloudflare Pages\n" + f" project : {project_name}\n" + f" account : {account_id}\n" + f" branch : {branch} ({'production slot' if production else 'branch preview'})\n" + f" source : {out_dir}", + file=sys.stderr, + ) + result = subprocess.run( # noqa: S603 + [ # noqa: S607 — wrangler is a pinned devDependency, not user input + "wrangler", + "pages", + "deploy", + str(out_dir), + "--project-name", + project_name, + "--branch", + branch, + "--commit-dirty=true", # suppress the "uncommitted changes" warning + ], + env=env, + ) + if result.returncode != 0: + sys.exit(f"Deployment failed (wrangler exit code {result.returncode})") + + suffix = "" if production else "/preview" + print( + f"\n Live at: https://{project_name}.pages.dev{suffix}", + file=sys.stderr, + ) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + description="Build (and optionally deploy) the ShmuggingFace review minisite.", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--release-dir", + default="release", + type=Path, + metavar="PATH", + help="Root of the release directory (default: release/)", + ) + parser.add_argument( + "--out-dir", + type=Path, + metavar="PATH", + help="Output directory for the static site (default: release/_shmuggingface/dist)", + ) + parser.add_argument( + "--smf-core", + type=Path, + default=None, + metavar="PATH", + help=( + "Path to a local ShmuggingFaceCore 
checkout " + "(default: node_modules/@shmuggingface/core from `npm install`)" + ), + ) + parser.add_argument( + "--deploy", + action="store_true", + help="Deploy to Cloudflare Pages after building", + ) + parser.add_argument( + "--production", + action="store_true", + help=( + "With --deploy: push to the production slot (--branch main). " + "Default without this flag is a branch preview (--branch preview). " + "Requires explicit opt-in to prevent accidental production deploys." + ), + ) + parser.add_argument( + "--cf-env", + type=Path, + default=DEFAULT_CF_ENV, + metavar="PATH", + help=f"Cloudflare env file (default: {DEFAULT_CF_ENV})", + ) + parser.add_argument( + "--project-name", + default=DEFAULT_PROJECT, + metavar="NAME", + help=f"Cloudflare Pages project name (default: {DEFAULT_PROJECT})", + ) + args = parser.parse_args() + + release_dir = args.release_dir.resolve() + if not release_dir.is_dir(): + sys.exit(f"Release directory not found: {release_dir}") + + config_dir = release_dir / "_shmuggingface" + config_dir.mkdir(parents=True, exist_ok=True) + config_path = config_dir / "shmuggingface.config.mjs" + out_dir = args.out_dir.resolve() if args.out_dir else (config_dir / "dist") + + # --- Load tiers ---------------------------------------------------------- + print("Loading release tiers …", file=sys.stderr) + datasets = [] + for tier in TIERS: + print(f" {tier}", file=sys.stderr) + tier_data = load_tier(release_dir, tier) + ds = make_dataset_config(tier_data, config_dir) + datasets.append(ds) + + # --- Write config -------------------------------------------------------- + print("Writing shmuggingface.config.mjs …", file=sys.stderr) + site_config = { + "title": "LeadForge Lead Scoring v1 — Pre-Publication Review", + "owner": "leadforge-dev", + "visibility": "Pre-publication review mock — not yet live on Kaggle or Hugging Face", + "reviewerHint": ( + "Review the dataset card copy, metadata accuracy, file listings, column " + "preview, and download behaviour 
across all three difficulty tiers. " + "The Shmaggle tab mirrors the Kaggle page; the ShmuggingFace tab mirrors " + "the Hugging Face page. Flag anything that looks wrong before the real publish." + ), + } + write_config(site_config, datasets, config_path) + + # --- Ensure ShmuggingFaceCore -------------------------------------------- + smf_core = ensure_smf_core(args.smf_core) + + # --- Build --------------------------------------------------------------- + print("Building static site …", file=sys.stderr) + build_site(config_path, out_dir, smf_core) + print(f"Done. Site at: {out_dir}", file=sys.stderr) + + # --- Deploy -------------------------------------------------------------- + if args.deploy: + print("Deploying to Cloudflare Pages …", file=sys.stderr) + deploy_site(out_dir, args.project_name, args.cf_env, production=args.production) + + +if __name__ == "__main__": + main() diff --git a/tests/scripts/test_build_shmuggingface_site.py b/tests/scripts/test_build_shmuggingface_site.py new file mode 100644 index 0000000..cdcfb6e --- /dev/null +++ b/tests/scripts/test_build_shmuggingface_site.py @@ -0,0 +1,332 @@ +"""Smoke tests for ``scripts/build_shmuggingface_site.py``. + +Covers the three failure modes the self-review identified: + +* **Fabricated Kaggle metadata removed** — ``TIER_USABILITY`` and + ``TIER_MEDAL`` should not appear in the module. +* **Raising on missing manifest / metrics fields** — ``_require`` + should raise ``KeyError`` with a useful message when a required + field is absent; the module must NOT fall back silently to hardcoded + defaults. +* **Per-tier dataset card** — ``make_dataset_config`` must use each + tier's ``dataset_card.md`` as the description body, not the global + ``release/README.md``. +* **All three tier configs generated** — ``load_tier`` returns the + expected structure with a non-empty file list, and ``split`` is + prepended to the column listing. 
+* **``--branch preview`` is the default** — the ``deploy_site`` + function must pass ``--branch preview`` unless ``production=True``. +* **Stale ``isPrivate: true``** caught — injected bad metadata is + caught downstream by ``lint_platform_metadata`` (cross-checked in + ``test_lint_platform_metadata.py``); here we just verify the site + builder does NOT embed an ``isPrivate`` key. +""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "build_shmuggingface_site.py" +_REPO_ROOT = Path(__file__).resolve().parents[2] +_spec = importlib.util.spec_from_file_location("build_shmuggingface_site", _SCRIPT_PATH) +assert _spec is not None +assert _spec.loader is not None +smf = importlib.util.module_from_spec(_spec) +sys.modules["build_shmuggingface_site"] = smf +_spec.loader.exec_module(smf) + +_RELEASE_DIR = _REPO_ROOT / "release" +_RELEASE_BUNDLES_PRESENT = (_RELEASE_DIR / "intermediate" / "manifest.json").exists() + + +# --------------------------------------------------------------------------- +# Fabricated metadata removed +# --------------------------------------------------------------------------- + + +def test_no_tier_usability_constant() -> None: + """TIER_USABILITY must not exist — those values were fabricated.""" + assert not hasattr(smf, "TIER_USABILITY"), ( + "TIER_USABILITY is still present in build_shmuggingface_site.py. " + "These were fabricated Kaggle usability scores and must be removed." + ) + + +def test_no_tier_medal_constant() -> None: + """TIER_MEDAL must not exist — those values were fabricated.""" + assert not hasattr(smf, "TIER_MEDAL"), ( + "TIER_MEDAL is still present in build_shmuggingface_site.py. " + "These were fabricated Kaggle medal labels and must be removed." 
+ ) + + +def test_make_dataset_config_no_kaggle_usability(tmp_path: Path) -> None: + """The generated config dict must not include kaggleUsability or kaggleMedals.""" + if not _RELEASE_BUNDLES_PRESENT: + pytest.skip("release/intermediate bundle not present") + tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") + config = smf.make_dataset_config(tier_data, tmp_path) + assert "kaggleUsability" not in config, "make_dataset_config still emits kaggleUsability" + assert "kaggleMedals" not in config, "make_dataset_config still emits kaggleMedals" + + +# --------------------------------------------------------------------------- +# _require raises on missing keys +# --------------------------------------------------------------------------- + + +def test_require_present_key() -> None: + """_require returns the value when the key exists.""" + d = {"n_leads": 5000, "snapshot_day": 30} + assert smf._require(d, "n_leads", "test/manifest.json") == 5000 + + +def test_require_missing_key_raises() -> None: + """_require raises KeyError (not KeyError-silent-default) on miss.""" + d = {"snapshot_day": 30} + with pytest.raises(KeyError, match="n_leads"): + smf._require(d, "n_leads", "test/manifest.json") + + +def test_require_error_includes_context() -> None: + """_require error message includes the context string for debuggability.""" + d: dict = {} + with pytest.raises(KeyError, match="my_context"): + smf._require(d, "missing_key", "my_context") + + +# --------------------------------------------------------------------------- +# Per-tier dataset_card.md as description +# --------------------------------------------------------------------------- + + +def test_render_tier_html_uses_dataset_card(tmp_path: Path) -> None: + """render_tier_html reads dataset_card.md, not README.md.""" + card_content = "# Tier Card\n\nThis is the tier-specific card." + readme_content = "# Global README\n\nThis is the global README." 
+ (tmp_path / "dataset_card.md").write_text(card_content) + (tmp_path / "README.md").write_text(readme_content) + + html = smf.render_tier_html(tmp_path) + + assert "Tier Card" in html + assert "tier-specific card" in html + assert "Global README" not in html + assert "global README" not in html + + +def test_make_dataset_config_uses_per_tier_card(tmp_path: Path) -> None: + """make_dataset_config embeds the tier card, not the global README.""" + if not _RELEASE_BUNDLES_PRESENT: + pytest.skip("release/intermediate bundle not present") + + tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") + # The description HTML must contain content from dataset_card.md. + # The per-tier card ships a tier-specific header — check for that. + config = smf.make_dataset_config(tier_data, tmp_path) + html = config["descriptionHtml"] + assert isinstance(html, str), "descriptionHtml must be a string" + assert len(html) > 200, "descriptionHtml is missing or suspiciously short" + # The global README starts with a heading containing "leadforge-lead-scoring-v1" + # and is embedded verbatim when the old path is used. The dataset_card.md + # is a per-tier doc that leads with the tier name. 
+ assert "Intermediate" in html or "intermediate" in html, ( + "Per-tier card HTML does not mention the tier — may be using the wrong source document" + ) + + +# --------------------------------------------------------------------------- +# split column prepended to column listing +# --------------------------------------------------------------------------- + + +def test_load_tier_split_column_first(tmp_path: Path) -> None: + """load_tier must prepend 'split' to the column list from feature_dictionary.csv.""" + if not _RELEASE_BUNDLES_PRESENT: + pytest.skip("release/intermediate bundle not present") + tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") + assert tier_data["columns"][0] == "split", ( + f"Expected first column to be 'split', got {tier_data['columns'][0]!r}" + ) + + +def test_load_tier_split_column_appears_exactly_once(tmp_path: Path) -> None: + """'split' must appear exactly once at index 0, regardless of whether it's + in feature_dictionary.csv. + + ``load_tier`` unconditionally prepends 'split' to the column list. + Bundles built before this PR won't have 'split' in their + feature_dictionary.csv; bundles built after will. Either way, the + resulting column listing must have 'split' exactly once, first. + """ + if not _RELEASE_BUNDLES_PRESENT: + pytest.skip("release/intermediate bundle not present") + tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") + cols = tier_data["columns"] + assert cols[0] == "split", f"Expected 'split' at index 0 of columns, got {cols[0]!r}" + assert cols.count("split") == 1, ( + f"'split' appears {cols.count('split')} times in column list — " + "expected exactly once. load_tier may be double-prepending." 
+ ) + + +# --------------------------------------------------------------------------- +# All three tiers produce valid configs +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif( + not _RELEASE_BUNDLES_PRESENT, + reason="release bundles not present", +) +@pytest.mark.parametrize("tier", smf.TIERS) +def test_load_tier_structure(tier: str, tmp_path: Path) -> None: + """load_tier returns a dict with non-empty file lists for each tier.""" + tier_data = smf.load_tier(_RELEASE_DIR, tier) + assert tier_data["tier"] == tier + assert tier_data["n_rows"] > 0 + assert len(tier_data["columns"]) > 5 # at least some columns + assert len(tier_data["sample_rows"]) > 0 + + +@pytest.mark.skipif( + not _RELEASE_BUNDLES_PRESENT, + reason="release bundles not present", +) +@pytest.mark.parametrize("tier", smf.TIERS) +def test_make_dataset_config_structure(tier: str, tmp_path: Path) -> None: + """make_dataset_config produces the required fields for each tier.""" + tier_data = smf.load_tier(_RELEASE_DIR, tier) + config = smf.make_dataset_config(tier_data, tmp_path) + + assert config["slug"].endswith(tier) + assert len(config["files"]) >= 5 # csv, feature dict, 3 parquets at minimum + assert config["rowCount"] > 0 + assert config["splits"] == ["train", "valid", "test"] + assert isinstance(config["descriptionHtml"], str), "descriptionHtml must be a string" + assert len(config["descriptionHtml"]) > 100, "descriptionHtml is suspiciously short" + + # Required fields must be present — no silent defaults + for key in ("slug", "title", "license", "task", "rowCount"): + assert key in config, f"Config missing required key: {key!r}" + + +# --------------------------------------------------------------------------- +# _require raises on stale schema (regression guard for the "silent default" bug) +# --------------------------------------------------------------------------- + + +def test_load_tier_raises_on_missing_n_leads(tmp_path: Path) -> 
None: + """load_tier / make_dataset_config must raise if n_leads is absent from manifest.""" + if not _RELEASE_BUNDLES_PRESENT: + pytest.skip("release/intermediate bundle not present") + + import copy + + tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") + # Simulate a bundle where n_leads was renamed / dropped + bad_manifest = copy.deepcopy(tier_data["manifest"]) + bad_manifest.pop("n_leads") + tier_data_bad = {**tier_data, "manifest": bad_manifest} + + with pytest.raises(KeyError, match="n_leads"): + smf.make_dataset_config(tier_data_bad, tmp_path) + + +# --------------------------------------------------------------------------- +# deploy_site uses --branch preview by default +# --------------------------------------------------------------------------- + + +def test_deploy_site_preview_branch_by_default(tmp_path: Path) -> None: + """deploy_site must default to --branch preview, not --branch main.""" + cf_env = tmp_path / "cf.env" + cf_env.write_text("CLOUDFLARE_ACCOUNT_ID=fake-account\nCLOUDFLARE_API_TOKEN=fake-token\n") + captured_cmd: list[list[str]] = [] + + def fake_run(cmd: list[str], **_kwargs: object) -> MagicMock: + captured_cmd.append(cmd) + result = MagicMock() + result.returncode = 0 + return result + + with patch("subprocess.run", side_effect=fake_run): + smf.deploy_site( + out_dir=tmp_path / "dist", + project_name="test-project", + cf_env_path=cf_env, + production=False, # default + ) + + assert captured_cmd, "subprocess.run was never called" + cmd = captured_cmd[0] + branch_idx = cmd.index("--branch") + assert cmd[branch_idx + 1] == "preview", ( + f"Expected --branch preview but got --branch {cmd[branch_idx + 1]!r}. " + "A stray local deploy must never clobber the production site." 
+ ) + + +def test_deploy_site_main_branch_with_production_flag(tmp_path: Path) -> None: + """deploy_site must use --branch main when production=True.""" + cf_env = tmp_path / "cf.env" + cf_env.write_text("CLOUDFLARE_ACCOUNT_ID=fake-account\nCLOUDFLARE_API_TOKEN=fake-token\n") + captured_cmd: list[list[str]] = [] + + def fake_run(cmd: list[str], **_kwargs: object) -> MagicMock: + captured_cmd.append(cmd) + result = MagicMock() + result.returncode = 0 + return result + + with patch("subprocess.run", side_effect=fake_run): + smf.deploy_site( + out_dir=tmp_path / "dist", + project_name="test-project", + cf_env_path=cf_env, + production=True, + ) + + assert captured_cmd + cmd = captured_cmd[0] + branch_idx = cmd.index("--branch") + assert cmd[branch_idx + 1] == "main", ( + f"Expected --branch main for production deploy but got {cmd[branch_idx + 1]!r}" + ) + + +# --------------------------------------------------------------------------- +# _rewrite_links handles bare relative links +# --------------------------------------------------------------------------- + + +def test_rewrite_links_bare_license() -> None: + """[LICENSE](LICENSE) must be rewritten to an absolute URL.""" + text = "See [LICENSE](LICENSE) for details." + result = smf._rewrite_links(text, "https://github.com/org/repo/blob/main/release/intro") + assert "](LICENSE)" not in result, ( + "_rewrite_links left a bare relative [LICENSE](LICENSE) link — " + "it would 404 on the static host" + ) + assert "https://" in result + + +def test_rewrite_links_parent_dir() -> None: + """[text](../foo) must become an absolute GitHub blob URL.""" + text = "See [guide](../docs/release/break_me_guide.md) for more." 
+ result = smf._rewrite_links(text, "https://example.com/base") + assert "](../docs/" not in result + assert smf.GITHUB_BLOB_BASE in result + + +def test_rewrite_links_absolute_unchanged() -> None: + """Absolute https:// links must not be modified.""" + url = "https://example.com/some/path" + text = f"[click]({url})" + result = smf._rewrite_links(text, "https://github.com/org/repo/blob/main/release/intro") + assert url in result From 136a8988ffadb1a99abebc2cc206f85bb135e836 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 27 May 2026 13:00:44 +0300 Subject: [PATCH 2/3] fix(scripts): address self-review issues in ShmuggingFace site builder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical fixes: - C1: replace silent metric defaults (cr, lr_auc) with _require calls; subtitle now raises on schema drift instead of showing '0.0% conversion' - C2: extract feature_dictionary.csv mutation out of write_flat_csv into dedicated _prepend_split_to_feature_dict(); write_flat_csv now only does what its name says; main() calls both explicitly - C3: fix stale v1.0.1 → v1.0.2 in ensure_smf_core docstring and error msg - C4: pin wrangler from 'latest' to '4.95.0' in package.json Medium fixes: - M1: _require return type -> object → -> Any (avoids cast noise at every call site) - M2: add TierData TypedDict; load_tier and make_dataset_config now typed - M3: fix _BARE_RELATIVE_LINK_RE to capture whole [text](path) construct so images ![alt](src) are never rewritten (lookbehind on ']' is insufficient — '!' 
precedes '[', not ']') - M4: add comment explaining _VALIDATION_LINK_RE is no-op for tier cards - M5: add --config-only flag to stop after config write, skip Node build - M6: remove duplicate test_load_tier_split_column_first (subsumed by test_load_tier_split_column_appears_exactly_once) - M7: move 'import copy' from inside test function body to top of file Low fixes: - L1: extract kb() inner function to module-level _file_size_kb() - L2: add comment explaining 'downloads'/'likes' string type and zero value - L3: document the asymmetry between _PARENT_LINK_RE (module constant) and _BARE_RELATIVE_LINK_RE (caller-supplied github_base parameter) - L4: standardize all imperative pytest.skip() guards to @pytest.mark.skipif decorator style for consistent skip-at-collection behaviour New tests: - test_make_dataset_config_raises_on_missing_conversion_rate (C1 regression) - test_rewrite_links_image_src_unchanged (M3 regression) Co-Authored-By: Claude Sonnet 4.6 --- package.json | 2 +- scripts/build_public_release.py | 75 +++++++------ scripts/build_shmuggingface_site.py | 106 +++++++++++++----- .../scripts/test_build_shmuggingface_site.py | 59 ++++++---- 4 files changed, 161 insertions(+), 81 deletions(-) diff --git a/package.json b/package.json index b8599c0..4154566 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "@shmuggingface/core": "https://github.com/ShmuggingFace/ShmuggingFaceCore/archive/refs/tags/v1.0.2.tar.gz" }, "devDependencies": { - "wrangler": "latest" + "wrangler": "4.95.0" }, "engines": { "node": ">=20" diff --git a/scripts/build_public_release.py b/scripts/build_public_release.py index ab86a7a..cc51941 100644 --- a/scripts/build_public_release.py +++ b/scripts/build_public_release.py @@ -72,14 +72,6 @@ def write_flat_csv(bundle_dir: Path) -> Path: leakage-risk columns from student_public task splits before they hit disk. The flat CSV is built only for student_public bundles (see ``main()``) and inherits that redaction transitively. 
- - Also prepends a ``split`` row to ``feature_dictionary.csv`` so the - column spec covers every column in ``lead_scoring.csv``. The - ``split`` column is added here (not in the core bundle writer) - because it only exists in the flat convenience CSV — the Parquet - task splits do not carry it. ``feature_dictionary.csv`` is not - hashed in ``manifest.json``, so this edit does not invalidate the - bundle integrity hashes. """ task_dir = bundle_dir / "tasks" / "converted_within_90_days" frames = [] @@ -92,32 +84,48 @@ def write_flat_csv(bundle_dir: Path) -> Path: merged = pd.concat(frames, ignore_index=True) csv_path = bundle_dir / "lead_scoring.csv" merged.to_csv(csv_path, index=False) + return csv_path - # Prepend split row to feature_dictionary.csv. - fd_path = bundle_dir / "feature_dictionary.csv" - if fd_path.exists(): - fd = pd.read_csv(fd_path) - if "split" not in fd["name"].values: - split_row = pd.DataFrame( - [ - { - "name": "split", - "dtype": "string", - "description": ( - "Partition label: 'train', 'valid', or 'test'. " - "Present in lead_scoring.csv only; the Parquet task " - "splits are already partitioned by filename." - ), - "category": "split_metadata", - "is_target": False, - "leakage_risk": False, - } - ] - ) - fd = pd.concat([split_row, fd], ignore_index=True) - fd.to_csv(fd_path, index=False) - return csv_path +def _prepend_split_to_feature_dict(bundle_dir: Path) -> None: + """Prepend a ``split`` row to ``feature_dictionary.csv``. + + The ``split`` column exists only in ``lead_scoring.csv`` (the flat + convenience CSV) — the Parquet task splits are already partitioned by + filename. Documenting it in ``feature_dictionary.csv`` keeps the + column spec complete for consumers who read the feature dictionary + before loading the CSV. + + ``feature_dictionary.csv`` is not hashed in ``manifest.json``, so + this edit does not invalidate the bundle integrity hashes. 
+ + This is a separate function from :func:`write_flat_csv` because it + mutates a different file; mixing the two responsibilities in one + function would make ``write_flat_csv`` surprising to callers. + """ + fd_path = bundle_dir / "feature_dictionary.csv" + if not fd_path.exists(): + return + fd = pd.read_csv(fd_path) + if "split" not in fd["name"].values: + split_row = pd.DataFrame( + [ + { + "name": "split", + "dtype": "string", + "description": ( + "Partition label: 'train', 'valid', or 'test'. " + "Present in lead_scoring.csv only; the Parquet task " + "splits are already partitioned by filename." + ), + "category": "split_metadata", + "is_target": False, + "leakage_risk": False, + } + ] + ) + fd = pd.concat([split_row, fd], ignore_index=True) + fd.to_csv(fd_path, index=False) def print_summary(bundle_dir: Path, name: str) -> None: @@ -189,10 +197,11 @@ def main() -> None: generation_timestamp=args.generation_timestamp, ) - # Flat CSV for student_public bundles + # Flat CSV + feature-dictionary split row for student_public bundles if exposure_mode == "student_public": csv_path = write_flat_csv(bundle_dir) print(f" Flat CSV: {csv_path}", file=sys.stderr) + _prepend_split_to_feature_dict(bundle_dir) # Validate print(f" Validating {dir_name}...", file=sys.stderr) diff --git a/scripts/build_shmuggingface_site.py b/scripts/build_shmuggingface_site.py index c597582..3c915ac 100644 --- a/scripts/build_shmuggingface_site.py +++ b/scripts/build_shmuggingface_site.py @@ -21,7 +21,11 @@ --smf-core PATH Path to a local ShmuggingFaceCore checkout. Overrides the default, which is the npm-installed package at ``node_modules/@shmuggingface/core`` - (pinned to v1.0.1 via ``package.json``). Run ``npm install`` first. + (pinned to v1.0.2 via ``package.json``). Run ``npm install`` first. +--config-only + Write the ``shmuggingface.config.mjs`` file and stop — do not invoke + the Node build. Useful for inspecting generated config without a + full Node environment. 
--deploy Deploy the built site to Cloudflare Pages after building. --production @@ -46,6 +50,7 @@ import subprocess import sys from pathlib import Path +from typing import Any, TypedDict try: from markdown_it import MarkdownIt @@ -90,10 +95,22 @@ # release tree root on GitHub. Plain ``[LICENSE](LICENSE)`` and other # bare-name links need an explicit GitHub blob URL. _PARENT_LINK_RE = re.compile(r"\]\(\.\./([^)]+)\)") +# Rewrites ``validation/validation_report.md`` links that appear in the +# global ``release/README.md``. This is a no-op for per-tier +# ``dataset_card.md`` files (which don't contain that path), but kept so +# ``_rewrite_links`` remains safe to call on the global README too. _VALIDATION_LINK_RE = re.compile(r"\]\(validation/validation_report\.md\)") -# Bare relative links: match ``](word.ext)`` or ``](word)`` NOT starting -# with http/https/# (those are already absolute or anchors). -_BARE_RELATIVE_LINK_RE = re.compile(r"\]\((?!https?://|#)([^/][^)]*)\)") +# Bare relative links: match the full ``[text](word.ext)`` or +# ``![alt](image.png)`` construct so we can distinguish images from links. +# Group 1 captures the optional leading ``!``; group 2 the link text/alt; +# group 3 the path. In ``_rewrite_links`` we skip the rewrite when +# group 1 is ``!`` so inline image sources are never mangled. +# Note: _PARENT_LINK_RE uses the module-level GITHUB_BLOB_BASE constant +# (repo root); _BARE_RELATIVE_LINK_RE uses the caller-supplied +# ``github_base`` parameter (tier-specific subdirectory). The asymmetry +# is intentional: ``../`` links always resolve from the repo root, while +# bare names are relative to the document's own directory. 
+_BARE_RELATIVE_LINK_RE = re.compile(r"(!?)\[([^\]]*)\]\((?!https?://|#)([^/][^)]*)\)") def _rewrite_links(text: str, github_base: str) -> str: @@ -109,7 +126,14 @@ def _rewrite_links(text: str, github_base: str) -> str: text = _VALIDATION_LINK_RE.sub( f"]({GITHUB_BLOB_BASE}/release/validation/validation_report.md)", text ) - text = _BARE_RELATIVE_LINK_RE.sub(rf"]({github_base}/\1)", text) + text = _BARE_RELATIVE_LINK_RE.sub( + lambda m: ( + m.group(0) # image syntax (![alt](src)) — preserve unchanged + if m.group(1) == "!" + else f"[{m.group(2)}]({github_base}/{m.group(3)})" + ), + text, + ) return text @@ -144,7 +168,7 @@ def render_tier_html(tier_dir: Path) -> str: # --------------------------------------------------------------------------- -def _require(d: dict, key: str, context: str) -> object: +def _require(d: dict, key: str, context: str) -> Any: """Return ``d[key]``, raising ``KeyError`` with context on miss. Silent ``dict.get()`` defaults produce plausible-but-false preview @@ -160,7 +184,26 @@ def _require(d: dict, key: str, context: str) -> object: return d[key] -def load_tier(release_dir: Path, tier: str) -> dict: +def _file_size_kb(path: Path) -> str: + """Return a human-readable file size string, e.g. 
``'42 KB'``.""" + return f"{max(1, path.stat().st_size // 1024)} KB" + + +class TierData(TypedDict): + """Typed container returned by :func:`load_tier`.""" + + tier: str + tier_dir: Path + task_dir: Path + manifest: dict[str, Any] + ctx_manifest: str + metrics: dict[str, Any] + columns: list[str] + sample_rows: list[dict[str, str]] + n_rows: int + + +def load_tier(release_dir: Path, tier: str) -> TierData: """Load manifest, metrics, feature dictionary, and sample rows for one tier.""" tier_dir = release_dir / tier @@ -210,7 +253,7 @@ def _rel(path: Path, from_dir: Path) -> str: return os.path.relpath(path, from_dir).replace(os.sep, "/") -def make_dataset_config(tier_data: dict, config_dir: Path) -> dict: +def make_dataset_config(tier_data: TierData, config_dir: Path) -> dict: """Build a ShmuggingFace dataset config dict for one tier. Each tier page shows its own ``dataset_card.md`` as the description @@ -224,12 +267,12 @@ def make_dataset_config(tier_data: dict, config_dir: Path) -> dict: ctx_manifest = tier_data["ctx_manifest"] metrics = tier_data["metrics"] label = TIER_LABEL[tier] - medians = metrics.get("medians", {}) - - cr = medians.get("conversion_rate_test", 0.0) - lr_auc = medians.get("lr_auc", 0.0) - # These fields are required — raise immediately on schema drift rather - # than silently defaulting to plausible-but-false values. + ctx_metrics = f"{tier}/metrics.json" + # _require raises on schema drift rather than silently defaulting to + # plausible-but-false values — including for metrics, not just manifest. 
+ medians = _require(metrics, "medians", ctx_metrics) + cr = float(_require(medians, "conversion_rate_test", ctx_metrics)) + lr_auc = float(_require(medians, "lr_auc", ctx_metrics)) n_leads = int(_require(manifest, "n_leads", ctx_manifest)) snapshot_day = int(_require(manifest, "snapshot_day", ctx_manifest)) @@ -244,13 +287,10 @@ def make_dataset_config(tier_data: dict, config_dir: Path) -> dict: valid_rows = int(_require(task_info, "valid_rows", f"{ctx_manifest}[tasks][{TASK}]")) test_rows = int(_require(task_info, "test_rows", f"{ctx_manifest}[tasks][{TASK}]")) - def kb(path: Path) -> str: - return f"{max(1, path.stat().st_size // 1024)} KB" - files = [ { "path": "lead_scoring.csv", - "size": kb(tier_dir / "lead_scoring.csv"), + "size": _file_size_kb(tier_dir / "lead_scoring.csv"), "kind": "CSV", "sourcePath": _rel(tier_dir / "lead_scoring.csv", config_dir), "about": ( @@ -263,7 +303,7 @@ def kb(path: Path) -> str: }, { "path": "feature_dictionary.csv", - "size": kb(tier_dir / "feature_dictionary.csv"), + "size": _file_size_kb(tier_dir / "feature_dictionary.csv"), "kind": "CSV", "sourcePath": _rel(tier_dir / "feature_dictionary.csv", config_dir), "about": ( @@ -273,7 +313,7 @@ def kb(path: Path) -> str: }, { "path": "tasks/converted_within_90_days/train.parquet", - "size": kb(task_dir / "train.parquet"), + "size": _file_size_kb(task_dir / "train.parquet"), "kind": "Parquet", "sourcePath": _rel(task_dir / "train.parquet", config_dir), "about": ( @@ -284,21 +324,21 @@ def kb(path: Path) -> str: }, { "path": "tasks/converted_within_90_days/valid.parquet", - "size": kb(task_dir / "valid.parquet"), + "size": _file_size_kb(task_dir / "valid.parquet"), "kind": "Parquet", "sourcePath": _rel(task_dir / "valid.parquet", config_dir), "about": f"Validation split — {valid_rows:,} leads.", }, { "path": "tasks/converted_within_90_days/test.parquet", - "size": kb(task_dir / "test.parquet"), + "size": _file_size_kb(task_dir / "test.parquet"), "kind": "Parquet", "sourcePath": 
_rel(task_dir / "test.parquet", config_dir), "about": (f"Test split — {test_rows:,} leads, held out for final evaluation only."), }, { "path": "dataset_card.md", - "size": kb(tier_dir / "dataset_card.md"), + "size": _file_size_kb(tier_dir / "dataset_card.md"), "kind": "Dataset card", "sourcePath": _rel(tier_dir / "dataset_card.md", config_dir), "about": "Auto-generated tier-specific dataset card.", @@ -341,6 +381,10 @@ def kb(path: Path) -> str: "rows": tier_data["sample_rows"], "files": files, "discussions": DISCUSSIONS, + # ShmuggingFaceCore v1.0.2 accepts these as strings and renders + # them verbatim in the stats bar. They are intentionally zero for + # the pre-publication review; the real platform will populate them + # after publish. "downloads": "0", "likes": "0", } @@ -371,7 +415,7 @@ def ensure_smf_core(smf_core: Path | None) -> Path: 1. ``--smf-core PATH`` override (for local dev / CI with a custom checkout). 2. npm-installed package at ``node_modules/@shmuggingface/core`` — the canonical path when ``npm install`` has been run from the repo root - (pinned to v1.0.1 via ``package.json`` / ``package-lock.json``). + (pinned to v1.0.2 via ``package.json`` / ``package-lock.json``). Exits with an informative error if neither source is available. """ @@ -391,7 +435,7 @@ def ensure_smf_core(smf_core: Path | None) -> Path: sys.exit( "ShmuggingFaceCore not found.\n" f" Expected npm installation at: {SMF_CORE_NPM}\n" - " Run `npm install` from the repo root to install the pinned v1.0.1 release,\n" + " Run `npm install` from the repo root to install the pinned v1.0.2 release,\n" " or pass --smf-core PATH to a local checkout." ) @@ -553,6 +597,14 @@ def main() -> None: metavar="NAME", help=f"Cloudflare Pages project name (default: {DEFAULT_PROJECT})", ) + parser.add_argument( + "--config-only", + action="store_true", + help=( + "Write shmuggingface.config.mjs and stop — skip the Node build and deploy. 
" + "Useful for verifying generated config without a full Node environment." + ), + ) args = parser.parse_args() release_dir = args.release_dir.resolve() @@ -588,6 +640,10 @@ def main() -> None: } write_config(site_config, datasets, config_path) + if args.config_only: + print(f"--config-only: stopping after config write. Config at: {config_path}") + return + # --- Ensure ShmuggingFaceCore -------------------------------------------- smf_core = ensure_smf_core(args.smf_core) diff --git a/tests/scripts/test_build_shmuggingface_site.py b/tests/scripts/test_build_shmuggingface_site.py index cdcfb6e..83db2f0 100644 --- a/tests/scripts/test_build_shmuggingface_site.py +++ b/tests/scripts/test_build_shmuggingface_site.py @@ -24,6 +24,7 @@ from __future__ import annotations +import copy import importlib.util import sys from pathlib import Path @@ -65,10 +66,9 @@ def test_no_tier_medal_constant() -> None: ) +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") def test_make_dataset_config_no_kaggle_usability(tmp_path: Path) -> None: """The generated config dict must not include kaggleUsability or kaggleMedals.""" - if not _RELEASE_BUNDLES_PRESENT: - pytest.skip("release/intermediate bundle not present") tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") config = smf.make_dataset_config(tier_data, tmp_path) assert "kaggleUsability" not in config, "make_dataset_config still emits kaggleUsability" @@ -120,11 +120,9 @@ def test_render_tier_html_uses_dataset_card(tmp_path: Path) -> None: assert "global README" not in html +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") def test_make_dataset_config_uses_per_tier_card(tmp_path: Path) -> None: """make_dataset_config embeds the tier card, not the global README.""" - if not _RELEASE_BUNDLES_PRESENT: - pytest.skip("release/intermediate bundle not present") - tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") # The description HTML must contain 
content from dataset_card.md. # The per-tier card ships a tier-specific header — check for that. @@ -145,16 +143,7 @@ def test_make_dataset_config_uses_per_tier_card(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_load_tier_split_column_first(tmp_path: Path) -> None: - """load_tier must prepend 'split' to the column list from feature_dictionary.csv.""" - if not _RELEASE_BUNDLES_PRESENT: - pytest.skip("release/intermediate bundle not present") - tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") - assert tier_data["columns"][0] == "split", ( - f"Expected first column to be 'split', got {tier_data['columns'][0]!r}" - ) - - +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") def test_load_tier_split_column_appears_exactly_once(tmp_path: Path) -> None: """'split' must appear exactly once at index 0, regardless of whether it's in feature_dictionary.csv. @@ -164,8 +153,6 @@ def test_load_tier_split_column_appears_exactly_once(tmp_path: Path) -> None: feature_dictionary.csv; bundles built after will. Either way, the resulting column listing must have 'split' exactly once, first. 
""" - if not _RELEASE_BUNDLES_PRESENT: - pytest.skip("release/intermediate bundle not present") tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") cols = tier_data["columns"] assert cols[0] == "split", f"Expected 'split' at index 0 of columns, got {cols[0]!r}" @@ -221,13 +208,9 @@ def test_make_dataset_config_structure(tier: str, tmp_path: Path) -> None: # --------------------------------------------------------------------------- +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") def test_load_tier_raises_on_missing_n_leads(tmp_path: Path) -> None: """load_tier / make_dataset_config must raise if n_leads is absent from manifest.""" - if not _RELEASE_BUNDLES_PRESENT: - pytest.skip("release/intermediate bundle not present") - - import copy - tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") # Simulate a bundle where n_leads was renamed / dropped bad_manifest = copy.deepcopy(tier_data["manifest"]) @@ -238,6 +221,23 @@ def test_load_tier_raises_on_missing_n_leads(tmp_path: Path) -> None: smf.make_dataset_config(tier_data_bad, tmp_path) +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_make_dataset_config_raises_on_missing_conversion_rate(tmp_path: Path) -> None: + """make_dataset_config must raise if conversion_rate_test is absent from metrics. + + Regression guard for C1: the subtitle uses these metric values and must + never silently default to 0.0 — that would show fabricated stats on the + preview page without any warning. 
+ """ + tier_data = smf.load_tier(_RELEASE_DIR, "intermediate") + bad_metrics = copy.deepcopy(tier_data["metrics"]) + bad_metrics.get("medians", {}).pop("conversion_rate_test", None) + tier_data_bad = {**tier_data, "metrics": bad_metrics} + + with pytest.raises(KeyError, match="conversion_rate_test"): + smf.make_dataset_config(tier_data_bad, tmp_path) + + # --------------------------------------------------------------------------- # deploy_site uses --branch preview by default # --------------------------------------------------------------------------- @@ -330,3 +330,18 @@ def test_rewrite_links_absolute_unchanged() -> None: text = f"[click]({url})" result = smf._rewrite_links(text, "https://github.com/org/repo/blob/main/release/intro") assert url in result + + +def test_rewrite_links_image_src_unchanged() -> None: + """Inline Markdown images ``![alt](image.png)`` must NOT have their src rewritten. + + Regression guard for M3: the bare-relative link regex previously matched + ``](image.png)`` inside image syntax, rewriting the image src to a GitHub + blob URL and breaking images in the rendered preview. + """ + text = "See diagram: ![architecture](arch.png) for details." + result = smf._rewrite_links(text, "https://github.com/org/repo/blob/main/release/intro") + assert "](arch.png)" in result, ( + "_rewrite_links rewrote the image src — " + "the negative lookbehind for '!' 
is missing or broken" + ) From 4b961c7d360d0203ce09d31c56b9213f74ce4755 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 27 May 2026 13:04:10 +0300 Subject: [PATCH 3/3] fix(scripts,validation): address Copilot review comments on PR 8.4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - package.json: bump engines.node to >=22 (wrangler 4.95.0 requires it) - build_shmuggingface_site.py: pip install error message -> .[dev] not .[publish] (markdown-it-py is in [dev]; .[dev] is the standard install) - validation/invariants.py: add STUDENT_ONLY_DICT_ROWS constant and exempt 'split' from the feature_dictionary subset check in check_exposure_monotonicity; the 'split' row exists only in the student flat CSV feature dictionary (not the instructor bundle) — this is intentional and must not be flagged as a monotonicity violation Co-Authored-By: Claude Sonnet 4.6 --- leadforge/validation/invariants.py | 10 +++++++++- package.json | 2 +- scripts/build_shmuggingface_site.py | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/leadforge/validation/invariants.py b/leadforge/validation/invariants.py index e61514b..d1423a3 100644 --- a/leadforge/validation/invariants.py +++ b/leadforge/validation/invariants.py @@ -25,6 +25,13 @@ SNAPSHOT_FILTERED_TABLES, ) +# Feature-dictionary rows that are intentionally present only in the student +# bundle and are exempt from the subset check in check_exposure_monotonicity. +# ``split`` documents the partition-label column that exists only in the flat +# ``lead_scoring.csv`` convenience export (student_public only); the +# instructor bundle's feature dictionary covers the raw Parquet columns. +STUDENT_ONLY_DICT_ROWS: frozenset[str] = frozenset({"split"}) + def check_determinism(bundle_a: Path, bundle_b: Path) -> list[str]: """Compare two bundles that should be identical (same seed/config). 
@@ -169,12 +176,13 @@ def check_exposure_monotonicity(student_bundle: Path, instructor_bundle: Path) - # feature_dictionary.csv: student rows must be a subset of instructor rows # (by ``name``). For names present in both, the metadata must agree. + # s_dict = student_bundle / "feature_dictionary.csv" i_dict = instructor_bundle / "feature_dictionary.csv" if s_dict.exists() and i_dict.exists(): s_df = pd.read_csv(s_dict).set_index("name") i_df = pd.read_csv(i_dict).set_index("name") - extra_in_student = set(s_df.index) - set(i_df.index) + extra_in_student = set(s_df.index) - set(i_df.index) - STUDENT_ONLY_DICT_ROWS if extra_in_student: errors.append( "feature_dictionary.csv: student has rows missing from instructor: " diff --git a/package.json b/package.json index 4154566..0d91942 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,6 @@ "wrangler": "4.95.0" }, "engines": { - "node": ">=20" + "node": ">=22" } } diff --git a/scripts/build_shmuggingface_site.py b/scripts/build_shmuggingface_site.py index 3c915ac..9c05fde 100644 --- a/scripts/build_shmuggingface_site.py +++ b/scripts/build_shmuggingface_site.py @@ -55,7 +55,7 @@ try: from markdown_it import MarkdownIt except ImportError: - sys.exit("markdown-it-py is required: pip install -e '.[publish]'") + sys.exit("markdown-it-py is required: pip install -e '.[dev]'") import pandas as pd