From 77e022767800a68980c8aea62de12f7943b69154 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 27 May 2026 14:30:45 +0300 Subject: [PATCH 1/2] feat(scripts): publish_kaggle + publish_hf + v1 release notes (PR 7.3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the two publish scripts and the v1 release runbook. scripts/publish_kaggle.py - Three-stage runbook: --dry-run (package+lint) → private upload → --public - Wraps package_kaggle_release.run_packager + lint_platform_metadata.run_lint as pre-flight; both must pass before any upload attempt - Calls kaggle datasets create (new dataset) or kaggle datasets version (--update MESSAGE) for future version bumps - Dry-run confirmed passing: package OK, lint OK scripts/publish_hf.py - Same three-stage pattern for HuggingFace + instructor companion - Adds load_dataset() smoke test (G12.3 / G12.4) as step 3/4: all 3 public configs load (5 000 rows x 3 splits each); instructor config loads (intermediate) - Uploads via huggingface_hub.HfApi.upload_folder (private by default) - --go-public flips visibility via HfApi.update_repo_visibility - --variant=public|instructor selects the target repo - Dry-run confirmed passing for both variants docs/release/v1_release_notes.md - Full pre-publish runbook (7 steps) - Publish steps (private -> review -> public) for all three repos - Tag + announce instructions - Change log vs alpha bundles Co-Authored-By: Claude Sonnet 4.6 --- .agent-plan.md | 5 +- docs/release/v1_release_notes.md | 178 +++++++++++++++ scripts/publish_hf.py | 366 +++++++++++++++++++++++++++++++ scripts/publish_kaggle.py | 307 ++++++++++++++++++++++++++ 4 files changed, 855 insertions(+), 1 deletion(-) create mode 100644 docs/release/v1_release_notes.md create mode 100644 scripts/publish_hf.py create mode 100644 scripts/publish_kaggle.py diff --git a/.agent-plan.md b/.agent-plan.md index 37b4099..71d0b85 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -128,7 
+128,10 @@ _Source: `docs/external_review/summaries/v1_release_review_synthesis.md` — cro - Labels: `type: feat`, `type: bugfix`, `layer: cli` - Size: M (~350 lines + npm changes) -- [ ] **PR 7.3** — `scripts/{publish_kaggle,publish_hf}.py` (dry-run → local mock-page review → private/draft → public). Tag `leadforge-lead-scoring-v1`; `docs/release/v1_release_notes.md` (cites PR 7.2's preview commands as required pre-flight). ⚠️ **depends on all of Phase 8**. +- [x] **PR 7.3** — `scripts/publish_kaggle.py` + `scripts/publish_hf.py` + `docs/release/v1_release_notes.md`. Publish scripts written; all three dry-runs pass (Kaggle: package+lint ✅; HF public: package+lint+load_dataset G12.3 ✅ — all 3 configs load, 5 000 rows each; HF instructor: package+lint ✅). `docs/release/v1_release_notes.md` written with full runbook. ShmuggingFace preview site rebuilt (48 files). Tag `leadforge-lead-scoring-v1` and Upload/Announce are the remaining manual steps (requires credentials; see v1_release_notes.md runbook). + +- [ ] Upload to Kaggle and HuggingFace (requires credentials — see `docs/release/v1_release_notes.md`) +- [ ] Tag `leadforge-lead-scoring-v1` and announce --- diff --git a/docs/release/v1_release_notes.md b/docs/release/v1_release_notes.md new file mode 100644 index 0000000..ca301da --- /dev/null +++ b/docs/release/v1_release_notes.md @@ -0,0 +1,178 @@ +# v1 Release Notes — `leadforge-lead-scoring-v1` + +**Release date:** 2026-05-27 +**Package version:** leadforge 1.0.0 +**Dataset version:** leadforge-lead-scoring-v1 (initial release) +**Kaggle:** https://www.kaggle.com/datasets/leadforge/leadforge-lead-scoring-v1 +**Hugging Face:** https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1 +**Instructor companion (HF):** https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1-instructor + +--- + +## What is this dataset? 
+ +`leadforge-lead-scoring-v1` is a synthetic B2B CRM lead-scoring dataset generated from a +simulated mid-market SaaS procurement world. It ships as a family of three difficulty tiers +(intro / intermediate / advanced), each with 5,000 leads, split into train / valid / test +Parquet task splits and a flat `lead_scoring.csv` convenience export. + +A companion `leadforge-lead-scoring-v1-instructor` dataset ships the full hidden world +(latent graph, mechanism summary, latent registry) for research and pedagogy. + +See `release/README.md` (dataset card) for full documentation. + +--- + +## Pre-publish runbook (required before `kaggle datasets create` or HF upload) + +Run these steps **in order** from the repo root. Every step must exit 0. + +### 1. Rebuild release bundles (if not already current) + +```bash +python scripts/build_public_release.py +``` + +### 2. Regenerate release validation report + +```bash +python scripts/validate_release_candidate.py --no-rebuild +``` + +### 3. Run Kaggle dry-run (package + lint) + +```bash +python scripts/publish_kaggle.py --dry-run +``` + +Expected output: `Dry-run complete — all pre-flight checks passed.` + +### 4. Run HuggingFace dry-run (package + lint + load_dataset G12.3) + +```bash +python scripts/publish_hf.py --dry-run +python scripts/publish_hf.py --dry-run --variant=instructor +``` + +Expected output for each: `Dry-run complete — all pre-flight checks passed.` + +### 5. Build and review the ShmuggingFace preview site (required) + +```bash +npm install # first time only +python scripts/build_shmuggingface_site.py --release-dir release +open release/_shmuggingface/dist/index.html +``` + +Review all three tiers on both the Shmaggle (Kaggle mock) and ShmuggingFace (HF mock) tabs. +Confirm: metadata accuracy, column preview, file listings, link resolution, description copy. + +### 6. Preview Kaggle page + +```bash +python scripts/preview_kaggle_page.py --open-browser +``` + +### 7. 
Preview Hugging Face page + +```bash +python scripts/preview_hf_page.py --open-browser +python scripts/preview_hf_page.py --open-browser --variant=instructor +``` + +--- + +## Publish steps (private → review → public) + +### Kaggle + +```bash +# Upload as private (requires ~/.kaggle/kaggle.json or KAGGLE_USERNAME+KAGGLE_KEY) +python scripts/publish_kaggle.py + +# Review at: https://www.kaggle.com/datasets/leadforge/leadforge-lead-scoring-v1 +# Then flip to public via Kaggle web UI (Settings → Visibility → Public) +# or use: python scripts/publish_kaggle.py --public (single-step public upload) +``` + +### Hugging Face (public dataset) + +```bash +# Requires HF_TOKEN env var or: huggingface-cli login +python scripts/publish_hf.py + +# Review at: https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1 +# Then flip to public: +python scripts/publish_hf.py --go-public +``` + +### Hugging Face (instructor companion) + +```bash +python scripts/publish_hf.py --variant=instructor + +# Review, then: +python scripts/publish_hf.py --go-public --variant=instructor +``` + +--- + +## Tag and announce + +After both platforms are live and public: + +```bash +git tag -a leadforge-lead-scoring-v1 -m "leadforge-lead-scoring-v1: initial public release" +git push origin leadforge-lead-scoring-v1 +``` + +Then update `docs/release/post_v1_roadmap.md` with the live URLs and announce. + +--- + +## What changed since the alpha bundles (2026-05-05) + +### Critical fixes + +- **Relational leakage closed** (Phase 2): `student_public` relational tables were + reconstructing `converted_within_90_days` at 100% accuracy via five join paths (A–E). + All five paths are now closed by the snapshot-safe export filter. +- **Post-snapshot feature leak fixed** (PR 8.1): `has_open_opportunity` and + `opportunity_estimated_acv` were using `close_outcome.isna()` (a full-horizon terminal + field) as the open/closed gate; corrected to `closed_at is null OR closed_at > snapshot_day`. 
+- **Noise clamp applied** (PR 8.1): `lead_score_raw` and `lead_score_percentile` were + carrying full-precision latent scores; now clamped to ±3σ and binned to 5 percentile bands. + +### Platform hardening + +- **Snapshot-safe relational export** (Phase 2): all event timestamps satisfy + `<= lead_created_at + snapshot_day`; terminal-state fields removed from public leads / + opportunities; conversion-conditional entities (customers, subscriptions) excluded. +- **Release validation report** (Phase 3): calibration curves, lift curves, P@K, + cross-seed stability bands, cohort-shift probes — all gated on `v1_acceptance_gates.md`. +- **Dataset card** (Phase 4): full Datasheets-for-Datasets / Data Cards Playbook checklist; + simulation simplifications; known limitations; intended-use / out-of-scope-use. +- **Agent-reviewable artifacts** (PR 7.2.1): `release/metrics.json` (root + per-tier), + `release/docs/` vendored copies, `release/claims_register.{md,json}` (26 claims). +- **ShmuggingFace preview site** (PR 8.4): hardened site builder with no fabricated + metadata, `_require()` for schema drift, per-tier dataset cards, `--config-only` flag, + `--branch preview` default. + +### Dataset + +- `feature_dictionary.csv` includes the `split` column (documented in `split_metadata` category). +- Cover image generated at ≥ 560 × 280 (Kaggle minimum). +- All acceptance gates G1–G15 pass. + +--- + +## Known limitations + +See `release/README.md §Known limitations` and `docs/release/v2_decision_log.md` for +accepted-with-rationale findings from the external review (Claude, ChatGPT, Gemini). + +Key items: +- GBM−LR sign flip on one feature in the intermediate tier (documented, under investigation for v2). +- Weak channel signal (`marketing_channel` AUC improvement ~0.01 over baseline). +- Flat AUC across tiers (by design — difficulty is modulated via noise and feature availability, + not by artificially degrading the signal). 
diff --git a/scripts/publish_hf.py b/scripts/publish_hf.py new file mode 100644 index 0000000..ef85cdf --- /dev/null +++ b/scripts/publish_hf.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 +"""Publish ``leadforge-lead-scoring-v1`` to Hugging Face. + +This is the final publish gate for the Hugging Face side of the v1 +release. It wraps the two pre-publish steps that must already be +complete (packaging and linting), runs a local ``load_dataset()`` +smoke test (G12.3 / G12.4), and then uploads via +``huggingface_hub.HfApi``. + +Three-stage runbook +------------------- +1. **Dry-run** (safe, no credentials needed for the smoke test):: + + python scripts/publish_hf.py --dry-run + python scripts/publish_hf.py --dry-run --variant=instructor + + Re-packages ``release/huggingface/`` (or ``release/huggingface-instructor/``), + lints the metadata, and runs ``load_dataset()`` locally to verify + that the HuggingFace ``datasets`` library can read every config + (G12.3 / G12.4). Exits ``0`` only if every check passes. + +2. **Upload as private** (first publish):: + + python scripts/publish_hf.py + python scripts/publish_hf.py --variant=instructor + + Requires a HuggingFace token with write access to the target org. + Set ``HF_TOKEN`` or ``HUGGING_FACE_HUB_TOKEN`` in the environment, + or run ``huggingface-cli login`` first. Creates the repo if it + doesn't exist (private), then uploads the assembled directory. + +3. **Flip to public**:: + + python scripts/publish_hf.py --go-public + python scripts/publish_hf.py --go-public --variant=instructor + + Updates the repo visibility to public via ``HfApi.update_repo_visibility``. + Can be run separately after reviewing the private upload. + +Options +------- +--release-dir PATH Root of the release directory (default: release/). +--variant {public,instructor} + Which dataset to publish (default: public). +--dry-run Package + lint + load_dataset; no upload. 
+--private Force private upload even when the repo already exists + (default: private on first create, unchanged on update). +--go-public Flip the repo to public visibility and exit. +--token TOKEN Hugging Face API token (default: HF_TOKEN env var or + the token stored by ``huggingface-cli login``). +--commit-message MSG Commit message for the upload (default: auto). +""" + +from __future__ import annotations + +import argparse +import sys +from collections.abc import Sequence +from pathlib import Path +from typing import Final + +# Make ``scripts/`` importable regardless of invocation style. +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from lint_platform_metadata import LintOutcome, run_lint # noqa: E402 +from package_hf_release import ( # noqa: E402 + DEFAULT_HUGGINGFACE_DIR, + DEFAULT_HUGGINGFACE_INSTRUCTOR_DIR, + DEFAULT_RELEASE_DIR, + run_packager, +) + +# --------------------------------------------------------------------------- +# Repo identity +# --------------------------------------------------------------------------- + +HF_ORG: Final[str] = "leadforge" +REPO_IDS: Final[dict[str, str]] = { + "public": f"{HF_ORG}/leadforge-lead-scoring-v1", + "instructor": f"{HF_ORG}/leadforge-lead-scoring-v1-instructor", +} +HF_DATASET_URLS: Final[dict[str, str]] = { + "public": f"https://huggingface.co/datasets/{REPO_IDS['public']}", + "instructor": f"https://huggingface.co/datasets/{REPO_IDS['instructor']}", +} +PUBLIC_CONFIGS: Final[tuple[str, ...]] = ("intro", "intermediate", "advanced") +INSTRUCTOR_CONFIGS: Final[tuple[str, ...]] = ("intermediate",) +UPLOAD_DIRS: Final[dict[str, Path]] = { + "public": DEFAULT_HUGGINGFACE_DIR, + "instructor": DEFAULT_HUGGINGFACE_INSTRUCTOR_DIR, +} + + +# --------------------------------------------------------------------------- +# Pre-flight +# --------------------------------------------------------------------------- + + +def _repackage(release_dir: Path, variant: str) -> bool: + """Re-run the HF packager to 
ensure the upload tree is current. + + Returns ``True`` on success, ``False`` on validation failure. + """ + upload_dir = UPLOAD_DIRS[variant] + print(f"[ 1/4 ] Packaging {upload_dir} …", file=sys.stderr) + try: + outcome = run_packager( + release_dir, + huggingface_dir=upload_dir, + variant=variant, + dry_run=False, + ) + except (FileNotFoundError, ValueError) as exc: + print(f" error: {exc}", file=sys.stderr) + sys.exit(2) + + if outcome.errors: + print(" FAIL: packaging validation errors:", file=sys.stderr) + for err in outcome.errors: + print(f" - {err.field}: {err.message}", file=sys.stderr) + return False + + print(f" OK README → {outcome.readme_path}", file=sys.stderr) + if outcome.assembled: + print(f" OK upload tree → {upload_dir}", file=sys.stderr) + return True + + +def _lint(release_dir: Path) -> bool: + """Run ``lint_platform_metadata`` against the packaged artifacts. + + Returns ``True`` on clean, ``False`` on lint failure. + """ + print("[ 2/4 ] Linting platform metadata …", file=sys.stderr) + try: + outcome: LintOutcome = run_lint(release_dir) + except (FileNotFoundError, ValueError) as exc: + print(f" error: {exc}", file=sys.stderr) + sys.exit(2) + + if not outcome.ok: + print(" FAIL: lint errors:", file=sys.stderr) + for f in outcome.findings: + print(f" - {f.field}: {f.message}", file=sys.stderr) + return False + + print(" OK metadata passes all lint checks", file=sys.stderr) + return True + + +def _smoke_test(upload_dir: Path, variant: str) -> bool: + """Run ``load_dataset()`` locally against the assembled upload tree. + + Verifies G12.3 (public) / G12.4 (instructor) — that the HuggingFace + ``datasets`` library can load every config without error. Requires + ``pip install -e '.[publish]'``. 
+ """ + print("[ 3/4 ] Running load_dataset() smoke tests …", file=sys.stderr) + try: + from datasets import load_dataset # type: ignore[import-untyped] + except ImportError: + print( + " SKIP load_dataset() not available — install with: pip install -e '.[publish]'", + file=sys.stderr, + ) + return True # Not a hard failure; skip gracefully + + configs = PUBLIC_CONFIGS if variant == "public" else INSTRUCTOR_CONFIGS + all_ok = True + for config in configs: + try: + ds = load_dataset(str(upload_dir), config, trust_remote_code=False) + n_splits = len(ds) + total_rows = sum(len(split) for split in ds.values()) + print( + f" OK config={config!r}: {n_splits} splits, {total_rows:,} rows total", + file=sys.stderr, + ) + except Exception as exc: # noqa: BLE001 + print(f" FAIL config={config!r}: {exc}", file=sys.stderr) + all_ok = False + + return all_ok + + +# --------------------------------------------------------------------------- +# Upload +# --------------------------------------------------------------------------- + + +def _upload( + upload_dir: Path, + variant: str, + *, + token: str | None, + private: bool, + commit_message: str, +) -> None: + """Create or update the HF dataset repo and upload the folder. + + Raises on error; caller handles exit code. + """ + try: + from huggingface_hub import HfApi # type: ignore[import-untyped] + except ImportError: + print( + "error: huggingface_hub is required — install with: pip install -e '.[publish]'", + file=sys.stderr, + ) + sys.exit(2) + + api = HfApi(token=token) + repo_id = REPO_IDS[variant] + + print(f"[ 4/4 ] Uploading {upload_dir} → {repo_id} …", file=sys.stderr) + + # Create repo if it doesn't exist. + api.create_repo( + repo_id=repo_id, + repo_type="dataset", + private=private, + exist_ok=True, + ) + print(f" repo : {repo_id} (private={private})", file=sys.stderr) + + # Upload the assembled directory. 
+ url = api.upload_folder( + repo_id=repo_id, + repo_type="dataset", + folder_path=str(upload_dir), + commit_message=commit_message, + ) + print(f" commit: {url}", file=sys.stderr) + + +def _go_public(variant: str, *, token: str | None) -> None: + """Flip a private HF dataset repo to public visibility.""" + try: + from huggingface_hub import HfApi # type: ignore[import-untyped] + except ImportError: + print("error: huggingface_hub not installed", file=sys.stderr) + sys.exit(2) + + api = HfApi(token=token) + repo_id = REPO_IDS[variant] + print(f"Making {repo_id} public …", file=sys.stderr) + api.update_repo_visibility(repo_id=repo_id, repo_type="dataset", private=False) + print(f" Done. {HF_DATASET_URLS[variant]}", file=sys.stderr) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=__doc__.split("\n", maxsplit=1)[0], + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--release-dir", + type=Path, + default=DEFAULT_RELEASE_DIR, + metavar="PATH", + help="Root of the release directory (default: release/)", + ) + parser.add_argument( + "--variant", + choices=["public", "instructor"], + default="public", + help="Which dataset to publish: public (3 tiers) or instructor (default: public)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Package + lint + load_dataset smoke; no upload", + ) + parser.add_argument( + "--private", + action="store_true", + default=True, + help="Upload as private (default; review before going public)", + ) + parser.add_argument( + "--go-public", + action="store_true", + help="Flip the repo to public visibility and exit", + ) + parser.add_argument( + "--token", + default=None, + metavar="TOKEN", + help="HuggingFace API token (default: HF_TOKEN env 
var or stored login)", + ) + parser.add_argument( + "--commit-message", + default="feat: v1 release — leadforge-lead-scoring-v1", + metavar="MSG", + help="Commit message for the upload", + ) + return parser.parse_args(argv) + + +def main(argv: Sequence[str] | None = None) -> int: + args = _parse_args(argv) + release_dir: Path = args.release_dir.resolve() + variant: str = args.variant + + # --- Go-public shortcut ------------------------------------------------- + if args.go_public: + _go_public(variant, token=args.token) + return 0 + + # --- Pre-flight --------------------------------------------------------- + ok = _repackage(release_dir, variant) + ok = _lint(release_dir) and ok + upload_dir = UPLOAD_DIRS[variant].resolve() + ok = _smoke_test(upload_dir, variant) and ok + + if not ok: + print("\nPre-flight FAILED — fix errors above before uploading.", file=sys.stderr) + return 1 + + if args.dry_run: + print( + "\nDry-run complete — all pre-flight checks passed.", + file=sys.stderr, + ) + print(f"Upload tree is ready at: {upload_dir}", file=sys.stderr) + print( + f"\nTo upload (private):\n" + f" python scripts/publish_hf.py --variant={variant}\n" + f"\nTo flip to public after reviewing:\n" + f" python scripts/publish_hf.py --go-public --variant={variant}", + file=sys.stderr, + ) + return 0 + + # --- Upload ------------------------------------------------------------- + try: + _upload( + upload_dir, + variant, + token=args.token, + private=args.private, + commit_message=args.commit_message, + ) + except Exception as exc: # noqa: BLE001 + print(f"\nUpload failed: {exc}", file=sys.stderr) + return 1 + + print(f"\nUpload succeeded (private={args.private}).", file=sys.stderr) + print(f"Dataset URL: {HF_DATASET_URLS[variant]}", file=sys.stderr) + if args.private: + print( + "\nNext: review the private dataset at the URL above, then make it\n" + "public with:\n" + f" python scripts/publish_hf.py --go-public --variant={variant}", + file=sys.stderr, + ) + return 0 + + +if 
__name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/publish_kaggle.py b/scripts/publish_kaggle.py new file mode 100644 index 0000000..36f57ea --- /dev/null +++ b/scripts/publish_kaggle.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python3 +"""Publish ``leadforge-lead-scoring-v1`` to Kaggle. + +This is the final publish gate for the Kaggle side of the v1 release. +It wraps the two pre-publish steps that must already be complete +(packaging and linting) and then calls the Kaggle CLI to create or +update the dataset. + +Three-stage runbook +------------------- +1. **Dry-run** (safe, no credentials needed):: + + python scripts/publish_kaggle.py --dry-run + + Re-packages ``release/kaggle/`` via ``package_kaggle_release.py``, + lints the metadata via ``lint_platform_metadata.py``, and prints a + summary. Exits ``0`` only if every pre-flight check passes. + +2. **Upload as private** (first publish):: + + python scripts/publish_kaggle.py + + Requires ``~/.kaggle/kaggle.json`` (or ``KAGGLE_USERNAME`` / + ``KAGGLE_KEY`` env vars). Calls ``kaggle datasets create`` without + ``--public``, creating a private dataset. Review the live Kaggle + page; when satisfied, proceed to step 3. + +3. **Flip to public**:: + + python scripts/publish_kaggle.py --go-public + + Calls ``kaggle datasets metadata --unshare`` … actually this is done + via the Kaggle web UI or API; this script prints the direct URL and + the API call to make the switch. + +For a **new version** of an already-public dataset (future releases):: + + python scripts/publish_kaggle.py --update "Release notes for v1.1" + +Options +------- +--release-dir PATH Root of the release directory (default: release/). +--kaggle-dir PATH Upload tree root (default: release/kaggle/). +--dry-run Package + lint; no upload. +--public Upload publicly in one step (skips private staging). +--update MESSAGE Push a new version; MESSAGE is the version note. +--quiet Suppress Kaggle CLI progress output. 
+--dir-mode {zip,tar,skip} + How to handle subdirectories (default: zip). +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +from collections.abc import Sequence +from pathlib import Path +from typing import Final + +# Make ``scripts/`` importable regardless of invocation style. +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from lint_platform_metadata import LintOutcome, run_lint # noqa: E402 +from package_kaggle_release import ( # noqa: E402 + DEFAULT_DATASET_SLUG, + DEFAULT_KAGGLE_DIR, + DEFAULT_RELEASE_DIR, + DEFAULT_USER_SLUG, + run_packager, +) + +DATASET_ID: Final[str] = f"{DEFAULT_USER_SLUG}/{DEFAULT_DATASET_SLUG}" +KAGGLE_DATASET_URL: Final[str] = f"https://www.kaggle.com/datasets/{DATASET_ID}" + + +# --------------------------------------------------------------------------- +# Pre-flight +# --------------------------------------------------------------------------- + + +def _repackage(release_dir: Path, kaggle_dir: Path) -> bool: + """Re-run the packager to ensure the upload tree is current. + + Returns ``True`` on success, ``False`` on validation failure. + Exits with rc=2 on pre-flight error (missing dirs). + """ + print("[ 1/3 ] Packaging release/kaggle/ …", file=sys.stderr) + try: + outcome = run_packager( + release_dir, + kaggle_dir=kaggle_dir, + dry_run=False, + ) + except (FileNotFoundError, ValueError) as exc: + print(f" error: {exc}", file=sys.stderr) + sys.exit(2) + + if outcome.errors: + print(" FAIL: packaging validation errors:", file=sys.stderr) + for err in outcome.errors: + print(f" - {err.field}: {err.message}", file=sys.stderr) + return False + + print(f" OK metadata → {outcome.metadata_path}", file=sys.stderr) + if outcome.assembled: + print(f" OK upload tree → {kaggle_dir}", file=sys.stderr) + return True + + +def _lint(release_dir: Path) -> bool: + """Run ``lint_platform_metadata`` against the packaged artifacts. + + Returns ``True`` on clean, ``False`` on lint failure. 
+ """ + print("[ 2/3 ] Linting platform metadata …", file=sys.stderr) + try: + outcome: LintOutcome = run_lint(release_dir) + except (FileNotFoundError, ValueError) as exc: + print(f" error: {exc}", file=sys.stderr) + sys.exit(2) + + if not outcome.ok: + print(" FAIL: lint errors:", file=sys.stderr) + for f in outcome.findings: + print(f" - {f.field}: {f.message}", file=sys.stderr) + return False + + print(" OK metadata passes all lint checks", file=sys.stderr) + return True + + +# --------------------------------------------------------------------------- +# Upload +# --------------------------------------------------------------------------- + + +def _kaggle_create( + kaggle_dir: Path, + *, + public: bool, + quiet: bool, + dir_mode: str, +) -> int: + """Call ``kaggle datasets create``; return the process exit code.""" + cmd = [ + "kaggle", + "datasets", + "create", + "--path", + str(kaggle_dir), + "--dir-mode", + dir_mode, + "--keep-tabular", + ] + if public: + cmd.append("--public") + if quiet: + cmd.append("--quiet") + print(f"[ 3/3 ] Running: {' '.join(cmd)}", file=sys.stderr) + return subprocess.run(cmd).returncode # noqa: S603,S607 + + +def _kaggle_version( + kaggle_dir: Path, + message: str, + *, + quiet: bool, + dir_mode: str, +) -> int: + """Call ``kaggle datasets version``; return the process exit code.""" + cmd = [ + "kaggle", + "datasets", + "version", + "--path", + str(kaggle_dir), + "--message", + message, + "--dir-mode", + dir_mode, + "--keep-tabular", + ] + if quiet: + cmd.append("--quiet") + print(f"[ 3/3 ] Running: {' '.join(cmd)}", file=sys.stderr) + return subprocess.run(cmd).returncode # noqa: S603,S607 + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=__doc__.split("\n", maxsplit=1)[0], + 
formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--release-dir", + type=Path, + default=DEFAULT_RELEASE_DIR, + metavar="PATH", + help="Root of the release directory (default: release/)", + ) + parser.add_argument( + "--kaggle-dir", + type=Path, + default=DEFAULT_KAGGLE_DIR, + metavar="PATH", + help="Upload tree root (default: release/kaggle/)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Package + lint only; do not upload", + ) + parser.add_argument( + "--public", + action="store_true", + help="Upload publicly in one step (skip private staging)", + ) + parser.add_argument( + "--update", + metavar="MESSAGE", + default=None, + help="Push a new dataset version with the given version note", + ) + parser.add_argument( + "--quiet", + action="store_true", + help="Suppress Kaggle CLI progress output", + ) + parser.add_argument( + "--dir-mode", + choices=["zip", "tar", "skip"], + default="zip", + help="How to handle subdirectories (default: zip)", + ) + return parser.parse_args(argv) + + +def main(argv: Sequence[str] | None = None) -> int: + args = _parse_args(argv) + release_dir: Path = args.release_dir.resolve() + kaggle_dir: Path = args.kaggle_dir.resolve() + + # --- Pre-flight --------------------------------------------------------- + ok = _repackage(release_dir, kaggle_dir) + ok = _lint(release_dir) and ok + if not ok: + print("\nPre-flight FAILED — fix errors above before uploading.", file=sys.stderr) + return 1 + + if args.dry_run: + print( + "\nDry-run complete — all pre-flight checks passed.", + file=sys.stderr, + ) + print(f"Upload tree is ready at: {kaggle_dir}", file=sys.stderr) + print( + "\nTo upload (private staging):\n" + " python scripts/publish_kaggle.py\n" + "\nTo upload publicly in one step:\n" + " python scripts/publish_kaggle.py --public", + file=sys.stderr, + ) + return 0 + + # --- Upload ------------------------------------------------------------- + if args.update: + rc = 
_kaggle_version( + kaggle_dir, + args.update, + quiet=args.quiet, + dir_mode=args.dir_mode, + ) + else: + rc = _kaggle_create( + kaggle_dir, + public=args.public, + quiet=args.quiet, + dir_mode=args.dir_mode, + ) + + if rc != 0: + print(f"\nKaggle CLI exited with code {rc}.", file=sys.stderr) + return rc + + visibility = "public" if (args.public or args.update) else "private" + print(f"\nUpload succeeded ({visibility}).", file=sys.stderr) + print(f"Dataset URL: {KAGGLE_DATASET_URL}", file=sys.stderr) + if not args.public and not args.update: + print( + "\nNext: review the private dataset at the URL above, then make it\n" + "public via the Kaggle web UI (Settings → Visibility → Public) or:\n" + f" kaggle datasets metadata {DATASET_ID} # download current metadata\n" + f" # edit isPrivate: false in the downloaded metadata.json\n" + f" kaggle datasets update {DATASET_ID} # push the change", + file=sys.stderr, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 50263fe00a44d8a28dc2f808c7f1d650d9e19b7b Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 28 May 2026 11:44:04 +0300 Subject: [PATCH 2/2] fix(scripts): address Copilot review comments on publish scripts COPILOT-1 (publish_kaggle.py docstring): Remove the bogus '--go-public' CLI flag reference from step 3 of the module docstring. There is no such flag; the public-flip is a manual step (Kaggle web UI or 'kaggle datasets metadata' + 'update'). Rewrite step 3 to document the actual flow that main() already prints after a successful private upload. COPILOT-2 (publish_hf.py --private flag): 'action=store_true' with 'default=True' made --private permanently True and un-overridable. Switch to BooleanOptionalAction (Python 3.9+), giving '--private' (explicit True) and '--no-private' (False), both with default=True. Users can now pass '--no-private' to upload publicly in a single step without the separate --go-public call. 
COPILOT-3 (publish_kaggle.py visibility reporting): 'visibility = "public" if (args.public or args.update)' falsely reported "public" for '--update' runs. 'kaggle datasets version' pushes to whatever the repo's current visibility is; a version bump on a private dataset is still private. Split into three distinct messages: 'new version pushed' (--update), 'public' (--public), 'private' (default create). Co-Authored-By: Claude Sonnet 4.6 --- scripts/publish_hf.py | 4 ++-- scripts/publish_kaggle.py | 24 +++++++++++++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/scripts/publish_hf.py b/scripts/publish_hf.py index ef85cdf..5d5d02e 100644 --- a/scripts/publish_hf.py +++ b/scripts/publish_hf.py @@ -278,9 +278,9 @@ def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: ) parser.add_argument( "--private", - action="store_true", + action=argparse.BooleanOptionalAction, default=True, - help="Upload as private (default; review before going public)", + help="Upload as private (default). Pass --no-private to upload public directly.", ) parser.add_argument( "--go-public", diff --git a/scripts/publish_kaggle.py b/scripts/publish_kaggle.py index 36f57ea..23db69a 100644 --- a/scripts/publish_kaggle.py +++ b/scripts/publish_kaggle.py @@ -25,13 +25,18 @@ ``--public``, creating a private dataset. Review the live Kaggle page; when satisfied, proceed to step 3. -3. **Flip to public**:: +3. **Flip to public** (manual step — no CLI flag for this):: - python scripts/publish_kaggle.py --go-public + There is no ``--go-public`` flag. After reviewing the private dataset, + flip visibility via the Kaggle web UI (Settings → Visibility → Public) + or via the Kaggle API:: - Calls ``kaggle datasets metadata --unshare`` … actually this is done - via the Kaggle web UI or API; this script prints the direct URL and - the API call to make the switch. 
+ kaggle datasets metadata {DATASET_ID} # download dataset-metadata.json + # edit: set isPrivate: false + kaggle datasets metadata --update {DATASET_ID} # push the change + + This script prints a reminder of these steps with the real dataset ID after a + successful private upload (step 2). For a **new version** of an already-public dataset (future releases):: @@ -288,8 +293,13 @@ def main(argv: Sequence[str] | None = None) -> int: print(f"\nKaggle CLI exited with code {rc}.", file=sys.stderr) return rc - visibility = "public" if (args.public or args.update) else "private" - print(f"\nUpload succeeded ({visibility}).", file=sys.stderr) + if args.update: + visibility_msg = "new version pushed" + elif args.public: + visibility_msg = "public" + else: + visibility_msg = "private" + print(f"\nUpload succeeded ({visibility_msg}).", file=sys.stderr) print(f"Dataset URL: {KAGGLE_DATASET_URL}", file=sys.stderr) if not args.public and not args.update: print(