From e5e907fea17609c884da8a9d031034050c8ac2b9 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 26 May 2026 00:13:45 +0300 Subject: [PATCH] test(scripts): add canonical platform metadata lint --- .agent-plan.md | 11 +- .github/workflows/ci.yml | 2 + scripts/_release_common.py | 13 + scripts/lint_platform_metadata.py | 687 +++++++++++++++++++ tests/scripts/test_lint_platform_metadata.py | 330 +++++++++ 5 files changed, 1042 insertions(+), 1 deletion(-) create mode 100644 scripts/lint_platform_metadata.py create mode 100644 tests/scripts/test_lint_platform_metadata.py diff --git a/.agent-plan.md b/.agent-plan.md index 4c6c5fb..4434017 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -103,10 +103,19 @@ _Source: `docs/external_review/summaries/v1_release_review_synthesis.md` — cro - Labels: `type: docs`, `layer: render` - Size: S (~200 lines across 4 notebooks) +- [x] **SMF-PR5 / PR 8.4a** — `test(scripts): canonical platform metadata lint gate` + - **Add canonical metadata diff/lint step** (HIGH): `scripts/lint_platform_metadata.py` loads the real publication artifacts (`release/kaggle/dataset-metadata.json` + `release/huggingface/README.md`) and fails on Kaggle privacy, license, task, exact platform tag vocabularies, HF split-path/set, Kaggle resource coverage, or task-split schema drift. + - **Offline review bundle guard** (HIGH): lint requires root agent-reviewable artifacts plus per-tier `lead_scoring.csv`, `feature_dictionary.csv`, `dataset_card.md`, `metrics.json`, `manifest.json`, task splits, and snapshot-safe relational tables to be represented in the canonical Kaggle resources list. The resource contract now reuses packaging constants for tiers/tables/root docs instead of shadowing the packager. + - **File-backed schema check**: when tier CSV/parquet files are materialized, lint compares metadata schema declarations to actual CSV headers and parquet Arrow schemas; `--strict-files` turns missing bundle files into release-readiness failures. 
+ - **CI integration**: `release-artifacts-sync` now runs `python scripts/lint_platform_metadata.py` after docs/metrics/claims sync checks, so preview/publish metadata drift blocks PRs. + - **Focused tests**: `tests/scripts/test_lint_platform_metadata.py` covers the clean path and injected failures for private Kaggle metadata, license/task/tag drift, HF split paths absent from Kaggle resources, schema mismatch, and missing per-tier review artifacts. + - Labels: `type: test`, `layer: validation`, `layer: render` + - Size: S (~450 lines, mostly tests) + - [ ] **PR 8.4** — `feat(scripts): integration script + preview hardening` - **Regenerate lockfile + bump to v1.0.1** (HIGH): delete `package-lock.json`, update `package.json` pin to `github:ShmuggingFace/ShmuggingFaceCore#v1.0.1`, regenerate via HTTPS. Fixes SSH lockfile and gets the socks/laundry copy fix in one step. - **Remove fabricated Kaggle usability scores and medals** (HIGH): delete `TIER_USABILITY`, `TIER_MEDAL` constants from `build_shmuggingface_site.py`. These are dead config today (the framework ignores them) but latent misinformation. - - **Make build script read and diff against canonical metadata files** (HIGH): load `release/kaggle/dataset-metadata.json` and `release/huggingface/README.md` in the build script; compare `isPrivate`, tags, license, task, and split counts against the generated config; exit non-zero on mismatch. Structural gap that made `isPrivate: true` invisible in the preview. + - **Remaining preview-generator cleanup** (HIGH): `build_shmuggingface_site.py` no longer exists on `main`; if a ShmuggingFaceCore build path is reintroduced, it must consume or validate against the canonical lint gate from SMF-PR5 rather than hard-code task, tags, license, splits, or schema. - **Raise on missing/malformed manifest+metrics fields** (MEDIUM): replace `manifest.get("n_leads", 5000)` etc. with explicit key lookups that raise clear errors. 
For a tool whose job is faithful preview, silent defaults produce plausible-but-false pages. - **Use per-tier `dataset_card.md` as each tier's page body** (MEDIUM): currently all three tier pages show the same global README. One-line change per tier in the config builder. - **Pin `wrangler` as devDependency + default to preview branch** (MEDIUM): add `wrangler` to `package.json` devDependencies; change `--branch main` to `--branch preview` as the default; add a `--production` flag for intentional production deploys. Prevents clobbering production on every local run. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a89d68..6700bb5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -84,6 +84,8 @@ jobs: run: python scripts/build_claims_register.py --check - name: every claim in claims_register_source.yaml resolves & values match run: python scripts/verify_claims_register.py + - name: Kaggle and Hugging Face metadata agree with release preview contract + run: python scripts/lint_platform_metadata.py validate-dataset: name: Validate lead scoring dataset diff --git a/scripts/_release_common.py b/scripts/_release_common.py index 31d7c94..409d934 100644 --- a/scripts/_release_common.py +++ b/scripts/_release_common.py @@ -363,3 +363,16 @@ def load_relational_column_descriptions(release_dir: Path) -> dict[tuple[str, st #: (DGP description, leakage / acceptance bands, break-me guide, etc.). #: Copied wholesale into the upload tree when present. AGENT_REVIEWABLE_DOCS_DIR: Final[str] = "docs" + +#: Required vendored-doc files that should be represented in the +#: canonical platform metadata. This is separate from the directory +#: copy contract above: packagers copy the whole docs subtree, while +#: metadata lint requires these high-value review files to be visible +#: in the platform file list. 
+AGENT_REVIEWABLE_DOC_FILES: Final[tuple[str, ...]] = ( + "README.md", + "feature_dictionary.md", + "generation_method.md", + "break_me_guide.md", + "relational_table_schemas.csv", +) diff --git a/scripts/lint_platform_metadata.py b/scripts/lint_platform_metadata.py new file mode 100644 index 0000000..441deba --- /dev/null +++ b/scripts/lint_platform_metadata.py @@ -0,0 +1,687 @@ +#!/usr/bin/env python3 +"""Lint canonical Kaggle / Hugging Face metadata before preview or publish. + +This is the explicit diff gate between the two platform artifacts that +drive publication: + +* ``release/kaggle/dataset-metadata.json`` +* ``release/huggingface/README.md`` + +The preview renderers intentionally read those files directly. This +script catches the cases where the canonical files themselves drift: +private Kaggle metadata, license / task / tag mismatches, HF splits +that are absent from the Kaggle resource list, task-split schema drift, +and missing per-tier inputs needed by offline reviewers. + +Exit codes: 0 pass / 1 lint failure / 2 pre-flight error. +""" + +from __future__ import annotations + +import argparse +import csv +import json +import re +import sys +from collections.abc import Iterable, Sequence +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Final + +import yaml + +# Make ``scripts/`` importable regardless of whether this file is run +# as ``python scripts/lint_platform_metadata.py`` or imported from tests. 
+sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from _release_common import ( # noqa: E402 + AGENT_REVIEWABLE_DOC_FILES, + AGENT_REVIEWABLE_DOCS_DIR, + AGENT_REVIEWABLE_ROOT_FILES, +) +from package_hf_release import DEFAULT_TAGS as DEFAULT_HF_TAGS # noqa: E402 +from package_kaggle_release import ( # noqa: E402 + BUNDLE_TABLES, + DEFAULT_TASK, + DEFAULT_TIERS, + fields_from_parquet, +) +from package_kaggle_release import ( + DEFAULT_KEYWORDS as DEFAULT_KAGGLE_KEYWORDS, +) + +DEFAULT_RELEASE_DIR: Final[Path] = Path("release") +HF_SPLIT_TO_FILE_SPLIT: Final[dict[str, str]] = { + "train": "train", + "validation": "valid", + "test": "test", +} +REQUIRED_COMMON_TAGS: Final[frozenset[str]] = frozenset( + {"b2b", "crm", "lead-scoring", "synthetic-data", "tabular"} +) +EXPECTED_KAGGLE_KEYWORDS: Final[frozenset[str]] = frozenset(DEFAULT_KAGGLE_KEYWORDS) +EXPECTED_HF_TAGS: Final[frozenset[str]] = frozenset(DEFAULT_HF_TAGS) +REQUIRED_HF_TASK: Final[str] = "tabular-classification" +REQUIRED_LICENSES: Final[dict[str, str]] = { + "kaggle": "MIT", + "hf": "mit", +} +REQUIRED_TIER_RESOURCES: Final[tuple[str, ...]] = ( + "lead_scoring.csv", + "feature_dictionary.csv", + "dataset_card.md", + "metrics.json", + "manifest.json", + f"tasks/{DEFAULT_TASK}/train.parquet", + f"tasks/{DEFAULT_TASK}/valid.parquet", + f"tasks/{DEFAULT_TASK}/test.parquet", + *(f"tables/{table}.parquet" for table in BUNDLE_TABLES), +) +REQUIRED_ROOT_RESOURCES: Final[tuple[str, ...]] = ( + *(rel for rel, required in AGENT_REVIEWABLE_ROOT_FILES if required), + *(f"{AGENT_REVIEWABLE_DOCS_DIR}/{filename}" for filename in AGENT_REVIEWABLE_DOC_FILES), +) + +_FRONTMATTER_RE: Final[re.Pattern[str]] = re.compile( + r"\A---\n(?P<yaml>.*?)\n---\n(?P<body>.*)\Z", + re.DOTALL, +) + + +@dataclass(frozen=True) +class LintFinding: + """One platform-metadata mismatch.""" + + field: str + message: str + + +@dataclass(frozen=True) +class LintOutcome: + """Return value from :func:`run_lint`.""" + + findings: tuple[LintFinding, 
...] + + @property + def ok(self) -> bool: + return not self.findings + + +def _load_json_object(path: Path) -> dict[str, Any]: + if not path.is_file(): + raise FileNotFoundError(f"missing JSON artifact: {path}") + value = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(value, dict): + raise ValueError(f"{path} is not a JSON object") + return value + + +def _load_hf_frontmatter(path: Path) -> dict[str, Any]: + if not path.is_file(): + raise FileNotFoundError(f"missing HF README artifact: {path}") + text = path.read_text(encoding="utf-8") + match = _FRONTMATTER_RE.match(text) + if not match: + raise ValueError(f"{path} is missing YAML frontmatter") + value = yaml.safe_load(match.group("yaml")) or {} + if not isinstance(value, dict): + raise ValueError(f"{path} frontmatter is not a YAML mapping") + return value + + +def _resource_map(kaggle_metadata: dict[str, Any]) -> dict[str, dict[str, Any]]: + resources = kaggle_metadata.get("resources", []) + if not isinstance(resources, list): + return {} + out: dict[str, dict[str, Any]] = {} + for resource in resources: + if isinstance(resource, dict) and isinstance(resource.get("path"), str): + out[resource["path"]] = resource + return out + + +def _field_signature(resource: dict[str, Any]) -> tuple[tuple[str, str], ...] | None: + schema = resource.get("schema") + if not isinstance(schema, dict): + return None + fields = schema.get("fields") + if not isinstance(fields, list): + return None + signature: list[tuple[str, str]] = [] + for field in fields: + if not isinstance(field, dict): + return None + name = field.get("name") + field_type = field.get("type") + if not isinstance(name, str) or not isinstance(field_type, str): + return None + signature.append((name, field_type)) + return tuple(signature) + + +def _field_names(resource: dict[str, Any]) -> tuple[str, ...] 
| None: + signature = _field_signature(resource) + if signature is None: + return None + return tuple(name for name, _ in signature) + + +def _as_str_list(value: Any) -> list[str]: + if not isinstance(value, list): + return [] + return [x for x in value if isinstance(x, str)] + + +def _hf_configs(frontmatter: dict[str, Any]) -> list[dict[str, Any]]: + configs = frontmatter.get("configs") + if not isinstance(configs, list): + return [] + return [c for c in configs if isinstance(c, dict)] + + +def _iter_hf_data_files(configs: Iterable[dict[str, Any]]) -> Iterable[tuple[str, str, str]]: + for config in configs: + config_name = config.get("config_name") + data_files = config.get("data_files") + if not isinstance(config_name, str) or not isinstance(data_files, list): + continue + for data_file in data_files: + if not isinstance(data_file, dict): + continue + split = data_file.get("split") + path = data_file.get("path") + if isinstance(split, str) and isinstance(path, str): + yield config_name, split, path + + +def _expected_hf_data_files(tier: str, *, task: str) -> tuple[tuple[str, str], ...]: + return tuple( + (hf_split, f"{tier}/tasks/{task}/{file_split}.parquet") + for hf_split, file_split in HF_SPLIT_TO_FILE_SPLIT.items() + ) + + +def _lint_privacy_license_task_tags( + kaggle_metadata: dict[str, Any], + hf_frontmatter: dict[str, Any], +) -> list[LintFinding]: + findings: list[LintFinding] = [] + + if kaggle_metadata.get("isPrivate") is not False: + findings.append( + LintFinding( + "kaggle.isPrivate", + "expected false so the preview catches the private-publish blocker", + ) + ) + + kaggle_licenses = kaggle_metadata.get("licenses") + kaggle_license = None + if ( + isinstance(kaggle_licenses, list) + and kaggle_licenses + and isinstance(kaggle_licenses[0], dict) + ): + kaggle_license = kaggle_licenses[0].get("name") + if kaggle_license != REQUIRED_LICENSES["kaggle"]: + findings.append( + LintFinding( + "kaggle.licenses[0].name", + f"expected 
{REQUIRED_LICENSES['kaggle']!r}, got {kaggle_license!r}", + ) + ) + + hf_license = hf_frontmatter.get("license") + if hf_license != REQUIRED_LICENSES["hf"]: + findings.append( + LintFinding( + "hf.license", + f"expected {REQUIRED_LICENSES['hf']!r}, got {hf_license!r}", + ) + ) + + hf_tasks = set(_as_str_list(hf_frontmatter.get("task_categories"))) + if REQUIRED_HF_TASK not in hf_tasks: + findings.append( + LintFinding( + "hf.task_categories", + f"must contain {REQUIRED_HF_TASK!r}", + ) + ) + + kaggle_keywords = set(_as_str_list(kaggle_metadata.get("keywords"))) + hf_tags = set(_as_str_list(hf_frontmatter.get("tags"))) + missing_kaggle_tags = sorted(REQUIRED_COMMON_TAGS - kaggle_keywords) + missing_hf_tags = sorted(REQUIRED_COMMON_TAGS - hf_tags) + if missing_kaggle_tags: + findings.append( + LintFinding( + "kaggle.keywords", + f"missing common topical tag(s): {missing_kaggle_tags}", + ) + ) + if missing_hf_tags: + findings.append( + LintFinding( + "hf.tags", + f"missing common topical tag(s): {missing_hf_tags}", + ) + ) + + if kaggle_keywords != EXPECTED_KAGGLE_KEYWORDS: + findings.append( + LintFinding( + "kaggle.keywords", + ( + "expected exact keyword set " + f"{sorted(EXPECTED_KAGGLE_KEYWORDS)!r}, got {sorted(kaggle_keywords)!r}" + ), + ) + ) + if hf_tags != EXPECTED_HF_TAGS: + findings.append( + LintFinding( + "hf.tags", + f"expected exact tag set {sorted(EXPECTED_HF_TAGS)!r}, got {sorted(hf_tags)!r}", + ) + ) + + # The HF task category should be echoed on Kaggle through the two + # searchable keywords Kaggle actually exposes for this release. 
+ for keyword in ("classification", "tabular"): + if keyword not in kaggle_keywords: + findings.append( + LintFinding( + "kaggle.keywords", + f"missing task-discovery keyword {keyword!r}", + ) + ) + + return findings + + +def _lint_hf_configs( + configs: list[dict[str, Any]], + *, + tiers: Sequence[str], + task: str, +) -> list[LintFinding]: + findings: list[LintFinding] = [] + names = [c.get("config_name") for c in configs if isinstance(c.get("config_name"), str)] + if names != list(tiers): + findings.append( + LintFinding( + "hf.configs", + f"expected config order {list(tiers)!r}, got {names!r}", + ) + ) + + defaults = [c.get("config_name") for c in configs if c.get("default") is True] + if len(defaults) != 1: + findings.append( + LintFinding( + "hf.configs", + f"expected exactly one default config, got {defaults!r}", + ) + ) + + by_name = { + str(config.get("config_name")): config + for config in configs + if isinstance(config.get("config_name"), str) + } + for tier in tiers: + config = by_name.get(tier) + if config is None: + continue + data_files = config.get("data_files") + if not isinstance(data_files, list): + findings.append( + LintFinding( + "hf.configs.data_files", + f"{tier} must declare data_files as a list", + ) + ) + continue + actual: list[tuple[str, str]] = [] + malformed = 0 + for data_file in data_files: + if not isinstance(data_file, dict): + malformed += 1 + continue + split = data_file.get("split") + path = data_file.get("path") + if not isinstance(split, str) or not isinstance(path, str): + malformed += 1 + continue + actual.append((split, path)) + if malformed: + findings.append( + LintFinding( + "hf.configs.data_files", + f"{tier} has {malformed} malformed data_files entrie(s)", + ) + ) + expected = list(_expected_hf_data_files(tier, task=task)) + if actual != expected: + findings.append( + LintFinding( + "hf.configs.data_files", + f"{tier} data_files expected {expected!r}, got {actual!r}", + ) + ) + + return findings + + +def 
_lint_resource_coverage( + resources: dict[str, dict[str, Any]], + *, + tiers: Sequence[str], + task: str, +) -> list[LintFinding]: + findings: list[LintFinding] = [] + for path in REQUIRED_ROOT_RESOURCES: + if path not in resources: + findings.append( + LintFinding( + "kaggle.resources", + f"missing agent-reviewable root artifact {path!r}", + ) + ) + for tier in tiers: + for suffix in REQUIRED_TIER_RESOURCES: + path = f"{tier}/{suffix}" + if task != DEFAULT_TASK: + path = path.replace(DEFAULT_TASK, task) + if path not in resources: + findings.append( + LintFinding( + "kaggle.resources", + f"missing per-tier review artifact {path!r}", + ) + ) + return findings + + +def _lint_split_and_schema_consistency( + resources: dict[str, dict[str, Any]], + hf_configs: list[dict[str, Any]], + *, + tiers: Sequence[str], + task: str, +) -> list[LintFinding]: + findings: list[LintFinding] = [] + + for _tier, _split, path in _iter_hf_data_files(hf_configs): + if path not in resources: + findings.append( + LintFinding( + "hf.configs.data_files", + f"HF data file {path!r} is absent from Kaggle resources", + ) + ) + + for tier in tiers: + flat_path = f"{tier}/lead_scoring.csv" + flat_fields = _field_names(resources.get(flat_path, {})) + if flat_fields is None: + findings.append( + LintFinding( + "kaggle.resources.schema", + f"{flat_path!r} must declare schema.fields", + ) + ) + continue + if not flat_fields or flat_fields[0] != "split": + findings.append( + LintFinding( + "kaggle.resources.schema", + f"{flat_path!r} must expose the split column first", + ) + ) + task_expected_fields = tuple(name for name in flat_fields if name != "split") + + split_signatures: dict[str, tuple[tuple[str, str], ...]] = {} + for file_split in HF_SPLIT_TO_FILE_SPLIT.values(): + split_path = f"{tier}/tasks/{task}/{file_split}.parquet" + resource = resources.get(split_path) + if resource is None: + continue + signature = _field_signature(resource) + if signature is None: + findings.append( + LintFinding( + 
"kaggle.resources.schema", + f"{split_path!r} must declare schema.fields", + ) + ) + continue + split_signatures[file_split] = signature + split_names = tuple(name for name, _ in signature) + if split_names != task_expected_fields: + findings.append( + LintFinding( + "kaggle.resources.schema", + f"{split_path!r} schema differs from {flat_path!r} minus split", + ) + ) + + if split_signatures: + first_split, first_signature = next(iter(split_signatures.items())) + for split_name, signature in split_signatures.items(): + if signature != first_signature: + findings.append( + LintFinding( + "kaggle.resources.schema", + (f"{tier} {split_name!r} schema differs from {first_split!r} schema"), + ) + ) + + return findings + + +def _flat_csv_actual_fields(path: Path) -> tuple[str, ...]: + with path.open(newline="", encoding="utf-8") as f: + reader = csv.reader(f) + try: + header = next(reader) + except StopIteration: + return () + return tuple(header) + + +def _parquet_actual_signature(path: Path) -> tuple[tuple[str, str], ...]: + return tuple((field.name, field.type) for field in fields_from_parquet(path)) + + +def _lint_actual_file_schemas( + resources: dict[str, dict[str, Any]], + *, + release_dir: Path, + tiers: Sequence[str], + task: str, + strict_files: bool, +) -> list[LintFinding]: + """Compare metadata schemas to files on disk when bundle files exist. + + Fresh checkouts intentionally do not materialise the heavy tier + bundle directories, so missing files are soft-skipped by default. + Release-readiness jobs can pass ``--strict-files`` to turn those + skips into failures. 
+ """ + + findings: list[LintFinding] = [] + for tier in tiers: + flat_rel = f"{tier}/lead_scoring.csv" + flat_path = release_dir / flat_rel + if not flat_path.is_file(): + if strict_files: + findings.append( + LintFinding( + "release.files", + f"missing release file required for strict schema lint: {flat_rel!r}", + ) + ) + continue + + actual_flat_fields = _flat_csv_actual_fields(flat_path) + declared_flat_fields = _field_names(resources.get(flat_rel, {})) + if declared_flat_fields != actual_flat_fields: + findings.append( + LintFinding( + "kaggle.resources.schema", + ( + f"{flat_rel!r} metadata fields differ from actual CSV header: " + f"declared={declared_flat_fields!r}, actual={actual_flat_fields!r}" + ), + ) + ) + + for file_split in HF_SPLIT_TO_FILE_SPLIT.values(): + split_rel = f"{tier}/tasks/{task}/{file_split}.parquet" + split_path = release_dir / split_rel + if not split_path.is_file(): + if strict_files: + findings.append( + LintFinding( + "release.files", + ( + "missing release file required for strict schema lint: " + f"{split_rel!r}" + ), + ) + ) + continue + declared = _field_signature(resources.get(split_rel, {})) + actual = _parquet_actual_signature(split_path) + if declared != actual: + findings.append( + LintFinding( + "kaggle.resources.schema", + ( + f"{split_rel!r} metadata schema differs from actual parquet " + f"schema: declared={declared!r}, actual={actual!r}" + ), + ) + ) + + return findings + + +def lint_metadata( + kaggle_metadata: dict[str, Any], + hf_frontmatter: dict[str, Any], + *, + tiers: Sequence[str] = DEFAULT_TIERS, + task: str = DEFAULT_TASK, + release_dir: Path | None = None, + strict_files: bool = False, +) -> LintOutcome: + """Run all platform-metadata lint checks against parsed artifacts.""" + + findings: list[LintFinding] = [] + resources = _resource_map(kaggle_metadata) + configs = _hf_configs(hf_frontmatter) + + if not resources: + findings.append(LintFinding("kaggle.resources", "must contain resource objects")) + if not 
configs: + findings.append(LintFinding("hf.configs", "must contain config objects")) + + findings.extend(_lint_privacy_license_task_tags(kaggle_metadata, hf_frontmatter)) + findings.extend(_lint_hf_configs(configs, tiers=tiers, task=task)) + findings.extend(_lint_resource_coverage(resources, tiers=tiers, task=task)) + findings.extend(_lint_split_and_schema_consistency(resources, configs, tiers=tiers, task=task)) + if release_dir is not None: + findings.extend( + _lint_actual_file_schemas( + resources, + release_dir=release_dir, + tiers=tiers, + task=task, + strict_files=strict_files, + ) + ) + return LintOutcome(findings=tuple(findings)) + + +def run_lint( + release_dir: Path, + *, + tiers: Sequence[str] = DEFAULT_TIERS, + task: str = DEFAULT_TASK, + strict_files: bool = False, +) -> LintOutcome: + """Load canonical artifacts from ``release_dir`` and lint them.""" + + kaggle_metadata = _load_json_object(release_dir / "kaggle" / "dataset-metadata.json") + hf_frontmatter = _load_hf_frontmatter(release_dir / "huggingface" / "README.md") + return lint_metadata( + kaggle_metadata, + hf_frontmatter, + tiers=tiers, + task=task, + release_dir=release_dir, + strict_files=strict_files, + ) + + +def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="lint_platform_metadata", + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--release-dir", + type=Path, + default=DEFAULT_RELEASE_DIR, + help="release tree containing kaggle/ and huggingface/ artifacts (default: %(default)s)", + ) + parser.add_argument( + "--tier", + action="append", + dest="tiers", + default=None, + help="tier/config to validate (repeatable; default: intro/intermediate/advanced)", + ) + parser.add_argument( + "--task", + default=DEFAULT_TASK, + help="task directory under each tier (default: %(default)s)", + ) + parser.add_argument( + "--strict-files", + action="store_true", + help=( + "fail if 
tier CSV/parquet files are missing instead of soft-skipping " + "file-backed schema checks" + ), + ) + return parser.parse_args(argv) + + +def main(argv: Sequence[str] | None = None) -> int: + args = parse_args(argv) + tiers = tuple(args.tiers) if args.tiers else DEFAULT_TIERS + try: + outcome = run_lint( + args.release_dir, + tiers=tiers, + task=args.task, + strict_files=args.strict_files, + ) + except (FileNotFoundError, ValueError, json.JSONDecodeError, yaml.YAMLError) as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + if not outcome.ok: + for finding in outcome.findings: + print(f"{finding.field}: {finding.message}", file=sys.stderr) + return 1 + print("platform metadata lint passed", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/scripts/test_lint_platform_metadata.py b/tests/scripts/test_lint_platform_metadata.py new file mode 100644 index 0000000..4ac5b1f --- /dev/null +++ b/tests/scripts/test_lint_platform_metadata.py @@ -0,0 +1,330 @@ +"""Tests for ``scripts/lint_platform_metadata.py``. + +The lint gate compares the two canonical platform artifacts used by +real publication and by the local preview pages: + +* ``release/kaggle/dataset-metadata.json`` +* ``release/huggingface/README.md`` + +It is intentionally metadata-only, so it runs on a fresh checkout even +when the heavy per-tier bundle directories are not materialised. 
+""" + +from __future__ import annotations + +import importlib.util +import json +import sys +from copy import deepcopy +from pathlib import Path + +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + +_REPO_ROOT = Path(__file__).resolve().parents[2] +_SCRIPT_PATH = _REPO_ROOT / "scripts" / "lint_platform_metadata.py" +_spec = importlib.util.spec_from_file_location("lint_platform_metadata", _SCRIPT_PATH) +assert _spec is not None +assert _spec.loader is not None +lint = importlib.util.module_from_spec(_spec) +sys.modules["lint_platform_metadata"] = lint +_spec.loader.exec_module(lint) + +EXPECTED_ROOT_RESOURCES = ( + "metrics.json", + "claims_register.md", + "claims_register.json", + "docs/README.md", + "docs/feature_dictionary.md", + "docs/generation_method.md", + "docs/break_me_guide.md", + "docs/relational_table_schemas.csv", +) +EXPECTED_TIER_RESOURCES = ( + "lead_scoring.csv", + "feature_dictionary.csv", + "dataset_card.md", + "metrics.json", + "manifest.json", + "tasks/converted_within_90_days/train.parquet", + "tasks/converted_within_90_days/valid.parquet", + "tasks/converted_within_90_days/test.parquet", + "tables/accounts.parquet", + "tables/contacts.parquet", + "tables/leads.parquet", + "tables/touches.parquet", + "tables/sessions.parquet", + "tables/sales_activities.parquet", + "tables/opportunities.parquet", +) + + +def _field(name: str, field_type: str = "string") -> dict[str, str]: + return {"name": name, "type": field_type, "description": f"{name} description"} + + +def _resource(path: str, fields: list[dict[str, str]] | None = None) -> dict[str, object]: + resource: dict[str, object] = {"path": path, "description": f"{path} resource"} + if fields is not None: + resource["schema"] = {"fields": fields} + return resource + + +def _minimal_artifacts() -> tuple[dict[str, object], dict[str, object]]: + flat_fields = [ + _field("split"), + _field("account_id"), + _field("score", "number"), + _field("converted_within_90_days", "boolean"), + 
] + task_fields = flat_fields[1:] + resources: list[dict[str, object]] = [] + for path in EXPECTED_ROOT_RESOURCES: + resources.append(_resource(path)) + for tier in lint.DEFAULT_TIERS: + for suffix in EXPECTED_TIER_RESOURCES: + path = f"{tier}/{suffix}" + if suffix == "lead_scoring.csv": + resources.append(_resource(path, flat_fields)) + elif suffix.startswith(f"tasks/{lint.DEFAULT_TASK}/"): + resources.append(_resource(path, task_fields)) + else: + resources.append(_resource(path)) + + kaggle = { + "title": "LeadForge test", + "id": "leadforge/leadforge-lead-scoring-v1", + "subtitle": "A metadata lint fixture", + "description": "body", + "isPrivate": False, + "licenses": [{"name": "MIT"}], + "keywords": [ + "b2b", + "classification", + "crm", + "education", + "lead-scoring", + "saas", + "synthetic-data", + "tabular", + ], + "expectedUpdateFrequency": "never", + "image": "dataset-cover-image.png", + "resources": resources, + } + hf = { + "pretty_name": "LeadForge test", + "license": "mit", + "language": ["en"], + "task_categories": ["tabular-classification"], + "size_categories": ["1K list[str]: + return [f"{finding.field}: {finding.message}" for finding in outcome.findings] + + +def test_lint_accepts_matching_platform_metadata() -> None: + kaggle, hf = _minimal_artifacts() + outcome = lint.lint_metadata(kaggle, hf) + assert outcome.ok + assert outcome.findings == () + + +def test_lint_catches_private_kaggle_metadata() -> None: + kaggle, hf = _minimal_artifacts() + kaggle["isPrivate"] = True + outcome = lint.lint_metadata(kaggle, hf) + assert not outcome.ok + assert any("kaggle.isPrivate" in msg for msg in _messages(outcome)) + + +def test_lint_catches_license_task_and_tag_mismatches() -> None: + kaggle, hf = _minimal_artifacts() + kaggle["licenses"] = [{"name": "CC0"}] + kaggle["keywords"] = ["crm"] + hf["license"] = "apache-2.0" + hf["task_categories"] = ["text-classification"] + hf["tags"] = ["crm"] + outcome = lint.lint_metadata(kaggle, hf) + messages = 
"\n".join(_messages(outcome)) + assert "kaggle.licenses[0].name" in messages + assert "hf.license" in messages + assert "hf.task_categories" in messages + assert "kaggle.keywords" in messages + assert "hf.tags" in messages + + +def test_lint_catches_hf_split_path_absent_from_kaggle_resources() -> None: + kaggle, hf = _minimal_artifacts() + hf["configs"][0]["data_files"][0]["path"] = "intro/tasks/converted_within_90_days/oops.parquet" + outcome = lint.lint_metadata(kaggle, hf) + messages = "\n".join(_messages(outcome)) + assert "hf.configs.data_files" in messages + assert "oops.parquet" in messages + + +def test_lint_catches_missing_hf_split_entry() -> None: + kaggle, hf = _minimal_artifacts() + hf["configs"][0]["data_files"] = hf["configs"][0]["data_files"][:1] + outcome = lint.lint_metadata(kaggle, hf) + messages = "\n".join(_messages(outcome)) + assert "intro data_files expected" in messages + assert "valid.parquet" in messages + assert "test.parquet" in messages + + +def test_lint_catches_schema_mismatch_between_flat_csv_and_task_splits() -> None: + kaggle, hf = _minimal_artifacts() + mutated = deepcopy(kaggle["resources"]) + for resource in mutated: + if resource["path"] == f"intro/tasks/{lint.DEFAULT_TASK}/train.parquet": + resource["schema"]["fields"] = [_field("account_id"), _field("unexpected")] + break + kaggle["resources"] = mutated + outcome = lint.lint_metadata(kaggle, hf) + messages = "\n".join(_messages(outcome)) + assert "schema differs from 'intro/lead_scoring.csv' minus split" in messages + + +def test_lint_catches_missing_per_tier_review_artifact() -> None: + kaggle, hf = _minimal_artifacts() + kaggle["resources"] = [r for r in kaggle["resources"] if r["path"] != "advanced/metrics.json"] + outcome = lint.lint_metadata(kaggle, hf) + messages = "\n".join(_messages(outcome)) + assert "missing per-tier review artifact 'advanced/metrics.json'" in messages + + +def test_lint_compares_metadata_schema_to_actual_files_when_present(tmp_path: Path) -> None: 
+ kaggle, hf = _minimal_artifacts() + release_dir = tmp_path / "release" + task_dir = release_dir / "intro" / "tasks" / lint.DEFAULT_TASK + task_dir.mkdir(parents=True) + (release_dir / "intro" / "lead_scoring.csv").write_text( + "split,account_id,score,converted_within_90_days\ntrain,acct_1,0.1,false\n", + encoding="utf-8", + ) + pq.write_table( + pa.table( + { + "account_id": pa.array(["acct_1"], pa.string()), + "score": pa.array([0.1], pa.float64()), + "unexpected": pa.array([False], pa.bool_()), + } + ), + task_dir / "train.parquet", + ) + outcome = lint.lint_metadata( + kaggle, + hf, + release_dir=release_dir, + tiers=("intro",), + ) + messages = "\n".join(_messages(outcome)) + assert "metadata schema differs from actual parquet schema" in messages + assert "unexpected" in messages + + +def test_strict_files_fails_when_release_files_are_missing(tmp_path: Path) -> None: + kaggle, hf = _minimal_artifacts() + outcome = lint.lint_metadata( + kaggle, + hf, + release_dir=tmp_path / "release", + tiers=("intro",), + strict_files=True, + ) + messages = "\n".join(_messages(outcome)) + assert "missing release file required for strict schema lint" in messages + assert "intro/lead_scoring.csv" in messages + + +def test_committed_release_artifacts_pass_lint() -> None: + outcome = lint.run_lint(_REPO_ROOT / "release") + assert outcome.ok, _messages(outcome) + + +def test_main_returns_1_on_lint_failure(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + release_dir = tmp_path / "release" + (release_dir / "kaggle").mkdir(parents=True) + (release_dir / "huggingface").mkdir() + kaggle, hf = _minimal_artifacts() + kaggle["isPrivate"] = True + (release_dir / "kaggle" / "dataset-metadata.json").write_text( + json.dumps(kaggle), encoding="utf-8" + ) + (release_dir / "huggingface" / "README.md").write_text( + "---\n" + lint.yaml.safe_dump(hf, sort_keys=False) + "---\nbody\n", + encoding="utf-8", + ) + rc = lint.main(["--release-dir", str(release_dir)]) + assert rc == 1 + 
captured = capsys.readouterr() + assert "kaggle.isPrivate" in captured.err + + +def test_main_returns_2_on_missing_artifact( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + rc = lint.main(["--release-dir", str(tmp_path / "missing")]) + assert rc == 2 + captured = capsys.readouterr() + assert "missing JSON artifact" in captured.err