#!/usr/bin/env python3
"""
Material review inventory (CIP-0002).

Builds a simple inventory of \\include{...} usage across:
- ExecEd lecture sources (execed/_lamd/*.md)
- Talk sources (e.g. ~/lawrennd/talks/_atomic-human/, _business/, _policy/, _economics/)

Outputs:
- JSON (full inventory + frequency tables)
- CSV (include frequency table)

Design goals:
- No third-party dependencies.
- Works even if target directories live outside this repo.
- Can be upstreamed/mirrored into lamd tooling later.
"""

from __future__ import annotations

import argparse
import csv
import dataclasses
import datetime as dt
import json
import os
import re
import subprocess
from pathlib import Path
from typing import Iterable, Optional


INCLUDE_RE = re.compile(r"\\include\{([^}]+)\}")
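# Matches LaTeX-style include macros, e.g. "\include{_ml/includes/foo.md}"
# (illustrative path), capturing the target path inside the braces.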


SKIP_DIR_NAMES = {
    ".git",
    "_site",
    ".jekyll-cache",
    ".sass-cache",
    "vendor",
    ".venv",
    ".venv-vibesafe",
    "__pycache__",
    ".ipynb_checkpoints",
}


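# File suffixes that look like generated build outputs (rendered posts and
# slides, notebooks) rather than hand-edited sources.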
SKIP_FILE_SUFFIXES = (
    ".posts.html",
    ".posts.html.markdown",
    ".slides.html",
    ".slides.html.markdown",
    ".notes.ipynb.markdown",
    ".ipynb",
)


def run_git_last_change_date(repo_dir: Path, file_path: Path) -> Optional[dt.date]:
    """
    Return the last-change date for the file via git, or None if unavailable.
    """
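    # "%cs" is the committer date in short ISO form (YYYY-MM-DD), which
    # date.fromisoformat below parses directly.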
    try:
        out = subprocess.check_output(
            ["git", "-C", str(repo_dir), "log", "-1", "--format=%cs", "--", str(file_path)],
            stderr=subprocess.DEVNULL,
            text=True,
        ).strip()
    except Exception:
        return None
    if not out:
        return None
    try:
        return dt.date.fromisoformat(out)
    except ValueError:
        return None


def find_repo_root(start: Path) -> Optional[Path]:
    cur = start.resolve()
    if cur.is_file():
        cur = cur.parent
    for p in [cur] + list(cur.parents):
        if (p / ".git").exists():
            return p
    return None


def iter_source_files(root: Path) -> Iterable[Path]:
    """
    Yield .md and .gpp.markdown files under root, skipping generated artifacts.
    """
    root = root.resolve()
    if not root.exists():
        return
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune skipped directories in place so os.walk never descends into them.
        dirnames[:] = [d for d in dirnames if d not in SKIP_DIR_NAMES]
        for name in filenames:
            if name.endswith(SKIP_FILE_SUFFIXES):
                continue
            if name.endswith(".md") or name.endswith(".gpp.markdown"):
                yield Path(dirpath) / name


def parse_frontmatter_session(md_text: str) -> Optional[str]:
    """
    Best-effort frontmatter parse for the 'session:' value.
    """
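    # Expects a leading YAML block like (illustrative values):
    #   ---
    #   session: "3"
    #   ---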
    lines = md_text.splitlines()
    if not (lines and lines[0].strip() == "---"):
        return None
    # Find the closing "---" of the frontmatter block.
    try:
        end = next(i for i in range(1, len(lines)) if lines[i].strip() == "---")
    except StopIteration:
        return None
    for line in lines[1:end]:
        if line.strip().startswith("session:"):
            return line.split(":", 1)[1].strip().strip('"').strip("'")
    return None


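# Duplicate includes within a file are collapsed by the set; sorting keeps
# the inventory output deterministic.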
def extract_includes(md_text: str) -> list[str]:
    return sorted({m.group(1).strip() for m in INCLUDE_RE.finditer(md_text)})


@dataclasses.dataclass(frozen=True)
class FileInventory:
    path: str
    includes: list[str]
    session: Optional[str] = None
    last_changed: Optional[str] = None  # YYYY-MM-DD when available


def build_inventory(
    roots: list[Path],
    since: Optional[dt.date],
) -> list[FileInventory]:
    inv: list[FileInventory] = []
    for root in roots:
        repo_root = find_repo_root(root)
        for f in iter_source_files(root):
            try:
                txt = f.read_text(errors="ignore")
            except Exception:
                continue
            includes = extract_includes(txt)
            if not includes:
                continue

            session = parse_frontmatter_session(txt)

            last_changed: Optional[dt.date] = None
            if repo_root is not None:
                last_changed = run_git_last_change_date(repo_root, f)
            if last_changed is None:
                try:
                    last_changed = dt.date.fromtimestamp(f.stat().st_mtime)
                except Exception:
                    last_changed = None

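            # Files whose change date cannot be recovered are kept even when
            # --since is set; the filter only drops files known to be older.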
            if since is not None and last_changed is not None and last_changed < since:
                continue

            inv.append(
                FileInventory(
                    path=str(f),
                    includes=includes,
                    session=session,
                    last_changed=last_changed.isoformat() if last_changed else None,
                )
            )
    return inv


def include_frequency(items: list[FileInventory]) -> dict[str, int]:
    freq: dict[str, int] = {}
    for it in items:
        for inc in it.includes:
            freq[inc] = freq.get(inc, 0) + 1
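    # Order by descending count, then by include name, for stable output.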
    return dict(sorted(freq.items(), key=lambda kv: (-kv[1], kv[0])))


def main() -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument("--execed-lamd", type=Path, default=Path("execed/_lamd"))
    ap.add_argument("--talks-dir", type=Path, action="append", default=[])
    ap.add_argument("--since", type=str, default=None, help="Only include files changed on/after YYYY-MM-DD")
    ap.add_argument("--out-dir", type=Path, default=None, help="Write outputs to this directory")
    args = ap.parse_args()

    since: Optional[dt.date] = None
    if args.since:
        since = dt.date.fromisoformat(args.since)

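    # The ExecEd inventory is always complete; --since filters only the talks.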
    execed_items = build_inventory([args.execed_lamd], since=None)
    talks_items = build_inventory(args.talks_dir, since=since) if args.talks_dir else []

    out = {
        "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(),
        "since": since.isoformat() if since else None,
        "execed": {
            "root": str(args.execed_lamd),
            "files": [dataclasses.asdict(x) for x in execed_items],
            "include_frequency": include_frequency(execed_items),
        },
        "talks": {
            "roots": [str(p) for p in args.talks_dir],
            "files": [dataclasses.asdict(x) for x in talks_items],
            "include_frequency": include_frequency(talks_items),
        },
    }

    if args.out_dir:
        args.out_dir.mkdir(parents=True, exist_ok=True)
        json_path = args.out_dir / "inventory.json"
        csv_path = args.out_dir / "include_frequency.csv"

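        # sort_keys=True keeps the JSON output stable across runs for diffing.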
        json_path.write_text(json.dumps(out, indent=2, sort_keys=True))

        # CSV: combined include frequency with execed + talks counts
        execed_freq = out["execed"]["include_frequency"]
        talks_freq = out["talks"]["include_frequency"]
        all_incs = sorted(set(execed_freq.keys()) | set(talks_freq.keys()))
        with csv_path.open("w", newline="") as f:
            w = csv.writer(f)
            w.writerow(["include", "execed_count", "talks_count", "total"])
            for inc in all_incs:
                e = int(execed_freq.get(inc, 0))
                t = int(talks_freq.get(inc, 0))
                w.writerow([inc, e, t, e + t])

    else:
        print(json.dumps(out, indent=2, sort_keys=True))

    return 0


if __name__ == "__main__":
    raise SystemExit(main())