diff --git a/CLAUDE.md b/CLAUDE.md index 060080d..c732359 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,8 +13,15 @@ scans/ input PDFs (gitignored; SCANS_ROOT) data/ outputs (gitignored; DATA_ROOT) pages//page-NN.png rendered images results//page-NN.json extraction results (one PageResult per page) + verifier/.bundle.json pre-processor output: result + per-row bboxes + verifier/.verified.json verifier UI export: hand-corrected PageResult jobs.db SQLite job table +verifier/ static SPA for manual row-by-row verification. + Loads a bundle, renders each row's cropped + image strip next to an editable text field, + exports a corrected verified.json. + core/ schema.py Pydantic models. GeminiPageResult is what the model returns (used as response_schema); @@ -41,12 +48,28 @@ core/ PageLayout (header_bottom_y, body_mid_y, column_mid_x). Used by the per-quadrant cropper in scripts/calibrate_models.py. + `partition_row_lines_by_quadrant(image, + layout)` is the public hook the verifier + pre-processor uses to compute per-row bboxes. continuations.py Read-time merge of `notes="continuation"` rows into the prior entry's raw_text. Pure function; on-disk shape unchanged. cli.py Typer entrypoint: `flowsheets `. Builds dependencies from env, calls into core. + +scripts/ + make_verifier_bundle.py PageResult JSON + page PNG -> verifier + bundle.json with per-quadrant + per-row + bboxes for the SPA to canvas-crop. Hard-codes + SCHEMA_VERSION = 1; bump on incompatible + schema changes. + derive_truth.py .verified.json -> .truth.json + by extracting short uppercased substrings + (page date tokens, jock prefix, artist + portion of raw_text). Single source of + truth for those rules — the UI doesn't + derive truth itself. ``` ## Why these choices diff --git a/README.md b/README.md index 0e6c71c..0632ff1 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,30 @@ Tests are split into: The default test run **excludes** the `external_api` and `slow` markers; CI runs the same default. The golden-page external-API runner is a follow-up. +## Manual verifier + +After the pipeline produces `data/results//page-NN.json`, you can hand-verify and correct entries via the static SPA in `verifier/`. Each row's cropped image strip sits next to its detected text in an editable field. Export emits a `.verified.json` (`PageResult`-shaped, plugs back into the pipeline as ground truth) and `derive_truth` produces a matching `tests/golden/.truth.json`. + +```bash +# Generate a bundle +python -m scripts.make_verifier_bundle \ + data/results//page-NN.json \ + data/pages//page-NN.png \ + --out data/verifier/.bundle.json + +# Open the verifier +python -m http.server 8765 +# then visit: +# http://localhost:8765/verifier/?bundle=/data/verifier/.bundle.json + +# Derive a truth file from the exported verified.json +python -m scripts.derive_truth \ + data/verifier/.verified.json \ + --out tests/golden/.truth.json +``` + +See `verifier/README.md` for the bundle schema, expected file layout, and the substring-derivation rules. + ## Cost calibration Gemini 3.1 Pro charges per input token; one 300-DPI flowsheet page at `media_resolution=high` is ~1120 image tokens plus ~600 prompt tokens. Across the full corpus (~16K pages) input cost lands in the low tens of dollars; output adds modestly. Run the pipeline against a 10–20 page sample first and inspect both quality and `usage_metadata` before scheduling a full run. 
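For orientation while reviewing the README additions: the bundle the SPA consumes is the dict that `make_bundle` assembles later in this diff. A minimal sketch of loading one and walking its row crops — the stem, path, and pixel values here are illustrative; only the field names come from the pre-processor:

```python
import json
from pathlib import Path

# Illustrative path; real bundles land under DATA_ROOT/verifier/.
bundle = json.loads(Path("data/verifier/1990-04apr0106-page-05.bundle.json").read_text())

assert bundle["schema_version"] == 2  # per scripts/make_verifier_bundle.py; v2 added pdf_path/page_number
for quad in bundle["quadrants"]:      # always the four QUADRANT_ORDER positions
    qx1, qy1, qx2, qy2 = quad["bbox"]             # quadrant region of the body grid
    for entry in quad["entries"]:
        rx1, ry1, rx2, ry2 = entry["row_bbox"]    # what the SPA canvas-crops
        print(quad["position"], entry["row_index"], entry["raw_text"])
```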
diff --git a/core/jobs.py b/core/jobs.py index 2f5454d..15a8208 100644 --- a/core/jobs.py +++ b/core/jobs.py @@ -50,11 +50,15 @@ class Job: image_path: str | None result_path: str | None model_version: str | None + verified_at: str | None + verified_path: str | None + corrections_path: str | None created_at: str updated_at: str @classmethod def from_row(cls, row: aiosqlite.Row) -> Self: + keys = set(row.keys()) return cls( pdf_path=row["pdf_path"], page_number=row["page_number"], @@ -64,6 +68,11 @@ def from_row(cls, row: aiosqlite.Row) -> Self: image_path=row["image_path"], result_path=row["result_path"], model_version=row["model_version"], + # Late-added columns are nullable; tolerate their absence on a + # very old jobs.db that hasn't been re-init()ed yet. + verified_at=row["verified_at"] if "verified_at" in keys else None, + verified_path=row["verified_path"] if "verified_path" in keys else None, + corrections_path=(row["corrections_path"] if "corrections_path" in keys else None), created_at=row["created_at"], updated_at=row["updated_at"], ) @@ -71,22 +80,42 @@ def from_row(cls, row: aiosqlite.Row) -> Self: _SCHEMA = """ CREATE TABLE IF NOT EXISTS jobs ( - pdf_path TEXT NOT NULL, - page_number INTEGER NOT NULL, - status TEXT NOT NULL, - attempts INTEGER NOT NULL DEFAULT 0, - last_error TEXT, - image_path TEXT, - result_path TEXT, - model_version TEXT, - created_at TEXT NOT NULL, - updated_at TEXT NOT NULL, + pdf_path TEXT NOT NULL, + page_number INTEGER NOT NULL, + status TEXT NOT NULL, + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + image_path TEXT, + result_path TEXT, + model_version TEXT, + verified_at TEXT, + verified_path TEXT, + corrections_path TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, PRIMARY KEY (pdf_path, page_number) ); CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status); """ +# Columns added after the initial schema. `init()` runs `ALTER TABLE` for +# each of these against existing databases so older jobs.db files pick up +# the new columns without losing data. +_LATE_COLUMNS: tuple[tuple[str, str], ...] = ( + ("verified_at", "TEXT"), + ("verified_path", "TEXT"), + ("corrections_path", "TEXT"), +) + +# Indexes that depend on late-added columns and therefore must be created +# AFTER the ALTER TABLE migrations run. Keeping them out of `_SCHEMA` +# avoids "no such column" errors when initializing a legacy database. +_POST_MIGRATION_INDEXES: tuple[str, ...] = ( + "CREATE INDEX IF NOT EXISTS idx_jobs_verified_at " + "ON jobs(verified_at) WHERE verified_at IS NOT NULL", +) + def _now() -> str: return datetime.now(UTC).isoformat() @@ -112,6 +141,19 @@ async def init(self) -> None: # rollback journal. The pragma is persistent across connections. await db.execute("PRAGMA journal_mode=WAL") await db.executescript(_SCHEMA) + # ALTER TABLE migrations for late-added columns. CREATE TABLE + # above is idempotent (IF NOT EXISTS), so on a fresh DB this + # is a no-op; on an existing DB it adds the columns. + db.row_factory = aiosqlite.Row + cursor = await db.execute("PRAGMA table_info(jobs)") + existing = {row["name"] for row in await cursor.fetchall()} + for name, col_type in _LATE_COLUMNS: + if name not in existing: + await db.execute(f"ALTER TABLE jobs ADD COLUMN {name} {col_type}") + # Indexes that reference late columns run after the ALTER + # TABLE pass, otherwise SQLite errors on the missing column. 
+ for index_sql in _POST_MIGRATION_INDEXES: + await db.execute(index_sql) await db.commit() @asynccontextmanager @@ -234,6 +276,49 @@ async def mark_low_confidence( clear_error=True, ) + async def mark_verified( + self, + pdf_path: str, + page_number: int, + *, + verified_path: Path, + corrections_path: Path, + ) -> bool: + """Record that a page has been hand-verified via the verifier UI. + + Doesn't change `status` — verification is orthogonal to the + extraction state machine (a `completed` page can be verified; + re-extracting a verified page resets the result but should NOT + clear the verification record by default — that's a separate + decision a human makes via `retry`). + + Returns True if a job row matched, False otherwise. Callers + (e.g. the verifier server) may want to write files even when no + job row exists for the page (test fixtures), so a False return + is not an error. + """ + async with self._connect() as db: + cursor = await db.execute( + """ + UPDATE jobs + SET verified_at = ?, + verified_path = ?, + corrections_path = ?, + updated_at = ? + WHERE pdf_path = ? AND page_number = ? + """, + ( + _now(), + str(verified_path), + str(corrections_path), + _now(), + pdf_path, + page_number, + ), + ) + await db.commit() + return cursor.rowcount > 0 + async def mark_failed(self, pdf_path: str, page_number: int, error: str) -> None: async with self._connect() as db: cursor = await db.execute( diff --git a/core/page_layout.py b/core/page_layout.py index cfaf48b..7ea4cff 100644 --- a/core/page_layout.py +++ b/core/page_layout.py @@ -31,6 +31,8 @@ import numpy as np +from core.schema import QUADRANT_ORDER, QuadrantPosition + if TYPE_CHECKING: from PIL.Image import Image as PILImage @@ -89,6 +91,14 @@ # Comments line and excludes the last body row. _BODY_BOTTOM_SEARCH_BAND = (0.95, 0.99) +# When the top quadrant's last spacing exceeds this multiple of the +# global median row spacing, the trailing line is reattributed to the +# corresponding bottom quadrant. The anomaly signals that body_mid_y +# landed BELOW the bottom block's hour-jock-cell baseline, leaving that +# line in the top partition by mistake. See +# `partition_row_lines_by_quadrant`'s correction-pass comment. +_BOTTOM_BASELINE_REATTRIBUTION_RATIO = 1.3 + @dataclass(frozen=True) class PageLayout: @@ -296,3 +306,80 @@ def _detect_body_bottom_y(row_lines: list[int], h: int) -> int: if not in_band: return int(h * FALLBACK_BODY_BOTTOM_FRACTION) return in_band[-1] + + +def partition_row_lines_by_quadrant( + image: PILImage, layout: PageLayout +) -> dict[QuadrantPosition, list[int]]: + """Detected row-line y-coords, partitioned by quadrant of the body grid. + + Reuses `_detect_row_lines` for the y-coordinates, then classifies each + line by which page-column it spans (left, right, or both, based on ink + density at that y) and which body band it sits in (top vs bottom, by + `layout.body_mid_y`). + + A line spanning both columns is added to BOTH side quadrants — most + printed flowsheet grid lines run full-width and bracket both hour-blocks + of a row. + + Lines outside `[layout.header_bottom_y, layout.body_bottom_y)` are + dropped (header or footer artifacts, not body rows). + + Returns a dict with all four `QUADRANT_ORDER` keys; empty list when + no lines hit a quadrant (blank image, un-printed margin). 
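    Illustrative return shape — the y-values below are made-up pixel
    coordinates; only the keys and the list-of-int contract come from
    this function::

        {
            "top_left": [812, 887, 962, 1037],
            "top_right": [812, 887, 962, 1037],
            "bottom_left": [1604, 1679, 1754],
            "bottom_right": [1604, 1679, 1754],
        }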
+ """ + w, _h = image.size + grayscale = np.asarray(image.convert("L")) + col_mid = layout.column_mid_x + + all_lines = _detect_row_lines(grayscale, w, col_mid) + + ink = (255 - grayscale).astype(np.float64) / 255.0 + left_w = float(col_mid) + right_w = float(w - col_mid) + threshold = _ROW_LINE_THRESHOLDS[-1] + + out: dict[QuadrantPosition, list[int]] = {q: [] for q in QUADRANT_ORDER} + for y in all_lines: + if not (layout.header_bottom_y <= y < layout.body_bottom_y): + continue + left_ink = float(ink[y, :col_mid].sum()) + right_ink = float(ink[y, col_mid:].sum()) + on_left = left_ink > threshold * left_w + on_right = right_ink > threshold * right_w + if y < layout.body_mid_y: + if on_left: + out["top_left"].append(int(y)) + if on_right: + out["top_right"].append(int(y)) + else: + if on_left: + out["bottom_left"].append(int(y)) + if on_right: + out["bottom_right"].append(int(y)) + + # Correction pass: on some pages `_detect_body_mid_y` lands BELOW the + # bottom-block hour-jock-cell baseline (the anchor at 0.55h prefers the + # gap below the cell over the true inter-block gap above it). The + # baseline line then gets misattributed to the top quadrant, and the + # bottom quadrant's first detected line is row 0's BOTTOM rather than + # its top — shifting every row crop up by one. + # + # Signal: the top quadrant's last spacing is significantly larger than + # the median row spacing across all detected lines (a normal sequence + # has consistent spacing; an anomalous jump at the end means the last + # line belongs to a different sequence — the bottom block). + if len(all_lines) >= 2: + median_spacing = float(np.median(np.diff(np.asarray(all_lines)))) + if median_spacing > 0: + for top_pos, bottom_pos in ( + ("top_left", "bottom_left"), + ("top_right", "bottom_right"), + ): + top_lines = out[top_pos] # type: ignore[index] + if len(top_lines) >= 2: + last_spacing = top_lines[-1] - top_lines[-2] + if last_spacing > _BOTTOM_BASELINE_REATTRIBUTION_RATIO * median_spacing: + moved = top_lines.pop() + out[bottom_pos].insert(0, moved) # type: ignore[index] + return out diff --git a/pyproject.toml b/pyproject.toml index 440f0a3..a9f8629 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,14 @@ dependencies = [ "rich>=13.0.0", "pillow>=10.0", "numpy>=2.0", + # Verifier UI server (verifier/serve.py). The static SPA depends on the + # POST /api/lookup proxy (request-o-matic doesn't emit CORS) and + # POST /api/save (writes verified.json + corrections.json, updates + # jobs.db). httpx is also load-bearing for tests/unit/test_verifier_serve.py + # via httpx.ASGITransport. + "fastapi>=0.115", + "uvicorn>=0.30", + "httpx>=0.27", ] [project.optional-dependencies] diff --git a/scripts/derive_truth.py b/scripts/derive_truth.py new file mode 100644 index 0000000..633eca9 --- /dev/null +++ b/scripts/derive_truth.py @@ -0,0 +1,135 @@ +"""Derive a `GoldenTruth` file from a hand-corrected `PageResult`. + +The verifier UI exports `.verified.json` — a `PageResult` whose +`raw_text` fields have been hand-corrected. This tool extracts short +substrings from those fields and writes a `GoldenTruth`-shaped file +that plugs into the existing parity-test harness. + +Substring rules (codified to match the convention in `tests/golden/*.truth.json`): + + * page_date_substrings: whitespace-delimited tokens of `page_date_raw`. + e.g. "Tues 4/3 90" -> ["Tues", "4/3", "90"] + * jock_substring: first whitespace-delimited token of `jock_raw`, + uppercased and truncated to 4 chars. + e.g. 
"Andrew" -> "ANDR" + * raw_substring (per row): the artist portion of `raw_text` + (`parse_artist_track`), uppercased, truncated to <=24 chars at + the last whitespace boundary inside the cutoff. If no separator, + use the full text. + +The substrings are deliberately short — `core.golden._icontains` is a +case-insensitive substring match, so short tokens are forgiving of +small misspellings while remaining unambiguous within the WXYC corpus. + +CLI: + + python -m scripts.derive_truth \\ + data/verifier/.verified.json \\ + --out tests/golden/.truth.json +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from core.golden import GoldenTruth, QuadrantTruth, RowTruth +from core.parse import parse_artist_track +from core.schema import PageResult + +_MAX_ROW_SUBSTRING = 24 +_MAX_JOCK_SUBSTRING = 4 + + +def _date_substrings(page_date_raw: str | None) -> list[str]: + """Split `page_date_raw` into whitespace-delimited tokens. + + Empty list when the field is None, empty, or whitespace-only. + """ + if not page_date_raw: + return [] + return page_date_raw.split() + + +def _jock_substring(jock_raw: str | None) -> str | None: + """First whitespace-delimited token of `jock_raw`, uppercased, + truncated to 4 chars. Returns None when the field is missing so the + truth file omits the assertion entirely. + """ + if not jock_raw or not jock_raw.strip(): + return None + first_token = jock_raw.strip().split()[0] + out = first_token.upper()[:_MAX_JOCK_SUBSTRING] + return out or None + + +def _row_substring(raw_text: str) -> str: + """Artist-portion of `raw_text`, uppercased, capped at 24 chars. + + Falls back to the full raw_text when `parse_artist_track` finds no + separator (entries without "Artist - Track" structure, e.g. a + continuation row that wasn't merged). + + The 24-char cap snaps to the last whitespace boundary inside the + cutoff to avoid mid-word truncation. If the artist is one long + word, hard-cut at 24. + """ + artist, _track = parse_artist_track(raw_text) + src = (artist or raw_text or "").strip().upper() + if len(src) <= _MAX_ROW_SUBSTRING: + return src + cut = src.rfind(" ", 0, _MAX_ROW_SUBSTRING) + return src[:cut] if cut > 0 else src[:_MAX_ROW_SUBSTRING] + + +def derive_truth(page: PageResult) -> GoldenTruth: + """Build a `GoldenTruth` from a hand-corrected `PageResult`. + + Quadrants pass through in canonical order. Entries with empty + `raw_text` are skipped (nothing to match against). 
+ """ + quadrants_out: list[QuadrantTruth] = [] + for quad in page.quadrants: + rows = [ + RowTruth(raw_substring=_row_substring(entry.raw_text)) + for entry in quad.entries + if entry.raw_text.strip() + ] + quadrants_out.append( + QuadrantTruth( + position=quad.position, + hour_raw=quad.hour_raw, + jock_substring=_jock_substring(quad.jock_raw), + rows=rows, + ) + ) + return GoldenTruth( + page_date_substrings=_date_substrings(page.page_date_raw), + quadrants=quadrants_out, + ) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Derive a GoldenTruth file from a verified PageResult.", + ) + parser.add_argument("verified", type=Path, help="Path to the verified.json PageResult.") + parser.add_argument("--out", type=Path, required=True, help="Output truth.json path.") + args = parser.parse_args(argv) + + if not args.verified.is_file(): + print(f"verified file not found: {args.verified}", file=sys.stderr) + return 1 + + page = PageResult.model_validate_json(args.verified.read_text()) + truth = derive_truth(page) + + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(truth.model_dump_json(indent=2, exclude_defaults=False)) + print(f"wrote {args.out}") + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/make_verifier_bundle.py b/scripts/make_verifier_bundle.py new file mode 100644 index 0000000..84b2c84 --- /dev/null +++ b/scripts/make_verifier_bundle.py @@ -0,0 +1,300 @@ +"""Pre-processor: turn a `PageResult` + page image into a verifier bundle. + +The bundle is the input the static `verifier/` UI consumes. It contains the +extraction output (verbatim from the pipeline) plus geometry: a bbox per +quadrant, a bbox per row inside each quadrant, and a relative path to the +source image so the UI can canvas-crop each row in the browser. + +CLI surface: + + python -m scripts.make_verifier_bundle \\ + data/results//page-NN.json \\ + data/pages//page-NN.png \\ + --out data/verifier/.bundle.json + +If `--out` is omitted, the output is written to +`data/verifier/.bundle.json` next to the repo root. + +The bundle is a derivation, not a long-running result — re-running +overwrites. The pre-processor creates the output's parent directory if +it doesn't exist. +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path +from typing import Any + +from PIL import Image + +from core.page_layout import PageLayout, detect_page_layout, partition_row_lines_by_quadrant +from core.schema import QUADRANT_ORDER, Entry, PageResult, QuadrantPosition + +# Bump when the bundle JSON schema becomes incompatible. +# `verifier/README.md` documents the versioning strategy. +# v1: initial schema. +# v2: add `pdf_path` and `page_number` so the verifier UI can target the +# corresponding `jobs.db` row when saving corrections back. +SCHEMA_VERSION = 2 + + +BBox = tuple[int, int, int, int] + + +def _parse_job_key_from_result_path(result_path: Path) -> tuple[str, int] | None: + """Recover `(pdf_path, page_number)` from a pipeline result-JSON path. + + The pipeline writes results at `/results//page-NN.json` + (see `core.pipeline.result_path_for`). Reversing that gives us the + `(pdf_path, page_number)` pair used as the primary key in `jobs.db`. + + Returns `None` when the path doesn't match this layout (e.g. test + fixtures, `/tmp` spike outputs, ad-hoc files) — those bundles save + files only, no DB update. 
+ """ + parts = result_path.parts + if "results" not in parts: + return None + idx = parts.index("results") + after = parts[idx + 1 :] + if len(after) < 2: + return None + *pdf_dir_parts, page_file = after + if not page_file.startswith("page-") or not page_file.endswith(".json"): + return None + try: + page_number = int(page_file[len("page-") : -len(".json")]) + except ValueError: + return None + pdf_path = "/".join(pdf_dir_parts) + ".pdf" + return (pdf_path, page_number) + + +def _quadrant_bboxes(layout: PageLayout, *, page_width: int) -> dict[QuadrantPosition, BBox]: + """Bounding box of each quadrant's body region. + + Quadrants partition the body strip (header_bottom_y .. body_bottom_y) + via `column_mid_x` (left/right) and `body_mid_y` (top/bottom). + """ + return { + "top_left": (0, layout.header_bottom_y, layout.column_mid_x, layout.body_mid_y), + "top_right": (layout.column_mid_x, layout.header_bottom_y, page_width, layout.body_mid_y), + "bottom_left": (0, layout.body_mid_y, layout.column_mid_x, layout.body_bottom_y), + "bottom_right": ( + layout.column_mid_x, + layout.body_mid_y, + page_width, + layout.body_bottom_y, + ), + } + + +def _merge_with_spans(entries: list[Entry]) -> list[tuple[Entry, int]]: + """Apply continuation-row merging and compute each entry's physical-row span. + + This is the geometry-aware companion to `core.continuations.merge_continuations`: + it produces the same merged entries, paired with the number of physical + flowsheet rows each logical entry occupies on the page. + + - notes="continuation": folds into the previous logical entry's raw_text + (verbatim with the existing merge rules) and adds 1 to its span. + - notes="double_height": stays as a single logical entry but spans 2 rows. + - All others: span 1. + + A leading "continuation" with nothing above it is preserved as-is with + span 1, matching `merge_continuations`'s edge-case behavior. + + Why this lives here and not in `core/continuations.py`: span-tracking is + a verifier-geometry concern. The on-disk pipeline doesn't need it. + """ + result: list[tuple[Entry, int]] = [] + for entry in entries: + if entry.notes == "continuation" and result: + prior, prior_span = result[-1] + joined = f"{prior.raw_text.rstrip()} {entry.raw_text.lstrip()}".strip() + # Mark the merged entry as `double_height` so the verifier UI's + # notes dropdown reflects the multi-row nature of the row. The + # original schema enum doesn't distinguish "absorbed continuation" + # from "model-tagged double_height" — both mean "this logical + # entry occupies more than one physical row" for the verifier. + merged = prior.model_copy( + update={ + "raw_text": joined, + "oddities": [*prior.oddities, *entry.oddities], + "notes": "double_height", + } + ) + result[-1] = (merged, prior_span + 1) + elif entry.notes == "double_height": + result.append((entry, 2)) + else: + result.append((entry, 1)) + return result + + +def _assign_row_bboxes( + quad_bbox: BBox, + lines: list[int], + spans: list[int], +) -> list[BBox]: + """Pair logical entries to row strips inside a quadrant. + + `spans` is one int per logical entry: the number of physical row strips + that entry occupies on the page (1 for normal entries; 2 for double_height + or one continuation; 3 for two continuations; etc.). + + Heuristic: + - When `len(lines) >= sum(spans) + 1`, slice consecutive line pairs + according to each entry's span. Entry i's bbox spans from `lines[j]` + to `lines[j + spans[i]]`, with `j` advancing by `spans[i]` between + entries. 
Trailing lines (beyond what spans require) are ignored. + - Otherwise, even-spacing fallback: divide the quadrant height into + `len(spans)` equal strips, one per logical entry, ignoring the + physical-row count. + + The fallback uses entry count, not physical row count, because uniform + strips are better UX than partial pairing (which would leave the tail + of the quadrant uncropped on entries with wider spans). + """ + if not spans: + return [] + x1, y1, x2, y2 = quad_bbox + total_physical_rows = sum(spans) + if len(lines) >= total_physical_rows + 1: + rows: list[BBox] = [] + j = 0 + for span in spans: + rows.append((x1, lines[j], x2, lines[j + span])) + j += span + return rows + height = y2 - y1 + n_entries = len(spans) + step = height / n_entries + return [ + (x1, y1 + int(round(i * step)), x2, y1 + int(round((i + 1) * step))) + for i in range(n_entries) + ] + + +def make_bundle( + page: PageResult, + *, + image_path: Path, + bundle_path: Path, + job_key: tuple[str, int] | None = None, +) -> dict[str, Any]: + """Assemble the verifier bundle for one page. + + `bundle_path` is used only to compute the relative `image_path` field + — the file isn't written here. The CLI's `main` writes the bundle to + disk; this function is the pure construction step so tests can + inspect the output without filesystem side effects. + """ + image = Image.open(image_path) + layout = detect_page_layout(image) + width, _height = image.size + + quad_boxes = _quadrant_bboxes(layout, page_width=width) + lines_by_quad = partition_row_lines_by_quadrant(image, layout) + + quadrants_out: list[dict[str, Any]] = [] + for position in QUADRANT_ORDER: + # Continuations fold into the previous entry's raw_text; double_height + # stays as one entry. `_merge_with_spans` does the merge and tracks + # how many physical rows each resulting logical entry occupies, so + # the bbox cropper can skip the right number of grid lines per entry. 
+ source_quad = next((q for q in page.quadrants if q.position == position), None) + if source_quad is None: + continue + merged_with_spans = _merge_with_spans(source_quad.entries) + bbox = quad_boxes[position] + lines = lines_by_quad.get(position, []) + spans = [s for _, s in merged_with_spans] + row_boxes = _assign_row_bboxes(bbox, lines, spans=spans) + + entries_out: list[dict[str, Any]] = [] + merged_entries = [e for e, _ in merged_with_spans] + for entry, row_bbox in zip(merged_entries, row_boxes, strict=True): + entries_out.append( + { + "row_index": entry.row_index, + "raw_text": entry.raw_text, + "confidence": entry.confidence, + "type_raw": entry.type_raw, + "notes": entry.notes, + "oddities": list(entry.oddities), + "row_bbox": list(row_bbox), + } + ) + quadrants_out.append( + { + "position": position, + "bbox": list(bbox), + "hour_raw": source_quad.hour_raw, + "jock_raw": source_quad.jock_raw, + "entries": entries_out, + "oddities": list(source_quad.oddities), + } + ) + + image_rel = os.path.relpath(image_path, bundle_path.parent) + pdf_path: str | None = None + page_number: int | None = None + if job_key is not None: + pdf_path, page_number = job_key + return { + "schema_version": SCHEMA_VERSION, + "stem": image_path.stem, + "image_path": image_rel, + "pdf_path": pdf_path, + "page_number": page_number, + "model_version": page.model_version, + "extracted_at": page.extracted_at.isoformat(), + "page_date_raw": page.page_date_raw, + "comments_raw": page.comments_raw, + "oddities": list(page.oddities), + "quadrants": quadrants_out, + } + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Build a verifier bundle from a PageResult JSON + page image.", + ) + parser.add_argument("result", type=Path, help="Path to the extraction result JSON.") + parser.add_argument("image", type=Path, help="Path to the page PNG.") + parser.add_argument( + "--out", + type=Path, + default=None, + help=( + "Output bundle path. Defaults to " + "data/verifier/.bundle.json relative to the cwd." + ), + ) + args = parser.parse_args(argv) + + if not args.result.is_file(): + print(f"result not found: {args.result}", file=sys.stderr) + return 1 + if not args.image.is_file(): + print(f"image not found: {args.image}", file=sys.stderr) + return 1 + + page = PageResult.model_validate_json(args.result.read_text()) + out_path = args.out or Path("data/verifier") / f"{args.image.stem}.bundle.json" + job_key = _parse_job_key_from_result_path(args.result) + bundle = make_bundle(page, image_path=args.image, bundle_path=out_path, job_key=job_key) + + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(bundle, indent=2)) + print(f"wrote {out_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/tests/unit/test_derive_truth.py b/tests/unit/test_derive_truth.py new file mode 100644 index 0000000..309ffcb --- /dev/null +++ b/tests/unit/test_derive_truth.py @@ -0,0 +1,221 @@ +"""Tests for `scripts/derive_truth.py`. + +The truth-derivation tool consumes a `.verified.json` (PageResult- +shaped) and emits `.truth.json` (GoldenTruth-shaped) by extracting +short substrings from the user-corrected raw_text. Tests pin the +substring rules and the end-to-end CLI flow. 
+""" + +from __future__ import annotations + +import json +from datetime import UTC, datetime +from pathlib import Path + +import pytest + +from core.golden import GoldenTruth +from core.schema import QUADRANT_ORDER, Entry, PageResult, Quadrant +from scripts.derive_truth import ( + _date_substrings, + _jock_substring, + _row_substring, + derive_truth, + main, +) + +# -- _date_substrings ------------------------------------------------------- + + +@pytest.mark.parametrize( + ("page_date_raw", "expected"), + [ + ("Tues 4/3 90", ["Tues", "4/3", "90"]), + ("Monday 1 Jan '90", ["Monday", "1", "Jan", "'90"]), + ("", []), + (None, []), + (" ", []), # whitespace-only + ], +) +def test_date_substrings(page_date_raw: str | None, expected: list[str]) -> None: + assert _date_substrings(page_date_raw) == expected + + +# -- _jock_substring -------------------------------------------------------- + + +@pytest.mark.parametrize( + ("jock_raw", "expected"), + [ + ("Andrew", "ANDR"), + ("ANDREW", "ANDR"), + ("Andy J", "ANDY"), # first token only + ("Sam", "SAM"), # shorter than 4 chars passes through + (None, None), + ("", None), + (" ", None), + ], +) +def test_jock_substring(jock_raw: str | None, expected: str | None) -> None: + assert _jock_substring(jock_raw) == expected + + +# -- _row_substring --------------------------------------------------------- + + +@pytest.mark.parametrize( + ("raw_text", "expected"), + [ + # Examples from the plan body, matching existing golden truth convention. + ("Beastie Boys - Sabotage", "BEASTIE BOYS"), + ("Primal Scream - Loaded", "PRIMAL SCREAM"), + ("Bo Diddley - Hey Bo", "BO DIDDLEY"), + ("Elizabeth Cotten - Shake", "ELIZABETH COTTEN"), + ("JUANA MOLINA - la paradoja", "JUANA MOLINA"), + # No separator: full text uppercased, truncated at 24 chars (snap to ws). + ("standalone continuation text here", "STANDALONE CONTINUATION"), + ("short text", "SHORT TEXT"), + # Exactly 24 chars: unchanged. + ("a" * 24, "A" * 24), + # 25 chars no whitespace: hard-cut at 24. + ("a" * 25, "A" * 24), + # Em-dash separator (handled by parse_artist_track). 
+ ("Hermanos Gutiérrez — Aguas Ardientes", "HERMANOS GUTIÉRREZ"), + ], +) +def test_row_substring(raw_text: str, expected: str) -> None: + assert _row_substring(raw_text) == expected + + +# -- derive_truth ----------------------------------------------------------- + + +def _entry(text: str, idx: int = 0) -> Entry: + return Entry(row_index=idx, raw_text=text, confidence="high") + + +def _quad(position: str, jock: str | None, hour: str | None, entries: list[Entry]) -> Quadrant: + return Quadrant( + position=position, # type: ignore[arg-type] + hour_raw=hour, + jock_raw=jock, + entries=entries, + ) + + +def _page(date: str | None, quads: list[Quadrant]) -> PageResult: + return PageResult( + page_date_raw=date, + quadrants=quads, + oddities=[], + model_version="test-verified", + extracted_at=datetime(2026, 5, 10, tzinfo=UTC), + ) + + +def test_derive_truth_returns_golden_truth_with_all_quadrants() -> None: + page = _page( + "Tues 4/3 90", + [ + _quad("top_left", "Andrew", "6AM", [_entry("Primal Scream - Loaded")]), + _quad("top_right", None, "7AM", [_entry("Beastie Boys - Sabotage")]), + _quad("bottom_left", "Andrew", "8AM", [_entry("Bo Diddley - Hey Bo")]), + _quad("bottom_right", None, "9AM", [_entry("Juana Molina - la paradoja")]), + ], + ) + truth = derive_truth(page) + assert isinstance(truth, GoldenTruth) + assert [q.position for q in truth.quadrants] == list(QUADRANT_ORDER) + + +def test_derive_truth_page_date_split_into_tokens() -> None: + page = _page("Tues 4/3 90", [_quad(p, None, None, []) for p in QUADRANT_ORDER]) + truth = derive_truth(page) + assert truth.page_date_substrings == ["Tues", "4/3", "90"] + + +def test_derive_truth_quadrant_substrings_match_rules() -> None: + page = _page( + None, + [ + _quad("top_left", "Andrew", "6AM", [_entry("Primal Scream - Loaded")]), + _quad("top_right", None, None, [_entry("Beastie Boys - Sabotage")]), + _quad("bottom_left", None, None, []), + _quad("bottom_right", None, None, [_entry("Bo Diddley - Hey Bo")]), + ], + ) + truth = derive_truth(page) + by_pos = {q.position: q for q in truth.quadrants} + assert by_pos["top_left"].jock_substring == "ANDR" + assert by_pos["top_left"].hour_raw == "6AM" + assert [r.raw_substring for r in by_pos["top_left"].rows] == ["PRIMAL SCREAM"] + assert by_pos["top_right"].jock_substring is None + assert [r.raw_substring for r in by_pos["top_right"].rows] == ["BEASTIE BOYS"] + assert by_pos["bottom_left"].rows == [] + assert [r.raw_substring for r in by_pos["bottom_right"].rows] == ["BO DIDDLEY"] + + +def test_derive_truth_skips_empty_raw_text_rows() -> None: + """An entry with empty raw_text shouldn't produce a truth row — there's + nothing to match against.""" + page = _page( + None, + [ + _quad("top_left", None, None, [_entry(""), _entry("Primal Scream")]), + _quad("top_right", None, None, []), + _quad("bottom_left", None, None, []), + _quad("bottom_right", None, None, []), + ], + ) + truth = derive_truth(page) + by_pos = {q.position: q for q in truth.quadrants} + assert [r.raw_substring for r in by_pos["top_left"].rows] == ["PRIMAL SCREAM"] + + +# -- main CLI --------------------------------------------------------------- + + +def test_main_writes_truth_file(tmp_path: Path) -> None: + page = _page( + "Tues 4/3 90", + [ + _quad("top_left", "Andrew", "6AM", [_entry("Primal Scream - Loaded")]), + _quad("top_right", None, None, [_entry("Beastie Boys - Sabotage")]), + _quad("bottom_left", None, None, []), + _quad("bottom_right", None, None, []), + ], + ) + verified_path = tmp_path / "verified.json" + 
verified_path.write_text(page.model_dump_json(indent=2)) + + out_path = tmp_path / "out" / "truth.json" + rc = main([str(verified_path), "--out", str(out_path)]) + assert rc == 0 + + truth = GoldenTruth.load(out_path) + assert truth.page_date_substrings == ["Tues", "4/3", "90"] + by_pos = {q.position: q for q in truth.quadrants} + assert [r.raw_substring for r in by_pos["top_left"].rows] == ["PRIMAL SCREAM"] + + +def test_main_returns_one_when_input_missing(tmp_path: Path) -> None: + rc = main([str(tmp_path / "missing.json"), "--out", str(tmp_path / "out.json")]) + assert rc == 1 + + +def test_main_round_trips_through_pydantic(tmp_path: Path) -> None: + """End-to-end: PageResult on disk → derive_truth main → GoldenTruth on + disk → GoldenTruth.load. Pins the export schema.""" + page = _page("Mon 5 May", [_quad(p, None, None, []) for p in QUADRANT_ORDER]) + verified_path = tmp_path / "verified.json" + verified_path.write_text(page.model_dump_json(indent=2)) + + out_path = tmp_path / "truth.json" + main([str(verified_path), "--out", str(out_path)]) + + # Both load the same data. + loaded_from_disk = GoldenTruth.load(out_path) + assert loaded_from_disk.page_date_substrings == ["Mon", "5", "May"] + # Round-trip a raw dict too — extra fields would be caught by extra=forbid. + raw = json.loads(out_path.read_text()) + GoldenTruth.model_validate(raw) diff --git a/tests/unit/test_jobs.py b/tests/unit/test_jobs.py index 7856a0b..5f8d03f 100644 --- a/tests/unit/test_jobs.py +++ b/tests/unit/test_jobs.py @@ -205,3 +205,83 @@ async def test_pending_for_render(store: JobStore, tmp_path: Path) -> None: pending = await store.next_pending_for_render(limit=10) assert [(j.pdf_path, j.page_number) for j in pending] == [("scans/a.pdf", 2)] + + +# -- verification tracking -------------------------------------------------- + + +async def test_mark_verified_records_paths_without_changing_status( + store: JobStore, tmp_path: Path +) -> None: + """`mark_verified` updates the verification columns but leaves `status` + alone — verification is orthogonal to the extraction state machine.""" + await store.register("scans/a.pdf", 1) + await store.mark_rendered("scans/a.pdf", 1, image_path=tmp_path / "a.png") + await store.mark_completed("scans/a.pdf", 1, result_path=tmp_path / "a.json", model_version="m") + + verified = tmp_path / "a.verified.json" + corrections = tmp_path / "a.corrections.json" + matched = await store.mark_verified( + "scans/a.pdf", 1, verified_path=verified, corrections_path=corrections + ) + assert matched is True + + job = await store.get("scans/a.pdf", 1) + assert job is not None + assert job.status == JobStatus.COMPLETED + assert job.verified_at is not None + assert job.verified_path == str(verified) + assert job.corrections_path == str(corrections) + + +async def test_mark_verified_returns_false_when_no_matching_job( + store: JobStore, tmp_path: Path +) -> None: + """The verifier server may try to record verification for a test + fixture that has no `jobs.db` row. 
Returns False instead of raising + so the server can fall back to file-only persistence.""" + matched = await store.mark_verified( + "scans/no-such.pdf", + 99, + verified_path=tmp_path / "x.verified.json", + corrections_path=tmp_path / "x.corrections.json", + ) + assert matched is False + + +async def test_init_adds_late_columns_to_existing_db(tmp_path: Path) -> None: + """`init()` against a pre-verification-column DB adds the columns + without losing data.""" + import aiosqlite + + db_path = tmp_path / "old.db" + async with aiosqlite.connect(db_path) as db: + await db.execute( + """ + CREATE TABLE jobs ( + pdf_path TEXT NOT NULL, + page_number INTEGER NOT NULL, + status TEXT NOT NULL, + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + image_path TEXT, + result_path TEXT, + model_version TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + PRIMARY KEY (pdf_path, page_number) + ) + """ + ) + await db.commit() + + store = JobStore(db_path) + await store.init() + + async with aiosqlite.connect(db_path) as db: + db.row_factory = aiosqlite.Row + rows = await (await db.execute("PRAGMA table_info(jobs)")).fetchall() + cols = {r["name"] for r in rows} + assert "verified_at" in cols + assert "verified_path" in cols + assert "corrections_path" in cols diff --git a/tests/unit/test_make_verifier_bundle.py b/tests/unit/test_make_verifier_bundle.py new file mode 100644 index 0000000..7393fdc --- /dev/null +++ b/tests/unit/test_make_verifier_bundle.py @@ -0,0 +1,476 @@ +"""Tests for `scripts/make_verifier_bundle.py`. + +The pre-processor turns a `PageResult` + page image into a `bundle.json` +the verifier UI consumes. Tests cover the geometry helpers, the bbox +assignment heuristic, the bundle assembly, and the CLI. +""" + +from __future__ import annotations + +import json +from datetime import UTC, datetime +from pathlib import Path + +import pytest +from PIL import Image + +from core.page_layout import PageLayout +from core.schema import QUADRANT_ORDER, Entry, PageResult, Quadrant +from scripts.make_verifier_bundle import ( + SCHEMA_VERSION, + _assign_row_bboxes, + _merge_with_spans, + _parse_job_key_from_result_path, + _quadrant_bboxes, + main, + make_bundle, +) + + +def _layout( + *, + header_bottom_y: int = 100, + body_mid_y: int = 600, + body_bottom_y: int = 1100, + column_mid_x: int = 500, +) -> PageLayout: + return PageLayout( + header_bottom_y=header_bottom_y, + body_mid_y=body_mid_y, + body_bottom_y=body_bottom_y, + column_mid_x=column_mid_x, + ) + + +def _entry(row_index: int, text: str = "X - Y") -> Entry: + return Entry(row_index=row_index, raw_text=text, confidence="high") + + +def _quad(position: str, n_entries: int) -> Quadrant: + return Quadrant( + position=position, # type: ignore[arg-type] + hour_raw=None, + jock_raw=None, + entries=[_entry(i) for i in range(n_entries)], + ) + + +def _page_result(*, comments: str | None = None) -> PageResult: + return PageResult( + page_date_raw="Mon 1 Jan 90", + quadrants=[_quad(p, 3) for p in QUADRANT_ORDER], + comments_raw=comments, + oddities=[], + model_version="test-model", + extracted_at=datetime(2026, 5, 10, tzinfo=UTC), + ) + + +# -- _quadrant_bboxes ------------------------------------------------------- + + +def test_quadrant_bboxes_returns_all_four_quadrants() -> None: + boxes = _quadrant_bboxes(_layout(), page_width=1000) + assert set(boxes.keys()) == set(QUADRANT_ORDER) + + +def test_quadrant_bboxes_match_layout_math() -> None: + """Each quadrant's bbox is bounded by the corresponding layout + coordinates: column_mid_x 
splits left/right; body_mid_y splits + top/bottom; header_bottom_y is the top of the body; body_bottom_y + is the bottom.""" + layout = _layout( + header_bottom_y=100, + body_mid_y=600, + body_bottom_y=1100, + column_mid_x=500, + ) + boxes = _quadrant_bboxes(layout, page_width=1000) + assert boxes["top_left"] == (0, 100, 500, 600) + assert boxes["top_right"] == (500, 100, 1000, 600) + assert boxes["bottom_left"] == (0, 600, 500, 1100) + assert boxes["bottom_right"] == (500, 600, 1000, 1100) + + +# -- _assign_row_bboxes ----------------------------------------------------- + + +def test_assign_row_bboxes_clean_pairing() -> None: + """When all spans are 1 and n_lines == n_entries + 1, consecutive line + pairs become row top/bottom for each entry.""" + quad_bbox = (0, 100, 500, 400) # height 300 + lines = [100, 200, 300, 400] # 4 lines -> 3 entries + rows = _assign_row_bboxes(quad_bbox, lines, spans=[1, 1, 1]) + assert rows == [ + (0, 100, 500, 200), + (0, 200, 500, 300), + (0, 300, 500, 400), + ] + + +def test_assign_row_bboxes_extra_lines_ignored() -> None: + """When more lines exist than the entry-spans require, the trailing + lines are ignored.""" + quad_bbox = (0, 100, 500, 700) + lines = [100, 200, 300, 400, 500, 600, 700] # 7 lines + rows = _assign_row_bboxes(quad_bbox, lines, spans=[1, 1, 1]) + assert rows == [ + (0, 100, 500, 200), + (0, 200, 500, 300), + (0, 300, 500, 400), + ] + + +def test_assign_row_bboxes_spans_skip_continuation_rows() -> None: + """When an entry's span is 2 (it absorbed a continuation row or is + double_height), its bbox spans two physical row lines, and the NEXT + entry's bbox starts after the second line. This is the load-bearing + behavior for the multiline-entry verifier case.""" + quad_bbox = (0, 800, 1000, 1100) + # Three physical rows: y=800-900, 900-1000, 1000-1100. + # Two logical entries: first spans rows 0-1 (continuation), second is row 2. 
+ lines = [800, 900, 1000, 1100] + rows = _assign_row_bboxes(quad_bbox, lines, spans=[2, 1]) + assert rows == [ + (0, 800, 1000, 1000), # entry 0: spans first TWO physical rows + (0, 1000, 1000, 1100), # entry 1: third physical row, not second + ] + + +def test_assign_row_bboxes_falls_back_to_even_spacing_when_no_lines() -> None: + quad_bbox = (10, 100, 510, 400) # width 500, height 300 + rows = _assign_row_bboxes(quad_bbox, lines=[], spans=[1, 1, 1]) + assert rows == [ + (10, 100, 510, 200), + (10, 200, 510, 300), + (10, 300, 510, 400), + ] + + +def test_assign_row_bboxes_falls_back_to_even_spacing_when_too_few_lines() -> None: + """When detected lines don't cover the total physical row count, even- + spacing fallback divides the quadrant by entry count (not physical + count) — uniform strips are better UX than mis-paired pinned rows.""" + quad_bbox = (0, 100, 500, 700) + rows = _assign_row_bboxes(quad_bbox, lines=[100, 300], spans=[1, 1, 1]) + assert rows == [ + (0, 100, 500, 300), + (0, 300, 500, 500), + (0, 500, 500, 700), + ] + + +def test_assign_row_bboxes_returns_empty_for_zero_entries() -> None: + rows = _assign_row_bboxes((0, 0, 100, 100), lines=[10, 20, 30], spans=[]) + assert rows == [] + + +# -- _merge_with_spans ------------------------------------------------------ + + +def test_merge_with_spans_collapses_continuation_into_span() -> None: + """A continuation entry merges into the previous logical entry and + increments its physical-row span by 1.""" + entries = [ + Entry(row_index=0, raw_text="The Standells - Sometimes Good Guys", confidence="high"), + Entry( + row_index=1, + raw_text="Don't Wear White", + confidence="medium", + notes="continuation", + ), + Entry(row_index=2, raw_text="The Lovedolls - Pearls at Swine", confidence="high"), + ] + result = _merge_with_spans(entries) + assert len(result) == 2 + merged_first, span_first = result[0] + assert merged_first.raw_text == "The Standells - Sometimes Good Guys Don't Wear White" + assert span_first == 2 + # Merged entries inherit `double_height` notes so the verifier dropdown + # reflects the multi-row nature. 
+ assert merged_first.notes == "double_height" + merged_second, span_second = result[1] + assert merged_second.raw_text == "The Lovedolls - Pearls at Swine" + assert span_second == 1 + assert merged_second.notes is None + + +def test_merge_with_spans_double_height_counts_as_two() -> None: + """`notes="double_height"` doesn't trigger a merge but spans 2 rows.""" + entries = [ + Entry(row_index=0, raw_text="X - Y", confidence="high", notes="double_height"), + Entry(row_index=1, raw_text="A - B", confidence="high"), + ] + result = _merge_with_spans(entries) + assert [span for _, span in result] == [2, 1] + assert result[0][0].raw_text == "X - Y" + + +def test_merge_with_spans_consecutive_continuations() -> None: + """A single entry can absorb multiple continuation rows; span grows by + one per continuation.""" + entries = [ + Entry(row_index=0, raw_text="Line A", confidence="high"), + Entry(row_index=1, raw_text="Line B", confidence="high", notes="continuation"), + Entry(row_index=2, raw_text="Line C", confidence="high", notes="continuation"), + ] + result = _merge_with_spans(entries) + assert len(result) == 1 + merged, span = result[0] + assert merged.raw_text == "Line A Line B Line C" + assert span == 3 + assert merged.notes == "double_height" + + +def test_merge_with_spans_leading_continuation_is_preserved() -> None: + """A continuation as the first row has nothing to merge into — stays + as its own entry with span 1, mirroring `merge_continuations`.""" + entries = [ + Entry(row_index=0, raw_text="orphan", confidence="low", notes="continuation"), + Entry(row_index=1, raw_text="A - B", confidence="high"), + ] + result = _merge_with_spans(entries) + assert len(result) == 2 + assert [span for _, span in result] == [1, 1] + assert result[0][0].raw_text == "orphan" + assert result[0][0].notes == "continuation" + + +def test_merge_with_spans_empty_input() -> None: + assert _merge_with_spans([]) == [] + + +# -- make_bundle ------------------------------------------------------------ + + +def _white_page(tmp_path: Path) -> Path: + """A synthetic 1000x1500 white image with a black vertical column + divider at x=500. Detection will land near-real coords, then we + don't care about per-row exactness — the bundle just needs to + assemble without crashing.""" + image = Image.new("RGB", (1000, 1500), color="white") + # Paint the column divider so detect_column_mid_x finds it. + for y in range(1500): + image.putpixel((500, y), (0, 0, 0)) + path = tmp_path / "page.png" + image.save(path) + return path + + +def test_make_bundle_returns_schema_version(tmp_path: Path) -> None: + image_path = _white_page(tmp_path) + bundle_path = tmp_path / "out" / "verifier" / "page.bundle.json" + bundle = make_bundle(_page_result(), image_path=image_path, bundle_path=bundle_path) + assert bundle["schema_version"] == SCHEMA_VERSION == 2 + + +def test_make_bundle_top_level_fields(tmp_path: Path) -> None: + image_path = _white_page(tmp_path) + bundle_path = tmp_path / "page.bundle.json" + bundle = make_bundle(_page_result(), image_path=image_path, bundle_path=bundle_path) + assert bundle["stem"] == "page" + assert bundle["page_date_raw"] == "Mon 1 Jan 90" + assert bundle["comments_raw"] is None + assert bundle["model_version"] == "test-model" + assert bundle["oddities"] == [] + assert len(bundle["quadrants"]) == 4 + # New in v2: job key fields default to null when no job_key is passed. 
+ assert bundle["pdf_path"] is None + assert bundle["page_number"] is None + + +def test_make_bundle_carries_job_key_when_provided(tmp_path: Path) -> None: + """When the bundle pre-processor can recover the (pdf_path, page_number) + job key from the result path, it's preserved in the bundle so the + verifier UI can target the right jobs.db row on save.""" + image_path = _white_page(tmp_path) + bundle = make_bundle( + _page_result(), + image_path=image_path, + bundle_path=tmp_path / "out.bundle.json", + job_key=("1990/April 1990/1990-04apr0106.pdf", 25), + ) + assert bundle["pdf_path"] == "1990/April 1990/1990-04apr0106.pdf" + assert bundle["page_number"] == 25 + + +def test_make_bundle_image_path_is_relative_to_bundle_dir(tmp_path: Path) -> None: + """The bundle stays portable: image_path is computed via os.path.relpath + from the bundle's parent directory to the source image. Tests nested + subdirectories — the bundle in data/verifier/, image in data/pages//.""" + data = tmp_path / "data" + image_path = data / "pages" / "1990-04apr0106" / "page-05.png" + image_path.parent.mkdir(parents=True) + image = Image.new("RGB", (1000, 1500), color="white") + for y in range(1500): + image.putpixel((500, y), (0, 0, 0)) + image.save(image_path) + + bundle_path = data / "verifier" / "page-05.bundle.json" + bundle = make_bundle(_page_result(), image_path=image_path, bundle_path=bundle_path) + assert bundle["image_path"] == "../pages/1990-04apr0106/page-05.png" + + +def test_make_bundle_quadrants_in_canonical_order(tmp_path: Path) -> None: + image_path = _white_page(tmp_path) + bundle = make_bundle( + _page_result(), image_path=image_path, bundle_path=tmp_path / "out.bundle.json" + ) + positions = tuple(q["position"] for q in bundle["quadrants"]) + assert positions == QUADRANT_ORDER + + +def test_make_bundle_each_entry_has_row_bbox(tmp_path: Path) -> None: + image_path = _white_page(tmp_path) + bundle = make_bundle( + _page_result(), image_path=image_path, bundle_path=tmp_path / "out.bundle.json" + ) + for quad in bundle["quadrants"]: + for entry in quad["entries"]: + assert "row_bbox" in entry + bbox = entry["row_bbox"] + assert len(bbox) == 4 + x1, y1, x2, y2 = bbox + assert x2 > x1 and y2 > y1, f"degenerate bbox: {bbox}" + + +def test_make_bundle_quadrant_has_bbox(tmp_path: Path) -> None: + image_path = _white_page(tmp_path) + bundle = make_bundle( + _page_result(), image_path=image_path, bundle_path=tmp_path / "out.bundle.json" + ) + for quad in bundle["quadrants"]: + assert "bbox" in quad + assert len(quad["bbox"]) == 4 + + +# -- CLI -------------------------------------------------------------------- + + +def _write_minimal_result(path: Path) -> None: + page = _page_result() + path.write_text(page.model_dump_json(indent=2)) + + +def test_main_writes_bundle_to_out_path(tmp_path: Path) -> None: + result_path = tmp_path / "result.json" + image_path = _white_page(tmp_path) + _write_minimal_result(result_path) + + out_path = tmp_path / "out" / "page.bundle.json" + rc = main([str(result_path), str(image_path), "--out", str(out_path)]) + assert rc == 0 + assert out_path.is_file() + bundle = json.loads(out_path.read_text()) + assert bundle["schema_version"] == SCHEMA_VERSION + assert len(bundle["quadrants"]) == 4 + + +def test_main_creates_output_parent_directory(tmp_path: Path) -> None: + """Pre-processor creates output dirs that don't exist, matching the + pattern in core/pipeline.py and core/jobs.py.""" + result_path = tmp_path / "result.json" + image_path = _white_page(tmp_path) + 
_write_minimal_result(result_path) + + out_path = tmp_path / "deeply" / "nested" / "page.bundle.json" + assert not out_path.parent.exists() + + rc = main([str(result_path), str(image_path), "--out", str(out_path)]) + assert rc == 0 + assert out_path.is_file() + + +def test_main_validates_bundle_against_page_result_shape(tmp_path: Path) -> None: + """The bundle must round-trip through PageResult.model_validate_json + after stripping bundle-only fields. This pins the export-schema + contract end-to-end.""" + result_path = tmp_path / "result.json" + image_path = _white_page(tmp_path) + _write_minimal_result(result_path) + + out_path = tmp_path / "page.bundle.json" + main([str(result_path), str(image_path), "--out", str(out_path)]) + + bundle = json.loads(out_path.read_text()) + # Strip bundle-only fields. + for key in ("schema_version", "stem", "image_path"): + bundle.pop(key, None) + for quad in bundle["quadrants"]: + quad.pop("bbox", None) + for entry in quad["entries"]: + entry.pop("row_bbox", None) + PageResult.model_validate(bundle) + + +# -- _parse_job_key_from_result_path ---------------------------------------- + + +def test_parse_job_key_from_pipeline_path() -> None: + """The canonical pipeline-result path resolves to (pdf_path, page_number).""" + p = Path("/var/data/results/1990/April 1990/1990-04apr0106/page-25.json") + assert _parse_job_key_from_result_path(p) == ( + "1990/April 1990/1990-04apr0106.pdf", + 25, + ) + + +def test_parse_job_key_returns_none_for_non_pipeline_path() -> None: + """Test fixtures (/tmp, /private, fixtures/) don't follow the layout.""" + assert _parse_job_key_from_result_path(Path("/tmp/flash-spike/pro/some.json")) is None + assert _parse_job_key_from_result_path(Path("/Users/x/fixtures/result.json")) is None + + +def test_parse_job_key_returns_none_when_filename_not_page() -> None: + """The trailing component must be `page-NN.json`.""" + p = Path("/var/data/results/1990/foo/notpage.json") + assert _parse_job_key_from_result_path(p) is None + + +def test_parse_job_key_returns_none_when_page_index_not_numeric() -> None: + p = Path("/var/data/results/1990/foo/page-abc.json") + assert _parse_job_key_from_result_path(p) is None + + +def test_main_returns_nonzero_when_inputs_missing(tmp_path: Path) -> None: + """Missing input file is a usage error, not a crash. 
Exit 1 lets + shell scripts react cleanly.""" + rc = main( + [ + str(tmp_path / "missing-result.json"), + str(tmp_path / "missing-page.png"), + "--out", + str(tmp_path / "out.bundle.json"), + ] + ) + assert rc == 1 + + +@pytest.mark.parametrize( + ("entry_text", "expected_bbox_count"), + [ + ("Juana Molina - la paradoja", 1), + ("", 1), # blank entries still get a bbox (UI shows them) + ], +) +def test_make_bundle_handles_entry_text_variants( + tmp_path: Path, entry_text: str, expected_bbox_count: int +) -> None: + image_path = _white_page(tmp_path) + result = PageResult( + page_date_raw=None, + quadrants=[ + Quadrant( + position=p, + hour_raw=None, + jock_raw=None, + entries=[Entry(row_index=0, raw_text=entry_text, confidence="high")], + ) + for p in QUADRANT_ORDER + ], + oddities=[], + model_version="t", + extracted_at=datetime(2026, 5, 10, tzinfo=UTC), + ) + bundle = make_bundle(result, image_path=image_path, bundle_path=tmp_path / "b.json") + assert all(len(q["entries"]) == expected_bbox_count for q in bundle["quadrants"]) diff --git a/tests/unit/test_page_layout.py b/tests/unit/test_page_layout.py index be20f86..126d05d 100644 --- a/tests/unit/test_page_layout.py +++ b/tests/unit/test_page_layout.py @@ -20,7 +20,9 @@ _detect_header_bottom_y, _estimate_row_spacing, detect_page_layout, + partition_row_lines_by_quadrant, ) +from core.schema import QUADRANT_ORDER GOLDEN_DIR = Path(__file__).resolve().parents[1] / "golden" @@ -231,3 +233,104 @@ def test_detect_header_bottom_y_falls_back_when_first_line_too_low() -> None: h = 4200 # 0.3 * h = 1260; first at 1500 is too low to trust. assert _detect_header_bottom_y([1500, 1575, 1650], h) == int(h * FALLBACK_HEADER_FRACTION) + + +# -- partition_row_lines_by_quadrant --------------------------------------- + + +def test_partition_row_lines_returns_quadrant_keys( + golden: tuple[str, Image.Image, dict[str, int]], +) -> None: + """Returned dict has exactly the four quadrant keys in QUADRANT_ORDER.""" + _, image, _ = golden + layout = detect_page_layout(image) + partitions = partition_row_lines_by_quadrant(image, layout) + assert set(partitions.keys()) == set(QUADRANT_ORDER) + + +def test_partition_row_lines_returns_y_coordinates_as_ints( + golden: tuple[str, Image.Image, dict[str, int]], +) -> None: + """Each list value is a pixel y-coordinate (int), matching the contract + of `_detect_row_lines`. The verifier pre-processor consumes these as + crop boundaries, so the integer type is load-bearing.""" + _, image, _ = golden + layout = detect_page_layout(image) + partitions = partition_row_lines_by_quadrant(image, layout) + for ys in partitions.values(): + for y in ys: + assert isinstance(y, int) + + +def test_partition_row_lines_within_correct_body_band( + golden: tuple[str, Image.Image, dict[str, int]], +) -> None: + """All returned y-coords fall within the body region. Top quadrants + stay strictly below `body_mid_y`. Bottom quadrants may include one + line slightly ABOVE `body_mid_y` — the hour-jock-cell baseline of the + bottom block, reattributed by the correction pass when body_mid_y + landed below it. 
See `partition_row_lines_by_quadrant`'s docstring.""" + _, image, _ = golden + layout = detect_page_layout(image) + partitions = partition_row_lines_by_quadrant(image, layout) + for pos in ("top_left", "top_right"): + for y in partitions[pos]: + assert layout.header_bottom_y <= y < layout.body_mid_y, ( + f"{pos}: y={y} outside top-band [{layout.header_bottom_y}, {layout.body_mid_y})" + ) + for pos in ("bottom_left", "bottom_right"): + for y in partitions[pos]: + assert layout.header_bottom_y <= y < layout.body_bottom_y, ( + f"{pos}: y={y} outside body range " + f"[{layout.header_bottom_y}, {layout.body_bottom_y})" + ) + + +def test_partition_row_lines_finds_content_in_top_band( + golden: tuple[str, Image.Image, dict[str, int]], +) -> None: + """All 5 goldens have detected lines somewhere in the top body band — + the printed grid alone is ~9 lines per quadrant, so at least one side + of the top band must come back populated.""" + stem, image, _ = golden + layout = detect_page_layout(image) + partitions = partition_row_lines_by_quadrant(image, layout) + total_top = len(partitions["top_left"]) + len(partitions["top_right"]) + assert total_top > 0, f"{stem}: no row lines detected in top band" + + +def test_partition_row_lines_reattributes_misclassified_bottom_baseline() -> None: + """When the top quadrant's last spacing is anomalously large (the line + is actually the hour-jock baseline of the bottom block, misattributed + because body_mid_y landed below it), it gets moved to the corresponding + bottom quadrant. + + Pages 20 and 25 of the 1990-04 golden set exhibit this: top_left's last + spacing is 100px vs median 75. The fix moves y≈2251 (page25) from + top_left to bottom_left. + """ + stem = "1990-04apr0106-page25" + image = Image.open(GOLDEN_DIR / f"{stem}.png") + layout = detect_page_layout(image) + partitions = partition_row_lines_by_quadrant(image, layout) + # bottom_left must start with a line ABOVE body_mid_y (the reattributed + # hour-jock baseline). The original first line below body_mid_y was 2352; + # after reattribution, ~2251 should now be the new first line. + assert partitions["bottom_left"][0] < layout.body_mid_y, ( + f"expected first bottom_left line to be reattributed above body_mid_y, " + f"got {partitions['bottom_left'][0]} vs body_mid_y={layout.body_mid_y}" + ) + # And the spacing from the new first line to the next should be ~one row, + # accounting for the hour-jock cell baseline at the top. + diff = partitions["bottom_left"][1] - partitions["bottom_left"][0] + assert 90 < diff < 115, f"unexpected first-row span: {diff}" + + +def test_partition_row_lines_handles_blank_image() -> None: + """A blank image returns four empty lists — no crash, no missing keys.""" + blank = Image.new("RGB", (1000, 1500), color="white") + layout = detect_page_layout(blank) + partitions = partition_row_lines_by_quadrant(blank, layout) + assert set(partitions.keys()) == set(QUADRANT_ORDER) + for ys in partitions.values(): + assert ys == [] diff --git a/tests/unit/test_verifier_serve.py b/tests/unit/test_verifier_serve.py new file mode 100644 index 0000000..3aa54b0 --- /dev/null +++ b/tests/unit/test_verifier_serve.py @@ -0,0 +1,338 @@ +"""Tests for `verifier/serve.py`. + +`/api/save` is the load-bearing endpoint for the verifier UI — it +validates the verified payload as `PageResult`, guards against path +traversal via the bundle stem, writes both files to `data/verifier/`, +and conditionally updates `jobs.db` via `JobStore.mark_verified`. 
+ +These tests use httpx's ASGI transport to exercise the FastAPI app +in-process (no live server needed, no port collision with a running +`verifier/serve.py`). +""" + +from __future__ import annotations + +import json +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +import pytest +from httpx import ASGITransport, AsyncClient + +from core.jobs import JobStore +from core.schema import QUADRANT_ORDER, PageResult, Quadrant + + +def _page_result_dict() -> dict[str, Any]: + """Minimal valid PageResult payload for the verified-export body.""" + return PageResult( + page_date_raw="Mon 1 Jan 90", + quadrants=[ + Quadrant(position=p, hour_raw=None, jock_raw=None, entries=[], oddities=[]) + for p in QUADRANT_ORDER + ], + comments_raw=None, + oddities=[], + model_version="test-model", + extracted_at=datetime(2026, 5, 12, tzinfo=UTC), + ).model_dump(mode="json") + + +def _corrections_dict() -> dict[str, Any]: + return { + "stem": "test", + "model_version": "test-model", + "extracted_at": "2026-05-12T00:00:00Z", + "exported_at": "2026-05-12T00:00:01Z", + "page_corrections": [], + "quadrant_corrections": [], + "row_corrections": [], + "added_rows": [], + "deleted_rows": [], + } + + +@pytest.fixture +def serve_app(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + """Build a fresh FastAPI app rooted at `tmp_path` so each test + starts with an empty `data/verifier/` and its own `jobs.db`.""" + monkeypatch.setenv("DATA_ROOT", str(tmp_path / "data")) + # Reimport to pick up the environment variable (DATA_ROOT is read at + # module import time, not per-request). + import importlib + + import verifier.serve as serve_mod + + importlib.reload(serve_mod) + yield serve_mod + # Restore the module to its default for any subsequent test that + # imports it without the env override. + monkeypatch.undo() + importlib.reload(serve_mod) + + +async def _client(app): + return AsyncClient(transport=ASGITransport(app=app), base_url="http://test") + + +# -- /api/save body validation --------------------------------------------- + + +async def test_save_rejects_missing_verified(serve_app, tmp_path: Path) -> None: + """Body must include `verified` and `corrections` objects.""" + async with await _client(serve_app.app) as c: + r = await c.post( + "/api/save", + json={"stem": "abc", "corrections": _corrections_dict()}, + ) + assert r.status_code == 400 + assert "verified" in r.json()["detail"] + + +async def test_save_rejects_invalid_pageresult(serve_app, tmp_path: Path) -> None: + """Verified payload must validate as `PageResult` — a malformed one + is rejected before any file write.""" + async with await _client(serve_app.app) as c: + r = await c.post( + "/api/save", + json={ + "stem": "abc", + "verified": {"quadrants": []}, # missing required fields, wrong shape + "corrections": _corrections_dict(), + }, + ) + assert r.status_code == 400 + assert "PageResult" in r.json()["detail"] + # Nothing written. + assert not (tmp_path / "data" / "verifier").exists() + + +async def test_save_rejects_path_traversal_stem(serve_app, tmp_path: Path) -> None: + """`stem` containing `/`, `\\`, or `..` is refused so the server + can't be tricked into writing outside `data/verifier/`. 
Whitespace- + only stems are also rejected — they'd produce confusing ` .verified.json` + files.""" + async with await _client(serve_app.app) as c: + for bad in ("../escape", "a/b", "..", "a\\b", "", " ", "\t"): + r = await c.post( + "/api/save", + json={ + "stem": bad, + "verified": _page_result_dict(), + "corrections": _corrections_dict(), + }, + ) + assert r.status_code == 400, f"expected 400 for stem={bad!r}, got {r.status_code}" + + +# -- /api/save file persistence -------------------------------------------- + + +async def test_save_writes_both_files(serve_app, tmp_path: Path) -> None: + """A valid payload writes `.verified.json` and + `.corrections.json` under `/verifier/`.""" + async with await _client(serve_app.app) as c: + r = await c.post( + "/api/save", + json={ + "stem": "page25", + "verified": _page_result_dict(), + "corrections": _corrections_dict(), + }, + ) + assert r.status_code == 200 + body = r.json() + assert body["db_updated"] is False # no pdf_path/page_number sent + verifier_dir = tmp_path / "data" / "verifier" + verified = verifier_dir / "page25.verified.json" + corrections = verifier_dir / "page25.corrections.json" + assert verified.is_file() + assert corrections.is_file() + # `verified.json` round-trips through `PageResult` — the on-disk file + # is the consumable artifact, so the test pins its parseability. + PageResult.model_validate_json(verified.read_text()) + # Corrections is opaque JSON — pin only that it's well-formed. + json.loads(corrections.read_text()) + + +async def test_save_strips_bundle_only_fields_via_pydantic_roundtrip( + serve_app, tmp_path: Path +) -> None: + """A client that leaks bundle-only fields (row_bbox, schema_version, + etc.) shouldn't pollute the on-disk verified.json. The server's + `PageResult.model_validate(...).model_dump_json(...)` round-trip + strips unknown fields by Pydantic's default `extra='ignore'`.""" + polluted = _page_result_dict() + # Simulate the UI accidentally leaking bundle metadata into the + # verified payload (these don't belong on PageResult). + polluted["schema_version"] = 2 + polluted["stem"] = "page25" + polluted["image_path"] = "../tests/golden/x.png" + polluted["quadrants"][0]["bbox"] = [0, 0, 100, 100] + async with await _client(serve_app.app) as c: + r = await c.post( + "/api/save", + json={ + "stem": "polluted", + "verified": polluted, + "corrections": _corrections_dict(), + }, + ) + assert r.status_code == 200 + on_disk = json.loads((tmp_path / "data" / "verifier" / "polluted.verified.json").read_text()) + assert "schema_version" not in on_disk + assert "stem" not in on_disk + assert "image_path" not in on_disk + # Per-quadrant bbox is bundle-only too. + assert "bbox" not in on_disk["quadrants"][0] + + +async def test_save_overwrites_previous_files(serve_app, tmp_path: Path) -> None: + """Re-saving the same stem overwrites — verification is the latest + edit state, not an append-only log.""" + payload = { + "stem": "p", + "verified": _page_result_dict(), + "corrections": _corrections_dict(), + } + async with await _client(serve_app.app) as c: + await c.post("/api/save", json=payload) + # Second save with a tweaked date. 
+ payload["verified"]["page_date_raw"] = "Tues 2 Jan 90" + r2 = await c.post("/api/save", json=payload) + assert r2.status_code == 200 + verified = tmp_path / "data" / "verifier" / "p.verified.json" + assert json.loads(verified.read_text())["page_date_raw"] == "Tues 2 Jan 90" + + +# -- /api/save DB integration ---------------------------------------------- + + +async def test_save_updates_jobs_db_when_job_key_matches(serve_app, tmp_path: Path) -> None: + """When `pdf_path` + `page_number` are present AND `jobs.db` has a + matching row, the verification is recorded via `JobStore.mark_verified` + and `db_updated: true` is returned.""" + db_path = tmp_path / "data" / "jobs.db" + db_path.parent.mkdir(parents=True, exist_ok=True) + store = JobStore(db_path) + await store.init() + await store.register("1990/x.pdf", 1) + await store.mark_rendered("1990/x.pdf", 1, image_path=tmp_path / "x.png") + await store.mark_completed("1990/x.pdf", 1, result_path=tmp_path / "x.json", model_version="m") + + async with await _client(serve_app.app) as c: + r = await c.post( + "/api/save", + json={ + "stem": "x-page-01", + "pdf_path": "1990/x.pdf", + "page_number": 1, + "verified": _page_result_dict(), + "corrections": _corrections_dict(), + }, + ) + assert r.status_code == 200 + assert r.json()["db_updated"] is True + + job = await store.get("1990/x.pdf", 1) + assert job is not None + assert job.verified_at is not None + assert job.verified_path is not None and job.verified_path.endswith("x-page-01.verified.json") + assert job.corrections_path is not None and job.corrections_path.endswith( + "x-page-01.corrections.json" + ) + + +async def test_save_returns_db_updated_false_when_no_matching_job( + serve_app, tmp_path: Path +) -> None: + """A job key that doesn't match any row in `jobs.db` is not an error + — the server writes files and reports `db_updated: false`. Lets test + fixtures and ad-hoc pages save without a pre-registered job.""" + db_path = tmp_path / "data" / "jobs.db" + db_path.parent.mkdir(parents=True, exist_ok=True) + # Initialize an empty jobs.db so the file exists but has no matching row. + await JobStore(db_path).init() + + async with await _client(serve_app.app) as c: + r = await c.post( + "/api/save", + json={ + "stem": "ghost", + "pdf_path": "1990/no-such.pdf", + "page_number": 99, + "verified": _page_result_dict(), + "corrections": _corrections_dict(), + }, + ) + assert r.status_code == 200 + assert r.json()["db_updated"] is False + # Files still written. + assert (tmp_path / "data" / "verifier" / "ghost.verified.json").is_file() + + +async def test_save_rejects_bool_page_number(serve_app, tmp_path: Path) -> None: + """`isinstance(x, int)` is True for `bool` in Python — a malformed + `page_number: true` would coerce to 1 and look up the wrong job + row. Defensive: bool is rejected; the save still succeeds but with + `db_updated: false` (treated as no-job-key).""" + db_path = tmp_path / "data" / "jobs.db" + db_path.parent.mkdir(parents=True, exist_ok=True) + store = JobStore(db_path) + await store.init() + await store.register("1990/x.pdf", 1) + + async with await _client(serve_app.app) as c: + r = await c.post( + "/api/save", + json={ + "stem": "bool-test", + "pdf_path": "1990/x.pdf", + "page_number": True, # boolean, not real int + "verified": _page_result_dict(), + "corrections": _corrections_dict(), + }, + ) + assert r.status_code == 200 + assert r.json()["db_updated"] is False # bool was rejected, files only + # The job row at page 1 should NOT have been updated. 
+ job = await store.get("1990/x.pdf", 1) + assert job is not None + assert job.verified_at is None + + +async def test_save_writes_are_atomic_no_tmp_left_behind(serve_app, tmp_path: Path) -> None: + """Atomic writes use `.tmp` siblings + os.replace. After a successful + save, no `.tmp` files remain in data/verifier/.""" + async with await _client(serve_app.app) as c: + await c.post( + "/api/save", + json={ + "stem": "atomic", + "verified": _page_result_dict(), + "corrections": _corrections_dict(), + }, + ) + verifier_dir = tmp_path / "data" / "verifier" + tmp_files = list(verifier_dir.glob("*.tmp")) + assert tmp_files == [], f"unexpected tmp files left: {tmp_files}" + + +async def test_save_skips_db_when_no_jobs_db_file(serve_app, tmp_path: Path) -> None: + """If `data/jobs.db` doesn't exist (no pipeline has run), Save still + succeeds — no DB integration is attempted.""" + # tmp_path/data/jobs.db is absent. + async with await _client(serve_app.app) as c: + r = await c.post( + "/api/save", + json={ + "stem": "no-db", + "pdf_path": "1990/x.pdf", + "page_number": 1, + "verified": _page_result_dict(), + "corrections": _corrections_dict(), + }, + ) + assert r.status_code == 200 + assert r.json()["db_updated"] is False diff --git a/verifier/README.md b/verifier/README.md new file mode 100644 index 0000000..ef96db5 --- /dev/null +++ b/verifier/README.md @@ -0,0 +1,175 @@ +# Flowsheet verifier UI + +A static, dependency-free single-page app for manually verifying flowsheet extraction output. Each row's cropped image strip is shown next to the model-detected text in an editable field. Hand-correct typos, mark hallucinated rows, add missed rows, then export a `verified.json` that flows back into the pipeline as ground truth. + +## Run + +The verifier ships with a tiny FastAPI server that does two things: + +1. Serves `verifier/`, `data/`, and `tests/` as static files. +2. Proxies the **Check artists** lookups through `/api/lookup` to the request-o-matic `/request` endpoint (request-o-matic doesn't emit CORS headers, so a same-origin proxy is simpler than configuring CORS). + +```bash +# from the repo root +.venv/bin/python verifier/serve.py +# default port is 8765; override with VERIFIER_PORT=9000 .venv/bin/python verifier/serve.py + +# then open in a browser: +open "http://localhost:8765/verifier/?bundle=/data/verifier/.bundle.json" +``` + +If you want only the static side and don't need the artist-lookup button, `python -m http.server 8765` from the repo root still works — the Check-artists button will return 404s but everything else functions. + +The `?bundle=...` URL param is the recommended path: the UI fetches the bundle, then resolves the bundle's `image_path` (relative path inside the JSON) and fetches the image too. + +You can also load a bundle via the **Load bundle** file picker, in which case a second **Load image** picker appears. This path works without a server but you must pick both files manually. + +## File layout + +The bundle's `image_path` is **relative to the bundle file's directory**. The expected layout under the repo's `data/` directory: + +``` +data/ + pages//.png # source image + results//.json # pipeline output (input to make_verifier_bundle) + verifier/.bundle.json # pre-processor output, references ../pages//.png + verifier/.verified.json # UI export (download to this directory by convention) +tests/golden/.truth.json # derive_truth output (optional destination) +``` + +## End-to-end workflow + +1. Run the pipeline to produce `data/results//.json`. +2. 
Generate a bundle: + + ```bash + python -m scripts.make_verifier_bundle \ + data/results//.json \ + data/pages//.png \ + --out data/verifier/.bundle.json + ``` + +3. Open the verifier and load the bundle. +4. Walk the page: each row shows a cropped image strip + the model's `raw_text`. Correct typos, set `type` and `notes` when needed, click ✗ to mark hallucinations, click **+ add row** to insert a row the model missed. +5. Edit the page-level fields: `page_date_raw`, `comments_raw`, `oddities`. +6. Click **Export verified** → downloads `.verified.json`. Move it to `data/verifier/`. +7. (Optional) Derive a `tests/golden/*.truth.json`: + + ```bash + python -m scripts.derive_truth \ + data/verifier/.verified.json \ + --out tests/golden/.truth.json + ``` + +## Bundle schema + +```json +{ + "schema_version": 2, + "pdf_path": "1990/April 1990/1990-04apr0106.pdf", + "page_number": 25, + "stem": "", + "image_path": "", + "model_version": "", + "extracted_at": "", + "page_date_raw": "...", + "comments_raw": "...", + "oddities": ["..."], + "quadrants": [ + { + "position": "top_left", + "bbox": [x1, y1, x2, y2], + "hour_raw": "6AM", + "jock_raw": "Andrew", + "entries": [ + { + "row_index": 0, + "raw_text": "...", + "confidence": "high", + "type_raw": "M", + "notes": null, + "oddities": [], + "row_bbox": [x1, y1, x2, y2] + } + ], + "oddities": [] + } + ] +} +``` + +### Versioning + +`schema_version` is currently `2`. v1 was the initial bundle shape; v2 added the optional `pdf_path` and `page_number` fields so `Save` can target the corresponding `jobs.db` row. Future incompatible changes bump the version; the UI shows an error banner if it sees an unsupported version. Keep `schema_version` set when archiving bundles so older bundles remain loadable. + +## Saving + +Clicking **Save** POSTs the current edit state to the server's `/api/save` endpoint, which: + +1. Writes `data/verifier/.verified.json` — `PageResult`-shaped JSON validating against `core.schema.PageResult`. Bundle-only fields (`schema_version`, `stem`, `image_path`, `pdf_path`, `page_number`, per-entry `row_bbox`) are stripped before validation. Rows marked ✗ are excluded. Rows added via **+ add row** are included. +2. Writes `data/verifier/.corrections.json` — the delta between the loaded bundle and the verified state (shape below). +3. If the bundle has a non-null `pdf_path` + `page_number` (production-pipeline pages do; test fixtures don't), updates the matching `jobs.db` row via `JobStore.mark_verified` — setting `verified_at`, `verified_path`, and `corrections_path`. + +The status bar reports the destination files and whether `jobs.db` was updated: + +> Saved data/verifier/X.verified.json + data/verifier/X.corrections.json · 4 field correction(s), 0 added, 0 deleted · jobs.db updated. + +If you'd rather have a downloadable file, open the saved JSON from `data/verifier/` directly. 
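
If you need to hit `/api/save` outside the browser (for example, to re-submit a `verified.json` you hand-edited after export), the endpoint accepts the same body shape the UI sends. Below is a minimal sketch using `httpx`, which `verifier/serve.py` already depends on; the stem and file paths are hypothetical placeholders, and the server is assumed to be running on the default port.

```python
# Sketch: re-submit an edited verified.json + corrections.json pair to the
# dev server. Assumes verifier/serve.py is running on localhost:8765 and
# that the stem below exists under data/verifier/ (hypothetical example).
import json
from pathlib import Path

import httpx

stem = "1990-04apr0106-page25"  # hypothetical stem
verifier_dir = Path("data/verifier")

body = {
    "stem": stem,
    # Set pdf_path + page_number to also stamp the matching jobs.db row;
    # leave them null for fixtures or ad-hoc pages (files-only save).
    "pdf_path": None,
    "page_number": None,
    "verified": json.loads((verifier_dir / f"{stem}.verified.json").read_text()),
    "corrections": json.loads((verifier_dir / f"{stem}.corrections.json").read_text()),
}

resp = httpx.post("http://localhost:8765/api/save", json=body)
resp.raise_for_status()
print(resp.json())  # {"verified_path": ..., "corrections_path": ..., "db_updated": ...}
```

A `400` response means the payload didn't validate as `PageResult` or the stem looked like a path-traversal attempt; in that case nothing is written to disk.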
+ +The `corrections.json` shape: + +```json +{ + "stem": "...", + "model_version": "...", + "extracted_at": "...", + "exported_at": "...", + "page_corrections": [ + {"field": "page_date_raw", "original": "...", "corrected": "..."} + ], + "quadrant_corrections": [ + {"position": "top_left", "field": "hour_raw", "original": "6AM", "corrected": "6PM"} + ], + "row_corrections": [ + {"position": "top_left", "row_index": 0, "field": "raw_text", + "original": "Smiths-I wnat", "corrected": "Smiths-I want the one I can't have"} + ], + "added_rows": [ + {"position": "top_left", "row_index": 12, "raw_text": "...", + "type_raw": null, "notes": null} + ], + "deleted_rows": [ + {"position": "top_left", "row_index": 7, "original_raw_text": "..."} + ] +} +``` + +The `verified.json` is the consumable artifact (plugs back into the pipeline as ground truth). The `corrections.json` is the audit record (preserves the original model output for diff analysis). Rows the user neither edited nor marked ✗ produce no entry in either file — by clicking Save, the user is implicitly endorsing every untouched row. + +Truth derivation is a **separate Python tool** (`scripts/derive_truth.py`) rather than a UI button — the substring-extraction rules live in one place (Python, testable), not duplicated in JS. + +## Check artists (request-o-matic lookup) + +Click **Check artists** in the header to look up every row's text via the WXYC library + Discogs reconciliation pipeline. Each row gets a badge with the resolved artist + matched **release** (album / 12") and a confidence score. + +**Important contrast**: the flowsheet records `Artist - Track`, but the library and Discogs match at the **release** level. The badge text is labeled `artist · album: "..."` (full track match) or `artist · sample release: "..."` (artist-only fallback) so this never looks like a near-track-match when it's a release-level result. + +Badge states: + +- **Green** — track found in the library on this release. High confidence the artist is right; the release shown is the album/single containing the played track. +- **Yellow** — one of: + - **`⚠ artist-only · ...`**: the library has the artist but not this specific track. The "sample release" is whichever album of theirs the library indexed first — it's *not* a confirmation that the played track lives there. + - **`⚠ postdates · ...`**: the matched release's year is after the flowsheet's page year. The page year is parsed from `page_date_raw` (1990 for `Thurs 4/5/90`, etc.); when `release_year > page_year` the match is almost certainly a later remix, reissue, or same-name band. + - artwork confidence below 0.5. +- **Grey/italic** — no library match found. Could be a typo, a non-canonical name, or genuinely missing from the WXYC corpus (the library reflects current stock, not 1990 stock — ~30% of mid-density pages will have these). +- **Faded/italic** — stale. You edited the row after running Check; re-run to refresh. + +The lookup goes through request-o-matic's LLM-driven request parser (artist normalization, fuzzy matching) before hitting the LML library search. The badge reflects request-o-matic's `library_results` and `artwork` fields — not LML's `/api/v1/lookup` directly, since the LLM correction layer is the load-bearing piece. + +## Known rough edges (v1) + +- **No autosave / localStorage.** Close the tab and unsaved edits are lost. Export before navigating away. +- **No batch loader.** One bundle at a time. +- **No keyboard shortcuts.** Mouse-driven only. 
+- **Confidence is not editable.** That field is a model artifact, not user truth. +- **Row crops use detected grid lines when available, even spacing otherwise.** A quadrant where the model over-emitted rows (more entries than handwritten lines) will show vertically squashed crops — visible but possibly mis-cropped at boundaries. Eye your way through it. diff --git a/verifier/app.js b/verifier/app.js new file mode 100644 index 0000000..b251f35 --- /dev/null +++ b/verifier/app.js @@ -0,0 +1,736 @@ +// Flowsheet verifier — vanilla JS, no build step. +// +// Loads a bundle.json (produced by scripts/make_verifier_bundle.py) plus +// the page image it references, renders per-row canvas crops next to +// editable text fields, and on Save POSTs two files to /api/save: +// 1. .verified.json — PageResult-shaped corrected page +// 2. .corrections.json — delta vs the original bundle +// +// Two load paths are supported: +// 1. Server-served bundle: fetch(bundle) then fetch(image) by relative +// URL. Used when the page is served via `python verifier/serve.py`. +// 2. File-picker bundle: read the bundle as text, then prompt for the +// image file separately. +// +// State is split: +// state.originalBundle — immutable snapshot of the loaded bundle. Never +// mutated; used as the diff baseline on save. +// state.bundle — working copy. Mutated by edits and UI flags +// (`_added`, `_deleted`). + +"use strict"; + +const SUPPORTED_SCHEMA_VERSION = 2; + +const state = { + bundle: null, // mutable working copy + originalBundle: null, // immutable snapshot for diffing + pageImage: null, // HTMLImageElement + lookupConcurrency: 4, // parallel /api/lookup requests +}; + +const $ = (sel, root = document) => root.querySelector(sel); + +function setStatus(msg, kind = "info") { + const el = $("#status"); + el.textContent = msg; + el.className = kind === "error" ? "error" : ""; +} + +function cloneDeep(obj) { + return JSON.parse(JSON.stringify(obj)); +} + +// ---- bundle loading ------------------------------------------------------ + +async function loadBundleFromUrlParam() { + const params = new URLSearchParams(location.search); + const path = params.get("bundle"); + if (!path) return false; + try { + const r = await fetch(path); + if (!r.ok) throw new Error(`fetch ${path}: ${r.status}`); + const bundle = await r.json(); + await initBundle(bundle, { bundleUrl: path }); + return true; + } catch (err) { + setStatus(`Failed to load bundle: ${err.message}`, "error"); + return false; + } +} + +async function loadBundleFromFile(file) { + try { + const text = await file.text(); + const bundle = JSON.parse(text); + await initBundle(bundle, { bundleUrl: null }); + } catch (err) { + setStatus(`Failed to parse bundle: ${err.message}`, "error"); + } +} + +async function initBundle(bundle, { bundleUrl }) { + if (bundle.schema_version !== SUPPORTED_SCHEMA_VERSION) { + setStatus( + `Unsupported schema_version ${bundle.schema_version}; ` + + `this UI supports v${SUPPORTED_SCHEMA_VERSION}.`, + "error" + ); + return; + } + state.originalBundle = cloneDeep(bundle); + state.bundle = cloneDeep(bundle); + state.pageImage = null; + + if (bundleUrl) { + const imageUrl = new URL(bundle.image_path, new URL(bundleUrl, location.href)); + state.pageImage = await loadImage(imageUrl.href); + finishInit(); + } else { + $("#image-picker").hidden = false; + setStatus("Bundle loaded. 
Pick the page image to continue."); + } +} + +function loadImage(src) { + return new Promise((resolve, reject) => { + const img = new Image(); + img.onload = () => resolve(img); + img.onerror = () => reject(new Error(`failed to load image ${src}`)); + img.src = src; + }); +} + +async function loadImageFromFile(file) { + const url = URL.createObjectURL(file); + try { + state.pageImage = await loadImage(url); + finishInit(); + } catch (err) { + setStatus(`Failed to load image: ${err.message}`, "error"); + } +} + +function finishInit() { + setStatus( + `Loaded ${state.bundle.stem} ` + + `(${state.pageImage.naturalWidth}×${state.pageImage.naturalHeight}px).` + ); + $("#app").hidden = false; + $("#save-verified").disabled = false; + $("#toggle-page-view").disabled = false; + $("#check-artists").disabled = false; + $("#page-view-img").src = state.pageImage.src; + renderPageMeta(); + renderQuadrants(); + // Show the full-page reference by default; verifiers asked for this + // because the row crops need page context to be useful. + togglePageView(); +} + +// ---- render: page meta --------------------------------------------------- + +function renderPageMeta() { + const dateEl = $("#page-date-raw"); + dateEl.value = state.bundle.page_date_raw ?? ""; + dateEl.addEventListener("input", () => { + state.bundle.page_date_raw = dateEl.value || null; + }); + + const commentsEl = $("#comments-raw"); + commentsEl.value = state.bundle.comments_raw ?? ""; + commentsEl.addEventListener("input", () => { + state.bundle.comments_raw = commentsEl.value || null; + }); + + const oddEl = $("#oddities"); + oddEl.value = (state.bundle.oddities ?? []).join("\n"); + oddEl.addEventListener("input", () => { + state.bundle.oddities = oddEl.value + .split("\n") + .map(s => s.trim()) + .filter(Boolean); + }); +} + +// ---- render: quadrants --------------------------------------------------- + +function renderQuadrants() { + const container = $("#quadrants-container"); + container.innerHTML = ""; + const tmpl = $("#quadrant-template"); + + for (const quad of state.bundle.quadrants) { + const node = tmpl.content.firstElementChild.cloneNode(true); + $(".quadrant-title", node).textContent = quad.position; + + const hourEl = $(".hour-raw", node); + hourEl.value = quad.hour_raw ?? ""; + hourEl.addEventListener("input", () => { + quad.hour_raw = hourEl.value || null; + }); + + const jockEl = $(".jock-raw", node); + jockEl.value = quad.jock_raw ?? ""; + jockEl.addEventListener("input", () => { + quad.jock_raw = jockEl.value || null; + }); + + const rowsEl = $(".rows", node); + for (const entry of quad.entries) { + rowsEl.appendChild(buildRow(entry, quad)); + } + + $(".add-row", node).addEventListener("click", () => { + const newEntry = { + row_index: quad.entries.length, + raw_text: "", + confidence: "low", + type_raw: null, + notes: null, + oddities: [], + row_bbox: null, + _added: true, + }; + quad.entries.push(newEntry); + rowsEl.appendChild(buildRow(newEntry, quad)); + }); + + container.appendChild(node); + } +} + +function buildRow(entry, quad) { + const tmpl = $("#row-template"); + const node = tmpl.content.firstElementChild.cloneNode(true); + node.dataset.rowIndex = String(entry.row_index); + + const canvas = $(".row-crop", node); + if (entry.row_bbox) { + drawCrop(canvas, entry.row_bbox); + } else { + canvas.outerHTML = `
<div class="row-crop no-crop">no crop (added row)</div>
`; + } + + const textEl = $(".raw-text", node); + textEl.value = entry.raw_text; + textEl.addEventListener("input", () => { + entry.raw_text = textEl.value; + // Edit invalidates the lookup badge — show as stale until re-check. + const badge = $(".lookup-badge", node); + if (badge && !badge.hidden) { + badge.classList.add("stale"); + badge.title = "Click 'Check artists' to refresh."; + } + }); + + const typeEl = $(".type-raw input", node); + typeEl.value = entry.type_raw ?? ""; + typeEl.addEventListener("input", () => { + entry.type_raw = typeEl.value || null; + }); + + const notesEl = $(".notes select", node); + const syncNotesView = () => { + notesEl.value = entry.notes ?? ""; + node.classList.toggle("has-notes", !!entry.notes); + }; + syncNotesView(); + notesEl.addEventListener("change", () => { + entry.notes = notesEl.value || null; + syncNotesView(); + }); + + $(".delete-row", node).addEventListener("click", () => { + entry._deleted = !entry._deleted; + node.classList.toggle("deleted", entry._deleted); + }); + + return node; +} + +function drawCrop(canvas, bbox) { + const [x1, y1, x2, y2] = bbox; + const srcW = x2 - x1; + const srcH = y2 - y1; + if (srcW <= 0 || srcH <= 0) { + canvas.outerHTML = `
<div class="row-crop no-crop">empty bbox</div>
`; + return; + } + canvas.width = srcW; + canvas.height = srcH; + // Let CSS govern display size — the canvas's intrinsic aspect ratio is + // preserved by `width: 100%; height: auto` in styles.css. This makes the + // crop fill the available column width (full row when the side panel is + // closed; narrower when the page-view panel pushes the editor). + const ctx = canvas.getContext("2d"); + ctx.drawImage(state.pageImage, x1, y1, srcW, srcH, 0, 0, srcW, srcH); +} + +// ---- export: PageResult verified.json ----------------------------------- + +function buildVerifiedExport() { + // Strip bundle-only fields, per-entry row_bbox, and UI flags. Validates + // as PageResult directly. + return { + page_date_raw: state.bundle.page_date_raw, + quadrants: state.bundle.quadrants.map(quad => ({ + position: quad.position, + hour_raw: quad.hour_raw, + jock_raw: quad.jock_raw, + entries: quad.entries + .filter(e => !e._deleted) + .map(e => ({ + row_index: e.row_index, + raw_text: e.raw_text, + type_raw: e.type_raw, + confidence: e.confidence, + notes: e.notes, + oddities: e.oddities ?? [], + })), + oddities: quad.oddities ?? [], + })), + comments_raw: state.bundle.comments_raw, + oddities: state.bundle.oddities ?? [], + model_version: state.bundle.model_version, + extracted_at: state.bundle.extracted_at, + }; +} + +// ---- export: corrections.json (delta) ----------------------------------- + +// Fields that participate in row-level correction tracking. row_bbox is +// derived geometry, not user-editable text, so it never appears as a +// correction. confidence is model output, not user truth. +const ROW_TRACKED_FIELDS = ["raw_text", "type_raw", "notes"]; + +// Page-level and quadrant-level fields the verifier exposes for editing. +const PAGE_TRACKED_FIELDS = ["page_date_raw", "comments_raw"]; +const QUADRANT_TRACKED_FIELDS = ["hour_raw", "jock_raw"]; + +function arraysEqual(a, b) { + if (a == null && b == null) return true; + if (a == null || b == null) return false; + if (a.length !== b.length) return false; + for (let i = 0; i < a.length; i++) if (a[i] !== b[i]) return false; + return true; +} + +function findOriginalEntry(quadPosition, rowIndex) { + const quad = state.originalBundle.quadrants.find(q => q.position === quadPosition); + if (!quad) return null; + return quad.entries.find(e => e.row_index === rowIndex) ?? null; +} + +function findOriginalQuadrant(position) { + return state.originalBundle.quadrants.find(q => q.position === position) ?? null; +} + +function buildCorrectionsExport() { + const page_corrections = []; + for (const field of PAGE_TRACKED_FIELDS) { + const orig = state.originalBundle[field] ?? null; + const cur = state.bundle[field] ?? null; + if (orig !== cur) { + page_corrections.push({ field, original: orig, corrected: cur }); + } + } + if (!arraysEqual(state.originalBundle.oddities ?? [], state.bundle.oddities ?? [])) { + page_corrections.push({ + field: "oddities", + original: state.originalBundle.oddities ?? [], + corrected: state.bundle.oddities ?? [], + }); + } + + const quadrant_corrections = []; + const row_corrections = []; + const added_rows = []; + const deleted_rows = []; + + for (const quad of state.bundle.quadrants) { + const origQuad = findOriginalQuadrant(quad.position); + if (origQuad) { + for (const field of QUADRANT_TRACKED_FIELDS) { + const orig = origQuad[field] ?? null; + const cur = quad[field] ?? 
null; + if (orig !== cur) { + quadrant_corrections.push({ + position: quad.position, + field, + original: orig, + corrected: cur, + }); + } + } + } + + for (const entry of quad.entries) { + // Added-and-then-deleted: dropped entirely, no signal worth keeping. + if (entry._added && entry._deleted) continue; + + if (entry._added) { + added_rows.push({ + position: quad.position, + row_index: entry.row_index, + raw_text: entry.raw_text, + type_raw: entry.type_raw, + notes: entry.notes, + }); + continue; + } + + if (entry._deleted) { + const orig = findOriginalEntry(quad.position, entry.row_index); + deleted_rows.push({ + position: quad.position, + row_index: entry.row_index, + original_raw_text: orig?.raw_text ?? null, + }); + continue; + } + + // Existing, not deleted: emit corrections per changed field. + const orig = findOriginalEntry(quad.position, entry.row_index); + if (orig) { + for (const field of ROW_TRACKED_FIELDS) { + const origVal = orig[field] ?? null; + const curVal = entry[field] ?? null; + if (origVal !== curVal) { + row_corrections.push({ + position: quad.position, + row_index: entry.row_index, + field, + original: origVal, + corrected: curVal, + }); + } + } + } + } + } + + return { + stem: state.bundle.stem, + model_version: state.bundle.model_version, + extracted_at: state.bundle.extracted_at, + exported_at: new Date().toISOString(), + page_corrections, + quadrant_corrections, + row_corrections, + added_rows, + deleted_rows, + }; +} + +// ---- file-download helpers ----------------------------------------------- + +async function saveAll() { + if (!state.bundle) return; + const btn = $("#save-verified"); + btn.disabled = true; + const original = btn.textContent; + btn.textContent = "Saving…"; + + const verified = buildVerifiedExport(); + const corrections = buildCorrectionsExport(); + const body = { + stem: state.bundle.stem, + pdf_path: state.bundle.pdf_path ?? null, + page_number: state.bundle.page_number ?? null, + verified, + corrections, + }; + + try { + const r = await fetch("/api/save", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }); + if (!r.ok) { + const detail = await r.text(); + throw new Error(`/api/save ${r.status}: ${detail}`); + } + const result = await r.json(); + const n = + corrections.row_corrections.length + + corrections.page_corrections.length + + corrections.quadrant_corrections.length; + const dbBit = result.db_updated + ? "jobs.db updated" + : (body.pdf_path && body.page_number != null + ? "no matching job row (files only)" + : "files only (no job key)"); + setStatus( + `Saved ${result.verified_path} + ${result.corrections_path} · ` + + `${n} field correction(s), ${corrections.added_rows.length} added, ` + + `${corrections.deleted_rows.length} deleted · ${dbBit}.` + ); + } catch (err) { + setStatus(`Save failed: ${err.message}`, "error"); + } finally { + btn.disabled = false; + btn.textContent = original; + } +} + +// ---- artist/track lookup (request-o-matic via /api/lookup proxy) -------- + +async function lookupOne(message) { + const r = await fetch("/api/lookup", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ message }), + }); + if (!r.ok) throw new Error(`/api/lookup ${r.status}: ${await r.text()}`); + return await r.json(); +} + +// Same separator regex as `core.parse.parse_artist_track` (Python side). +// Pulls the artist out of a flowsheet `Artist - Track` string for +// comparison against the library-resolved artist. 
+const ARTIST_TRACK_SEPARATOR = /\s*[-–—]\s*/;
+
+// Stop words and bibliographic prefixes that the WXYC corpus often
+// drops or adds inconsistently. Stripping them prevents a "the" / "a"
+// difference from making "the sundays" and "Sundays" look like a
+// mismatch.
+const ARTIST_TOKEN_STOPWORDS = new Set([
+  "the", "a", "an", "and", "&", "feat", "featuring", "ft", "with", "vs", "presents",
+]);
+
+function _tokenize(s) {
+  return s
+    .toLowerCase()
+    .normalize("NFKD")
+    .replace(/[^\p{L}\p{N}\s]/gu, " ")
+    .split(/\s+/)
+    .map((t) => t.replace(/s$/, "")) // crude singularization: boys -> boy
+    .filter((t) => t.length >= 2 && !ARTIST_TOKEN_STOPWORDS.has(t));
+}
+
+function parseInputArtist(rawText) {
+  if (!rawText) return null;
+  const parts = rawText.split(ARTIST_TRACK_SEPARATOR);
+  return parts[0]?.trim() || null;
+}
+
+// Returns true when the resolved artist shares NO tokens with the input
+// artist (after normalization, stop-word and trailing-s stripping).
+// Conservative: when either side has zero meaningful tokens, returns
+// false (no signal). Catches "Pure Joy → Coldcut" where the LLM
+// fuzzy-matched on a track word; tolerates "the sundays → Sundays" and
+// "Beastie Boy → Beastie Boys".
+function artistTokensDisjoint(inputArtist, resolvedArtist) {
+  if (!inputArtist || !resolvedArtist) return false;
+  const a = new Set(_tokenize(inputArtist));
+  const b = new Set(_tokenize(resolvedArtist));
+  if (a.size === 0 || b.size === 0) return false;
+  for (const t of a) if (b.has(t)) return false;
+  return true;
+}
+
+// Parse a 2- or 4-digit year out of `page_date_raw`. The WXYC corpus spans
+// 1990-2001, so 2-digit years 90-99 map to 19xx and 00-01 map to 20xx.
+// Returns null when no plausible year is present.
+function parsePageYear(pageDateRaw) {
+  if (!pageDateRaw) return null;
+  // Try 4-digit year first.
+  const fourDigit = pageDateRaw.match(/\b(19\d{2}|20\d{2})\b/);
+  if (fourDigit) return Number(fourDigit[1]);
+  // Fall back to a 2-digit year. Scan every 2-digit token that's NOT
+  // surrounded by other digits (so we don't pluck "19" or "90" out of
+  // "1990"). 2-digit years in the WXYC corpus range (80-99 → 19xx,
+  // 00-09 → 20xx) win; anything else (a month like "04" or a day like
+  // "31") is skipped.
+  const twoDigitMatches = pageDateRaw.matchAll(/(?<!\d)(\d{2})(?!\d)/g);
+  for (const m of twoDigitMatches) {
+    const n = Number(m[1]);
+    if (n >= 80 && n <= 99) return 1900 + n;
+    if (n >= 0 && n <= 9) return 2000 + n;
+  }
+  return null;
+}
+
+function badgeContentFor(data, pageYear, inputRawText) {
+  const parsed = data.parsed || {};
+  const artwork = data.artwork || {};
+  const libResults = data.library_results || [];
+  const libTop = libResults[0];
+  if (!libTop && !artwork.artist) {
+    return { kind: "empty", text: "no library match" };
+  }
+
+  // Prefer library_results (authoritative for the WXYC corpus). Fall back
+  // to Discogs artwork for orientation.
+  const artist = libTop?.artist || artwork.artist || parsed.artist || "?";
+  // library_results[i].title and artwork.album both denote a RELEASE
+  // (album / 12") in the library — never a track. The flowsheet records
+  // tracks, so we label the field explicitly to avoid the visual conflict
+  // with the flowsheet's "Artist - Track" shape.
+  const release = libTop?.title || artwork.album || "";
+  const conf = typeof artwork.confidence === "number" ? artwork.confidence : null;
+  const releaseYear = typeof artwork.release_year === "number" ? artwork.release_year : null;
+
+  // Fallback signal: the library reconciler couldn't find the specific
+  // track and is returning the artist's catalog instead.
`song_not_found` + // is the canonical flag; `search_type === "song_as_artist"` means the + // LLM parser couldn't identify the artist and reinterpreted the parsed + // song as the artist (e.g. "Beastie Boy" treated as artist when LLM + // missed the plural). Both mean: artist may be right, but the release + // shown is unrelated to whatever track the DJ actually played. + const fallback = data.song_not_found === true || data.search_type === "song_as_artist"; + + // Anachronism: matched release postdates the flowsheet. + const postdates = pageYear != null && releaseYear != null && releaseYear > pageYear; + + // Artist mismatch: resolved artist shares zero tokens with the artist + // we parsed out of the flowsheet text. Catches request-o-matic + // fuzzy-matching on a track word (Pure Joy → Coldcut via "Pieces") + // even when the release year happens to be plausible. + const inputArtist = parseInputArtist(inputRawText); + const artistMismatch = artistTokensDisjoint(inputArtist, artist); + + const stampBits = []; + if (releaseYear !== null) stampBits.push(String(releaseYear)); + if (conf !== null) stampBits.push(conf.toFixed(2)); + const stamp = stampBits.length ? ` (${stampBits.join(", ")})` : ""; + + let text; + let kind; + if (fallback) { + // "artist-only" makes it clear we have the artist but not the track, + // and "sample release" disclaims the album shown is illustrative + // (whichever release of theirs the library indexed first), not a + // confirmation that this is where the played track lives. + text = release + ? `⚠ artist-only · ${artist} · sample release: "${release}"${stamp}` + : `⚠ artist-only · ${artist}${stamp}`; + kind = "hit-weak"; + } else { + text = release + ? `${artist} · album: "${release}"${stamp}` + : `${artist}${stamp}`; + kind = conf !== null && conf < 0.5 ? "hit-weak" : "hit-strong"; + } + if (postdates) { + text = "⚠ postdates · " + text; + kind = "hit-weak"; + } + if (artistMismatch) { + text = `⚠ different artist (got "${artist}", expected "${inputArtist}") · ${text}`; + kind = "hit-weak"; + } + return { kind, text }; +} + +function applyBadge(rowEl, kind, text, title) { + const badge = $(".lookup-badge", rowEl); + if (!badge) return; + badge.hidden = false; + badge.className = `lookup-badge ${kind}`; + badge.textContent = text; + if (title) badge.title = title; +} + +async function checkArtists() { + if (!state.bundle) return; + const btn = $("#check-artists"); + btn.disabled = true; + const originalLabel = btn.textContent; + const pageYear = parsePageYear(state.bundle.page_date_raw); + + // Collect every non-deleted, non-empty row with its DOM node. + const work = []; + for (const quad of state.bundle.quadrants) { + const quadNode = [...$$(".quadrant")].find( + (n) => $(".quadrant-title", n).textContent === quad.position + ); + if (!quadNode) continue; + const rowNodes = $$(".row", quadNode); + for (let i = 0; i < quad.entries.length; i++) { + const entry = quad.entries[i]; + if (entry._deleted || !entry.raw_text?.trim()) continue; + const rowEl = rowNodes[i]; + if (!rowEl) continue; + work.push({ rowEl, entry }); + applyBadge(rowEl, "loading", "…looking up", ""); + } + } + + let done = 0; + const total = work.length; + const updateBtn = () => { + btn.textContent = `Checking artists (${done}/${total})…`; + }; + updateBtn(); + + // Concurrency-limited fan-out. 
+ const queue = work.slice(); + async function worker() { + while (queue.length) { + const job = queue.shift(); + if (!job) break; + try { + const data = await lookupOne(job.entry.raw_text); + const { kind, text } = badgeContentFor(data, pageYear, job.entry.raw_text); + const aw = data.artwork || {}; + const title = + `parsed_artist=${(data.parsed || {}).artist ?? "?"}; ` + + `library_results=${(data.library_results || []).length}` + + (aw.release_year ? `; release_year=${aw.release_year}` : "") + + (pageYear ? `; page_year=${pageYear}` : ""); + applyBadge(job.rowEl, kind, text, title); + } catch (err) { + applyBadge(job.rowEl, "error", "lookup failed", String(err)); + } + done++; + updateBtn(); + } + } + await Promise.all( + Array.from({ length: state.lookupConcurrency }, () => worker()) + ); + + btn.disabled = false; + btn.textContent = originalLabel; + setStatus( + `Checked ${total} row(s) via request-o-matic` + + (pageYear ? ` (gating release_year > ${pageYear} as anachronistic).` : ".") + ); +} + +function $$(sel, root = document) { + return Array.from(root.querySelectorAll(sel)); +} + +function togglePageView() { + const aside = $("#page-view"); + const main = $("main"); + const btn = $("#toggle-page-view"); + const open = !aside.classList.contains("is-open"); + aside.classList.toggle("is-open", open); + main.classList.toggle("page-view-open", open); + btn.classList.toggle("is-active", open); + aside.setAttribute("aria-hidden", String(!open)); + btn.textContent = open ? "Hide page" : "Show page"; +} + +// ---- wiring -------------------------------------------------------------- + +document.addEventListener("DOMContentLoaded", async () => { + $("#bundle-input").addEventListener("change", (e) => { + const file = e.target.files?.[0]; + if (file) loadBundleFromFile(file); + }); + $("#image-input").addEventListener("change", (e) => { + const file = e.target.files?.[0]; + if (file) loadImageFromFile(file); + }); + $("#save-verified").addEventListener("click", saveAll); + $("#toggle-page-view").addEventListener("click", togglePageView); + $("#check-artists").addEventListener("click", checkArtists); + + await loadBundleFromUrlParam(); +}); diff --git a/verifier/index.html b/verifier/index.html new file mode 100644 index 0000000..22270ae --- /dev/null +++ b/verifier/index.html @@ -0,0 +1,93 @@ + + + + + + Flowsheet verifier + + + +
+

Flowsheet verifier

+
+ + + + + + Pick a bundle.json to begin. +
+
+ + + +
+
+

Page

+
+ + + +
+
+ +
+ +
+
+ + + + + + + + diff --git a/verifier/serve.py b/verifier/serve.py new file mode 100644 index 0000000..f58a804 --- /dev/null +++ b/verifier/serve.py @@ -0,0 +1,235 @@ +"""Dev server for the verifier UI. + +Serves the repo's static files (verifier/, data/, tests/) and provides: + + POST /api/lookup — same-origin proxy to request-o-matic /request + (request-o-matic doesn't emit CORS headers, and + a proxy is simpler than configuring CORS on a + third-party service). + POST /api/save — persist a verifier UI session: writes + .verified.json and .corrections.json + into data/verifier/, and (when the bundle carries + a `pdf_path`/`page_number` pair) updates + `jobs.db` via `JobStore.mark_verified`. + +Run: + + .venv/bin/python verifier/serve.py + +Then open http://localhost:8765/verifier/?bundle=/data/verifier/.bundle.json +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path + +import httpx +import uvicorn +from fastapi import FastAPI, HTTPException, Request +from fastapi.responses import JSONResponse +from fastapi.staticfiles import StaticFiles + +from core.jobs import JobStore +from core.schema import PageResult + +REQUEST_O_MATIC_URL = os.environ.get( + "REQUEST_O_MATIC_URL", + "https://request-o-matic-production.up.railway.app/api/v1/request", +) +PORT = int(os.environ.get("VERIFIER_PORT", "8765")) +REPO_ROOT = Path(__file__).resolve().parents[1] +DATA_ROOT = Path(os.environ.get("DATA_ROOT", REPO_ROOT / "data")) +VERIFIER_DIR = DATA_ROOT / "verifier" +JOBS_DB_PATH = DATA_ROOT / "jobs.db" + +app = FastAPI(docs_url=None, redoc_url=None) + +# Cache of jobs.db paths whose `init()` migrations have already been +# applied this process. Avoids re-running PRAGMAs + ALTER-TABLE checks +# on every /api/save when jobs.db is unchanged across saves. Tests that +# swap DATA_ROOT via importlib.reload get a fresh empty set. +_initialized_jobs_dbs: set[Path] = set() + + +async def _open_jobs_store() -> JobStore | None: + """Return an initialized `JobStore` if `jobs.db` is on disk, else None. + + Runs `JobStore.init()` once per (db_path, process) pair — subsequent + calls hit the in-memory cache and skip the schema-migration round + trip. Re-checks `is_file()` every call so a DB created after the + server starts (e.g., user runs the pipeline mid-session) is picked + up without a server restart. + """ + if not JOBS_DB_PATH.is_file(): + return None + store = JobStore(JOBS_DB_PATH) + if JOBS_DB_PATH not in _initialized_jobs_dbs: + await store.init() + _initialized_jobs_dbs.add(JOBS_DB_PATH) + return store + + +def _safe_stem(stem: str) -> str: + """Reject anything that could escape `data/verifier/` via path traversal. + + Bundle stems come from image filenames (e.g. `1990-04apr0106-page25`) + and are unlikely to contain `/`, but a hostile or malformed POST + shouldn't let the verifier server write outside the verifier dir. + Whitespace-only stems are also refused — they'd produce files named + ` .verified.json` which are confusing and almost certainly a bug. + """ + if not stem or not stem.strip() or "/" in stem or "\\" in stem or stem.startswith(".."): + raise HTTPException(status_code=400, detail=f"invalid stem: {stem!r}") + return stem + + +def _atomic_write_text(path: Path, content: str) -> None: + """Write `content` to `path` via a `.tmp` sibling and `os.replace`. 
+ + Two writes on the save path (verified + corrections) — atomic + individual writes mean a partially-failed save leaves either both + files at their pre-save state OR both at the new state, never a + half-updated state where verified.json reflects the edit but + corrections.json doesn't. + """ + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(content) + os.replace(tmp, path) + + +@app.post("/api/lookup") +async def lookup(request: Request) -> JSONResponse: + """Same-origin proxy to request-o-matic's /request endpoint. + + Accepts the same shape request-o-matic does (`{"message": "..."}`) and + forwards verbatim. Adds skip_slack=True so the lookup doesn't post to + Slack. Returns request-o-matic's response unchanged. + """ + try: + payload = await request.json() + except Exception as exc: # noqa: BLE001 + raise HTTPException(status_code=400, detail=f"invalid JSON body: {exc}") from exc + + message = (payload or {}).get("message", "") + if not message: + raise HTTPException(status_code=400, detail="missing 'message' field") + + forward_body = {"message": message, "skip_slack": True} + async with httpx.AsyncClient(timeout=60) as client: + try: + r = await client.post(REQUEST_O_MATIC_URL, json=forward_body) + except httpx.TimeoutException as exc: + raise HTTPException(status_code=504, detail=f"upstream timeout: {exc}") from exc + except httpx.HTTPError as exc: + raise HTTPException(status_code=502, detail=f"upstream error: {exc}") from exc + if r.status_code >= 400: + raise HTTPException(status_code=r.status_code, detail=r.text) + return JSONResponse(r.json()) + + +@app.post("/api/save") +async def save(request: Request) -> JSONResponse: + """Persist a verifier UI session to disk and (optionally) `jobs.db`. + + Expected body shape: + + { + "stem": "", + "pdf_path": "" | null, + "page_number": | null, + "verified": { ...PageResult... }, + "corrections": { ...corrections... } + } + + Writes: + - `data/verifier/.verified.json` (validated as PageResult) + - `data/verifier/.corrections.json` (verbatim JSON) + + If `pdf_path` and `page_number` are present, also calls + `JobStore.mark_verified` to record the verification in `jobs.db`. + For bundles without a job key (test fixtures), only the files are + written and `db_updated` is False in the response. + """ + try: + payload = await request.json() + except Exception as exc: # noqa: BLE001 + raise HTTPException(status_code=400, detail=f"invalid JSON body: {exc}") from exc + + stem = _safe_stem(str(payload.get("stem", ""))) + verified = payload.get("verified") + corrections = payload.get("corrections") + pdf_path = payload.get("pdf_path") + page_number = payload.get("page_number") + if verified is None or corrections is None: + raise HTTPException( + status_code=400, + detail="body must include `verified` and `corrections` objects", + ) + + # Validate verified against PageResult and keep the parsed model so + # the on-disk JSON is the Pydantic-normalized round-trip rather than + # whatever the client happened to send. This makes the verified file + # a canonical representation that bit-matches what the pipeline + # writes, regardless of any extra fields or non-canonical datetime + # formats the client may have included. 
+ try: + validated = PageResult.model_validate(verified) + except Exception as exc: # noqa: BLE001 + raise HTTPException( + status_code=400, detail=f"verified payload not a valid PageResult: {exc}" + ) from exc + + VERIFIER_DIR.mkdir(parents=True, exist_ok=True) + verified_path = VERIFIER_DIR / f"{stem}.verified.json" + corrections_path = VERIFIER_DIR / f"{stem}.corrections.json" + _atomic_write_text(verified_path, validated.model_dump_json(indent=2)) + _atomic_write_text(corrections_path, json.dumps(corrections, indent=2)) + + db_updated = False + # `isinstance(x, int)` is True for `bool` in Python — explicitly reject + # so a malformed `page_number: true` doesn't coerce to 1 and lookup + # the wrong row. + if pdf_path and isinstance(page_number, int) and not isinstance(page_number, bool): + store = await _open_jobs_store() + if store is not None: + db_updated = await store.mark_verified( + pdf_path=pdf_path, + page_number=page_number, + verified_path=verified_path, + corrections_path=corrections_path, + ) + + # Report paths relative to DATA_ROOT.parent so the UI displays + # `data/verifier/.verified.json` whether `data/` lives under + # the repo root (production) or a tmp dir (tests). Always succeeds — + # both written paths are under DATA_ROOT, which is a child of + # DATA_ROOT.parent by construction. + return JSONResponse( + { + "verified_path": str(verified_path.relative_to(DATA_ROOT.parent)), + "corrections_path": str(corrections_path.relative_to(DATA_ROOT.parent)), + "db_updated": db_updated, + } + ) + + +# Static mounts. Each top-level dir we need to serve gets its own mount so +# the URL structure mirrors the repo layout — `image_path` in bundles is +# relative (`../pages/.../page-NN.png`), and the UI fetches relative to +# the bundle URL. `/data` honors the same DATA_ROOT override that writes +# use so the read and write sides stay in sync when DATA_ROOT is moved. +app.mount("/verifier", StaticFiles(directory=REPO_ROOT / "verifier", html=True), name="verifier") +app.mount("/data", StaticFiles(directory=DATA_ROOT, check_dir=False), name="data") +app.mount("/tests", StaticFiles(directory=REPO_ROOT / "tests"), name="tests") + + +def main() -> None: + # Pass the app object directly rather than an import string — the script + # is invoked as a file (verifier/serve.py), not as a package import. 
+ uvicorn.run(app, host="127.0.0.1", port=PORT, log_level="warning") + + +if __name__ == "__main__": + main() diff --git a/verifier/styles.css b/verifier/styles.css new file mode 100644 index 0000000..203c937 --- /dev/null +++ b/verifier/styles.css @@ -0,0 +1,344 @@ +:root { + --bg: #fafaf7; + --fg: #1a1a1a; + --muted: #6b6b6b; + --border: #d0d0c8; + --accent: #2a5fb0; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; +} + +* { box-sizing: border-box; } + +body { + margin: 0; + background: var(--bg); + color: var(--fg); + font-size: 14px; + line-height: 1.4; +} + +body > header { + position: fixed; + top: 0; + left: 0; + right: 0; + z-index: 100; + background: var(--bg); + border-bottom: 1px solid var(--border); + padding: 12px 24px; +} + +body > header h1 { + margin: 0 0 8px 0; + font-size: 18px; + font-weight: 600; +} + +.controls { + display: flex; + gap: 12px; + align-items: center; + flex-wrap: wrap; +} + +.file-picker { + display: inline-block; +} + +.file-picker input[type="file"] { + display: none; +} + +.file-picker span { + display: inline-block; + padding: 4px 12px; + background: white; + border: 1px solid var(--border); + border-radius: 4px; + cursor: pointer; +} + +.file-picker span:hover { background: #f0f0e8; } + +button { + padding: 4px 12px; + background: var(--accent); + color: white; + border: none; + border-radius: 4px; + cursor: pointer; + font-size: 14px; +} + +button:disabled { + background: var(--muted); + cursor: not-allowed; + opacity: 0.6; +} + +#status { + color: var(--muted); + font-size: 13px; +} + +#status.error { color: #b00; } + +main { + padding: 110px 24px 80px 24px; + max-width: 1200px; + margin: 0 auto; + transition: margin-right 0.2s ease; +} + +main.page-view-open { + margin-right: 50vw; + max-width: none; +} + +section { + margin-bottom: 24px; +} + +section h2 { + font-size: 14px; + font-weight: 600; + color: var(--muted); + margin: 0 0 8px 0; + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.meta-grid { + display: grid; + grid-template-columns: 1fr; + gap: 8px; + max-width: 720px; +} + +.meta-grid label { + display: flex; + flex-direction: column; + gap: 4px; + font-size: 12px; + color: var(--muted); +} + +.meta-grid input, +.meta-grid textarea { + font-family: inherit; + font-size: 14px; + padding: 6px 8px; + border: 1px solid var(--border); + border-radius: 4px; + background: white; + color: var(--fg); +} + +.meta-grid textarea { resize: vertical; } + +.quadrant { + background: white; + border: 1px solid var(--border); + border-radius: 6px; + padding: 12px 16px; + margin-bottom: 16px; +} + +.quadrant-header { + display: flex; + gap: 16px; + align-items: center; + flex-wrap: wrap; + margin-bottom: 12px; + padding-bottom: 8px; + border-bottom: 1px solid var(--border); +} + +.quadrant-title { + margin: 0; + font-size: 13px; + font-weight: 600; + color: var(--accent); + text-transform: none; + letter-spacing: 0; + min-width: 110px; +} + +.quadrant-header label { + display: flex; + flex-direction: row; + gap: 6px; + align-items: center; + font-size: 12px; + color: var(--muted); +} + +.quadrant-header input { + font-size: 13px; + padding: 3px 6px; + border: 1px solid var(--border); + border-radius: 3px; + width: 80px; +} + +.rows { display: flex; flex-direction: column; gap: 6px; } + +.row { + display: flex; + flex-direction: column; + gap: 4px; + padding: 6px 0; + border-bottom: 1px dotted var(--border); +} + +.row:last-child { border-bottom: none; } + +.row.deleted { opacity: 0.35; } + +.row.has-notes .notes 
select { + border-color: var(--accent); + background: #eef3fb; + font-weight: 600; +} + +.row-crop { + width: 100%; + height: auto; + background: #f0f0e8; + border: 1px solid var(--border); + border-radius: 3px; + display: block; +} + +.row-crop.no-crop { + height: 40px; + display: flex; + align-items: center; + justify-content: center; + color: var(--muted); + font-style: italic; + font-size: 12px; +} + +.row-fields { + display: flex; + flex-direction: column; + gap: 4px; + min-width: 0; +} + +.lookup-badge { + font-size: 11px; + padding: 3px 8px; + border-radius: 3px; + display: inline-flex; + align-items: center; + gap: 4px; + background: #f0f0e8; + color: var(--muted); + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + border: 1px solid var(--border); + align-self: flex-start; + max-width: 100%; +} + +.lookup-badge.loading { background: #f0f0e8; } +.lookup-badge.hit-strong { background: #e6f5e6; color: #1f6f1f; border-color: #b8e0b8; } +.lookup-badge.hit-weak { background: #fff5e0; color: #8a5a00; border-color: #ecd9a8; } +.lookup-badge.empty { background: #f0f0e8; color: var(--muted); font-style: italic; } +.lookup-badge.error { background: #fee; color: #b00; border-color: #f8c0c0; } +.lookup-badge.stale { opacity: 0.45; font-style: italic; } + +.row-fields .raw-text { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 13px; + padding: 5px 8px; + border: 1px solid var(--border); + border-radius: 3px; + background: white; + color: var(--fg); + width: 100%; +} + +.row-controls { + display: flex; + gap: 8px; + align-items: center; + font-size: 11px; + color: var(--muted); +} + +.row-controls select, +.row-controls input[type="text"] { + font-size: 12px; + padding: 2px 4px; + border: 1px solid var(--border); + border-radius: 3px; + background: white; + font-family: inherit; +} + +.row-controls .type-raw input[type="text"] { + width: 14em; +} + +.row-controls button.delete-row { + padding: 2px 6px; + font-size: 12px; + background: white; + color: var(--muted); + border: 1px solid var(--border); +} + +.row-controls button.delete-row:hover { + background: #fee; + color: #b00; + border-color: #f8c0c0; +} + +/* Page-view side panel: toggled by the "Show page" button in the header. + Fixed to the right edge of the viewport, occupies half the width when + open; content under
reflows via `main.page-view-open`. */ +#page-view { + position: fixed; + top: 100px; + right: 0; + bottom: 0; + width: 50vw; + background: #1a1a1a; + border-left: 1px solid var(--border); + z-index: 50; + overflow: auto; + padding: 12px; + transform: translateX(100%); + transition: transform 0.2s ease; +} + +#page-view.is-open { + transform: translateX(0); +} + +#page-view img { + width: 100%; + height: auto; + display: block; + background: white; +} + +#toggle-page-view.is-active { + background: #1a4080; +} + +.add-row { + margin-top: 8px; + padding: 4px 12px; + background: white; + color: var(--accent); + border: 1px dashed var(--border); + font-size: 12px; +} + +.add-row:hover { + background: #f0f4fa; + border-style: solid; +}