From c83ae59ec3a5b139c272396267b06d45da6feb14 Mon Sep 17 00:00:00 2001
From: Jake Bromberg
Date: Mon, 11 May 2026 09:47:03 -0700
Subject: [PATCH 1/5] feat(verifier): static SPA for manual row-by-row
 verification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a serverless verifier UI that shows each row's cropped image strip
next to its model-detected text in editable fields.

Three new pieces wired together:

- scripts/make_verifier_bundle.py — pre-processor: PageResult JSON +
  page PNG into a bundle.json with per-quadrant and per-row pixel
  bboxes. Continuation-merged entries get a physical-row span so their
  crops cover the wrapped lines instead of nudging subsequent rows out
  of alignment; double_height entries get span=2 inherently.
- scripts/derive_truth.py — verified.json into
  tests/golden/<stem>.truth.json by extracting short uppercased
  substrings (whitespace-tokenized date, 4-char jock prefix, 24-char
  artist portion via parse_artist_track). Substring rules live in
  Python so they're testable in one place.
- verifier/ — static HTML/JS/CSS SPA (no build step). Loads a bundle
  via a ?bundle= URL param or file picker, canvas-crops each row, lets
  the user edit raw_text / type_raw / notes / hour_raw / jock_raw /
  page meta, mark hallucinations (x), and add missed rows (+). Export
  emits two files: <stem>.verified.json (PageResult-shaped, plugs back
  into the pipeline) and <stem>.corrections.json (delta vs the
  immutable bundle snapshot — page/quadrant/row corrections,
  added_rows, deleted_rows).

Also lifts a public partition_row_lines_by_quadrant helper out of
core/page_layout's private detection internals — same row-line
detector, partitioned per quadrant by body_mid_y and column-side ink
density.

Bundle layout: data/verifier/<stem>.bundle.json with image_path
computed as os.path.relpath to data/pages/<collection>/<stem>.png so
bundles are portable. SCHEMA_VERSION = 1 hardcoded; UI rejects unknown
versions.

465 tests pass, ruff/mypy clean.
---
 CLAUDE.md                               |  23 ++
 README.md                               |  24 ++
 core/page_layout.py                     |  54 +++
 scripts/derive_truth.py                 | 135 +++++++
 scripts/make_verifier_bundle.py         | 254 +++++++++++++
 tests/unit/test_derive_truth.py         | 221 +++++++++++
 tests/unit/test_make_verifier_bundle.py | 423 +++++++++++++++++++++
 tests/unit/test_page_layout.py          |  72 ++++
 verifier/README.md                      | 150 ++++++++
 verifier/app.js                         | 468 ++++++++++++++++++++++++
 verifier/index.html                     |  91 +++++
 verifier/styles.css                     | 316 ++++++++++++++++
 12 files changed, 2231 insertions(+)
 create mode 100644 scripts/derive_truth.py
 create mode 100644 scripts/make_verifier_bundle.py
 create mode 100644 tests/unit/test_derive_truth.py
 create mode 100644 tests/unit/test_make_verifier_bundle.py
 create mode 100644 verifier/README.md
 create mode 100644 verifier/app.js
 create mode 100644 verifier/index.html
 create mode 100644 verifier/styles.css

diff --git a/CLAUDE.md b/CLAUDE.md
index 060080d..c732359 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -13,8 +13,15 @@
 scans/   input PDFs (gitignored; SCANS_ROOT)
 data/    outputs (gitignored; DATA_ROOT)
   pages/<collection>/page-NN.png     rendered images
   results/<collection>/page-NN.json  extraction results (one PageResult per page)
+  verifier/<stem>.bundle.json        pre-processor output: result + per-row bboxes
+  verifier/<stem>.verified.json      verifier UI export: hand-corrected PageResult
 jobs.db  SQLite job table
+
+verifier/  static SPA for manual row-by-row verification.
+           Loads a bundle, renders each row's cropped
+           image strip next to an editable text field,
+           exports a corrected verified.json.
+
 core/
   schema.py          Pydantic models.
                      GeminiPageResult is what the model returns
                      (used as response_schema);
@@ -41,12 +48,28 @@ core/
                      PageLayout (header_bottom_y, body_mid_y,
                      column_mid_x). Used by the per-quadrant cropper
                      in scripts/calibrate_models.py.
+                     `partition_row_lines_by_quadrant(image,
+                     layout)` is the public hook the verifier
+                     pre-processor uses to compute per-row bboxes.
   continuations.py   Read-time merge of `notes="continuation"` rows
                      into the prior entry's raw_text. Pure function;
                      on-disk shape unchanged.
   cli.py             Typer entrypoint: `flowsheets <collection>`.
                      Builds dependencies from env, calls into core.
+
+scripts/
+  make_verifier_bundle.py  PageResult JSON + page PNG -> verifier
+                           bundle.json with per-quadrant + per-row
+                           bboxes for the SPA to canvas-crop. Hard-codes
+                           SCHEMA_VERSION = 1; bump on incompatible
+                           schema changes.
+  derive_truth.py          <stem>.verified.json -> <stem>.truth.json
+                           by extracting short uppercased substrings
+                           (page date tokens, jock prefix, artist
+                           portion of raw_text). Single source of
+                           truth for those rules — the UI doesn't
+                           derive truth itself.
 ```
 
 ## Why these choices
diff --git a/README.md b/README.md
index 0e6c71c..0632ff1 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,30 @@ Tests are split into:
 
 The default test run **excludes** the `external_api` and `slow` markers; CI runs the same default. The golden-page external-API runner is a follow-up.
 
+## Manual verifier
+
+After the pipeline produces `data/results/<collection>/page-NN.json`, you can hand-verify and correct entries via the static SPA in `verifier/`. Each row's cropped image strip sits next to its detected text in an editable field. Export emits a `<stem>.verified.json` (`PageResult`-shaped, plugs back into the pipeline as ground truth) and `derive_truth` produces a matching `tests/golden/<stem>.truth.json`.
+
+```bash
+# Generate a bundle
+python -m scripts.make_verifier_bundle \
+  data/results/<collection>/page-NN.json \
+  data/pages/<collection>/page-NN.png \
+  --out data/verifier/<stem>.bundle.json
+
+# Open the verifier
+python -m http.server 8765
+# then visit:
+#   http://localhost:8765/verifier/?bundle=/data/verifier/<stem>.bundle.json
+
+# Derive a truth file from the exported verified.json
+python -m scripts.derive_truth \
+  data/verifier/<stem>.verified.json \
+  --out tests/golden/<stem>.truth.json
+```
+
+See `verifier/README.md` for the bundle schema, expected file layout, and the substring-derivation rules.
+
 ## Cost calibration
 
 Gemini 3.1 Pro charges per input token; one 300-DPI flowsheet page at `media_resolution=high` is ~1120 image tokens plus ~600 prompt tokens. Across the full corpus (~16K pages) input cost lands in the low tens of dollars; output adds modestly. Run the pipeline against a 10–20 page sample first and inspect both quality and `usage_metadata` before scheduling a full run.
diff --git a/core/page_layout.py b/core/page_layout.py
index cfaf48b..6d40948 100644
--- a/core/page_layout.py
+++ b/core/page_layout.py
@@ -31,6 +31,8 @@
 
 import numpy as np
 
+from core.schema import QUADRANT_ORDER, QuadrantPosition
+
 if TYPE_CHECKING:
     from PIL.Image import Image as PILImage
 
@@ -296,3 +298,55 @@ def _detect_body_bottom_y(row_lines: list[int], h: int) -> int:
     if not in_band:
         return int(h * FALLBACK_BODY_BOTTOM_FRACTION)
     return in_band[-1]
+
+
+def partition_row_lines_by_quadrant(
+    image: PILImage, layout: PageLayout
+) -> dict[QuadrantPosition, list[int]]:
+    """Detected row-line y-coords, partitioned by quadrant of the body grid.
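+
+    Usage sketch (y-values illustrative, not from a real page):
+
+        layout = detect_page_layout(image)
+        lines = partition_row_lines_by_quadrant(image, layout)
+        lines["top_left"]  # e.g. [312, 388, 465, ...]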
+
+    Reuses `_detect_row_lines` for the y-coordinates, then classifies each
+    line by which page-column it spans (left, right, or both, based on ink
+    density at that y) and which body band it sits in (top vs bottom, by
+    `layout.body_mid_y`).
+
+    A line spanning both columns is added to BOTH side quadrants — most
+    printed flowsheet grid lines run full-width and bracket both hour-blocks
+    of a row.
+
+    Lines outside `[layout.header_bottom_y, layout.body_bottom_y)` are
+    dropped (header or footer artifacts, not body rows).
+
+    Returns a dict with all four `QUADRANT_ORDER` keys; empty list when
+    no lines hit a quadrant (blank image, un-printed margin).
+    """
+    w, _h = image.size
+    grayscale = np.asarray(image.convert("L"))
+    col_mid = layout.column_mid_x
+
+    all_lines = _detect_row_lines(grayscale, w, col_mid)
+
+    ink = (255 - grayscale).astype(np.float64) / 255.0
+    left_w = float(col_mid)
+    right_w = float(w - col_mid)
+    threshold = _ROW_LINE_THRESHOLDS[-1]
+
+    out: dict[QuadrantPosition, list[int]] = {q: [] for q in QUADRANT_ORDER}
+    for y in all_lines:
+        if not (layout.header_bottom_y <= y < layout.body_bottom_y):
+            continue
+        left_ink = float(ink[y, :col_mid].sum())
+        right_ink = float(ink[y, col_mid:].sum())
+        on_left = left_ink > threshold * left_w
+        on_right = right_ink > threshold * right_w
+        if y < layout.body_mid_y:
+            if on_left:
+                out["top_left"].append(int(y))
+            if on_right:
+                out["top_right"].append(int(y))
+        else:
+            if on_left:
+                out["bottom_left"].append(int(y))
+            if on_right:
+                out["bottom_right"].append(int(y))
+    return out
diff --git a/scripts/derive_truth.py b/scripts/derive_truth.py
new file mode 100644
index 0000000..633eca9
--- /dev/null
+++ b/scripts/derive_truth.py
@@ -0,0 +1,135 @@
+"""Derive a `GoldenTruth` file from a hand-corrected `PageResult`.
+
+The verifier UI exports `<stem>.verified.json` — a `PageResult` whose
+`raw_text` fields have been hand-corrected. This tool extracts short
+substrings from those fields and writes a `GoldenTruth`-shaped file
+that plugs into the existing parity-test harness.
+
+Substring rules (codified to match the convention in `tests/golden/*.truth.json`):
+
+  * page_date_substrings: whitespace-delimited tokens of `page_date_raw`.
+    e.g. "Tues 4/3 90" -> ["Tues", "4/3", "90"]
+  * jock_substring: first whitespace-delimited token of `jock_raw`,
+    uppercased and truncated to 4 chars.
+    e.g. "Andrew" -> "ANDR"
+  * raw_substring (per row): the artist portion of `raw_text`
+    (`parse_artist_track`), uppercased, truncated to <=24 chars at
+    the last whitespace boundary inside the cutoff. If no separator,
+    use the full text.
+
+The substrings are deliberately short — `core.golden._icontains` is a
+case-insensitive substring match, so short tokens are forgiving of
+small misspellings while remaining unambiguous within the WXYC corpus.
+
+CLI:
+
+    python -m scripts.derive_truth \\
+        data/verifier/<stem>.verified.json \\
+        --out tests/golden/<stem>.truth.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+from core.golden import GoldenTruth, QuadrantTruth, RowTruth
+from core.parse import parse_artist_track
+from core.schema import PageResult
+
+_MAX_ROW_SUBSTRING = 24
+_MAX_JOCK_SUBSTRING = 4
+
+
+def _date_substrings(page_date_raw: str | None) -> list[str]:
+    """Split `page_date_raw` into whitespace-delimited tokens.
+
+    Empty list when the field is None, empty, or whitespace-only.
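+
+    Example, matching the rule pinned in the module docstring:
+
+        >>> _date_substrings("Tues 4/3 90")
+        ['Tues', '4/3', '90']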
+ """ + if not page_date_raw: + return [] + return page_date_raw.split() + + +def _jock_substring(jock_raw: str | None) -> str | None: + """First whitespace-delimited token of `jock_raw`, uppercased, + truncated to 4 chars. Returns None when the field is missing so the + truth file omits the assertion entirely. + """ + if not jock_raw or not jock_raw.strip(): + return None + first_token = jock_raw.strip().split()[0] + out = first_token.upper()[:_MAX_JOCK_SUBSTRING] + return out or None + + +def _row_substring(raw_text: str) -> str: + """Artist-portion of `raw_text`, uppercased, capped at 24 chars. + + Falls back to the full raw_text when `parse_artist_track` finds no + separator (entries without "Artist - Track" structure, e.g. a + continuation row that wasn't merged). + + The 24-char cap snaps to the last whitespace boundary inside the + cutoff to avoid mid-word truncation. If the artist is one long + word, hard-cut at 24. + """ + artist, _track = parse_artist_track(raw_text) + src = (artist or raw_text or "").strip().upper() + if len(src) <= _MAX_ROW_SUBSTRING: + return src + cut = src.rfind(" ", 0, _MAX_ROW_SUBSTRING) + return src[:cut] if cut > 0 else src[:_MAX_ROW_SUBSTRING] + + +def derive_truth(page: PageResult) -> GoldenTruth: + """Build a `GoldenTruth` from a hand-corrected `PageResult`. + + Quadrants pass through in canonical order. Entries with empty + `raw_text` are skipped (nothing to match against). + """ + quadrants_out: list[QuadrantTruth] = [] + for quad in page.quadrants: + rows = [ + RowTruth(raw_substring=_row_substring(entry.raw_text)) + for entry in quad.entries + if entry.raw_text.strip() + ] + quadrants_out.append( + QuadrantTruth( + position=quad.position, + hour_raw=quad.hour_raw, + jock_substring=_jock_substring(quad.jock_raw), + rows=rows, + ) + ) + return GoldenTruth( + page_date_substrings=_date_substrings(page.page_date_raw), + quadrants=quadrants_out, + ) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Derive a GoldenTruth file from a verified PageResult.", + ) + parser.add_argument("verified", type=Path, help="Path to the verified.json PageResult.") + parser.add_argument("--out", type=Path, required=True, help="Output truth.json path.") + args = parser.parse_args(argv) + + if not args.verified.is_file(): + print(f"verified file not found: {args.verified}", file=sys.stderr) + return 1 + + page = PageResult.model_validate_json(args.verified.read_text()) + truth = derive_truth(page) + + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text(truth.model_dump_json(indent=2, exclude_defaults=False)) + print(f"wrote {args.out}") + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/make_verifier_bundle.py b/scripts/make_verifier_bundle.py new file mode 100644 index 0000000..932d854 --- /dev/null +++ b/scripts/make_verifier_bundle.py @@ -0,0 +1,254 @@ +"""Pre-processor: turn a `PageResult` + page image into a verifier bundle. + +The bundle is the input the static `verifier/` UI consumes. It contains the +extraction output (verbatim from the pipeline) plus geometry: a bbox per +quadrant, a bbox per row inside each quadrant, and a relative path to the +source image so the UI can canvas-crop each row in the browser. 
+
+CLI surface:
+
+    python -m scripts.make_verifier_bundle \\
+        data/results/<collection>/page-NN.json \\
+        data/pages/<collection>/page-NN.png \\
+        --out data/verifier/<stem>.bundle.json
+
+If `--out` is omitted, the output is written to
+`data/verifier/<image stem>.bundle.json` relative to the current
+working directory.
+
+The bundle is a derivation, not a long-running result — re-running
+overwrites. The pre-processor creates the output's parent directory if
+it doesn't exist.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Any
+
+from PIL import Image
+
+from core.page_layout import PageLayout, detect_page_layout, partition_row_lines_by_quadrant
+from core.schema import QUADRANT_ORDER, Entry, PageResult, QuadrantPosition
+
+# Bump when the bundle JSON schema becomes incompatible.
+# `verifier/README.md` documents the versioning strategy.
+SCHEMA_VERSION = 1
+
+
+BBox = tuple[int, int, int, int]
+
+
+def _quadrant_bboxes(layout: PageLayout, *, page_width: int) -> dict[QuadrantPosition, BBox]:
+    """Bounding box of each quadrant's body region.
+
+    Quadrants partition the body strip (header_bottom_y .. body_bottom_y)
+    via `column_mid_x` (left/right) and `body_mid_y` (top/bottom).
+    """
+    return {
+        "top_left": (0, layout.header_bottom_y, layout.column_mid_x, layout.body_mid_y),
+        "top_right": (layout.column_mid_x, layout.header_bottom_y, page_width, layout.body_mid_y),
+        "bottom_left": (0, layout.body_mid_y, layout.column_mid_x, layout.body_bottom_y),
+        "bottom_right": (
+            layout.column_mid_x,
+            layout.body_mid_y,
+            page_width,
+            layout.body_bottom_y,
+        ),
+    }
+
+
+def _merge_with_spans(entries: list[Entry]) -> list[tuple[Entry, int]]:
+    """Apply continuation-row merging and compute each entry's physical-row span.
+
+    This is the geometry-aware companion to `core.continuations.merge_continuations`:
+    it produces the same merged entries, paired with the number of physical
+    flowsheet rows each logical entry occupies on the page.
+
+    - notes="continuation": folds into the previous logical entry's raw_text
+      (verbatim with the existing merge rules) and adds 1 to its span.
+    - notes="double_height": stays as a single logical entry but spans 2 rows.
+    - All others: span 1.
+
+    A leading "continuation" with nothing above it is preserved as-is with
+    span 1, matching `merge_continuations`'s edge-case behavior.
+
+    Why this lives here and not in `core/continuations.py`: span-tracking is
+    a verifier-geometry concern. The on-disk pipeline doesn't need it.
+    """
+    result: list[tuple[Entry, int]] = []
+    for entry in entries:
+        if entry.notes == "continuation" and result:
+            prior, prior_span = result[-1]
+            joined = f"{prior.raw_text.rstrip()} {entry.raw_text.lstrip()}".strip()
+            merged = prior.model_copy(
+                update={
+                    "raw_text": joined,
+                    "oddities": [*prior.oddities, *entry.oddities],
+                }
+            )
+            result[-1] = (merged, prior_span + 1)
+        elif entry.notes == "double_height":
+            result.append((entry, 2))
+        else:
+            result.append((entry, 1))
+    return result
+
+
+def _assign_row_bboxes(
+    quad_bbox: BBox,
+    lines: list[int],
+    spans: list[int],
+) -> list[BBox]:
+    """Pair logical entries to row strips inside a quadrant.
+
+    `spans` is one int per logical entry: the number of physical row strips
+    that entry occupies on the page (1 for normal entries; 2 for double_height
+    or one continuation; 3 for two continuations; etc.).
+
+    Heuristic:
+    - When `len(lines) >= sum(spans) + 1`, slice consecutive line pairs
+      according to each entry's span.
Entry i's bbox spans from `lines[j]` + to `lines[j + spans[i]]`, with `j` advancing by `spans[i]` between + entries. Trailing lines (beyond what spans require) are ignored. + - Otherwise, even-spacing fallback: divide the quadrant height into + `len(spans)` equal strips, one per logical entry, ignoring the + physical-row count. + + The fallback uses entry count, not physical row count, because uniform + strips are better UX than partial pairing (which would leave the tail + of the quadrant uncropped on entries with wider spans). + """ + if not spans: + return [] + x1, y1, x2, y2 = quad_bbox + total_physical_rows = sum(spans) + if len(lines) >= total_physical_rows + 1: + rows: list[BBox] = [] + j = 0 + for span in spans: + rows.append((x1, lines[j], x2, lines[j + span])) + j += span + return rows + height = y2 - y1 + n_entries = len(spans) + step = height / n_entries + return [ + (x1, y1 + int(round(i * step)), x2, y1 + int(round((i + 1) * step))) + for i in range(n_entries) + ] + + +def make_bundle( + page: PageResult, + *, + image_path: Path, + bundle_path: Path, +) -> dict[str, Any]: + """Assemble the verifier bundle for one page. + + `bundle_path` is used only to compute the relative `image_path` field + — the file isn't written here. The CLI's `main` writes the bundle to + disk; this function is the pure construction step so tests can + inspect the output without filesystem side effects. + """ + image = Image.open(image_path) + layout = detect_page_layout(image) + width, _height = image.size + + quad_boxes = _quadrant_bboxes(layout, page_width=width) + lines_by_quad = partition_row_lines_by_quadrant(image, layout) + + quadrants_out: list[dict[str, Any]] = [] + for position in QUADRANT_ORDER: + # Continuations fold into the previous entry's raw_text; double_height + # stays as one entry. `_merge_with_spans` does the merge and tracks + # how many physical rows each resulting logical entry occupies, so + # the bbox cropper can skip the right number of grid lines per entry. 
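+        # For instance (hypothetical entries): [A, B(notes="continuation"), C]
+        # merges to [(A + " " + B, span=2), (C, span=1)], so C's crop starts
+        # at the third grid line rather than the second.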
+        source_quad = next((q for q in page.quadrants if q.position == position), None)
+        if source_quad is None:
+            continue
+        merged_with_spans = _merge_with_spans(source_quad.entries)
+        bbox = quad_boxes[position]
+        lines = lines_by_quad.get(position, [])
+        spans = [s for _, s in merged_with_spans]
+        row_boxes = _assign_row_bboxes(bbox, lines, spans=spans)
+
+        entries_out: list[dict[str, Any]] = []
+        merged_entries = [e for e, _ in merged_with_spans]
+        for entry, row_bbox in zip(merged_entries, row_boxes, strict=True):
+            entries_out.append(
+                {
+                    "row_index": entry.row_index,
+                    "raw_text": entry.raw_text,
+                    "confidence": entry.confidence,
+                    "type_raw": entry.type_raw,
+                    "notes": entry.notes,
+                    "oddities": list(entry.oddities),
+                    "row_bbox": list(row_bbox),
+                }
+            )
+        quadrants_out.append(
+            {
+                "position": position,
+                "bbox": list(bbox),
+                "hour_raw": source_quad.hour_raw,
+                "jock_raw": source_quad.jock_raw,
+                "entries": entries_out,
+                "oddities": list(source_quad.oddities),
+            }
+        )
+
+    image_rel = os.path.relpath(image_path, bundle_path.parent)
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "stem": image_path.stem,
+        "image_path": image_rel,
+        "model_version": page.model_version,
+        "extracted_at": page.extracted_at.isoformat(),
+        "page_date_raw": page.page_date_raw,
+        "comments_raw": page.comments_raw,
+        "oddities": list(page.oddities),
+        "quadrants": quadrants_out,
+    }
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        description="Build a verifier bundle from a PageResult JSON + page image.",
+    )
+    parser.add_argument("result", type=Path, help="Path to the extraction result JSON.")
+    parser.add_argument("image", type=Path, help="Path to the page PNG.")
+    parser.add_argument(
+        "--out",
+        type=Path,
+        default=None,
+        help=(
+            "Output bundle path. Defaults to "
+            "data/verifier/<image stem>.bundle.json relative to the cwd."
+        ),
+    )
+    args = parser.parse_args(argv)
+
+    if not args.result.is_file():
+        print(f"result not found: {args.result}", file=sys.stderr)
+        return 1
+    if not args.image.is_file():
+        print(f"image not found: {args.image}", file=sys.stderr)
+        return 1
+
+    page = PageResult.model_validate_json(args.result.read_text())
+    out_path = args.out or Path("data/verifier") / f"{args.image.stem}.bundle.json"
+    bundle = make_bundle(page, image_path=args.image, bundle_path=out_path)
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(bundle, indent=2))
+    print(f"wrote {out_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/tests/unit/test_derive_truth.py b/tests/unit/test_derive_truth.py
new file mode 100644
index 0000000..309ffcb
--- /dev/null
+++ b/tests/unit/test_derive_truth.py
@@ -0,0 +1,221 @@
+"""Tests for `scripts/derive_truth.py`.
+
+The truth-derivation tool consumes a `<stem>.verified.json` (PageResult-
+shaped) and emits `<stem>.truth.json` (GoldenTruth-shaped) by extracting
+short substrings from the user-corrected raw_text. Tests pin the
+substring rules and the end-to-end CLI flow.
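+
+For example, a corrected row "Beastie Boys - Sabotage" should pin the
+truth substring "BEASTIE BOYS" (artist portion, uppercased).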
+""" + +from __future__ import annotations + +import json +from datetime import UTC, datetime +from pathlib import Path + +import pytest + +from core.golden import GoldenTruth +from core.schema import QUADRANT_ORDER, Entry, PageResult, Quadrant +from scripts.derive_truth import ( + _date_substrings, + _jock_substring, + _row_substring, + derive_truth, + main, +) + +# -- _date_substrings ------------------------------------------------------- + + +@pytest.mark.parametrize( + ("page_date_raw", "expected"), + [ + ("Tues 4/3 90", ["Tues", "4/3", "90"]), + ("Monday 1 Jan '90", ["Monday", "1", "Jan", "'90"]), + ("", []), + (None, []), + (" ", []), # whitespace-only + ], +) +def test_date_substrings(page_date_raw: str | None, expected: list[str]) -> None: + assert _date_substrings(page_date_raw) == expected + + +# -- _jock_substring -------------------------------------------------------- + + +@pytest.mark.parametrize( + ("jock_raw", "expected"), + [ + ("Andrew", "ANDR"), + ("ANDREW", "ANDR"), + ("Andy J", "ANDY"), # first token only + ("Sam", "SAM"), # shorter than 4 chars passes through + (None, None), + ("", None), + (" ", None), + ], +) +def test_jock_substring(jock_raw: str | None, expected: str | None) -> None: + assert _jock_substring(jock_raw) == expected + + +# -- _row_substring --------------------------------------------------------- + + +@pytest.mark.parametrize( + ("raw_text", "expected"), + [ + # Examples from the plan body, matching existing golden truth convention. + ("Beastie Boys - Sabotage", "BEASTIE BOYS"), + ("Primal Scream - Loaded", "PRIMAL SCREAM"), + ("Bo Diddley - Hey Bo", "BO DIDDLEY"), + ("Elizabeth Cotten - Shake", "ELIZABETH COTTEN"), + ("JUANA MOLINA - la paradoja", "JUANA MOLINA"), + # No separator: full text uppercased, truncated at 24 chars (snap to ws). + ("standalone continuation text here", "STANDALONE CONTINUATION"), + ("short text", "SHORT TEXT"), + # Exactly 24 chars: unchanged. + ("a" * 24, "A" * 24), + # 25 chars no whitespace: hard-cut at 24. + ("a" * 25, "A" * 24), + # Em-dash separator (handled by parse_artist_track). 
+ ("Hermanos Gutiérrez — Aguas Ardientes", "HERMANOS GUTIÉRREZ"), + ], +) +def test_row_substring(raw_text: str, expected: str) -> None: + assert _row_substring(raw_text) == expected + + +# -- derive_truth ----------------------------------------------------------- + + +def _entry(text: str, idx: int = 0) -> Entry: + return Entry(row_index=idx, raw_text=text, confidence="high") + + +def _quad(position: str, jock: str | None, hour: str | None, entries: list[Entry]) -> Quadrant: + return Quadrant( + position=position, # type: ignore[arg-type] + hour_raw=hour, + jock_raw=jock, + entries=entries, + ) + + +def _page(date: str | None, quads: list[Quadrant]) -> PageResult: + return PageResult( + page_date_raw=date, + quadrants=quads, + oddities=[], + model_version="test-verified", + extracted_at=datetime(2026, 5, 10, tzinfo=UTC), + ) + + +def test_derive_truth_returns_golden_truth_with_all_quadrants() -> None: + page = _page( + "Tues 4/3 90", + [ + _quad("top_left", "Andrew", "6AM", [_entry("Primal Scream - Loaded")]), + _quad("top_right", None, "7AM", [_entry("Beastie Boys - Sabotage")]), + _quad("bottom_left", "Andrew", "8AM", [_entry("Bo Diddley - Hey Bo")]), + _quad("bottom_right", None, "9AM", [_entry("Juana Molina - la paradoja")]), + ], + ) + truth = derive_truth(page) + assert isinstance(truth, GoldenTruth) + assert [q.position for q in truth.quadrants] == list(QUADRANT_ORDER) + + +def test_derive_truth_page_date_split_into_tokens() -> None: + page = _page("Tues 4/3 90", [_quad(p, None, None, []) for p in QUADRANT_ORDER]) + truth = derive_truth(page) + assert truth.page_date_substrings == ["Tues", "4/3", "90"] + + +def test_derive_truth_quadrant_substrings_match_rules() -> None: + page = _page( + None, + [ + _quad("top_left", "Andrew", "6AM", [_entry("Primal Scream - Loaded")]), + _quad("top_right", None, None, [_entry("Beastie Boys - Sabotage")]), + _quad("bottom_left", None, None, []), + _quad("bottom_right", None, None, [_entry("Bo Diddley - Hey Bo")]), + ], + ) + truth = derive_truth(page) + by_pos = {q.position: q for q in truth.quadrants} + assert by_pos["top_left"].jock_substring == "ANDR" + assert by_pos["top_left"].hour_raw == "6AM" + assert [r.raw_substring for r in by_pos["top_left"].rows] == ["PRIMAL SCREAM"] + assert by_pos["top_right"].jock_substring is None + assert [r.raw_substring for r in by_pos["top_right"].rows] == ["BEASTIE BOYS"] + assert by_pos["bottom_left"].rows == [] + assert [r.raw_substring for r in by_pos["bottom_right"].rows] == ["BO DIDDLEY"] + + +def test_derive_truth_skips_empty_raw_text_rows() -> None: + """An entry with empty raw_text shouldn't produce a truth row — there's + nothing to match against.""" + page = _page( + None, + [ + _quad("top_left", None, None, [_entry(""), _entry("Primal Scream")]), + _quad("top_right", None, None, []), + _quad("bottom_left", None, None, []), + _quad("bottom_right", None, None, []), + ], + ) + truth = derive_truth(page) + by_pos = {q.position: q for q in truth.quadrants} + assert [r.raw_substring for r in by_pos["top_left"].rows] == ["PRIMAL SCREAM"] + + +# -- main CLI --------------------------------------------------------------- + + +def test_main_writes_truth_file(tmp_path: Path) -> None: + page = _page( + "Tues 4/3 90", + [ + _quad("top_left", "Andrew", "6AM", [_entry("Primal Scream - Loaded")]), + _quad("top_right", None, None, [_entry("Beastie Boys - Sabotage")]), + _quad("bottom_left", None, None, []), + _quad("bottom_right", None, None, []), + ], + ) + verified_path = tmp_path / "verified.json" + 
verified_path.write_text(page.model_dump_json(indent=2)) + + out_path = tmp_path / "out" / "truth.json" + rc = main([str(verified_path), "--out", str(out_path)]) + assert rc == 0 + + truth = GoldenTruth.load(out_path) + assert truth.page_date_substrings == ["Tues", "4/3", "90"] + by_pos = {q.position: q for q in truth.quadrants} + assert [r.raw_substring for r in by_pos["top_left"].rows] == ["PRIMAL SCREAM"] + + +def test_main_returns_one_when_input_missing(tmp_path: Path) -> None: + rc = main([str(tmp_path / "missing.json"), "--out", str(tmp_path / "out.json")]) + assert rc == 1 + + +def test_main_round_trips_through_pydantic(tmp_path: Path) -> None: + """End-to-end: PageResult on disk → derive_truth main → GoldenTruth on + disk → GoldenTruth.load. Pins the export schema.""" + page = _page("Mon 5 May", [_quad(p, None, None, []) for p in QUADRANT_ORDER]) + verified_path = tmp_path / "verified.json" + verified_path.write_text(page.model_dump_json(indent=2)) + + out_path = tmp_path / "truth.json" + main([str(verified_path), "--out", str(out_path)]) + + # Both load the same data. + loaded_from_disk = GoldenTruth.load(out_path) + assert loaded_from_disk.page_date_substrings == ["Mon", "5", "May"] + # Round-trip a raw dict too — extra fields would be caught by extra=forbid. + raw = json.loads(out_path.read_text()) + GoldenTruth.model_validate(raw) diff --git a/tests/unit/test_make_verifier_bundle.py b/tests/unit/test_make_verifier_bundle.py new file mode 100644 index 0000000..9e10649 --- /dev/null +++ b/tests/unit/test_make_verifier_bundle.py @@ -0,0 +1,423 @@ +"""Tests for `scripts/make_verifier_bundle.py`. + +The pre-processor turns a `PageResult` + page image into a `bundle.json` +the verifier UI consumes. Tests cover the geometry helpers, the bbox +assignment heuristic, the bundle assembly, and the CLI. 
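+
+The heaviest-tested case is span-aware line pairing: for example,
+spans=[2, 1] over lines [800, 900, 1000, 1100] crops rows (800, 1000)
+and (1000, 1100).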
+""" + +from __future__ import annotations + +import json +from datetime import UTC, datetime +from pathlib import Path + +import pytest +from PIL import Image + +from core.page_layout import PageLayout +from core.schema import QUADRANT_ORDER, Entry, PageResult, Quadrant +from scripts.make_verifier_bundle import ( + SCHEMA_VERSION, + _assign_row_bboxes, + _merge_with_spans, + _quadrant_bboxes, + main, + make_bundle, +) + + +def _layout( + *, + header_bottom_y: int = 100, + body_mid_y: int = 600, + body_bottom_y: int = 1100, + column_mid_x: int = 500, +) -> PageLayout: + return PageLayout( + header_bottom_y=header_bottom_y, + body_mid_y=body_mid_y, + body_bottom_y=body_bottom_y, + column_mid_x=column_mid_x, + ) + + +def _entry(row_index: int, text: str = "X - Y") -> Entry: + return Entry(row_index=row_index, raw_text=text, confidence="high") + + +def _quad(position: str, n_entries: int) -> Quadrant: + return Quadrant( + position=position, # type: ignore[arg-type] + hour_raw=None, + jock_raw=None, + entries=[_entry(i) for i in range(n_entries)], + ) + + +def _page_result(*, comments: str | None = None) -> PageResult: + return PageResult( + page_date_raw="Mon 1 Jan 90", + quadrants=[_quad(p, 3) for p in QUADRANT_ORDER], + comments_raw=comments, + oddities=[], + model_version="test-model", + extracted_at=datetime(2026, 5, 10, tzinfo=UTC), + ) + + +# -- _quadrant_bboxes ------------------------------------------------------- + + +def test_quadrant_bboxes_returns_all_four_quadrants() -> None: + boxes = _quadrant_bboxes(_layout(), page_width=1000) + assert set(boxes.keys()) == set(QUADRANT_ORDER) + + +def test_quadrant_bboxes_match_layout_math() -> None: + """Each quadrant's bbox is bounded by the corresponding layout + coordinates: column_mid_x splits left/right; body_mid_y splits + top/bottom; header_bottom_y is the top of the body; body_bottom_y + is the bottom.""" + layout = _layout( + header_bottom_y=100, + body_mid_y=600, + body_bottom_y=1100, + column_mid_x=500, + ) + boxes = _quadrant_bboxes(layout, page_width=1000) + assert boxes["top_left"] == (0, 100, 500, 600) + assert boxes["top_right"] == (500, 100, 1000, 600) + assert boxes["bottom_left"] == (0, 600, 500, 1100) + assert boxes["bottom_right"] == (500, 600, 1000, 1100) + + +# -- _assign_row_bboxes ----------------------------------------------------- + + +def test_assign_row_bboxes_clean_pairing() -> None: + """When all spans are 1 and n_lines == n_entries + 1, consecutive line + pairs become row top/bottom for each entry.""" + quad_bbox = (0, 100, 500, 400) # height 300 + lines = [100, 200, 300, 400] # 4 lines -> 3 entries + rows = _assign_row_bboxes(quad_bbox, lines, spans=[1, 1, 1]) + assert rows == [ + (0, 100, 500, 200), + (0, 200, 500, 300), + (0, 300, 500, 400), + ] + + +def test_assign_row_bboxes_extra_lines_ignored() -> None: + """When more lines exist than the entry-spans require, the trailing + lines are ignored.""" + quad_bbox = (0, 100, 500, 700) + lines = [100, 200, 300, 400, 500, 600, 700] # 7 lines + rows = _assign_row_bboxes(quad_bbox, lines, spans=[1, 1, 1]) + assert rows == [ + (0, 100, 500, 200), + (0, 200, 500, 300), + (0, 300, 500, 400), + ] + + +def test_assign_row_bboxes_spans_skip_continuation_rows() -> None: + """When an entry's span is 2 (it absorbed a continuation row or is + double_height), its bbox spans two physical row lines, and the NEXT + entry's bbox starts after the second line. 
This is the load-bearing + behavior for the multiline-entry verifier case.""" + quad_bbox = (0, 800, 1000, 1100) + # Three physical rows: y=800-900, 900-1000, 1000-1100. + # Two logical entries: first spans rows 0-1 (continuation), second is row 2. + lines = [800, 900, 1000, 1100] + rows = _assign_row_bboxes(quad_bbox, lines, spans=[2, 1]) + assert rows == [ + (0, 800, 1000, 1000), # entry 0: spans first TWO physical rows + (0, 1000, 1000, 1100), # entry 1: third physical row, not second + ] + + +def test_assign_row_bboxes_falls_back_to_even_spacing_when_no_lines() -> None: + quad_bbox = (10, 100, 510, 400) # width 500, height 300 + rows = _assign_row_bboxes(quad_bbox, lines=[], spans=[1, 1, 1]) + assert rows == [ + (10, 100, 510, 200), + (10, 200, 510, 300), + (10, 300, 510, 400), + ] + + +def test_assign_row_bboxes_falls_back_to_even_spacing_when_too_few_lines() -> None: + """When detected lines don't cover the total physical row count, even- + spacing fallback divides the quadrant by entry count (not physical + count) — uniform strips are better UX than mis-paired pinned rows.""" + quad_bbox = (0, 100, 500, 700) + rows = _assign_row_bboxes(quad_bbox, lines=[100, 300], spans=[1, 1, 1]) + assert rows == [ + (0, 100, 500, 300), + (0, 300, 500, 500), + (0, 500, 500, 700), + ] + + +def test_assign_row_bboxes_returns_empty_for_zero_entries() -> None: + rows = _assign_row_bboxes((0, 0, 100, 100), lines=[10, 20, 30], spans=[]) + assert rows == [] + + +# -- _merge_with_spans ------------------------------------------------------ + + +def test_merge_with_spans_collapses_continuation_into_span() -> None: + """A continuation entry merges into the previous logical entry and + increments its physical-row span by 1.""" + entries = [ + Entry(row_index=0, raw_text="The Standells - Sometimes Good Guys", confidence="high"), + Entry( + row_index=1, + raw_text="Don't Wear White", + confidence="medium", + notes="continuation", + ), + Entry(row_index=2, raw_text="The Lovedolls - Pearls at Swine", confidence="high"), + ] + result = _merge_with_spans(entries) + assert len(result) == 2 + merged_first, span_first = result[0] + assert merged_first.raw_text == "The Standells - Sometimes Good Guys Don't Wear White" + assert span_first == 2 + merged_second, span_second = result[1] + assert merged_second.raw_text == "The Lovedolls - Pearls at Swine" + assert span_second == 1 + + +def test_merge_with_spans_double_height_counts_as_two() -> None: + """`notes="double_height"` doesn't trigger a merge but spans 2 rows.""" + entries = [ + Entry(row_index=0, raw_text="X - Y", confidence="high", notes="double_height"), + Entry(row_index=1, raw_text="A - B", confidence="high"), + ] + result = _merge_with_spans(entries) + assert [span for _, span in result] == [2, 1] + assert result[0][0].raw_text == "X - Y" + + +def test_merge_with_spans_consecutive_continuations() -> None: + """A single entry can absorb multiple continuation rows; span grows by + one per continuation.""" + entries = [ + Entry(row_index=0, raw_text="Line A", confidence="high"), + Entry(row_index=1, raw_text="Line B", confidence="high", notes="continuation"), + Entry(row_index=2, raw_text="Line C", confidence="high", notes="continuation"), + ] + result = _merge_with_spans(entries) + assert len(result) == 1 + merged, span = result[0] + assert merged.raw_text == "Line A Line B Line C" + assert span == 3 + + +def test_merge_with_spans_leading_continuation_is_preserved() -> None: + """A continuation as the first row has nothing to merge into — stays + as its own 
+    entry with span 1, mirroring `merge_continuations`."""
+    entries = [
+        Entry(row_index=0, raw_text="orphan", confidence="low", notes="continuation"),
+        Entry(row_index=1, raw_text="A - B", confidence="high"),
+    ]
+    result = _merge_with_spans(entries)
+    assert len(result) == 2
+    assert [span for _, span in result] == [1, 1]
+    assert result[0][0].raw_text == "orphan"
+    assert result[0][0].notes == "continuation"
+
+
+def test_merge_with_spans_empty_input() -> None:
+    assert _merge_with_spans([]) == []
+
+
+# -- make_bundle ------------------------------------------------------------
+
+
+def _white_page(tmp_path: Path) -> Path:
+    """A synthetic 1000x1500 white image with a black vertical column
+    divider at x=500. Detection will land near-real coords; beyond that
+    we don't care about per-row exactness — the bundle just needs to
+    assemble without crashing."""
+    image = Image.new("RGB", (1000, 1500), color="white")
+    # Paint the column divider so detect_column_mid_x finds it.
+    for y in range(1500):
+        image.putpixel((500, y), (0, 0, 0))
+    path = tmp_path / "page.png"
+    image.save(path)
+    return path
+
+
+def test_make_bundle_returns_schema_version(tmp_path: Path) -> None:
+    image_path = _white_page(tmp_path)
+    bundle_path = tmp_path / "out" / "verifier" / "page.bundle.json"
+    bundle = make_bundle(_page_result(), image_path=image_path, bundle_path=bundle_path)
+    assert bundle["schema_version"] == SCHEMA_VERSION == 1
+
+
+def test_make_bundle_top_level_fields(tmp_path: Path) -> None:
+    image_path = _white_page(tmp_path)
+    bundle_path = tmp_path / "page.bundle.json"
+    bundle = make_bundle(_page_result(), image_path=image_path, bundle_path=bundle_path)
+    assert bundle["stem"] == "page"
+    assert bundle["page_date_raw"] == "Mon 1 Jan 90"
+    assert bundle["comments_raw"] is None
+    assert bundle["model_version"] == "test-model"
+    assert bundle["oddities"] == []
+    assert len(bundle["quadrants"]) == 4
+
+
+def test_make_bundle_image_path_is_relative_to_bundle_dir(tmp_path: Path) -> None:
+    """The bundle stays portable: image_path is computed via os.path.relpath
+    from the bundle's parent directory to the source image.
+    Tests nested subdirectories — the bundle in data/verifier/, image in
+    data/pages/<collection>/."""
+    data = tmp_path / "data"
+    image_path = data / "pages" / "1990-04apr0106" / "page-05.png"
+    image_path.parent.mkdir(parents=True)
+    image = Image.new("RGB", (1000, 1500), color="white")
+    for y in range(1500):
+        image.putpixel((500, y), (0, 0, 0))
+    image.save(image_path)
+
+    bundle_path = data / "verifier" / "page-05.bundle.json"
+    bundle = make_bundle(_page_result(), image_path=image_path, bundle_path=bundle_path)
+    assert bundle["image_path"] == "../pages/1990-04apr0106/page-05.png"
+
+
+def test_make_bundle_quadrants_in_canonical_order(tmp_path: Path) -> None:
+    image_path = _white_page(tmp_path)
+    bundle = make_bundle(
+        _page_result(), image_path=image_path, bundle_path=tmp_path / "out.bundle.json"
+    )
+    positions = tuple(q["position"] for q in bundle["quadrants"])
+    assert positions == QUADRANT_ORDER
+
+
+def test_make_bundle_each_entry_has_row_bbox(tmp_path: Path) -> None:
+    image_path = _white_page(tmp_path)
+    bundle = make_bundle(
+        _page_result(), image_path=image_path, bundle_path=tmp_path / "out.bundle.json"
+    )
+    for quad in bundle["quadrants"]:
+        for entry in quad["entries"]:
+            assert "row_bbox" in entry
+            bbox = entry["row_bbox"]
+            assert len(bbox) == 4
+            x1, y1, x2, y2 = bbox
+            assert x2 > x1 and y2 > y1, f"degenerate bbox: {bbox}"
+
+
+def test_make_bundle_quadrant_has_bbox(tmp_path: Path) -> None:
+    image_path = _white_page(tmp_path)
+    bundle = make_bundle(
+        _page_result(), image_path=image_path, bundle_path=tmp_path / "out.bundle.json"
+    )
+    for quad in bundle["quadrants"]:
+        assert "bbox" in quad
+        assert len(quad["bbox"]) == 4
+
+
+# -- CLI --------------------------------------------------------------------
+
+
+def _write_minimal_result(path: Path) -> None:
+    page = _page_result()
+    path.write_text(page.model_dump_json(indent=2))
+
+
+def test_main_writes_bundle_to_out_path(tmp_path: Path) -> None:
+    result_path = tmp_path / "result.json"
+    image_path = _white_page(tmp_path)
+    _write_minimal_result(result_path)
+
+    out_path = tmp_path / "out" / "page.bundle.json"
+    rc = main([str(result_path), str(image_path), "--out", str(out_path)])
+    assert rc == 0
+    assert out_path.is_file()
+    bundle = json.loads(out_path.read_text())
+    assert bundle["schema_version"] == 1
+    assert len(bundle["quadrants"]) == 4
+
+
+def test_main_creates_output_parent_directory(tmp_path: Path) -> None:
+    """Pre-processor creates output dirs that don't exist, matching the
+    pattern in core/pipeline.py and core/jobs.py."""
+    result_path = tmp_path / "result.json"
+    image_path = _white_page(tmp_path)
+    _write_minimal_result(result_path)
+
+    out_path = tmp_path / "deeply" / "nested" / "page.bundle.json"
+    assert not out_path.parent.exists()
+
+    rc = main([str(result_path), str(image_path), "--out", str(out_path)])
+    assert rc == 0
+    assert out_path.is_file()
+
+
+def test_main_validates_bundle_against_page_result_shape(tmp_path: Path) -> None:
+    """The bundle must round-trip through PageResult.model_validate
+    after stripping bundle-only fields. This pins the export-schema
+    contract end-to-end."""
+    result_path = tmp_path / "result.json"
+    image_path = _white_page(tmp_path)
+    _write_minimal_result(result_path)
+
+    out_path = tmp_path / "page.bundle.json"
+    main([str(result_path), str(image_path), "--out", str(out_path)])
+
+    bundle = json.loads(out_path.read_text())
+    # Strip bundle-only fields.
+ for key in ("schema_version", "stem", "image_path"): + bundle.pop(key, None) + for quad in bundle["quadrants"]: + quad.pop("bbox", None) + for entry in quad["entries"]: + entry.pop("row_bbox", None) + PageResult.model_validate(bundle) + + +def test_main_returns_nonzero_when_inputs_missing(tmp_path: Path) -> None: + """Missing input file is a usage error, not a crash. Exit 1 lets + shell scripts react cleanly.""" + rc = main( + [ + str(tmp_path / "missing-result.json"), + str(tmp_path / "missing-page.png"), + "--out", + str(tmp_path / "out.bundle.json"), + ] + ) + assert rc == 1 + + +@pytest.mark.parametrize( + ("entry_text", "expected_bbox_count"), + [ + ("Juana Molina - la paradoja", 1), + ("", 1), # blank entries still get a bbox (UI shows them) + ], +) +def test_make_bundle_handles_entry_text_variants( + tmp_path: Path, entry_text: str, expected_bbox_count: int +) -> None: + image_path = _white_page(tmp_path) + result = PageResult( + page_date_raw=None, + quadrants=[ + Quadrant( + position=p, + hour_raw=None, + jock_raw=None, + entries=[Entry(row_index=0, raw_text=entry_text, confidence="high")], + ) + for p in QUADRANT_ORDER + ], + oddities=[], + model_version="t", + extracted_at=datetime(2026, 5, 10, tzinfo=UTC), + ) + bundle = make_bundle(result, image_path=image_path, bundle_path=tmp_path / "b.json") + assert all(len(q["entries"]) == expected_bbox_count for q in bundle["quadrants"]) diff --git a/tests/unit/test_page_layout.py b/tests/unit/test_page_layout.py index be20f86..0d94bad 100644 --- a/tests/unit/test_page_layout.py +++ b/tests/unit/test_page_layout.py @@ -20,7 +20,9 @@ _detect_header_bottom_y, _estimate_row_spacing, detect_page_layout, + partition_row_lines_by_quadrant, ) +from core.schema import QUADRANT_ORDER GOLDEN_DIR = Path(__file__).resolve().parents[1] / "golden" @@ -231,3 +233,73 @@ def test_detect_header_bottom_y_falls_back_when_first_line_too_low() -> None: h = 4200 # 0.3 * h = 1260; first at 1500 is too low to trust. assert _detect_header_bottom_y([1500, 1575, 1650], h) == int(h * FALLBACK_HEADER_FRACTION) + + +# -- partition_row_lines_by_quadrant --------------------------------------- + + +def test_partition_row_lines_returns_quadrant_keys( + golden: tuple[str, Image.Image, dict[str, int]], +) -> None: + """Returned dict has exactly the four quadrant keys in QUADRANT_ORDER.""" + _, image, _ = golden + layout = detect_page_layout(image) + partitions = partition_row_lines_by_quadrant(image, layout) + assert set(partitions.keys()) == set(QUADRANT_ORDER) + + +def test_partition_row_lines_returns_y_coordinates_as_ints( + golden: tuple[str, Image.Image, dict[str, int]], +) -> None: + """Each list value is a pixel y-coordinate (int), matching the contract + of `_detect_row_lines`. 
+    The verifier pre-processor consumes these as crop boundaries, so the
+    integer type is load-bearing."""
+    _, image, _ = golden
+    layout = detect_page_layout(image)
+    partitions = partition_row_lines_by_quadrant(image, layout)
+    for ys in partitions.values():
+        for y in ys:
+            assert isinstance(y, int)
+
+
+def test_partition_row_lines_within_correct_body_band(
+    golden: tuple[str, Image.Image, dict[str, int]],
+) -> None:
+    """All returned y-coords fall within the body region and on the
+    correct side of body_mid_y for their quadrant."""
+    _, image, _ = golden
+    layout = detect_page_layout(image)
+    partitions = partition_row_lines_by_quadrant(image, layout)
+    for pos in ("top_left", "top_right"):
+        for y in partitions[pos]:
+            assert layout.header_bottom_y <= y < layout.body_mid_y, (
+                f"{pos}: y={y} outside top-band [{layout.header_bottom_y}, {layout.body_mid_y})"
+            )
+    for pos in ("bottom_left", "bottom_right"):
+        for y in partitions[pos]:
+            assert layout.body_mid_y <= y < layout.body_bottom_y, (
+                f"{pos}: y={y} outside bottom-band [{layout.body_mid_y}, {layout.body_bottom_y})"
+            )
+
+
+def test_partition_row_lines_finds_content_in_top_band(
+    golden: tuple[str, Image.Image, dict[str, int]],
+) -> None:
+    """All 5 goldens have detected lines somewhere in the top body band —
+    the printed grid alone is ~9 lines per quadrant, so at least one side
+    of the top band must come back populated."""
+    stem, image, _ = golden
+    layout = detect_page_layout(image)
+    partitions = partition_row_lines_by_quadrant(image, layout)
+    total_top = len(partitions["top_left"]) + len(partitions["top_right"])
+    assert total_top > 0, f"{stem}: no row lines detected in top band"
+
+
+def test_partition_row_lines_handles_blank_image() -> None:
+    """A blank image returns four empty lists — no crash, no missing keys."""
+    blank = Image.new("RGB", (1000, 1500), color="white")
+    layout = detect_page_layout(blank)
+    partitions = partition_row_lines_by_quadrant(blank, layout)
+    assert set(partitions.keys()) == set(QUADRANT_ORDER)
+    for ys in partitions.values():
+        assert ys == []
diff --git a/verifier/README.md b/verifier/README.md
new file mode 100644
index 0000000..16a2d3b
--- /dev/null
+++ b/verifier/README.md
@@ -0,0 +1,150 @@
+# Flowsheet verifier UI
+
+A static, dependency-free single-page app for manually verifying flowsheet extraction output. Each row's cropped image strip is shown next to the model-detected text in an editable field. Hand-correct typos, mark hallucinated rows, add missed rows, then export a `verified.json` that flows back into the pipeline as ground truth.
+
+## Run
+
+The UI is static HTML + JS + CSS. It needs a local HTTP server so the browser can fetch the bundle JSON and the page image relative to it.
+
+```bash
+# from the repo root
+python -m http.server 8765
+
+# then open in a browser:
+open "http://localhost:8765/verifier/?bundle=/data/verifier/<stem>.bundle.json"
+```
+
+The `?bundle=...` URL param is the recommended path: the UI fetches the bundle, then resolves the bundle's `image_path` (relative path inside the JSON) and fetches the image too.
+
+You can also load a bundle via the **Load bundle** file picker, in which case a second **Load image** picker appears. This path works without a server, but you must pick both files manually.
+
+## File layout
+
+The bundle's `image_path` is **relative to the bundle file's directory**.
+The expected layout under the repo's `data/` directory:
+
+```
+data/
+  pages/<collection>/<stem>.png     # source image
+  results/<collection>/<stem>.json  # pipeline output (input to make_verifier_bundle)
+  verifier/<stem>.bundle.json       # pre-processor output, references ../pages/<collection>/<stem>.png
+  verifier/<stem>.verified.json     # UI export (download to this directory by convention)
+tests/golden/<stem>.truth.json      # derive_truth output (optional destination)
+```
+
+## End-to-end workflow
+
+1. Run the pipeline to produce `data/results/<collection>/<stem>.json`.
+2. Generate a bundle:
+
+   ```bash
+   python -m scripts.make_verifier_bundle \
+     data/results/<collection>/<stem>.json \
+     data/pages/<collection>/<stem>.png \
+     --out data/verifier/<stem>.bundle.json
+   ```
+
+3. Open the verifier and load the bundle.
+4. Walk the page: each row shows a cropped image strip + the model's `raw_text`. Correct typos, set `type` and `notes` when needed, click ✗ to mark hallucinations, click **+ add row** to insert a row the model missed.
+5. Edit the page-level fields: `page_date_raw`, `comments_raw`, `oddities`.
+6. Click **Export verified + corrections** → downloads `<stem>.verified.json` and `<stem>.corrections.json`. Move them to `data/verifier/`.
+7. (Optional) Derive a `tests/golden/*.truth.json`:
+
+   ```bash
+   python -m scripts.derive_truth \
+     data/verifier/<stem>.verified.json \
+     --out tests/golden/<stem>.truth.json
+   ```
+
+## Bundle schema
+
+```json
+{
+  "schema_version": 1,
+  "stem": "<stem>",
+  "image_path": "<relative path to the page PNG>",
+  "model_version": "<model id>",
+  "extracted_at": "<ISO-8601 timestamp>",
+  "page_date_raw": "...",
+  "comments_raw": "...",
+  "oddities": ["..."],
+  "quadrants": [
+    {
+      "position": "top_left",
+      "bbox": [x1, y1, x2, y2],
+      "hour_raw": "6AM",
+      "jock_raw": "Andrew",
+      "entries": [
+        {
+          "row_index": 0,
+          "raw_text": "...",
+          "confidence": "high",
+          "type_raw": "M",
+          "notes": null,
+          "oddities": [],
+          "row_bbox": [x1, y1, x2, y2]
+        }
+      ],
+      "oddities": []
+    }
+  ]
+}
+```
+
+### Versioning
+
+`schema_version` is currently `1`. Future incompatible changes bump the version; the UI shows an error banner if it sees an unsupported version. Keep `schema_version` set when archiving bundles so older bundles remain loadable.
+
+## Exports
+
+Clicking **Export verified + corrections** downloads two files in sequence:
+
+1. `<stem>.verified.json` — `PageResult`-shaped JSON validating against `core.schema.PageResult`. Bundle-only fields (`schema_version`, `stem`, `image_path`, per-entry `row_bbox`) are stripped. Rows marked ✗ are excluded. Rows added via **+ add row** are included.
+2. `<stem>.corrections.json` — the delta between the loaded bundle and the verified export, plus the set of rows the user reviewed:
+
+```json
+{
+  "stem": "...",
+  "model_version": "...",
+  "extracted_at": "...",
+  "exported_at": "...",
+  "page_corrections": [
+    {"field": "page_date_raw", "original": "...", "corrected": "..."}
+  ],
+  "quadrant_corrections": [
+    {"position": "top_left", "field": "hour_raw", "original": "6AM", "corrected": "6PM"}
+  ],
+  "row_corrections": [
+    {"position": "top_left", "row_index": 0, "field": "raw_text",
+     "original": "Smiths-I wnat", "corrected": "Smiths-I want the one I can't have"}
+  ],
+  "verified_rows": [
+    {"position": "top_left", "row_index": 0}
+  ],
+  "added_rows": [
+    {"position": "top_left", "row_index": 12, "raw_text": "...",
+     "type_raw": null, "notes": null}
+  ],
+  "deleted_rows": [
+    {"position": "top_left", "row_index": 7, "original_raw_text": "..."}
+  ]
+}
+```
+
+The verified.json is the consumable artifact (plugs back into the pipeline as ground truth).
+The corrections.json is the audit record (preserves the original model output for analysis, separates "user reviewed and accepted" from "user reviewed and corrected" from "user never touched").
+
+**Verification semantics**:
+- A row's `verified` checkbox is **off** by default.
+- Editing any field on a row (raw_text, type, notes) **auto-sets** `verified` to on.
+- Toggling ✗ (delete) auto-sets `verified` to on (a deliberate action).
+- The user can manually flip the checkbox to mark an unchanged row as reviewed.
+- Rows added via **+ add row** are implicitly verified (they were typed by the user).
+
+Truth derivation is a **separate Python tool** (`scripts/derive_truth.py`) rather than a UI button — the substring-extraction rules live in one place (Python, testable), not duplicated in JS.
+
+## Known rough edges (v1)
+
+- **No autosave / localStorage.** Close the tab and unsaved edits are lost. Export before navigating away.
+- **No batch loader.** One bundle at a time.
+- **No keyboard shortcuts.** Mouse-driven only.
+- **Confidence is not editable.** That field is a model artifact, not user truth.
+- **Row crops use detected grid lines when available, even spacing otherwise.** A quadrant where the model over-emitted rows (more entries than handwritten lines) will show vertically squashed crops — visible but possibly mis-cropped at boundaries. Eye your way through it.
diff --git a/verifier/app.js b/verifier/app.js
new file mode 100644
index 0000000..0e3c34d
--- /dev/null
+++ b/verifier/app.js
@@ -0,0 +1,468 @@
+// Flowsheet verifier — vanilla JS, no build step.
+//
+// Loads a bundle.json (produced by scripts/make_verifier_bundle.py) plus
+// the page image it references, renders per-row canvas crops next to
+// editable text fields, and exports:
+//   1. <stem>.verified.json — PageResult-shaped corrected page
+//   2. <stem>.corrections.json — delta vs the original bundle, plus the
+//      set of rows the user marked verified
+//
+// Two load paths are supported:
+//   1. Server-served bundle: fetch(bundle) then fetch(image) by relative
+//      URL. Used when the page is opened via `python -m http.server`.
+//   2. File-picker bundle: read the bundle as text, then prompt for the
+//      image file separately.
+//
+// State is split:
+//   state.originalBundle — immutable snapshot of the loaded bundle. Never
+//       mutated; used as the diff baseline on export.
+//   state.bundle — working copy. Mutated by edits and UI flags
+//       (`_verified`, `_added`, `_deleted`).
+
+"use strict";
+
+const SUPPORTED_SCHEMA_VERSION = 1;
+
+const state = {
+  bundle: null,         // mutable working copy
+  originalBundle: null, // immutable snapshot for diffing
+  pageImage: null,      // HTMLImageElement
+};
+
+const $ = (sel, root = document) => root.querySelector(sel);
+
+function setStatus(msg, kind = "info") {
+  const el = $("#status");
+  el.textContent = msg;
+  el.className = kind === "error" ?
"error" : ""; +} + +function cloneDeep(obj) { + return JSON.parse(JSON.stringify(obj)); +} + +// ---- bundle loading ------------------------------------------------------ + +async function loadBundleFromUrlParam() { + const params = new URLSearchParams(location.search); + const path = params.get("bundle"); + if (!path) return false; + try { + const r = await fetch(path); + if (!r.ok) throw new Error(`fetch ${path}: ${r.status}`); + const bundle = await r.json(); + await initBundle(bundle, { bundleUrl: path }); + return true; + } catch (err) { + setStatus(`Failed to load bundle: ${err.message}`, "error"); + return false; + } +} + +async function loadBundleFromFile(file) { + try { + const text = await file.text(); + const bundle = JSON.parse(text); + await initBundle(bundle, { bundleUrl: null }); + } catch (err) { + setStatus(`Failed to parse bundle: ${err.message}`, "error"); + } +} + +async function initBundle(bundle, { bundleUrl }) { + if (bundle.schema_version !== SUPPORTED_SCHEMA_VERSION) { + setStatus( + `Unsupported schema_version ${bundle.schema_version}; ` + + `this UI supports v${SUPPORTED_SCHEMA_VERSION}.`, + "error" + ); + return; + } + state.originalBundle = cloneDeep(bundle); + state.bundle = cloneDeep(bundle); + state.pageImage = null; + + if (bundleUrl) { + const imageUrl = new URL(bundle.image_path, new URL(bundleUrl, location.href)); + state.pageImage = await loadImage(imageUrl.href); + finishInit(); + } else { + $("#image-picker").hidden = false; + setStatus("Bundle loaded. Pick the page image to continue."); + } +} + +function loadImage(src) { + return new Promise((resolve, reject) => { + const img = new Image(); + img.onload = () => resolve(img); + img.onerror = () => reject(new Error(`failed to load image ${src}`)); + img.src = src; + }); +} + +async function loadImageFromFile(file) { + const url = URL.createObjectURL(file); + try { + state.pageImage = await loadImage(url); + finishInit(); + } catch (err) { + setStatus(`Failed to load image: ${err.message}`, "error"); + } +} + +function finishInit() { + setStatus( + `Loaded ${state.bundle.stem} ` + + `(${state.pageImage.naturalWidth}×${state.pageImage.naturalHeight}px).` + ); + $("#app").hidden = false; + $("#export-verified").disabled = false; + $("#toggle-page-view").disabled = false; + $("#page-view-img").src = state.pageImage.src; + renderPageMeta(); + renderQuadrants(); +} + +// ---- render: page meta --------------------------------------------------- + +function renderPageMeta() { + const dateEl = $("#page-date-raw"); + dateEl.value = state.bundle.page_date_raw ?? ""; + dateEl.addEventListener("input", () => { + state.bundle.page_date_raw = dateEl.value || null; + }); + + const commentsEl = $("#comments-raw"); + commentsEl.value = state.bundle.comments_raw ?? ""; + commentsEl.addEventListener("input", () => { + state.bundle.comments_raw = commentsEl.value || null; + }); + + const oddEl = $("#oddities"); + oddEl.value = (state.bundle.oddities ?? 
[]).join("\n"); + oddEl.addEventListener("input", () => { + state.bundle.oddities = oddEl.value + .split("\n") + .map(s => s.trim()) + .filter(Boolean); + }); +} + +// ---- render: quadrants --------------------------------------------------- + +function renderQuadrants() { + const container = $("#quadrants-container"); + container.innerHTML = ""; + const tmpl = $("#quadrant-template"); + + for (const quad of state.bundle.quadrants) { + const node = tmpl.content.firstElementChild.cloneNode(true); + $(".quadrant-title", node).textContent = quad.position; + + const hourEl = $(".hour-raw", node); + hourEl.value = quad.hour_raw ?? ""; + hourEl.addEventListener("input", () => { + quad.hour_raw = hourEl.value || null; + }); + + const jockEl = $(".jock-raw", node); + jockEl.value = quad.jock_raw ?? ""; + jockEl.addEventListener("input", () => { + quad.jock_raw = jockEl.value || null; + }); + + const rowsEl = $(".rows", node); + for (const entry of quad.entries) { + rowsEl.appendChild(buildRow(entry, quad)); + } + + $(".add-row", node).addEventListener("click", () => { + const newEntry = { + row_index: quad.entries.length, + raw_text: "", + confidence: "low", + type_raw: null, + notes: null, + oddities: [], + row_bbox: null, + _added: true, + }; + quad.entries.push(newEntry); + rowsEl.appendChild(buildRow(newEntry, quad)); + }); + + container.appendChild(node); + } +} + +function buildRow(entry, quad) { + const tmpl = $("#row-template"); + const node = tmpl.content.firstElementChild.cloneNode(true); + node.dataset.rowIndex = String(entry.row_index); + + const canvas = $(".row-crop", node); + if (entry.row_bbox) { + drawCrop(canvas, entry.row_bbox); + } else { + canvas.outerHTML = `
<div class="row-crop no-crop">no crop (added row)</div>
`; + } + + const textEl = $(".raw-text", node); + textEl.value = entry.raw_text; + textEl.addEventListener("input", () => { + entry.raw_text = textEl.value; + }); + + const typeEl = $(".type-raw input", node); + typeEl.value = entry.type_raw ?? ""; + typeEl.addEventListener("input", () => { + entry.type_raw = typeEl.value || null; + }); + + const notesEl = $(".notes select", node); + notesEl.value = entry.notes ?? ""; + notesEl.addEventListener("change", () => { + entry.notes = notesEl.value || null; + }); + + $(".delete-row", node).addEventListener("click", () => { + entry._deleted = !entry._deleted; + node.classList.toggle("deleted", entry._deleted); + }); + + return node; +} + +function drawCrop(canvas, bbox) { + const [x1, y1, x2, y2] = bbox; + const srcW = x2 - x1; + const srcH = y2 - y1; + if (srcW <= 0 || srcH <= 0) { + canvas.outerHTML = `
<div class="row-crop no-crop">empty bbox</div>
`; + return; + } + canvas.width = srcW; + canvas.height = srcH; + // Let CSS govern display size — the canvas's intrinsic aspect ratio is + // preserved by `width: 100%; height: auto` in styles.css. This makes the + // crop fill the available column width (full row when the side panel is + // closed; narrower when the page-view panel pushes the editor). + const ctx = canvas.getContext("2d"); + ctx.drawImage(state.pageImage, x1, y1, srcW, srcH, 0, 0, srcW, srcH); +} + +// ---- export: PageResult verified.json ----------------------------------- + +function buildVerifiedExport() { + // Strip bundle-only fields, per-entry row_bbox, and UI flags. Validates + // as PageResult directly. + return { + page_date_raw: state.bundle.page_date_raw, + quadrants: state.bundle.quadrants.map(quad => ({ + position: quad.position, + hour_raw: quad.hour_raw, + jock_raw: quad.jock_raw, + entries: quad.entries + .filter(e => !e._deleted) + .map(e => ({ + row_index: e.row_index, + raw_text: e.raw_text, + type_raw: e.type_raw, + confidence: e.confidence, + notes: e.notes, + oddities: e.oddities ?? [], + })), + oddities: quad.oddities ?? [], + })), + comments_raw: state.bundle.comments_raw, + oddities: state.bundle.oddities ?? [], + model_version: state.bundle.model_version, + extracted_at: state.bundle.extracted_at, + }; +} + +// ---- export: corrections.json (delta) ----------------------------------- + +// Fields that participate in row-level correction tracking. row_bbox is +// derived geometry, not user-editable text, so it never appears as a +// correction. confidence is model output, not user truth. +const ROW_TRACKED_FIELDS = ["raw_text", "type_raw", "notes"]; + +// Page-level and quadrant-level fields the verifier exposes for editing. +const PAGE_TRACKED_FIELDS = ["page_date_raw", "comments_raw"]; +const QUADRANT_TRACKED_FIELDS = ["hour_raw", "jock_raw"]; + +function arraysEqual(a, b) { + if (a == null && b == null) return true; + if (a == null || b == null) return false; + if (a.length !== b.length) return false; + for (let i = 0; i < a.length; i++) if (a[i] !== b[i]) return false; + return true; +} + +function findOriginalEntry(quadPosition, rowIndex) { + const quad = state.originalBundle.quadrants.find(q => q.position === quadPosition); + if (!quad) return null; + return quad.entries.find(e => e.row_index === rowIndex) ?? null; +} + +function findOriginalQuadrant(position) { + return state.originalBundle.quadrants.find(q => q.position === position) ?? null; +} + +function buildCorrectionsExport() { + const page_corrections = []; + for (const field of PAGE_TRACKED_FIELDS) { + const orig = state.originalBundle[field] ?? null; + const cur = state.bundle[field] ?? null; + if (orig !== cur) { + page_corrections.push({ field, original: orig, corrected: cur }); + } + } + if (!arraysEqual(state.originalBundle.oddities ?? [], state.bundle.oddities ?? [])) { + page_corrections.push({ + field: "oddities", + original: state.originalBundle.oddities ?? [], + corrected: state.bundle.oddities ?? [], + }); + } + + const quadrant_corrections = []; + const row_corrections = []; + const added_rows = []; + const deleted_rows = []; + + for (const quad of state.bundle.quadrants) { + const origQuad = findOriginalQuadrant(quad.position); + if (origQuad) { + for (const field of QUADRANT_TRACKED_FIELDS) { + const orig = origQuad[field] ?? null; + const cur = quad[field] ?? 
null; + if (orig !== cur) { + quadrant_corrections.push({ + position: quad.position, + field, + original: orig, + corrected: cur, + }); + } + } + } + + for (const entry of quad.entries) { + // Added-and-then-deleted: dropped entirely, no signal worth keeping. + if (entry._added && entry._deleted) continue; + + if (entry._added) { + added_rows.push({ + position: quad.position, + row_index: entry.row_index, + raw_text: entry.raw_text, + type_raw: entry.type_raw, + notes: entry.notes, + }); + continue; + } + + if (entry._deleted) { + const orig = findOriginalEntry(quad.position, entry.row_index); + deleted_rows.push({ + position: quad.position, + row_index: entry.row_index, + original_raw_text: orig?.raw_text ?? null, + }); + continue; + } + + // Existing, not deleted: emit corrections per changed field. + const orig = findOriginalEntry(quad.position, entry.row_index); + if (orig) { + for (const field of ROW_TRACKED_FIELDS) { + const origVal = orig[field] ?? null; + const curVal = entry[field] ?? null; + if (origVal !== curVal) { + row_corrections.push({ + position: quad.position, + row_index: entry.row_index, + field, + original: origVal, + corrected: curVal, + }); + } + } + } + } + } + + return { + stem: state.bundle.stem, + model_version: state.bundle.model_version, + extracted_at: state.bundle.extracted_at, + exported_at: new Date().toISOString(), + page_corrections, + quadrant_corrections, + row_corrections, + added_rows, + deleted_rows, + }; +} + +// ---- file-download helpers ----------------------------------------------- + +function downloadJson(filename, data) { + const blob = new Blob([JSON.stringify(data, null, 2)], { type: "application/json" }); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = filename; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); +} + +function exportAll() { + const verified = buildVerifiedExport(); + const corrections = buildCorrectionsExport(); + downloadJson(`${state.bundle.stem}.verified.json`, verified); + downloadJson(`${state.bundle.stem}.corrections.json`, corrections); + const n = + corrections.row_corrections.length + + corrections.page_corrections.length + + corrections.quadrant_corrections.length; + setStatus( + `Exported verified + corrections (${n} field correction(s), ` + + `${corrections.added_rows.length} added, ` + + `${corrections.deleted_rows.length} deleted).` + ); +} + +function togglePageView() { + const aside = $("#page-view"); + const main = $("main"); + const btn = $("#toggle-page-view"); + const open = !aside.classList.contains("is-open"); + aside.classList.toggle("is-open", open); + main.classList.toggle("page-view-open", open); + btn.classList.toggle("is-active", open); + aside.setAttribute("aria-hidden", String(!open)); + btn.textContent = open ? 
"Hide page" : "Show page"; +} + +// ---- wiring -------------------------------------------------------------- + +document.addEventListener("DOMContentLoaded", async () => { + $("#bundle-input").addEventListener("change", (e) => { + const file = e.target.files?.[0]; + if (file) loadBundleFromFile(file); + }); + $("#image-input").addEventListener("change", (e) => { + const file = e.target.files?.[0]; + if (file) loadImageFromFile(file); + }); + $("#export-verified").addEventListener("click", exportAll); + $("#toggle-page-view").addEventListener("click", togglePageView); + + await loadBundleFromUrlParam(); +}); diff --git a/verifier/index.html b/verifier/index.html new file mode 100644 index 0000000..a382037 --- /dev/null +++ b/verifier/index.html @@ -0,0 +1,91 @@ + + + + + + Flowsheet verifier + + + +
+

<h1>Flowsheet verifier</h1>

+
+      <label class="file-picker"><input type="file" id="bundle-input"><span>Load bundle</span></label>
+      <label class="file-picker" id="image-picker" hidden><input type="file" id="image-input"><span>Load image</span></label>
+      <button id="export-verified" disabled>Export verified + corrections</button>
+      <button id="toggle-page-view" disabled>Show page</button>
+      <span id="status">Pick a bundle.json to begin.</span>
+
+ + + +
+
+

Page

+
+ + + +
+
+ +
+ +
+
+ + + + + + + + diff --git a/verifier/styles.css b/verifier/styles.css new file mode 100644 index 0000000..8068230 --- /dev/null +++ b/verifier/styles.css @@ -0,0 +1,316 @@ +:root { + --bg: #fafaf7; + --fg: #1a1a1a; + --muted: #6b6b6b; + --border: #d0d0c8; + --accent: #2a5fb0; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; +} + +* { box-sizing: border-box; } + +body { + margin: 0; + background: var(--bg); + color: var(--fg); + font-size: 14px; + line-height: 1.4; +} + +body > header { + position: fixed; + top: 0; + left: 0; + right: 0; + z-index: 100; + background: var(--bg); + border-bottom: 1px solid var(--border); + padding: 12px 24px; +} + +body > header h1 { + margin: 0 0 8px 0; + font-size: 18px; + font-weight: 600; +} + +.controls { + display: flex; + gap: 12px; + align-items: center; + flex-wrap: wrap; +} + +.file-picker { + display: inline-block; +} + +.file-picker input[type="file"] { + display: none; +} + +.file-picker span { + display: inline-block; + padding: 4px 12px; + background: white; + border: 1px solid var(--border); + border-radius: 4px; + cursor: pointer; +} + +.file-picker span:hover { background: #f0f0e8; } + +button { + padding: 4px 12px; + background: var(--accent); + color: white; + border: none; + border-radius: 4px; + cursor: pointer; + font-size: 14px; +} + +button:disabled { + background: var(--muted); + cursor: not-allowed; + opacity: 0.6; +} + +#status { + color: var(--muted); + font-size: 13px; +} + +#status.error { color: #b00; } + +main { + padding: 110px 24px 80px 24px; + max-width: 1200px; + margin: 0 auto; + transition: margin-right 0.2s ease; +} + +main.page-view-open { + margin-right: 50vw; + max-width: none; +} + +section { + margin-bottom: 24px; +} + +section h2 { + font-size: 14px; + font-weight: 600; + color: var(--muted); + margin: 0 0 8px 0; + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.meta-grid { + display: grid; + grid-template-columns: 1fr; + gap: 8px; + max-width: 720px; +} + +.meta-grid label { + display: flex; + flex-direction: column; + gap: 4px; + font-size: 12px; + color: var(--muted); +} + +.meta-grid input, +.meta-grid textarea { + font-family: inherit; + font-size: 14px; + padding: 6px 8px; + border: 1px solid var(--border); + border-radius: 4px; + background: white; + color: var(--fg); +} + +.meta-grid textarea { resize: vertical; } + +.quadrant { + background: white; + border: 1px solid var(--border); + border-radius: 6px; + padding: 12px 16px; + margin-bottom: 16px; +} + +.quadrant-header { + display: flex; + gap: 16px; + align-items: center; + flex-wrap: wrap; + margin-bottom: 12px; + padding-bottom: 8px; + border-bottom: 1px solid var(--border); +} + +.quadrant-title { + margin: 0; + font-size: 13px; + font-weight: 600; + color: var(--accent); + text-transform: none; + letter-spacing: 0; + min-width: 110px; +} + +.quadrant-header label { + display: flex; + flex-direction: row; + gap: 6px; + align-items: center; + font-size: 12px; + color: var(--muted); +} + +.quadrant-header input { + font-size: 13px; + padding: 3px 6px; + border: 1px solid var(--border); + border-radius: 3px; + width: 80px; +} + +.rows { display: flex; flex-direction: column; gap: 6px; } + +.row { + display: flex; + flex-direction: column; + gap: 4px; + padding: 6px 0; + border-bottom: 1px dotted var(--border); +} + +.row:last-child { border-bottom: none; } + +.row.deleted { opacity: 0.35; } + +.row-crop { + width: 100%; + height: auto; + background: #f0f0e8; + border: 1px solid var(--border); + 
border-radius: 3px; + display: block; +} + +.row-crop.no-crop { + height: 40px; + display: flex; + align-items: center; + justify-content: center; + color: var(--muted); + font-style: italic; + font-size: 12px; +} + +.row-fields { + display: flex; + flex-direction: column; + gap: 4px; + min-width: 0; +} + +.row-fields .raw-text { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 13px; + padding: 5px 8px; + border: 1px solid var(--border); + border-radius: 3px; + background: white; + color: var(--fg); + width: 100%; +} + +.row-controls { + display: flex; + gap: 8px; + align-items: center; + font-size: 11px; + color: var(--muted); +} + +.row-controls select, +.row-controls input[type="text"] { + font-size: 12px; + padding: 2px 4px; + border: 1px solid var(--border); + border-radius: 3px; + background: white; + font-family: inherit; +} + +.row-controls .type-raw input[type="text"] { + width: 14em; +} + +.row-controls button.delete-row { + padding: 2px 6px; + font-size: 12px; + background: white; + color: var(--muted); + border: 1px solid var(--border); +} + +.row-controls button.delete-row:hover { + background: #fee; + color: #b00; + border-color: #f8c0c0; +} + +/* Page-view side panel: toggled by the "Show page" button in the header. + Fixed to the right edge of the viewport, occupies half the width when + open; content under
reflows via `main.page-view-open`. */ +#page-view { + position: fixed; + top: 100px; + right: 0; + bottom: 0; + width: 50vw; + background: #1a1a1a; + border-left: 1px solid var(--border); + z-index: 50; + overflow: auto; + padding: 12px; + transform: translateX(100%); + transition: transform 0.2s ease; +} + +#page-view.is-open { + transform: translateX(0); +} + +#page-view img { + width: 100%; + height: auto; + display: block; + background: white; +} + +#toggle-page-view.is-active { + background: #1a4080; +} + +.add-row { + margin-top: 8px; + padding: 4px 12px; + background: white; + color: var(--accent); + border: 1px dashed var(--border); + font-size: 12px; +} + +.add-row:hover { + background: #f0f4fa; + border-style: solid; +} From b2254fdf45e08fe154eee1be379c4ce083336a4e Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Tue, 12 May 2026 07:19:31 -0700 Subject: [PATCH 2/5] feat(verifier): jobs.db persistence, request-o-matic check, geometry fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DB-backed Save replaces file-download Export. The UI now POSTs to /api/save (verifier/serve.py — a small FastAPI server that also same-origin-proxies request-o-matic and serves the static SPA + data + tests dirs). Save writes .verified.json and .corrections.json into data/verifier/ and, when the bundle carries pdf_path + page_number, records the verification in jobs.db via the new JobStore.mark_verified method. jobs.db gains verified_at, verified_path, corrections_path columns plus a partial index on verified_at; init() runs ALTER TABLE for legacy DBs so existing data is preserved. Bundles bump to SCHEMA_VERSION=2 with optional pdf_path/page_number auto-detected when the result JSON lives under data/results//page-NN.json (null for test fixtures, where Save falls back to file-only persistence). Check artists button looks each row up via request-o-matic's /request endpoint through the same-origin proxy. Per-row badges show resolved artist + matched release with three gating signals stacked on top of the raw confidence: postdates (release_year > the year parsed from bundle.page_date_raw), artist-only fallback (search_type=song_as_artist or song_not_found=true — request-o-matic found the artist but not the played track, and the release shown is one of theirs picked arbitrarily), and disjoint-artist tokens (no shared tokens after stop-word and trailing-s normalization — catches request-o-matic fuzzy-matching on a track word and returning an unrelated artist, e.g. "Pure Joy - Pieces" -> "Coldcut - More Beats & Pieces"). The badge labels resolved release names with "album:" or "sample release:" prefix so they can't be mistaken for a track-level match (the flowsheet records artist - track, but the library matches at release level). partition_row_lines_by_quadrant gets a correction pass for the bottom-block hour-jock-cell baseline. _detect_body_mid_y's gap-by-anchor heuristic sometimes lands body_mid_y BELOW the bottom block's hour-jock baseline, which misattributes that line to the top quadrant — shifting every bottom-quadrant row crop up by one (a quadrant's row 0 crop showed row 1 content). Fix: when the top quadrant's last spacing exceeds 1.3x the median row spacing, that line moves to the corresponding bottom quadrant. _merge_with_spans propagates notes="double_height" to entries that absorbed a continuation, so the notes dropdown reflects multi-physical-row entries instead of showing (none). 
UI polish: page-view side panel opens by default on bundle load (verifiers need the full-page reference); notes select shows "(none)" instead of blank and the row gets a tinted background when notes is non-null; type_raw is a free-text input matching the schema's str | None (covers doodles like "hand-drawn smiley" and compound values like "O/std"); each row stacks crop above editable field so the layout reflows cleanly when the page-view panel is open. 474 tests, ruff/mypy clean. --- core/jobs.py | 105 +++++++- core/page_layout.py | 25 ++ scripts/make_verifier_bundle.py | 50 +++- tests/unit/test_jobs.py | 80 ++++++ tests/unit/test_make_verifier_bundle.py | 57 ++++- tests/unit/test_page_layout.py | 39 ++- verifier/README.md | 45 +++- verifier/app.js | 319 ++++++++++++++++++++++-- verifier/index.html | 6 +- verifier/serve.py | 186 ++++++++++++++ verifier/styles.css | 28 +++ 11 files changed, 889 insertions(+), 51 deletions(-) create mode 100644 verifier/serve.py diff --git a/core/jobs.py b/core/jobs.py index 2f5454d..15a8208 100644 --- a/core/jobs.py +++ b/core/jobs.py @@ -50,11 +50,15 @@ class Job: image_path: str | None result_path: str | None model_version: str | None + verified_at: str | None + verified_path: str | None + corrections_path: str | None created_at: str updated_at: str @classmethod def from_row(cls, row: aiosqlite.Row) -> Self: + keys = set(row.keys()) return cls( pdf_path=row["pdf_path"], page_number=row["page_number"], @@ -64,6 +68,11 @@ def from_row(cls, row: aiosqlite.Row) -> Self: image_path=row["image_path"], result_path=row["result_path"], model_version=row["model_version"], + # Late-added columns are nullable; tolerate their absence on a + # very old jobs.db that hasn't been re-init()ed yet. + verified_at=row["verified_at"] if "verified_at" in keys else None, + verified_path=row["verified_path"] if "verified_path" in keys else None, + corrections_path=(row["corrections_path"] if "corrections_path" in keys else None), created_at=row["created_at"], updated_at=row["updated_at"], ) @@ -71,22 +80,42 @@ def from_row(cls, row: aiosqlite.Row) -> Self: _SCHEMA = """ CREATE TABLE IF NOT EXISTS jobs ( - pdf_path TEXT NOT NULL, - page_number INTEGER NOT NULL, - status TEXT NOT NULL, - attempts INTEGER NOT NULL DEFAULT 0, - last_error TEXT, - image_path TEXT, - result_path TEXT, - model_version TEXT, - created_at TEXT NOT NULL, - updated_at TEXT NOT NULL, + pdf_path TEXT NOT NULL, + page_number INTEGER NOT NULL, + status TEXT NOT NULL, + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + image_path TEXT, + result_path TEXT, + model_version TEXT, + verified_at TEXT, + verified_path TEXT, + corrections_path TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, PRIMARY KEY (pdf_path, page_number) ); CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status); """ +# Columns added after the initial schema. `init()` runs `ALTER TABLE` for +# each of these against existing databases so older jobs.db files pick up +# the new columns without losing data. +_LATE_COLUMNS: tuple[tuple[str, str], ...] = ( + ("verified_at", "TEXT"), + ("verified_path", "TEXT"), + ("corrections_path", "TEXT"), +) + +# Indexes that depend on late-added columns and therefore must be created +# AFTER the ALTER TABLE migrations run. Keeping them out of `_SCHEMA` +# avoids "no such column" errors when initializing a legacy database. +_POST_MIGRATION_INDEXES: tuple[str, ...] 
= ( + "CREATE INDEX IF NOT EXISTS idx_jobs_verified_at " + "ON jobs(verified_at) WHERE verified_at IS NOT NULL", +) + def _now() -> str: return datetime.now(UTC).isoformat() @@ -112,6 +141,19 @@ async def init(self) -> None: # rollback journal. The pragma is persistent across connections. await db.execute("PRAGMA journal_mode=WAL") await db.executescript(_SCHEMA) + # ALTER TABLE migrations for late-added columns. CREATE TABLE + # above is idempotent (IF NOT EXISTS), so on a fresh DB this + # is a no-op; on an existing DB it adds the columns. + db.row_factory = aiosqlite.Row + cursor = await db.execute("PRAGMA table_info(jobs)") + existing = {row["name"] for row in await cursor.fetchall()} + for name, col_type in _LATE_COLUMNS: + if name not in existing: + await db.execute(f"ALTER TABLE jobs ADD COLUMN {name} {col_type}") + # Indexes that reference late columns run after the ALTER + # TABLE pass, otherwise SQLite errors on the missing column. + for index_sql in _POST_MIGRATION_INDEXES: + await db.execute(index_sql) await db.commit() @asynccontextmanager @@ -234,6 +276,49 @@ async def mark_low_confidence( clear_error=True, ) + async def mark_verified( + self, + pdf_path: str, + page_number: int, + *, + verified_path: Path, + corrections_path: Path, + ) -> bool: + """Record that a page has been hand-verified via the verifier UI. + + Doesn't change `status` — verification is orthogonal to the + extraction state machine (a `completed` page can be verified; + re-extracting a verified page resets the result but should NOT + clear the verification record by default — that's a separate + decision a human makes via `retry`). + + Returns True if a job row matched, False otherwise. Callers + (e.g. the verifier server) may want to write files even when no + job row exists for the page (test fixtures), so a False return + is not an error. + """ + async with self._connect() as db: + cursor = await db.execute( + """ + UPDATE jobs + SET verified_at = ?, + verified_path = ?, + corrections_path = ?, + updated_at = ? + WHERE pdf_path = ? AND page_number = ? + """, + ( + _now(), + str(verified_path), + str(corrections_path), + _now(), + pdf_path, + page_number, + ), + ) + await db.commit() + return cursor.rowcount > 0 + async def mark_failed(self, pdf_path: str, page_number: int, error: str) -> None: async with self._connect() as db: cursor = await db.execute( diff --git a/core/page_layout.py b/core/page_layout.py index 6d40948..c2efcd2 100644 --- a/core/page_layout.py +++ b/core/page_layout.py @@ -349,4 +349,29 @@ def partition_row_lines_by_quadrant( out["bottom_left"].append(int(y)) if on_right: out["bottom_right"].append(int(y)) + + # Correction pass: on some pages `_detect_body_mid_y` lands BELOW the + # bottom-block hour-jock-cell baseline (the anchor at 0.55h prefers the + # gap below the cell over the true inter-block gap above it). The + # baseline line then gets misattributed to the top quadrant, and the + # bottom quadrant's first detected line is row 0's BOTTOM rather than + # its top — shifting every row crop up by one. + # + # Signal: the top quadrant's last spacing is significantly larger than + # the median row spacing across all detected lines (a normal sequence + # has consistent spacing; an anomalous jump at the end means the last + # line belongs to a different sequence — the bottom block). 
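+    # Worked example (spacings from the 1990-04 golden pages, see the
+    # partition tests): detected line gaps of ~[75, 75, 76, ..., 100]
+    # give a median of 75, so the threshold is 1.3 * 75 = 97.5; the
+    # anomalous 100px final gap trips it and the last top-quadrant line
+    # moves to the head of the matching bottom quadrant.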
+ if len(all_lines) >= 2: + median_spacing = float(np.median(np.diff(np.asarray(all_lines)))) + if median_spacing > 0: + for top_pos, bottom_pos in ( + ("top_left", "bottom_left"), + ("top_right", "bottom_right"), + ): + top_lines = out[top_pos] # type: ignore[index] + if len(top_lines) >= 2: + last_spacing = top_lines[-1] - top_lines[-2] + if last_spacing > 1.3 * median_spacing: + moved = top_lines.pop() + out[bottom_pos].insert(0, moved) # type: ignore[index] return out diff --git a/scripts/make_verifier_bundle.py b/scripts/make_verifier_bundle.py index 932d854..84b2c84 100644 --- a/scripts/make_verifier_bundle.py +++ b/scripts/make_verifier_bundle.py @@ -36,12 +36,44 @@ # Bump when the bundle JSON schema becomes incompatible. # `verifier/README.md` documents the versioning strategy. -SCHEMA_VERSION = 1 +# v1: initial schema. +# v2: add `pdf_path` and `page_number` so the verifier UI can target the +# corresponding `jobs.db` row when saving corrections back. +SCHEMA_VERSION = 2 BBox = tuple[int, int, int, int] +def _parse_job_key_from_result_path(result_path: Path) -> tuple[str, int] | None: + """Recover `(pdf_path, page_number)` from a pipeline result-JSON path. + + The pipeline writes results at `/results//page-NN.json` + (see `core.pipeline.result_path_for`). Reversing that gives us the + `(pdf_path, page_number)` pair used as the primary key in `jobs.db`. + + Returns `None` when the path doesn't match this layout (e.g. test + fixtures, `/tmp` spike outputs, ad-hoc files) — those bundles save + files only, no DB update. + """ + parts = result_path.parts + if "results" not in parts: + return None + idx = parts.index("results") + after = parts[idx + 1 :] + if len(after) < 2: + return None + *pdf_dir_parts, page_file = after + if not page_file.startswith("page-") or not page_file.endswith(".json"): + return None + try: + page_number = int(page_file[len("page-") : -len(".json")]) + except ValueError: + return None + pdf_path = "/".join(pdf_dir_parts) + ".pdf" + return (pdf_path, page_number) + + def _quadrant_bboxes(layout: PageLayout, *, page_width: int) -> dict[QuadrantPosition, BBox]: """Bounding box of each quadrant's body region. @@ -84,10 +116,16 @@ def _merge_with_spans(entries: list[Entry]) -> list[tuple[Entry, int]]: if entry.notes == "continuation" and result: prior, prior_span = result[-1] joined = f"{prior.raw_text.rstrip()} {entry.raw_text.lstrip()}".strip() + # Mark the merged entry as `double_height` so the verifier UI's + # notes dropdown reflects the multi-row nature of the row. The + # original schema enum doesn't distinguish "absorbed continuation" + # from "model-tagged double_height" — both mean "this logical + # entry occupies more than one physical row" for the verifier. merged = prior.model_copy( update={ "raw_text": joined, "oddities": [*prior.oddities, *entry.oddities], + "notes": "double_height", } ) result[-1] = (merged, prior_span + 1) @@ -147,6 +185,7 @@ def make_bundle( *, image_path: Path, bundle_path: Path, + job_key: tuple[str, int] | None = None, ) -> dict[str, Any]: """Assemble the verifier bundle for one page. 
@@ -203,10 +242,16 @@ def make_bundle( ) image_rel = os.path.relpath(image_path, bundle_path.parent) + pdf_path: str | None = None + page_number: int | None = None + if job_key is not None: + pdf_path, page_number = job_key return { "schema_version": SCHEMA_VERSION, "stem": image_path.stem, "image_path": image_rel, + "pdf_path": pdf_path, + "page_number": page_number, "model_version": page.model_version, "extracted_at": page.extracted_at.isoformat(), "page_date_raw": page.page_date_raw, @@ -242,7 +287,8 @@ def main(argv: list[str] | None = None) -> int: page = PageResult.model_validate_json(args.result.read_text()) out_path = args.out or Path("data/verifier") / f"{args.image.stem}.bundle.json" - bundle = make_bundle(page, image_path=args.image, bundle_path=out_path) + job_key = _parse_job_key_from_result_path(args.result) + bundle = make_bundle(page, image_path=args.image, bundle_path=out_path, job_key=job_key) out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(json.dumps(bundle, indent=2)) diff --git a/tests/unit/test_jobs.py b/tests/unit/test_jobs.py index 7856a0b..5f8d03f 100644 --- a/tests/unit/test_jobs.py +++ b/tests/unit/test_jobs.py @@ -205,3 +205,83 @@ async def test_pending_for_render(store: JobStore, tmp_path: Path) -> None: pending = await store.next_pending_for_render(limit=10) assert [(j.pdf_path, j.page_number) for j in pending] == [("scans/a.pdf", 2)] + + +# -- verification tracking -------------------------------------------------- + + +async def test_mark_verified_records_paths_without_changing_status( + store: JobStore, tmp_path: Path +) -> None: + """`mark_verified` updates the verification columns but leaves `status` + alone — verification is orthogonal to the extraction state machine.""" + await store.register("scans/a.pdf", 1) + await store.mark_rendered("scans/a.pdf", 1, image_path=tmp_path / "a.png") + await store.mark_completed("scans/a.pdf", 1, result_path=tmp_path / "a.json", model_version="m") + + verified = tmp_path / "a.verified.json" + corrections = tmp_path / "a.corrections.json" + matched = await store.mark_verified( + "scans/a.pdf", 1, verified_path=verified, corrections_path=corrections + ) + assert matched is True + + job = await store.get("scans/a.pdf", 1) + assert job is not None + assert job.status == JobStatus.COMPLETED + assert job.verified_at is not None + assert job.verified_path == str(verified) + assert job.corrections_path == str(corrections) + + +async def test_mark_verified_returns_false_when_no_matching_job( + store: JobStore, tmp_path: Path +) -> None: + """The verifier server may try to record verification for a test + fixture that has no `jobs.db` row. 
Returns False instead of raising + so the server can fall back to file-only persistence.""" + matched = await store.mark_verified( + "scans/no-such.pdf", + 99, + verified_path=tmp_path / "x.verified.json", + corrections_path=tmp_path / "x.corrections.json", + ) + assert matched is False + + +async def test_init_adds_late_columns_to_existing_db(tmp_path: Path) -> None: + """`init()` against a pre-verification-column DB adds the columns + without losing data.""" + import aiosqlite + + db_path = tmp_path / "old.db" + async with aiosqlite.connect(db_path) as db: + await db.execute( + """ + CREATE TABLE jobs ( + pdf_path TEXT NOT NULL, + page_number INTEGER NOT NULL, + status TEXT NOT NULL, + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + image_path TEXT, + result_path TEXT, + model_version TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + PRIMARY KEY (pdf_path, page_number) + ) + """ + ) + await db.commit() + + store = JobStore(db_path) + await store.init() + + async with aiosqlite.connect(db_path) as db: + db.row_factory = aiosqlite.Row + rows = await (await db.execute("PRAGMA table_info(jobs)")).fetchall() + cols = {r["name"] for r in rows} + assert "verified_at" in cols + assert "verified_path" in cols + assert "corrections_path" in cols diff --git a/tests/unit/test_make_verifier_bundle.py b/tests/unit/test_make_verifier_bundle.py index 9e10649..7393fdc 100644 --- a/tests/unit/test_make_verifier_bundle.py +++ b/tests/unit/test_make_verifier_bundle.py @@ -20,6 +20,7 @@ SCHEMA_VERSION, _assign_row_bboxes, _merge_with_spans, + _parse_job_key_from_result_path, _quadrant_bboxes, main, make_bundle, @@ -185,9 +186,13 @@ def test_merge_with_spans_collapses_continuation_into_span() -> None: merged_first, span_first = result[0] assert merged_first.raw_text == "The Standells - Sometimes Good Guys Don't Wear White" assert span_first == 2 + # Merged entries inherit `double_height` notes so the verifier dropdown + # reflects the multi-row nature. + assert merged_first.notes == "double_height" merged_second, span_second = result[1] assert merged_second.raw_text == "The Lovedolls - Pearls at Swine" assert span_second == 1 + assert merged_second.notes is None def test_merge_with_spans_double_height_counts_as_two() -> None: @@ -214,6 +219,7 @@ def test_merge_with_spans_consecutive_continuations() -> None: merged, span = result[0] assert merged.raw_text == "Line A Line B Line C" assert span == 3 + assert merged.notes == "double_height" def test_merge_with_spans_leading_continuation_is_preserved() -> None: @@ -255,7 +261,7 @@ def test_make_bundle_returns_schema_version(tmp_path: Path) -> None: image_path = _white_page(tmp_path) bundle_path = tmp_path / "out" / "verifier" / "page.bundle.json" bundle = make_bundle(_page_result(), image_path=image_path, bundle_path=bundle_path) - assert bundle["schema_version"] == SCHEMA_VERSION == 1 + assert bundle["schema_version"] == SCHEMA_VERSION == 2 def test_make_bundle_top_level_fields(tmp_path: Path) -> None: @@ -268,6 +274,24 @@ def test_make_bundle_top_level_fields(tmp_path: Path) -> None: assert bundle["model_version"] == "test-model" assert bundle["oddities"] == [] assert len(bundle["quadrants"]) == 4 + # New in v2: job key fields default to null when no job_key is passed. 
+ assert bundle["pdf_path"] is None + assert bundle["page_number"] is None + + +def test_make_bundle_carries_job_key_when_provided(tmp_path: Path) -> None: + """When the bundle pre-processor can recover the (pdf_path, page_number) + job key from the result path, it's preserved in the bundle so the + verifier UI can target the right jobs.db row on save.""" + image_path = _white_page(tmp_path) + bundle = make_bundle( + _page_result(), + image_path=image_path, + bundle_path=tmp_path / "out.bundle.json", + job_key=("1990/April 1990/1990-04apr0106.pdf", 25), + ) + assert bundle["pdf_path"] == "1990/April 1990/1990-04apr0106.pdf" + assert bundle["page_number"] == 25 def test_make_bundle_image_path_is_relative_to_bundle_dir(tmp_path: Path) -> None: @@ -338,7 +362,7 @@ def test_main_writes_bundle_to_out_path(tmp_path: Path) -> None: assert rc == 0 assert out_path.is_file() bundle = json.loads(out_path.read_text()) - assert bundle["schema_version"] == 1 + assert bundle["schema_version"] == SCHEMA_VERSION assert len(bundle["quadrants"]) == 4 @@ -379,6 +403,35 @@ def test_main_validates_bundle_against_page_result_shape(tmp_path: Path) -> None PageResult.model_validate(bundle) +# -- _parse_job_key_from_result_path ---------------------------------------- + + +def test_parse_job_key_from_pipeline_path() -> None: + """The canonical pipeline-result path resolves to (pdf_path, page_number).""" + p = Path("/var/data/results/1990/April 1990/1990-04apr0106/page-25.json") + assert _parse_job_key_from_result_path(p) == ( + "1990/April 1990/1990-04apr0106.pdf", + 25, + ) + + +def test_parse_job_key_returns_none_for_non_pipeline_path() -> None: + """Test fixtures (/tmp, /private, fixtures/) don't follow the layout.""" + assert _parse_job_key_from_result_path(Path("/tmp/flash-spike/pro/some.json")) is None + assert _parse_job_key_from_result_path(Path("/Users/x/fixtures/result.json")) is None + + +def test_parse_job_key_returns_none_when_filename_not_page() -> None: + """The trailing component must be `page-NN.json`.""" + p = Path("/var/data/results/1990/foo/notpage.json") + assert _parse_job_key_from_result_path(p) is None + + +def test_parse_job_key_returns_none_when_page_index_not_numeric() -> None: + p = Path("/var/data/results/1990/foo/page-abc.json") + assert _parse_job_key_from_result_path(p) is None + + def test_main_returns_nonzero_when_inputs_missing(tmp_path: Path) -> None: """Missing input file is a usage error, not a crash. Exit 1 lets shell scripts react cleanly.""" diff --git a/tests/unit/test_page_layout.py b/tests/unit/test_page_layout.py index 0d94bad..126d05d 100644 --- a/tests/unit/test_page_layout.py +++ b/tests/unit/test_page_layout.py @@ -265,8 +265,11 @@ def test_partition_row_lines_returns_y_coordinates_as_ints( def test_partition_row_lines_within_correct_body_band( golden: tuple[str, Image.Image, dict[str, int]], ) -> None: - """All returned y-coords fall within the body region and on the - correct side of body_mid_y for their quadrant.""" + """All returned y-coords fall within the body region. Top quadrants + stay strictly below `body_mid_y`. Bottom quadrants may include one + line slightly ABOVE `body_mid_y` — the hour-jock-cell baseline of the + bottom block, reattributed by the correction pass when body_mid_y + landed below it. 
See `partition_row_lines_by_quadrant`'s docstring.""" _, image, _ = golden layout = detect_page_layout(image) partitions = partition_row_lines_by_quadrant(image, layout) @@ -277,8 +280,9 @@ def test_partition_row_lines_within_correct_body_band( ) for pos in ("bottom_left", "bottom_right"): for y in partitions[pos]: - assert layout.body_mid_y <= y < layout.body_bottom_y, ( - f"{pos}: y={y} outside bottom-band [{layout.body_mid_y}, {layout.body_bottom_y})" + assert layout.header_bottom_y <= y < layout.body_bottom_y, ( + f"{pos}: y={y} outside body range " + f"[{layout.header_bottom_y}, {layout.body_bottom_y})" ) @@ -295,6 +299,33 @@ def test_partition_row_lines_finds_content_in_top_band( assert total_top > 0, f"{stem}: no row lines detected in top band" +def test_partition_row_lines_reattributes_misclassified_bottom_baseline() -> None: + """When the top quadrant's last spacing is anomalously large (the line + is actually the hour-jock baseline of the bottom block, misattributed + because body_mid_y landed below it), it gets moved to the corresponding + bottom quadrant. + + Pages 20 and 25 of the 1990-04 golden set exhibit this: top_left's last + spacing is 100px vs median 75. The fix moves y≈2251 (page25) from + top_left to bottom_left. + """ + stem = "1990-04apr0106-page25" + image = Image.open(GOLDEN_DIR / f"{stem}.png") + layout = detect_page_layout(image) + partitions = partition_row_lines_by_quadrant(image, layout) + # bottom_left must start with a line ABOVE body_mid_y (the reattributed + # hour-jock baseline). The original first line below body_mid_y was 2352; + # after reattribution, ~2251 should now be the new first line. + assert partitions["bottom_left"][0] < layout.body_mid_y, ( + f"expected first bottom_left line to be reattributed above body_mid_y, " + f"got {partitions['bottom_left'][0]} vs body_mid_y={layout.body_mid_y}" + ) + # And the spacing from the new first line to the next should be ~one row, + # accounting for the hour-jock cell baseline at the top. + diff = partitions["bottom_left"][1] - partitions["bottom_left"][0] + assert 90 < diff < 115, f"unexpected first-row span: {diff}" + + def test_partition_row_lines_handles_blank_image() -> None: """A blank image returns four empty lists — no crash, no missing keys.""" blank = Image.new("RGB", (1000, 1500), color="white") diff --git a/verifier/README.md b/verifier/README.md index 16a2d3b..eb146a2 100644 --- a/verifier/README.md +++ b/verifier/README.md @@ -4,16 +4,22 @@ A static, dependency-free single-page app for manually verifying flowsheet extra ## Run -The UI is static HTML + JS + CSS. It needs a local HTTP server so the browser can fetch the bundle JSON and the page image relative to it. +The verifier ships with a tiny FastAPI server that does two things: + +1. Serves `verifier/`, `data/`, and `tests/` as static files. +2. Proxies the **Check artists** lookups through `/api/lookup` to the request-o-matic `/request` endpoint (request-o-matic doesn't emit CORS headers, so a same-origin proxy is simpler than configuring CORS). ```bash # from the repo root -python -m http.server 8765 +.venv/bin/python verifier/serve.py +# default port is 8765; override with VERIFIER_PORT=9000 .venv/bin/python verifier/serve.py # then open in a browser: open "http://localhost:8765/verifier/?bundle=/data/verifier/.bundle.json" ``` +If you want only the static side and don't need the artist-lookup button, `python -m http.server 8765` from the repo root still works — the Check-artists button will return 404s but everything else functions. 
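+
+For orientation, a minimal sketch of the proxy half (illustrative only;
+it assumes FastAPI + httpx, and `REQUEST_O_MATIC_URL` plus the relayed
+payload shape are placeholders, not the real `verifier/serve.py`):
+
+```python
+# Hypothetical sketch of the same-origin lookup proxy. FastAPI/httpx and
+# the REQUEST_O_MATIC_URL default are assumptions for illustration; only
+# the /api/lookup and upstream /request paths come from this patch.
+import os
+
+import httpx
+from fastapi import FastAPI, Request
+
+app = FastAPI()
+REQUEST_O_MATIC_URL = os.environ.get("REQUEST_O_MATIC_URL", "http://localhost:5001")
+
+
+@app.post("/api/lookup")
+async def lookup(request: Request) -> dict:
+    # Relay the row text to request-o-matic's /request endpoint and return
+    # its JSON verbatim, so the browser only ever talks same-origin.
+    body = await request.json()
+    async with httpx.AsyncClient() as client:
+        upstream = await client.post(
+            f"{REQUEST_O_MATIC_URL}/request", json=body, timeout=30.0
+        )
+        upstream.raise_for_status()
+        return upstream.json()
+```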
+ The `?bundle=...` URL param is the recommended path: the UI fetches the bundle, then resolves the bundle's `image_path` (relative path inside the JSON) and fetches the image too. You can also load a bundle via the **Load bundle** file picker, in which case a second **Load image** picker appears. This path works without a server but you must pick both files manually. @@ -94,12 +100,21 @@ tests/golden/.truth.json # derive_truth output (optional destinat `schema_version` is currently `1`. Future incompatible changes bump the version; the UI shows an error banner if it sees an unsupported version. Keep `schema_version` set when archiving bundles so older bundles remain loadable. -## Exports +## Saving + +Clicking **Save** POSTs the current edit state to the server's `/api/save` endpoint, which: + +1. Writes `data/verifier/.verified.json` — `PageResult`-shaped JSON validating against `core.schema.PageResult`. Bundle-only fields (`schema_version`, `stem`, `image_path`, `pdf_path`, `page_number`, per-entry `row_bbox`) are stripped before validation. Rows marked ✗ are excluded. Rows added via **+ add row** are included. +2. Writes `data/verifier/.corrections.json` — the delta between the loaded bundle and the verified state (shape below). +3. If the bundle has a non-null `pdf_path` + `page_number` (production-pipeline pages do; test fixtures don't), updates the matching `jobs.db` row via `JobStore.mark_verified` — setting `verified_at`, `verified_path`, and `corrections_path`. + +The status bar reports the destination files and whether `jobs.db` was updated: + +> Saved data/verifier/X.verified.json + data/verifier/X.corrections.json · 4 field correction(s), 0 added, 0 deleted · jobs.db updated. -Clicking **Export verified + corrections** downloads two files in sequence: +If you'd rather have a downloadable file, open the saved JSON from `data/verifier/` directly. -1. `.verified.json` — `PageResult`-shaped JSON validating against `core.schema.PageResult`. Bundle-only fields (`schema_version`, `stem`, `image_path`, per-entry `row_bbox`) are stripped. Rows marked ✗ are excluded. Rows added via **+ add row** are included. -2. `.corrections.json` — the delta between the loaded bundle and the verified export, plus the set of rows the user reviewed: +The `corrections.json` shape: ```json { @@ -141,6 +156,24 @@ The verified.json is the consumable artifact (plugs back into the pipeline as gr Truth derivation is a **separate Python tool** (`scripts/derive_truth.py`) rather than a UI button — the substring-extraction rules live in one place (Python, testable), not duplicated in JS. +## Check artists (request-o-matic lookup) + +Click **Check artists** in the header to look up every row's text via the WXYC library + Discogs reconciliation pipeline. Each row gets a badge with the resolved artist + matched **release** (album / 12") and a confidence score. + +**Important contrast**: the flowsheet records `Artist - Track`, but the library and Discogs match at the **release** level. The badge text is labeled `artist · album: "..."` (full track match) or `artist · sample release: "..."` (artist-only fallback) so this never looks like a near-track-match when it's a release-level result. + +Badge states: + +- **Green** — track found in the library on this release. High confidence the artist is right; the release shown is the album/single containing the played track. +- **Yellow** — one of: + - **`⚠ artist-only · ...`**: the library has the artist but not this specific track. 
The "sample release" is whichever album of theirs the library indexed first — it's *not* a confirmation that the played track lives there. + - **`⚠ postdates · ...`**: the matched release's year is after the flowsheet's page year. The page year is parsed from `page_date_raw` (1990 for `Thurs 4/5/90`, etc.); when `release_year > page_year` the match is almost certainly a later remix, reissue, or same-name band. + - artwork confidence below 0.5. +- **Grey/italic** — no library match found. Could be a typo, a non-canonical name, or genuinely missing from the WXYC corpus (the library reflects current stock, not 1990 stock — ~30% of mid-density pages will have these). +- **Faded/italic** — stale. You edited the row after running Check; re-run to refresh. + +The lookup goes through request-o-matic's LLM-driven request parser (artist normalization, fuzzy matching) before hitting the LML library search. The badge reflects request-o-matic's `library_results` and `artwork` fields — not LML's `/api/v1/lookup` directly, since the LLM correction layer is the load-bearing piece. + ## Known rough edges (v1) - **No autosave / localStorage.** Close the tab and unsaved edits are lost. Export before navigating away. diff --git a/verifier/app.js b/verifier/app.js index 0e3c34d..dfb6ae9 100644 --- a/verifier/app.js +++ b/verifier/app.js @@ -21,12 +21,13 @@ "use strict"; -const SUPPORTED_SCHEMA_VERSION = 1; +const SUPPORTED_SCHEMA_VERSION = 2; const state = { bundle: null, // mutable working copy originalBundle: null, // immutable snapshot for diffing pageImage: null, // HTMLImageElement + lookupConcurrency: 4, // parallel /api/lookup requests }; const $ = (sel, root = document) => root.querySelector(sel); @@ -117,11 +118,15 @@ function finishInit() { `(${state.pageImage.naturalWidth}×${state.pageImage.naturalHeight}px).` ); $("#app").hidden = false; - $("#export-verified").disabled = false; + $("#save-verified").disabled = false; $("#toggle-page-view").disabled = false; + $("#check-artists").disabled = false; $("#page-view-img").src = state.pageImage.src; renderPageMeta(); renderQuadrants(); + // Show the full-page reference by default; verifiers asked for this + // because the row crops need page context to be useful. + togglePageView(); } // ---- render: page meta --------------------------------------------------- @@ -212,6 +217,12 @@ function buildRow(entry, quad) { textEl.value = entry.raw_text; textEl.addEventListener("input", () => { entry.raw_text = textEl.value; + // Edit invalidates the lookup badge — show as stale until re-check. + const badge = $(".lookup-badge", node); + if (badge && !badge.hidden) { + badge.classList.add("stale"); + badge.title = "Click 'Check artists' to refresh."; + } }); const typeEl = $(".type-raw input", node); @@ -221,9 +232,14 @@ function buildRow(entry, quad) { }); const notesEl = $(".notes select", node); - notesEl.value = entry.notes ?? ""; + const syncNotesView = () => { + notesEl.value = entry.notes ?? 
""; + node.classList.toggle("has-notes", !!entry.notes); + }; + syncNotesView(); notesEl.addEventListener("change", () => { entry.notes = notesEl.value || null; + syncNotesView(); }); $(".delete-row", node).addEventListener("click", () => { @@ -410,34 +426,286 @@ function buildCorrectionsExport() { // ---- file-download helpers ----------------------------------------------- -function downloadJson(filename, data) { - const blob = new Blob([JSON.stringify(data, null, 2)], { type: "application/json" }); - const url = URL.createObjectURL(blob); - const a = document.createElement("a"); - a.href = url; - a.download = filename; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - URL.revokeObjectURL(url); -} +async function saveAll() { + if (!state.bundle) return; + const btn = $("#save-verified"); + btn.disabled = true; + const original = btn.textContent; + btn.textContent = "Saving…"; -function exportAll() { const verified = buildVerifiedExport(); const corrections = buildCorrectionsExport(); - downloadJson(`${state.bundle.stem}.verified.json`, verified); - downloadJson(`${state.bundle.stem}.corrections.json`, corrections); - const n = - corrections.row_corrections.length + - corrections.page_corrections.length + - corrections.quadrant_corrections.length; + const body = { + stem: state.bundle.stem, + pdf_path: state.bundle.pdf_path ?? null, + page_number: state.bundle.page_number ?? null, + verified, + corrections, + }; + + try { + const r = await fetch("/api/save", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }); + if (!r.ok) { + const detail = await r.text(); + throw new Error(`/api/save ${r.status}: ${detail}`); + } + const result = await r.json(); + const n = + corrections.row_corrections.length + + corrections.page_corrections.length + + corrections.quadrant_corrections.length; + const dbBit = result.db_updated + ? "jobs.db updated" + : (body.pdf_path && body.page_number != null + ? "no matching job row (files only)" + : "files only (no job key)"); + setStatus( + `Saved ${result.verified_path} + ${result.corrections_path} · ` + + `${n} field correction(s), ${corrections.added_rows.length} added, ` + + `${corrections.deleted_rows.length} deleted · ${dbBit}.` + ); + } catch (err) { + setStatus(`Save failed: ${err.message}`, "error"); + } finally { + btn.disabled = false; + btn.textContent = original; + } +} + +// ---- artist/track lookup (request-o-matic via /api/lookup proxy) -------- + +async function lookupOne(message) { + const r = await fetch("/api/lookup", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ message }), + }); + if (!r.ok) throw new Error(`/api/lookup ${r.status}: ${await r.text()}`); + return await r.json(); +} + +// Same separator regex as `core.parse.parse_artist_track` (Python side). +// Pulls the artist out of a flowsheet `Artist - Track` string for +// comparison against the library-resolved artist. +const ARTIST_TRACK_SEPARATOR = /\s*[-–—]\s*/; + +// Stop words and bibliographic prefixes that the WXYC corpus often +// drops or adds inconsistently. Stripping them prevents a "the" / "a" +// difference from making "the sundays" and "Sundays" look like a +// mismatch. 
+const ARTIST_TOKEN_STOPWORDS = new Set([
+  "the", "a", "an", "and", "&", "feat", "featuring", "ft", "with", "vs", "presents",
+]);
+
+function _tokenize(s) {
+  return s
+    .toLowerCase()
+    .normalize("NFKD")
+    .replace(/[^\p{L}\p{N}\s]/gu, " ")
+    .split(/\s+/)
+    .map((t) => t.replace(/s$/, "")) // crude singularization: boys -> boy
+    .filter((t) => t.length >= 2 && !ARTIST_TOKEN_STOPWORDS.has(t));
+}
+
+function parseInputArtist(rawText) {
+  if (!rawText) return null;
+  const parts = rawText.split(ARTIST_TRACK_SEPARATOR);
+  return parts[0]?.trim() || null;
+}
+
+// Returns true when the resolved artist shares NO tokens with the input
+// artist (after normalization, stop-word and trailing-s stripping).
+// Conservative: when either side has zero meaningful tokens, returns
+// false (no signal). Catches "Pure Joy → Coldcut" where the LLM
+// fuzzy-matched on a track word; tolerates "the sundays → Sundays" and
+// "Beastie Boy → Beastie Boys".
+function artistTokensDisjoint(inputArtist, resolvedArtist) {
+  if (!inputArtist || !resolvedArtist) return false;
+  const a = new Set(_tokenize(inputArtist));
+  const b = new Set(_tokenize(resolvedArtist));
+  if (a.size === 0 || b.size === 0) return false;
+  for (const t of a) if (b.has(t)) return false;
+  return true;
+}
+
+// Parse a 2- or 4-digit year out of `page_date_raw`. The WXYC corpus spans
+// 1990-2001, so 2-digit years 90-99 map to 19xx and 00-01 map to 20xx.
+// Returns null when no plausible year is present.
+function parsePageYear(pageDateRaw) {
+  if (!pageDateRaw) return null;
+  // Try 4-digit year first.
+  const fourDigit = pageDateRaw.match(/\b(19\d{2}|20\d{2})\b/);
+  if (fourDigit) return Number(fourDigit[1]);
+  // Fall back to a 2-digit year. Scan every 2-digit token that's NOT
+  // surrounded by other digits (so we don't pluck "19" or "90" out of
+  // "1990"). 2-digit years in the WXYC corpus range (80-99 → 19xx,
+  // 00-09 → 20xx) win; anything else (a month like "04" or a day like
+  // "31") is skipped.
+  const twoDigitMatches = pageDateRaw.matchAll(/(?<!\d)(\d{2})(?!\d)/g);
+  for (const m of twoDigitMatches) {
+    const n = Number(m[1]);
+    if (n >= 80 && n <= 99) return 1900 + n;
+    if (n >= 0 && n <= 9) return 2000 + n;
+  }
+  return null;
+}
+
+function badgeContentFor(data, pageYear, inputRawText) {
+  const parsed = data.parsed || {};
+  const artwork = data.artwork || {};
+  const libResults = data.library_results || [];
+  const libTop = libResults[0];
+  if (!libTop && !artwork.artist) {
+    return { kind: "empty", text: "no library match" };
+  }
+
+  // Prefer library_results (authoritative for the WXYC corpus). Fall back
+  // to Discogs artwork for orientation.
+  const artist = libTop?.artist || artwork.artist || parsed.artist || "?";
+  // library_results[i].title and artwork.album both denote a RELEASE
+  // (album / 12") in the library — never a track. The flowsheet records
+  // tracks, so we label the field explicitly to avoid the visual conflict
+  // with the flowsheet's "Artist - Track" shape.
+  const release = libTop?.title || artwork.album || "";
+  const conf = typeof artwork.confidence === "number" ? artwork.confidence : null;
+  const releaseYear = typeof artwork.release_year === "number" ? artwork.release_year : null;
+
+  // Fallback signal: the library reconciler couldn't find the specific
+  // track and is returning the artist's catalog instead. `song_not_found`
+  // is the canonical flag; `search_type === "song_as_artist"` means the
+  // LLM parser couldn't identify the artist and reinterpreted the parsed
+  // song as the artist (e.g. "Beastie Boy" treated as artist when LLM
+  // missed the plural).
Both mean: artist may be right, but the release + // shown is unrelated to whatever track the DJ actually played. + const fallback = data.song_not_found === true || data.search_type === "song_as_artist"; + + // Anachronism: matched release postdates the flowsheet. + const postdates = pageYear != null && releaseYear != null && releaseYear > pageYear; + + // Artist mismatch: resolved artist shares zero tokens with the artist + // we parsed out of the flowsheet text. Catches request-o-matic + // fuzzy-matching on a track word (Pure Joy → Coldcut via "Pieces") + // even when the release year happens to be plausible. + const inputArtist = parseInputArtist(inputRawText); + const artistMismatch = artistTokensDisjoint(inputArtist, artist); + + const stampBits = []; + if (releaseYear !== null) stampBits.push(String(releaseYear)); + if (conf !== null) stampBits.push(conf.toFixed(2)); + const stamp = stampBits.length ? ` (${stampBits.join(", ")})` : ""; + + let text; + let kind; + if (fallback) { + // "artist-only" makes it clear we have the artist but not the track, + // and "sample release" disclaims the album shown is illustrative + // (whichever release of theirs the library indexed first), not a + // confirmation that this is where the played track lives. + text = release + ? `⚠ artist-only · ${artist} · sample release: "${release}"${stamp}` + : `⚠ artist-only · ${artist}${stamp}`; + kind = "hit-weak"; + } else { + text = release + ? `${artist} · album: "${release}"${stamp}` + : `${artist}${stamp}`; + kind = conf !== null && conf < 0.5 ? "hit-weak" : "hit-strong"; + } + if (postdates) { + text = "⚠ postdates · " + text; + kind = "hit-weak"; + } + if (artistMismatch) { + text = `⚠ different artist (got "${artist}", expected "${inputArtist}") · ${text}`; + kind = "hit-weak"; + } + return { kind, text }; +} + +function applyBadge(rowEl, kind, text, title) { + const badge = $(".lookup-badge", rowEl); + if (!badge) return; + badge.hidden = false; + badge.className = `lookup-badge ${kind}`; + badge.textContent = text; + if (title) badge.title = title; +} + +async function checkArtists() { + if (!state.bundle) return; + const btn = $("#check-artists"); + btn.disabled = true; + const originalLabel = btn.textContent; + const pageYear = parsePageYear(state.bundle.page_date_raw); + + // Collect every non-deleted, non-empty row with its DOM node. + const work = []; + for (const quad of state.bundle.quadrants) { + const quadNode = [...$$(".quadrant")].find( + (n) => $(".quadrant-title", n).textContent === quad.position + ); + if (!quadNode) continue; + const rowNodes = $$(".row", quadNode); + for (let i = 0; i < quad.entries.length; i++) { + const entry = quad.entries[i]; + if (entry._deleted || !entry.raw_text?.trim()) continue; + const rowEl = rowNodes[i]; + if (!rowEl) continue; + work.push({ rowEl, entry }); + applyBadge(rowEl, "loading", "…looking up", ""); + } + } + + let done = 0; + const total = work.length; + const updateBtn = () => { + btn.textContent = `Checking artists (${done}/${total})…`; + }; + updateBtn(); + + // Concurrency-limited fan-out. + const queue = work.slice(); + async function worker() { + while (queue.length) { + const job = queue.shift(); + if (!job) break; + try { + const data = await lookupOne(job.entry.raw_text); + const { kind, text } = badgeContentFor(data, pageYear, job.entry.raw_text); + const aw = data.artwork || {}; + const title = + `parsed_artist=${(data.parsed || {}).artist ?? "?"}; ` + + `library_results=${(data.library_results || []).length}` + + (aw.release_year ? 
`; release_year=${aw.release_year}` : "") + + (pageYear ? `; page_year=${pageYear}` : ""); + applyBadge(job.rowEl, kind, text, title); + } catch (err) { + applyBadge(job.rowEl, "error", "lookup failed", String(err)); + } + done++; + updateBtn(); + } + } + await Promise.all( + Array.from({ length: state.lookupConcurrency }, () => worker()) + ); + + btn.disabled = false; + btn.textContent = originalLabel; setStatus( - `Exported verified + corrections (${n} field correction(s), ` + - `${corrections.added_rows.length} added, ` + - `${corrections.deleted_rows.length} deleted).` + `Checked ${total} row(s) via request-o-matic` + + (pageYear ? ` (gating release_year > ${pageYear} as anachronistic).` : ".") ); } +function $$(sel, root = document) { + return Array.from(root.querySelectorAll(sel)); +} + function togglePageView() { const aside = $("#page-view"); const main = $("main"); @@ -461,8 +729,9 @@ document.addEventListener("DOMContentLoaded", async () => { const file = e.target.files?.[0]; if (file) loadImageFromFile(file); }); - $("#export-verified").addEventListener("click", exportAll); + $("#save-verified").addEventListener("click", saveAll); $("#toggle-page-view").addEventListener("click", togglePageView); + $("#check-artists").addEventListener("click", checkArtists); await loadBundleFromUrlParam(); }); diff --git a/verifier/index.html b/verifier/index.html index a382037..22270ae 100644 --- a/verifier/index.html +++ b/verifier/index.html @@ -19,7 +19,8 @@

<h1>Flowsheet verifier</h1>

-      <button id="export-verified" disabled>Export verified + corrections</button>
+      <button id="save-verified" disabled>Save</button>
+      <button id="check-artists" disabled>Check artists</button>
       <span id="status">Pick a bundle.json to begin.</span>

+        <span class="lookup-badge" hidden></span>