diff --git a/CLAUDE.md b/CLAUDE.md index 5983ba8..060080d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -66,7 +66,7 @@ cli.py Typer entrypoint: `flowsheets `. | Confidence per row | ✓ | re-OCR queue for low-confidence | | Special-case `notes` (continuation/double_height/crossed_out/illegible) | continuation merged at read-time via `core.continuations.merge_continuations` (on-disk JSON keeps the raw tag); double_height/crossed_out/illegible captured verbatim | double_height/crossed_out/illegible structured + filtered | | Left-margin type column (H/M/L/Std/O/R) | captured verbatim into `Entry.type_raw` (doodle-tolerant) | normalized + reconciled against rotation lists | -| Comments field | ignored | captured | +| Comments field | captured verbatim into `GeminiPageResult.comments_raw` (null if blank/unreadable) | normalized / dedup-checked against entry text | | Date normalization to ISO | raw only | reconciled with filename's year/range | | Reconciliation against `@wxyc/shared` canonical artists | — | fuzzy-match + auto-correct | | Bulk full-corpus run | not in this PR | calibrate first, then schedule | diff --git a/README.md b/README.md index f5b82b5..b82c6b6 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ For each page of every PDF under `scans/`: 3. Stores a JSON result file with the per-row `raw_text`, `artist_guess`, `track_guess`, `confidence`, and any phase-2 `notes` (continuation, double-height, crossed-out, illegible). 4. Tracks every page in a SQLite job table so reruns are idempotent and partial failures resume. -Phase 1 captures only the per-row "Artist – Track" text. The left-margin H/M/L/Std/O/R type column, multi-row continuations, double-height handwriting, the comments field, and reconciliation against the WXYC library DB are all phase 2 — see `PLAN.md`. +Phase 1 captures the per-row "Artist – Track" text and the four-quadrant frame. 
Phase 2 adds the left-margin H/M/L/Std/O/R type column (`Entry.type_raw`), the bottom-of-page comments field (`GeminiPageResult.comments_raw`), and continues to roll out continuation/double-height handling and reconciliation against the WXYC library DB — see `PLAN.md`. ## Quickstart diff --git a/core/prompts.py b/core/prompts.py index 25d540e..5164242 100644 --- a/core/prompts.py +++ b/core/prompts.py @@ -82,6 +82,16 @@ Also capture: - page_date_raw: the date as written at the top of the page, verbatim (e.g. "Monday 1 Jan '90"). Null if blank or unreadable. + - comments_raw: the verbatim contents of the printed "Comments" field + at the bottom of the page. This is a free-text band the DJ writes + in — short notes about the broadcast, dedications, jokes + (e.g. "declared today anti-Valentines Day"). Transcribe verbatim: + do not fix spelling, do not expand abbreviations, do not normalize + punctuation. Join multi-line entries with a single newline. Use + JSON null (not the string "null", not an empty string) when the + Comments band is blank, unreadable, or absent from the form. Do + NOT also put the comments contents into the page-level `oddities` + list — they belong here, only here. ## Oddities — surface anything the schema doesn't model @@ -104,10 +114,11 @@ - "an arrow is drawn from row 3 down to row 6 (re-ordering)" - "rows 12-15 are bracketed with the label 'Smarty's Group/Album'" - * Page-level oddities: anything OUTSIDE the four quadrants — content - the schema simply has no field for. Examples: + * Page-level oddities: anything OUTSIDE the four quadrants AND outside + the Comments band — content the schema simply has no field for. The + Comments band has its own `comments_raw` field; do not also list it + here. 
Examples: - "the entire page is rotated 180 degrees" - - "Comments field at the bottom contains: 'declared today anti-Valentines Day...'" - "a weather note above the date reads: '25 degrees month wind chill 5'" - "a DJ-handoff note at the top of the right column says: 'F.S. Earl - Charles next'" - "marginal note in left margin near row 3 of top-left quadrant: 'Cool!'" diff --git a/core/schema.py b/core/schema.py index f06d966..5c03974 100644 --- a/core/schema.py +++ b/core/schema.py @@ -1,8 +1,9 @@ """Pydantic models for the Gemini structured-output contract. The response_schema sent to Gemini and the on-disk shape are *almost* -the same model — they share `page_date_raw`, `quadrants`, and page-level -`oddities`. They differ in two fields the caller owns, not Gemini: +the same model — they share `page_date_raw`, `quadrants`, `comments_raw`, +and page-level `oddities`. They differ in two fields the caller owns, +not Gemini: * `model_version` — the SDK arg, set by the pipeline at write-time. * `extracted_at` — wall-clock UTC at the call site. @@ -17,9 +18,10 @@ caller-set fields, populated by `pipeline._process_one_job`. Phase 1 captures the per-row text and the four-quadrant frame. Phase 2 -adds the left-margin type column (H/M/L/Std/O/R/R⇒, in `Entry.type_raw`) -and is iteratively rolling out continuation/double-height handling, the -comments field, and reconciliation against the WXYC library. +adds the left-margin type column (H/M/L/Std/O/R/R⇒, in `Entry.type_raw`), +the bottom-of-page comments field (`GeminiPageResult.comments_raw`), and +is iteratively rolling out continuation/double-height handling and +reconciliation against the WXYC library. """ from __future__ import annotations @@ -136,15 +138,30 @@ class GeminiPageResult(BaseModel): "bottom_right. Always return all four even if a quadrant is blank." 
def test_prompt_captures_bottom_comments_field() -> None:
    """Phase 2: the bottom-of-page Comments band lands in `comments_raw`.

    The prompt must (a) name the field, (b) locate it (bottom of the page,
    so the model knows what to look at), and (c) say verbatim — otherwise
    the transcription gets cleaned up as if by an editor.
    """
    lowered = PAGE_EXTRACTION_PROMPT.lower()
    # The field name is checked case-sensitively: it must match the schema key.
    assert "comments_raw" in PAGE_EXTRACTION_PROMPT
    assert "bottom" in lowered
    assert "verbatim" in lowered


def test_prompt_specifies_json_null_for_blank_comments_field() -> None:
    """Blank Comments band must be null, not "" — same convention as
    `type_raw` / `hour_raw` / `jock_raw`. Otherwise consumers can't
    distinguish "blank" from "they wrote an empty string".

    This is a cheap proximity check: "null" has to appear within 500
    characters after the first mention of `comments_raw`. A null rule stated
    only somewhere else in the prompt does NOT satisfy this test.
    """
    assert "comments_raw" in PAGE_EXTRACTION_PROMPT
    lowered = PAGE_EXTRACTION_PROMPT.lower()
    # `find` cannot return -1 here: the assert above guarantees a match.
    start = lowered.find("comments_raw")
    window = lowered[start : start + 500]
    assert "null" in window, "expected the comments_raw section to specify null for blank fields"


def test_prompt_keeps_comments_out_of_page_oddities() -> None:
    """Before Phase 2, the prompt nudged the model to stash the Comments
    contents inside `oddities` (as a page-level oddity). With `comments_raw`
    in place, double-capturing would dilute oddities and produce duplicated
    text downstream. The prompt must explicitly tell the model NOT to do
    that, and the old illustrative example must be gone from the oddities
    section.
    """
    text = PAGE_EXTRACTION_PROMPT
    # The old illustrative example contained the literal "Comments field at
    # the bottom contains" — that must be removed.
    assert "Comments field at the bottom contains" not in text, (
        "old page-oddities example for the comments field must be removed — "
        "the contents now belong in `comments_raw` instead"
    )
    # A clear negation must remain so a future prompt edit can't re-introduce
    # the duplication by accident. Checking for "do not" and "comments"
    # anywhere in the prompt is vacuous (unrelated instructions such as
    # "do not fix spelling" satisfy it), so require both in the SAME sentence.
    # Whitespace is collapsed first because the prompt is hard-wrapped and the
    # negation may be split across lines ("Do\n    NOT also put the comments").
    normalized = " ".join(text.lower().split())
    sentences = normalized.split(".")
    assert any("do not" in s and "comments" in s for s in sentences), (
        "expected a 'do not … comments' clause anchoring the negation"
    )
class TestCommentsRaw:
    """Contract tests for the Phase 2 `comments_raw` field.

    The bottom-of-page Comments field is stored verbatim (no normalization,
    no truncation) on `GeminiPageResult`; `PageResult` exposes the same
    field. Extractions written before the field existed carry no
    `comments_raw` key and must keep validating, so the existing corpus
    stays usable.
    """

    _POSITIONS = ("top_left", "top_right", "bottom_left", "bottom_right")

    def _quads(self) -> list[Quadrant]:
        """Return four blank quadrants, one per position, in canonical order."""
        blank: list[Quadrant] = []
        for position in self._POSITIONS:
            blank.append(
                Quadrant(position=position, hour_raw=None, jock_raw=None, entries=[])
            )
        return blank

    def _page(self, **extra) -> PageResult:
        """Build a minimal `PageResult`; `extra` supplies optional fields."""
        return PageResult(
            page_date_raw=None,
            quadrants=self._quads(),
            model_version="m",
            extracted_at=datetime.now(UTC),
            **extra,
        )

    def test_defaults_to_none_on_gemini_page_result(self) -> None:
        gemini_result = GeminiPageResult(page_date_raw=None, quadrants=self._quads())
        assert gemini_result.comments_raw is None

    def test_defaults_to_none_on_page_result(self) -> None:
        assert self._page().comments_raw is None

    def test_accepts_verbatim_string(self) -> None:
        comment = "declared today anti-Valentines Day"
        gemini_result = GeminiPageResult(
            page_date_raw=None,
            quadrants=self._quads(),
            comments_raw=comment,
        )
        assert gemini_result.comments_raw == comment

    def test_round_trips_through_json(self) -> None:
        comment = "declared today anti-Valentines Day"
        serialized = self._page(comments_raw=comment).model_dump_json()
        restored = PageResult.model_validate_json(serialized)
        assert restored.comments_raw == comment

    def test_response_schema_names_comments_raw(self) -> None:
        """Gemini will only populate fields named in the response_schema."""
        rendered_schema = json.dumps(GeminiPageResult.model_json_schema())
        assert "comments_raw" in rendered_schema

    def test_old_extraction_json_without_comments_raw_validates(self) -> None:
        """The 34 existing corpus JSONs have no `comments_raw` key. Validation
        must accept that — the field defaults to None on missing input.
        Otherwise we'd invalidate every prior extraction the day we land this."""
        legacy_quadrants = []
        for position in self._POSITIONS:
            legacy_quadrants.append(
                {
                    "position": position,
                    "hour_raw": None,
                    "jock_raw": None,
                    "oddities": [],
                    "entries": [],
                }
            )
        # Deliberately no "comments_raw" key: mirrors a pre-Phase-2 file.
        legacy_payload = {
            "page_date_raw": "Monday 1 Jan '90",
            "model_version": "gemini-3.1-pro-preview",
            "extracted_at": datetime.now(UTC).isoformat(),
            "oddities": [],
            "quadrants": legacy_quadrants,
        }
        restored = PageResult.model_validate_json(json.dumps(legacy_payload))
        assert restored.comments_raw is None