From b37a92abd7ecf13845398c07541905ab6a0d062b Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Sun, 10 May 2026 16:25:45 -0700 Subject: [PATCH 1/2] feat(comments): read-time normalize and entry-overlap diagnostics for comments_raw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the shape of core.continuations: pure read-time functions over PageResult, no I/O, no on-disk mutation. normalize_comments collapses intra-line whitespace, trims per line, drops empty interior lines, and returns None when the result is empty — verbatim on disk stays verbatim per the org data-safety rule. find_comment_entry_overlaps returns one record per (normalized comments line, matching entry) pair where the line is a case- and whitespace-insensitive substring of an entry's raw_text on the same page; results come back in canonical quadrant order with ascending row_index. Categorization, the audit CLI, and any wiring into the pipeline run path are intentionally deferred — this lands the primitives so the calibration scorer (and a future audit tool) can call them. Closes #34 --- core/comments.py | 160 +++++++++++++++++++++ tests/unit/test_comments.py | 280 ++++++++++++++++++++++++++++++++++++ 2 files changed, 440 insertions(+) create mode 100644 core/comments.py create mode 100644 tests/unit/test_comments.py diff --git a/core/comments.py b/core/comments.py new file mode 100644 index 0000000..8501418 --- /dev/null +++ b/core/comments.py @@ -0,0 +1,160 @@ +"""Phase-2 read-time normalization and duplication diagnostics for `comments_raw`. + +The Phase-1 prompt captures the bottom-of-page Comments field verbatim +into `GeminiPageResult.comments_raw` (see `core/schema.py`). Verbatim is +right for ingestion but awkward to consume: whitespace is ragged, and +the same handwritten text sometimes appears in both the comments band +AND inside one of the four hour-quadrants (e.g. a DJ scribbles a song +title in the comments band that they also wrote into the grid). + +This module mirrors the shape of `core.continuations`: pure read-time +functions, no I/O, no on-disk mutation. The on-disk `comments_raw` +stays verbatim per the org data-safety rule. Callers compute the +normalized form at read time. + +Two primitives: + + * `normalize_comments(raw)` — collapses internal whitespace per line, + trims each line, drops empty interior lines, returns None if the + result is empty. Idempotent. + * `find_comment_entry_overlaps(page)` — diagnostic: returns one record + per (comment line, entry) pair where a normalized comment line is a + substring of an entry's `raw_text` after case- and + whitespace-insensitive matching. Empty list when there's nothing to + flag. + +No categorization here — the ticket explicitly defers that to a later +pass once we have a corpus of filled-in comments bands to look at. +""" + +from __future__ import annotations + +import re + +from pydantic import BaseModel, NonNegativeInt + +from core.schema import PageResult, QuadrantPosition + +# Multiple runs of any whitespace character on a single line collapse to +# one ASCII space. Newlines are handled separately (line breaks carry +# structural meaning — see the `comments_raw` field description). +_INTRA_LINE_WHITESPACE = re.compile(r"[^\S\n]+") + + +def normalize_comments(raw: str | None) -> str | None: + """Return a read-time normalized form of `comments_raw`. + + Pure and idempotent. None passes through. The transform: + + * splits on newlines so multi-line entries stay multi-line, + * collapses runs of intra-line whitespace (spaces, tabs, etc.) to + a single space within each line, + * trims leading/trailing whitespace on each line, + * drops lines that are empty after trimming — blank interior lines + are noise, and downstream per-line iteration shouldn't have to + filter them, + * returns None when the result is empty, so callers have one + sentinel for "nothing useful here" instead of having to also + check for ``""``. + + The verbatim on-disk shape is preserved; callers run this at read + time when they want a cleaner form. + """ + if raw is None: + return None + lines = [_INTRA_LINE_WHITESPACE.sub(" ", line).strip() for line in raw.split("\n")] + kept = [line for line in lines if line] + if not kept: + return None + return "\n".join(kept) + + +class CommentEntryOverlap(BaseModel): + """One (comment line, entry) duplication record. + + Surfaces the case where text in the bottom comments band also + appears inside a grid entry on the same page. The fields are the + minimum a caller (calibration scorer, future audit CLI) needs to + locate the duplicate on the page and inspect both sides. + """ + + matched_text: str + """The normalized comments substring that matched. Lowercase, + whitespace-collapsed — this is the form the matcher compared, not + the verbatim on-disk text.""" + + position: QuadrantPosition + """Which quadrant the matched entry lives in.""" + + row_index: NonNegativeInt + """0-based row position of the matched entry within its quadrant.""" + + entry_raw_text: str + """The full `raw_text` of the matched entry, verbatim from disk — + so a caller can show context around the substring match.""" + + +def find_comment_entry_overlaps(page: PageResult) -> list[CommentEntryOverlap]: + """Diagnose duplication between `comments_raw` and grid entries. + + Returns one record per (normalized comment line, matching entry) + pair where the comment line is a substring of an entry's + ``raw_text`` after both sides are normalized (lowercased and + whitespace-collapsed). Empty list when: + + * ``page.comments_raw`` is None, + * normalization yields None (whitespace-only comments), + * no entry contains any comment line. + + Matching is **per-line**, not against the whole normalized comments + blob. A two-line comments band with one dedication and one song + title surfaces only the song-title line if just that line + duplicates a grid row — the unrelated dedication doesn't pollute + the diagnostic. + + Records come back in canonical quadrant order (top_left, top_right, + bottom_left, bottom_right) with ascending ``row_index`` within each + quadrant. Stable ordering keeps diagnostic output diffable across + runs. + + Pure: no mutation of the input ``page``, no I/O. + """ + normalized = normalize_comments(page.comments_raw) + if normalized is None: + return [] + + # Case-insensitive matching is symmetric — fold both sides to the + # same case so a comments band scribbled in caps still matches an + # entry transcribed in lower case (and vice versa). + needles = [line.casefold() for line in normalized.split("\n") if line] + if not needles: + return [] + + overlaps: list[CommentEntryOverlap] = [] + for quadrant in page.quadrants: + for entry in sorted(quadrant.entries, key=lambda e: e.row_index): + entry_normalized = _normalize_for_match(entry.raw_text) + if not entry_normalized: + continue + for needle in needles: + if needle in entry_normalized: + overlaps.append( + CommentEntryOverlap( + matched_text=needle, + position=quadrant.position, + row_index=entry.row_index, + entry_raw_text=entry.raw_text, + ) + ) + return overlaps + + +def _normalize_for_match(text: str) -> str: + """Lowercased, whitespace-collapsed form used for substring matching. + + Mirrors what `normalize_comments` does to each line, plus lowercase. + Kept separate from `normalize_comments` because the entry side + doesn't need the empty-to-None convention — entries always have + non-empty `raw_text` by schema. + """ + return _INTRA_LINE_WHITESPACE.sub(" ", text).strip().casefold() diff --git a/tests/unit/test_comments.py b/tests/unit/test_comments.py new file mode 100644 index 0000000..3feba92 --- /dev/null +++ b/tests/unit/test_comments.py @@ -0,0 +1,280 @@ +"""Tests for `core.comments`: read-time normalize + entry-overlap diagnostic.""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest + +from core.comments import ( + CommentEntryOverlap, + find_comment_entry_overlaps, + normalize_comments, +) +from core.schema import Entry, PageResult, Quadrant, QuadrantPosition + +# --------------------------------------------------------------------------- +# normalize_comments +# --------------------------------------------------------------------------- + + +def test_normalize_none_passes_through() -> None: + assert normalize_comments(None) is None + + +def test_normalize_empty_string_returns_none() -> None: + """The on-disk shape uses None to mean "blank"; an empty-after-trim + string should normalize to None so consumers have one sentinel.""" + assert normalize_comments("") is None + assert normalize_comments(" ") is None + assert normalize_comments("\n\n\n") is None + assert normalize_comments(" \n \n ") is None + + +def test_normalize_collapses_internal_whitespace_within_line() -> None: + """Multiple spaces or tabs within a single line collapse to one space. + Substring matching downstream needs whitespace-stable input.""" + assert normalize_comments("happy birthday mike") == "happy birthday mike" + assert normalize_comments("happy\tbirthday\t\tmike") == "happy birthday mike" + + +def test_normalize_trims_leading_and_trailing_whitespace() -> None: + assert normalize_comments(" hello ") == "hello" + + +def test_normalize_trims_per_line() -> None: + """Each line in a multi-line comment is independently trimmed.""" + raw = " line one \n line two " + assert normalize_comments(raw) == "line one\nline two" + + +def test_normalize_preserves_line_breaks() -> None: + """The on-disk shape promises multi-line entries are joined with a + single newline (see GeminiPageResult.comments_raw). Normalize keeps + those line breaks intact — they carry structural meaning.""" + raw = "happy bday mike\nrequest: stereolab - ping pong" + assert normalize_comments(raw) == "happy bday mike\nrequest: stereolab - ping pong" + + +def test_normalize_drops_blank_interior_lines() -> None: + """Blank lines between content lines collapse — they don't carry the + same structural meaning as content-bearing lines, and downstream + per-line iteration shouldn't have to filter empties.""" + raw = "happy bday mike\n\n\nrequest: stereolab" + assert normalize_comments(raw) == "happy bday mike\nrequest: stereolab" + + +def test_normalize_is_idempotent() -> None: + """`normalize(normalize(x)) == normalize(x)` for any input. A bug + where we ever introduced a state-dependent transform would regress + this. Parameterized across the inputs the other tests exercise.""" + inputs = [ + None, + "", + " ", + "happy birthday", + " line one \n line two ", + "happy bday mike\n\n\nrequest: stereolab", + "single", + ] + for raw in inputs: + once = normalize_comments(raw) + twice = normalize_comments(once) + assert twice == once, f"not idempotent for {raw!r}: {once!r} -> {twice!r}" + + +@pytest.mark.parametrize( + "raw,expected", + [ + (None, None), + ("", None), + (" ", None), + ("single", "single"), + (" hello ", "hello"), + ("a b", "a b"), + ("line one\nline two", "line one\nline two"), + (" line one \n line two ", "line one\nline two"), + ], +) +def test_normalize_parametrized(raw: str | None, expected: str | None) -> None: + assert normalize_comments(raw) == expected + + +# --------------------------------------------------------------------------- +# find_comment_entry_overlaps +# --------------------------------------------------------------------------- + + +def _page( + *, + comments_raw: str | None, + entries_by_position: dict[QuadrantPosition, list[Entry]] | None = None, +) -> PageResult: + """Build a synthetic PageResult for overlap tests. + + Always emits all four quadrants in canonical order so the + `GeminiPageResult` validator is happy. Entries default to empty per + quadrant unless `entries_by_position` supplies them. + """ + entries_by_position = entries_by_position or {} + quadrants = [ + Quadrant( + position=pos, + entries=entries_by_position.get(pos, []), + ) + for pos in ("top_left", "top_right", "bottom_left", "bottom_right") + ] + return PageResult( + page_date_raw=None, + quadrants=quadrants, + comments_raw=comments_raw, + oddities=[], + model_version="test-model", + extracted_at=datetime(2026, 1, 1, tzinfo=UTC), + ) + + +def _entry(row_index: int, raw_text: str) -> Entry: + return Entry(row_index=row_index, raw_text=raw_text, confidence="high") + + +def test_overlaps_none_comments_returns_empty() -> None: + page = _page(comments_raw=None) + assert find_comment_entry_overlaps(page) == [] + + +def test_overlaps_empty_after_normalize_returns_empty() -> None: + """A whitespace-only `comments_raw` normalizes to None — no overlap + work to do.""" + page = _page(comments_raw=" \n \n ") + assert find_comment_entry_overlaps(page) == [] + + +def test_overlaps_no_match_returns_empty() -> None: + page = _page( + comments_raw="happy bday mike", + entries_by_position={ + "top_left": [_entry(0, "JUANA MOLINA - LA PARADOJA")], + }, + ) + assert find_comment_entry_overlaps(page) == [] + + +def test_overlaps_single_match_surfaces_position_and_row() -> None: + """A single comment line that also appears in an entry's raw_text + on the same page is the canonical duplication case.""" + page = _page( + comments_raw="stereolab - ping pong", + entries_by_position={ + "top_right": [ + _entry(0, "JUANA MOLINA - LA PARADOJA"), + _entry(1, "STEREOLAB - PING PONG"), + ], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert len(overlaps) == 1 + overlap = overlaps[0] + assert isinstance(overlap, CommentEntryOverlap) + assert overlap.position == "top_right" + assert overlap.row_index == 1 + assert overlap.matched_text == "stereolab - ping pong" + assert overlap.entry_raw_text == "STEREOLAB - PING PONG" + + +def test_overlaps_case_insensitive() -> None: + """Match is case-insensitive; DJs scribble in mixed case.""" + page = _page( + comments_raw="STEREOLAB - PING PONG", + entries_by_position={ + "top_left": [_entry(0, "stereolab - ping pong")], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert len(overlaps) == 1 + assert overlaps[0].entry_raw_text == "stereolab - ping pong" + + +def test_overlaps_whitespace_insensitive() -> None: + """The matcher collapses whitespace on both sides before comparing — + a comment with weird inner spacing should still match an entry that + doesn't, and vice versa.""" + page = _page( + comments_raw="stereolab - ping pong", + entries_by_position={ + "bottom_left": [_entry(0, "STEREOLAB - PING PONG")], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert len(overlaps) == 1 + assert overlaps[0].position == "bottom_left" + + +def test_overlaps_multi_line_one_matches() -> None: + """Multi-line `comments_raw` is matched per-line — a comments band + that has both a dedication and a song title should surface only + the song-title line when that line duplicates a grid entry.""" + page = _page( + comments_raw="happy bday mike\nstereolab - ping pong", + entries_by_position={ + "top_left": [_entry(2, "STEREOLAB - PING PONG")], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert len(overlaps) == 1 + assert overlaps[0].matched_text == "stereolab - ping pong" + assert overlaps[0].row_index == 2 + + +def test_overlaps_multiple_entries_match_same_line() -> None: + """If the same comment line happens to substring-match more than one + entry on the page, surface one record per (substring, entry) pair — + callers want to see every duplicate, not just the first.""" + page = _page( + comments_raw="stereolab", + entries_by_position={ + "top_left": [_entry(0, "STEREOLAB - PING PONG")], + "bottom_right": [_entry(0, "STEREOLAB - JENNY ONDIOLINE")], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert len(overlaps) == 2 + positions = {o.position for o in overlaps} + assert positions == {"top_left", "bottom_right"} + + +def test_overlaps_substring_match_not_full_line() -> None: + """The comment line is the needle; the entry's raw_text is the + haystack. A short comment that's a substring of a longer entry row + should match.""" + page = _page( + comments_raw="ping pong", + entries_by_position={ + "top_left": [_entry(0, "STEREOLAB - PING PONG")], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert len(overlaps) == 1 + assert overlaps[0].matched_text == "ping pong" + + +def test_overlaps_preserves_quadrant_canonical_order() -> None: + """Multiple matches across quadrants are returned in canonical + quadrant order, with row_index ascending within each quadrant. + Stable ordering keeps diagnostic output diffable across runs.""" + page = _page( + comments_raw="stereolab", + entries_by_position={ + "bottom_right": [_entry(3, "STEREOLAB - LATE")], + "top_left": [ + _entry(0, "STEREOLAB - EARLY"), + _entry(2, "STEREOLAB - MIDDLE"), + ], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert [(o.position, o.row_index) for o in overlaps] == [ + ("top_left", 0), + ("top_left", 2), + ("bottom_right", 3), + ] From 4e73fba53c4ea5d6f808f302db79c5c7d49ff9c2 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Sun, 10 May 2026 16:50:12 -0700 Subject: [PATCH 2/2] review: add comment_line_raw and dedupe per-line normalize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses code-review feedback on the original PR. (1) `CommentEntryOverlap` now carries `comment_line_raw` — the verbatim comments-band line that produced the match, with whitespace and case preserved — parallel to `entry_raw_text` on the entry side. A future audit CLI wants to display both sides exactly as the DJ wrote them; the casefolded form (`matched_text`) stays for matcher introspection. (2) Extracted `_normalize_line` as the shared per-line primitive used by both `normalize_comments` and the matcher; the prior `_normalize_for_match` helper duplicated the same regex+strip composition. (3) Docstring on `find_comment_entry_overlaps` now states the scope explicitly — only `entry.raw_text` is the haystack; `oddities` lists at both quadrant and page levels are intentionally not searched. (4) Dropped the redundant empty-line filter that the refactored line walk no longer needs. --- core/comments.py | 63 +++++++++++++++++++++++-------------- tests/unit/test_comments.py | 41 ++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 23 deletions(-) diff --git a/core/comments.py b/core/comments.py index 8501418..8b60b5d 100644 --- a/core/comments.py +++ b/core/comments.py @@ -41,6 +41,18 @@ _INTRA_LINE_WHITESPACE = re.compile(r"[^\S\n]+") +def _normalize_line(text: str) -> str: + """Collapse intra-line whitespace and trim. The shared per-line primitive. + + Used by `normalize_comments` to build the joined string and by + `find_comment_entry_overlaps` to build the substring needles and to + normalize the entry haystacks. Casefolding lives at the call site — + consumers want both the original-case form (for display) and the + casefolded form (for matching). + """ + return _INTRA_LINE_WHITESPACE.sub(" ", text).strip() + + def normalize_comments(raw: str | None) -> str | None: """Return a read-time normalized form of `comments_raw`. @@ -62,8 +74,7 @@ def normalize_comments(raw: str | None) -> str | None: """ if raw is None: return None - lines = [_INTRA_LINE_WHITESPACE.sub(" ", line).strip() for line in raw.split("\n")] - kept = [line for line in lines if line] + kept = [line for line in (_normalize_line(line) for line in raw.split("\n")) if line] if not kept: return None return "\n".join(kept) @@ -83,6 +94,13 @@ class CommentEntryOverlap(BaseModel): whitespace-collapsed — this is the form the matcher compared, not the verbatim on-disk text.""" + comment_line_raw: str + """The verbatim comments-band line that produced the match, exactly + as it appears in `page.comments_raw` (whitespace and case preserved). + Parallel to `entry_raw_text` on the entry side — a caller that wants + to display the duplicate to a human shows this and `entry_raw_text` + together; the matcher's casefolded form lives in `matched_text`.""" + position: QuadrantPosition """Which quadrant the matched entry lives in.""" @@ -112,6 +130,12 @@ def find_comment_entry_overlaps(page: PageResult) -> list[CommentEntryOverlap]: duplicates a grid row — the unrelated dedication doesn't pollute the diagnostic. + Scope: only ``entry.raw_text`` is matched against. Page-level and + quadrant-level ``oddities`` lists are intentionally not searched — + they have their own non-overlapping role (surface unmodeled + phenomena), and a "comment also appears as an oddity" duplicate is + not what callers of this diagnostic are asking about. + Records come back in canonical quadrant order (top_left, top_right, bottom_left, bottom_right) with ascending ``row_index`` within each quadrant. Stable ordering keeps diagnostic output diffable across @@ -119,42 +143,35 @@ def find_comment_entry_overlaps(page: PageResult) -> list[CommentEntryOverlap]: Pure: no mutation of the input ``page``, no I/O. """ - normalized = normalize_comments(page.comments_raw) - if normalized is None: + if page.comments_raw is None: return [] - # Case-insensitive matching is symmetric — fold both sides to the - # same case so a comments band scribbled in caps still matches an - # entry transcribed in lower case (and vice versa). - needles = [line.casefold() for line in normalized.split("\n") if line] + # Walk the verbatim comments_raw line-by-line so each (raw_line, + # needle) pair stays paired — the record carries both forms. + needles: list[tuple[str, str]] = [] + for raw_line in page.comments_raw.split("\n"): + normalized = _normalize_line(raw_line) + if not normalized: + continue + needles.append((raw_line, normalized.casefold())) if not needles: return [] overlaps: list[CommentEntryOverlap] = [] for quadrant in page.quadrants: for entry in sorted(quadrant.entries, key=lambda e: e.row_index): - entry_normalized = _normalize_for_match(entry.raw_text) - if not entry_normalized: + entry_haystack = _normalize_line(entry.raw_text).casefold() + if not entry_haystack: continue - for needle in needles: - if needle in entry_normalized: + for raw_line, needle in needles: + if needle in entry_haystack: overlaps.append( CommentEntryOverlap( matched_text=needle, + comment_line_raw=raw_line, position=quadrant.position, row_index=entry.row_index, entry_raw_text=entry.raw_text, ) ) return overlaps - - -def _normalize_for_match(text: str) -> str: - """Lowercased, whitespace-collapsed form used for substring matching. - - Mirrors what `normalize_comments` does to each line, plus lowercase. - Kept separate from `normalize_comments` because the entry side - doesn't need the empty-to-None convention — entries always have - non-empty `raw_text` by schema. - """ - return _INTRA_LINE_WHITESPACE.sub(" ", text).strip().casefold() diff --git a/tests/unit/test_comments.py b/tests/unit/test_comments.py index 3feba92..6f1191d 100644 --- a/tests/unit/test_comments.py +++ b/tests/unit/test_comments.py @@ -179,9 +179,50 @@ def test_overlaps_single_match_surfaces_position_and_row() -> None: assert overlap.position == "top_right" assert overlap.row_index == 1 assert overlap.matched_text == "stereolab - ping pong" + assert overlap.comment_line_raw == "stereolab - ping pong" assert overlap.entry_raw_text == "STEREOLAB - PING PONG" +def test_overlaps_carry_verbatim_comment_line() -> None: + """`comment_line_raw` is the verbatim line from `page.comments_raw` — + parallel to `entry_raw_text` on the entry side. A future audit CLI + needs the original case AND any quirky whitespace the DJ wrote, so + the matched record carries both the casefolded form used for matching + AND the verbatim line that triggered it. Without this field, a + consumer that wants to display the comment exactly as it appears on + the page would have nothing to show.""" + page = _page( + comments_raw=" STEREOLAB - Ping Pong ", + entries_by_position={ + "top_left": [_entry(0, "stereolab - ping pong")], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert len(overlaps) == 1 + overlap = overlaps[0] + # `matched_text` is the casefolded, whitespace-collapsed form used + # for matching. + assert overlap.matched_text == "stereolab - ping pong" + # `comment_line_raw` is the verbatim line from `page.comments_raw`, + # whitespace and case preserved. + assert overlap.comment_line_raw == " STEREOLAB - Ping Pong " + + +def test_overlaps_carry_per_line_verbatim_for_multi_line_comments() -> None: + """`comment_line_raw` is per-line, not the whole comments band. A + multi-line comments band where only one line duplicates a grid entry + should surface only that line's verbatim form — not the whole blob.""" + page = _page( + comments_raw="HAPPY BDAY MIKE\nSTEREOLAB - PING PONG\nrequest line @ 962-7768", + entries_by_position={ + "top_left": [_entry(2, "stereolab - ping pong")], + }, + ) + overlaps = find_comment_entry_overlaps(page) + assert len(overlaps) == 1 + assert overlaps[0].comment_line_raw == "STEREOLAB - PING PONG" + + def test_overlaps_case_insensitive() -> None: """Match is case-insensitive; DJs scribble in mixed case.""" page = _page(