From b37a92abd7ecf13845398c07541905ab6a0d062b Mon Sep 17 00:00:00 2001
From: Jake Bromberg <jake@funlandresearch.com>
Date: Sun, 10 May 2026 16:25:45 -0700
Subject: [PATCH 1/2] feat(comments): read-time normalize and entry-overlap
 diagnostics for comments_raw
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirrors the shape of core.continuations: pure read-time functions over PageResult, no I/O, no on-disk mutation. normalize_comments collapses intra-line whitespace, trims per line, drops empty interior lines, and returns None when the result is empty — verbatim on disk stays verbatim per the org data-safety rule. find_comment_entry_overlaps returns one record per (normalized comments line, matching entry) pair where the line is a case- and whitespace-insensitive substring of an entry's raw_text on the same page; results come back in canonical quadrant order with ascending row_index. Categorization, the audit CLI, and any wiring into the pipeline run path are intentionally deferred — this lands the primitives so the calibration scorer (and a future audit tool) can call them.

Closes #34
---
 core/comments.py            | 160 +++++++++++++++++++++
 tests/unit/test_comments.py | 280 ++++++++++++++++++++++++++++++++++++
 2 files changed, 440 insertions(+)
 create mode 100644 core/comments.py
 create mode 100644 tests/unit/test_comments.py

diff --git a/core/comments.py b/core/comments.py
new file mode 100644
index 0000000..8501418
--- /dev/null
+++ b/core/comments.py
@@ -0,0 +1,160 @@
+"""Phase-2 read-time normalization and duplication diagnostics for `comments_raw`.
+
+The Phase-1 prompt captures the bottom-of-page Comments field verbatim
+into `GeminiPageResult.comments_raw` (see `core/schema.py`). Verbatim is
+right for ingestion but awkward to consume: whitespace is ragged, and
+the same handwritten text sometimes appears in both the comments band
+AND inside one of the four hour-quadrants (e.g. a DJ scribbles a song
+title in the comments band that they also wrote into the grid).
+
+This module mirrors the shape of `core.continuations`: pure read-time
+functions, no I/O, no on-disk mutation. The on-disk `comments_raw`
+stays verbatim per the org data-safety rule. Callers compute the
+normalized form at read time.
+
+Two primitives:
+
+  * `normalize_comments(raw)` — collapses internal whitespace per line,
+    trims each line, drops empty interior lines, returns None if the
+    result is empty. Idempotent.
+  * `find_comment_entry_overlaps(page)` — diagnostic: returns one record
+    per (comment line, entry) pair where a normalized comment line is a
+    substring of an entry's `raw_text` after case- and
+    whitespace-insensitive matching. Empty list when there's nothing to
+    flag.
+
+No categorization here — the ticket explicitly defers that to a later
+pass once we have a corpus of filled-in comments bands to look at.
+"""
+
+from __future__ import annotations
+
+import re
+
+from pydantic import BaseModel, NonNegativeInt
+
+from core.schema import PageResult, QuadrantPosition
+
+# Multiple runs of any whitespace character on a single line collapse to
+# one ASCII space. Newlines are handled separately (line breaks carry
+# structural meaning — see the `comments_raw` field description).
+_INTRA_LINE_WHITESPACE = re.compile(r"[^\S\n]+")
+
+
+def normalize_comments(raw: str | None) -> str | None:
+    """Return a read-time normalized form of `comments_raw`.
+
+    Pure and idempotent. None passes through. The transform:
+
+      * splits on newlines so multi-line entries stay multi-line,
+      * collapses runs of intra-line whitespace (spaces, tabs, etc.) to
+        a single space within each line,
+      * trims leading/trailing whitespace on each line,
+      * drops lines that are empty after trimming — blank interior lines
+        are noise, and downstream per-line iteration shouldn't have to
+        filter them,
+      * returns None when the result is empty, so callers have one
+        sentinel for "nothing useful here" instead of having to also
+        check for ``""``.
+
+    The verbatim on-disk shape is preserved; callers run this at read
+    time when they want a cleaner form.
+    """
+    if raw is None:
+        return None
+    lines = [_INTRA_LINE_WHITESPACE.sub(" ", line).strip() for line in raw.split("\n")]
+    kept = [line for line in lines if line]
+    if not kept:
+        return None
+    return "\n".join(kept)
+
+
+class CommentEntryOverlap(BaseModel):
+    """One (comment line, entry) duplication record.
+
+    Surfaces the case where text in the bottom comments band also
+    appears inside a grid entry on the same page. The fields are the
+    minimum a caller (calibration scorer, future audit CLI) needs to
+    locate the duplicate on the page and inspect both sides.
+    """
+
+    matched_text: str
+    """The normalized comments substring that matched. Lowercase,
+    whitespace-collapsed — this is the form the matcher compared, not
+    the verbatim on-disk text."""
+
+    position: QuadrantPosition
+    """Which quadrant the matched entry lives in."""
+
+    row_index: NonNegativeInt
+    """0-based row position of the matched entry within its quadrant."""
+
+    entry_raw_text: str
+    """The full `raw_text` of the matched entry, verbatim from disk —
+    so a caller can show context around the substring match."""
+
+
+def find_comment_entry_overlaps(page: PageResult) -> list[CommentEntryOverlap]:
+    """Diagnose duplication between `comments_raw` and grid entries.
+
+    Returns one record per (normalized comment line, matching entry)
+    pair where the comment line is a substring of an entry's
+    ``raw_text`` after both sides are normalized (lowercased and
+    whitespace-collapsed). Empty list when:
+
+      * ``page.comments_raw`` is None,
+      * normalization yields None (whitespace-only comments),
+      * no entry contains any comment line.
+
+    Matching is **per-line**, not against the whole normalized comments
+    blob. A two-line comments band with one dedication and one song
+    title surfaces only the song-title line if just that line
+    duplicates a grid row — the unrelated dedication doesn't pollute
+    the diagnostic.
+
+    Records come back in canonical quadrant order (top_left, top_right,
+    bottom_left, bottom_right) with ascending ``row_index`` within each
+    quadrant. Stable ordering keeps diagnostic output diffable across
+    runs.
+
+    Pure: no mutation of the input ``page``, no I/O.
+    """
+    normalized = normalize_comments(page.comments_raw)
+    if normalized is None:
+        return []
+
+    # Case-insensitive matching is symmetric — fold both sides to the
+    # same case so a comments band scribbled in caps still matches an
+    # entry transcribed in lower case (and vice versa).
+    needles = [line.casefold() for line in normalized.split("\n") if line]
+    if not needles:
+        return []
+
+    overlaps: list[CommentEntryOverlap] = []
+    for quadrant in page.quadrants:
+        for entry in sorted(quadrant.entries, key=lambda e: e.row_index):
+            entry_normalized = _normalize_for_match(entry.raw_text)
+            if not entry_normalized:
+                continue
+            for needle in needles:
+                if needle in entry_normalized:
+                    overlaps.append(
+                        CommentEntryOverlap(
+                            matched_text=needle,
+                            position=quadrant.position,
+                            row_index=entry.row_index,
+                            entry_raw_text=entry.raw_text,
+                        )
+                    )
+    return overlaps
+
+
+def _normalize_for_match(text: str) -> str:
+    """Lowercased, whitespace-collapsed form used for substring matching.
+
+    Mirrors what `normalize_comments` does to each line, plus lowercase.
+    Kept separate from `normalize_comments` because the entry side
+    doesn't need the empty-to-None convention — entries always have
+    non-empty `raw_text` by schema.
+    """
+    return _INTRA_LINE_WHITESPACE.sub(" ", text).strip().casefold()
diff --git a/tests/unit/test_comments.py b/tests/unit/test_comments.py
new file mode 100644
index 0000000..3feba92
--- /dev/null
+++ b/tests/unit/test_comments.py
@@ -0,0 +1,280 @@
+"""Tests for `core.comments`: read-time normalize + entry-overlap diagnostic."""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+
+import pytest
+
+from core.comments import (
+    CommentEntryOverlap,
+    find_comment_entry_overlaps,
+    normalize_comments,
+)
+from core.schema import Entry, PageResult, Quadrant, QuadrantPosition
+
+# ---------------------------------------------------------------------------
+# normalize_comments
+# ---------------------------------------------------------------------------
+
+
+def test_normalize_none_passes_through() -> None:
+    assert normalize_comments(None) is None
+
+
+def test_normalize_empty_string_returns_none() -> None:
+    """The on-disk shape uses None to mean "blank"; an empty-after-trim
+    string should normalize to None so consumers have one sentinel."""
+    assert normalize_comments("") is None
+    assert normalize_comments("   ") is None
+    assert normalize_comments("\n\n\n") is None
+    assert normalize_comments("  \n  \n  ") is None
+
+
+def test_normalize_collapses_internal_whitespace_within_line() -> None:
+    """Multiple spaces or tabs within a single line collapse to one space.
+    Substring matching downstream needs whitespace-stable input."""
+    assert normalize_comments("happy   birthday    mike") == "happy birthday mike"
+    assert normalize_comments("happy\tbirthday\t\tmike") == "happy birthday mike"
+
+
+def test_normalize_trims_leading_and_trailing_whitespace() -> None:
+    assert normalize_comments("  hello  ") == "hello"
+
+
+def test_normalize_trims_per_line() -> None:
+    """Each line in a multi-line comment is independently trimmed."""
+    raw = "  line one  \n  line two  "
+    assert normalize_comments(raw) == "line one\nline two"
+
+
+def test_normalize_preserves_line_breaks() -> None:
+    """The on-disk shape promises multi-line entries are joined with a
+    single newline (see GeminiPageResult.comments_raw). Normalize keeps
+    those line breaks intact — they carry structural meaning."""
+    raw = "happy bday mike\nrequest: stereolab - ping pong"
+    assert normalize_comments(raw) == "happy bday mike\nrequest: stereolab - ping pong"
+
+
+def test_normalize_drops_blank_interior_lines() -> None:
+    """Blank lines between content lines collapse — they don't carry the
+    same structural meaning as content-bearing lines, and downstream
+    per-line iteration shouldn't have to filter empties."""
+    raw = "happy bday mike\n\n\nrequest: stereolab"
+    assert normalize_comments(raw) == "happy bday mike\nrequest: stereolab"
+
+
+def test_normalize_is_idempotent() -> None:
+    """`normalize(normalize(x)) == normalize(x)` for any input. A bug
+    where we ever introduced a state-dependent transform would regress
+    this. Parameterized across the inputs the other tests exercise."""
+    inputs = [
+        None,
+        "",
+        "   ",
+        "happy   birthday",
+        "  line one  \n  line two  ",
+        "happy bday mike\n\n\nrequest: stereolab",
+        "single",
+    ]
+    for raw in inputs:
+        once = normalize_comments(raw)
+        twice = normalize_comments(once)
+        assert twice == once, f"not idempotent for {raw!r}: {once!r} -> {twice!r}"
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        (None, None),
+        ("", None),
+        ("   ", None),
+        ("single", "single"),
+        ("  hello  ", "hello"),
+        ("a   b", "a b"),
+        ("line one\nline two", "line one\nline two"),
+        ("  line one  \n  line two  ", "line one\nline two"),
+    ],
+)
+def test_normalize_parametrized(raw: str | None, expected: str | None) -> None:
+    assert normalize_comments(raw) == expected
+
+
+# ---------------------------------------------------------------------------
+# find_comment_entry_overlaps
+# ---------------------------------------------------------------------------
+
+
+def _page(
+    *,
+    comments_raw: str | None,
+    entries_by_position: dict[QuadrantPosition, list[Entry]] | None = None,
+) -> PageResult:
+    """Build a synthetic PageResult for overlap tests.
+
+    Always emits all four quadrants in canonical order so the
+    `GeminiPageResult` validator is happy. Entries default to empty per
+    quadrant unless `entries_by_position` supplies them.
+    """
+    entries_by_position = entries_by_position or {}
+    quadrants = [
+        Quadrant(
+            position=pos,
+            entries=entries_by_position.get(pos, []),
+        )
+        for pos in ("top_left", "top_right", "bottom_left", "bottom_right")
+    ]
+    return PageResult(
+        page_date_raw=None,
+        quadrants=quadrants,
+        comments_raw=comments_raw,
+        oddities=[],
+        model_version="test-model",
+        extracted_at=datetime(2026, 1, 1, tzinfo=UTC),
+    )
+
+
+def _entry(row_index: int, raw_text: str) -> Entry:
+    return Entry(row_index=row_index, raw_text=raw_text, confidence="high")
+
+
+def test_overlaps_none_comments_returns_empty() -> None:
+    page = _page(comments_raw=None)
+    assert find_comment_entry_overlaps(page) == []
+
+
+def test_overlaps_empty_after_normalize_returns_empty() -> None:
+    """A whitespace-only `comments_raw` normalizes to None — no overlap
+    work to do."""
+    page = _page(comments_raw="   \n  \n  ")
+    assert find_comment_entry_overlaps(page) == []
+
+
+def test_overlaps_no_match_returns_empty() -> None:
+    page = _page(
+        comments_raw="happy bday mike",
+        entries_by_position={
+            "top_left": [_entry(0, "JUANA MOLINA - LA PARADOJA")],
+        },
+    )
+    assert find_comment_entry_overlaps(page) == []
+
+
+def test_overlaps_single_match_surfaces_position_and_row() -> None:
+    """A single comment line that also appears in an entry's raw_text
+    on the same page is the canonical duplication case."""
+    page = _page(
+        comments_raw="stereolab - ping pong",
+        entries_by_position={
+            "top_right": [
+                _entry(0, "JUANA MOLINA - LA PARADOJA"),
+                _entry(1, "STEREOLAB - PING PONG"),
+            ],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert len(overlaps) == 1
+    overlap = overlaps[0]
+    assert isinstance(overlap, CommentEntryOverlap)
+    assert overlap.position == "top_right"
+    assert overlap.row_index == 1
+    assert overlap.matched_text == "stereolab - ping pong"
+    assert overlap.entry_raw_text == "STEREOLAB - PING PONG"
+
+
+def test_overlaps_case_insensitive() -> None:
+    """Match is case-insensitive; DJs scribble in mixed case."""
+    page = _page(
+        comments_raw="STEREOLAB - PING PONG",
+        entries_by_position={
+            "top_left": [_entry(0, "stereolab - ping pong")],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert len(overlaps) == 1
+    assert overlaps[0].entry_raw_text == "stereolab - ping pong"
+
+
+def test_overlaps_whitespace_insensitive() -> None:
+    """The matcher collapses whitespace on both sides before comparing —
+    a comment with weird inner spacing should still match an entry that
+    doesn't, and vice versa."""
+    page = _page(
+        comments_raw="stereolab    -   ping  pong",
+        entries_by_position={
+            "bottom_left": [_entry(0, "STEREOLAB - PING PONG")],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert len(overlaps) == 1
+    assert overlaps[0].position == "bottom_left"
+
+
+def test_overlaps_multi_line_one_matches() -> None:
+    """Multi-line `comments_raw` is matched per-line — a comments band
+    that has both a dedication and a song title should surface only
+    the song-title line when that line duplicates a grid entry."""
+    page = _page(
+        comments_raw="happy bday mike\nstereolab - ping pong",
+        entries_by_position={
+            "top_left": [_entry(2, "STEREOLAB - PING PONG")],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert len(overlaps) == 1
+    assert overlaps[0].matched_text == "stereolab - ping pong"
+    assert overlaps[0].row_index == 2
+
+
+def test_overlaps_multiple_entries_match_same_line() -> None:
+    """If the same comment line happens to substring-match more than one
+    entry on the page, surface one record per (substring, entry) pair —
+    callers want to see every duplicate, not just the first."""
+    page = _page(
+        comments_raw="stereolab",
+        entries_by_position={
+            "top_left": [_entry(0, "STEREOLAB - PING PONG")],
+            "bottom_right": [_entry(0, "STEREOLAB - JENNY ONDIOLINE")],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert len(overlaps) == 2
+    positions = {o.position for o in overlaps}
+    assert positions == {"top_left", "bottom_right"}
+
+
+def test_overlaps_substring_match_not_full_line() -> None:
+    """The comment line is the needle; the entry's raw_text is the
+    haystack. A short comment that's a substring of a longer entry row
+    should match."""
+    page = _page(
+        comments_raw="ping pong",
+        entries_by_position={
+            "top_left": [_entry(0, "STEREOLAB - PING PONG")],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert len(overlaps) == 1
+    assert overlaps[0].matched_text == "ping pong"
+
+
+def test_overlaps_preserves_quadrant_canonical_order() -> None:
+    """Multiple matches across quadrants are returned in canonical
+    quadrant order, with row_index ascending within each quadrant.
+    Stable ordering keeps diagnostic output diffable across runs."""
+    page = _page(
+        comments_raw="stereolab",
+        entries_by_position={
+            "bottom_right": [_entry(3, "STEREOLAB - LATE")],
+            "top_left": [
+                _entry(0, "STEREOLAB - EARLY"),
+                _entry(2, "STEREOLAB - MIDDLE"),
+            ],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert [(o.position, o.row_index) for o in overlaps] == [
+        ("top_left", 0),
+        ("top_left", 2),
+        ("bottom_right", 3),
+    ]

From 4e73fba53c4ea5d6f808f302db79c5c7d49ff9c2 Mon Sep 17 00:00:00 2001
From: Jake Bromberg <jake@funlandresearch.com>
Date: Sun, 10 May 2026 16:50:12 -0700
Subject: [PATCH 2/2] review: add comment_line_raw and dedupe per-line
 normalize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses code-review feedback on the original PR. (1) `CommentEntryOverlap` now carries `comment_line_raw` — the verbatim comments-band line that produced the match, with whitespace and case preserved — parallel to `entry_raw_text` on the entry side. A future audit CLI wants to display both sides exactly as the DJ wrote them; the casefolded form (`matched_text`) stays for matcher introspection. (2) Extracted `_normalize_line` as the shared per-line primitive used by both `normalize_comments` and the matcher; the prior `_normalize_for_match` helper duplicated the same regex+strip composition. (3) Docstring on `find_comment_entry_overlaps` now states the scope explicitly — only `entry.raw_text` is the haystack; `oddities` lists at both quadrant and page levels are intentionally not searched. (4) Dropped the redundant empty-line filter that the refactored line walk no longer needs.
---
 core/comments.py            | 63 +++++++++++++++++++++++--------------
 tests/unit/test_comments.py | 41 ++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 23 deletions(-)

diff --git a/core/comments.py b/core/comments.py
index 8501418..8b60b5d 100644
--- a/core/comments.py
+++ b/core/comments.py
@@ -41,6 +41,18 @@
 _INTRA_LINE_WHITESPACE = re.compile(r"[^\S\n]+")
 
 
+def _normalize_line(text: str) -> str:
+    """Collapse intra-line whitespace and trim. The shared per-line primitive.
+
+    Used by `normalize_comments` to build the joined string and by
+    `find_comment_entry_overlaps` to build the substring needles and to
+    normalize the entry haystacks. Casefolding lives at the call site —
+    consumers want both the original-case form (for display) and the
+    casefolded form (for matching).
+    """
+    return _INTRA_LINE_WHITESPACE.sub(" ", text).strip()
+
+
 def normalize_comments(raw: str | None) -> str | None:
     """Return a read-time normalized form of `comments_raw`.
 
@@ -62,8 +74,7 @@ def normalize_comments(raw: str | None) -> str | None:
     """
     if raw is None:
         return None
-    lines = [_INTRA_LINE_WHITESPACE.sub(" ", line).strip() for line in raw.split("\n")]
-    kept = [line for line in lines if line]
+    kept = [line for line in (_normalize_line(line) for line in raw.split("\n")) if line]
     if not kept:
         return None
     return "\n".join(kept)
@@ -83,6 +94,13 @@ class CommentEntryOverlap(BaseModel):
     whitespace-collapsed — this is the form the matcher compared, not
     the verbatim on-disk text."""
 
+    comment_line_raw: str
+    """The verbatim comments-band line that produced the match, exactly
+    as it appears in `page.comments_raw` (whitespace and case preserved).
+    Parallel to `entry_raw_text` on the entry side — a caller that wants
+    to display the duplicate to a human shows this and `entry_raw_text`
+    together; the matcher's casefolded form lives in `matched_text`."""
+
     position: QuadrantPosition
     """Which quadrant the matched entry lives in."""
 
@@ -112,6 +130,12 @@ def find_comment_entry_overlaps(page: PageResult) -> list[CommentEntryOverlap]:
     duplicates a grid row — the unrelated dedication doesn't pollute
     the diagnostic.
 
+    Scope: only ``entry.raw_text`` is matched against. Page-level and
+    quadrant-level ``oddities`` lists are intentionally not searched —
+    they have their own non-overlapping role (surface unmodeled
+    phenomena), and a "comment also appears as an oddity" duplicate is
+    not what callers of this diagnostic are asking about.
+
     Records come back in canonical quadrant order (top_left, top_right,
     bottom_left, bottom_right) with ascending ``row_index`` within each
     quadrant. Stable ordering keeps diagnostic output diffable across
@@ -119,42 +143,35 @@ def find_comment_entry_overlaps(page: PageResult) -> list[CommentEntryOverlap]:
 
     Pure: no mutation of the input ``page``, no I/O.
     """
-    normalized = normalize_comments(page.comments_raw)
-    if normalized is None:
+    if page.comments_raw is None:
         return []
 
-    # Case-insensitive matching is symmetric — fold both sides to the
-    # same case so a comments band scribbled in caps still matches an
-    # entry transcribed in lower case (and vice versa).
-    needles = [line.casefold() for line in normalized.split("\n") if line]
+    # Walk the verbatim comments_raw line-by-line so each (raw_line,
+    # needle) pair stays paired — the record carries both forms.
+    needles: list[tuple[str, str]] = []
+    for raw_line in page.comments_raw.split("\n"):
+        normalized = _normalize_line(raw_line)
+        if not normalized:
+            continue
+        needles.append((raw_line, normalized.casefold()))
     if not needles:
         return []
 
     overlaps: list[CommentEntryOverlap] = []
     for quadrant in page.quadrants:
         for entry in sorted(quadrant.entries, key=lambda e: e.row_index):
-            entry_normalized = _normalize_for_match(entry.raw_text)
-            if not entry_normalized:
+            entry_haystack = _normalize_line(entry.raw_text).casefold()
+            if not entry_haystack:
                 continue
-            for needle in needles:
-                if needle in entry_normalized:
+            for raw_line, needle in needles:
+                if needle in entry_haystack:
                     overlaps.append(
                         CommentEntryOverlap(
                             matched_text=needle,
+                            comment_line_raw=raw_line,
                             position=quadrant.position,
                             row_index=entry.row_index,
                             entry_raw_text=entry.raw_text,
                         )
                     )
     return overlaps
-
-
-def _normalize_for_match(text: str) -> str:
-    """Lowercased, whitespace-collapsed form used for substring matching.
-
-    Mirrors what `normalize_comments` does to each line, plus lowercase.
-    Kept separate from `normalize_comments` because the entry side
-    doesn't need the empty-to-None convention — entries always have
-    non-empty `raw_text` by schema.
-    """
-    return _INTRA_LINE_WHITESPACE.sub(" ", text).strip().casefold()
diff --git a/tests/unit/test_comments.py b/tests/unit/test_comments.py
index 3feba92..6f1191d 100644
--- a/tests/unit/test_comments.py
+++ b/tests/unit/test_comments.py
@@ -179,9 +179,50 @@ def test_overlaps_single_match_surfaces_position_and_row() -> None:
     assert overlap.position == "top_right"
     assert overlap.row_index == 1
     assert overlap.matched_text == "stereolab - ping pong"
+    assert overlap.comment_line_raw == "stereolab - ping pong"
     assert overlap.entry_raw_text == "STEREOLAB - PING PONG"
 
 
+def test_overlaps_carry_verbatim_comment_line() -> None:
+    """`comment_line_raw` is the verbatim line from `page.comments_raw` —
+    parallel to `entry_raw_text` on the entry side. A future audit CLI
+    needs the original case AND any quirky whitespace the DJ wrote, so
+    the matched record carries both the casefolded form used for matching
+    AND the verbatim line that triggered it. Without this field, a
+    consumer that wants to display the comment exactly as it appears on
+    the page would have nothing to show."""
+    page = _page(
+        comments_raw="  STEREOLAB - Ping Pong  ",
+        entries_by_position={
+            "top_left": [_entry(0, "stereolab - ping pong")],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert len(overlaps) == 1
+    overlap = overlaps[0]
+    # `matched_text` is the casefolded, whitespace-collapsed form used
+    # for matching.
+    assert overlap.matched_text == "stereolab - ping pong"
+    # `comment_line_raw` is the verbatim line from `page.comments_raw`,
+    # whitespace and case preserved.
+    assert overlap.comment_line_raw == "  STEREOLAB - Ping Pong  "
+
+
+def test_overlaps_carry_per_line_verbatim_for_multi_line_comments() -> None:
+    """`comment_line_raw` is per-line, not the whole comments band. A
+    multi-line comments band where only one line duplicates a grid entry
+    should surface only that line's verbatim form — not the whole blob."""
+    page = _page(
+        comments_raw="HAPPY BDAY MIKE\nSTEREOLAB - PING PONG\nrequest line @ 962-7768",
+        entries_by_position={
+            "top_left": [_entry(2, "stereolab - ping pong")],
+        },
+    )
+    overlaps = find_comment_entry_overlaps(page)
+    assert len(overlaps) == 1
+    assert overlaps[0].comment_line_raw == "STEREOLAB - PING PONG"
+
+
 def test_overlaps_case_insensitive() -> None:
     """Match is case-insensitive; DJs scribble in mixed case."""
     page = _page(