Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions core/comments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
"""Phase-2 read-time normalization and duplication diagnostics for `comments_raw`.

The Phase-1 prompt captures the bottom-of-page Comments field verbatim
into `GeminiPageResult.comments_raw` (see `core/schema.py`). Verbatim is
right for ingestion but awkward to consume: whitespace is ragged, and
the same handwritten text sometimes appears in both the comments band
AND inside one of the four hour-quadrants (e.g. a DJ scribbles a song
title in the comments band that they also wrote into the grid).

This module mirrors the shape of `core.continuations`: pure read-time
functions, no I/O, no on-disk mutation. The on-disk `comments_raw`
stays verbatim per the org data-safety rule. Callers compute the
normalized form at read time.

Two primitives:

* `normalize_comments(raw)` — collapses internal whitespace per line,
trims each line, drops every line that is empty after trimming
(leading, interior, or trailing), returns None if the result is
empty. Idempotent.
* `find_comment_entry_overlaps(page)` — diagnostic: returns one record
per (comment line, entry) pair where a normalized comment line is a
substring of an entry's `raw_text` after case- and
whitespace-insensitive matching. Empty list when there's nothing to
flag.

No categorization here — the ticket explicitly defers that to a later
pass once we have a corpus of filled-in comments bands to look at.
"""

from __future__ import annotations

import re

from pydantic import BaseModel, NonNegativeInt

from core.schema import PageResult, QuadrantPosition

# Multiple runs of any whitespace character on a single line collapse to
# one ASCII space. Newlines are handled separately (line breaks carry
# structural meaning — see the `comments_raw` field description).
_INTRA_LINE_WHITESPACE = re.compile(r"[^\S\n]+")


def _normalize_line(text: str) -> str:
"""Collapse intra-line whitespace and trim. The shared per-line primitive.

Used by `normalize_comments` to build the joined string and by
`find_comment_entry_overlaps` to build the substring needles and to
normalize the entry haystacks. Casefolding lives at the call site —
consumers want both the original-case form (for display) and the
casefolded form (for matching).
"""
return _INTRA_LINE_WHITESPACE.sub(" ", text).strip()


def normalize_comments(raw: str | None) -> str | None:
    """Produce the cleaned-up, read-time view of a `comments_raw` value.

    The input is never modified and nothing is written back; this only
    computes a tidier string for consumers. Running the function on its
    own output returns the same string (idempotent).

    Steps, in order:

    * ``None`` is returned unchanged.
    * The text is split on ``"\\n"`` so that multi-line comments keep
      their line structure.
    * Each line has runs of intra-line whitespace (spaces, tabs, ...)
      collapsed to one space and its ends trimmed.
    * Lines that end up empty are discarded, so per-line consumers do
      not need to filter blanks themselves.
    * If no line survives, ``None`` is returned rather than ``""`` —
      callers get a single "nothing useful here" sentinel.

    Otherwise the surviving lines are re-joined with ``"\\n"``.
    """
    if raw is None:
        return None

    cleaned_lines: list[str] = []
    for candidate in raw.split("\n"):
        cleaned = _normalize_line(candidate)
        if cleaned:
            cleaned_lines.append(cleaned)

    if cleaned_lines:
        return "\n".join(cleaned_lines)
    return None


class CommentEntryOverlap(BaseModel):
    """One (comment line, entry) duplication record.

    Surfaces the case where text in the bottom comments band also
    appears inside a grid entry on the same page. The fields are the
    minimum a caller (calibration scorer, future audit CLI) needs to
    locate the duplicate on the page and inspect both sides.

    `find_comment_entry_overlaps` emits one instance for every
    (comment line, entry) pair it matches.
    """

    # The needle the matcher compared: the comment line after whitespace
    # collapsing and `str.casefold()`. Casefolding is more aggressive than
    # lowercasing (e.g. "ß" becomes "ss"), so this is for matching and
    # grouping, not for display.
    matched_text: str
    """The normalized comments substring that matched. Lowercase,
    whitespace-collapsed — this is the form the matcher compared, not
    the verbatim on-disk text."""

    # One element of `page.comments_raw.split("\n")`, stored untouched.
    comment_line_raw: str
    """The verbatim comments-band line that produced the match, exactly
    as it appears in `page.comments_raw` (whitespace and case preserved).
    Parallel to `entry_raw_text` on the entry side — a caller that wants
    to display the duplicate to a human shows this and `entry_raw_text`
    together; the matcher's casefolded form lives in `matched_text`."""

    # Copied from the `position` of the quadrant that owns the entry.
    position: QuadrantPosition
    """Which quadrant the matched entry lives in."""

    # Copied from the entry's own `row_index`; typed as NonNegativeInt so
    # negative values are rejected at construction time.
    row_index: NonNegativeInt
    """0-based row position of the matched entry within its quadrant."""

    # The entry's `raw_text` as stored, before any normalization.
    entry_raw_text: str
    """The full `raw_text` of the matched entry, verbatim from disk —
    so a caller can show context around the substring match."""


def find_comment_entry_overlaps(page: PageResult) -> list[CommentEntryOverlap]:
    """Diagnose duplication between `comments_raw` and grid entries.

    Returns one record per (comment line, matching entry) pair where
    the comment line is a substring of an entry's ``raw_text`` after
    both sides are normalized (whitespace-collapsed and casefolded).
    Empty list when:

    * ``page.comments_raw`` is None,
    * every comments line is blank after normalization,
    * no entry contains any comment line.

    Matching is **per-line**, not against the whole normalized comments
    blob. A two-line comments band with one dedication and one song
    title surfaces only the song-title line if just that line
    duplicates a grid row — the unrelated dedication doesn't pollute
    the diagnostic.

    Line breaks inside an entry's ``raw_text`` are treated as ordinary
    whitespace on the haystack side. A needle is always a single line,
    so a newline left in the haystack could only hide a match (e.g.
    entry ``"song\\ntitle"`` vs. comment ``"song title"``).

    Scope: only ``entry.raw_text`` is matched against. Page-level and
    quadrant-level ``oddities`` lists are intentionally not searched —
    they have their own non-overlapping role (surface unmodeled
    phenomena), and a "comment also appears as an oddity" duplicate is
    not what callers of this diagnostic are asking about.

    Ordering: quadrants are visited in the order ``page.quadrants``
    yields them, entries by ascending ``row_index`` within each
    quadrant, and needles in comments-band order within each entry.
    NOTE(review): the canonical quadrant order (top_left, top_right,
    bottom_left, bottom_right) therefore relies on ``page.quadrants``
    already being stored that way — confirm against `core/schema.py`.
    Stable ordering keeps diagnostic output diffable across runs.

    Pure: no mutation of the input ``page``, no I/O.

    Args:
        page: The page whose comments band and grid entries are compared.

    Returns:
        The overlap records, possibly empty. The verbatim comment line
        and entry text are carried alongside the casefolded needle.
    """
    if page.comments_raw is None:
        return []

    # Walk the verbatim comments_raw line-by-line so each (raw_line,
    # needle) pair stays paired — the record carries both forms.
    needles: list[tuple[str, str]] = []
    for raw_line in page.comments_raw.split("\n"):
        normalized = _normalize_line(raw_line)
        if normalized:
            needles.append((raw_line, normalized.casefold()))
    if not needles:
        return []

    overlaps: list[CommentEntryOverlap] = []
    for quadrant in page.quadrants:
        # sorted() returns a new list, so the quadrant's own entry order
        # is left untouched.
        for entry in sorted(quadrant.entries, key=lambda e: e.row_index):
            # `_normalize_line` deliberately keeps "\n"; fold newlines to
            # spaces first so the haystack is fully whitespace-collapsed.
            flattened = entry.raw_text.replace("\n", " ")
            entry_haystack = _normalize_line(flattened).casefold()
            if not entry_haystack:
                continue
            overlaps.extend(
                CommentEntryOverlap(
                    matched_text=needle,
                    comment_line_raw=raw_line,
                    position=quadrant.position,
                    row_index=entry.row_index,
                    entry_raw_text=entry.raw_text,
                )
                for raw_line, needle in needles
                if needle in entry_haystack
            )
    return overlaps
Loading
Loading