diff --git a/core/continuations.py b/core/continuations.py index 1b8d2d0..a422cde 100644 --- a/core/continuations.py +++ b/core/continuations.py @@ -39,13 +39,15 @@ def merge_continuations(entries: list[Entry]) -> list[Entry]: continuation still reports "high". Substring matching doesn't care; if a future re-OCR queue filters by confidence and wants to flag merged rows, take the min over the contributing rows. - * `artist_guess` / `track_guess` — kept as-is. After a merge - these may not reflect the joined `raw_text`; downstream - consumers that care should re-parse from `raw_text`. * `row_index` — the predecessor's index is preserved. The continuation row's index is dropped from the merged view; the on-disk JSON still has both grid positions if needed. + Callers that want an artist/track split from the merged `raw_text` + should apply `core.parse.parse_artist_track` after merging — the + split is deterministic at read time and is intentionally not stored + on `Entry`. + The merge is **lossy with respect to internal whitespace** at the wrap boundary — multiple spaces / tabs collapse to a single space. Verbatim whitespace round-tripping requires reading the on-disk diff --git a/core/parse.py b/core/parse.py new file mode 100644 index 0000000..9bf015f --- /dev/null +++ b/core/parse.py @@ -0,0 +1,60 @@ +"""Read-time artist/track split for `Entry.raw_text`. + +The Phase-1 schema asked Gemini to fill `artist_guess` and `track_guess` +alongside the verbatim `raw_text`. The split is deterministic (separate +on the first whitespace-bracketed dash) and producing it on the model +side spends output tokens for no analytic benefit. The fields were +dropped from the response schema; this module is what downstream +consumers call at read time to recover them. + +Sibling pattern: `core.continuations.merge_continuations`, +`core.comments.normalize_comments`. Pure function, no I/O. The on-disk +shape stays verbatim; the split happens in memory on read. + +Separator convention: a single ASCII hyphen, en-dash, or em-dash with +whitespace on both sides. Compound band names ("X-Ray Spex") have no +surrounding whitespace and are NOT split. Multiple separators in one +line split on the first; the track side keeps the rest. +""" + +from __future__ import annotations + +import re + +# ASCII hyphen, en-dash (U+2013), em-dash (U+2014). Whitespace-bracketed +# so we don't split compound band names like "X-Ray Spex" on their +# internal hyphen. +_SEPARATOR = re.compile(r"\s+[-–—]\s+") + + +def parse_artist_track(raw_text: str | None) -> tuple[str | None, str | None]: + """Split a row's verbatim text into best-effort (artist, track) parts. + + Returns: + * `(None, None)` when the input is None, empty, or whitespace-only. + * `(artist, None)` when no separator is present — the line names + an artist with no track (common for early-90s flowsheets, e.g. + "STEREOLAB" alone). + * `(None, track)` when the separator opens the line with nothing + before it (e.g. " - LA PARADOJA"). + * `(artist, None)` when the separator closes the line with nothing + after it (e.g. "JUANA MOLINA - "). + * `(artist, track)` for the dominant case ("ARTIST - TRACK"). When + multiple separators appear, we split on the first only and the + rest stays in the track. + + Each side is stripped of leading/trailing whitespace. Empty sides + become None so callers have one sentinel for "nothing useful here". + """ + if raw_text is None: + return (None, None) + if not raw_text.strip(): + return (None, None) + # Split the unstripped text so a leading or trailing " - " still has + # the whitespace on both sides of the dash that the regex needs. + split = _SEPARATOR.split(raw_text, maxsplit=1) + if len(split) == 1: + return (raw_text.strip(), None) + artist = split[0].strip() or None + track = split[1].strip() or None + return (artist, track) diff --git a/core/prompts.py b/core/prompts.py index 8389b8a..27e0c13 100644 --- a/core/prompts.py +++ b/core/prompts.py @@ -28,7 +28,7 @@ call. Pulls only `comments_raw` (the verbatim contents of the printed "Comments:" band) from the bottom band of the page. -The per-row guidance (raw_text / artist_guess / confidence / notes +The per-row guidance (raw_text / type_raw / confidence / notes tags / etc.) is duplicated across the page and quadrant prompts. They must stay in sync. The shared row-level content is enforced by parallel contract tests in `tests/unit/test_prompts.py`; if you change one, @@ -64,8 +64,6 @@ drawn shape that is NOT a letter; do not invent a description from these examples or copy them onto rows where you cannot read the mark. When in doubt, return null. - - artist_guess: best-effort parse of the part left of the dash, or null - - track_guess: best-effort parse of the part right of the dash, or null - confidence: "high" if the row is clearly legible, "medium" if you had to guess one or two characters, "low" if mostly illegible - notes: null in the common case. Use one of these tags only when relevant: @@ -172,8 +170,6 @@ drawn shape that is NOT a letter; do not invent a description from these examples or copy them onto rows where you cannot read the mark. When in doubt, return null. - - artist_guess: best-effort parse of the part left of the dash, or null - - track_guess: best-effort parse of the part right of the dash, or null - confidence: "high" if the row is clearly legible, "medium" if you had to guess one or two characters, "low" if mostly illegible - notes: null in the common case. Use one of these tags only when relevant: diff --git a/core/schema.py b/core/schema.py index 5c03974..7ae3f6d 100644 --- a/core/schema.py +++ b/core/schema.py @@ -60,14 +60,6 @@ class Entry(BaseModel): "normal entry. Null if the circle is blank." ), ) - artist_guess: str | None = Field( - default=None, - description="Best-effort parse of the artist portion (left of the dash).", - ) - track_guess: str | None = Field( - default=None, - description="Best-effort parse of the track portion (right of the dash).", - ) confidence: Confidence = Field( description="high if the row is clearly legible; low if mostly illegible.", ) diff --git a/core/spot_check.py b/core/spot_check.py index 76ef605..dce35f0 100644 --- a/core/spot_check.py +++ b/core/spot_check.py @@ -12,6 +12,8 @@ from dataclasses import dataclass, field from pathlib import Path +from core.parse import parse_artist_track + @dataclass(frozen=True) class EntryRef: @@ -28,7 +30,7 @@ class EntryRef: class PageReport: """Hit/miss accounting for a single result JSON. - `total` counts entries that had an `artist_guess`. `with_track` is + `total` counts entries we could derive an artist for. `with_track` is the joint-mode denominator: entries with both an artist and a track. The two miss lists hold the rows whose lookup returned False, so a caller can drill into specific transcriptions. @@ -46,19 +48,37 @@ class PageReport: def collect_entries(results_root: Path) -> list[EntryRef]: """Walk every result JSON under `results_root` and return queryable rows. - Skips entries with no `artist_guess` (continuation rows have no - artist by design). An empty `track_guess` becomes `None` so callers - can distinguish "no track to check" from "track is the empty string". + Two on-disk shapes coexist: + + * Pre-audit (34 legacy corpus JSONs): `artist_guess` / `track_guess` + are present on every entry. A null `artist_guess` is the explicit + "continuation row" sentinel — skip those. + * Post-audit (everything new): no `artist_guess` / `track_guess` + keys. Artist and track are derived from `raw_text` via + `core.parse.parse_artist_track`. + + The branch is on KEY PRESENCE, not value truthiness, so the legacy + "explicit null means skip" contract is preserved on legacy rows + while new rows get the deterministic parse. An empty track becomes + `None` so callers can distinguish "no track to check" from "track + is the empty string". """ rows: list[EntryRef] = [] for path in sorted(results_root.rglob("*.json")): data = json.loads(path.read_text()) for q in data.get("quadrants", []): for e in q.get("entries", []): - artist = (e.get("artist_guess") or "").strip() - if not artist: - continue - track = (e.get("track_guess") or "").strip() or None + if "artist_guess" in e: + legacy = (e.get("artist_guess") or "").strip() + if not legacy: + continue + artist = legacy + track = (e.get("track_guess") or "").strip() or None + else: + parsed_artist, track = parse_artist_track(e.get("raw_text")) + if parsed_artist is None: + continue + artist = parsed_artist rows.append( EntryRef( page_path=path, diff --git a/scripts/spot_check_discogs.py b/scripts/spot_check_discogs.py index d0a68d7..90fa9db 100755 --- a/scripts/spot_check_discogs.py +++ b/scripts/spot_check_discogs.py @@ -2,8 +2,10 @@ """spot_check_discogs.py — sanity-check Gemini-extracted entries against the local Discogs PostgreSQL cache. -Walks every `data/results/**/*.json` and for each entry with an -`artist_guess` checks two questions against the cache: +Walks every `data/results/**/*.json` and for each entry with a +derivable artist (from `artist_guess` on the 34 legacy JSONs, or from +`parse_artist_track(raw_text)` on post-schema-trim extractions) checks +two questions against the cache: * artist — is this artist name in the WXYC library? (broad, cheap) * joint — does any release for this WXYC-owned artist contain a @@ -156,7 +158,7 @@ def print_report( print() print("=" * 72) print(f"Pages: {len(pages)}") - print(f"Entries with artist_guess: {total}") + print(f"Entries with derivable artist: {total}") print(f" artist hits: {artist_hit_total}/{total}") print(f"Entries with artist + track: {with_track}") print(f" joint hits (artist+track): {joint_hit_total}/{with_track}") @@ -216,7 +218,7 @@ def main(argv: list[str]) -> int: rows = collect_entries(results_root) if not rows: - print(f"No entries with artist_guess under {results_root}.") + print(f"No entries with a derivable artist under {results_root}.") return 0 unique_artists = sorted({r.artist for r in rows}) diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py index d8eb566..46be44a 100644 --- a/tests/integration/test_pipeline.py +++ b/tests/integration/test_pipeline.py @@ -41,8 +41,6 @@ def _build_page_result(date: str = "Monday 1 Jan '90") -> PageResult: Entry( row_index=0, raw_text="LED ZEP - TRAMPLED", - artist_guess="LED ZEP", - track_guess="TRAMPLED", confidence="high", ) ], diff --git a/tests/unit/test_parse.py b/tests/unit/test_parse.py new file mode 100644 index 0000000..dcc8a4c --- /dev/null +++ b/tests/unit/test_parse.py @@ -0,0 +1,114 @@ +"""Tests for `core.parse.parse_artist_track`. + +`parse_artist_track` is the read-time replacement for the +`Entry.artist_guess` / `Entry.track_guess` fields that used to live on +the Gemini response schema. The model now returns only `raw_text`; the +split happens here, deterministically, at read time. + +Sibling pattern: `core.continuations.merge_continuations`, +`core.comments.normalize_comments`. Pure function, no I/O. +""" + +from __future__ import annotations + +import pytest + +from core.parse import parse_artist_track + + +class TestParseArtistTrack: + @pytest.mark.parametrize( + ("raw", "expected"), + [ + # The dominant case: ASCII hyphen with single spaces. + ("JUANA MOLINA - LA PARADOJA", ("JUANA MOLINA", "LA PARADOJA")), + # Album in parens belongs to the track side, not split off. + ( + "JUANA MOLINA - LA PARADOJA (DOGA)", + ("JUANA MOLINA", "LA PARADOJA (DOGA)"), + ), + ], + ) + def test_simple_ascii_hyphen(self, raw: str, expected: tuple[str | None, str | None]) -> None: + assert parse_artist_track(raw) == expected + + @pytest.mark.parametrize( + ("dash", "label"), + [ + ("–", "en-dash"), + ("—", "em-dash"), + ], + ) + def test_unicode_dashes_split_the_same(self, dash: str, label: str) -> None: + """DJs sometimes write en/em-dashes; the model occasionally normalizes + a hand-drawn dash to unicode. Treat them like ASCII hyphens so both + on-disk shapes split consistently.""" + raw = f"JESSICA PRATT {dash} BACK, BABY" + assert parse_artist_track(raw) == ("JESSICA PRATT", "BACK, BABY"), label + + def test_no_dash_returns_artist_only(self) -> None: + """An entry with no separator is documented as artist-only. The + Stereolab fixture case (artist-only row) appears in the WXYC + canonical-artists test data; the dominant historical reading is + that the line names the artist and omits the track.""" + assert parse_artist_track("STEREOLAB") == ("STEREOLAB", None) + + def test_multiple_dashes_split_on_first(self) -> None: + """Hand-written track titles can themselves contain hyphens + ("Duke Ellington & John Coltrane - In a Sentimental Mood" + wouldn't trigger this — the artist '&' isn't a dash — but a + title like 'X-Ray' on the track side will). Split on the first + separator so the artist segment stays intact.""" + assert parse_artist_track("ARTIST - TRACK - PART TWO") == ( + "ARTIST", + "TRACK - PART TWO", + ) + + def test_strips_whitespace_per_side(self) -> None: + """Leading/trailing whitespace on either side gets stripped from + both sides of the result. Internal whitespace inside artist or + track is preserved.""" + assert parse_artist_track(" THE BAND - THE SONG ") == ("THE BAND", "THE SONG") + + def test_none_input_returns_none_pair(self) -> None: + assert parse_artist_track(None) == (None, None) + + def test_empty_string_returns_none_pair(self) -> None: + assert parse_artist_track("") == (None, None) + + def test_whitespace_only_returns_none_pair(self) -> None: + assert parse_artist_track(" ") == (None, None) + + def test_empty_artist_side_returns_none(self) -> None: + """A line that opens with the separator (' - TRACK') has no artist; + the function shouldn't promote an empty string into a real value.""" + assert parse_artist_track(" - LA PARADOJA") == (None, "LA PARADOJA") + + def test_empty_track_side_returns_none(self) -> None: + """A trailing separator with nothing after it has no track.""" + assert parse_artist_track("JUANA MOLINA - ") == ("JUANA MOLINA", None) + + def test_hyphen_with_no_spaces_does_not_split(self) -> None: + """Compound words like 'X-Ray Spex' must NOT be split on the + internal hyphen — the convention is that the separator is a dash + SURROUNDED by whitespace. This is the load-bearing constraint + that lets the function work on a corpus that mixes hyphenated + band names with separator dashes.""" + assert parse_artist_track("X-RAY SPEX") == ("X-RAY SPEX", None) + + def test_separator_matches_across_newlines(self) -> None: + """The regex's `\\s+` class includes `\\n`, so a separator that + straddles a newline still splits. Rare in practice — Gemini's + `raw_text` is usually a single line — but pinning the behavior + here means a future tightening (e.g. `[^\\S\\n]+` to exclude + newlines) is a deliberate decision rather than a regression. + """ + assert parse_artist_track("ARTIST\n - TRACK") == ("ARTIST", "TRACK") + + def test_idempotent_when_artist_only_recomputed(self) -> None: + """Round-tripping the result through join+split must be a no-op + for the artist-only case: a downstream consumer that re-stores + the artist as `raw_text` and re-parses shouldn't flip the result.""" + artist, track = parse_artist_track("STEREOLAB") + assert (artist, track) == ("STEREOLAB", None) + assert parse_artist_track(artist) == ("STEREOLAB", None) diff --git a/tests/unit/test_prompts.py b/tests/unit/test_prompts.py index ebc37a5..30067a9 100644 --- a/tests/unit/test_prompts.py +++ b/tests/unit/test_prompts.py @@ -28,8 +28,6 @@ def test_prompt_names_each_quadrant_position(position: str) -> None: "row_index", "raw_text", "type_raw", - "artist_guess", - "track_guess", "confidence", "notes", ], @@ -180,8 +178,6 @@ def test_quadrant_template_substitutes_position(position: str) -> None: "row_index", "raw_text", "type_raw", - "artist_guess", - "track_guess", "confidence", "notes", ], diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 16609aa..c082267 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -22,22 +22,37 @@ class TestEntry: def test_minimal_entry(self) -> None: e = Entry(row_index=0, raw_text="LED ZEP - TRAMPLED", confidence="high") assert e.row_index == 0 - assert e.artist_guess is None - assert e.track_guess is None assert e.notes is None def test_full_entry(self) -> None: e = Entry( row_index=2, raw_text="LED ZEP - TRAMPLED", - artist_guess="Led Zeppelin", - track_guess="Trampled Under Foot", confidence="medium", notes="continuation", ) - assert e.artist_guess == "Led Zeppelin" + assert e.raw_text == "LED ZEP - TRAMPLED" assert e.confidence == "medium" + def test_artist_guess_track_guess_keys_are_ignored(self) -> None: + """The 34 pre-audit corpus JSONs carry `artist_guess` and `track_guess`. + The new schema must accept the old shape (extra keys silently ignored) + and round-trip the entry without those keys reappearing on dump. That's + the load-bearing backward-compat contract this PR rests on.""" + e = Entry.model_validate( + { + "row_index": 0, + "raw_text": "JUANA MOLINA - LA PARADOJA", + "artist_guess": "JUANA MOLINA", + "track_guess": "LA PARADOJA", + "confidence": "high", + } + ) + assert e.raw_text == "JUANA MOLINA - LA PARADOJA" + dumped = e.model_dump() + assert "artist_guess" not in dumped + assert "track_guess" not in dumped + def test_invalid_confidence_rejected(self) -> None: with pytest.raises(ValidationError): Entry(row_index=0, raw_text="x", confidence="very-high") # type: ignore[arg-type] diff --git a/tests/unit/test_spot_check.py b/tests/unit/test_spot_check.py index 69a98fa..2eb8992 100644 --- a/tests/unit/test_spot_check.py +++ b/tests/unit/test_spot_check.py @@ -84,6 +84,9 @@ def _entry( track: str | None = None, raw: str = "x", ) -> dict: + """Legacy-shape entry: the 34 pre-audit corpus JSONs carry + `artist_guess` / `track_guess` keys; `collect_entries` still honors + them when present.""" return { "row_index": row_index, "raw_text": raw, @@ -95,6 +98,18 @@ def _entry( } +def _entry_v2(row_index: int, *, raw: str) -> dict: + """Post-audit entry: no `artist_guess` / `track_guess`. The artist + and track come from `parse_artist_track(raw_text)` at read time.""" + return { + "row_index": row_index, + "raw_text": raw, + "confidence": "high", + "notes": None, + "oddities": [], + } + + # -- collect_entries ------------------------------------------------------- @@ -150,6 +165,53 @@ def test_walks_subdirectories_in_sorted_order(self, tmp_path: Path) -> None: def test_returns_empty_list_for_empty_root(self, tmp_path: Path) -> None: assert collect_entries(tmp_path) == [] + def test_derives_artist_track_from_raw_text_for_post_audit_entries( + self, tmp_path: Path + ) -> None: + """The post-#41 on-disk shape has no `artist_guess`/`track_guess` + keys — both fall back to `parse_artist_track(raw_text)`. Without + this fallback, `collect_entries` would silently return zero rows + on every new corpus extraction.""" + page = _make_page( + [ + _entry_v2(0, raw="JUANA MOLINA - LA PARADOJA"), + _entry_v2(1, raw="STEREOLAB"), # artist-only, no track + _entry_v2(2, raw=" "), # blank row, skipped + ], + ) + (tmp_path / "p.json").write_text(json.dumps(page)) + + rows = collect_entries(tmp_path) + + assert {(r.artist, r.track) for r in rows} == { + ("JUANA MOLINA", "LA PARADOJA"), + ("STEREOLAB", None), + } + + def test_legacy_artist_guess_takes_precedence_over_parsing(self, tmp_path: Path) -> None: + """On the 34 pre-audit JSONs `artist_guess` / `track_guess` exist + and may have been hand-corrected by the original model in ways + the deterministic parser can't reproduce. Honor the stored value + on those rows so a re-run of the spot-check doesn't drift.""" + page = _make_page( + [ + { + "row_index": 0, + "raw_text": "JUANA MOLINA - LA PARADOJA", + "artist_guess": "Juana Molina", + "track_guess": "la paradoja", + "confidence": "high", + "notes": None, + "oddities": [], + } + ], + ) + (tmp_path / "p.json").write_text(json.dumps(page)) + + rows = collect_entries(tmp_path) + + assert (rows[0].artist, rows[0].track) == ("Juana Molina", "la paradoja") + def test_carries_quadrant_position_and_row_index(self, tmp_path: Path) -> None: page = _make_page( [_entry(7, artist="Pavement", track="Cut Your Hair")],