Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions core/continuations.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,15 @@ def merge_continuations(entries: list[Entry]) -> list[Entry]:
continuation still reports "high". Substring matching doesn't
care; if a future re-OCR queue filters by confidence and wants
to flag merged rows, take the min over the contributing rows.
* `artist_guess` / `track_guess` — kept as-is. After a merge
these may not reflect the joined `raw_text`; downstream
consumers that care should re-parse from `raw_text`.
* `row_index` — the predecessor's index is preserved. The
continuation row's index is dropped from the merged view; the
on-disk JSON still has both grid positions if needed.

Callers that want an artist/track split from the merged `raw_text`
should apply `core.parse.parse_artist_track` after merging — the
split is deterministic at read time and is intentionally not stored
on `Entry`.

The merge is **lossy with respect to internal whitespace** at the
wrap boundary — multiple spaces / tabs collapse to a single space.
Verbatim whitespace round-tripping requires reading the on-disk
Expand Down
60 changes: 60 additions & 0 deletions core/parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Read-time artist/track split for `Entry.raw_text`.

The Phase-1 schema asked Gemini to fill `artist_guess` and `track_guess`
alongside the verbatim `raw_text`. The split is deterministic (separate
on the first whitespace-bracketed dash) and producing it on the model
side spends output tokens for no analytic benefit. The fields were
dropped from the response schema; this module is what downstream
consumers call at read time to recover them.

Sibling pattern: `core.continuations.merge_continuations`,
`core.comments.normalize_comments`. Pure function, no I/O. The on-disk
shape stays verbatim; the split happens in memory on read.

Separator convention: a single ASCII hyphen, en-dash, or em-dash with
whitespace on both sides. Compound band names ("X-Ray Spex") have no
surrounding whitespace and are NOT split. Multiple separators in one
line split on the first; the track side keeps the rest.
"""

from __future__ import annotations

import re

# ASCII hyphen, en-dash (U+2013), em-dash (U+2014). The dash must be
# bracketed by whitespace so compound band names like "X-Ray Spex" are
# not split on their internal hyphen. The start or end of the string
# counts as a bracket too: text often arrives already trimmed, so a
# dangling separator shows up as "- TRACK" or "ARTIST -" with no outer
# whitespace left for a plain `\s+` to match.
_SEPARATOR = re.compile(r"(?:\A|\s+)[-–—](?:\s+|\Z)")


def parse_artist_track(raw_text: str | None) -> tuple[str | None, str | None]:
    """Split a row's verbatim text into best-effort (artist, track) parts.

    Args:
        raw_text: The row text exactly as transcribed, or None.

    Returns:
        * `(None, None)` when the input is None, empty, whitespace-only,
          or nothing but a separator dash.
        * `(artist, None)` when no separator is present — the line names
          an artist with no track (e.g. "STEREOLAB" alone).
        * `(None, track)` when the separator opens the line with nothing
          before it (" - LA PARADOJA", or the trimmed "- LA PARADOJA").
        * `(artist, None)` when the separator closes the line with
          nothing after it ("JUANA MOLINA - ", or the trimmed
          "JUANA MOLINA -").
        * `(artist, track)` for the dominant case ("ARTIST - TRACK").
          When several separators appear, only the first one splits and
          the remainder stays in the track.

    Each side is stripped of leading/trailing whitespace. Empty sides
    become None so callers have one sentinel for "nothing useful here".
    A dash with a non-space character directly against it ("X-RAY SPEX",
    "ARTIST -TRACK") is never treated as a separator.
    """
    if raw_text is None or not raw_text.strip():
        return (None, None)
    # Split the unstripped text: the pattern itself consumes whatever
    # whitespace surrounds the dash, and anchors on the string boundary
    # when there is none.
    parts = _SEPARATOR.split(raw_text, maxsplit=1)
    if len(parts) == 1:
        return (raw_text.strip(), None)
    artist, track = (side.strip() or None for side in parts)
    return (artist, track)
6 changes: 1 addition & 5 deletions core/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
call. Pulls only `comments_raw` (the verbatim contents of the
printed "Comments:" band) from the bottom band of the page.

The per-row guidance (raw_text / artist_guess / confidence / notes
The per-row guidance (raw_text / type_raw / confidence / notes
tags / etc.) is duplicated across the page and quadrant prompts. They
must stay in sync. The shared row-level content is enforced by parallel
contract tests in `tests/unit/test_prompts.py`; if you change one,
Expand Down Expand Up @@ -64,8 +64,6 @@
drawn shape that is NOT a letter; do not invent a description from
these examples or copy them onto rows where you cannot read the mark.
When in doubt, return null.
- artist_guess: best-effort parse of the part left of the dash, or null
- track_guess: best-effort parse of the part right of the dash, or null
- confidence: "high" if the row is clearly legible, "medium" if you had to
guess one or two characters, "low" if mostly illegible
- notes: null in the common case. Use one of these tags only when relevant:
Expand Down Expand Up @@ -172,8 +170,6 @@
drawn shape that is NOT a letter; do not invent a description from
these examples or copy them onto rows where you cannot read the mark.
When in doubt, return null.
- artist_guess: best-effort parse of the part left of the dash, or null
- track_guess: best-effort parse of the part right of the dash, or null
- confidence: "high" if the row is clearly legible, "medium" if you had to
guess one or two characters, "low" if mostly illegible
- notes: null in the common case. Use one of these tags only when relevant:
Expand Down
8 changes: 0 additions & 8 deletions core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,6 @@ class Entry(BaseModel):
"normal entry. Null if the circle is blank."
),
)
artist_guess: str | None = Field(
default=None,
description="Best-effort parse of the artist portion (left of the dash).",
)
track_guess: str | None = Field(
default=None,
description="Best-effort parse of the track portion (right of the dash).",
)
confidence: Confidence = Field(
description="high if the row is clearly legible; low if mostly illegible.",
)
Expand Down
36 changes: 28 additions & 8 deletions core/spot_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from dataclasses import dataclass, field
from pathlib import Path

from core.parse import parse_artist_track


@dataclass(frozen=True)
class EntryRef:
Expand All @@ -28,7 +30,7 @@ class EntryRef:
class PageReport:
"""Hit/miss accounting for a single result JSON.

`total` counts entries that had an `artist_guess`. `with_track` is
`total` counts entries we could derive an artist for. `with_track` is
the joint-mode denominator: entries with both an artist and a track.
The two miss lists hold the rows whose lookup returned False, so a
caller can drill into specific transcriptions.
Expand All @@ -46,19 +48,37 @@ class PageReport:
def collect_entries(results_root: Path) -> list[EntryRef]:
"""Walk every result JSON under `results_root` and return queryable rows.

Skips entries with no `artist_guess` (continuation rows have no
artist by design). An empty `track_guess` becomes `None` so callers
can distinguish "no track to check" from "track is the empty string".
Two on-disk shapes coexist:

* Pre-audit (34 legacy corpus JSONs): `artist_guess` / `track_guess`
are present on every entry. A null `artist_guess` is the explicit
"continuation row" sentinel — skip those.
* Post-audit (everything new): no `artist_guess` / `track_guess`
keys. Artist and track are derived from `raw_text` via
`core.parse.parse_artist_track`.

The branch is on KEY PRESENCE, not value truthiness, so the legacy
"explicit null means skip" contract is preserved on legacy rows
while new rows get the deterministic parse. An empty track becomes
`None` so callers can distinguish "no track to check" from "track
is the empty string".
"""
rows: list[EntryRef] = []
for path in sorted(results_root.rglob("*.json")):
data = json.loads(path.read_text())
for q in data.get("quadrants", []):
for e in q.get("entries", []):
artist = (e.get("artist_guess") or "").strip()
if not artist:
continue
track = (e.get("track_guess") or "").strip() or None
if "artist_guess" in e:
legacy = (e.get("artist_guess") or "").strip()
if not legacy:
continue
artist = legacy
track = (e.get("track_guess") or "").strip() or None
else:
parsed_artist, track = parse_artist_track(e.get("raw_text"))
if parsed_artist is None:
continue
artist = parsed_artist
rows.append(
EntryRef(
page_path=path,
Expand Down
10 changes: 6 additions & 4 deletions scripts/spot_check_discogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
"""spot_check_discogs.py — sanity-check Gemini-extracted entries against the
local Discogs PostgreSQL cache.

Walks every `data/results/**/*.json` and for each entry with an
`artist_guess` checks two questions against the cache:
Walks every `data/results/**/*.json` and for each entry with a
derivable artist (from `artist_guess` on the 34 legacy JSONs, or from
`parse_artist_track(raw_text)` on post-schema-trim extractions) checks
two questions against the cache:

* artist — is this artist name in the WXYC library? (broad, cheap)
* joint — does any release for this WXYC-owned artist contain a
Expand Down Expand Up @@ -156,7 +158,7 @@ def print_report(
print()
print("=" * 72)
print(f"Pages: {len(pages)}")
print(f"Entries with artist_guess: {total}")
print(f"Entries with derivable artist: {total}")
print(f" artist hits: {artist_hit_total}/{total}")
print(f"Entries with artist + track: {with_track}")
print(f" joint hits (artist+track): {joint_hit_total}/{with_track}")
Expand Down Expand Up @@ -216,7 +218,7 @@ def main(argv: list[str]) -> int:

rows = collect_entries(results_root)
if not rows:
print(f"No entries with artist_guess under {results_root}.")
print(f"No entries with a derivable artist under {results_root}.")
return 0

unique_artists = sorted({r.artist for r in rows})
Expand Down
2 changes: 0 additions & 2 deletions tests/integration/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ def _build_page_result(date: str = "Monday 1 Jan '90") -> PageResult:
Entry(
row_index=0,
raw_text="LED ZEP - TRAMPLED",
artist_guess="LED ZEP",
track_guess="TRAMPLED",
confidence="high",
)
],
Expand Down
114 changes: 114 additions & 0 deletions tests/unit/test_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""Tests for `core.parse.parse_artist_track`.

`parse_artist_track` is the read-time replacement for the
`Entry.artist_guess` / `Entry.track_guess` fields that used to live on
the Gemini response schema. The model now returns only `raw_text`; the
split happens here, deterministically, at read time.

Sibling pattern: `core.continuations.merge_continuations`,
`core.comments.normalize_comments`. Pure function, no I/O.
"""

from __future__ import annotations

import pytest

from core.parse import parse_artist_track


class TestParseArtistTrack:
    """Pins the (artist, track) split contract, one scenario per test."""

    @pytest.mark.parametrize(
        ("raw", "expected"),
        [
            # Canonical shape: one ASCII hyphen with a space on each side.
            ("JUANA MOLINA - LA PARADOJA", ("JUANA MOLINA", "LA PARADOJA")),
            # A parenthesised album rides along with the track half.
            (
                "JUANA MOLINA - LA PARADOJA (DOGA)",
                ("JUANA MOLINA", "LA PARADOJA (DOGA)"),
            ),
        ],
    )
    def test_simple_ascii_hyphen(self, raw: str, expected: tuple[str | None, str | None]) -> None:
        """The everyday "ARTIST - TRACK" line splits into its two halves."""
        result = parse_artist_track(raw)
        assert result == expected

    @pytest.mark.parametrize(
        ("dash", "label"),
        [
            ("–", "en-dash"),
            ("—", "em-dash"),
        ],
    )
    def test_unicode_dashes_split_the_same(self, dash: str, label: str) -> None:
        """En- and em-dashes act as separators exactly like the ASCII
        hyphen; `label` names the failing variant in the assertion
        message."""
        raw = "JESSICA PRATT " + dash + " BACK, BABY"
        result = parse_artist_track(raw)
        assert result == ("JESSICA PRATT", "BACK, BABY"), label

    def test_no_dash_returns_artist_only(self) -> None:
        """With no separator the whole line is reported as the artist
        and the track is None."""
        artist, track = parse_artist_track("STEREOLAB")
        assert artist == "STEREOLAB"
        assert track is None

    def test_multiple_dashes_split_on_first(self) -> None:
        """Only the first whitespace-bracketed dash separates artist from
        track; any later separator stays verbatim inside the track."""
        artist, track = parse_artist_track("ARTIST - TRACK - PART TWO")
        assert artist == "ARTIST"
        assert track == "TRACK - PART TWO"

    def test_strips_whitespace_per_side(self) -> None:
        """Outer whitespace is trimmed from each half; the single space
        inside "THE BAND" / "THE SONG" survives."""
        result = parse_artist_track(" THE BAND - THE SONG ")
        assert result == ("THE BAND", "THE SONG")

    def test_none_input_returns_none_pair(self) -> None:
        """None in, (None, None) out."""
        result = parse_artist_track(None)
        assert result == (None, None)

    def test_empty_string_returns_none_pair(self) -> None:
        """The empty string carries no artist and no track."""
        result = parse_artist_track("")
        assert result == (None, None)

    def test_whitespace_only_returns_none_pair(self) -> None:
        """Whitespace-only input is treated the same as empty input."""
        result = parse_artist_track(" ")
        assert result == (None, None)

    def test_empty_artist_side_returns_none(self) -> None:
        """When the line opens with the separator the artist is None,
        not an empty string."""
        artist, track = parse_artist_track(" - LA PARADOJA")
        assert artist is None
        assert track == "LA PARADOJA"

    def test_empty_track_side_returns_none(self) -> None:
        """When the line ends with the separator the track is None."""
        artist, track = parse_artist_track("JUANA MOLINA - ")
        assert artist == "JUANA MOLINA"
        assert track is None

    def test_hyphen_with_no_spaces_does_not_split(self) -> None:
        """A hyphen with no surrounding whitespace is part of the name,
        so "X-RAY SPEX" comes back whole as the artist."""
        artist, track = parse_artist_track("X-RAY SPEX")
        assert artist == "X-RAY SPEX"
        assert track is None

    def test_separator_matches_across_newlines(self) -> None:
        """A newline counts as whitespace around the dash, so a separator
        that straddles a line break still splits. Pinned so that any
        later decision to exclude newlines is made on purpose.
        """
        result = parse_artist_track("ARTIST\n - TRACK")
        assert result == ("ARTIST", "TRACK")

    def test_idempotent_when_artist_only_recomputed(self) -> None:
        """Feeding the artist from an artist-only parse back into the
        parser yields the same artist-only result."""
        first_pass = parse_artist_track("STEREOLAB")
        assert first_pass == ("STEREOLAB", None)
        second_pass = parse_artist_track(first_pass[0])
        assert second_pass == ("STEREOLAB", None)
4 changes: 0 additions & 4 deletions tests/unit/test_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ def test_prompt_names_each_quadrant_position(position: str) -> None:
"row_index",
"raw_text",
"type_raw",
"artist_guess",
"track_guess",
"confidence",
"notes",
],
Expand Down Expand Up @@ -180,8 +178,6 @@ def test_quadrant_template_substitutes_position(position: str) -> None:
"row_index",
"raw_text",
"type_raw",
"artist_guess",
"track_guess",
"confidence",
"notes",
],
Expand Down
Loading
Loading