WXYC · jakebromberg · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/core/continuations.py b/core/continuations.py
@@ -39,13 +39,15 @@ def merge_continuations(entries: list[Entry]) -> list[Entry]:
         continuation still reports "high". Substring matching doesn't
         care; if a future re-OCR queue filters by confidence and wants
         to flag merged rows, take the min over the contributing rows.
-      * `artist_guess` / `track_guess` — kept as-is. After a merge
-        these may not reflect the joined `raw_text`; downstream
-        consumers that care should re-parse from `raw_text`.
       * `row_index` — the predecessor's index is preserved. The
         continuation row's index is dropped from the merged view; the
         on-disk JSON still has both grid positions if needed.
 
+    Callers that want an artist/track split from the merged `raw_text`
+    should apply `core.parse.parse_artist_track` after merging — the
+    split is deterministic at read time and is intentionally not stored
+    on `Entry`.
+
     The merge is **lossy with respect to internal whitespace** at the
     wrap boundary — multiple spaces / tabs collapse to a single space.
     Verbatim whitespace round-tripping requires reading the on-disk

diff --git a/core/parse.py b/core/parse.py
@@ -0,0 +1,60 @@
+"""Read-time artist/track split for `Entry.raw_text`.
+
+The Phase-1 schema asked Gemini to fill `artist_guess` and `track_guess`
+alongside the verbatim `raw_text`. The split is deterministic (separate
+on the first whitespace-bracketed dash) and producing it on the model
+side spends output tokens for no analytic benefit. The fields were
+dropped from the response schema; this module is what downstream
+consumers call at read time to recover them.
+
+Sibling pattern: `core.continuations.merge_continuations`,
+`core.comments.normalize_comments`. Pure function, no I/O. The on-disk
+shape stays verbatim; the split happens in memory on read.
+
+Separator convention: a single ASCII hyphen, en-dash, or em-dash with
+whitespace on both sides. Compound band names ("X-Ray Spex") have no
+surrounding whitespace and are NOT split. Multiple separators in one
+line split on the first; the track side keeps the rest.
+"""
+
+from __future__ import annotations
+
+import re
+
+# ASCII hyphen, en-dash (U+2013), em-dash (U+2014). Whitespace-bracketed
+# so we don't split compound band names like "X-Ray Spex" on their
+# internal hyphen.
+_SEPARATOR = re.compile(r"\s+[-–—]\s+")
+
+
+def parse_artist_track(raw_text: str | None) -> tuple[str | None, str | None]:
+    """Split a row's verbatim text into best-effort (artist, track) parts.
+
+    Returns:
+      * `(None, None)` when the input is None, empty, or whitespace-only.
+      * `(artist, None)` when no separator is present — the line names
+        an artist with no track (common for early-90s flowsheets, e.g.
+        "STEREOLAB" alone).
+      * `(None, track)` when the separator opens the line with nothing
+        before it (e.g. " - LA PARADOJA").
+      * `(artist, None)` when the separator closes the line with nothing
+        after it (e.g. "JUANA MOLINA - ").
+      * `(artist, track)` for the dominant case ("ARTIST - TRACK"). When
+        multiple separators appear, we split on the first only and the
+        rest stays in the track.
+
+    Each side is stripped of leading/trailing whitespace. Empty sides
+    become None so callers have one sentinel for "nothing useful here".
+    """
+    if raw_text is None:
+        return (None, None)
+    if not raw_text.strip():
+        return (None, None)
+    # Split the unstripped text so a leading or trailing " - " still has
+    # the whitespace on both sides of the dash that the regex needs.
+    split = _SEPARATOR.split(raw_text, maxsplit=1)
+    if len(split) == 1:
+        return (raw_text.strip(), None)
+    artist = split[0].strip() or None
+    track = split[1].strip() or None
+    return (artist, track)
diff --git a/core/prompts.py b/core/prompts.py
@@ -28,7 +28,7 @@
     call. Pulls only `comments_raw` (the verbatim contents of the
     printed "Comments:" band) from the bottom band of the page.
 
-The per-row guidance (raw_text / artist_guess / confidence / notes
+The per-row guidance (raw_text / type_raw / confidence / notes
 tags / etc.) is duplicated across the page and quadrant prompts. They
 must stay in sync. The shared row-level content is enforced by parallel
 contract tests in `tests/unit/test_prompts.py`; if you change one,
@@ -64,8 +64,6 @@
     drawn shape that is NOT a letter; do not invent a description from
     these examples or copy them onto rows where you cannot read the mark.
     When in doubt, return null.
-  - artist_guess: best-effort parse of the part left of the dash, or null
-  - track_guess: best-effort parse of the part right of the dash, or null
   - confidence: "high" if the row is clearly legible, "medium" if you had to
     guess one or two characters, "low" if mostly illegible
   - notes: null in the common case. Use one of these tags only when relevant:
@@ -172,8 +170,6 @@
     drawn shape that is NOT a letter; do not invent a description from
     these examples or copy them onto rows where you cannot read the mark.
     When in doubt, return null.
-  - artist_guess: best-effort parse of the part left of the dash, or null
-  - track_guess: best-effort parse of the part right of the dash, or null
   - confidence: "high" if the row is clearly legible, "medium" if you had to
     guess one or two characters, "low" if mostly illegible
   - notes: null in the common case. Use one of these tags only when relevant:

diff --git a/core/schema.py b/core/schema.py
@@ -60,14 +60,6 @@ class Entry(BaseModel):
             "normal entry. Null if the circle is blank."
         ),
     )
-    artist_guess: str | None = Field(
-        default=None,
-        description="Best-effort parse of the artist portion (left of the dash).",
-    )
-    track_guess: str | None = Field(
-        default=None,
-        description="Best-effort parse of the track portion (right of the dash).",
-    )
     confidence: Confidence = Field(
         description="high if the row is clearly legible; low if mostly illegible.",
     )

diff --git a/core/spot_check.py b/core/spot_check.py
@@ -12,6 +12,8 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 
+from core.parse import parse_artist_track
+
 
 @dataclass(frozen=True)
 class EntryRef:
@@ -28,7 +30,7 @@ class EntryRef:
 class PageReport:
     """Hit/miss accounting for a single result JSON.
 
-    `total` counts entries that had an `artist_guess`. `with_track` is
+    `total` counts entries we could derive an artist for. `with_track` is
     the joint-mode denominator: entries with both an artist and a track.
     The two miss lists hold the rows whose lookup returned False, so a
     caller can drill into specific transcriptions.
@@ -46,19 +48,37 @@ class PageReport:
 def collect_entries(results_root: Path) -> list[EntryRef]:
     """Walk every result JSON under `results_root` and return queryable rows.
 
-    Skips entries with no `artist_guess` (continuation rows have no
-    artist by design). An empty `track_guess` becomes `None` so callers
-    can distinguish "no track to check" from "track is the empty string".
+    Two on-disk shapes coexist:
+
+      * Pre-audit (34 legacy corpus JSONs): `artist_guess` / `track_guess`
+        are present on every entry. A null `artist_guess` is the explicit
+        "continuation row" sentinel — skip those.
+      * Post-audit (everything new): no `artist_guess` / `track_guess`
+        keys. Artist and track are derived from `raw_text` via
+        `core.parse.parse_artist_track`.
+
+    The branch is on KEY PRESENCE, not value truthiness, so the legacy
+    "explicit null means skip" contract is preserved on legacy rows
+    while new rows get the deterministic parse. An empty track becomes
+    `None` so callers can distinguish "no track to check" from "track
+    is the empty string".
     """
     rows: list[EntryRef] = []
     for path in sorted(results_root.rglob("*.json")):
         data = json.loads(path.read_text())
         for q in data.get("quadrants", []):
             for e in q.get("entries", []):
-                artist = (e.get("artist_guess") or "").strip()
-                if not artist:
-                    continue
-                track = (e.get("track_guess") or "").strip() or None
+                if "artist_guess" in e:
+                    legacy = (e.get("artist_guess") or "").strip()
+                    if not legacy:
+                        continue
+                    artist = legacy
+                    track = (e.get("track_guess") or "").strip() or None
+                else:
+                    parsed_artist, track = parse_artist_track(e.get("raw_text"))
+                    if parsed_artist is None:
+                        continue
+                    artist = parsed_artist
                 rows.append(
                     EntryRef(
                         page_path=path,

diff --git a/scripts/spot_check_discogs.py b/scripts/spot_check_discogs.py
@@ -2,8 +2,10 @@
 """spot_check_discogs.py — sanity-check Gemini-extracted entries against the
 local Discogs PostgreSQL cache.
 
-Walks every `data/results/**/*.json` and for each entry with an
-`artist_guess` checks two questions against the cache:
+Walks every `data/results/**/*.json` and for each entry with a
+derivable artist (from `artist_guess` on the 34 legacy JSONs, or from
+`parse_artist_track(raw_text)` on post-schema-trim extractions) checks
+two questions against the cache:
 
   * artist  — is this artist name in the WXYC library? (broad, cheap)
   * joint   — does any release for this WXYC-owned artist contain a
@@ -156,7 +158,7 @@ def print_report(
     print()
     print("=" * 72)
     print(f"Pages:                          {len(pages)}")
-    print(f"Entries with artist_guess:      {total}")
+    print(f"Entries with derivable artist:  {total}")
     print(f"  artist hits:                  {artist_hit_total}/{total}")
     print(f"Entries with artist + track:    {with_track}")
     print(f"  joint hits (artist+track):    {joint_hit_total}/{with_track}")
@@ -216,7 +218,7 @@ def main(argv: list[str]) -> int:
 
     rows = collect_entries(results_root)
     if not rows:
-        print(f"No entries with artist_guess under {results_root}.")
+        print(f"No entries with a derivable artist under {results_root}.")
         return 0
 
     unique_artists = sorted({r.artist for r in rows})

diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py
@@ -41,8 +41,6 @@ def _build_page_result(date: str = "Monday 1 Jan '90") -> PageResult:
                     Entry(
                         row_index=0,
                         raw_text="LED ZEP - TRAMPLED",
-                        artist_guess="LED ZEP",
-                        track_guess="TRAMPLED",
                         confidence="high",
                     )
                 ],

diff --git a/tests/unit/test_parse.py b/tests/unit/test_parse.py
@@ -0,0 +1,114 @@
+"""Tests for `core.parse.parse_artist_track`.
+
+`parse_artist_track` is the read-time replacement for the
+`Entry.artist_guess` / `Entry.track_guess` fields that used to live on
+the Gemini response schema. The model now returns only `raw_text`; the
+split happens here, deterministically, at read time.
+
+Sibling pattern: `core.continuations.merge_continuations`,
+`core.comments.normalize_comments`. Pure function, no I/O.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from core.parse import parse_artist_track
+
+
+class TestParseArtistTrack:
+    @pytest.mark.parametrize(
+        ("raw", "expected"),
+        [
+            # The dominant case: ASCII hyphen with single spaces.
+            ("JUANA MOLINA - LA PARADOJA", ("JUANA MOLINA", "LA PARADOJA")),
+            # Album in parens belongs to the track side, not split off.
+            (
+                "JUANA MOLINA - LA PARADOJA (DOGA)",
+                ("JUANA MOLINA", "LA PARADOJA (DOGA)"),
+            ),
+        ],
+    )
+    def test_simple_ascii_hyphen(self, raw: str, expected: tuple[str | None, str | None]) -> None:
+        assert parse_artist_track(raw) == expected
+
+    @pytest.mark.parametrize(
+        ("dash", "label"),
+        [
+            ("–", "en-dash"),
+            ("—", "em-dash"),
+        ],
+    )
+    def test_unicode_dashes_split_the_same(self, dash: str, label: str) -> None:
+        """DJs sometimes write en/em-dashes; the model occasionally normalizes
+        a hand-drawn dash to unicode. Treat them like ASCII hyphens so both
+        on-disk shapes split consistently."""
+        raw = f"JESSICA PRATT {dash} BACK, BABY"
+        assert parse_artist_track(raw) == ("JESSICA PRATT", "BACK, BABY"), label
+
+    def test_no_dash_returns_artist_only(self) -> None:
+        """An entry with no separator is documented as artist-only. The
+        Stereolab fixture case (artist-only row) appears in the WXYC
+        canonical-artists test data; the dominant historical reading is
+        that the line names the artist and omits the track."""
+        assert parse_artist_track("STEREOLAB") == ("STEREOLAB", None)
+
+    def test_multiple_dashes_split_on_first(self) -> None:
+        """Hand-written track titles can themselves contain hyphens
+        ("Duke Ellington & John Coltrane - In a Sentimental Mood"
+        wouldn't trigger this — the artist '&' isn't a dash — but a
+        title like 'X-Ray' on the track side will). Split on the first
+        separator so the artist segment stays intact."""
+        assert parse_artist_track("ARTIST - TRACK - PART TWO") == (
+            "ARTIST",
+            "TRACK - PART TWO",
+        )
+
+    def test_strips_whitespace_per_side(self) -> None:
+        """Leading/trailing whitespace on either side gets stripped from
+        both sides of the result. Internal whitespace inside artist or
+        track is preserved."""
+        assert parse_artist_track("  THE BAND   -   THE SONG  ") == ("THE BAND", "THE SONG")
+
+    def test_none_input_returns_none_pair(self) -> None:
+        assert parse_artist_track(None) == (None, None)
+
+    def test_empty_string_returns_none_pair(self) -> None:
+        assert parse_artist_track("") == (None, None)
+
+    def test_whitespace_only_returns_none_pair(self) -> None:
+        assert parse_artist_track("   ") == (None, None)
+
+    def test_empty_artist_side_returns_none(self) -> None:
+        """A line that opens with the separator (' - TRACK') has no artist;
+        the function shouldn't promote an empty string into a real value."""
+        assert parse_artist_track(" - LA PARADOJA") == (None, "LA PARADOJA")
+
+    def test_empty_track_side_returns_none(self) -> None:
+        """A trailing separator with nothing after it has no track."""
+        assert parse_artist_track("JUANA MOLINA - ") == ("JUANA MOLINA", None)
+
+    def test_hyphen_with_no_spaces_does_not_split(self) -> None:
+        """Compound words like 'X-Ray Spex' must NOT be split on the
+        internal hyphen — the convention is that the separator is a dash
+        SURROUNDED by whitespace. This is the load-bearing constraint
+        that lets the function work on a corpus that mixes hyphenated
+        band names with separator dashes."""
+        assert parse_artist_track("X-RAY SPEX") == ("X-RAY SPEX", None)
+
+    def test_separator_matches_across_newlines(self) -> None:
+        """The regex's `\\s+` class includes `\\n`, so a separator that
+        straddles a newline still splits. Rare in practice — Gemini's
+        `raw_text` is usually a single line — but pinning the behavior
+        here means a future tightening (e.g. `[^\\S\\n]+` to exclude
+        newlines) is a deliberate decision rather than a regression.
+        """
+        assert parse_artist_track("ARTIST\n - TRACK") == ("ARTIST", "TRACK")
+
+    def test_idempotent_when_artist_only_recomputed(self) -> None:
+        """Round-tripping the result through join+split must be a no-op
+        for the artist-only case: a downstream consumer that re-stores
+        the artist as `raw_text` and re-parses shouldn't flip the result."""
+        artist, track = parse_artist_track("STEREOLAB")
+        assert (artist, track) == ("STEREOLAB", None)
+        assert parse_artist_track(artist) == ("STEREOLAB", None)
diff --git a/tests/unit/test_prompts.py b/tests/unit/test_prompts.py
@@ -28,8 +28,6 @@ def test_prompt_names_each_quadrant_position(position: str) -> None:
         "row_index",
         "raw_text",
         "type_raw",
-        "artist_guess",
-        "track_guess",
         "confidence",
         "notes",
     ],
@@ -180,8 +178,6 @@ def test_quadrant_template_substitutes_position(position: str) -> None:
         "row_index",
         "raw_text",
         "type_raw",
-        "artist_guess",
-        "track_guess",
         "confidence",
         "notes",
     ],