Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion core/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* never invent content; mark unreadable rows confidence=low,
* tag special-case rows in `notes` and skip parsing them.

Three top-level prompts:
Four top-level prompts:

* `PAGE_EXTRACTION_PROMPT` — Gemini and the page-level qwen-vl adapter.
The model sees the whole page; the schema demands all four quadrants.
Expand All @@ -24,6 +24,10 @@
call. Pulls only `page_date_raw` and page-level oddities from the
top band of the page.

* `FOOTER_EXTRACTION_PROMPT` — the per-quadrant adapter's footer-strip
call. Pulls only `comments_raw` (the verbatim contents of the
printed "Comments:" band) from the bottom band of the page.

The per-row guidance (raw_text / artist_guess / confidence / notes
tags / etc.) is duplicated across the page and quadrant prompts. They
must stay in sync. The shared row-level content is enforced by parallel
Expand Down Expand Up @@ -251,3 +255,31 @@

Return only the structured JSON described by the response schema.
"""


# Prompt for the footer-strip call of the per-quadrant adapter: the model is
# shown only the band below the broadcast grid and is asked for one field,
# `comments_raw`.
# NOTE: tests/unit/test_prompts.py pins several phrases of this text
# ("comments_raw", "verbatim", "JSON null", "Never invent content", and a
# "do not" / "above" scoping instruction) — keep them when rewording.
FOOTER_EXTRACTION_PROMPT = """\
You are reading the bottom footer strip of a 1990s WXYC handwritten
radio flowsheet page. The image is a horizontal slice from the very
bottom of the page — below the four hour-blocks of the broadcast grid.
It contains the printed "Comments:" label and a free-text band where
the DJ writes short notes about the broadcast (dedications, jokes,
themed-show titles — e.g. "declared today anti-Valentines Day").

Capture:
- comments_raw: the verbatim contents of the Comments band, as
written. Do not fix spelling, do not expand abbreviations, do not
normalize punctuation. Join multi-line entries with a single
newline. Use JSON null (not the string "null", not an empty
string) when the Comments band is blank, unreadable, or absent.

The crop may include a few pixels of the bottom row of the broadcast
grid just above the printed "Comments:" line. Do NOT transcribe row
content from above the Comments line — those entries are captured by a
separate call against the bottom quadrants. Only transcribe what the
DJ wrote in the Comments band itself.

Never invent content. If the band is unreadable, return null rather
than guessing.

Return only the structured JSON described by the response schema.
"""
72 changes: 60 additions & 12 deletions scripts/calibrate_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,12 @@

modal-qwen-vl-quad
Per-quadrant Qwen-VL on Modal: crops the page into
4 sub-images plus a header strip, calls the model 5x
per page, assembles a PageResult locally. Eliminates
cross-quadrant content placement errors that the
single-shot `modal-qwen-vl` adapter still suffers.
~5x cost (~$0.05-0.10/page); full corpus ~$1000-1500.
4 sub-images plus a header strip and a footer strip,
calls the model 6x per page, assembles a PageResult
locally. Eliminates cross-quadrant content placement
errors that the single-shot `modal-qwen-vl` adapter
still suffers. ~6x cost (~$0.06-0.12/page); full
corpus ~$1200-1800.

local-quadrant-smoke
Local-only crop-quality smoke check: runs Churro
Expand Down Expand Up @@ -98,6 +99,7 @@
from core.golden import GoldenTruth, RowCountDiscrepancy, compare_row_counts # noqa: E402
from core.page_layout import PageLayout, detect_page_layout # noqa: E402
from core.prompts import ( # noqa: E402
FOOTER_EXTRACTION_PROMPT,
HEADER_EXTRACTION_PROMPT,
PAGE_EXTRACTION_PROMPT,
QUADRANT_EXTRACTION_PROMPT_TEMPLATE,
Expand Down Expand Up @@ -408,6 +410,22 @@ def transcribe(image_path: Path) -> PageResult:
"""


# Response schema handed to the footer-strip call as `json_schema=`; it
# restricts the reply to a single `comments_raw` field.
FOOTER_WIRE_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        # Nullable on purpose: the footer prompt asks for JSON null when the
        # Comments band is blank, unreadable, or absent.
        "comments_raw": {"type": ["string", "null"]},
    },
    # The key itself is mandatory even when its value is null.
    "required": ["comments_raw"],
    # No extra keys — the footer call must not smuggle in row content.
    "additionalProperties": False,
}
"""JSON Schema for the footer-strip call in `modal-qwen-vl-quad`.

Mirrors `HEADER_WIRE_SCHEMA`'s shape: one-off, inline, no Pydantic
indirection. Pulls only `comments_raw` (the verbatim contents of the
printed "Comments:" band at the bottom of the page).
"""


def make_modal_qwen_vl_quad_adapter(
model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct",
) -> TranscribeFn:
Expand All @@ -416,24 +434,29 @@ def make_modal_qwen_vl_quad_adapter(
Eliminates layout misplacement by construction: instead of asking
the model to spatially attribute rows from one full-page image to
the right quadrant slot in the JSON wrapper, we crop the page
locally and call the model 5 times — once per quadrant + once on
the header strip — and assemble the page server-side.
locally and call the model 6 times — once per quadrant, once on
the header strip, and once on the footer strip — and assemble the
page server-side.

Each call is grammar-constrained (xgrammar). The four quadrant
schemas pin `position` to a singleton enum so the model literally
cannot mislabel the cell. The header schema is small and ad-hoc,
capturing only `page_date_raw` and page-level oddities (DJ-handoff
notes, weather notes, marginal annotations above the grid).
notes, weather notes, marginal annotations above the grid). The
footer schema captures only `comments_raw` — the verbatim contents
of the printed "Comments:" band at the bottom of the page.

All 5 calls run inside one `with app.run():` block. Modal reuses
All 6 calls run inside one `with app.run():` block. Modal reuses
the warm container across them — the first call pays whatever
cold-start applies; calls 2-5 are warm. Per-page wall time is
roughly `cold + 4 * warm`, not `5 * warm`.
cold-start applies; calls 2-6 are warm. Per-page wall time is
roughly `cold + 5 * warm`, not `6 * warm`.

On per-quadrant JSON failure, the affected quadrant is replaced by
a `_quadrant_fallback` carrying the raw text in one entry tagged
`notes="parse_failed"`. Other quadrants still validate; the page
is never lost wholesale.
is never lost wholesale. Header and footer parse failures leave
their respective fields at their defaults (`page_date_raw=None`,
`oddities=[]`, `comments_raw=None`) without failing the page.
"""
from PIL import Image

Expand All @@ -450,10 +473,12 @@ def transcribe(image_path: Path) -> PageResult:
image = Image.open(image_path).convert("RGB")
layout = detect_page_layout(image)
header_image = _crop_header_strip(image, layout)
footer_image = _crop_footer_strip(image, layout)
crops = _crop_quadrants(image, layout)

page_date_raw: str | None = None
page_oddities: list[str] = []
comments_raw: str | None = None
quadrants: list[Quadrant] = []

with app.run():
Expand Down Expand Up @@ -494,9 +519,26 @@ def transcribe(image_path: Path) -> PageResult:
except Exception:
quadrants.append(_quadrant_fallback(text, position))

# Footer call — surfaces the bottom Comments band, which the
# quadrant crops deliberately exclude (they stop at
# body_bottom_y). Same fault-tolerance shape as the header
# call: a parse failure leaves comments_raw at None.
try:
footer_text: str = transcribe_qwen_vl.remote(
_png_bytes(footer_image),
FOOTER_EXTRACTION_PROMPT,
model_id,
json_schema=FOOTER_WIRE_SCHEMA,
)
footer_data = json.loads(footer_text)
comments_raw = footer_data.get("comments_raw")
except Exception:
pass # leave default; not worth failing the page

return PageResult(
page_date_raw=page_date_raw,
quadrants=quadrants,
comments_raw=comments_raw,
model_version=f"modal-qwen-vl-quad:{model_id}",
extracted_at=datetime.now(UTC),
oddities=page_oddities,
Expand Down Expand Up @@ -602,6 +644,12 @@ def _crop_header_strip(image: PILImage, layout: PageLayout) -> PILImage:
return image.crop((0, 0, w, layout.header_bottom_y))


def _crop_footer_strip(image: PILImage, layout: PageLayout) -> PILImage:
    """Return the full-width band from `layout.body_bottom_y` to the image's bottom edge.

    This is the strip below the body grid that holds the printed
    "Comments:" line and the DJ's free-text commentary.
    """
    page_width, page_height = image.size
    footer_box = (0, layout.body_bottom_y, page_width, page_height)
    return image.crop(footer_box)


def _crop_quadrants(image: PILImage, layout: PageLayout) -> dict[QuadrantPosition, PILImage]:
"""Split the page body into 4 quadrants on the detected grid lines.

Expand Down
88 changes: 81 additions & 7 deletions tests/unit/test_calibrate_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,11 @@ def _painted_page(width: int, height: int, layout: PageLayout) -> object:
draw.rectangle(
(layout.column_mid_x, layout.body_mid_y, width, layout.body_bottom_y), fill=(255, 255, 0)
) # BR
# Footer band below body_bottom_y is left white so any leakage into
# the bottom quadrants would be visible.
# Paint the footer band below body_bottom_y so a footer crop is
# identifiable by sampling a pixel; leakage into the bottom quadrants
# would still be visible since the color is distinct from the bottom-
# quadrant fills.
draw.rectangle((0, layout.body_bottom_y, width, height), fill=(128, 128, 128)) # footer
return image


Expand All @@ -338,6 +341,36 @@ def test_crop_header_strip_uses_layout_header_bottom_y() -> None:
assert strip.getpixel((400, 60)) == (10, 10, 10)


def test_crop_footer_strip_uses_layout_body_bottom_y() -> None:
    """The footer crop spans the full width from body_bottom_y to the bottom edge.

    That band holds the printed Comments: line and any handwritten
    free-text below it — content the quadrant crops deliberately exclude.
    """
    page_width, page_height, footer_top = 800, 1000, 970
    layout = PageLayout(
        header_bottom_y=120,
        body_mid_y=550,
        body_bottom_y=footer_top,
        column_mid_x=400,
    )
    footer = cm._crop_footer_strip(_painted_page(page_width, page_height, layout), layout)

    footer_height = page_height - footer_top
    assert footer.size == (page_width, footer_height)
    # _painted_page fills the footer band with solid grey; probe its centre.
    centre = (page_width // 2, footer_height // 2)
    assert footer.getpixel(centre) == (128, 128, 128)


def test_crop_footer_strip_excludes_body_grid() -> None:
    """No bottom-quadrant pixel may appear anywhere in the footer crop.

    If the crop reached up into the grid, the model would helpfully
    transcribe the last row of those quadrants into comments_raw.
    """
    layout = PageLayout(header_bottom_y=120, body_mid_y=550, body_bottom_y=970, column_mid_x=400)
    footer = cm._crop_footer_strip(_painted_page(800, 1000, layout), layout)

    # Fill colours _painted_page uses for the two bottom quadrants.
    quadrant_fills = {
        (0, 0, 255): "bottom-left",
        (255, 255, 0): "bottom-right",
    }
    width, height = footer.size
    for y in range(height):
        for x in range(width):
            leaked_from = quadrant_fills.get(footer.getpixel((x, y)))
            assert leaked_from is None, f"{leaked_from} bled into footer at ({x},{y})"


def test_crop_quadrants_returns_canonical_keys() -> None:
layout = PageLayout(header_bottom_y=120, body_mid_y=550, body_bottom_y=970, column_mid_x=400)
image = _painted_page(800, 1000, layout)
Expand Down Expand Up @@ -642,8 +675,9 @@ def fake_run() -> object:
def test_modal_qwen_vl_quad_adapter_happy_path(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
"""5 RPC calls in canonical order, each with the right schema and prompt;
the assembled PageResult round-trips the header date and four quadrants."""
"""6 RPC calls in canonical order, each with the right schema and prompt;
the assembled PageResult round-trips the header date, four quadrants,
and the comments band."""
image = tmp_path / "1990-04apr0106-page05.png"
_save_fixture_page(image)
fake_remote = _patch_modal_for_quadrant(
Expand All @@ -654,19 +688,20 @@ def test_modal_qwen_vl_quad_adapter_happy_path(
_quadrant_json("top_right", "7AM", "DJ B"),
_quadrant_json("bottom_left", "8AM", "DJ C"),
_quadrant_json("bottom_right", "9AM", "DJ D"),
'{"comments_raw": "declared today anti-Valentines Day"}',
],
)

transcribe = cm.make_modal_qwen_vl_quad_adapter("test-model")
result = transcribe(image)

# 5 calls: 1 header + 4 quadrants in canonical order.
assert fake_remote.call_count == 5
# 6 calls: 1 header + 4 quadrants + 1 footer, in canonical order.
assert fake_remote.call_count == 6
calls = fake_remote.call_args_list

# Call 0: header.
header_args = calls[0]
from core.prompts import HEADER_EXTRACTION_PROMPT
from core.prompts import FOOTER_EXTRACTION_PROMPT, HEADER_EXTRACTION_PROMPT

assert header_args.args[1] == HEADER_EXTRACTION_PROMPT
assert header_args.kwargs["json_schema"] == cm.HEADER_WIRE_SCHEMA
Expand All @@ -678,12 +713,18 @@ def test_modal_qwen_vl_quad_adapter_happy_path(
schema = calls[i].kwargs["json_schema"]
assert schema["properties"]["position"] == {"enum": [position]}

# Call 5: footer.
footer_args = calls[5]
assert footer_args.args[1] == FOOTER_EXTRACTION_PROMPT
assert footer_args.kwargs["json_schema"] == cm.FOOTER_WIRE_SCHEMA

# Assembled PageResult.
assert result.page_date_raw == "Mon 1 Jan 90"
assert result.oddities == ["weather: snowy"]
assert [q.position for q in result.quadrants] == list(QUADRANT_ORDER)
assert result.quadrants[0].hour_raw == "6AM"
assert result.quadrants[3].jock_raw == "DJ D"
assert result.comments_raw == "declared today anti-Valentines Day"
assert result.model_version == "modal-qwen-vl-quad:test-model"


Expand All @@ -702,6 +743,7 @@ def test_modal_qwen_vl_quad_adapter_quadrant_fallback_on_malformed_json(
"not json {{", # second quadrant returns garbage
_quadrant_json("bottom_left", "8AM", "C"),
_quadrant_json("bottom_right", "9AM", "D"),
'{"comments_raw": null}',
],
)

Expand Down Expand Up @@ -741,6 +783,7 @@ def test_modal_qwen_vl_quad_adapter_header_failure_does_not_fail_page(
_quadrant_json("top_right", "7AM", "B"),
_quadrant_json("bottom_left", "8AM", "C"),
_quadrant_json("bottom_right", "9AM", "D"),
'{"comments_raw": "valid footer"}',
],
)

Expand All @@ -752,6 +795,37 @@ def test_modal_qwen_vl_quad_adapter_header_failure_does_not_fail_page(
assert [q.position for q in result.quadrants] == list(QUADRANT_ORDER)
# Quadrant data still flows through.
assert result.quadrants[0].hour_raw == "6AM"
# Footer call is independent of the header call — its content survives.
assert result.comments_raw == "valid footer"


def test_modal_qwen_vl_quad_adapter_footer_failure_does_not_fail_page(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """An unparseable footer reply degrades only comments_raw (to None).

    The header date and all four quadrants of the same page must still
    come through in the assembled result.
    """
    page_path = tmp_path / "1990-04apr0106-page25.png"
    _save_fixture_page(page_path)

    header_reply = '{"page_date_raw": "Mon 1 Jan 90", "oddities": []}'
    quadrant_replies = [
        _quadrant_json("top_left", "6AM", "A"),
        _quadrant_json("top_right", "7AM", "B"),
        _quadrant_json("bottom_left", "8AM", "C"),
        _quadrant_json("bottom_right", "9AM", "D"),
    ]
    footer_reply = "garbage response from footer call"  # deliberately not JSON
    _patch_modal_for_quadrant(
        monkeypatch,
        side_effect=[header_reply, *quadrant_replies, footer_reply],
    )

    result = cm.make_modal_qwen_vl_quad_adapter("test-model")(page_path)

    # Only the footer field falls back to its default.
    assert result.comments_raw is None
    # Header and quadrant data are unaffected.
    assert result.page_date_raw == "Mon 1 Jan 90"
    assert [q.position for q in result.quadrants] == list(QUADRANT_ORDER)
    assert result.quadrants[0].hour_raw == "6AM"


# -- _run_row_count_check / _format_discrepancy --------------------------------
Expand Down
36 changes: 36 additions & 0 deletions tests/unit/test_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest

from core.prompts import (
FOOTER_EXTRACTION_PROMPT,
HEADER_EXTRACTION_PROMPT,
PAGE_EXTRACTION_PROMPT,
QUADRANT_EXTRACTION_PROMPT_TEMPLATE,
Expand Down Expand Up @@ -268,3 +269,38 @@ def test_header_prompt_scopes_oddities_to_page_level() -> None:

def test_header_prompt_forbids_invented_content() -> None:
    """The header prompt must carry the literal "Never invent content" rule."""
    required_rule = "Never invent content"
    assert required_rule in HEADER_EXTRACTION_PROMPT


# -- FOOTER_EXTRACTION_PROMPT ----------------------------------------------


def test_footer_prompt_captures_comments_raw() -> None:
    """The footer prompt must name the `comments_raw` field it asks for."""
    field_name = "comments_raw"
    assert field_name in FOOTER_EXTRACTION_PROMPT


def test_footer_prompt_demands_verbatim_transcription() -> None:
    """The prompt must ask for a verbatim transcription (case-insensitive check).

    The Comments band is free-text DJ commentary — the model must not
    clean it up like an editor.
    """
    lowered_prompt = FOOTER_EXTRACTION_PROMPT.lower()
    assert "verbatim" in lowered_prompt


def test_footer_prompt_specifies_json_null_for_blank() -> None:
    """The prompt must spell out "JSON null" for a blank Comments band.

    A blank band has to round-trip as null, not "" — same convention as
    page_date_raw / hour_raw / jock_raw.
    """
    null_convention = "JSON null"
    assert null_convention in FOOTER_EXTRACTION_PROMPT


def test_footer_prompt_scopes_to_footer_band() -> None:
    """The prompt must forbid transcribing content from above the Comments line.

    The prompt itself warns that the footer crop may include a few pixels
    of the bottom grid row; without an explicit prohibition the model will
    helpfully transcribe the last row of the bottom quadrants into
    comments_raw.

    The negation and "above" must occur in the *same* sentence. Checking
    them independently is vacuous: "Do not fix spelling" supplies a
    "do not" and "just above the printed" supplies an "above" even if the
    scoping instruction were deleted.
    """
    # Collapse whitespace so line wrapping inside the prompt cannot split a
    # sentence across the check.
    normalized = " ".join(FOOTER_EXTRACTION_PROMPT.lower().split())
    assert "comments" in normalized

    sentences = normalized.split(". ")
    assert any("do not" in sentence and "above" in sentence for sentence in sentences), (
        "footer prompt has no sentence forbidding transcription of content "
        "above the Comments line"
    )


def test_footer_prompt_forbids_invented_content() -> None:
    """The footer prompt must carry the literal "Never invent content" rule."""
    required_rule = "Never invent content"
    assert required_rule in FOOTER_EXTRACTION_PROMPT
Loading