23 changes: 23 additions & 0 deletions CLAUDE.md
@@ -13,8 +13,15 @@ scans/ input PDFs (gitignored; SCANS_ROOT)
data/ outputs (gitignored; DATA_ROOT)
pages/<rel-pdf>/page-NN.png rendered images
results/<rel-pdf>/page-NN.json extraction results (one PageResult per page)
verifier/<stem>.bundle.json pre-processor output: result + per-row bboxes
verifier/<stem>.verified.json verifier UI export: hand-corrected PageResult
jobs.db SQLite job table

verifier/ static SPA for manual row-by-row verification.
Loads a bundle, renders each row's cropped
image strip next to an editable text field,
exports a corrected verified.json.

core/
schema.py Pydantic models. GeminiPageResult is what
the model returns (used as response_schema);
@@ -41,12 +48,28 @@ core/
PageLayout (header_bottom_y, body_mid_y,
column_mid_x). Used by the per-quadrant
cropper in scripts/calibrate_models.py.
`partition_row_lines_by_quadrant(image,
layout)` is the public hook the verifier
pre-processor uses to compute per-row bboxes.
continuations.py Read-time merge of `notes="continuation"`
rows into the prior entry's raw_text.
Pure function; on-disk shape unchanged.

cli.py Typer entrypoint: `flowsheets <subcommand>`.
Builds dependencies from env, calls into core.

scripts/
make_verifier_bundle.py PageResult JSON + page PNG -> verifier
bundle.json with per-quadrant + per-row
bboxes for the SPA to canvas-crop. Hard-codes
SCHEMA_VERSION = 1; bump on incompatible
schema changes.
derive_truth.py <stem>.verified.json -> <stem>.truth.json
by extracting short uppercased substrings
(page date tokens, jock prefix, artist
portion of raw_text). Single source of
truth for those rules — the UI doesn't
derive truth itself.
```
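
The continuations merge described above is small enough to sketch. A minimal illustration of the read-time behavior, assuming `PageResult` exposes an `entries` list whose items carry `raw_text` and `notes` (the exact signature in `core/continuations.py` may differ):

```python
from core.schema import PageResult  # module per the tree above


def merge_continuations(result: PageResult) -> PageResult:
    """Fold notes="continuation" rows into the prior entry's raw_text.

    Pure read-time transform: the on-disk PageResult is never rewritten.
    Sketch only; field names are assumed from this layout description.
    """
    merged = []
    for entry in result.entries:
        if entry.notes == "continuation" and merged:
            prev = merged[-1]
            merged[-1] = prev.model_copy(
                update={"raw_text": f"{prev.raw_text} {entry.raw_text}"}
            )
        else:
            merged.append(entry)
    return result.model_copy(update={"entries": merged})
```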

## Why these choices
24 changes: 24 additions & 0 deletions README.md
@@ -89,6 +89,30 @@ Tests are split into:

The default test run **excludes** the `external_api` and `slow` markers; CI runs the same default. The golden-page external-API runner is a follow-up.

## Manual verifier

After the pipeline produces `data/results/<rel>/page-NN.json`, you can hand-verify and correct entries via the static SPA in `verifier/`. Each row's cropped image strip sits next to its detected text in an editable field. Exporting writes a `<stem>.verified.json` (shaped like a `PageResult`, so it plugs straight back into the pipeline as ground truth), and `scripts/derive_truth.py` then produces a matching `tests/golden/<stem>.truth.json`.

```bash
# Generate a bundle
python -m scripts.make_verifier_bundle \
data/results/<rel>/page-NN.json \
data/pages/<rel>/page-NN.png \
--out data/verifier/<stem>.bundle.json

# Open the verifier
python -m http.server 8765
# then visit:
# http://localhost:8765/verifier/?bundle=/data/verifier/<stem>.bundle.json

# Derive a truth file from the exported verified.json
python -m scripts.derive_truth \
data/verifier/<stem>.verified.json \
--out tests/golden/<stem>.truth.json
```

See `verifier/README.md` for the bundle schema, expected file layout, and the substring-derivation rules.
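
As a rough illustration of those rules (everything here is hypothetical: the function name, field names, and slicing are stand-ins, not the real `scripts/derive_truth.py`):

```python
# Hypothetical sketch of the substring derivation; the authoritative
# rules live in scripts/derive_truth.py and verifier/README.md.
def derive_row_truth(raw_text: str, jock: str, page_date: str) -> dict:
    artist = raw_text.split(" - ", 1)[0]  # artist portion (assumed delimiter)
    return {
        "date_tokens": page_date.upper().split(),  # page date tokens
        "jock_prefix": jock.upper().split()[0],    # jock prefix (assumed rule)
        "artist": artist.strip().upper(),          # short uppercased substring
    }
```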

## Cost calibration

Gemini 3.1 Pro charges per input token; one 300-DPI flowsheet page at `media_resolution=high` is ~1120 image tokens plus ~600 prompt tokens. Across the full corpus (~16K pages) input cost lands in the low tens of dollars; output adds modestly. Run the pipeline against a 10–20 page sample first and inspect both quality and `usage_metadata` before scheduling a full run.
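
For a back-of-envelope check on that estimate (the per-million-token price below is an illustrative assumption, not published pricing; verify before budgeting):

```python
# Rough input-cost estimate for the full corpus.
PAGES = 16_000
TOKENS_PER_PAGE = 1_120 + 600   # image + prompt tokens per page (from above)
USD_PER_M_INPUT = 1.25          # ASSUMED illustrative rate, not real pricing

input_tokens = PAGES * TOKENS_PER_PAGE                  # ~27.5M tokens
print(f"~${input_tokens / 1e6 * USD_PER_M_INPUT:.0f}")  # ~$34: low tens of dollars
```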
105 changes: 95 additions & 10 deletions core/jobs.py
@@ -50,11 +50,15 @@ class Job:
image_path: str | None
result_path: str | None
model_version: str | None
verified_at: str | None
verified_path: str | None
corrections_path: str | None
created_at: str
updated_at: str

@classmethod
def from_row(cls, row: aiosqlite.Row) -> Self:
keys = set(row.keys())
return cls(
pdf_path=row["pdf_path"],
page_number=row["page_number"],
@@ -64,29 +68,54 @@ def from_row(cls, row: aiosqlite.Row) -> Self:
image_path=row["image_path"],
result_path=row["result_path"],
model_version=row["model_version"],
# Late-added columns are nullable; tolerate their absence on a
# very old jobs.db that hasn't been re-init()ed yet.
verified_at=row["verified_at"] if "verified_at" in keys else None,
verified_path=row["verified_path"] if "verified_path" in keys else None,
corrections_path=(row["corrections_path"] if "corrections_path" in keys else None),
created_at=row["created_at"],
updated_at=row["updated_at"],
)


_SCHEMA = """
CREATE TABLE IF NOT EXISTS jobs (
pdf_path TEXT NOT NULL,
page_number INTEGER NOT NULL,
status TEXT NOT NULL,
attempts INTEGER NOT NULL DEFAULT 0,
last_error TEXT,
image_path TEXT,
result_path TEXT,
model_version TEXT,
verified_at TEXT,
verified_path TEXT,
corrections_path TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
PRIMARY KEY (pdf_path, page_number)
);

CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
"""

# Columns added after the initial schema. `init()` runs `ALTER TABLE` for
# each of these against existing databases so older jobs.db files pick up
# the new columns without losing data.
_LATE_COLUMNS: tuple[tuple[str, str], ...] = (
("verified_at", "TEXT"),
("verified_path", "TEXT"),
("corrections_path", "TEXT"),
)

# Indexes that depend on late-added columns and therefore must be created
# AFTER the ALTER TABLE migrations run. Keeping them out of `_SCHEMA`
# avoids "no such column" errors when initializing a legacy database.
_POST_MIGRATION_INDEXES: tuple[str, ...] = (
"CREATE INDEX IF NOT EXISTS idx_jobs_verified_at "
"ON jobs(verified_at) WHERE verified_at IS NOT NULL",
)


def _now() -> str:
return datetime.now(UTC).isoformat()
@@ -112,6 +141,19 @@ async def init(self) -> None:
# rollback journal. The pragma is persistent across connections.
await db.execute("PRAGMA journal_mode=WAL")
await db.executescript(_SCHEMA)
# ALTER TABLE migrations for late-added columns. CREATE TABLE
# above is idempotent (IF NOT EXISTS), so on a fresh DB this
# is a no-op; on an existing DB it adds the columns.
db.row_factory = aiosqlite.Row
cursor = await db.execute("PRAGMA table_info(jobs)")
existing = {row["name"] for row in await cursor.fetchall()}
for name, col_type in _LATE_COLUMNS:
if name not in existing:
await db.execute(f"ALTER TABLE jobs ADD COLUMN {name} {col_type}")
# Indexes that reference late columns run after the ALTER
# TABLE pass, otherwise SQLite errors on the missing column.
for index_sql in _POST_MIGRATION_INDEXES:
await db.execute(index_sql)
await db.commit()

@asynccontextmanager
@@ -234,6 +276,49 @@ async def mark_low_confidence(
clear_error=True,
)

async def mark_verified(
self,
pdf_path: str,
page_number: int,
*,
verified_path: Path,
corrections_path: Path,
) -> bool:
"""Record that a page has been hand-verified via the verifier UI.

Doesn't change `status` — verification is orthogonal to the
extraction state machine (a `completed` page can be verified;
re-extracting a verified page resets the result but should NOT
clear the verification record by default — that's a separate
decision a human makes via `retry`).

Returns True if a job row matched, False otherwise. Callers
(e.g. the verifier server) may want to write files even when no
job row exists for the page (test fixtures), so a False return
is not an error.
"""
async with self._connect() as db:
cursor = await db.execute(
"""
UPDATE jobs
SET verified_at = ?,
verified_path = ?,
corrections_path = ?,
updated_at = ?
WHERE pdf_path = ? AND page_number = ?
""",
(
_now(),
str(verified_path),
str(corrections_path),
_now(),
pdf_path,
page_number,
),
)
await db.commit()
return cursor.rowcount > 0

async def mark_failed(self, pdf_path: str, page_number: int, error: str) -> None:
async with self._connect() as db:
cursor = await db.execute(
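
A usage sketch for `mark_verified` as its docstring describes it. The store class name and constructor are assumed from context, since this diff doesn't show them:

```python
import asyncio
from pathlib import Path

from core.jobs import JobStore  # class name assumed; not visible in this diff


async def main() -> None:
    store = JobStore(Path("data/jobs.db"))  # constructor shape assumed
    matched = await store.mark_verified(
        "scans/example.pdf",
        4,
        verified_path=Path("data/verifier/example-p04.verified.json"),
        corrections_path=Path("data/verifier/example-p04.corrections.json"),
    )
    if not matched:
        # Not an error: verifier output can exist without a job row
        # (e.g. test fixtures); see the docstring above.
        print("no job row matched; files were still written")


asyncio.run(main())
```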
87 changes: 87 additions & 0 deletions core/page_layout.py
@@ -31,6 +31,8 @@

import numpy as np

from core.schema import QUADRANT_ORDER, QuadrantPosition

if TYPE_CHECKING:
from PIL.Image import Image as PILImage

@@ -89,6 +91,14 @@
# Comments line and excludes the last body row.
_BODY_BOTTOM_SEARCH_BAND = (0.95, 0.99)

# When the top quadrant's last spacing exceeds this multiple of the
# global median row spacing, the trailing line is reattributed to the
# corresponding bottom quadrant. The anomaly signals that body_mid_y
# landed BELOW the bottom block's hour-jock-cell baseline, leaving that
# line in the top partition by mistake. See
# `partition_row_lines_by_quadrant`'s correction-pass comment.
_BOTTOM_BASELINE_REATTRIBUTION_RATIO = 1.3


@dataclass(frozen=True)
class PageLayout:
Expand Down Expand Up @@ -296,3 +306,80 @@ def _detect_body_bottom_y(row_lines: list[int], h: int) -> int:
if not in_band:
return int(h * FALLBACK_BODY_BOTTOM_FRACTION)
return in_band[-1]


def partition_row_lines_by_quadrant(
image: PILImage, layout: PageLayout
) -> dict[QuadrantPosition, list[int]]:
"""Detected row-line y-coords, partitioned by quadrant of the body grid.

Reuses `_detect_row_lines` for the y-coordinates, then classifies each
line by which page-column it spans (left, right, or both, based on ink
density at that y) and which body band it sits in (top vs bottom, by
`layout.body_mid_y`).

A line spanning both columns is added to BOTH side quadrants — most
printed flowsheet grid lines run full-width and bracket both hour-blocks
of a row.

Lines outside `[layout.header_bottom_y, layout.body_bottom_y)` are
dropped (header or footer artifacts, not body rows).

Returns a dict with all four `QUADRANT_ORDER` keys; empty list when
no lines hit a quadrant (blank image, un-printed margin).
"""
w, _h = image.size
grayscale = np.asarray(image.convert("L"))
col_mid = layout.column_mid_x

all_lines = _detect_row_lines(grayscale, w, col_mid)

ink = (255 - grayscale).astype(np.float64) / 255.0
left_w = float(col_mid)
right_w = float(w - col_mid)
threshold = _ROW_LINE_THRESHOLDS[-1]

out: dict[QuadrantPosition, list[int]] = {q: [] for q in QUADRANT_ORDER}
for y in all_lines:
if not (layout.header_bottom_y <= y < layout.body_bottom_y):
continue
left_ink = float(ink[y, :col_mid].sum())
right_ink = float(ink[y, col_mid:].sum())
on_left = left_ink > threshold * left_w
on_right = right_ink > threshold * right_w
if y < layout.body_mid_y:
if on_left:
out["top_left"].append(int(y))
if on_right:
out["top_right"].append(int(y))
else:
if on_left:
out["bottom_left"].append(int(y))
if on_right:
out["bottom_right"].append(int(y))

# Correction pass: on some pages `_detect_body_mid_y` lands BELOW the
# bottom-block hour-jock-cell baseline (the anchor at 0.55h prefers the
# gap below the cell over the true inter-block gap above it). The
# baseline line then gets misattributed to the top quadrant, and the
# bottom quadrant's first detected line is row 0's BOTTOM rather than
# its top — shifting every row crop up by one.
#
# Signal: the top quadrant's last spacing is significantly larger than
# the median row spacing across all detected lines (a normal sequence
# has consistent spacing; an anomalous jump at the end means the last
# line belongs to a different sequence — the bottom block).
if len(all_lines) >= 2:
median_spacing = float(np.median(np.diff(np.asarray(all_lines))))
if median_spacing > 0:
for top_pos, bottom_pos in (
("top_left", "bottom_left"),
("top_right", "bottom_right"),
):
top_lines = out[top_pos] # type: ignore[index]
if len(top_lines) >= 2:
last_spacing = top_lines[-1] - top_lines[-2]
if last_spacing > _BOTTOM_BASELINE_REATTRIBUTION_RATIO * median_spacing:
moved = top_lines.pop()
out[bottom_pos].insert(0, moved) # type: ignore[index]
return out
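
And how the verifier pre-processor might call the hook (`PageLayout` field values here are made up, and the layout-detection entry point isn't shown in this diff):

```python
from PIL import Image

from core.page_layout import PageLayout, partition_row_lines_by_quadrant

# Illustrative layout values; in scripts/make_verifier_bundle.py they come
# from the real layout detector, which this diff doesn't show.
image = Image.open("data/pages/example/page-01.png")
layout = PageLayout(  # keyword args; the dataclass field order is assumed
    header_bottom_y=180,
    body_mid_y=1650,
    body_bottom_y=3150,
    column_mid_x=1275,
)
for quadrant, ys in partition_row_lines_by_quadrant(image, layout).items():
    print(quadrant, f"{len(ys)} row lines")
```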
8 changes: 8 additions & 0 deletions pyproject.toml
@@ -18,6 +18,14 @@ dependencies = [
"rich>=13.0.0",
"pillow>=10.0",
"numpy>=2.0",
# Verifier UI server (verifier/serve.py). The static SPA depends on the
# POST /api/lookup proxy (request-o-matic doesn't emit CORS) and
# POST /api/save (writes verified.json + corrections.json, updates
# jobs.db). httpx is also load-bearing for tests/unit/test_verifier_serve.py
# via httpx.ASGITransport.
"fastapi>=0.115",
"uvicorn>=0.30",
"httpx>=0.27",
]
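
A minimal sketch of the API surface that comment describes. The route paths come from the comment itself; request/response shapes, file naming, and the upstream URL are assumptions, not `verifier/serve.py`:

```python
# Sketch only; shapes and naming are assumed, not the real serve.py.
from pathlib import Path

import httpx
from fastapi import FastAPI

app = FastAPI()
VERIFIER_DIR = Path("data/verifier")


@app.post("/api/lookup")
async def lookup(payload: dict) -> dict:
    # Server-side proxy: the upstream lookup service doesn't emit CORS
    # headers, so the browser can't call it directly. Placeholder URL.
    async with httpx.AsyncClient() as client:
        resp = await client.post("https://lookup.example/api", json=payload)
    return resp.json()


@app.post("/api/save")
async def save(payload: dict) -> dict:
    # Writes verified.json + corrections.json; the jobs.db update
    # (mark_verified in core/jobs.py) is omitted from this sketch.
    stem = payload["stem"]  # assumed field
    (VERIFIER_DIR / f"{stem}.verified.json").write_text(payload["verified"])
    (VERIFIER_DIR / f"{stem}.corrections.json").write_text(payload["corrections"])
    return {"ok": True}
```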

[project.optional-dependencies]