From 1a8c41f196b8a85d01b34f129cd14f8f2775ea6c Mon Sep 17 00:00:00 2001 From: maish Date: Sun, 14 Jun 2026 21:28:40 +0800 Subject: [PATCH 1/3] chore: stop tracking .claude/ local config Untrack the committed .claude/settings.json and gitignore the whole .claude/ directory so personal Claude Code config (permissions, skills) is never pushed to this public repo. Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/settings.json | 57 ------------------------------------------- .gitignore | 3 +++ 2 files changed, 3 insertions(+), 57 deletions(-) delete mode 100644 .claude/settings.json diff --git a/.claude/settings.json b/.claude/settings.json deleted file mode 100644 index 6a1c27b..0000000 --- a/.claude/settings.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "permissions": { - "allow": [ - "Bash(python:*)", - "Bash(python3:*)", - "Bash(pytest:*)", - "Bash(pip:*)", - "Bash(source:*)", - "Bash(ls:*)", - "Bash(find:*)", - "Bash(grep:*)", - "Bash(rg:*)", - "Bash(wc:*)", - "Bash(du:*)", - "Bash(head:*)", - "Bash(tail:*)", - "Bash(mkdir:*)", - "Bash(rmdir:*)", - "Bash(cat:*)", - "Bash(git status:*)", - "Bash(git diff:*)", - "Bash(git log:*)", - "Bash(git show:*)", - "Bash(git branch:*)", - "Bash(git add:*)", - "Bash(git mv:*)", - "Bash(git rm:*)", - "Bash(git checkout:*)", - "Bash(git reset HEAD *)", - "Bash(wait *)", - "Bash(N_FINTABNET=20 SCAN_LIMIT=800 FINTABNET_CACHE=/tmp/fintabnet_shards python scripts/build_fintabnet_fixtures.py)", - "Bash(N_FINTABNET=200 SCAN_LIMIT=5000 FINTABNET_CACHE=/tmp/fintabnet_shards python scripts/build_fintabnet_fixtures.py)", - "Bash(unzip -l /Users/pebbleroad/Documents/table2rules/dist/table2rules-0.3.0-py3-none-any.whl)", - "Bash(/tmp/t2r-fresh/bin/pip install *)", - "Bash(/tmp/t2r-fresh/bin/pip list *)", - "Bash(/tmp/t2r-fresh/bin/python /tmp/t2r-fresh/smoke.py)", - "Bash(/tmp/t2r-fresh/bin/table2rules)", - "Bash(/tmp/t2r-fresh/bin/python -m table2rules --help)", - "Bash(/tmp/t2r-fresh/bin/python -c ' *)", - "Bash(ruff check *)", - 
"Bash(ruff format *)", - "Bash(mypy src/table2rules)", - "Bash(sed -n '80,90p' /Users/pebbleroad/Documents/table2rules/src/table2rules/grid_parser.py)", - "Bash(sed -n '255,265p' /Users/pebbleroad/Documents/table2rules/src/table2rules/grid_parser.py)", - "Bash(/tmp/t2r-final/bin/pip install *)", - "Bash(/tmp/t2r-final/bin/python /tmp/t2r-fresh/smoke.py)", - "Bash(/tmp/t2r-final/bin/table2rules)" - ], - "ask": [ - "Bash(git commit:*)", - "Bash(git push:*)", - "Bash(git reset --hard:*)", - "Bash(rm:*)", - "Bash(rm -rf:*)" - ] - } -} diff --git a/.gitignore b/.gitignore index 604f7f4..c19fc37 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,6 @@ build/ # Local benchmark outputs (gold should be versioned) benchmarks/current/ + +# Claude Code local config (settings, skills) — never commit +/.claude/ From 8ed5f9b83b7841e9c1886e0f7140eb8e00f13ccb Mon Sep 17 00:00:00 2001 From: maish Date: Sun, 14 Jun 2026 22:04:00 +0800 Subject: [PATCH 2/3] fix(core): thread label-only row-group headers in real docling schedule shapes Three shapes from real TableItem.export_to_html output still dropped the line-item title from grouped values' row_headers; 0.6.x only handled the idealized forms. - Narrow title -> full-width description band -> values: the title's row-group extent now extends THROUGH an immediately-following full-width description band (absorbed as a nested header member, not a boundary), so the title threads as the outer ancestor instead of terminating the extent and being dropped. The absorbed band is bounded by the same extent so it does not leak past the next title. - Multi-cell title rows (leading item number/key + textual title, "10 | Travel delay" / "3. | Permanent loss of:"): a label-only row is a group header when at most ONE of its label cells is numeric-only. This admits number+title while still rejecting a data row whose value columns merely happen to be empty ("Average: | 80.2 | 10.7 | 3.3", >=2 numeric) -- replaces the old exactly-one- cell guard. 
A repeating key column (same col+text on the group's value rows) is excluded from the promoted title so it is not duplicated in the path. - Two-column Label|Value schedules: Signal D promotes the left column to the row-label/stub even under a single-row thead header, scoped to max_cols == 2 so multi-column property tables are untouched. Also yields proper one-record-per-line output for plain 2-col relational tables ("North | Sales: 100") instead of two disconnected "Header: value" lines. 4 existing golds improve (2-col relational tables now bind row<->value); no value or label text dropped. New fixtures label-only-title-then-description-band and label-only-title-number-key-matrix. Full suite green; ruff + mypy clean. Known limitation (pre-existing, separate gate interaction, tracked for follow-up): a ". | Group: | (empty)" sub-grouped header with a colspan title over a promoted descriptor column still falls back to flat. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 32 +++++++ ...label-only-title-number-key-matrix.out.txt | 16 ++++ ...l-only-title-then-description-band.out.txt | 5 ++ .../multi-table/nested-in-cell.out.txt | 6 +- .../empty-rows-pdf-artifact.out.txt | 6 +- .../relational/multiple-tbody.out.txt | 12 +-- .../relational/tfoot-before-tbody.out.txt | 6 +- src/table2rules/_core.py | 83 +++++++++++++++---- src/table2rules/grid_parser.py | 18 ++++ .../label-only-title-number-key-matrix.md | 27 ++++++ .../label-only-title-then-description-band.md | 30 +++++++ 11 files changed, 205 insertions(+), 36 deletions(-) create mode 100644 benchmarks/gold/rules/fixtures/matrix/label-only-title-number-key-matrix.out.txt create mode 100644 benchmarks/gold/rules/fixtures/matrix/label-only-title-then-description-band.out.txt create mode 100644 tests/fixtures/matrix/label-only-title-number-key-matrix.md create mode 100644 tests/fixtures/matrix/label-only-title-then-description-band.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 1313a65..b54bb65 100644 
--- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,38 @@ All notable changes to `table2rules` are documented here. Dates are in ## [Unreleased] +### Fixed + +- **Label-only row-group headers now thread through real docling schedule + shapes.** Three additional shapes from real `TableItem.export_to_html` output + were dropping the line-item title from grouped values' `row_headers`: + - *Narrow title → full-width description band → values.* The title's row-group + extent now extends *through* an immediately-following full-width description + band (a nested member of the header block, not a boundary), so the title + threads as the outer ancestor instead of being dropped: + `9. Trip Cancellation > If your trip is cancelled… > 1. Adult insured person | …`. + - *Multi-cell title rows* (a leading item number/key plus a textual title, e.g. + `10 | Travel delay`) are now recognized as group headers. A row is a title + when at most one of its label cells is numeric-only — this admits the + number+title shape while still rejecting a data row whose value columns merely + happen to be empty (`Average: | 80.2 | 10.7 | 3.3`, ≥2 numeric). A repeating + key column is excluded from the promoted title so it is not duplicated. + - *Two-column `Label | Value` schedules.* The left column is now promoted to the + row-label/stub even under a single-row thead header (`Benefit | Maximum limit`) + — Signal D, scoped to exactly two columns so multi-column property tables are + untouched. This also produces proper one-record-per-line output for plain + two-column relational tables (`North | Sales: 100`) instead of splitting each + row into two disconnected `Header: value` lines. + + New fixtures `matrix/label-only-title-then-description-band` and + `matrix/label-only-title-number-key-matrix`. + + *Known limitation:* a sub-grouped header of the form `. 
| Group: | (empty)` + with a `colspan` title over a promoted descriptor column still falls back to + flat (the spanned cell trips the gate's "rules originate from `<td>`" + invariant). This is a pre-existing, separate gate interaction — not the + label-only-threading path — and is tracked for a follow-up. + ## [0.6.2] — 2026-06-14 ### Fixed diff --git a/benchmarks/gold/rules/fixtures/matrix/label-only-title-number-key-matrix.out.txt b/benchmarks/gold/rules/fixtures/matrix/label-only-title-number-key-matrix.out.txt new file mode 100644 index 0000000..98cacb1 --- /dev/null +++ b/benchmarks/gold/rules/fixtures/matrix/label-only-title-number-key-matrix.out.txt @@ -0,0 +1,16 @@ +Trip Cancellation > If your trip is cancelled due to specified events. > 9 > 1. Adult insured person | Value Plan > Individual: 5,000 +Trip Cancellation > If your trip is cancelled due to specified events. > 9 > 1. Adult insured person | Value Plan > Family: 10,000 +Trip Cancellation > If your trip is cancelled due to specified events. > 9 > 1. Adult insured person | Economy Plan > Individual: 3,000 +Trip Cancellation > If your trip is cancelled due to specified events. > 9 > 1. Adult insured person | Economy Plan > Family: 6,000 +Trip Cancellation > If your trip is cancelled due to specified events. > 9 > 2. Child insured person | Value Plan > Individual: 2,500 +Trip Cancellation > If your trip is cancelled due to specified events. > 9 > 2. Child insured person | Value Plan > Family: 5,000 +Trip Cancellation > If your trip is cancelled due to specified events. > 9 > 2. Child insured person | Economy Plan > Individual: 1,500 +Trip Cancellation > If your trip is cancelled due to specified events. > 9 > 2. Child insured person | Economy Plan > Family: 3,000 +Travel delay > If the departure of your public transport is delayed by six hours. > 10 > 1. Adult insured person | Value Plan > Individual: 100 +Travel delay > If the departure of your public transport is delayed by six hours. > 10 > 1.
Adult insured person | Value Plan > Family: 200 +Travel delay > If the departure of your public transport is delayed by six hours. > 10 > 1. Adult insured person | Economy Plan > Individual: 150 +Travel delay > If the departure of your public transport is delayed by six hours. > 10 > 1. Adult insured person | Economy Plan > Family: 300 +Travel delay > If the departure of your public transport is delayed by six hours. > 10 > 2. Child insured person | Value Plan > Individual: 50 +Travel delay > If the departure of your public transport is delayed by six hours. > 10 > 2. Child insured person | Value Plan > Family: 100 +Travel delay > If the departure of your public transport is delayed by six hours. > 10 > 2. Child insured person | Economy Plan > Individual: 75 +Travel delay > If the departure of your public transport is delayed by six hours. > 10 > 2. Child insured person | Economy Plan > Family: 150 \ No newline at end of file diff --git a/benchmarks/gold/rules/fixtures/matrix/label-only-title-then-description-band.out.txt b/benchmarks/gold/rules/fixtures/matrix/label-only-title-then-description-band.out.txt new file mode 100644 index 0000000..2518398 --- /dev/null +++ b/benchmarks/gold/rules/fixtures/matrix/label-only-title-then-description-band.out.txt @@ -0,0 +1,5 @@ +9. Trip Cancellation > If your trip is cancelled due to specified events before departure. > 1. Adult insured person | Maximum limit (S$): 5,000 +9. Trip Cancellation > If your trip is cancelled due to specified events before departure. > 2. Child insured person | Maximum limit (S$): 2,500 +10. Travel Delay > If the departure of your public transport is delayed by at least six hours. > 1. Adult insured person | Maximum limit (S$): 100 per six hours up to 1,500 +10. Travel Delay > If the departure of your public transport is delayed by at least six hours. > 2. Child insured person | Maximum limit (S$): 50 per six hours up to 1,500 +11. 
Trip Postponement \ No newline at end of file diff --git a/benchmarks/gold/rules/fixtures/multi-table/nested-in-cell.out.txt b/benchmarks/gold/rules/fixtures/multi-table/nested-in-cell.out.txt index 676a6b3..9324e15 100644 --- a/benchmarks/gold/rules/fixtures/multi-table/nested-in-cell.out.txt +++ b/benchmarks/gold/rules/fixtures/multi-table/nested-in-cell.out.txt @@ -2,7 +2,5 @@ A-001 | Status: Open A-001 | Comment: Parent row has nested summary: k, v; x, 1 A-002 | Status: Closed A-002 | Comment: Normal row -Metric: Total Open -Value: 1 -Metric: Total Closed -Value: 1 \ No newline at end of file +Total Open | Value: 1 +Total Closed | Value: 1 \ No newline at end of file diff --git a/benchmarks/gold/rules/fixtures/relational/empty-rows-pdf-artifact.out.txt b/benchmarks/gold/rules/fixtures/relational/empty-rows-pdf-artifact.out.txt index 542543d..26a9385 100644 --- a/benchmarks/gold/rules/fixtures/relational/empty-rows-pdf-artifact.out.txt +++ b/benchmarks/gold/rules/fixtures/relational/empty-rows-pdf-artifact.out.txt @@ -1,4 +1,2 @@ -Item: Widget -Qty: 10 -Item: Gadget -Qty: 20 \ No newline at end of file +Widget | Qty: 10 +Gadget | Qty: 20 \ No newline at end of file diff --git a/benchmarks/gold/rules/fixtures/relational/multiple-tbody.out.txt b/benchmarks/gold/rules/fixtures/relational/multiple-tbody.out.txt index 9b7c134..6a63054 100644 --- a/benchmarks/gold/rules/fixtures/relational/multiple-tbody.out.txt +++ b/benchmarks/gold/rules/fixtures/relational/multiple-tbody.out.txt @@ -1,8 +1,4 @@ -Region: North -Sales: 100 -Region: South -Sales: 200 -Region: East -Sales: 150 -Region: West -Sales: 180 \ No newline at end of file +North | Sales: 100 +South | Sales: 200 +East | Sales: 150 +West | Sales: 180 \ No newline at end of file diff --git a/benchmarks/gold/rules/fixtures/relational/tfoot-before-tbody.out.txt b/benchmarks/gold/rules/fixtures/relational/tfoot-before-tbody.out.txt index 763620a..e74ec9c 100644 --- 
a/benchmarks/gold/rules/fixtures/relational/tfoot-before-tbody.out.txt +++ b/benchmarks/gold/rules/fixtures/relational/tfoot-before-tbody.out.txt @@ -1,5 +1,3 @@ Total | Amount: 300 -Item: Widget -Amount: 100 -Item: Gadget -Amount: 200 \ No newline at end of file +Widget | Amount: 100 +Gadget | Amount: 200 \ No newline at end of file diff --git a/src/table2rules/_core.py b/src/table2rules/_core.py index 48889a9..d8fd3d0 100644 --- a/src/table2rules/_core.py +++ b/src/table2rules/_core.py @@ -411,15 +411,33 @@ def _label_cols(r: int) -> List[int]: cols.append(c) return cols - def _single_label_origin(r: int) -> bool: - # A group header is exactly one label source cell (a title, possibly - # colspan'd). More than one distinct non-empty label cell means a data - # row, not a divider — do not thread it. - origins = set() - for c in _label_cols(r): - cell = grid[r][c] - origins.add(cell.get("origin", (r, c)) if cell.get("is_span_copy") else (r, c)) - return len(origins) == 1 + def _cell_text(r: int, c: int) -> str: + cell = grid[r][c] + if cell.get("is_span_copy"): + o = cell.get("origin", (r, c)) + return (grid[o[0]][o[1]].get("text") or "").strip() + return (cell.get("text") or "").strip() + + def _is_numeric_only(text: str) -> bool: + # No alphabetic character but at least one digit — a bare item number + # ("3.", "10"), reusing the parser's universal "letters label, digits + # measure" signal. A group title carries text; a mis-promoted value cell + # is a number. + return ( + bool(text) and not any(ch.isalpha() for ch in text) and any(ch.isdigit() for ch in text) + ) + + def _title_like(r: int) -> bool: + # A group-header title carries at most ONE numeric-only label cell (a + # leading item number, e.g. "10 | Travel delay" or "3. | Permanent loss + # of:"). Two or more numeric label cells means a data row whose value + # columns merely happen to be empty (e.g. 
a header that over-promoted + # numeric columns to row labels, "Average: | 80.2 | 10.7 | 3.3") — + # threading it would invent a breadcrumb, so it stays an is_label. + cols = _label_cols(r) + if not cols: + return False + return sum(1 for c in cols if _is_numeric_only(_cell_text(r, c))) <= 1 # A row already carrying a rowgroup cell (a full-width band promoted above, # or a source scope="rowgroup") is a boundary, not a label-only candidate. @@ -432,7 +450,7 @@ def _single_label_origin(r: int) -> bool: and r not in band_rows and not _has_value(r) and bool(_label_cols(r)) - and _single_label_origin(r) + and _title_like(r) for r in range(n_rows) ] @@ -448,23 +466,56 @@ def _single_label_origin(r: int) -> bool: s_end = r r += 1 # advance past the stack for the outer loop - # Extent: down to the row before the next boundary (next label stack or - # full-width band). Bounded by a value row's presence. + # Absorb a run of full-width band rows immediately following the title + # stack (a description band under the title) into this header block — + # they are nested members, not a boundary. Without this the title's + # extent would terminate at the band and the title would be dropped (the + # narrow-title-then-full-width-description shape). + header_end = s_end + while header_end + 1 < n_rows and (header_end + 1) in band_rows: + header_end += 1 + + # Extent: from the first row after the header block to the row before the + # next group start — the next title, or a full-width band that begins a + # new section (one appearing AFTER a value row). A band absorbed above is + # part of this header, not a boundary. 
extent_end = n_rows - 1 - for rr in range(s_end + 1, n_rows): - if is_label_row[rr] or rr in band_rows: + saw_value = False + for rr in range(header_end + 1, n_rows): + if is_label_row[rr]: extent_end = rr - 1 break - has_data_row = any(_has_value(rr) for rr in range(s_end + 1, extent_end + 1)) - if not has_data_row: + if rr in band_rows and saw_value: + extent_end = rr - 1 + break + if _has_value(rr): + saw_value = True + value_rows = [rr for rr in range(header_end + 1, extent_end + 1) if _has_value(rr)] + if not value_rows: continue + # Promote each title cell, EXCLUDING a key column whose (column, text) + # repeats on a value row of the group — a repeating item-number/key + # already threads via the value rows' own labels; promoting it again + # would duplicate it in the path. The remaining cells are the title. for rr in range(s_start, s_end + 1): for c in _label_cols(rr): + text = _cell_text(rr, c) + if any(_cell_text(vr, c) == text for vr in value_rows): + continue grid[rr][c]["type"] = "th" grid[rr][c]["scope"] = "rowgroup" grid[rr][c]["rowgroup_extent_end"] = extent_end + # Bound the absorbed description band(s) by the same extent so a + # full-width description does not leak past the next narrow title (its + # colspan is wider, so the maze's colspan rule would not close it). + for rr in range(s_end + 1, header_end + 1): + for c in range(n_cols): + cell = grid[rr][c] + if cell.get("scope") == "rowgroup": + cell["rowgroup_extent_end"] = extent_end + def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]: """Runs the full pipeline and returns rules plus the gate verdict. 
diff --git a/src/table2rules/grid_parser.py b/src/table2rules/grid_parser.py index 0779dee..4b28391 100644 --- a/src/table2rules/grid_parser.py +++ b/src/table2rules/grid_parser.py @@ -487,6 +487,24 @@ def _descriptor_like(col: int) -> bool: if _descriptor_like(c): promote_cols.add(c) + # --- Signal D: stub column in a 2-column Label|Value schedule --- + # In a two-column table the left column is the row-label/stub and the + # right column is its value — even when col 0 carries a thead header, + # which Signals A/C/B all skip (they need a multi-row/rowspan header or + # an unlabeled column). This is the single-row-thead + # "Benefit | Maximum limit (S$)" schedule shape. Scoped to exactly two + # columns so multi-column property tables (where col 0 is one data field + # among several) are untouched; col 0 must be descriptor-like and col 1 + # must carry values, so a two-column all-text table is left alone. + if ( + max_cols == 2 + and 0 not in promote_cols + and _descriptor_like(0) + and body_nonempty[1] >= 1 + and not _descriptor_like(1) + ): + promote_cols.add(0) + if promote_cols: for c in sorted(promote_cols): for r in range(data_start_row_idx, len(grid)): diff --git a/tests/fixtures/matrix/label-only-title-number-key-matrix.md b/tests/fixtures/matrix/label-only-title-number-key-matrix.md new file mode 100644 index 0000000..5ee1067 --- /dev/null +++ b/tests/fixtures/matrix/label-only-title-number-key-matrix.md @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + +
Value PlanEconomy Plan
IndividualFamilyIndividualFamily
9Trip Cancellation
9If your trip is cancelled due to specified events.
91. Adult insured person5,00010,0003,0006,000
92. Child insured person2,5005,0001,5003,000
10Travel delay
10If the departure of your public transport is delayed by six hours.
101. Adult insured person100200150300
102. Child insured person5010075150
diff --git a/tests/fixtures/matrix/label-only-title-then-description-band.md b/tests/fixtures/matrix/label-only-title-then-description-band.md new file mode 100644 index 0000000..d45b44b --- /dev/null +++ b/tests/fixtures/matrix/label-only-title-then-description-band.md @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + +
BenefitMaximum limit (S$)
9. Trip Cancellation
If your trip is cancelled due to specified events before departure.
1. Adult insured person5,000
2. Child insured person2,500
10. Travel Delay
If the departure of your public transport is delayed by at least six hours.
1. Adult insured person100 per six hours up to 1,500
2. Child insured person50 per six hours up to 1,500
11. Trip Postponement
From 0c8b782614458dcc6d4a1e5abe8c2f6c4e44d75e Mon Sep 17 00:00:00 2001 From: maish Date: Sun, 14 Jun 2026 22:04:24 +0800 Subject: [PATCH 3/3] chore: release 0.6.3 Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b54bb65..013b653 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to `table2rules` are documented here. Dates are in ## [Unreleased] +## [0.6.3] — 2026-06-14 + ### Fixed - **Label-only row-group headers now thread through real docling schedule diff --git a/pyproject.toml b/pyproject.toml index 062c92d..72aa2a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "table2rules" -version = "0.6.2" +version = "0.6.3" description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding." readme = "README.md" license = "MIT"