diff --git a/CHANGELOG.md b/CHANGELOG.md index f14a86e..1313a65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,30 @@ All notable changes to `table2rules` are documented here. Dates are in ## [Unreleased] +## [0.6.2] — 2026-06-14 + +### Fixed + +- **Label-only row-group headers are now threaded across columns** (multi-column + matrix variant). A label-only band groups a *row range*, but 0.6.1 only reached + it from the value cell's own column and the row-label columns. When a group + header sits in a leading stub / line-number column while the sub-rows leave + that column empty and carry their identity in a different column (a numbered + schedule with `plan × cover` value columns), the band was unreachable — so the + header was **dropped entirely** rather than threaded. The maze now scans all + columns for label-only bands (full-width and source `scope="rowgroup"` bands + keep the column-restricted scan, so unrelated stub dividers don't cross-attach), + and each group's extent still closes at the next group: + + ``` + 10. Travel Delay > Adult under 70 | PLANS > BASIC: 100 + ``` + + This also recovers data in three real-world corpus tables (`pubtabnet-180372`, + `-357665`, `-374857`) whose year / cohort group headers — in a stub column the + data rows leave empty — were silently dropped in 0.6.1. New fixture + `matrix/label-only-rowgroup-stub-column`. + ## [0.6.1] — 2026-06-14 ### Fixed diff --git a/benchmarks/gold/rules/fixtures/matrix/label-only-rowgroup-stub-column.out.txt b/benchmarks/gold/rules/fixtures/matrix/label-only-rowgroup-stub-column.out.txt new file mode 100644 index 0000000..cd88958 --- /dev/null +++ b/benchmarks/gold/rules/fixtures/matrix/label-only-rowgroup-stub-column.out.txt @@ -0,0 +1,6 @@ +9. Trip Cancellation > If the trip is cancelled | PLANS > BASIC: 5,000 +9. Trip Cancellation > If the trip is cancelled | PLANS > ELITE: 10,000 +10. Travel Delay > Adult under 70 | PLANS > BASIC: 100 +10. 
Travel Delay > Adult under 70 | PLANS > ELITE: 200 +10. Travel Delay > Child | PLANS > BASIC: 50 +10. Travel Delay > Child | PLANS > ELITE: 100 \ No newline at end of file diff --git a/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-180372.out.txt b/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-180372.out.txt index e977ca4..abab048 100644 --- a/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-180372.out.txt +++ b/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-180372.out.txt @@ -1,70 +1,70 @@ -Municipality of Bologna | resident population > N.: 2398 -Municipality of Bologna | living abroad > %: 0.71 -Municipality of Bologna | no fixed abode > %: 0.12 -Municipality of Bologna | refusals objectors > %: 1.00 -Municipality of Bologna | immigrated > %: 0.25 -Municipality of Bologna | delays > %: 2.04 -Municipality of Bologna | other* > %: 1.54 -Emilia Romagna | resident population > N.: 29662 -Emilia Romagna | living abroad > %: 0.24 -Emilia Romagna | no fixed abode > %: 0.09 -Emilia Romagna | refusals objectors > %: 0.29 -Emilia Romagna | immigrated > %: 0.22 -Emilia Romagna | delays > %: 0.51 -Emilia Romagna | other* > %: 0.22 -Municipality of Bologna | resident population > N.: 2560 -Municipality of Bologna | living abroad > %: 1.09 -Municipality of Bologna | no fixed abode > %: 0.43 -Municipality of Bologna | refusals objectors > %: 0.47 -Municipality of Bologna | immigrated > %: 0.78 -Municipality of Bologna | delays > %: 1.05 -Municipality of Bologna | other* > %: 0.27 -Emilia Romagna | resident population > N.: 30916 -Emilia Romagna | living abroad > %: 0.27 -Emilia Romagna | no fixed abode > %: 0.07 -Emilia Romagna | refusals objectors > %: 0.35 -Emilia Romagna | immigrated > %: 0.22 -Emilia Romagna | delays > %: 0.37 -Emilia Romagna | other* > %: 0.11 -Municipality of Bologna | resident population > N.: 2429 -Municipality of Bologna | living abroad > %: 0,00 -Municipality of Bologna | no fixed abode > %: 0.08 -Municipality of Bologna | refusals 
objectors > %: 0.54 -Municipality of Bologna | immigrated > %: 1.19 -Municipality of Bologna | delays > %: 0.86 -Municipality of Bologna | other* > %: 0.70 -Emilia Romagna | resident population > N.: 31727 -Emilia Romagna | living abroad > %: 0.20 -Emilia Romagna | no fixed abode > %: 0.00 -Emilia Romagna | refusals objectors > %: 0.20 -Emilia Romagna | immigrated > %: 0.40 -Emilia Romagna | delays > %: 0.50 -Emilia Romagna | other* > %: 0.2 -Municipality of Bologna | resident population > N.: 2619 -Municipality of Bologna | living abroad > %: 0.04 -Municipality of Bologna | no fixed abode > %: 0.19 -Municipality of Bologna | refusals objectors > %: 1.26 -Municipality of Bologna | immigrated > %: 0.38 -Municipality of Bologna | delays > %: 1.07 -Municipality of Bologna | other* > %: 0.53 -Emilia Romagna | resident population > N.: 32866 -Emilia Romagna | living abroad > %: 0.25 -Emilia Romagna | no fixed abode > %: 0.05 -Emilia Romagna | refusals objectors > %: 0.60 -Emilia Romagna | immigrated > %: 0.26 -Emilia Romagna | delays > %: 0.58 -Emilia Romagna | other* > %: 0.25 -Municipality of Bologna | resident population > N.: 2706 -Municipality of Bologna | living abroad > %: 0.00 -Municipality of Bologna | no fixed abode > %: 0.04 -Municipality of Bologna | refusals objectors > %: 1.55 -Municipality of Bologna | immigrated > %: 0.59 -Municipality of Bologna | delays > %: 1.07 -Municipality of Bologna | other* > %: 0.74 -Emilia Romagna | resident population > N.: 34950 -Emilia Romagna | living abroad > %: 0.21 -Emilia Romagna | no fixed abode > %: 0.04 -Emilia Romagna | refusals objectors > %: 1.00 -Emilia Romagna | immigrated > %: 0.23 -Emilia Romagna | delays > %: 0.69 -Emilia Romagna | other* > %: 0.38 \ No newline at end of file +1998 > Municipality of Bologna | resident population > N.: 2398 +1998 > Municipality of Bologna | living abroad > %: 0.71 +1998 > Municipality of Bologna | no fixed abode > %: 0.12 +1998 > Municipality of Bologna | refusals objectors > 
%: 1.00 +1998 > Municipality of Bologna | immigrated > %: 0.25 +1998 > Municipality of Bologna | delays > %: 2.04 +1998 > Municipality of Bologna | other* > %: 1.54 +1998 > Emilia Romagna | resident population > N.: 29662 +1998 > Emilia Romagna | living abroad > %: 0.24 +1998 > Emilia Romagna | no fixed abode > %: 0.09 +1998 > Emilia Romagna | refusals objectors > %: 0.29 +1998 > Emilia Romagna | immigrated > %: 0.22 +1998 > Emilia Romagna | delays > %: 0.51 +1998 > Emilia Romagna | other* > %: 0.22 +1999 > Municipality of Bologna | resident population > N.: 2560 +1999 > Municipality of Bologna | living abroad > %: 1.09 +1999 > Municipality of Bologna | no fixed abode > %: 0.43 +1999 > Municipality of Bologna | refusals objectors > %: 0.47 +1999 > Municipality of Bologna | immigrated > %: 0.78 +1999 > Municipality of Bologna | delays > %: 1.05 +1999 > Municipality of Bologna | other* > %: 0.27 +1999 > Emilia Romagna | resident population > N.: 30916 +1999 > Emilia Romagna | living abroad > %: 0.27 +1999 > Emilia Romagna | no fixed abode > %: 0.07 +1999 > Emilia Romagna | refusals objectors > %: 0.35 +1999 > Emilia Romagna | immigrated > %: 0.22 +1999 > Emilia Romagna | delays > %: 0.37 +1999 > Emilia Romagna | other* > %: 0.11 +2000 > Municipality of Bologna | resident population > N.: 2429 +2000 > Municipality of Bologna | living abroad > %: 0,00 +2000 > Municipality of Bologna | no fixed abode > %: 0.08 +2000 > Municipality of Bologna | refusals objectors > %: 0.54 +2000 > Municipality of Bologna | immigrated > %: 1.19 +2000 > Municipality of Bologna | delays > %: 0.86 +2000 > Municipality of Bologna | other* > %: 0.70 +2000 > Emilia Romagna | resident population > N.: 31727 +2000 > Emilia Romagna | living abroad > %: 0.20 +2000 > Emilia Romagna | no fixed abode > %: 0.00 +2000 > Emilia Romagna | refusals objectors > %: 0.20 +2000 > Emilia Romagna | immigrated > %: 0.40 +2000 > Emilia Romagna | delays > %: 0.50 +2000 > Emilia Romagna | other* > %: 0.2 +2001 > 
Municipality of Bologna | resident population > N.: 2619 +2001 > Municipality of Bologna | living abroad > %: 0.04 +2001 > Municipality of Bologna | no fixed abode > %: 0.19 +2001 > Municipality of Bologna | refusals objectors > %: 1.26 +2001 > Municipality of Bologna | immigrated > %: 0.38 +2001 > Municipality of Bologna | delays > %: 1.07 +2001 > Municipality of Bologna | other* > %: 0.53 +2001 > Emilia Romagna | resident population > N.: 32866 +2001 > Emilia Romagna | living abroad > %: 0.25 +2001 > Emilia Romagna | no fixed abode > %: 0.05 +2001 > Emilia Romagna | refusals objectors > %: 0.60 +2001 > Emilia Romagna | immigrated > %: 0.26 +2001 > Emilia Romagna | delays > %: 0.58 +2001 > Emilia Romagna | other* > %: 0.25 +2002 > Municipality of Bologna | resident population > N.: 2706 +2002 > Municipality of Bologna | living abroad > %: 0.00 +2002 > Municipality of Bologna | no fixed abode > %: 0.04 +2002 > Municipality of Bologna | refusals objectors > %: 1.55 +2002 > Municipality of Bologna | immigrated > %: 0.59 +2002 > Municipality of Bologna | delays > %: 1.07 +2002 > Municipality of Bologna | other* > %: 0.74 +2002 > Emilia Romagna | resident population > N.: 34950 +2002 > Emilia Romagna | living abroad > %: 0.21 +2002 > Emilia Romagna | no fixed abode > %: 0.04 +2002 > Emilia Romagna | refusals objectors > %: 1.00 +2002 > Emilia Romagna | immigrated > %: 0.23 +2002 > Emilia Romagna | delays > %: 0.69 +2002 > Emilia Romagna | other* > %: 0.38 \ No newline at end of file diff --git a/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-357665.out.txt b/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-357665.out.txt index 7a6057f..64cdea9 100644 --- a/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-357665.out.txt +++ b/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-357665.out.txt @@ -12,16 +12,16 @@ Quitters† > Women (n = 179) | Annual change in mL per pack-year to 1991: 1.5 Quitters† > Women (n = 179) | Annual change in mL per pack-year to 1991: 
0.3, 2.6 Quitters† > Women (n = 179) | Annual change in mL per pack-year to 1991: 0.01 Quitters† > Women (n = 179) | Annual change in mL per packs per day between 1991 and 2002: -6.4 -Term: Quadratic -Annual change in mL per pack-year to 1991: -0.04 -Annual change in mL per pack-year to 1991: -0.08,-0.01 -Annual change in mL per pack-year to 1991: 0.01 -Annual change in mL per packs per day between 1991 and 2002: na -Term: Cubic -Annual change in mL per pack-year to 1991: 0.0004 -Annual change in mL per pack-year to 1991: 0.0001,0.0006 -Annual change in mL per pack-year to 1991: 0.006 -Annual change in mL per packs per day between 1991 and 2002: na +Quitters† | Term: Quadratic +Quitters† | Annual change in mL per pack-year to 1991: -0.04 +Quitters† | Annual change in mL per pack-year to 1991: -0.08,-0.01 +Quitters† | Annual change in mL per pack-year to 1991: 0.01 +Quitters† | Annual change in mL per packs per day between 1991 and 2002: na +Quitters† | Term: Cubic +Quitters† | Annual change in mL per pack-year to 1991: 0.0004 +Quitters† | Annual change in mL per pack-year to 1991: 0.0001,0.0006 +Quitters† | Annual change in mL per pack-year to 1991: 0.006 +Quitters† | Annual change in mL per packs per day between 1991 and 2002: na Persistent smokers† > Men (n = 595) | Term: Linear Persistent smokers† > Men (n = 595) | Annual change in mL per pack-year to 1991: -0.1 Persistent smokers† > Men (n = 595) | Annual change in mL per pack-year to 1991: -0.3, 0.12 @@ -32,13 +32,13 @@ Persistent smokers† > Women (n = 556) | Annual change in mL per pack-year to 1 Persistent smokers† > Women (n = 556) | Annual change in mL per pack-year to 1991: 0.5, 2.3 Persistent smokers† > Women (n = 556) | Annual change in mL per pack-year to 1991: 0.001 Persistent smokers† > Women (n = 556) | Annual change in mL per packs per day between 1991 and 2002: -11.6 -Term: Quadratic -Annual change in mL per pack-year to 1991: -0.06 -Annual change in mL per pack-year to 1991: -0.08, -0.02 -Annual 
change in mL per pack-year to 1991: <0.001 -Annual change in mL per packs per day between 1991 and 2002: na -Term: Cubic -Annual change in mL per pack-year to 1991: 0.0006 -Annual change in mL per pack-year to 1991: 0.0003, 0.0008 -Annual change in mL per pack-year to 1991: <0.001 -Annual change in mL per packs per day between 1991 and 2002: na \ No newline at end of file +Persistent smokers† | Term: Quadratic +Persistent smokers† | Annual change in mL per pack-year to 1991: -0.06 +Persistent smokers† | Annual change in mL per pack-year to 1991: -0.08, -0.02 +Persistent smokers† | Annual change in mL per pack-year to 1991: <0.001 +Persistent smokers† | Annual change in mL per packs per day between 1991 and 2002: na +Persistent smokers† | Term: Cubic +Persistent smokers† | Annual change in mL per pack-year to 1991: 0.0006 +Persistent smokers† | Annual change in mL per pack-year to 1991: 0.0003, 0.0008 +Persistent smokers† | Annual change in mL per pack-year to 1991: <0.001 +Persistent smokers† | Annual change in mL per packs per day between 1991 and 2002: na \ No newline at end of file diff --git a/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-374857.out.txt b/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-374857.out.txt index e57fefa..3f70fd8 100644 --- a/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-374857.out.txt +++ b/benchmarks/gold/rules/realworld/pubtabnet/pubtabnet-374857.out.txt @@ -1,30 +1,30 @@ -Municipality of Bologna | 12 months > DPT > %: 98.7 -Municipality of Bologna | 12 months > Polio > %: 98.5 -Municipality of Bologna | 12 months > Hepatitis B > %: 97.1 -Municipality of Bologna | 24 months > DPT > %: 98.3 -Municipality of Bologna | 24 months > Polio > %: 97.9 -Municipality of Bologna | 24 months > Hepatitis B > %: 96.5 -Emilia Romagna | 12 months > DPT > %: 98.5 -Emilia Romagna | 12 months > Polio > %: 98.4 -Emilia Romagna | 12 months > Hepatitis B > %: 98.1 -Emilia Romagna | 24 months > DPT > %: 98.5 -Emilia Romagna | 24 months > 
Polio > %: 98.4 -Emilia Romagna | 24 months > Hepatitis B > %: 98.0 -Italy | 24 months > DPT > %: 96.1 -Italy | 24 months > Polio > %: 96.0 -Italy | 24 months > Hepatitis B > %: 94.7 -Municipality of Bologna | 12 months > DPT > %: 96.7 -Municipality of Bologna | 12 months > Polio > %: 96.7 -Municipality of Bologna | 12 months > Hepatitis B > %: 95.9 -Municipality of Bologna | 24 months > DPT > %: 97.5 -Municipality of Bologna | 24 months > Polio > %: 97.4 -Municipality of Bologna | 24 months > Hepatitis B > %: 96.0 -Emilia Romagna | 12 months > DPT > %: 98.2 -Emilia Romagna | 12 months > Polio > %: 98.2 -Emilia Romagna | 12 months > Hepatitis B > %: 97.7 -Emilia Romagna | 24 months > DPT > %: 98.1 -Emilia Romagna | 24 months > Polio > %: 98.0 -Emilia Romagna | 24 months > Hepatitis B > %: 97.5 -Italy* | 24 months > DPT > %: 96.9 -Italy* | 24 months > Polio > %: 96.7 -Italy* | 24 months > Hepatitis B > %: 95.7 \ No newline at end of file +2001 > Municipality of Bologna | 12 months > DPT > %: 98.7 +2001 > Municipality of Bologna | 12 months > Polio > %: 98.5 +2001 > Municipality of Bologna | 12 months > Hepatitis B > %: 97.1 +2001 > Municipality of Bologna | 24 months > DPT > %: 98.3 +2001 > Municipality of Bologna | 24 months > Polio > %: 97.9 +2001 > Municipality of Bologna | 24 months > Hepatitis B > %: 96.5 +2001 > Emilia Romagna | 12 months > DPT > %: 98.5 +2001 > Emilia Romagna | 12 months > Polio > %: 98.4 +2001 > Emilia Romagna | 12 months > Hepatitis B > %: 98.1 +2001 > Emilia Romagna | 24 months > DPT > %: 98.5 +2001 > Emilia Romagna | 24 months > Polio > %: 98.4 +2001 > Emilia Romagna | 24 months > Hepatitis B > %: 98.0 +2001 > Italy | 24 months > DPT > %: 96.1 +2001 > Italy | 24 months > Polio > %: 96.0 +2001 > Italy | 24 months > Hepatitis B > %: 94.7 +2002 > Municipality of Bologna | 12 months > DPT > %: 96.7 +2002 > Municipality of Bologna | 12 months > Polio > %: 96.7 +2002 > Municipality of Bologna | 12 months > Hepatitis B > %: 95.9 +2002 > 
Municipality of Bologna | 24 months > DPT > %: 97.5 +2002 > Municipality of Bologna | 24 months > Polio > %: 97.4 +2002 > Municipality of Bologna | 24 months > Hepatitis B > %: 96.0 +2002 > Emilia Romagna | 12 months > DPT > %: 98.2 +2002 > Emilia Romagna | 12 months > Polio > %: 98.2 +2002 > Emilia Romagna | 12 months > Hepatitis B > %: 97.7 +2002 > Emilia Romagna | 24 months > DPT > %: 98.1 +2002 > Emilia Romagna | 24 months > Polio > %: 98.0 +2002 > Emilia Romagna | 24 months > Hepatitis B > %: 97.5 +2002 > Italy* | 24 months > DPT > %: 96.9 +2002 > Italy* | 24 months > Polio > %: 96.7 +2002 > Italy* | 24 months > Hepatitis B > %: 95.7 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 908a154..062c92d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "table2rules" -version = "0.6.1" +version = "0.6.2" description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding." readme = "README.md" license = "MIT" diff --git a/src/table2rules/maze_pathfinder.py b/src/table2rules/maze_pathfinder.py index 0e974c2..dad9007 100644 --- a/src/table2rules/maze_pathfinder.py +++ b/src/table2rules/maze_pathfinder.py @@ -151,8 +151,18 @@ def find_headers_for_cell( # Bands are ordered topmost-first (origin row ascending) and prepended, so # the row path reads outer-band > inner-group > row-labels, mirroring the # multi-level column path. + # + # A *label-only* band (one carrying an explicit ``rowgroup_extent_end``) + # groups a ROW RANGE, so it must reach every value row in its extent + # regardless of which column its single label cell sits in — e.g. a numbered + # schedule whose group header is in the line-number/stub column while the + # sub-rows leave that column empty and carry their identity in a different + # column. Such bands are therefore scanned across ALL columns. 
Full-width and + # source ``scope="rowgroup"`` bands keep the column-restricted scan (own + # column + row-label columns) so unrelated stub dividers don't cross-attach. + own_cols = {col, *row_header_columns} bands: List[Tuple[int, str]] = [] # (origin_row, text) - for scan_col in [col, *row_header_columns]: + for scan_col in range(len(grid[0])): for r in range(row - 1, -1, -1): cell = grid[r][scan_col] if not cell or not cell.get("text", "").strip(): @@ -167,6 +177,11 @@ def find_headers_for_cell( else: origin = (r, scan_col) origin_cell = cell + # A column-restricted band (no stored extent) is only honored from + # the value's own column or a row-label column; a label-only band + # (stored extent) is honored from any column. + if origin_cell.get("rowgroup_extent_end") is None and scan_col not in own_cols: + continue if origin in seen_origins: continue origin_row, origin_col = origin diff --git a/tests/fixtures/matrix/label-only-rowgroup-stub-column.md b/tests/fixtures/matrix/label-only-rowgroup-stub-column.md new file mode 100644 index 0000000..281b230 --- /dev/null +++ b/tests/fixtures/matrix/label-only-rowgroup-stub-column.md @@ -0,0 +1,22 @@ + + + + + + + + + + + +
SECTION | PLANS
BASIC | ELITE
9. Trip Cancellation
If the trip is cancelled | 5,000 | 10,000
10. Travel Delay
Adult under 70 | 100 | 200
Child | 50 | 100