From 5cb2efea0407e0e3fb192439007e2fb959f66a5c Mon Sep 17 00:00:00 2001 From: maish Date: Thu, 11 Jun 2026 11:37:09 +0800 Subject: [PATCH 1/2] Fix reprinted continuation-page headers appended as data rows on multi-page merge Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 15 ++++++ src/table_stitcher/adapters/docling.py | 27 ++++++++++ tests/test_docling_adapter.py | 73 ++++++++++++++++++++++++++ 3 files changed, 115 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae74ebd..e84e21b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,21 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html ## [Unreleased] +### Fixed + +- **Reprinted continuation-page headers appended as data rows on multi-page + merge** (`adapters/docling.py`). When a table's column header was a + multi-row (hierarchical) header reprinted at the top of each page, + `_grid_to_dataframe` emitted every grid row — including the rows Docling + flagged `column_header=True` — as DataFrame body rows, so each page's + reprinted header (and the anchor's own header) survived the merge as bogus + data rows, misaligning the table. Leading `column_header=True` grid rows are + now excluded from the body; they are still reconstructed as the header block + on injection. The fix keys on Docling's structural header flags, so it is + immune to per-cell OCR drift (e.g. `(S$)` vs `($$)`) and needs no threshold + tuning. Single-row headers and tables without header flags are unaffected. + A `debug`-level log reports how many header rows were excluded. 
+ ## [0.4.2] — 2026-06-08 ### Fixed diff --git a/src/table_stitcher/adapters/docling.py b/src/table_stitcher/adapters/docling.py index aa865b2..6ce4a2c 100644 --- a/src/table_stitcher/adapters/docling.py +++ b/src/table_stitcher/adapters/docling.py @@ -291,6 +291,20 @@ def _grid_to_dataframe(table: Any, doc: Any) -> pd.DataFrame: if not grid: return pd.DataFrame() + # Leading grid rows Docling flagged as column headers. These are + # reconstructed separately on injection (_extract_original_header_rows), so + # they must never become body rows: otherwise a multi-row header reprinted + # at the top of each continuation page is concatenated into the merged body + # as bogus data rows. A single-row header is already absorbed as the column + # row below; this additionally drops the 2nd..Nth rows of a multi-row + # header (and the full header on continuation pages parsed as headerless). + n_header_grid = 0 + for row in grid: + if row and any(getattr(c, "column_header", False) for c in row if c): + n_header_grid += 1 + else: + break + all_rows = [] for row in grid: row_data = [getattr(cell, "text", str(cell)) if cell else "" for cell in row] @@ -363,6 +377,19 @@ def _grid_to_dataframe(table: Any, doc: Any) -> pd.DataFrame: header = first_row data_rows = real_content_rows[1:] + # The column-name logic above may consume only the first row as the header. + # When Docling flagged more leading rows as column headers, drop all of them + # from the body so reprinted continuation-page headers don't survive as data. 
+ if n_header_grid > 0: + if n_header_grid > 1: + log.debug( + "Excluded %d column-header rows from table body " + "(multi-row header; reconstructed separately on injection).", + n_header_grid, + ) + data_rows = real_content_rows[n_header_grid:] + pre_header_rows = [] + clean_header = [] for h in header: h_str = str(h).strip() diff --git a/tests/test_docling_adapter.py b/tests/test_docling_adapter.py index cc7182c..47b9443 100644 --- a/tests/test_docling_adapter.py +++ b/tests/test_docling_adapter.py @@ -388,6 +388,79 @@ def test_without_member_data_falls_back_to_flat(self): assert all(cell.col_span == 1 for cell in td.grid[1]) +class TestMultiRowHeaderExcludedFromBody: + """Grid rows flagged ``column_header=True`` must not become body rows. + + A multi-row header reprinted at the top of every continuation page would + otherwise be concatenated into the merged table as bogus data rows + (the header is reconstructed separately on injection). + """ + + @staticmethod + def _cell(text, r, c, *, col_span=1, header): + return TableCell( + text=text, + row_span=1, + col_span=col_span, + column_header=header, + row_header=False, + start_row_offset_idx=r, + end_row_offset_idx=r + 1, + start_col_offset_idx=c, + end_col_offset_idx=c + col_span, + ) + + def _table(self) -> TableData: + c = self._cell + # 2-row header: [SECTION | AMOUNT(col_span=2)] then [SECTION | BASIC | PREMIUM] + amount = c("AMOUNT", 0, 1, col_span=2, header=True) + h0 = [c("SECTION", 0, 0, header=True), amount, amount] # span repeated in grid + h1 = [ + c("SECTION", 1, 0, header=True), + c("BASIC", 1, 1, header=True), + c("PREMIUM", 1, 2, header=True), + ] + d0 = [ + c("Death", 2, 0, header=False), + c("100", 2, 1, header=False), + c("200", 2, 2, header=False), + ] + d1 = [ + c("Injury", 3, 0, header=False), + c("50", 3, 1, header=False), + c("75", 3, 2, header=False), + ] + grid = [h0, h1, d0, d1] + flat = [h0[0], amount, *h1, *d0, *d1] + return TableData(num_rows=4, num_cols=3, table_cells=flat, 
grid=grid) + + def test_all_header_rows_excluded_from_body(self): + table = SimpleNamespace(data=self._table()) + df = _grid_to_dataframe(table, doc=None) + + # Only the two data rows survive; both header rows are gone. + assert df.shape[0] == 2 + body_text = " ".join(str(v) for v in df.to_numpy().ravel()) + for header_token in ("AMOUNT", "BASIC", "PREMIUM"): + assert header_token not in body_text + assert "Death" in body_text and "Injury" in body_text + + def test_single_row_header_unaffected(self): + """A single-row header is absorbed as the column row exactly as before.""" + c = self._cell + grid = [ + [c("Name", 0, 0, header=True), c("Score", 0, 1, header=True)], + [c("Alice", 1, 0, header=False), c("10", 1, 1, header=False)], + ] + flat = [grid[0][0], grid[0][1], grid[1][0], grid[1][1]] + table = SimpleNamespace(data=TableData(num_rows=2, num_cols=2, table_cells=flat, grid=grid)) + df = _grid_to_dataframe(table, doc=None) + + assert list(df.columns) == ["Name", "Score"] + assert df.shape[0] == 1 + assert df.iloc[0].tolist() == ["Alice", "10"] + + class TestAdapterProtocol: """Verify DoclingAdapter satisfies the protocol.""" From f625fe99eb2a2f735b0b2364fbe291ba25b657ca Mon Sep 17 00:00:00 2001 From: maish Date: Thu, 11 Jun 2026 11:37:25 +0800 Subject: [PATCH 2/2] chore: release 0.4.3 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e84e21b..5eb9bb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html ## [Unreleased] +## [0.4.3] — 2026-06-11 + ### Fixed - **Reprinted continuation-page headers appended as data rows on multi-page diff --git a/pyproject.toml b/pyproject.toml index 29bbd94..058603d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "table-stitcher" -version = "0.4.2" +version = "0.4.3" description = 
"Reassemble tables split across page boundaries in PDF extraction" readme = "README.md" license = "MIT"