From 5cb2efea0407e0e3fb192439007e2fb959f66a5c Mon Sep 17 00:00:00 2001
From: maish <maish@pebbleroad.com>
Date: Thu, 11 Jun 2026 11:37:09 +0800
Subject: [PATCH 1/2] Fix reprinted continuation-page headers appended as data
 rows on multi-page merge

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                           | 15 ++++++
 src/table_stitcher/adapters/docling.py | 27 ++++++++++
 tests/test_docling_adapter.py          | 73 ++++++++++++++++++++++++++
 3 files changed, 115 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae74ebd..e84e21b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,21 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html
 
 ## [Unreleased]
 
+### Fixed
+
+- **Reprinted continuation-page headers appended as data rows on multi-page
+  merge** (`adapters/docling.py`). When a table's column header was a
+  multi-row (hierarchical) header reprinted at the top of each page,
+  `_grid_to_dataframe` emitted every grid row — including the rows Docling
+  flagged `column_header=True` — as DataFrame body rows, so each page's
+  reprinted header (and the anchor's own header) survived the merge as bogus
+  data rows, misaligning the table. Leading `column_header=True` grid rows are
+  now excluded from the body; they are still reconstructed as the header block
+  on injection. The fix keys on Docling's structural header flags, so it is
+  immune to per-cell OCR drift (e.g. `(S$)` vs `($$)`) and needs no threshold
+  tuning. Single-row headers and tables without header flags are unaffected.
+  For multi-row headers, a `debug`-level log reports how many rows were excluded.
+
 ## [0.4.2] — 2026-06-08
 
 ### Fixed
diff --git a/src/table_stitcher/adapters/docling.py b/src/table_stitcher/adapters/docling.py
index aa865b2..6ce4a2c 100644
--- a/src/table_stitcher/adapters/docling.py
+++ b/src/table_stitcher/adapters/docling.py
@@ -291,6 +291,20 @@ def _grid_to_dataframe(table: Any, doc: Any) -> pd.DataFrame:
     if not grid:
         return pd.DataFrame()
 
+    # Leading grid rows Docling flagged as column headers. These are
+    # reconstructed separately on injection (_extract_original_header_rows), so
+    # they must never become body rows: otherwise a multi-row header reprinted
+    # at the top of each continuation page is concatenated into the merged body
+    # as bogus data rows. A single-row header is already absorbed as the column
+    # row below; this additionally drops the 2nd..Nth rows of a multi-row
+    # header (and the full header on continuation pages parsed as headerless).
+    n_header_grid = 0
+    for row in grid:
+        if row and any(getattr(c, "column_header", False) for c in row if c):
+            n_header_grid += 1
+        else:
+            break
+
     all_rows = []
     for row in grid:
         row_data = [getattr(cell, "text", str(cell)) if cell else "" for cell in row]
@@ -363,6 +377,19 @@ def _grid_to_dataframe(table: Any, doc: Any) -> pd.DataFrame:
         header = first_row
         data_rows = real_content_rows[1:]
 
+    # The column-name logic above may consume only the first row as the header.
+    # When Docling flagged more leading rows as column headers, drop all of them
+    # from the body so reprinted continuation-page headers don't survive as data.
+    if n_header_grid > 0:
+        if n_header_grid > 1:
+            log.debug(
+                "Excluded %d column-header rows from table body "
+                "(multi-row header; reconstructed separately on injection).",
+                n_header_grid,
+            )
+        data_rows = real_content_rows[n_header_grid:]
+        pre_header_rows = []
+
     clean_header = []
     for h in header:
         h_str = str(h).strip()
diff --git a/tests/test_docling_adapter.py b/tests/test_docling_adapter.py
index cc7182c..47b9443 100644
--- a/tests/test_docling_adapter.py
+++ b/tests/test_docling_adapter.py
@@ -388,6 +388,79 @@ def test_without_member_data_falls_back_to_flat(self):
         assert all(cell.col_span == 1 for cell in td.grid[1])
 
 
+class TestMultiRowHeaderExcludedFromBody:
+    """Grid rows flagged ``column_header=True`` must not become body rows.
+
+    A multi-row header reprinted at the top of every continuation page would
+    otherwise be concatenated into the merged table as bogus data rows
+    (the header is reconstructed separately on injection).
+    """
+
+    @staticmethod
+    def _cell(text, r, c, *, col_span=1, header):
+        return TableCell(
+            text=text,
+            row_span=1,
+            col_span=col_span,
+            column_header=header,
+            row_header=False,
+            start_row_offset_idx=r,
+            end_row_offset_idx=r + 1,
+            start_col_offset_idx=c,
+            end_col_offset_idx=c + col_span,
+        )
+
+    def _table(self) -> TableData:
+        c = self._cell
+        # 2-row header: [SECTION | AMOUNT(col_span=2)] then [SECTION | BASIC | PREMIUM]
+        amount = c("AMOUNT", 0, 1, col_span=2, header=True)
+        h0 = [c("SECTION", 0, 0, header=True), amount, amount]  # span repeated in grid
+        h1 = [
+            c("SECTION", 1, 0, header=True),
+            c("BASIC", 1, 1, header=True),
+            c("PREMIUM", 1, 2, header=True),
+        ]
+        d0 = [
+            c("Death", 2, 0, header=False),
+            c("100", 2, 1, header=False),
+            c("200", 2, 2, header=False),
+        ]
+        d1 = [
+            c("Injury", 3, 0, header=False),
+            c("50", 3, 1, header=False),
+            c("75", 3, 2, header=False),
+        ]
+        grid = [h0, h1, d0, d1]
+        flat = [h0[0], amount, *h1, *d0, *d1]
+        return TableData(num_rows=4, num_cols=3, table_cells=flat, grid=grid)
+
+    def test_all_header_rows_excluded_from_body(self):
+        table = SimpleNamespace(data=self._table())
+        df = _grid_to_dataframe(table, doc=None)
+
+        # Only the two data rows survive; both header rows are gone.
+        assert df.shape[0] == 2
+        body_text = " ".join(str(v) for v in df.to_numpy().ravel())
+        for header_token in ("AMOUNT", "BASIC", "PREMIUM"):
+            assert header_token not in body_text
+        assert "Death" in body_text and "Injury" in body_text
+
+    def test_single_row_header_unaffected(self):
+        """A single-row header is absorbed as the column row exactly as before."""
+        c = self._cell
+        grid = [
+            [c("Name", 0, 0, header=True), c("Score", 0, 1, header=True)],
+            [c("Alice", 1, 0, header=False), c("10", 1, 1, header=False)],
+        ]
+        flat = [grid[0][0], grid[0][1], grid[1][0], grid[1][1]]
+        table = SimpleNamespace(data=TableData(num_rows=2, num_cols=2, table_cells=flat, grid=grid))
+        df = _grid_to_dataframe(table, doc=None)
+
+        assert list(df.columns) == ["Name", "Score"]
+        assert df.shape[0] == 1
+        assert df.iloc[0].tolist() == ["Alice", "10"]
+
+
 class TestAdapterProtocol:
     """Verify DoclingAdapter satisfies the protocol."""
 

From f625fe99eb2a2f735b0b2364fbe291ba25b657ca Mon Sep 17 00:00:00 2001
From: maish <maish@pebbleroad.com>
Date: Thu, 11 Jun 2026 11:37:25 +0800
Subject: [PATCH 2/2] chore: release 0.4.3

---
 CHANGELOG.md   | 2 ++
 pyproject.toml | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e84e21b..5eb9bb8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html
 
 ## [Unreleased]
 
+## [0.4.3] — 2026-06-11
+
 ### Fixed
 
 - **Reprinted continuation-page headers appended as data rows on multi-page
diff --git a/pyproject.toml b/pyproject.toml
index 29bbd94..058603d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "table-stitcher"
-version = "0.4.2"
+version = "0.4.3"
 description = "Reassemble tables split across page boundaries in PDF extraction"
 readme = "README.md"
 license = "MIT"