Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,23 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html

## [Unreleased]

## [0.4.3] — 2026-06-08

### Fixed

- **Reprinted continuation-page headers appended as data rows on multi-page
merge** (`adapters/docling.py`). When a table's column header was a
multi-row (hierarchical) header reprinted at the top of each page,
`_grid_to_dataframe` emitted every grid row — including the rows Docling
flagged `column_header=True` — as DataFrame body rows, so each page's
reprinted header (and the anchor's own header) survived the merge as bogus
data rows, misaligning the table. Leading `column_header=True` grid rows are
now excluded from the body; they are still reconstructed as the header block
on injection. The fix keys on Docling's structural header flags, so it is
immune to per-cell OCR drift (e.g. `(S$)` vs `($$)`) and needs no threshold
tuning. Single-row headers and tables without header flags are unaffected.
A `debug`-level log reports how many header rows were excluded whenever the
header spans more than one row.

## [0.4.2] — 2026-06-08

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "table-stitcher"
version = "0.4.2"
version = "0.4.3"
description = "Reassemble tables split across page boundaries in PDF extraction"
readme = "README.md"
license = "MIT"
Expand Down
27 changes: 27 additions & 0 deletions src/table_stitcher/adapters/docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,20 @@ def _grid_to_dataframe(table: Any, doc: Any) -> pd.DataFrame:
if not grid:
return pd.DataFrame()

# Leading grid rows Docling flagged as column headers. These are
# reconstructed separately on injection (_extract_original_header_rows), so
# they must never become body rows: otherwise a multi-row header reprinted
# at the top of each continuation page is concatenated into the merged body
# as bogus data rows. A single-row header is already absorbed as the column
# row below; this additionally drops the 2nd..Nth rows of a multi-row
# header (and the full header on continuation pages parsed as headerless).
n_header_grid = 0
for row in grid:
if row and any(getattr(c, "column_header", False) for c in row if c):
n_header_grid += 1
else:
break

all_rows = []
for row in grid:
row_data = [getattr(cell, "text", str(cell)) if cell else "" for cell in row]
Expand Down Expand Up @@ -363,6 +377,19 @@ def _grid_to_dataframe(table: Any, doc: Any) -> pd.DataFrame:
header = first_row
data_rows = real_content_rows[1:]

# The column-name logic above may consume only the first row as the header.
# When Docling flagged more leading rows as column headers, drop all of them
# from the body so reprinted continuation-page headers don't survive as data.
if n_header_grid > 0:
if n_header_grid > 1:
log.debug(
"Excluded %d column-header rows from table body "
"(multi-row header; reconstructed separately on injection).",
n_header_grid,
)
data_rows = real_content_rows[n_header_grid:]
pre_header_rows = []

clean_header = []
for h in header:
h_str = str(h).strip()
Expand Down
73 changes: 73 additions & 0 deletions tests/test_docling_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,79 @@ def test_without_member_data_falls_back_to_flat(self):
assert all(cell.col_span == 1 for cell in td.grid[1])


class TestMultiRowHeaderExcludedFromBody:
    """Rows whose cells carry ``column_header=True`` must stay out of the body.

    If such rows were kept as data, a multi-row header reprinted on each
    continuation page would end up concatenated into the merged table as
    spurious rows (the header block is rebuilt separately on injection).
    """

    @staticmethod
    def _cell(text, r, c, *, col_span=1, header):
        """Build a one-row-high ``TableCell`` at grid row ``r``, column ``c``.

        ``col_span`` sets the horizontal extent and ``header`` is stored as the
        cell's ``column_header`` flag; ``row_header`` is always ``False``.
        """
        offsets = {
            "start_row_offset_idx": r,
            "end_row_offset_idx": r + 1,
            "start_col_offset_idx": c,
            "end_col_offset_idx": c + col_span,
        }
        return TableCell(
            text=text,
            row_span=1,
            col_span=col_span,
            column_header=header,
            row_header=False,
            **offsets,
        )

    def _table(self) -> TableData:
        """Return a 4x3 table: two flagged header rows followed by two data rows."""
        make = self._cell

        # Header row 0: SECTION | AMOUNT (spanning two columns).
        # The spanning cell object is placed in the grid once per covered column.
        spanning = make("AMOUNT", 0, 1, col_span=2, header=True)
        top_header = [make("SECTION", 0, 0, header=True), spanning, spanning]

        # Header row 1: SECTION | BASIC | PREMIUM.
        sub_header = [
            make(label, 1, col, header=True)
            for col, label in enumerate(("SECTION", "BASIC", "PREMIUM"))
        ]

        # Data rows start at grid row 2.
        body_values = (("Death", "100", "200"), ("Injury", "50", "75"))
        body = [
            [make(value, row, col, header=False) for col, value in enumerate(values)]
            for row, values in enumerate(body_values, start=2)
        ]

        grid = [top_header, sub_header, *body]
        # The flat cell list holds the spanning cell only once.
        flat = [top_header[0], spanning, *sub_header]
        for data_row in body:
            flat.extend(data_row)
        return TableData(num_rows=4, num_cols=3, table_cells=flat, grid=grid)

    def test_all_header_rows_excluded_from_body(self):
        """Both header rows are dropped; only the two data rows remain."""
        wrapper = SimpleNamespace(data=self._table())
        df = _grid_to_dataframe(wrapper, doc=None)

        # Only the two data rows survive; both header rows are gone.
        assert df.shape[0] == 2
        body_text = " ".join(map(str, df.to_numpy().ravel()))
        for header_token in ("AMOUNT", "BASIC", "PREMIUM"):
            assert header_token not in body_text
        assert "Death" in body_text
        assert "Injury" in body_text

    def test_single_row_header_unaffected(self):
        """A single-row header is absorbed as the column row exactly as before."""
        make = self._cell
        header_row = [
            make("Name", 0, 0, header=True),
            make("Score", 0, 1, header=True),
        ]
        data_row = [
            make("Alice", 1, 0, header=False),
            make("10", 1, 1, header=False),
        ]
        table_data = TableData(
            num_rows=2,
            num_cols=2,
            table_cells=[*header_row, *data_row],
            grid=[header_row, data_row],
        )
        df = _grid_to_dataframe(SimpleNamespace(data=table_data), doc=None)

        assert list(df.columns) == ["Name", "Score"]
        assert df.shape[0] == 1
        assert df.iloc[0].tolist() == ["Alice", "10"]


class TestAdapterProtocol:
"""Verify DoclingAdapter satisfies the protocol."""

Expand Down
Loading