Skip to content

Commit d1e25e1

Browse files
feat(dedup): cross-section duplicate table removal
Two signals, in priority order: 1. Declared ownership — results.json tables may set owning_section (schema already supported it). Per-section prompt context now renders that table's full pipe layout ONLY when writing the owning section; other sections see a REFERENCE-ONLY stub that explicitly tells the LLM "do NOT re-emit \begin{table}". 2. Structural fingerprint — (normalized caption, column count, row count). First occurrence wins; later duplicates are replaced with a `% (duplicate table removed)` comment that keeps any surrounding \ref{tab:...} resolvable. The pass is a dict-level operation (section_key → LaTeX), so it lives in sanitize/tables.py outside the per-section pipeline. Invoked from write_paper after _gen_all_sections and before citation/label fixing — otherwise label recovery would try to heal a block we're about to drop. Progress events fire per-removed-block (warning kind under the section stage) with the dedup reason, so agents using --progress jsonl can surface dedup activity. Also: Table.from_dict now carries owning_section; to_prompt_context accepts a section_key filter; _context/_gen_section switched from a single string context to a context_fn builder so each section gets a section-specific prompt. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ed3517e commit d1e25e1

3 files changed

Lines changed: 226 additions & 16 deletions

File tree

skills/hermes-sci/package/hermes_sci/results.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class Table:
7272
caption: str
7373
headers: list[str]
7474
rows: list[list[str]]
75+
owning_section: str = ""
7576

7677
@classmethod
7778
def from_dict(cls, d: dict) -> "Table":
@@ -80,6 +81,7 @@ def from_dict(cls, d: dict) -> "Table":
8081
caption=str(d.get("caption", "")).strip(),
8182
headers=[str(h) for h in (d.get("headers") or [])],
8283
rows=[[str(c) for c in row] for row in (d.get("rows") or [])],
84+
owning_section=str(d.get("owning_section", "")).strip(),
8385
)
8486

8587

@@ -114,9 +116,14 @@ def all_numeric_values(self) -> list[float]:
114116
out.extend(_scan_numbers(self.raw_log))
115117
return out
116118

117-
def to_prompt_context(self) -> str:
118-
"""Render as compact text for LLM prompt. Keeps specific numbers
119-
verbatim so the writer can drop them into prose."""
119+
def to_prompt_context(self, section_key: Optional[str] = None) -> str:
120+
"""Render as compact text for LLM prompt.
121+
122+
When `section_key` is set, tables with an `owning_section` that does
123+
NOT match are rendered as reference-only stubs (label + caption)
124+
instead of full pipe tables. This stops the LLM from happily
125+
re-emitting the same `\\begin{table}` in every section.
126+
"""
120127
lines = ["EXPERIMENT RESULTS (author-supplied — use these exact numbers):", ""]
121128

122129
# Verbatim string block — names/versions/hardware to be copied EXACTLY,
@@ -148,14 +155,32 @@ def to_prompt_context(self) -> str:
148155
lines.append(f" - {meth}{m.name} = {m.value}{m.unit}{sp}{ctx}")
149156
lines.append("")
150157
if self.tables:
158+
def _owns(t: Table) -> bool:
159+
if not section_key:
160+
return True
161+
if not t.owning_section:
162+
return True # unassigned tables visible everywhere
163+
return t.owning_section == section_key
164+
165+
own, other = [], []
166+
for t in self.tables:
167+
(own if _owns(t) else other).append(t)
168+
151169
lines.append("Tables:")
152170
lines.append(" LaTeX labels to use: " +
153171
", ".join(f"tab:{t.id}" for t in self.tables))
154172
lines.append(" When you render a \\begin{table}, add "
155173
"\\label{tab:<id>} matching the ID below, and "
156174
"cross-reference with Table~\\ref{tab:<id>}. "
157175
"Never write 'Table ??' or a bare \\ref{}.")
158-
for t in self.tables:
176+
if section_key and other:
177+
refs = ", ".join(f"tab:{t.id} (owned by {t.owning_section})"
178+
for t in other)
179+
lines.append(f" REFERENCE-ONLY in this section "
180+
f"(already rendered elsewhere — use "
181+
f"Table~\\ref{{tab:<id>}} only, do NOT "
182+
f"re-emit \\begin{{table}}): {refs}")
183+
for t in own:
159184
lines.append("")
160185
lines.append(f" [\\label{{tab:{t.id}}}] {t.caption}")
161186
lines.append(f" | {' | '.join(t.headers)} |")
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
"""Cross-section table de-duplication.
2+
3+
The LLM sometimes re-emits the same `\\begin{table} ... \\end{table}` in
4+
multiple sections (e.g. a complexity-buckets table in both Experiments and
5+
Results) — pdflatex happily compiles two tables with the same \\label, which
6+
then poisons every `\\ref{tab:...}` in the paper.
7+
8+
Dedup operates on the *dict* of section-name → LaTeX body (not a single
9+
string), so it can choose which section keeps the full table and which ones
10+
are demoted to a bare reference comment.
11+
12+
Strategy — two complementary signals:
13+
14+
1. **Declared ownership** (preferred): when a table has `owning_section`
15+
set in results.json, its `\\begin{table} ... \\label{tab:<id>}` block
16+
is kept ONLY in that section; occurrences elsewhere are replaced by
17+
a short LaTeX comment naming the label; `\\ref` keeps resolving through the copy retained in the owning section.
18+
19+
2. **Structural fingerprint** (fallback for tables without owning_section
20+
or without `\\label`): block hash = (normalized caption, column count,
21+
row count). First occurrence wins; later duplicates are replaced by
22+
a comment that points back to the first one.
23+
24+
This pass is a *cross-section* operation, hence it lives outside the
25+
per-section SANITIZE_PIPELINE and is invoked explicitly from writeup.py
26+
after `_gen_all_sections`.
27+
"""
28+
from __future__ import annotations
29+
30+
import logging
31+
import re
32+
from typing import Optional
33+
34+
log = logging.getLogger("hermes_sci.sanitize.tables")
35+
36+
# One complete table / table* float. DOTALL lets the lazy body span lines, and
# the lazy quantifier stops at the first \end{table}, so an inner tabular
# environment stays inside the captured body.
_TABLE_ENV = re.compile(
    r"\\begin\{table\*?\}(?P<body>.*?)\\end\{table\*?\}",
    flags=re.DOTALL,
)

# Caption argument, captured lazily up to the first closing brace.
_CAPTION = re.compile(r"\\caption\{(?P<cap>.*?)\}", flags=re.DOTALL)

# Only labels in the `tab:` namespace are recognised.
_LABEL = re.compile(r"\\label\{(?P<id>tab:[A-Za-z0-9_:\-]+)\}")

# Column spec of a tabular / tabular*, after an optional [pos] argument.
_COLS = re.compile(r"\\begin\{tabular\*?\}(?:\[[^\]]*\])?\{(?P<spec>[^}]*)\}")

# A LaTeX row terminator (two consecutive backslashes).
_ROW_END = re.compile(r"\\\\")
45+
46+
47+
def _norm_caption(s: str) -> str:
    """Reduce a raw caption to a comparison key.

    Control sequences, braces and tildes become spaces, whitespace runs are
    collapsed to one space, and the result is trimmed and lower-cased.
    """
    without_commands = re.sub(r"\\[a-zA-Z]+\*?", " ", s)
    without_markup = re.sub(r"[{}~]", " ", without_commands)
    collapsed = re.sub(r"\s+", " ", without_markup)
    return collapsed.strip().lower()
52+
53+
54+
def _col_count(tabular_spec: str) -> int:
    """Count columns in a tabular spec like `|l|c|c|r|` → 4.

    Only the column-type letters `l c r p X m b j` count. Text inside brace
    arguments is ignored, so `p{3cm}` is one column (the `c`/`m` of the width
    are not columns) and `@{}` / `>{...}` / `<{...}` decorations add nothing.
    A flat `*{n}{cols}` repetition contributes `n` times the columns in `cols`.
    A spec truncated inside a brace group (e.g. `lp{3cm`) is counted up to the
    dangling `{`.

    Args:
        tabular_spec: the column specification; `None` or "" yields 0.

    Returns:
        The number of columns found.
    """
    def _letters(fragment: str) -> int:
        return len(re.sub(r"[^lcrpXmbj]", "", fragment))

    spec = tabular_spec or ""

    # Drop the argument of p/m/b columns (keeping the letter) and drop
    # @{..} !{..} >{..} <{..} decorations entirely.
    spec = re.sub(
        r"([pmb@!<>])\s*\{[^{}]*\}",
        lambda m: m.group(1) if m.group(1) in "pmb" else "",
        spec,
    )

    # Account for *{n}{cols} arithmetically rather than by string expansion,
    # so a large n cannot blow up memory. \d{1,4} bounds the multiplier.
    repeated = 0

    def _take_repeat(m: re.Match) -> str:
        nonlocal repeated
        repeated += int(m.group(1)) * _letters(m.group(2))
        return ""

    spec = re.sub(r"\*\s*\{(\d{1,4})\}\s*\{([^{}]*)\}", _take_repeat, spec)

    # Whatever brace groups remain (nested leftovers) carry no column letters.
    while True:
        stripped = re.sub(r"\{[^{}]*\}", "", spec)
        if stripped == spec:
            break
        spec = stripped
    # A dangling "{..." comes from a spec that was cut at its first "}".
    spec = re.sub(r"\{[^{}]*$", "", spec)

    return repeated + _letters(spec)
58+
59+
60+
def _fingerprint(table_src: str) -> tuple[str, int, int]:
    """Structural fingerprint of a table body.

    Returns `(normalized caption, column count, row count)`:

    * caption — the full, brace-balanced argument of the first `\\caption`
      (an optional `[short]` argument is skipped), passed through
      `_norm_caption`. Reading the whole argument matters: cutting at the
      first `}` would make `\\caption{\\textbf{Acc} on A}` and
      `\\caption{\\textbf{Acc} on B}` look identical. "" when no caption.
    * column count — `_col_count` of the first tabular spec, 0 when absent.
    * row count — number of `\\\\` row terminators anywhere in `table_src`.
    """
    cap_raw = _caption_arg(table_src)
    if cap_raw is None:
        # No caption, or its braces never balance: use the simple pattern.
        cap_m = _CAPTION.search(table_src)
        cap_raw = cap_m.group("cap") if cap_m else ""
    cap = _norm_caption(cap_raw)

    cols_m = _COLS.search(table_src)
    cols = _col_count(cols_m.group("spec")) if cols_m else 0

    rows = len(_ROW_END.findall(table_src))
    return (cap, cols, rows)


def _caption_arg(table_src: str) -> Optional[str]:
    """Return the brace-balanced argument of the first `\\caption`, or None.

    None means there is no `\\caption{` or its closing brace is missing.
    """
    opener = re.search(r"\\caption(?:\[[^\]]*\])?\{", table_src)
    if opener is None:
        return None
    start = pos = opener.end()
    depth = 1
    end = len(table_src)
    while pos < end:
        ch = table_src[pos]
        if ch == "\\":
            pos += 2  # skip the escaped character, e.g. \{ or \}
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return table_src[start:pos]
        pos += 1
    return None
67+
68+
69+
def _label_id(table_src: str) -> Optional[str]:
    """Return the first `tab:` label found in `table_src`, or None if there is none."""
    found = _LABEL.search(table_src)
    if found is None:
        return None
    return found.group("id")
72+
73+
74+
def dedup_tables(
    sections: dict[str, str],
    *,
    table_ownership: Optional[dict[str, str]] = None,
) -> tuple[dict[str, str], list[dict]]:
    """Remove duplicate `\\begin{table}` blocks across sections.

    Rules, applied to every table block in section order:

    0. The first copy of an owned table inside its owning section is always
       kept, even if an earlier section holds a look-alike.
    1. Declared ownership — a block whose label is owned by another section
       is demoted, but only when that owning section really contains a copy.
       If the owner never rendered the table (or is absent from `sections`),
       the block is handled by rules 2-3 instead, so the table is not lost.
    2. Duplicate label — a label already kept elsewhere is demoted.
    3. Structural fingerprint — a block whose non-empty fingerprint matches
       a kept block is demoted.

    A demoted block is replaced by a one-line LaTeX comment.

    Args:
        sections: section_key → LaTeX body.
        table_ownership: map of "tab:<id>" → owning_section. A block with a
            label in this map is kept ONLY in its owning section. Pass {}
            to rely solely on structural fingerprinting.

    Returns:
        (new_sections, events) — `events` is a list of dicts describing each
        removed block (useful for verification_report.json and progress
        callbacks). Each event carries `reason` ("owning_section",
        "duplicate_label" or "fingerprint") and `found_in`, plus
        reason-specific keys.
    """
    table_ownership = table_ownership or {}
    empty_fp = ("", 0, 0)
    # fingerprint → (section that kept the block, that block's label or "")
    seen_fp: dict[tuple[str, int, int], tuple[str, str]] = {}
    # label → section that kept it
    label_seen: dict[str, str] = {}
    # label → (owning section, offset of the kept block in that section)
    owner_copy: dict[str, tuple[str, int]] = {}
    events: list[dict] = []

    # Pre-pass: find the copy each owning section actually contains and
    # register it as "kept" up front, so section order cannot let another
    # section's copy win over the declared owner.
    for sec_key, body in sections.items():
        for m in _TABLE_ENV.finditer(body):
            inner = m.group("body")
            label = _label_id(inner)
            if not label or label in owner_copy:
                continue
            owner = table_ownership.get(label)
            if not owner or owner != sec_key:
                continue
            owner_copy[label] = (sec_key, m.start())
            label_seen[label] = sec_key
            fp = _fingerprint(inner)
            if fp != empty_fp:
                seen_fp.setdefault(fp, (sec_key, label))

    def _demote(label: Optional[str], kept_in: Optional[str]) -> str:
        """Replacement text for a removed table block."""
        if label and kept_in:
            return (f"% (duplicate table \\ref{{{label}}} removed — "
                    f"rendered in section {kept_in!r})\n")
        if label:
            return f"% (duplicate table \\ref{{{label}}} removed)\n"
        return "% (duplicate table removed)\n"

    def _dedupe_block(sec_key: str, m: re.Match) -> str:
        """Return the text that replaces one table block found in `sec_key`."""
        block = m.group(0)
        inner = m.group("body")
        label = _label_id(inner)
        # 0. The owner's registered copy is kept unconditionally. Match
        #    offsets in re.sub refer to the original string, so they line up
        #    with the pre-pass.
        if label and owner_copy.get(label) == (sec_key, m.start()):
            return block
        # 1. Declared ownership — the owner holds a copy, we are elsewhere.
        if label and label in owner_copy:
            owner = owner_copy[label][0]
            if owner != sec_key:
                events.append({
                    "reason": "owning_section",
                    "label": label,
                    "found_in": sec_key,
                    "owner": owner,
                })
                return _demote(label, owner)
        # 2. Label already kept somewhere → demote.
        if label and label in label_seen:
            kept_in = label_seen[label]
            events.append({
                "reason": "duplicate_label",
                "label": label,
                "found_in": sec_key,
                "kept_in": kept_in,
            })
            return _demote(label, kept_in)
        # 3. Structural fingerprint match → demote.
        fp = _fingerprint(inner)
        if fp != empty_fp and fp in seen_fp:
            kept_in, kept_label = seen_fp[fp]
            events.append({
                "reason": "fingerprint",
                "fingerprint": list(fp),
                "found_in": sec_key,
                "kept_in": kept_in,
                "kept_label": kept_label,
            })
            return _demote(label or kept_label, kept_in)
        # Keep it — record so later copies are recognised.
        if label:
            label_seen[label] = sec_key
        if fp != empty_fp:
            seen_fp[fp] = (sec_key, label or "")
        return block

    out: dict[str, str] = {}
    for sec_key, body in sections.items():
        # Bind the key as a default so the callback never depends on the
        # loop variable's later value.
        out[sec_key] = _TABLE_ENV.sub(
            lambda m, key=sec_key: _dedupe_block(key, m), body,
        )

    if events:
        log.info("dedup_tables: removed %d duplicate block(s) across %d sections",
                 len(events), len(sections))
    return out, events

skills/hermes-sci/package/hermes_sci/writeup.py

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import shutil
2323
import subprocess
2424
import time
25-
from typing import Optional
25+
from typing import Callable, Optional
2626

2727
from jinja2 import Environment, FileSystemLoader
2828

@@ -32,6 +32,7 @@
3232
from .progress import Progress, ProgressCallback, emit, noop as _noop_progress
3333
from .results import Results, from_dict as results_from_dict, load as results_load
3434
from .sanitize import sanitize_latex as _sanitize_latex
35+
from .sanitize.tables import dedup_tables as _dedup_tables
3536
from .verify import audit as verify_audit, annotate_unverified
3637

3738
log = logging.getLogger("hermes_sci.writeup")
@@ -178,13 +179,19 @@ class Paper:
178179
sections: dict[str, str]
179180

180181

181-
def _context(idea: dict, results_blob, hw_hint: str, bib_keys: set[str]) -> str:
182-
"""Assemble the shared prompt context.
182+
def _context(idea: dict, results_blob, hw_hint: str, bib_keys: set[str],
183+
section_key: Optional[str] = None) -> str:
184+
"""Assemble the prompt context.
183185
184186
`results_blob` may be:
185187
- None (no results; Phase 1 placeholder-style output)
186188
- str (free-form markdown; Phase 1 compat)
187189
- Results dataclass (Phase 3 structured — rendered with exact numbers)
190+
191+
`section_key` filters tables: each table with an `owning_section` is only
192+
fully rendered when writing that section; otherwise it's listed as a
193+
reference-only label. Prevents duplicate `\\begin{table}` blocks across
194+
sections (the LLM happily re-emits tables it sees in its prompt).
188195
"""
189196
parts = [
190197
"IDEA METADATA",
@@ -200,7 +207,7 @@ def _context(idea: dict, results_blob, hw_hint: str, bib_keys: set[str]) -> str:
200207
f"ALLOWED BIB KEYS (use only these): {sorted(bib_keys) or '(none)'}",
201208
]
202209
if isinstance(results_blob, Results):
203-
parts += ["", results_blob.to_prompt_context(),
210+
parts += ["", results_blob.to_prompt_context(section_key=section_key),
204211
"",
205212
"STRICT: Only cite numbers that appear in the metrics or "
206213
"tables above. Do NOT invent percentages, BLEU scores, "
@@ -215,11 +222,12 @@ def _context(idea: dict, results_blob, hw_hint: str, bib_keys: set[str]) -> str:
215222

216223

217224
async def _gen_section(
218-
cfg: BackendConfig, key: str, context: str, model: Optional[str],
219-
critique: bool,
225+
cfg: BackendConfig, key: str, context_fn: Callable[[str], str],
226+
model: Optional[str], critique: bool,
220227
) -> tuple[str, str]:
221228
"""Generate (optionally self-critiqued) LaTeX for one section."""
222229
instr = SECTION_PROMPTS[key]
230+
context = context_fn(key)
223231
user = (
224232
f"{context}\n\nTASK: {instr}\n\n"
225233
f"Return ONLY the LaTeX body (no \\section header)."
@@ -250,7 +258,7 @@ async def _gen_section(
250258

251259

252260
async def _gen_all_sections(
253-
cfg: BackendConfig, context: str, model: Optional[str],
261+
cfg: BackendConfig, context_fn: Callable[[str], str], model: Optional[str],
254262
sections: list[str], critique: bool, parallel: bool,
255263
concurrency: Optional[int] = None,
256264
progress: ProgressCallback = _noop_progress,
@@ -273,7 +281,7 @@ def _emit_done(key: str, ok: bool) -> None:
273281
async def gated(key: str) -> tuple[str, str]:
274282
async with sem:
275283
try:
276-
r = await _gen_section(cfg, key, context, model, critique)
284+
r = await _gen_section(cfg, key, context_fn, model, critique)
277285
_emit_done(key, ok=True)
278286
return r
279287
except Exception:
@@ -286,7 +294,7 @@ async def gated(key: str) -> tuple[str, str]:
286294
results = []
287295
for k in sections:
288296
try:
289-
r = await _gen_section(cfg, k, context, model, critique)
297+
r = await _gen_section(cfg, k, context_fn, model, critique)
290298
results.append(r)
291299
_emit_done(k, ok=True)
292300
except Exception as e: # noqa: BLE001
@@ -350,12 +358,15 @@ def write_paper(
350358
sections = sections or list(SECTION_PROMPTS.keys())
351359
hw = hw or detect_hardware()
352360
allowed = _bib_keys(TEMPLATE_DIR / "references.bib")
353-
ctx = _context(idea, results, hint_for_prompt(hw), allowed)
361+
hw_hint = hint_for_prompt(hw)
362+
363+
def context_fn(section_key: str) -> str:
364+
return _context(idea, results, hw_hint, allowed, section_key=section_key)
354365

355366
log.info("generating %d sections (parallel=%s critique=%s) on hardware=%s peak=%s",
356367
len(sections), parallel, critique, hw.tier, is_minimax_peak())
357368
raw_sections = asyncio.run(
358-
_gen_all_sections(cfg, ctx, model, sections, critique, parallel,
369+
_gen_all_sections(cfg, context_fn, model, sections, critique, parallel,
359370
concurrency=concurrency, progress=progress)
360371
)
361372

@@ -367,10 +378,25 @@ def write_paper(
367378
emit(progress, Progress(kind="stage_end", stage="coherence",
368379
meta={"duration_s": time.time() - t_coh}))
369380

370-
# Collect table IDs from Results for label-recovery pass.
381+
# Cross-section duplicate-table removal. Runs BEFORE citation / label
382+
# fixing so the label-recovery pass doesn't attempt to re-label a block
383+
# we're about to delete.
384+
table_ownership: dict[str, str] = {}
371385
known_table_ids: set[str] = set()
372386
if isinstance(results, Results):
373387
known_table_ids = {t.id for t in results.tables}
388+
for t in results.tables:
389+
if t.owning_section:
390+
table_ownership[f"tab:{t.id}"] = t.owning_section
391+
raw_sections, dedup_events = _dedup_tables(
392+
raw_sections, table_ownership=table_ownership,
393+
)
394+
for e in dedup_events:
395+
emit(progress, Progress(kind="warning", stage="section",
396+
message=f"dedup {e.get('reason')}: "
397+
f"{e.get('label') or '(no label)'} "
398+
f"in {e.get('found_in')}",
399+
meta=e))
374400

375401
# Strip citations whose keys aren't in the bib; ensure table labels.
376402
cleaned: dict[str, str] = {}

0 commit comments

Comments
 (0)