feat(dedup): cross-section duplicate table removal

easyvibecoding · claude · easyvibecoding · commit d1e25e12a672 · 2026-04-14T18:24:16.000+08:00
Two signals, in priority order:

1. Declared ownership — results.json tables may set owning_section
   (schema already supported it). Per-section prompt context now
   renders that table's full pipe layout ONLY when writing the
   owning section; other sections see a REFERENCE-ONLY stub that
   explicitly tells the LLM "do NOT re-emit \begin{table}".

2. Structural fingerprint — (normalized caption, column count,
   row count). First occurrence wins; later duplicates are replaced
   with a `% (duplicate table removed)` comment, so every
   \ref{tab:...} resolves against the single surviving copy.

The pass is a dict-level operation (section_key → LaTeX), so it
lives in sanitize/tables.py outside the per-section pipeline.
Invoked from write_paper after _gen_all_sections and before
citation/label fixing — otherwise label recovery would try to
heal a block we're about to drop.

Progress events fire per-removed-block (warning kind under the
section stage) with the dedup reason, so agents using --progress
jsonl can surface dedup activity.

Also: Table.from_dict now carries owning_section; to_prompt_context
accepts a section_key filter; _context/_gen_section switched from
a single string context to a context_fn builder so each section
gets a section-specific prompt.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/skills/hermes-sci/package/hermes_sci/results.py b/skills/hermes-sci/package/hermes_sci/results.py
@@ -72,6 +72,7 @@ class Table:
     caption: str
     headers: list[str]
     rows: list[list[str]]
+    owning_section: str = ""
 
     @classmethod
     def from_dict(cls, d: dict) -> "Table":
@@ -80,6 +81,7 @@ def from_dict(cls, d: dict) -> "Table":
             caption=str(d.get("caption", "")).strip(),
             headers=[str(h) for h in (d.get("headers") or [])],
             rows=[[str(c) for c in row] for row in (d.get("rows") or [])],
+            owning_section=str(d.get("owning_section", "")).strip(),
         )
 
 
@@ -114,9 +116,14 @@ def all_numeric_values(self) -> list[float]:
             out.extend(_scan_numbers(self.raw_log))
         return out
 
-    def to_prompt_context(self) -> str:
-        """Render as compact text for LLM prompt. Keeps specific numbers
-        verbatim so the writer can drop them into prose."""
+    def to_prompt_context(self, section_key: Optional[str] = None) -> str:
+        """Render as compact text for LLM prompt.
+
+        When `section_key` is set, tables with an `owning_section` that does
+        NOT match are rendered as reference-only stubs (label + caption)
+        instead of full pipe tables. This stops the LLM from happily
+        re-emitting the same `\\begin{table}` in every section.
+        """
         lines = ["EXPERIMENT RESULTS (author-supplied — use these exact numbers):", ""]
 
         # Verbatim string block — names/versions/hardware to be copied EXACTLY,
@@ -148,14 +155,32 @@ def to_prompt_context(self) -> str:
                 lines.append(f"  - {meth}{m.name} = {m.value}{m.unit}{sp}{ctx}")
             lines.append("")
         if self.tables:
+            def _owns(t: Table) -> bool:
+                if not section_key:
+                    return True
+                if not t.owning_section:
+                    return True  # unassigned tables visible everywhere
+                return t.owning_section == section_key
+
+            own, other = [], []
+            for t in self.tables:
+                (own if _owns(t) else other).append(t)
+
             lines.append("Tables:")
             lines.append("  LaTeX labels to use: " +
                          ", ".join(f"tab:{t.id}" for t in self.tables))
             lines.append("  When you render a \\begin{table}, add "
                          "\\label{tab:<id>} matching the ID below, and "
                          "cross-reference with Table~\\ref{tab:<id>}. "
                          "Never write 'Table ??' or a bare \\ref{}.")
-            for t in self.tables:
+            if section_key and other:
+                refs = ", ".join(f"tab:{t.id} (owned by {t.owning_section})"
+                                 for t in other)
+                lines.append(f"  REFERENCE-ONLY in this section "
+                             f"(already rendered elsewhere — use "
+                             f"Table~\\ref{{tab:<id>}} only, do NOT "
+                             f"re-emit \\begin{{table}}): {refs}")
+            for t in own:
                 lines.append("")
                 lines.append(f"  [\\label{{tab:{t.id}}}] {t.caption}")
                 lines.append(f"    | {' | '.join(t.headers)} |")
diff --git a/skills/hermes-sci/package/hermes_sci/sanitize/tables.py b/skills/hermes-sci/package/hermes_sci/sanitize/tables.py
@@ -0,0 +1,159 @@
+"""Cross-section table de-duplication.
+
+The LLM sometimes re-emits the same `\\begin{table} ... \\end{table}` in
+multiple sections (e.g. a complexity-buckets table in both Experiments and
+Results) — pdflatex happily compiles two tables with the same \\label, which
+then poisons every `\\ref{tab:...}` in the paper.
+
+Dedup operates on the *dict* of section-name → LaTeX body (not a single
+string), so it can choose which section keeps the full table and which ones
+are demoted to a bare reference comment.
+
+Strategy — two complementary signals:
+
+  1. **Declared ownership** (preferred): when a table has `owning_section`
+     set in results.json, its `\\begin{table} ... \\label{tab:<id>}` block
+     is kept ONLY in that section; occurrences elsewhere are replaced by
+     a short LaTeX comment naming the label, leaving `\\ref` to resolve
+     against the single copy kept in the owning section.
+
+  2. **Structural fingerprint** (fallback for tables without owning_section
+     or without `\\label`): block hash = (normalized caption, column count,
+     row count). First occurrence wins; later duplicates are replaced by
+     a comment that points back to the first one.
+
+This pass is a *cross-section* operation, hence it lives outside the
+per-section SANITIZE_PIPELINE and is invoked explicitly from writeup.py
+after `_gen_all_sections`.
+"""
+from __future__ import annotations
+
+import logging
+import re
+from typing import Optional
+
+# Module-level logger, named after this module's dotted path.
+log = logging.getLogger("hermes_sci.sanitize.tables")
+
+# Match a full LaTeX table environment (`table` or `table*`). The body is
+# matched non-greedily up to the first `\end{table}` / `\end{table*}`; a nested
+# `tabular` does not end the match because `\end{tabular}` is a different token.
+_TABLE_ENV = re.compile(
+    r"\\begin\{table\*?\}(?P<body>.*?)\\end\{table\*?\}",
+    re.DOTALL,
+)
+# Caption text, matched non-greedily up to the FIRST `}`. A caption containing
+# a braced group (e.g. `\textbf{...}`) is therefore captured only up to that
+# group's closing brace.
+_CAPTION = re.compile(r"\\caption\{(?P<cap>.*?)\}", re.DOTALL)
+# Only labels carrying the `tab:` prefix are recognised.
+_LABEL = re.compile(r"\\label\{(?P<id>tab:[A-Za-z0-9_:\-]+)\}")
+# First braced argument after `\begin{tabular}` / `\begin{tabular*}` (and an
+# optional `[pos]` argument). The capture stops at the first `}`, so specs that
+# contain braces themselves (`p{3cm}`, `@{}`) are truncated there.
+# NOTE(review): for `tabular*` the first braced argument is the width, not the
+# column spec — confirm whether that matters for fingerprinting.
+_COLS = re.compile(r"\\begin\{tabular\*?\}(?:\[[^\]]*\])?\{(?P<spec>[^}]*)\}")
+# A literal double backslash, i.e. the LaTeX row terminator.
+_ROW_END = re.compile(r"\\\\")
+
+
+def _norm_caption(s: str) -> str:
+    """Reduce a raw caption to a lowercase, single-spaced comparison key.
+
+    Control words (a backslash followed by letters, optionally starred),
+    braces and tildes are each replaced by a space before whitespace is
+    collapsed, so purely typographic differences do not affect the key.
+    """
+    without_macros = re.sub(r"\\[a-zA-Z]+\*?", " ", s)
+    without_markup = re.sub(r"[{}~]", " ", without_macros)
+    return re.sub(r"\s+", " ", without_markup).strip().lower()
+
+
+def _col_count(tabular_spec: str) -> int:
+    """Count columns in a tabular spec like `|l|c|c|r|` → 4.
+
+    Braced arguments are dropped before counting, so the letters inside a
+    width or inter-column argument are not mistaken for column types:
+    `|l|p{3cm}|r|` → 3 and `@{}lcr@{}` → 3. An unterminated group (a spec
+    that was cut at its first `}` by the caller's regex) is dropped up to the
+    end of the string. `*{n}{...}` repeats are not expanded.
+
+    Args:
+        tabular_spec: the column specification text; None or "" yields 0.
+
+    Returns:
+        The number of `l c r p X m b j` column-type letters left in the spec.
+    """
+    spec = tabular_spec or ""
+    # The closing brace is optional so a truncated `p{3cm` is removed too.
+    spec = re.sub(r"\{[^{}]*\}?", "", spec)
+    return len(re.sub(r"[^lcrpXmbj]", "", spec))
+
+
+def _fingerprint(table_src: str) -> tuple[str, int, int]:
+    """Build the structural key `(caption, column count, row count)`.
+
+    The caption is the normalized text of the first caption match ("" when
+    there is none), the column count comes from the first tabular spec match
+    (0 when there is none), and the row count is the number of row
+    terminators found anywhere in `table_src`.
+    """
+    caption_match = _CAPTION.search(table_src)
+    spec_match = _COLS.search(table_src)
+    caption_key = (
+        "" if caption_match is None
+        else _norm_caption(caption_match.group("cap"))
+    )
+    n_cols = 0 if spec_match is None else _col_count(spec_match.group("spec"))
+    n_rows = len(_ROW_END.findall(table_src))
+    return (caption_key, n_cols, n_rows)
+
+
+def _label_id(table_src: str) -> Optional[str]:
+    """Return the first `tab:`-prefixed label id in `table_src`, else None."""
+    found = _LABEL.search(table_src)
+    if found is None:
+        return None
+    return found.group("id")
+
+
+def dedup_tables(
+    sections: dict[str, str],
+    *,
+    table_ownership: Optional[dict[str, str]] = None,
+) -> tuple[dict[str, str], list[dict]]:
+    """Remove duplicate `\\begin{table}` blocks across sections.
+
+    Blocks are visited in section iteration order. A block is removed by the
+    first of these rules that applies:
+
+      1. its label has a declared owner that is a different section, and the
+         owning section really contains a block carrying that label;
+      2. a block with the same label was already kept;
+      3. a block with the same structural fingerprint was already kept.
+
+    Args:
+        sections: section_key → LaTeX body.
+        table_ownership: map of "tab:<id>" → owning_section. A block with a
+            label in this map is kept ONLY in its owning section, provided
+            that section actually renders a block with the label. When it
+            does not (unknown section key, or the table was never emitted
+            there) the declaration is ignored for that label, so the table
+            is not deleted from the whole paper; rules 2 and 3 still apply.
+            Pass {} or None to rely on duplicate labels and fingerprints only.
+
+    Returns:
+        (new_sections, events) — `events` is a list of dicts describing each
+        removed block (useful for verification_report.json and progress
+        callbacks). Every event has "reason" and "found_in"; the remaining
+        keys depend on the reason.
+    """
+    table_ownership = table_ownership or {}
+
+    # Ownership is enforceable only when the owner really renders the table.
+    # Demoting unconditionally would delete every copy and leave each
+    # \ref{tab:...} to that table dangling.
+    enforceable: set[str] = set()
+    for owned_label, owner in table_ownership.items():
+        if not owner:
+            continue
+        owner_body = sections.get(owner) or ""
+        if any(_label_id(m.group("body")) == owned_label
+               for m in _TABLE_ENV.finditer(owner_body)):
+            enforceable.add(owned_label)
+        else:
+            log.warning(
+                "dedup_tables: owning section %r has no table labelled %s; "
+                "falling back to first-occurrence-wins", owner, owned_label)
+
+    # fingerprint → (section_key that kept it, its label or "")
+    seen_fp: dict[tuple[str, int, int], tuple[str, str]] = {}
+    # label_seen: label_id → (section_key that kept it)
+    label_seen: dict[str, str] = {}
+    events: list[dict] = []
+
+    def _demote(label: Optional[str], kept_in: Optional[str]) -> str:
+        """Replacement text for a removed table block."""
+        if label and kept_in:
+            return (f"% (duplicate table \\ref{{{label}}} removed — "
+                    f"rendered in section {kept_in!r})\n")
+        if label:
+            return f"% (duplicate table \\ref{{{label}}} removed)\n"
+        return "% (duplicate table removed)\n"
+
+    out: dict[str, str] = {}
+    for sec_key, body in sections.items():
+        # Defined per section so the callback sees the current `sec_key`; it
+        # is consumed by `.sub()` below before the loop advances.
+        def _replace(m: re.Match) -> str:
+            block = m.group(0)
+            inner = m.group("body")
+            label = _label_id(inner)
+            # 1. Declared ownership — enforceable label, section mismatch.
+            if label in enforceable:
+                owner = table_ownership[label]
+                if owner != sec_key:
+                    events.append({
+                        "reason": "owning_section",
+                        "label": label,
+                        "found_in": sec_key,
+                        "owner": owner,
+                    })
+                    return _demote(label, owner)
+                # This is the owning section: fall through.
+            # 2. Label already seen in an earlier block → demote.
+            if label and label in label_seen:
+                kept_in = label_seen[label]
+                events.append({
+                    "reason": "duplicate_label",
+                    "label": label,
+                    "found_in": sec_key,
+                    "kept_in": kept_in,
+                })
+                return _demote(label, kept_in)
+            # 3. Structural fingerprint match → demote. An all-empty
+            #    fingerprint carries no signal and is never matched.
+            fp = _fingerprint(inner)
+            if fp != ("", 0, 0) and fp in seen_fp:
+                kept_in, kept_label = seen_fp[fp]
+                events.append({
+                    "reason": "fingerprint",
+                    "fingerprint": list(fp),
+                    "found_in": sec_key,
+                    "kept_in": kept_in,
+                    "kept_label": kept_label,
+                })
+                return _demote(label or kept_label, kept_in)
+            # Keep it — record.
+            if label:
+                label_seen[label] = sec_key
+            if fp != ("", 0, 0):
+                seen_fp[fp] = (sec_key, label or "")
+            return block
+
+        out[sec_key] = _TABLE_ENV.sub(_replace, body)
+
+    if events:
+        log.info("dedup_tables: removed %d duplicate block(s) across %d sections",
+                 len(events), len(sections))
+    return out, events
diff --git a/skills/hermes-sci/package/hermes_sci/writeup.py b/skills/hermes-sci/package/hermes_sci/writeup.py
@@ -22,7 +22,7 @@
 import shutil
 import subprocess
 import time
-from typing import Optional
+from typing import Callable, Optional
 
 from jinja2 import Environment, FileSystemLoader
 
@@ -32,6 +32,7 @@
 from .progress import Progress, ProgressCallback, emit, noop as _noop_progress
 from .results import Results, from_dict as results_from_dict, load as results_load
 from .sanitize import sanitize_latex as _sanitize_latex
+from .sanitize.tables import dedup_tables as _dedup_tables
 from .verify import audit as verify_audit, annotate_unverified
 
 log = logging.getLogger("hermes_sci.writeup")
@@ -178,13 +179,19 @@ class Paper:
     sections: dict[str, str]
 
 
-def _context(idea: dict, results_blob, hw_hint: str, bib_keys: set[str]) -> str:
-    """Assemble the shared prompt context.
+def _context(idea: dict, results_blob, hw_hint: str, bib_keys: set[str],
+             section_key: Optional[str] = None) -> str:
+    """Assemble the prompt context.
 
     `results_blob` may be:
       - None (no results; Phase 1 placeholder-style output)
       - str (free-form markdown; Phase 1 compat)
       - Results dataclass (Phase 3 structured — rendered with exact numbers)
+
+    `section_key` filters tables: each table with an `owning_section` is only
+    fully rendered when writing that section; otherwise it's listed as a
+    reference-only label. Prevents duplicate `\\begin{table}` blocks across
+    sections (the LLM happily re-emits tables it sees in its prompt).
     """
     parts = [
         "IDEA METADATA",
@@ -200,7 +207,7 @@ def _context(idea: dict, results_blob, hw_hint: str, bib_keys: set[str]) -> str:
         f"ALLOWED BIB KEYS (use only these): {sorted(bib_keys) or '(none)'}",
     ]
     if isinstance(results_blob, Results):
-        parts += ["", results_blob.to_prompt_context(),
+        parts += ["", results_blob.to_prompt_context(section_key=section_key),
                   "",
                   "STRICT: Only cite numbers that appear in the metrics or "
                   "tables above. Do NOT invent percentages, BLEU scores, "
@@ -215,11 +222,12 @@ def _context(idea: dict, results_blob, hw_hint: str, bib_keys: set[str]) -> str:
 
 
 async def _gen_section(
-    cfg: BackendConfig, key: str, context: str, model: Optional[str],
-    critique: bool,
+    cfg: BackendConfig, key: str, context_fn: Callable[[str], str],
+    model: Optional[str], critique: bool,
 ) -> tuple[str, str]:
     """Generate (optionally self-critiqued) LaTeX for one section."""
     instr = SECTION_PROMPTS[key]
+    context = context_fn(key)
     user = (
         f"{context}\n\nTASK: {instr}\n\n"
         f"Return ONLY the LaTeX body (no \\section header)."
@@ -250,7 +258,7 @@ async def _gen_section(
 
 
 async def _gen_all_sections(
-    cfg: BackendConfig, context: str, model: Optional[str],
+    cfg: BackendConfig, context_fn: Callable[[str], str], model: Optional[str],
     sections: list[str], critique: bool, parallel: bool,
     concurrency: Optional[int] = None,
     progress: ProgressCallback = _noop_progress,
@@ -273,7 +281,7 @@ def _emit_done(key: str, ok: bool) -> None:
         async def gated(key: str) -> tuple[str, str]:
             async with sem:
                 try:
-                    r = await _gen_section(cfg, key, context, model, critique)
+                    r = await _gen_section(cfg, key, context_fn, model, critique)
                     _emit_done(key, ok=True)
                     return r
                 except Exception:
@@ -286,7 +294,7 @@ async def gated(key: str) -> tuple[str, str]:
         results = []
         for k in sections:
             try:
-                r = await _gen_section(cfg, k, context, model, critique)
+                r = await _gen_section(cfg, k, context_fn, model, critique)
                 results.append(r)
                 _emit_done(k, ok=True)
             except Exception as e:  # noqa: BLE001
@@ -350,12 +358,15 @@ def write_paper(
     sections = sections or list(SECTION_PROMPTS.keys())
     hw = hw or detect_hardware()
     allowed = _bib_keys(TEMPLATE_DIR / "references.bib")
-    ctx = _context(idea, results, hint_for_prompt(hw), allowed)
+    hw_hint = hint_for_prompt(hw)
+
+    def context_fn(section_key: str) -> str:
+        return _context(idea, results, hw_hint, allowed, section_key=section_key)
 
     log.info("generating %d sections (parallel=%s critique=%s) on hardware=%s peak=%s",
              len(sections), parallel, critique, hw.tier, is_minimax_peak())
     raw_sections = asyncio.run(
-        _gen_all_sections(cfg, ctx, model, sections, critique, parallel,
+        _gen_all_sections(cfg, context_fn, model, sections, critique, parallel,
                           concurrency=concurrency, progress=progress)
     )
 
@@ -367,10 +378,25 @@ def write_paper(
         emit(progress, Progress(kind="stage_end", stage="coherence",
                                 meta={"duration_s": time.time() - t_coh}))
 
-    # Collect table IDs from Results for label-recovery pass.
+    # Cross-section duplicate-table removal. Runs BEFORE citation / label
+    # fixing so the label-recovery pass doesn't attempt to re-label a block
+    # we're about to delete.
+    table_ownership: dict[str, str] = {}
     known_table_ids: set[str] = set()
     if isinstance(results, Results):
         known_table_ids = {t.id for t in results.tables}
+        for t in results.tables:
+            if t.owning_section:
+                table_ownership[f"tab:{t.id}"] = t.owning_section
+    raw_sections, dedup_events = _dedup_tables(
+        raw_sections, table_ownership=table_ownership,
+    )
+    for e in dedup_events:
+        emit(progress, Progress(kind="warning", stage="section",
+                                message=f"dedup {e.get('reason')}: "
+                                        f"{e.get('label') or '(no label)'} "
+                                        f"in {e.get('found_in')}",
+                                meta=e))
 
     # Strip citations whose keys aren't in the bib; ensure table labels.
     cleaned: dict[str, str] = {}