Skip to content

Commit 3902ca3

Browse files
fix(sanitize): prose underscores + orphan $ from truncated LLM output
Real-run demo produced a 6-page PDF only after three manual fixes. Two were systematic sanitize gaps; this commit closes both. 1. escape.py — _BARE_UNDERSCORE rule. The LLM writes Greek letter names as prose ("lambda and epsilon_target") instead of math ($\lambda$ and $\epsilon_{target}$). The prose escape pass handled %, &, <, > but not _. pdflatex then sees `_t` in text mode, tries to open a subscript, and bails with the misleading "Missing $ inserted". Added \_ escape with `(?<!\\)_(?!\{)` so already-escaped \_ and legitimate `_{...}` subscript-like forms (left as-is; the author can wrap in $ if they meant math) pass through. 2. sanitize/math_balance.py — drop orphan `$`. LLM truncates an inline equation mid-sentence: "...compare with $" immediately followed by \section{...}. The half-open math mode cascades into spurious "Missing $ inserted" errors pages later. New pass counts unescaped `$` outside display/tabular math blocks; if odd, drops the last one. Log-warns with offset so the author can fix the truncated prose by hand if they care. Wired into SANITIZE_PIPELINE BEFORE escape_prose_specials, because escape's math-segment scanner assumes balanced inline math. Non-fix (documented): the third demo failure — a structurally mangled equation with \emph{} inside math and malformed \frac — is an LLM generation-quality problem, not something sanitize can recover from. The existing log-driven retry pass already attempts that; stronger retries belong in writeup.py, not here. Tests: 15 new cases. Total suite now 58 passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 21f062c commit 3902ca3

5 files changed

Lines changed: 221 additions & 1 deletion

File tree

skills/hermes-sci/package/hermes_sci/sanitize/escape.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@
4141
_BARE_LT = re.compile(r"(?<![\\$])<(?!=)(?=\s|\d|[A-Za-z])")
4242
_BARE_GT = re.compile(r"(?<![\\$])>(?!=)(?=\s|\d|[A-Za-z])")
4343
_BARE_AMP = re.compile(r"(?<!\\)&(?![A-Za-z]{2,6};)")
44+
# Prose underscores: `epsilon_target`, `file_name` → must be \_ or LaTeX tries
45+
# to open a subscript (which only works in math mode). Skip `\_` (already
46+
# escaped) and `_{...}` (common math-like usage the LLM forgot to wrap in $).
47+
_BARE_UNDERSCORE = re.compile(r"(?<!\\)_(?!\{)")
4448

4549

4650
def _unescape_table_amps(s: str) -> str:
@@ -55,6 +59,7 @@ def _escape_chunk(prose: str) -> str:
5559
prose = _BARE_AMP.sub(r"\\&", prose)
5660
prose = _BARE_LT.sub(r"$<$", prose)
5761
prose = _BARE_GT.sub(r"$>$", prose)
62+
prose = _BARE_UNDERSCORE.sub(r"\\_", prose)
5863
return prose
5964

6065

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""Drop orphan `$` signs left behind by truncated LLM output.
2+
3+
Failure mode this targets: MiniMax / any LLM sometimes truncates an inline
4+
equation mid-sentence, e.g.
5+
6+
We compare against the ensemble variant with $
7+
8+
followed immediately by `\\section{Experiments}`. The unmatched `$` opens
9+
math mode that never closes, so pdflatex fails with "Missing $ inserted"
10+
anywhere downstream — producing a cascade of misleading errors.
11+
12+
We can't reliably guess what the LLM intended to write, so we don't try to
13+
reconstruct the equation. We drop the orphan instead: outside a section the
14+
unclosed `$` is unsalvageable prose anyway, and leaving it breaks the whole
15+
paper. A logged warning surfaces the event so the author can fix the prose
16+
by hand if they care about that sentence.
17+
18+
Scope: only considers `$` that are NOT inside a display-math or tabular
19+
environment (those match as a whole via _MATH_BLOCK below and are preserved
20+
verbatim). Paired `$...$` inline math is fine — we remove only the last
21+
unescaped `$` (assumed to be the orphan) when the count is odd.
22+
"""
23+
from __future__ import annotations
24+
25+
import logging
26+
import re
27+
28+
log = logging.getLogger("hermes_sci.sanitize.math_balance")

# Display / block math (plus tabular / array) environments. A `$` inside one
# of these is not counted as an inline-math delimiter, so the whole region is
# masked out before counting.
_MATH_BLOCK = re.compile(
    r"\\begin\{equation\*?\}.*?\\end\{equation\*?\}"
    r"|\\begin\{align\*?\}.*?\\end\{align\*?\}"
    r"|\\begin\{gather\*?\}.*?\\end\{gather\*?\}"
    r"|\\begin\{multline\*?\}.*?\\end\{multline\*?\}"
    r"|\\begin\{tabular\*?\}.*?\\end\{tabular\*?\}"
    r"|\\begin\{array\}.*?\\end\{array\}"
    r"|\\\[.*?\\\]"
    r"|\$\$.*?\$\$",
    re.DOTALL,
)

# Escaped dollar (literal `$` character) — must be ignored by the counter.
# Matches `\$` preceded by an even number of backslashes, so that `\\$`
# (a `\\` line break followed by a real math delimiter) is NOT mistaken for
# a literal dollar, while `\\\$` (line break + literal dollar) still is.
_ESCAPED_DOLLAR = re.compile(r"(?<!\\)(?:\\\\)*\\\$")


def _blank(match: "re.Match[str]") -> str:
    """Return a run of spaces exactly as wide as the matched text."""
    return " " * len(match.group(0))


def balance_inline_math(s: str) -> str:
    """Drop the last inline-math `$` when their count is odd.

    Only unescaped `$` outside the regions matched by ``_MATH_BLOCK`` are
    counted. When the count is even the input is returned unchanged. When it
    is odd, the last counted `$` is removed and a warning carrying its offset
    is logged. Choosing the last one is a heuristic: the orphan is assumed to
    be the final `$`, as happens when an inline equation is truncated.

    Args:
        s: LaTeX source text to check.

    Returns:
        ``s`` itself when balanced, otherwise ``s`` with one `$` removed.
    """
    # Mask regions the scanner must ignore. Every replacement has the same
    # length as the text it replaces, so an index into `blanked` is also a
    # valid index into `s` — the final slice below relies on that.
    blanked = _MATH_BLOCK.sub(_blank, s)
    blanked = _ESCAPED_DOLLAR.sub(_blank, blanked)

    dollar_positions = [i for i, ch in enumerate(blanked) if ch == "$"]
    if len(dollar_positions) % 2 == 0:
        return s

    # Odd count → drop the LAST `$`, which keeps every earlier completed
    # `$x$` pair intact.
    last = dollar_positions[-1]
    log.warning("dropping orphan $ at offset %d (odd count = %d)",
                last, len(dollar_positions))
    return s[:last] + s[last + 1:]

skills/hermes-sci/package/hermes_sci/sanitize/pipeline.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
5. markdown → LaTeX (bold / italic / heading)
1010
6. strip bad \\input \\include
1111
7. wrap lonely \\item
12-
8. escape prose specials (%, &, <, >) — runs last so earlier passes'
12+
8. balance inline math — drop orphan `$` from truncated LLM output
13+
BEFORE the escape pass, whose math-segment scanner assumes balanced `$`
14+
9. escape prose specials (%, &, <, >, _) — runs last so earlier passes'
1315
output is also escaped
1416
1517
To add a pass: write a module with a `str → str` function, append to
@@ -26,6 +28,7 @@
2628
from .fences import strip_code_fences
2729
from .items import wrap_lonely_items
2830
from .markdown import md_to_latex
31+
from .math_balance import balance_inline_math
2932
from .packages import apply_package_fallbacks
3033
from .reasoning import strip_reasoning
3134

@@ -39,6 +42,7 @@
3942
md_to_latex,
4043
strip_bad_commands,
4144
wrap_lonely_items,
45+
balance_inline_math,
4246
escape_prose_specials,
4347
]
4448

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""Prose-level underscore escape.
2+
3+
Regression for the demo paper pipeline run where the LLM wrote
4+
`(lambda and epsilon_target)` in running prose. Unescaped `_` outside
5+
math mode makes pdflatex emit "Missing $ inserted" and bail.
6+
"""
7+
from __future__ import annotations
8+
9+
from hermes_sci.sanitize.escape import escape_prose_specials
10+
11+
12+
def test_prose_underscore_escaped():
    r"""A bare ``_`` in running prose comes back as ``\_``."""
    escaped = escape_prose_specials(
        "hyperparameters (lambda and epsilon_target) require tuning"
    )
    assert r"epsilon\_target" in escaped
    # Mask the escaped form first so only a raw, unescaped occurrence matches.
    masked = escaped.replace(r"\_", "@")
    assert "epsilon_target" not in masked
17+
18+
19+
def test_inline_math_underscores_preserved():
    """Underscores inside ``$...$`` segments survive exactly as written."""
    text = r"the threshold $\epsilon_{target}$ is set to $x_1$"
    result = escape_prose_specials(text)
    for segment in (r"$\epsilon_{target}$", r"$x_1$"):
        assert segment in result
25+
26+
27+
def test_display_math_underscores_preserved():
    """Underscores inside an ``equation`` environment are not escaped."""
    document = "\n".join([
        r"See eq:",
        r"\begin{equation}",
        r" f(x) = w_i^\top x_1",
        r"\end{equation}",
        r"end.",
    ])
    result = escape_prose_specials(document)
    # The equation body must come through verbatim.
    assert "w_i^\\top x_1" in result
    assert result.endswith("end.")
38+
39+
40+
def test_already_escaped_underscore_untouched():
    r"""An existing ``\_`` is not escaped a second time."""
    original = r"see file\_name for details"
    assert escape_prose_specials(original) == original
44+
45+
46+
def test_subscript_like_underscore_in_prose_kept_raw():
    r"""Prose ``word_{sub}`` is deliberately passed through unescaped.

    The ``_{`` form almost always introduces math content the LLM forgot to
    wrap in ``$...$``; rewriting it to ``word\_{sub}`` would look worse, so
    fixing the math-mode wrapping is left to the author. This is documented
    behaviour, not an oversight.
    """
    rendered = escape_prose_specials(r"the term x_{target} appears unescaped")
    # The underscore pattern's `(?!\{)` lookahead skips `_{`.
    assert r"x_{target}" in rendered
56+
57+
58+
def test_underscore_in_textbf_argument_escaped():
    r"""A bare underscore inside a ``\textbf{...}`` argument is escaped."""
    result = escape_prose_specials(r"\textbf{epsilon_target} is our hyperparam")
    expected_fragment = r"\textbf{epsilon\_target}"
    assert expected_fragment in result
62+
63+
64+
def test_all_prose_specials_together():
    """Every prose special in one sentence is escaped by a single call."""
    mixed = "100% improvement in R&D; compare A<B, use file_name for x>y"
    escaped = escape_prose_specials(mixed)
    for fragment in (r"100\%", r"R\&D", r"file\_name", "$<$", "$>$"):
        assert fragment in escaped
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""Orphan `$` detection & removal.
2+
3+
Regression for the demo paper run where MiniMax truncated an inline
4+
equation mid-sentence (`with $` followed by `\\section{...}`), leaving an
5+
unclosed math mode that crashed pdflatex with a misleading error pointing
6+
far downstream.
7+
"""
8+
from __future__ import annotations
9+
10+
from hermes_sci.sanitize.math_balance import balance_inline_math
11+
12+
13+
def test_balanced_inline_math_unchanged():
    """Text whose inline ``$`` delimiters already pair up comes back as-is."""
    balanced = r"we set $x = 1$ and $y = 2$; done."
    result = balance_inline_math(balanced)
    assert result == balanced
16+
17+
18+
def test_no_math_unchanged():
    """Text without any ``$`` is returned untouched."""
    plain = "plain text with no dollar signs at all."
    result = balance_inline_math(plain)
    assert result == plain
21+
22+
23+
def test_orphan_dollar_at_sentence_end_dropped():
    r"""A lone ``$`` right before a ``\section`` is removed; the heading stays."""
    truncated = (
        "We compare against the ensemble variant with $"
        "\n\n"
        "\\section{Experiments}"
    )
    cleaned = balance_inline_math(truncated)
    assert "$" not in cleaned
    assert r"\section{Experiments}" in cleaned
28+
29+
30+
def test_orphan_dollar_preserves_earlier_balanced_pair():
    """Dropping a trailing stray ``$`` leaves an earlier ``$x=1$`` intact."""
    cleaned = balance_inline_math(r"we have $x=1$ and later a stray $ appears.")
    # The completed pair is still present ...
    assert "$x=1$" in cleaned
    # ... and exactly one of the three dollars has been removed.
    assert cleaned.count("$") == 2
37+
38+
39+
def test_display_math_ignored_by_counter():
    r"""Balanced prose math next to ``equation`` and ``\[ \]`` blocks is a no-op."""
    document = "\n".join([
        r"prose $x$ more.",
        r"\begin{equation}",
        r"a = b",
        r"\end{equation}",
        r"\[ c = d \]",
        r"final.",
    ])
    # The prose holds one balanced `$x$` pair; the display blocks must not
    # disturb that count, so nothing is removed.
    assert balance_inline_math(document) == document
50+
51+
52+
def test_escaped_dollar_ignored():
    r"""A literal ``\$`` is not counted as an inline-math delimiter."""
    text = r"literal \$100 sign and paired $x$ — balanced."
    result = balance_inline_math(text)
    assert result == text
56+
57+
58+
def test_triple_dollar_in_prose_drops_last():
    """Three bare ``$`` in prose are reduced to an even count of two."""
    cleaned = balance_inline_math("three stray $ dollars $ in $ a row")
    remaining = cleaned.count("$")
    assert remaining == 2  # one removed → even count restored
62+
63+
64+
def test_dollar_inside_display_math_does_not_mask_prose_orphan():
    """A bare prose ``$`` is still treated as an orphan when a
    self-contained display-math block follows it."""
    document = "\n".join([
        r"prose $ orphan here.",
        r"\begin{equation}",
        r"x + y = z",
        r"\end{equation}",
    ])
    cleaned = balance_inline_math(document)
    # Only the text ahead of the equation environment is inspected.
    prose_part, _, _ = cleaned.partition(r"\begin{equation}")
    assert "$" not in prose_part

0 commit comments

Comments
 (0)