Skip to content

Commit e34f871

Browse files
committed
🐛 Relax too-aggressive mid-line block starts
1 parent c015d35 commit e34f871

2 files changed

Lines changed: 324 additions & 3 deletions

File tree

bibtexparser/splitter.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,24 @@ def _reset_block_status(self, current_char_index: int) -> None:
5757
self._implicit_comment_start_line = self._current_line
5858
self._implicit_comment_start: Optional[int] = current_char_index
5959

60+
def _is_at_line_start(self, pos: int) -> bool:
61+
"""Check if position is at the start of a line (after optional whitespace).
62+
63+
This is used to determine whether an @ sign should be treated as a new
64+
block start (for error recovery) or as content within a field value.
65+
We only want to abort parsing and start a new block if the @ is at the
66+
beginning of a line, to avoid false positives with @ signs in content.
67+
"""
68+
# Scan backwards from pos to find either newline or non-whitespace
69+
for i in range(pos - 1, -1, -1):
70+
char = self.bibstr[i]
71+
if char == "\n":
72+
return True
73+
elif not char.isspace():
74+
return False
75+
# Start of string counts as line start
76+
return True
77+
6078
def _end_implicit_comment(self, end_char_index) -> Optional[ImplicitComment]:
6179
if self._implicit_comment_start is None:
6280
return # No implicit comment started
@@ -122,7 +140,11 @@ def _move_to_closed_bracket(self) -> int:
122140
return m.start()
123141
else:
124142
num_additional_brackets -= 1
125-
elif m.group(0).startswith("@"):
143+
elif m.group(0).startswith("@") and self._is_at_line_start(m.start()):
144+
# Only abort if the @ is at the start of a line.
145+
# This allows @ signs in field values (e.g., "LeQua @ {CLEF}")
146+
# while still providing error recovery when a new block starts
147+
# on a new line within an unclosed block.
126148
self._unaccepted_mark = m
127149
raise BlockAbortedException(
128150
abort_reason=f"Unexpected block start: `{m.group(0)}`. "
@@ -169,8 +191,13 @@ def _is_escaped():
169191
self._unaccepted_mark = next_mark
170192
return next_mark.start()
171193

172-
# Sanity-check: If new block is starting, we abort
173-
elif next_mark.group(0).startswith("@"):
194+
# Sanity-check: If new block is starting at line start, we abort.
195+
# We only abort if the @ is at the start of a line to allow @ signs
196+
# in field values (e.g., "LeQua @ {CLEF}") while still providing
197+
# error recovery when a new block starts on a new line.
198+
elif next_mark.group(0).startswith("@") and self._is_at_line_start(
199+
next_mark.start()
200+
):
174201
self._unaccepted_mark = next_mark
175202

176203
if currently_quote_escaped:
Lines changed: 294 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
"""Tests for block start detection behavior.
2+
3+
These tests verify the fix for issue #488 and the tradeoffs discussed in PR #416:
4+
- @ signs in field values should not be treated as new block starts
5+
- Multiple blocks on the same line should be parsed correctly
6+
- Error recovery should still work when a new block starts at line start
7+
"""
8+
9+
from textwrap import dedent
10+
11+
import pytest
12+
13+
from bibtexparser.splitter import Splitter
14+
15+
16+
# =============================================================================
17+
# Test: @ signs in field values (issue #488)
18+
# =============================================================================
19+
20+
21+
# Each case: (BibTeX source, expected entry key, field to inspect, substring
# that must still be present in that field's value after splitting).
_AT_SIGN_FIELD_CASES = [
    pytest.param(
        dedent(
            """\
            @inproceedings{DBLP:conf/cikm/EsuliM021,
            author = {Andrea Esuli},
            title = {LeQua @ {CLEF} 2022: {A} Shared Task},
            year = {2021}
            }"""
        ),
        "DBLP:conf/cikm/EsuliM021",
        "title",
        "@ {CLEF}",
        id="at_sign_space_brace_in_title",
    ),
    pytest.param(
        '@article{test, email = {john.doe@example.com}}',
        "test",
        "email",
        "john.doe@example.com",
        id="email_address_in_braces",
    ),
    pytest.param(
        '@article{test, email = "john.doe@example.com"}',
        "test",
        "email",
        "john.doe@example.com",
        id="email_address_in_quotes",
    ),
    pytest.param(
        "@article{test, note = {Contact alice@a.com or bob@b.com}}",
        "test",
        "note",
        "alice@a.com",
        id="multiple_at_signs",
    ),
    pytest.param(
        "@article{test, title = {Workshop @ {ICML} 2023}}",
        "test",
        "title",
        "@ {ICML}",
        id="at_sign_followed_by_brace",
    ),
    pytest.param(
        '@article{test, title = "BibTeX entries start with @article{"}',
        "test",
        "title",
        "@article{",
        id="literal_at_entry_in_quotes",
    ),
    pytest.param(
        # Note: 3 closing braces - inner {}, title field, entry
        "@article{test, title = {BibTeX entries start with @article{}}}",
        "test",
        "title",
        "@article{",
        id="literal_at_entry_in_braces",
    ),
]


@pytest.mark.parametrize(
    "bibtex_str,expected_key,expected_field,expected_substring",
    _AT_SIGN_FIELD_CASES,
)
def test_at_sign_in_field_value(
    bibtex_str: str, expected_key: str, expected_field: str, expected_substring: str
):
    """An ``@`` inside a field value must stay content of that field.

    Splits ``bibtex_str`` and checks that nothing failed, that exactly one
    entry came out with key ``expected_key``, and that the value obtained by
    indexing the entry with ``expected_field`` contains
    ``expected_substring``.
    """
    parsed = Splitter(bibtex_str).split()

    assert len(parsed.failed_blocks) == 0
    assert len(parsed.entries) == 1

    only_entry = parsed.entries[0]
    assert only_entry.key == expected_key
    assert expected_substring in only_entry[expected_field]
93+
94+
95+
# =============================================================================
96+
# Test: Multiple blocks on the same line
97+
# =============================================================================
98+
99+
100+
# Each case: (single-line BibTeX source, entry keys expected in order).
_SAME_LINE_ENTRY_CASES = [
    pytest.param(
        "@article{key1, title={A}} @book{key2, title={B}}",
        ["key1", "key2"],
        id="two_entries_with_space",
    ),
    pytest.param(
        "@article{key1,title={A}}@book{key2,title={B}}",
        ["key1", "key2"],
        id="two_entries_no_space",
    ),
    pytest.param(
        "@article{a, x={1}} @book{b, y={2}} @misc{c, z={3}}",
        ["a", "b", "c"],
        id="three_entries",
    ),
]


@pytest.mark.parametrize("bibtex_str,expected_entry_keys", _SAME_LINE_ENTRY_CASES)
def test_multiple_entries_same_line(bibtex_str: str, expected_entry_keys: list):
    """Well-formed entries sharing one line must each become their own entry.

    Checks that no block failed and that the parsed entry keys match
    ``expected_entry_keys`` in both count and order.
    """
    parsed = Splitter(bibtex_str).split()

    assert len(parsed.failed_blocks) == 0
    assert len(parsed.entries) == len(expected_entry_keys)

    found_keys = [entry.key for entry in parsed.entries]
    assert found_keys == expected_entry_keys
127+
128+
129+
# Each case: (single-line BibTeX source, expected number of entries,
# expected number of @string blocks, expected number of comments).
_MIXED_SAME_LINE_CASES = [
    pytest.param(
        '@article{key1, title={A}} @string{mystr = "value"}',
        1,
        1,
        0,
        id="entry_and_string",
    ),
    pytest.param(
        "@article{key1, title={A}} @comment{A comment}",
        1,
        0,
        1,
        id="entry_and_comment",
    ),
]


@pytest.mark.parametrize(
    "bibtex_str,expected_entries,expected_strings,expected_comments",
    _MIXED_SAME_LINE_CASES,
)
def test_mixed_blocks_same_line(
    bibtex_str: str, expected_entries: int, expected_strings: int, expected_comments: int
):
    """Blocks of different kinds sharing one line must all be recognised.

    Checks that nothing failed and that the numbers of entries, strings and
    comments in the split result equal the expected counts.
    """
    parsed = Splitter(bibtex_str).split()

    assert len(parsed.failed_blocks) == 0
    assert len(parsed.entries) == expected_entries
    assert len(parsed.strings) == expected_strings
    assert len(parsed.comments) == expected_comments
158+
159+
160+
# =============================================================================
161+
# Test: Error recovery when new block starts at line start
162+
# =============================================================================
163+
164+
165+
@pytest.mark.parametrize(
    "bibtex_str,expected_valid_key",
    [
        pytest.param(
            dedent(
                """\
                @article{broken, title={Unclosed
                @article{valid, title={Valid Entry}}"""
            ),
            "valid",
            id="unclosed_entry_field",
        ),
        pytest.param(
            dedent(
                """\
                @string{broken = {unclosed value
                @article{valid, title={Valid Entry}}"""
            ),
            "valid",
            id="unclosed_string",
        ),
        pytest.param(
            # The recovering block is deliberately indented relative to the
            # broken one. `dedent` only strips the margin common to both
            # lines, so the second line keeps four leading spaces and the
            # "@ after leading whitespace" path is exercised. Without this
            # indentation the case would duplicate `unclosed_entry_field`.
            dedent(
                """\
                @article{broken, title={Unclosed
                    @article{valid, title={Valid Entry}}"""
            ),
            "valid",
            id="indented_new_block",
        ),
    ],
)
def test_error_recovery_at_line_start(bibtex_str: str, expected_valid_key: str):
    """A block start at the beginning of a line must end a malformed block.

    Each input has an unclosed block followed, on the next line, by a
    well-formed entry. The split result must report exactly one failed block
    and exactly one entry whose key is ``expected_valid_key``.
    """
    library = Splitter(bibtex_str).split()

    assert len(library.failed_blocks) == 1
    assert len(library.entries) == 1
    assert library.entries[0].key == expected_valid_key
204+
205+
206+
def test_error_recovery_preserves_failed_block_raw():
    """The failed block's ``raw`` text must still contain the broken entry.

    After recovering from an unclosed entry, the single failed block is
    expected to expose both the entry key and the unfinished title text.
    """
    source = dedent(
        """\
        @article{broken, title={This is unclosed
        @article{valid, title={OK}}"""
    )
    parsed = Splitter(source).split()

    assert len(parsed.failed_blocks) == 1

    raw_text = parsed.failed_blocks[0].raw
    assert "broken" in raw_text
    assert "This is unclosed" in raw_text
219+
220+
221+
# =============================================================================
222+
# Test: No false recovery for @ mid-line
223+
# =============================================================================
224+
225+
226+
# Unclosed blocks whose only later "@" sits in the middle of the line.
_MID_LINE_AT_CASES = [
    pytest.param(
        "@article{test, title={unclosed @misc{fake}",
        id="at_entry_mid_line",
    ),
    pytest.param(
        "@article{test, title={text @ {more} unclosed",
        id="at_brace_mid_line",
    ),
]


@pytest.mark.parametrize("bibtex_str", _MID_LINE_AT_CASES)
def test_no_false_recovery_mid_line(bibtex_str: str):
    """An ``@`` in the middle of a line must not start error recovery.

    The whole input is expected to fail as a single block, with no entry
    recovered from it.
    """
    parsed = Splitter(bibtex_str).split()

    # One failed block, nothing salvaged
    assert len(parsed.failed_blocks) == 1
    assert len(parsed.entries) == 0
246+
247+
248+
# =============================================================================
249+
# Test: Edge cases
250+
# =============================================================================
251+
252+
253+
# Inputs that must each split cleanly into exactly one entry.
_EDGE_CASE_ENTRIES = [
    pytest.param(
        "@article{test, title={Hello}}",
        id="block_at_file_start",
    ),
    pytest.param(
        " \t @article{test, title={Hello}}",
        id="block_after_whitespace_only",
    ),
    pytest.param(
        "@article{test, title={L1 {L2 {user@email.com} back} done}}",
        id="nested_braces_with_at",
    ),
]


@pytest.mark.parametrize("bibtex_str", _EDGE_CASE_ENTRIES)
def test_edge_cases_entries(bibtex_str: str):
    """Boundary inputs must produce one entry and no failed block."""
    parsed = Splitter(bibtex_str).split()

    assert len(parsed.failed_blocks) == 0
    assert len(parsed.entries) == 1
276+
277+
278+
def test_preamble_with_at_sign():
    """A preamble whose quoted text contains ``@`` yields one preamble block."""
    source = '@preamble{"Contact: admin@site.org"}'
    parsed = Splitter(source).split()

    assert len(parsed.failed_blocks) == 0
    assert len(parsed.preambles) == 1
285+
286+
287+
def test_explicit_comment_with_at_sign():
    """An explicit ``@comment`` containing ``@`` keeps its text intact.

    Exactly one comment is expected, nothing may fail, and the e-mail
    address must appear in the comment's ``comment`` attribute.
    """
    source = "@comment{Email: test@example.com}"
    parsed = Splitter(source).split()

    assert len(parsed.failed_blocks) == 0
    assert len(parsed.comments) == 1

    only_comment = parsed.comments[0]
    assert "test@example.com" in only_comment.comment

0 commit comments

Comments
 (0)