Skip to content

Commit e423708

Browse files
committed
add test case for chapter filtering
1 parent aef5d5d commit e423708

3 files changed

Lines changed: 59 additions & 25 deletions

File tree

machine/corpora/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from .paratext_project_settings import ParatextProjectSettings
2828
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
2929
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
30-
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
30+
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter
3131
from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
3232
from .paratext_text_corpus import ParatextTextCorpus
3333
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ def __init__(
3131
def update_usfm(
3232
self,
3333
book_id: str,
34-
chapters: Optional[Sequence[int]] = None,
3534
rows: Optional[Sequence[UpdateUsfmRow]] = None,
35+
chapters: Optional[Sequence[int]] = None,
3636
full_name: Optional[str] = None,
3737
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
3838
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
@@ -65,7 +65,7 @@ def update_usfm(
6565
try:
6666
tokenizer = UsfmTokenizer(self._settings.stylesheet)
6767
tokens = tokenizer.tokenize(usfm)
68-
tokens = self.filter_tokens_by_chapter(tokens, chapters)
68+
tokens = filter_tokens_by_chapter(tokens, chapters)
6969
parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
7070
parser.process_tokens()
7171
return handler.get_usfm(self._settings.stylesheet)
@@ -77,24 +77,25 @@ def update_usfm(
7777
)
7878
raise RuntimeError(error_message) from e
7979

80-
def filter_tokens_by_chapter(
81-
self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
82-
) -> Sequence[UsfmToken]:
83-
if chapters is None:
84-
return tokens
85-
tokens_within_chapters: List[UsfmToken] = []
86-
in_chapter: bool = False
87-
for index, token in enumerate(tokens):
88-
if index == 0 and token.marker == "id":
89-
tokens_within_chapters.append(token)
90-
if 1 in chapters:
91-
in_chapter = True
92-
elif token.type == UsfmTokenType.CHAPTER:
93-
if token.data and int(token.data) in chapters:
94-
in_chapter = True
95-
tokens_within_chapters.append(token)
96-
else:
97-
in_chapter = False
98-
elif in_chapter:
80+
81+
def filter_tokens_by_chapter(
82+
tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
83+
) -> Sequence[UsfmToken]:
84+
if chapters is None:
85+
return tokens
86+
tokens_within_chapters: List[UsfmToken] = []
87+
in_chapter: bool = False
88+
for index, token in enumerate(tokens):
89+
if index == 0 and token.marker == "id":
90+
tokens_within_chapters.append(token)
91+
if 1 in chapters:
92+
in_chapter = True
93+
elif token.type == UsfmTokenType.CHAPTER:
94+
if token.data and int(token.data) in chapters:
95+
in_chapter = True
9996
tokens_within_chapters.append(token)
100-
return tokens_within_chapters
97+
else:
98+
in_chapter = False
99+
elif in_chapter:
100+
tokens_within_chapters.append(token)
101+
return tokens_within_chapters

tests/corpora/test_update_usfm_parser_handler.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@
99
UpdateUsfmParserHandler,
1010
UpdateUsfmRow,
1111
UpdateUsfmTextBehavior,
12+
UsfmParser,
13+
UsfmTokenizer,
1214
UsfmUpdateBlock,
1315
UsfmUpdateBlockElementType,
1416
UsfmUpdateBlockHandler,
15-
parse_usfm,
17+
filter_tokens_by_chapter,
1618
)
1719

1820

@@ -1494,13 +1496,39 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text():
14941496
)
14951497

14961498

1499+
def test_filter_chapters() -> None:
1500+
usfm = r"""\id MAT - Test
1501+
\h Matthew
1502+
\c 1
1503+
\v 1 Some text
1504+
\v 2
1505+
\v 3 Other text
1506+
\c 2
1507+
\v 1 Some text
1508+
\c 3
1509+
\v 1 Some text
1510+
\c 4
1511+
\v 1 Some text
1512+
"""
1513+
chapters = [2, 4]
1514+
target = update_usfm(chapters=chapters, source=usfm)
1515+
result = r"""\id MAT
1516+
\c 2
1517+
\v 1 Some text
1518+
\c 4
1519+
\v 1 Some text
1520+
"""
1521+
assert_usfm_equals(target, result)
1522+
1523+
14971524
def scr_ref(*refs: str) -> List[ScriptureRef]:
14981525
return [ScriptureRef.parse(ref) for ref in refs]
14991526

15001527

15011528
def update_usfm(
15021529
rows: Optional[Sequence[UpdateUsfmRow]] = None,
15031530
source: Optional[str] = None,
1531+
chapters: Optional[Sequence[int]] = None,
15041532
id_text: Optional[str] = None,
15051533
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW,
15061534
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
@@ -1516,6 +1544,7 @@ def update_usfm(
15161544
return updater.update_usfm(
15171545
"MAT",
15181546
rows,
1547+
chapters,
15191548
id_text,
15201549
text_behavior,
15211550
paragraph_behavior,
@@ -1542,7 +1571,11 @@ def update_usfm(
15421571
lambda _: False,
15431572
compare_segments,
15441573
)
1545-
parse_usfm(source, updater)
1574+
tokenizer = UsfmTokenizer()
1575+
tokens = tokenizer.tokenize(source)
1576+
tokens = filter_tokens_by_chapter(tokens, chapters)
1577+
parser = UsfmParser(tokens, updater)
1578+
parser.process_tokens()
15461579
return updater.get_usfm()
15471580

15481581

0 commit comments

Comments
 (0)