Skip to content

Commit aef5d5d

Browse files
committed
move filtering before token processing
1 parent d8ca02d commit aef5d5d

2 files changed

Lines changed: 33 additions & 26 deletions

File tree

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC
2-
from typing import Callable, Iterable, Optional, Sequence, Union
2+
from typing import Callable, Iterable, List, Optional, Sequence, Union
33

44
from .paratext_project_file_handler import ParatextProjectFileHandler
55
from .paratext_project_settings import ParatextProjectSettings
@@ -10,7 +10,9 @@
1010
UpdateUsfmRow,
1111
UpdateUsfmTextBehavior,
1212
)
13-
from .usfm_parser import parse_usfm
13+
from .usfm_parser import UsfmParser
14+
from .usfm_token import UsfmTokenType
15+
from .usfm_tokenizer import UsfmToken, UsfmTokenizer
1416
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
1517

1618

@@ -61,12 +63,38 @@ def update_usfm(
6163
compare_segments=compare_segments,
6264
)
6365
try:
64-
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
65-
return handler.get_usfm(self._settings.stylesheet, chapters)
66+
tokenizer = UsfmTokenizer(self._settings.stylesheet)
67+
tokens = tokenizer.tokenize(usfm)
68+
tokens = self.filter_tokens_by_chapter(tokens, chapters)
69+
parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
70+
parser.process_tokens()
71+
return handler.get_usfm(self._settings.stylesheet)
6672
except Exception as e:
6773
error_message = (
6874
f"An error occurred while parsing the usfm for '{book_id}'"
6975
f"{f' in project {self._settings.name}' if self._settings.name else ''}"
7076
f". Error: '{e}'"
7177
)
7278
raise RuntimeError(error_message) from e
79+
80+
def filter_tokens_by_chapter(
81+
self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
82+
) -> Sequence[UsfmToken]:
83+
if chapters is None:
84+
return tokens
85+
tokens_within_chapters: List[UsfmToken] = []
86+
in_chapter: bool = False
87+
for index, token in enumerate(tokens):
88+
if index == 0 and token.marker == "id":
89+
tokens_within_chapters.append(token)
90+
if 1 in chapters:
91+
in_chapter = True
92+
elif token.type == UsfmTokenType.CHAPTER:
93+
if token.data and int(token.data) in chapters:
94+
in_chapter = True
95+
tokens_within_chapters.append(token)
96+
else:
97+
in_chapter = False
98+
elif in_chapter:
99+
tokens_within_chapters.append(token)
100+
return tokens_within_chapters

machine/corpora/update_usfm_parser_handler.py

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -334,15 +334,11 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -
334334
if embed_outside_of_block:
335335
self._end_update_block(state, [scripture_ref])
336336

337-
def get_usfm(
338-
self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None
339-
) -> str:
337+
def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
340338
if isinstance(stylesheet, str):
341339
stylesheet = UsfmStylesheet(stylesheet)
342340
tokenizer = UsfmTokenizer(stylesheet)
343341
tokens = list(self._tokens)
344-
if chapters is not None:
345-
tokens = self._get_incremental_draft_tokens(tokens, chapters)
346342
if len(self._remarks) > 0:
347343
remark_tokens: List[UsfmToken] = []
348344
for remark in self._remarks:
@@ -354,23 +350,6 @@ def get_usfm(
354350
tokens[index + 1 : index + 1] = remark_tokens
355351
return tokenizer.detokenize(tokens)
356352

357-
def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]:
358-
incremental_draft_tokens: List[UsfmToken] = []
359-
in_chapter: bool = False
360-
for index, token in enumerate(tokens):
361-
if index == 0 and token.marker == "id":
362-
incremental_draft_tokens.append(token)
363-
continue
364-
elif token.type == UsfmTokenType.CHAPTER:
365-
if token.data and int(token.data) in chapters:
366-
in_chapter = True
367-
incremental_draft_tokens.append(token)
368-
else:
369-
in_chapter = False
370-
elif in_chapter:
371-
incremental_draft_tokens.append(token)
372-
return incremental_draft_tokens
373-
374353
def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
375354
row_texts: List[str] = []
376355
row_metadata = None

0 commit comments

Comments (0)