|
1 | 1 | from abc import ABC |
2 | | -from typing import Callable, Iterable, Optional, Sequence, Union |
| 2 | +from typing import Callable, Iterable, List, Optional, Sequence, Union |
3 | 3 |
|
4 | 4 | from .paratext_project_file_handler import ParatextProjectFileHandler |
5 | 5 | from .paratext_project_settings import ParatextProjectSettings |
|
10 | 10 | UpdateUsfmRow, |
11 | 11 | UpdateUsfmTextBehavior, |
12 | 12 | ) |
13 | | -from .usfm_parser import parse_usfm |
| 13 | +from .usfm_parser import UsfmParser |
| 14 | +from .usfm_token import UsfmTokenType |
| 15 | +from .usfm_tokenizer import UsfmToken, UsfmTokenizer |
14 | 16 | from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError |
15 | 17 |
|
16 | 18 |
|
@@ -61,12 +63,38 @@ def update_usfm( |
61 | 63 | compare_segments=compare_segments, |
62 | 64 | ) |
63 | 65 | try: |
64 | | - parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) |
65 | | - return handler.get_usfm(self._settings.stylesheet, chapters) |
| 66 | + tokenizer = UsfmTokenizer(self._settings.stylesheet) |
| 67 | + tokens = tokenizer.tokenize(usfm) |
| 68 | + tokens = self.filter_tokens_by_chapter(tokens, chapters) |
| 69 | + parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification) |
| 70 | + parser.process_tokens() |
| 71 | + return handler.get_usfm(self._settings.stylesheet) |
66 | 72 | except Exception as e: |
67 | 73 | error_message = ( |
68 | 74 | f"An error occurred while parsing the usfm for '{book_id}'" |
69 | 75 | f"{f' in project {self._settings.name}' if self._settings.name else ''}" |
70 | 76 | f". Error: '{e}'" |
71 | 77 | ) |
72 | 78 | raise RuntimeError(error_message) from e |
| 79 | + |
def filter_tokens_by_chapter(
    self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
) -> Sequence[UsfmToken]:
    """Return only the tokens that belong to the requested chapters.

    When ``chapters`` is ``None`` the input sequence is returned unchanged.
    Otherwise a new list is built that contains:

    * the first token, if its marker is ``id``;
    * the tokens between that leading ``id`` token and the first chapter
      token, but only when chapter 1 is requested;
    * every chapter token whose number is in ``chapters``, together with
      the tokens that follow it up to the next chapter token.

    A chapter token with empty data is treated as not requested. Chapter
    data is converted with ``int()``, so non-numeric data raises
    ``ValueError``.
    """
    if chapters is None:
        return tokens

    kept: List[UsfmToken] = []
    keeping = False
    for position, current in enumerate(tokens):
        # A leading \id token is always retained; what follows it is kept
        # only when chapter 1 was asked for.
        if position == 0 and current.marker == "id":
            kept.append(current)
            keeping = 1 in chapters
            continue

        # Each chapter token switches retention on or off for its contents.
        if current.type == UsfmTokenType.CHAPTER:
            keeping = bool(current.data) and int(current.data) in chapters
            if keeping:
                kept.append(current)
            continue

        if keeping:
            kept.append(current)
    return kept
0 commit comments