Skip to content

Commit f1dc4f1

Browse files
authored
Key terms updates for SILNLP and Serval consistency (#257)
* Update key terms handling to support silnlp features
* Port sillsdev/machine#362
* Remove non-pt localizations from pt xml (to match Machine C#)
* Add key terms as partial words
* Port 'Expose chapter numbers' sillsdev/machine#369
* Add content type property to rows
1 parent acff116 commit f1dc4f1

34 files changed

Lines changed: 430 additions & 14324 deletions

machine/corpora/BiblicalTermsPt.xml

Lines changed: 0 additions & 14205 deletions
Large diffs are not rendered by default.

machine/corpora/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from .dictionary_text_corpus import DictionaryTextCorpus
1010
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
1111
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
12+
from .file_paratext_project_terms_parser import FileParatextProjectTermsParser
1213
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
1314
from .file_paratext_project_versification_error_detector import FileParatextProjectVersificationErrorDetector
1415
from .flatten import flatten
@@ -25,7 +26,7 @@
2526
from .paratext_project_file_handler import ParatextProjectFileHandler
2627
from .paratext_project_settings import ParatextProjectSettings
2728
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
28-
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
29+
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
2930
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
3031
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
3132
from .paratext_text_corpus import ParatextTextCorpus
@@ -47,6 +48,7 @@
4748
from .text_file_text import TextFileText
4849
from .text_file_text_corpus import TextFileTextCorpus
4950
from .text_row import TextRow, TextRowFlags
51+
from .text_row_content_type import TextRowContentType
5052
from .token_processors import (
5153
escape_spaces,
5254
lowercase,
@@ -101,6 +103,7 @@
101103
"batch",
102104
"Corpus",
103105
"create_versification_ref_corpus",
106+
"TextRowContentType",
104107
"DblBundleTextCorpus",
105108
"DictionaryAlignmentCorpus",
106109
"DictionaryTextCorpus",
@@ -109,10 +112,12 @@
109112
"extract_scripture_corpus",
110113
"FileParatextProjectFileHandler",
111114
"FileParatextProjectSettingsParser",
115+
"FileParatextProjectTermsParser",
112116
"FileParatextProjectTextUpdater",
113117
"FileParatextProjectVersificationErrorDetector",
114118
"flatten",
115119
"is_scripture",
120+
"KeyTerm",
116121
"lowercase",
117122
"MemoryAlignmentCollection",
118123
"MemoryStreamContainer",

machine/corpora/corpora_utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,14 @@ def get_split_indices(
4949
return set(rand.sample(range(corpus_size), min(split_size, corpus_size)))
5050

5151

52-
def get_files(file_patterns: Iterable[str]) -> Iterable[Tuple[str, str]]:
52+
def get_files(file_patterns: Iterable[str]) -> Iterable[Tuple[str, str, int]]:
5353
file_patterns = list(file_patterns)
5454
if len(file_patterns) == 1 and os.path.isfile(file_patterns[0]):
55-
yield ("*all*", file_patterns[0])
55+
yield ("*all*", file_patterns[0], 0)
5656
else:
5757
for i, file_pattern in enumerate(file_patterns):
5858
if os.path.isfile(file_pattern):
59-
yield (str(i), file_pattern)
59+
yield (str(i), file_pattern, i)
6060
continue
6161

6262
if "*" not in file_pattern and "?" not in file_pattern and not os.path.exists(file_pattern):
@@ -89,7 +89,7 @@ def get_files(file_patterns: Iterable[str]) -> Iterable[Tuple[str, str]]:
8989
updated_id += group
9090
if len(updated_id) > 0:
9191
id = updated_id
92-
yield (id, filename)
92+
yield (id, filename, i)
9393

9494

9595
def gen(iterable: Iterable[T] = []) -> Generator[T, None, None]:
machine/corpora/file_paratext_project_terms_parser.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from ..utils.typeshed import StrPath
2+
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
3+
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
4+
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
5+
6+
7+
class FileParatextProjectTermsParser(ParatextProjectTermsParserBase):
8+
def __init__(self, project_dir: StrPath) -> None:
9+
super().__init__(
10+
FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse()
11+
)

machine/corpora/key_term.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from dataclasses import dataclass
2+
from typing import List
3+
4+
from ..scripture.verse_ref import VerseRef
5+
6+
7+
@dataclass(frozen=True)
8+
class KeyTerm:
9+
id: str
10+
category: str
11+
domain: str
12+
renderings: List[str]
13+
references: List[VerseRef]
14+
renderings_patterns: List[str]

machine/corpora/n_parallel_text_corpus.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,14 @@
88
from .text_corpus import TextCorpus
99
from .text_corpus_enumerator import TextCorpusEnumerator
1010
from .text_row import TextRow, TextRowFlags
11+
from .text_row_content_type import TextRowContentType
1112

1213

1314
class _RangeRow:
1415
refs: List[Any]
1516
segment: List[str]
1617
is_sentence_start: bool = False
18+
content_type: TextRowContentType = TextRowContentType.SEGMENT
1719

1820
@property
1921
def is_in_range(self):
@@ -36,6 +38,7 @@ def __init__(self, n: int):
3638
self.text_id = ""
3739
self.versifications: Optional[List[Versification]] = None
3840
self.row_ref_comparer = None
41+
self.content_type = TextRowContentType.SEGMENT
3942

4043
@property
4144
def is_in_range(self) -> bool:
@@ -44,6 +47,7 @@ def is_in_range(self) -> bool:
4447
def add_text_row(self, row: TextRow, index: int):
4548
self.text_id = row.text_id
4649
self.rows[index].refs.append(row.ref)
50+
self.rows[index].content_type = row.content_type
4751
if self.rows[index].is_empty:
4852
self.rows[index].is_sentence_start = row.is_sentence_start
4953
self.rows[index].segment.extend(row.segment)
@@ -53,6 +57,7 @@ def create_row(self) -> NParallelTextRow:
5357
reference_refs: List[Any] = [r.refs[0] if len(r.refs) > 0 else None for r in self.rows if len(r.refs) > 0]
5458
for i in range(len(self.rows)):
5559
row = self.rows[i]
60+
self.content_type = row.content_type
5661

5762
if (
5863
self.versifications is not None
@@ -62,7 +67,7 @@ def create_row(self) -> NParallelTextRow:
6267
refs[i] = [cast(ScriptureRef, r).change_versification(self.versifications[i]) for r in reference_refs]
6368
else:
6469
refs[i] = row.refs.copy()
65-
n_parallel_text_row = NParallelTextRow(self.text_id, refs)
70+
n_parallel_text_row = NParallelTextRow(self.text_id, refs, self.content_type)
6671
n_parallel_text_row.n_segments = [r.segment.copy() for r in self.rows]
6772
n_parallel_text_row.n_flags = [
6873
TextRowFlags.SENTENCE_START if r.is_sentence_start else TextRowFlags.NONE for r in self.rows
@@ -288,6 +293,7 @@ def _create_rows(
288293
yield range_info.create_row()
289294

290295
default_refs = [[r.ref for r in rows if r is not None][0]]
296+
content_type = TextRowContentType.SEGMENT
291297

292298
text_id: Optional[str] = None
293299
refs: List[List[Any]] = []
@@ -298,6 +304,7 @@ def _create_rows(
298304
for i in range(len(rows)):
299305
row = rows[i]
300306
if row is not None:
307+
content_type = row.content_type
301308
text_id = text_id or row.text_id
302309
if self.corpora[i].is_scripture:
303310
refs[i] = self._correct_versification([row.ref] if row.ref is None else default_refs, i)
@@ -314,7 +321,7 @@ def _create_rows(
314321
)
315322
refs = [r or default_refs for r in refs]
316323

317-
new_row = NParallelTextRow(cast(str, text_id), refs)
324+
new_row = NParallelTextRow(cast(str, text_id), refs, content_type)
318325
new_row.n_segments = [r.segment if r is not None else [] for r in rows]
319326
new_row.n_flags = flags
320327
yield new_row

machine/corpora/n_parallel_text_row.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
11
from typing import Any, Sequence
22

33
from .text_row import TextRowFlags
4+
from .text_row_content_type import TextRowContentType
45

56

67
class NParallelTextRow:
7-
def __init__(self, text_id: str, n_refs: Sequence[Sequence[Any]]):
8+
def __init__(
9+
self,
10+
text_id: str,
11+
n_refs: Sequence[Sequence[Any]],
12+
content_type: TextRowContentType = TextRowContentType.SEGMENT,
13+
):
814
if len([n_ref for n_ref in n_refs if n_ref is not None and len(n_ref) > 0]) == 0:
915
raise ValueError(f"Refs must be provided but n_refs={n_refs}")
1016
self._text_id = text_id
1117
self._n_refs = n_refs
1218
self._n = len(n_refs)
1319
self.n_segments: Sequence[Sequence[str]] = [[] for _ in range(0, self._n)]
1420
self.n_flags: Sequence[TextRowFlags] = [TextRowFlags.SENTENCE_START for _ in range(0, self._n)]
21+
self._content_type = content_type
1522

1623
@property
1724
def text_id(self) -> str:
@@ -21,6 +28,10 @@ def text_id(self) -> str:
2128
def ref(self) -> Any:
2229
return self._n_refs[0][0]
2330

31+
@property
32+
def content_type(self) -> TextRowContentType:
33+
return self._content_type
34+
2435
@property
2536
def n_refs(self) -> Sequence[Sequence[Any]]:
2637
return self._n_refs
@@ -42,6 +53,6 @@ def text(self, i: int) -> str:
4253
return " ".join(self.n_segments[i])
4354

4455
def invert(self) -> "NParallelTextRow":
45-
inverted_row = NParallelTextRow(self._text_id, list(reversed(self._n_refs)))
56+
inverted_row = NParallelTextRow(self._text_id, list(reversed(self._n_refs)), content_type=self.content_type)
4657
inverted_row.n_flags = list(reversed(self.n_flags))
4758
return inverted_row

machine/corpora/parallel_text_corpus.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from .corpora_utils import get_split_indices
2828
from .corpus import Corpus
2929
from .parallel_text_row import ParallelTextRow
30+
from .text_row_content_type import TextRowContentType
3031
from .token_processors import escape_spaces, lowercase, normalize, unescape_spaces
3132

3233
if TYPE_CHECKING:
@@ -401,10 +402,11 @@ def to_hf_dataset(
401402
ref_column: Optional[str] = "ref",
402403
translation_column: str = "translation",
403404
alignment_column: Optional[str] = "alignment",
405+
content_type_column: Optional[str] = "content_type",
404406
) -> Dataset:
405407
try:
406408
from datasets.arrow_dataset import Dataset
407-
from datasets.features.features import Features, FeatureType, Sequence, Value
409+
from datasets.features.features import ClassLabel, Features, FeatureType, Sequence, Value
408410
from datasets.features.translation import Translation
409411
except ImportError:
410412
raise RuntimeError("datasets is not installed.")
@@ -416,6 +418,8 @@ def to_hf_dataset(
416418
features_dict[ref_column] = Sequence(Value("string"))
417419
if alignment_column is not None:
418420
features_dict[alignment_column] = Sequence({source_lang: Value("int32"), target_lang: Value("int32")})
421+
if content_type_column is not None:
422+
features_dict[content_type_column] = ClassLabel(names=[e.name for e in TextRowContentType])
419423
features = Features(features_dict)
420424

421425
def iterable() -> Iterable[dict]:
@@ -426,6 +430,8 @@ def iterable() -> Iterable[dict]:
426430
example[text_id_column] = row.text_id
427431
if ref_column is not None:
428432
example[ref_column] = row.refs
433+
if content_type_column is not None:
434+
example[content_type_column] = row.content_type.name
429435
example[translation_column] = {source_lang: row.source_text, target_lang: row.target_text}
430436
if alignment_column is not None:
431437
src_indices: List[int] = []

machine/corpora/parallel_text_row.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from .aligned_word_pair import AlignedWordPair
66
from .text_row import TextRowFlags
7+
from .text_row_content_type import TextRowContentType
78

89

910
class ParallelTextRow(Sequence[Sequence[str]]):
@@ -17,6 +18,7 @@ def __init__(
1718
aligned_word_pairs: Optional[Collection[AlignedWordPair]] = None,
1819
source_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
1920
target_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
21+
content_type: TextRowContentType = TextRowContentType.SEGMENT,
2022
) -> None:
2123
if not text_id:
2224
raise ValueError("A text_id must be set.")
@@ -25,6 +27,7 @@ def __init__(
2527
self._text_id = text_id
2628
self._source_refs = source_refs
2729
self._target_refs = target_refs
30+
self._content_type = content_type
2831
self.source_segment = source_segment
2932
self.target_segment = target_segment
3033
self.aligned_word_pairs = aligned_word_pairs
@@ -51,6 +54,10 @@ def ref(self) -> Any:
5154
def refs(self) -> Sequence[Any]:
5255
return self.target_refs if len(self.source_refs) == 0 else self.source_refs
5356

57+
@property
58+
def content_type(self) -> TextRowContentType:
59+
return self._content_type
60+
5461
@property
5562
def is_source_sentence_start(self) -> bool:
5663
return TextRowFlags.SENTENCE_START in self.source_flags
@@ -107,4 +114,5 @@ def invert(self) -> ParallelTextRow:
107114
None if self.aligned_word_pairs is None else [wp.invert() for wp in self.aligned_word_pairs],
108115
self.target_flags,
109116
self.source_flags,
117+
self.content_type,
110118
)

machine/corpora/paratext_backup_terms_corpus.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1-
from typing import List, Sequence, Tuple
1+
from typing import Sequence
22
from zipfile import ZipFile
33

44
from ..utils.typeshed import StrPath
55
from .dictionary_text_corpus import DictionaryTextCorpus
6+
from .key_term import KeyTerm
67
from .memory_text import MemoryText
78
from .text_row import TextRow
9+
from .text_row_content_type import TextRowContentType
810
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
911
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
1012

@@ -15,7 +17,7 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g
1517

1618
with ZipFile(filename, "r") as archive:
1719
settings = ZipParatextProjectSettingsParser(archive).parse()
18-
glosses: List[Tuple[str, List[str]]] = ZipParatextProjectTermsParser(archive, settings).parse(
20+
key_terms: Sequence[KeyTerm] = ZipParatextProjectTermsParser(archive, settings).parse(
1921
term_categories, use_term_glosses
2022
)
2123
text_id = (
@@ -24,5 +26,11 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g
2426
f"{settings.biblical_terms_file_name}"
2527
)
2628

27-
text = MemoryText(text_id, [TextRow(text_id, kvp[0], kvp[1]) for kvp in glosses])
29+
text = MemoryText(
30+
text_id,
31+
[
32+
TextRow(text_id, key_term.id, key_term.renderings, content_type=TextRowContentType.WORD)
33+
for key_term in key_terms
34+
],
35+
)
2836
self._add_text(text)

0 commit comments

Comments
 (0)