Skip to content

Commit 8087e0d

Browse files
committed
Change data_type property to content_type; address other reviewer comments
1 parent f25702e commit 8087e0d

17 files changed

Lines changed: 64 additions & 70 deletions

machine/corpora/memory_text.py

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -7,14 +7,9 @@
77

88

99
class MemoryText(Text):
10-
def __init__(
11-
self, id: str, rows: Iterable[TextRow] = [], data_type: TextRowContentType = TextRowContentType.SEGMENT
12-
) -> None:
10+
def __init__(self, id: str, rows: Iterable[TextRow] = []) -> None:
1311
self._id = id
1412
self._rows = list(rows)
15-
if any([r.data_type != data_type for r in self._rows]):
16-
raise ValueError(f"{type(data_type)} of rows must match text {type(data_type)} {data_type}")
17-
self._data_type = data_type
1813

1914
@property
2015
def id(self) -> str:
@@ -24,9 +19,5 @@ def id(self) -> str:
2419
def sort_key(self) -> str:
2520
return self._id
2621

27-
@property
28-
def data_type(self) -> TextRowContentType:
29-
return self._data_type
30-
3122
def _get_rows(self) -> Generator[TextRow, None, None]:
3223
return gen(self._rows)

machine/corpora/n_parallel_text_corpus.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ class _RangeRow:
1515
refs: List[Any]
1616
segment: List[str]
1717
is_sentence_start: bool = False
18-
data_type: TextRowContentType = TextRowContentType.SEGMENT
18+
content_type: TextRowContentType = TextRowContentType.SEGMENT
1919

2020
@property
2121
def is_in_range(self):
@@ -38,7 +38,7 @@ def __init__(self, n: int):
3838
self.text_id = ""
3939
self.versifications: Optional[List[Versification]] = None
4040
self.row_ref_comparer = None
41-
self.data_type = TextRowContentType.SEGMENT
41+
self.content_type = TextRowContentType.SEGMENT
4242

4343
@property
4444
def is_in_range(self) -> bool:
@@ -47,7 +47,7 @@ def is_in_range(self) -> bool:
4747
def add_text_row(self, row: TextRow, index: int):
4848
self.text_id = row.text_id
4949
self.rows[index].refs.append(row.ref)
50-
self.rows[index].data_type = row.data_type
50+
self.rows[index].content_type = row.content_type
5151
if self.rows[index].is_empty:
5252
self.rows[index].is_sentence_start = row.is_sentence_start
5353
self.rows[index].segment.extend(row.segment)
@@ -57,7 +57,7 @@ def create_row(self) -> NParallelTextRow:
5757
reference_refs: List[Any] = [r.refs[0] if len(r.refs) > 0 else None for r in self.rows if len(r.refs) > 0]
5858
for i in range(len(self.rows)):
5959
row = self.rows[i]
60-
self.data_type = row.data_type
60+
self.content_type = row.content_type
6161

6262
if (
6363
self.versifications is not None
@@ -67,7 +67,7 @@ def create_row(self) -> NParallelTextRow:
6767
refs[i] = [cast(ScriptureRef, r).change_versification(self.versifications[i]) for r in reference_refs]
6868
else:
6969
refs[i] = row.refs.copy()
70-
n_parallel_text_row = NParallelTextRow(self.text_id, refs, self.data_type)
70+
n_parallel_text_row = NParallelTextRow(self.text_id, refs, self.content_type)
7171
n_parallel_text_row.n_segments = [r.segment.copy() for r in self.rows]
7272
n_parallel_text_row.n_flags = [
7373
TextRowFlags.SENTENCE_START if r.is_sentence_start else TextRowFlags.NONE for r in self.rows
@@ -293,7 +293,7 @@ def _create_rows(
293293
yield range_info.create_row()
294294

295295
default_refs = [[r.ref for r in rows if r is not None][0]]
296-
data_type = TextRowContentType.SEGMENT
296+
content_type = TextRowContentType.SEGMENT
297297

298298
text_id: Optional[str] = None
299299
refs: List[List[Any]] = []
@@ -304,7 +304,7 @@ def _create_rows(
304304
for i in range(len(rows)):
305305
row = rows[i]
306306
if row is not None:
307-
data_type = row.data_type
307+
content_type = row.content_type
308308
text_id = text_id or row.text_id
309309
if self.corpora[i].is_scripture:
310310
refs[i] = self._correct_versification([row.ref] if row.ref is None else default_refs, i)
@@ -321,7 +321,7 @@ def _create_rows(
321321
)
322322
refs = [r or default_refs for r in refs]
323323

324-
new_row = NParallelTextRow(cast(str, text_id), refs, data_type)
324+
new_row = NParallelTextRow(cast(str, text_id), refs, content_type)
325325
new_row.n_segments = [r.segment if r is not None else [] for r in rows]
326326
new_row.n_flags = flags
327327
yield new_row

machine/corpora/n_parallel_text_row.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,10 @@
66

77
class NParallelTextRow:
88
def __init__(
9-
self, text_id: str, n_refs: Sequence[Sequence[Any]], data_type: TextRowContentType = TextRowContentType.SEGMENT
9+
self,
10+
text_id: str,
11+
n_refs: Sequence[Sequence[Any]],
12+
content_type: TextRowContentType = TextRowContentType.SEGMENT,
1013
):
1114
if len([n_ref for n_ref in n_refs if n_ref is not None and len(n_ref) > 0]) == 0:
1215
raise ValueError(f"Refs must be provided but n_refs={n_refs}")
@@ -15,7 +18,7 @@ def __init__(
1518
self._n = len(n_refs)
1619
self.n_segments: Sequence[Sequence[str]] = [[] for _ in range(0, self._n)]
1720
self.n_flags: Sequence[TextRowFlags] = [TextRowFlags.SENTENCE_START for _ in range(0, self._n)]
18-
self._data_type = data_type
21+
self._content_type = content_type
1922

2023
@property
2124
def text_id(self) -> str:
@@ -26,8 +29,8 @@ def ref(self) -> Any:
2629
return self._n_refs[0][0]
2730

2831
@property
29-
def data_type(self) -> TextRowContentType:
30-
return self._data_type
32+
def content_type(self) -> TextRowContentType:
33+
return self._content_type
3134

3235
@property
3336
def n_refs(self) -> Sequence[Sequence[Any]]:
@@ -50,6 +53,6 @@ def text(self, i: int) -> str:
5053
return " ".join(self.n_segments[i])
5154

5255
def invert(self) -> "NParallelTextRow":
53-
inverted_row = NParallelTextRow(self._text_id, list(reversed(self._n_refs)), data_type=self.data_type)
56+
inverted_row = NParallelTextRow(self._text_id, list(reversed(self._n_refs)), content_type=self.content_type)
5457
inverted_row.n_flags = list(reversed(self.n_flags))
5558
return inverted_row

machine/corpora/parallel_text_corpus.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -402,7 +402,7 @@ def to_hf_dataset(
402402
ref_column: Optional[str] = "ref",
403403
translation_column: str = "translation",
404404
alignment_column: Optional[str] = "alignment",
405-
data_type_column: Optional[str] = "data_type",
405+
content_type_column: Optional[str] = "content_type",
406406
) -> Dataset:
407407
try:
408408
from datasets.arrow_dataset import Dataset
@@ -418,8 +418,8 @@ def to_hf_dataset(
418418
features_dict[ref_column] = Sequence(Value("string"))
419419
if alignment_column is not None:
420420
features_dict[alignment_column] = Sequence({source_lang: Value("int32"), target_lang: Value("int32")})
421-
if data_type_column is not None:
422-
features_dict[data_type_column] = ClassLabel(names=[e.name for e in TextRowContentType])
421+
if content_type_column is not None:
422+
features_dict[content_type_column] = ClassLabel(names=[e.name for e in TextRowContentType])
423423
features = Features(features_dict)
424424

425425
def iterable() -> Iterable[dict]:
@@ -430,8 +430,8 @@ def iterable() -> Iterable[dict]:
430430
example[text_id_column] = row.text_id
431431
if ref_column is not None:
432432
example[ref_column] = row.refs
433-
if data_type_column is not None:
434-
example[data_type_column] = row.data_type.name
433+
if content_type_column is not None:
434+
example[content_type_column] = row.content_type.name
435435
example[translation_column] = {source_lang: row.source_text, target_lang: row.target_text}
436436
if alignment_column is not None:
437437
src_indices: List[int] = []

machine/corpora/parallel_text_row.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ def __init__(
1818
aligned_word_pairs: Optional[Collection[AlignedWordPair]] = None,
1919
source_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
2020
target_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
21-
data_type: TextRowContentType = TextRowContentType.SEGMENT,
21+
content_type: TextRowContentType = TextRowContentType.SEGMENT,
2222
) -> None:
2323
if not text_id:
2424
raise ValueError("A text_id must be set.")
@@ -27,7 +27,7 @@ def __init__(
2727
self._text_id = text_id
2828
self._source_refs = source_refs
2929
self._target_refs = target_refs
30-
self._data_type = data_type
30+
self._content_type = content_type
3131
self.source_segment = source_segment
3232
self.target_segment = target_segment
3333
self.aligned_word_pairs = aligned_word_pairs
@@ -55,8 +55,8 @@ def refs(self) -> Sequence[Any]:
5555
return self.target_refs if len(self.source_refs) == 0 else self.source_refs
5656

5757
@property
58-
def data_type(self) -> TextRowContentType:
59-
return self._data_type
58+
def content_type(self) -> TextRowContentType:
59+
return self._content_type
6060

6161
@property
6262
def is_source_sentence_start(self) -> bool:
@@ -114,5 +114,5 @@ def invert(self) -> ParallelTextRow:
114114
None if self.aligned_word_pairs is None else [wp.invert() for wp in self.aligned_word_pairs],
115115
self.target_flags,
116116
self.source_flags,
117-
self.data_type,
117+
self.content_type,
118118
)

machine/corpora/paratext_backup_terms_corpus.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,9 +29,9 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g
2929
text = MemoryText(
3030
text_id,
3131
[
32-
TextRow(text_id, key_term.id, key_term.renderings, data_type=TextRowContentType.WORD)
32+
TextRow(text_id, key_term.id, key_term.renderings, content_type=TextRowContentType.WORD)
3333
for key_term in key_terms
3434
],
35-
data_type=TextRowContentType.WORD,
35+
content_type=TextRowContentType.WORD,
3636
)
3737
self._add_text(text)

machine/corpora/scripture_text.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212

1313
class ScriptureText(TextBase):
1414
def __init__(self, id: str, versification: Optional[Versification] = None) -> None:
15-
super().__init__(id, get_scripture_text_sort_key(id), data_type=TextRowContentType.SEGMENT)
15+
super().__init__(id, get_scripture_text_sort_key(id), content_type=TextRowContentType.SEGMENT)
1616
self._versification = ENGLISH_VERSIFICATION if versification is None else versification
1717

1818
@property

machine/corpora/standard_parallel_text_corpus.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -87,5 +87,5 @@ def _get_rows(self, text_ids: Optional[Iterable[str]]) -> Generator[ParallelText
8787
if compare_alignment_corpus == 0 and alignment_row is not None
8888
else None
8989
),
90-
data_type=n_row.data_type,
90+
content_type=n_row.content_type,
9191
)

machine/corpora/text.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,3 @@ def id(self) -> str: ...
1313
@property
1414
@abstractmethod
1515
def sort_key(self) -> str: ...
16-
17-
@property
18-
@abstractmethod
19-
def data_type(self) -> TextRowContentType: ...

machine/corpora/text_base.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6,10 +6,10 @@
66

77

88
class TextBase(Text):
9-
def __init__(self, id: str, sort_key: str, data_type: TextRowContentType = TextRowContentType.SEGMENT) -> None:
9+
def __init__(self, id: str, sort_key: str, content_type: TextRowContentType = TextRowContentType.SEGMENT) -> None:
1010
self._id = id
1111
self._sort_key = sort_key
12-
self._data_type = data_type
12+
self._content_type = content_type
1313

1414
@property
1515
def id(self) -> str:
@@ -20,12 +20,12 @@ def sort_key(self) -> str:
2020
return self._sort_key
2121

2222
@property
23-
def data_type(self) -> TextRowContentType:
24-
return self._data_type
23+
def content_type(self) -> TextRowContentType:
24+
return self._content_type
2525

2626
def _create_row(self, text: str, ref: Any, flags: TextRowFlags = TextRowFlags.SENTENCE_START) -> TextRow:
2727
text = text.strip()
28-
return TextRow(self.id, ref, [text] if len(text) > 0 else [], flags, data_type=self.data_type)
28+
return TextRow(self.id, ref, [text] if len(text) > 0 else [], flags, content_type=self.content_type)
2929

3030
def _create_empty_row(self, ref: Any, flags: TextRowFlags = TextRowFlags.NONE) -> TextRow:
31-
return TextRow(self.id, ref, flags=flags, data_type=self.data_type)
31+
return TextRow(self.id, ref, flags=flags, content_type=self.content_type)

0 commit comments

Comments
 (0)