Skip to content

Commit ece862b

Browse files
committed
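normalize_for_perplexity: always mark sentence boundaries with a shad; remove space_after_tshegs; default space_sskt to True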
1 parent effa76d commit ece862b

1 file changed

Lines changed: 13 additions & 20 deletions

File tree

botok/utils/corpus_normalization.py

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,6 @@ def normalize_corpus(
135135
# Keep only Tibetan block (U+0F00-U+0FFF), ASCII space, and the digit placeholder D
136136
_NON_TIBETAN_RE = re.compile(r"[^\u0F00-\u0FFF D]")
137137
_PUNCT_OR_SPACE_RE = re.compile(rf"[{_PUNCT} ]+")
138-
# space_after_tshegs: any punct-containing run → single shad surrounded by spaces
139-
_PUNCT_RUN_RE = re.compile(rf"[ ]*[{_PUNCT}][{_PUNCT} ]*")
140138
_MULTI_SPACE_RE = re.compile(r" {2,}")
141139
# Split on tsheg or space while capturing the delimiter
142140
_TSHEG_OR_SPACE_RE = re.compile(r"(\u0F0B| )")
@@ -222,25 +220,26 @@ def flush_sskt() -> None:
222220

223221
def normalize_for_perplexity(
224222
text: str,
225-
space_after_tshegs: bool = False,
226-
space_sskt: bool = False,
223+
space_sskt: bool = True,
227224
fold_sskt: bool = False,
228225
) -> str:
229226
"""
230227
Normalize Tibetan text for perplexity calculation.
231228
229+
Every sentence boundary — whether marked by Tibetan punctuation
230+
(U+0F0D–U+0F14) or a plain space in the source — is rendered as a shad
231+
token (`` ། ``) surrounded by spaces. Syllables within a sentence are
232+
separated by plain spaces (tshegs are removed). The result uses space
233+
as the sole token delimiter and ``།`` as an explicit sentence-boundary
234+
marker.
235+
232236
Parameters
233237
----------
234238
text:
235239
Input text.
236-
space_after_tshegs:
237-
If ``True``, punctuation sequences are replaced by a shad (``།``)
238-
surrounded by spaces instead of a plain space, making sentence
239-
boundaries explicit.
240240
space_sskt:
241241
If ``True``, non-standard (Sanskrit) syllables are split into their
242-
constituent stacks, each separated by a space. Recommended
243-
together with ``space_after_tshegs``.
242+
constituent stacks, each separated by a space. Defaults to ``True``.
244243
fold_sskt:
245244
If ``True``, consecutive runs of non-standard (Sanskrit) syllables
246245
are collapsed to the single placeholder token ``S``. Takes
@@ -263,10 +262,8 @@ def normalize_for_perplexity(
263262
with commas) with the placeholder ``D``.
264263
8. Strip any character outside the Tibetan Unicode block (U+0F00-
265264
U+0FFF), keeping spaces and the ``D`` placeholder.
266-
9. Collapse punctuation / space runs:
267-
- default: to a single space.
268-
- space_after_tshegs: punct-containing runs → `` ། ``
269-
(shad surrounded by spaces); remaining space runs collapsed.
265+
9. Replace every run of punctuation and/or spaces with a single shad
266+
(U+0F0D) surrounded by spaces: `` ། ``.
270267
9b. (space_sskt / fold_sskt) Process non-standard syllable tokens.
271268
10. Replace every remaining tsheg (U+0F0B) with a space; collapse
272269
multiple spaces. Space is now the sole token delimiter.
@@ -299,12 +296,8 @@ def normalize_for_perplexity(
299296
# 8) Strip characters outside the Tibetan block (keep D placeholder and spaces)
300297
text = _NON_TIBETAN_RE.sub(" ", text)
301298

302-
# 9) Collapse punctuation / space runs
303-
if space_after_tshegs:
304-
text = _PUNCT_RUN_RE.sub(" \u0F0D ", text)
305-
text = _MULTI_SPACE_RE.sub(" ", text)
306-
else:
307-
text = _PUNCT_OR_SPACE_RE.sub(" ", text)
299+
# 9) Any run of punctuation and/or spaces → shad token surrounded by spaces
300+
text = _PUNCT_OR_SPACE_RE.sub(" \u0F0D ", text)
308301

309302
# 9b) Sanskrit syllable handling
310303
if space_sskt or fold_sskt:

0 commit comments

Comments
 (0)