Skip to content

Commit effa76d

Browse files
committed
use space as delimiters for perplexity
1 parent 4751b15 commit effa76d

1 file changed

Lines changed: 14 additions & 21 deletions

File tree

botok/utils/corpus_normalization.py

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -131,19 +131,15 @@ def normalize_corpus(
131131
_LETTER_BEFORE_NL_RE = re.compile(rf"([{_LETTER}])\n")
132132
_YIG_MGO_RE = re.compile(rf"[{_YIG_MGO_START}]+[{_PUNCT}]*")
133133
# Tibetan digits U+0F20-U+0F33, ASCII digits; comma or space as thousands-separator
134-
_DIGIT_RUN_RE = re.compile(r"[0-9\u0F20-\u0F33][0-9\u0F20-\u0F33,]*")
134+
_DIGIT_RUN_RE = re.compile(r"[0-9\u0F20-\u0F33][0-9\u0F20-\u0F33, ]*")
135135
# Keep only Tibetan block (U+0F00-U+0FFF), ASCII space, and the digit placeholder D
136136
_NON_TIBETAN_RE = re.compile(r"[^\u0F00-\u0FFF D]")
137137
_PUNCT_OR_SPACE_RE = re.compile(rf"[{_PUNCT} ]+")
138-
_LETTER_SPACE_RE = re.compile(rf"([{_LETTER}]) ")
139-
_LETTER_SPACE_REPL = r"\1" + "\u0F0B " # backreference + literal tsheg + space
140138
# space_after_tshegs: any punct-containing run → single shad surrounded by spaces
141139
_PUNCT_RUN_RE = re.compile(rf"[ ]*[{_PUNCT}][{_PUNCT} ]*")
142140
_MULTI_SPACE_RE = re.compile(r" {2,}")
143141
# Split on tsheg or space while capturing the delimiter
144142
_TSHEG_OR_SPACE_RE = re.compile(r"(\u0F0B| )")
145-
# space_after_tshegs: insert a space after any tsheg not already followed by one
146-
_TSHEG_NO_SPACE_RE = re.compile(r"\u0F0B(?! )")
147143

148144

149145
def _process_sskt(text: str, space_sskt: bool, fold_sskt: bool) -> str:
@@ -153,8 +149,8 @@ def _process_sskt(text: str, space_sskt: bool, fold_sskt: bool) -> str:
153149
mark sentence/clause boundaries. The function walks each tsheg-delimited
154150
piece and, for non-standard syllables:
155151
156-
* ``space_sskt``: expands the syllable into its constituent stacks, each
157-
followed by a tsheg, with spaces between stacks.
152+
* ``space_sskt``: expands the syllable into its constituent stacks,
153+
separated by spaces.
158154
* ``fold_sskt``: accumulates consecutive non-standard syllables and
159155
replaces the whole run with the placeholder ``S``.
160156
"""
@@ -208,10 +204,10 @@ def flush_sskt() -> None:
208204
out.append(delim)
209205
elif space_sskt:
210206
stacks = split_into_stacks(content)
211-
out.append(" ".join(s + "\u0F0B" for s in stacks))
207+
out.append(" ".join(stacks))
212208
if delim == " ":
213209
out.append(delim)
214-
# tsheg delimiter absorbed as the tsheg of the last stack
210+
# tsheg delimiter absorbed (tshegs are stripped at end of pipeline)
215211
else:
216212
out.append(content)
217213
if delim:
@@ -240,11 +236,10 @@ def normalize_for_perplexity(
240236
space_after_tshegs:
241237
If ``True``, punctuation sequences are replaced by a shad (``།``)
242238
surrounded by spaces instead of a plain space, making sentence
243-
boundaries explicit. Each syllable then ends with ``་ `` (tsheg +
244-
space).
239+
boundaries explicit.
245240
space_sskt:
246241
If ``True``, non-standard (Sanskrit) syllables are split into their
247-
constituent stacks, each receiving its own tsheg. Recommended
242+
constituent stacks, separated by spaces. Recommended
248243
together with ``space_after_tshegs``.
249244
fold_sskt:
250245
If ``True``, consecutive runs of non-standard (Sanskrit) syllables
@@ -253,7 +248,8 @@ def normalize_for_perplexity(
253248
254249
Steps applied after ``normalize_corpus``:
255250
1. Replace NYIS TSHEG (U+0FD2) with TSHEG (U+0F0B); fold runs of
256-
consecutive TSHEGs to one.
251+
consecutive TSHEGs to one. Tshegs serve as syllable delimiters
252+
throughout intermediate processing and are removed in step 10.
257253
2. Remove honorific particles U+0F35 / U+0F37 and TSA-PHRU (U+0F39).
258254
3. Normalize nasalization marks: NYI ZLA (U+0F82) and SNA LDAN
259255
(U+0F83) → RJES SU NGA RO (U+0F7E).
@@ -272,8 +268,8 @@ def normalize_for_perplexity(
272268
- space_after_tshegs: punct-containing runs → `` ། ``
273269
(shad surrounded by spaces); remaining space runs collapsed.
274270
9b. (space_sskt / fold_sskt) Process non-standard syllable tokens.
275-
10. Ensure every syllable-final letter before a space carries a TSHEG:
276-
letter + space → letter + U+0F0B + space.
271+
10. Replace every remaining tsheg (U+0F0B) with a space; collapse
272+
multiple spaces. Space is now the sole token delimiter.
277273
11. Strip leading/trailing whitespace.
278274
"""
279275
text = normalize_corpus(text)
@@ -314,12 +310,9 @@ def normalize_for_perplexity(
314310
if space_sskt or fold_sskt:
315311
text = _process_sskt(text, space_sskt, fold_sskt)
316312

317-
# 9c) In space_after_tshegs mode, guarantee a space follows every tsheg
318-
if space_after_tshegs:
319-
text = _TSHEG_NO_SPACE_RE.sub("\u0F0B ", text)
320-
321-
# 10) Ensure syllable-final letters carry a TSHEG before any space
322-
text = _LETTER_SPACE_RE.sub(_LETTER_SPACE_REPL, text)
313+
# 10) Replace all tshegs with spaces; space is the sole token delimiter
314+
text = text.replace("\u0F0B", " ")
315+
text = _MULTI_SPACE_RE.sub(" ", text)
323316

324317
return text.strip()
325318

0 commit comments

Comments
 (0)