@@ -131,19 +131,15 @@ def normalize_corpus(
131131_LETTER_BEFORE_NL_RE = re .compile (rf"([{ _LETTER } ])\n" )
132132_YIG_MGO_RE = re .compile (rf"[{ _YIG_MGO_START } ]+[{ _PUNCT } ]*" )
133133# Tibetan digits U+0F20-U+0F33 and ASCII digits; commas and spaces after the first digit (including trailing ones) are absorbed into the run
134- _DIGIT_RUN_RE = re .compile (r"[0-9\u0F20-\u0F33][0-9\u0F20-\u0F33,]*" )
134+ _DIGIT_RUN_RE = re .compile (r"[0-9\u0F20-\u0F33][0-9\u0F20-\u0F33, ]*" )
135135# Keep only Tibetan block (U+0F00-U+0FFF), ASCII space, and the digit placeholder D
136136_NON_TIBETAN_RE = re .compile (r"[^\u0F00-\u0FFF D]" )
137137_PUNCT_OR_SPACE_RE = re .compile (rf"[{ _PUNCT } ]+" )
138- _LETTER_SPACE_RE = re .compile (rf"([{ _LETTER } ]) " )
139- _LETTER_SPACE_REPL = r"\1" + "\u0F0B " # backreference + literal tsheg + space
140138# space_after_tshegs: any punct-containing run → single shad surrounded by spaces
141139_PUNCT_RUN_RE = re .compile (rf"[ ]*[{ _PUNCT } ][{ _PUNCT } ]*" )
142140_MULTI_SPACE_RE = re .compile (r" {2,}" )
143141# Split on tsheg or space while capturing the delimiter
144142_TSHEG_OR_SPACE_RE = re .compile (r"(\u0F0B| )" )
145- # space_after_tshegs: insert a space after any tsheg not already followed by one
146- _TSHEG_NO_SPACE_RE = re .compile (r"\u0F0B(?! )" )
147143
148144
149145def _process_sskt (text : str , space_sskt : bool , fold_sskt : bool ) -> str :
@@ -153,8 +149,8 @@ def _process_sskt(text: str, space_sskt: bool, fold_sskt: bool) -> str:
153149 mark sentence/clause boundaries. The function walks each tsheg-delimited
154150 piece and, for non-standard syllables:
155151
156- * ``space_sskt``: expands the syllable into its constituent stacks, each
157- followed by a tsheg, with spaces between stacks .
152+ * ``space_sskt``: expands the syllable into its constituent stacks,
153+ separated by spaces.
158154 * ``fold_sskt``: accumulates consecutive non-standard syllables and
159155 replaces the whole run with the placeholder ``S``.
160156 """
@@ -208,10 +204,10 @@ def flush_sskt() -> None:
208204 out .append (delim )
209205 elif space_sskt :
210206 stacks = split_into_stacks (content )
211- out .append (" " .join (s + " \u0F0B " for s in stacks ))
207+ out .append (" " .join (stacks ))
212208 if delim == " " :
213209 out .append (delim )
214- # tsheg delimiter absorbed as the tsheg of the last stack
210+ # tsheg delimiter absorbed (tshegs are stripped at end of pipeline)
215211 else :
216212 out .append (content )
217213 if delim :
@@ -240,11 +236,10 @@ def normalize_for_perplexity(
240236 space_after_tshegs:
241237 If ``True``, punctuation sequences are replaced by a shad (``།``)
242238 surrounded by spaces instead of a plain space, making sentence
243- boundaries explicit. Each syllable then ends with ``་ `` (tsheg +
244- space).
239+ boundaries explicit.
245240 space_sskt:
246241 If ``True``, non-standard (Sanskrit) syllables are split into their
247- constituent stacks, each receiving its own tsheg . Recommended
242+ constituent stacks, separated by spaces. Recommended
248243 together with ``space_after_tshegs``.
249244 fold_sskt:
250245 If ``True``, consecutive runs of non-standard (Sanskrit) syllables
@@ -253,7 +248,8 @@ def normalize_for_perplexity(
253248
254249 Steps applied after ``normalize_corpus``:
255250 1. Replace NYIS TSHEG (U+0FD2) with TSHEG (U+0F0B); fold runs of
256- consecutive TSHEGs to one.
251+ consecutive TSHEGs to one. Tshegs serve as syllable delimiters
252+ throughout intermediate processing and are removed in step 10.
257253 2. Remove honorific particles U+0F35 / U+0F37 and TSA-PHRU (U+0F39).
258254 3. Normalize nasalization marks: NYI ZLA (U+0F82) and SNA LDAN
259255 (U+0F83) → RJES SU NGA RO (U+0F7E).
@@ -272,8 +268,8 @@ def normalize_for_perplexity(
272268 - space_after_tshegs: punct-containing runs → `` ། ``
273269 (shad surrounded by spaces); remaining space runs collapsed.
274270 9b. (space_sskt / fold_sskt) Process non-standard syllable tokens.
275- 10. Ensure every syllable-final letter before a space carries a TSHEG:
276- letter + space → letter + U+0F0B + space .
271+ 10. Replace every remaining tsheg (U+0F0B) with a space; collapse
272+ multiple spaces. Space is now the sole token delimiter .
277273 11. Strip leading/trailing whitespace.
278274 """
279275 text = normalize_corpus (text )
@@ -314,12 +310,9 @@ def normalize_for_perplexity(
314310 if space_sskt or fold_sskt :
315311 text = _process_sskt (text , space_sskt , fold_sskt )
316312
317- # 9c) In space_after_tshegs mode, guarantee a space follows every tsheg
318- if space_after_tshegs :
319- text = _TSHEG_NO_SPACE_RE .sub ("\u0F0B " , text )
320-
321- # 10) Ensure syllable-final letters carry a TSHEG before any space
322- text = _LETTER_SPACE_RE .sub (_LETTER_SPACE_REPL , text )
313+ # 10) Replace all tshegs with spaces; space is the sole token delimiter
314+ text = text .replace ("\u0F0B " , " " )
315+ text = _MULTI_SPACE_RE .sub (" " , text )
323316
324317 return text .strip ()
325318
0 commit comments