@@ -135,8 +135,6 @@ def normalize_corpus(
135135# Keep only Tibetan block (U+0F00-U+0FFF), ASCII space, and the digit placeholder D
136136_NON_TIBETAN_RE = re .compile (r"[^\u0F00-\u0FFF D]" )
137137_PUNCT_OR_SPACE_RE = re .compile (rf"[{ _PUNCT } ]+" )
138- # space_after_tshegs: any punct-containing run → single shad surrounded by spaces
139- _PUNCT_RUN_RE = re .compile (rf"[ ]*[{ _PUNCT } ][{ _PUNCT } ]*" )
140138_MULTI_SPACE_RE = re .compile (r" {2,}" )
141139# Split on tsheg or space while capturing the delimiter
142140_TSHEG_OR_SPACE_RE = re .compile (r"(\u0F0B| )" )
@@ -222,25 +220,26 @@ def flush_sskt() -> None:
222220
223221def normalize_for_perplexity (
224222 text : str ,
225- space_after_tshegs : bool = False ,
226- space_sskt : bool = False ,
223+ space_sskt : bool = True ,
227224 fold_sskt : bool = False ,
228225) -> str :
229226 """
230227 Normalize Tibetan text for perplexity calculation.
231228
229+ Every sentence boundary — whether marked by Tibetan punctuation
230+ (U+0F0D–U+0F14) or a plain space in the source — is rendered as a shad
231+ token (`` ། ``) surrounded by spaces. Syllables within a sentence are
232+ separated by plain spaces (tshegs are removed). The result uses space
233+ as the sole token delimiter and ``།`` as an explicit sentence-boundary
234+ marker.
235+
232236 Parameters
233237 ----------
234238 text:
235239 Input text.
236- space_after_tshegs:
237- If ``True``, punctuation sequences are replaced by a shad (``།``)
238- surrounded by spaces instead of a plain space, making sentence
239- boundaries explicit.
240240 space_sskt:
241241 If ``True``, non-standard (Sanskrit) syllables are split into their
242- constituent stacks, each separated by a space. Recommended
243- together with ``space_after_tshegs``.
242+ constituent stacks, each separated by a space (enabled by default).
244243 fold_sskt:
245244 If ``True``, consecutive runs of non-standard (Sanskrit) syllables
246245 are collapsed to the single placeholder token ``S``. Takes
@@ -263,10 +262,8 @@ def normalize_for_perplexity(
263262 with commas) with the placeholder ``D``.
264263 8. Strip any character outside the Tibetan Unicode block (U+0F00-
265264 U+0FFF), keeping spaces and the ``D`` placeholder.
266- 9. Collapse punctuation / space runs:
267- - default: to a single space.
268- - space_after_tshegs: punct-containing runs → `` ། ``
269- (shad surrounded by spaces); remaining space runs collapsed.
265+ 9. Any run of punctuation and/or spaces (including the spaces left by
266+ step 8's stripping) → a single shad token `` ། `` surrounded by spaces.
270267 9b. (space_sskt / fold_sskt) Process non-standard syllable tokens.
271268 10. Replace every remaining tsheg (U+0F0B) with a space; collapse
272269 multiple spaces. Space is now the sole token delimiter.
@@ -299,12 +296,8 @@ def normalize_for_perplexity(
299296 # 8) Strip characters outside the Tibetan block (keep D placeholder and spaces)
300297 text = _NON_TIBETAN_RE .sub (" " , text )
301298
302- # 9) Collapse punctuation / space runs
303- if space_after_tshegs :
304- text = _PUNCT_RUN_RE .sub (" \u0F0D " , text )
305- text = _MULTI_SPACE_RE .sub (" " , text )
306- else :
307- text = _PUNCT_OR_SPACE_RE .sub (" " , text )
299+ # 9) Any run of punctuation and/or spaces → shad token surrounded by spaces
300+ text = _PUNCT_OR_SPACE_RE .sub (" \u0F0D " , text )
308301
309302 # 9b) Sanskrit syllable handling
310303 if space_sskt or fold_sskt :
0 commit comments