@@ -120,6 +120,52 @@ def normalize_corpus(
120120
121121 return s
122122
123+
124+ # ---------------------------------------------------------------------------
125+ # merge_lines helpers
126+ # ---------------------------------------------------------------------------
127+
# Tsheg (U+0F0B) runs that touch a line break; merge_lines folds each run to
# a single tsheg.  Both patterns use a zero-width assertion for the newline,
# so only the tshegs themselves are consumed by a substitution.
_MULTI_TSHEG_BEFORE_NL_RE = re.compile(
    r"\u0F0B{2,}(?=\n)"  # two or more tshegs directly before a newline
)
_MULTI_TSHEG_AFTER_NL_RE = re.compile(
    r"(?<=\n)\u0F0B{2,}"  # two or more tshegs directly after a newline
)
131+
132+
def merge_lines(text: str) -> str:
    """Merge a multi-line Tibetan string into a single continuous line.

    Designed for word-wrapped or page-OCR'd text where newlines are
    typographic artefacts rather than sentence boundaries.

    Steps:
      1. Normalize every line-break sequence matched by ``_LINEBREAKS_RE``
         to LF.
      2. Remove ASCII spaces / tabs immediately before or after each LF.
      3. Fold runs of two or more tshegs (U+0F0B) that sit directly before
         or directly after an LF down to a single tsheg.
      4. When a Tibetan letter (as matched by ``_LETTER_BEFORE_NL_RE``) is
         directly followed by an LF, insert a space between the two so the
         syllable boundary is preserved after the LF is removed.
      5. Remove all remaining LF characters.

    Args:
        text: The multi-line input string.

    Returns:
        The input with all line breaks removed, as one line.
    """
    # 1) Normalize line endings to a bare LF (no padding is added).
    text = _LINEBREAKS_RE.sub("\n", text)

    # 2) Strip spaces/tabs on both sides of every LF in a single pass.
    text = re.sub(r"[ \t]*\n[ \t]*", "\n", text)

    # 3) Fold tsheg runs at line boundaries to exactly one tsheg.
    text = _MULTI_TSHEG_BEFORE_NL_RE.sub("\u0F0B", text)
    text = _MULTI_TSHEG_AFTER_NL_RE.sub("\u0F0B", text)

    # 4) Letter directly before LF (no trailing tsheg) -> letter + space + LF,
    #    so the two syllables do not fuse once the LF is dropped.
    text = _LETTER_BEFORE_NL_RE.sub(r"\1 \n", text)

    # 5) Drop every LF.
    return text.replace("\n", "")
167+
168+
# Code points of the yig-mgo opening marks, written as the body of a regex
# character class (no surrounding brackets).
#   * U+0FD2 (NYIS TSHEG) is left out: it is converted to U+0F0B earlier.
#   * U+0FD5-U+0FD8 (svasti / auspicious signs) are included because they are
#     structurally identical to yig-mgo.
_YIG_MGO_START = r"\u0F01-\u0F07\u0F09\u0F0A\u0FD0\u0FD1\u0FD3-\u0FD8"
@@ -128,6 +174,17 @@ def normalize_corpus(
128174
# Compiled patterns for normalize_for_perplexity
_MULTI_TSHEG_RE = re.compile(r"\u0F0B{2,}")
# Dependent signs U+0F71-U+0F84, written as a character-class body (no
# brackets).  The range is mostly vowel signs but also spans the nasalization
# marks U+0F7E, U+0F82 and U+0F83.
_VOWEL = r"\u0F71-\u0F84"
# ག (U+0F42), ཤ (U+0F64), ཀ (U+0F40): their right-side vertical bar is
# typographically shared with the shad mark.  When one of these (plus at most
# one sign from _VOWEL) is immediately followed by a newline -- i.e. no tsheg
# or any other character sits between it and the line break -- the shad is
# implicit and must be made explicit.  Group 1 captures the letter and the
# optional sign; the newline itself is consumed by the match.
_GA_SHA_KA_NL_RE = re.compile(rf"([གཤཀ][{_VOWEL}]?)\n")
# A single Tibetan letter (any character in _LETTER) immediately followed by
# a newline; group 1 is the letter.  The pattern only locates the boundary --
# each caller chooses what to insert so the syllables are not merged when the
# newline is removed:
#   * normalize_for_perplexity inserts a tsheg.  It uses a tsheg rather than
#     a space so that its step 9 does NOT promote the boundary to a shad
#     token (only explicit ། in the source should become sentence boundaries).
#   * merge_lines inserts a space.
_LETTER_BEFORE_NL_RE = re.compile(rf"([{_LETTER}])\n")
# One or more yig-mgo opening marks followed by any run of _PUNCT characters.
_YIG_MGO_RE = re.compile(rf"[{_YIG_MGO_START}]+[{_PUNCT}]*")
133190# Tibetan digits U+0F20-U+0F33, ASCII digits, comma as thousands-separator
@@ -138,6 +195,51 @@ def normalize_corpus(
# Run of two or more ASCII spaces.
_MULTI_SPACE_RE = re.compile(r" {2,}")

# One tsheg (U+0F0B) or one ASCII space.  The capturing group makes
# ``split`` return the delimiters too, at the odd indices of the result.
_TSHEG_OR_SPACE_RE = re.compile(r"(\u0F0B| )")

# Tibetan brackets ༼ (U+0F3C) / ༽ (U+0F3D), i.e. ANG KHANG open/close.
# Captured so a substitution can re-emit the bracket as an isolated token.
_BRACKET_RE = re.compile(r"([\u0F3C\u0F3D])")
# Case affixes: always start with འ (U+0F60) followed by a specific letter.
# Group 1 is the stem (one or more _LETTER characters), group 2 the affix.
# The pattern is anchored at both ends, so it only matches a whole syllable
# whose tail is exactly one of the listed affixes.  The longest alternative
# (འིས) is listed before the prefix it shares (འི) for readability; because
# of the trailing ``$`` the regex would backtrack to it in any order.
_AFFIX_RE = re.compile(rf"^([{_LETTER}]+)(འིས|འི|འོ|འམ|འང|འས|འད|འར)$")
203+
204+
def _split_syllable_affixes(syllable: str) -> str:
    """Split Tibetan case affixes from a syllable body.

    Affixes (འི, འོ, འམ, འང, འིས, འར, འད, འས) always start with འ
    followed by a specific letter.  Affixes are peeled off the right-hand
    end one at a time until ``_AFFIX_RE`` no longer matches; the stem and
    the affixes are then joined with single spaces, in their original
    left-to-right order.

    Special case: a syllable longer than three characters that ends in
    ``འུར`` has its final ``ར`` split off as a separate token (the ``འུ``
    oblique marker stays with the stem).

    The peeling is done in a loop rather than by recursion, so an
    arbitrarily long run of stacked affixes cannot exhaust the interpreter
    recursion limit.

    Args:
        syllable: One token with no tsheg or space inside it.

    Returns:
        The syllable with a space before each split-off affix, or the
        syllable unchanged when nothing can be split.

    Examples::

        _split_syllable_affixes("རྒྱལའི")  → "རྒྱལ འི"
        _split_syllable_affixes("པའིའོ")   → "པ འི འོ"
        _split_syllable_affixes("བཀའུར")   → "བཀའུ ར"
    """
    stem = syllable
    peeled = []  # split-off pieces, collected right-to-left

    while True:
        # *འུར: split off the allative ར, keep the oblique འུ with the stem.
        # The length guard keeps a bare "འུར" token intact.
        if len(stem) > 3 and stem.endswith("\u0F60\u0F74\u0F62"):
            peeled.append("\u0F62")
            stem = stem[:-1]
            continue

        m = _AFFIX_RE.match(stem)
        if m is None:
            break
        stem = m.group(1)
        peeled.append(m.group(2))

    # Restore left-to-right order: stem first, then affixes as they appeared.
    peeled.append(stem)
    return " ".join(reversed(peeled))
230+
231+
def _apply_affix_splits(text: str) -> str:
    """Run :func:`_split_syllable_affixes` over each syllable token of *text*.

    *text* is cut on tshegs (U+0F0B) and spaces.  Delimiters are kept
    verbatim; a token is rewritten only if it holds at least one character
    in the range U+0F40-U+0FBC.  Everything is then glued back together in
    the original order.
    """
    pieces = _TSHEG_OR_SPACE_RE.split(text)

    # The delimiter is captured by the pattern, so tokens sit at the even
    # indices of ``pieces`` and delimiters at the odd ones.
    pieces[::2] = [
        _split_syllable_affixes(token)
        if any("\u0F40" <= ch <= "\u0FBC" for ch in token)
        else token
        for token in pieces[::2]
    ]
    return "".join(pieces)
141243
142244
143245def _process_sskt (text : str , space_sskt : bool , fold_sskt : bool ) -> str :
@@ -203,9 +305,8 @@ def flush_sskt() -> None:
203305 elif space_sskt :
204306 stacks = split_into_stacks (content )
205307 out .append (" " .join (stacks ))
206- if delim == " " :
207- out .append (delim )
208- # tsheg delimiter absorbed (tshegs are stripped at end of pipeline)
308+ if delim :
309+ out .append (delim ) # tsheg → space in step 10; space stays
209310 else :
210311 out .append (content )
211312 if delim :
@@ -252,8 +353,15 @@ def normalize_for_perplexity(
252353 2. Remove honorific particles U+0F35 / U+0F37 and TSA-PHRU (U+0F39).
253354 3. Normalize nasalization marks: NYI ZLA (U+0F82) and SNA LDAN
254355 (U+0F83) → RJES SU NGA RO (U+0F7E).
255- 4. Where a Tibetan letter (U+0F40-U+0FBC) is followed by a newline,
256- insert a space to preserve the syllable boundary.
356+ 3.5 Typographic shad: ག (U+0F42), ཤ (U+0F64), or ཀ (U+0F40) whose
357+ right-side vertical bar is visually shared with the shad mark.
358+ When such a consonant (+ optional vowel) ends a line without a
359+ preceding tsheg, the shad is implicit; make it explicit by
360+ inserting U+0F0D before the newline.
361+ 4. Where any other Tibetan letter (U+0F40-U+0FBC) is followed by a
362+ newline, insert a tsheg (U+0F0B) to preserve the syllable
363+ boundary without creating a sentence boundary. (A space here
364+ would be promoted to ། by step 9 — incorrect.)
257365 5. Remove all remaining newlines.
258366 6. Remove yig-mgo opening marks (U+0F01-U+0F07, U+0F09, U+0F0A,
259367 U+0FD0, U+0FD1, U+0FD3-U+0FD8 incl. svasti signs) together with
@@ -264,7 +372,13 @@ def normalize_for_perplexity(
264372 U+0FFF), keeping spaces and the ``D`` placeholder.
265373 9. Any run of punctuation and/or spaces → shad token `` ། ``
266374 surrounded by spaces.
267- 9b. (space_sskt / fold_sskt) Process non-standard syllable tokens.
375+ 9b. Surround Tibetan brackets ༼ (U+0F3C) and ༽ (U+0F3D) with spaces
376+ so they become standalone tokens.
377+ 9c. Split case affixes (འི, འོ, འམ, འང, འིས, འར, འད, འས) from
378+ syllable bodies by inserting a space before each affix. Stacked
379+ affixes (e.g. འིའོ) are split one at a time, yielding separate
380+ tokens. Syllables ending in འུར have the final ར split off.
381+ 9d. (space_sskt / fold_sskt) Process non-standard syllable tokens.
268382 10. Replace every remaining tsheg (U+0F0B) with a space; collapse
269383 multiple spaces. Space is now the sole token delimiter.
270384 11. Strip leading/trailing whitespace.
@@ -281,8 +395,13 @@ def normalize_for_perplexity(
281395 # 3) Normalize nasalization marks to RJES SU NGA RO (U+0F7E)
282396 text = text .replace ("\u0F82 " , "\u0F7E " ).replace ("\u0F83 " , "\u0F7E " )
283397
284- # 4) Letter before newline → letter + space + newline
285- text = _LETTER_BEFORE_NL_RE .sub (r"\1 \n" , text )
398+ # 3.5) Typographic shad: ག/ཤ/ཀ (+ optional vowel) at line end → add ། before \n
399+ text = _GA_SHA_KA_NL_RE .sub (r"\1" + "\u0F0D \n " , text )
400+
401+ # 4) Any remaining Tibetan letter before \n → letter + tsheg + \n
402+ # (tsheg = syllable delimiter; will become a space in step 10, but
403+ # will NOT be promoted to a sentence boundary by step 9)
404+ text = _LETTER_BEFORE_NL_RE .sub (r"\1" + "\u0F0B \n " , text )
286405
287406 # 5) Drop all newlines
288407 text = text .replace ("\n " , "" )
@@ -299,7 +418,13 @@ def normalize_for_perplexity(
299418 # 9) Any run of punctuation and/or spaces → shad token surrounded by spaces
300419 text = _PUNCT_OR_SPACE_RE .sub (" \u0F0D " , text )
301420
302- # 9b) Sanskrit syllable handling
421+ # 9b) Isolate Tibetan brackets so they are not absorbed into adjacent syllables
422+ text = _BRACKET_RE .sub (r" \1 " , text )
423+
424+ # 9c) Split case affixes from syllable bodies
425+ text = _apply_affix_splits (text )
426+
427+ # 9d) Sanskrit syllable handling
303428 if space_sskt or fold_sskt :
304429 text = _process_sskt (text , space_sskt , fold_sskt )
305430
0 commit comments