Skip to content

Commit 8fdc5ca

Browse files
committed
fix(something): add merge_lines function
1 parent ece862b commit 8fdc5ca

1 file changed

Lines changed: 134 additions & 9 deletions

File tree

botok/utils/corpus_normalization.py

Lines changed: 134 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,52 @@ def normalize_corpus(
120120

121121
return s
122122

123+
124+
# ---------------------------------------------------------------------------
125+
# merge_lines helpers
126+
# ---------------------------------------------------------------------------
127+
128+
# Fold consecutive tshegs that are immediately before / after a newline.
129+
_MULTI_TSHEG_BEFORE_NL_RE = re.compile(r"\u0F0B{2,}(?=\n)")
130+
_MULTI_TSHEG_AFTER_NL_RE = re.compile(r"(?<=\n)\u0F0B{2,}")
131+
132+
133+
def merge_lines(text: str) -> str:
    """Join a multi-line Tibetan string into one continuous line.

    Intended for word-wrapped or page-OCR'd text in which line breaks are
    layout artefacts and carry no sentence-level meaning.

    The text goes through five passes, in this order:

    1. Every line-break sequence matched by ``_LINEBREAKS_RE`` (CRLF, CR,
       NEL, LS, PS) is rewritten as a plain LF.
    2. ASCII spaces and tabs directly before or directly after an LF are
       dropped.
    3. A run of two or more tshegs (U+0F0B) touching an LF, on either
       side, is reduced to one tsheg.
    4. A Tibetan letter (U+0F40-U+0FBC, per ``_LETTER_BEFORE_NL_RE``)
       sitting directly before an LF gets a space appended, so the
       syllable boundary survives once the LF is gone.
    5. All LF characters are removed.

    NOTE(review): pass 3 only folds runs of *two or more* tshegs on one
    side of the LF; a single tsheg ending one line followed by a single
    tsheg opening the next is left as two adjacent tshegs after the join.
    Confirm whether callers fold those later.

    Args:
        text: The multi-line input string.

    Returns:
        The input collapsed to a single line, with no LF characters.
    """
    tsheg = "\u0F0B"

    # Pass 1: one canonical newline character from here on.
    merged = _LINEBREAKS_RE.sub("\n", text)

    # Pass 2: trailing whitespace first, then leading whitespace.
    for whitespace_pattern in (r"[ \t]+\n", r"\n[ \t]+"):
        merged = re.sub(whitespace_pattern, "\n", merged)

    # Pass 3: line-final runs first, then line-initial runs.
    for boundary_re in (_MULTI_TSHEG_BEFORE_NL_RE, _MULTI_TSHEG_AFTER_NL_RE):
        merged = boundary_re.sub(tsheg, merged)

    # Pass 4: keep a delimiter where a bare letter ends the line.
    merged = _LETTER_BEFORE_NL_RE.sub(r"\1 \n", merged)

    # Pass 5: splice the lines together.
    return "".join(merged.split("\n"))
167+
168+
123169
# U+0FD2 is excluded (NYIS TSHEG → converted to U+0F0B earlier).
124170
# U+0FD5-U+0FD8 are svasti/auspicious signs, structurally identical to yig-mgo.
125171
_YIG_MGO_START = r"\u0F01-\u0F07\u0F09\u0F0A\u0FD0\u0FD1\u0FD3-\u0FD8"
@@ -128,6 +174,17 @@ def normalize_corpus(
128174

129175
# Compiled patterns for normalize_for_perplexity
130176
_MULTI_TSHEG_RE = re.compile(r"\u0F0B{2,}")
177+
# Tibetan vowel signs (U+0F71-U+0F84).
178+
_VOWEL = r"\u0F71-\u0F84"
179+
# ག (U+0F42), ཤ (U+0F64), ཀ (U+0F40): their right-side vertical bar is
180+
# typographically shared with the shad mark. When one of these ends a line
181+
# with no tsheg before the newline, the shad is implicit and must be made explicit.
182+
_GA_SHA_KA_NL_RE = re.compile(rf"([གཤཀ][{_VOWEL}]?)\n")
183+
# Any other Tibetan letter before a newline gets a tsheg (syllable boundary)
184+
# so the syllables are not merged when the newline is removed. We use a
185+
# tsheg rather than a space so that step 9 does NOT promote the boundary to
186+
# a shad token (only explicit ། in the source should become sentence
187+
# boundaries). merge_lines reuses this pattern but inserts a space instead.
131188
_LETTER_BEFORE_NL_RE = re.compile(rf"([{_LETTER}])\n")
132189
_YIG_MGO_RE = re.compile(rf"[{_YIG_MGO_START}]+[{_PUNCT}]*")
133190
# Tibetan digits U+0F20-U+0F33, ASCII digits, comma as thousands-separator
@@ -138,6 +195,51 @@ def normalize_corpus(
138195
_MULTI_SPACE_RE = re.compile(r" {2,}")
139196
# Split on tsheg or space while capturing the delimiter
140197
_TSHEG_OR_SPACE_RE = re.compile(r"(\u0F0B| )")
198+
# Tibetan brackets (ANG KHANG KHEPA open/close) must be isolated tokens
199+
_BRACKET_RE = re.compile(r"([\u0F3C\u0F3D])")
200+
# Case affixes: always start with འ (U+0F60) followed by a specific letter.
201+
# Longest alternative (འིས) must precede the prefix it shares (འི).
202+
_AFFIX_RE = re.compile(rf"^([{_LETTER}]+)(འིས|འི|འོ|འམ|འང|འས|འད|འར)$")
203+
204+
205+
def _split_syllable_affixes(syllable: str) -> str:
    """Separate trailing case affixes from a Tibetan syllable.

    The affixes recognised by ``_AFFIX_RE`` (འི, འོ, འམ, འང, འིས, འར, འད,
    འས) all begin with འ.  Affixes are peeled from the right-hand end one
    at a time until nothing more matches; the stem and the peeled pieces
    are then returned in their original left-to-right order, separated by
    single spaces.

    One extra rule applies before the affix pattern is tried: when the
    current remainder ends in ``འུར`` and is longer than three characters,
    only the final ``ར`` is detached and ``འུ`` stays attached to the stem.

    Examples::

        _split_syllable_affixes("རྒྱལའི")  → "རྒྱལ འི"
        _split_syllable_affixes("པའིའོ")   → "པ འི འོ"
        _split_syllable_affixes("བཀའུར")   → "བཀའུ ར"

    Args:
        syllable: A single syllable token (no tsheg or space inside).

    Returns:
        The syllable unchanged if nothing was split off, otherwise the
        stem followed by each affix, space-separated.
    """
    remainder = syllable
    peeled = []  # detached pieces, collected right-to-left

    while True:
        # *འུར: detach only the trailing ར; འུ stays with the stem.
        if remainder.endswith("\u0F60\u0F74\u0F62") and len(remainder) > 3:
            peeled.append("\u0F62")
            remainder = remainder[:-1]
            continue

        found = _AFFIX_RE.match(remainder)
        if found is None:
            break
        remainder = found.group(1)
        peeled.append(found.group(2))

    peeled.append(remainder)
    return " ".join(reversed(peeled))
230+
231+
232+
def _apply_affix_splits(text: str) -> str:
    """Run :func:`_split_syllable_affixes` over each syllable token of *text*.

    The text is cut on tshegs (U+0F0B) and spaces with the delimiters
    kept, so tokens land on even positions and delimiters on odd ones.
    A token is rewritten only if it contains at least one character in
    the range U+0F40-U+0FBC; delimiters and all other tokens pass through
    untouched.

    Args:
        text: Tsheg- or space-delimited text.

    Returns:
        The text re-assembled with affixes split off by spaces.
    """
    rebuilt = []
    for position, piece in enumerate(_TSHEG_OR_SPACE_RE.split(text)):
        is_token = position % 2 == 0  # odd positions hold captured delimiters
        if is_token and any("\u0F40" <= ch <= "\u0FBC" for ch in piece):
            piece = _split_syllable_affixes(piece)
        rebuilt.append(piece)
    return "".join(rebuilt)
141243

142244

143245
def _process_sskt(text: str, space_sskt: bool, fold_sskt: bool) -> str:
@@ -203,9 +305,8 @@ def flush_sskt() -> None:
203305
elif space_sskt:
204306
stacks = split_into_stacks(content)
205307
out.append(" ".join(stacks))
206-
if delim == " ":
207-
out.append(delim)
208-
# tsheg delimiter absorbed (tshegs are stripped at end of pipeline)
308+
if delim:
309+
out.append(delim) # tsheg → space in step 10; space stays
209310
else:
210311
out.append(content)
211312
if delim:
@@ -252,8 +353,15 @@ def normalize_for_perplexity(
252353
2. Remove honorific particles U+0F35 / U+0F37 and TSA-PHRU (U+0F39).
253354
3. Normalize nasalization marks: NYI ZLA (U+0F82) and SNA LDAN
254355
(U+0F83) → RJES SU NGA RO (U+0F7E).
255-
4. Where a Tibetan letter (U+0F40-U+0FBC) is followed by a newline,
256-
insert a space to preserve the syllable boundary.
356+
3.5 Typographic shad: ག (U+0F42), ཤ (U+0F64), or ཀ (U+0F40) whose
357+
right-side vertical bar is visually shared with the shad mark.
358+
When such a consonant (+ optional vowel) ends a line without a
359+
preceding tsheg, the shad is implicit; make it explicit by
360+
inserting U+0F0D before the newline.
361+
4. Where any other Tibetan letter (U+0F40-U+0FBC) is followed by a
362+
newline, insert a tsheg (U+0F0B) to preserve the syllable
363+
boundary without creating a sentence boundary. (A space here
364+
would be promoted to ། by step 9 — incorrect.)
257365
5. Remove all remaining newlines.
258366
6. Remove yig-mgo opening marks (U+0F01-U+0F07, U+0F09, U+0F0A,
259367
U+0FD0, U+0FD1, U+0FD3-U+0FD8 incl. svasti signs) together with
@@ -264,7 +372,13 @@ def normalize_for_perplexity(
264372
U+0FFF), keeping spaces and the ``D`` placeholder.
265373
9. Any run of punctuation and/or spaces → shad token `` ། ``
266374
surrounded by spaces.
267-
9b. (space_sskt / fold_sskt) Process non-standard syllable tokens.
375+
9b. Surround Tibetan brackets ༼ (U+0F3C) and ༽ (U+0F3D) with spaces
376+
so they become standalone tokens.
377+
9c. Split case affixes (འི, འོ, འམ, འང, འིས, འར, འད, འས) from
378+
syllable bodies by inserting a space before each affix. Stacked
379+
affixes (e.g. འིའོ) are split one at a time, yielding separate
380+
tokens. Syllables ending in འུར have the final ར split off.
381+
9d. (space_sskt / fold_sskt) Process non-standard syllable tokens.
268382
10. Replace every remaining tsheg (U+0F0B) with a space; collapse
269383
multiple spaces. Space is now the sole token delimiter.
270384
11. Strip leading/trailing whitespace.
@@ -281,8 +395,13 @@ def normalize_for_perplexity(
281395
# 3) Normalize nasalization marks to RJES SU NGA RO (U+0F7E)
282396
text = text.replace("\u0F82", "\u0F7E").replace("\u0F83", "\u0F7E")
283397

284-
# 4) Letter before newline → letter + space + newline
285-
text = _LETTER_BEFORE_NL_RE.sub(r"\1 \n", text)
398+
# 3.5) Typographic shad: ག/ཤ/ཀ (+ optional vowel) at line end → add ། before \n
399+
text = _GA_SHA_KA_NL_RE.sub(r"\1" + "\u0F0D\n", text)
400+
401+
# 4) Any remaining Tibetan letter before \n → letter + tsheg + \n
402+
# (tsheg = syllable delimiter; will become a space in step 10, but
403+
# will NOT be promoted to a sentence boundary by step 9)
404+
text = _LETTER_BEFORE_NL_RE.sub(r"\1" + "\u0F0B\n", text)
286405

287406
# 5) Drop all newlines
288407
text = text.replace("\n", "")
@@ -299,7 +418,13 @@ def normalize_for_perplexity(
299418
# 9) Any run of punctuation and/or spaces → shad token surrounded by spaces
300419
text = _PUNCT_OR_SPACE_RE.sub(" \u0F0D ", text)
301420

302-
# 9b) Sanskrit syllable handling
421+
# 9b) Isolate Tibetan brackets so they are not absorbed into adjacent syllables
422+
text = _BRACKET_RE.sub(r" \1 ", text)
423+
424+
# 9c) Split case affixes from syllable bodies
425+
text = _apply_affix_splits(text)
426+
427+
# 9d) Sanskrit syllable handling
303428
if space_sskt or fold_sskt:
304429
text = _process_sskt(text, space_sskt, fold_sskt)
305430

0 commit comments

Comments
 (0)