1+ import re
2+ import unicodedata
3+ from enum import Enum
4+ from .unicode_normalization import normalize_unicode
5+
# Normalize all line breaks to '\n': CRLF, lone CR, NEL (U+0085),
# LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).
_LINEBREAKS_RE = re.compile(r"\r\n?|\u0085|\u2028|\u2029")

# Zero-width and invisible characters to remove (includes BOM everywhere).
# dict.fromkeys() maps every code point to None, which makes str.translate()
# delete it.  Each literal must be exactly ONE character: ord() raises
# TypeError on anything longer, which would break the module at import time.
_ZERO_WIDTH_STRIP = dict.fromkeys(map(ord, [
    "\u200B",  # ZERO WIDTH SPACE
    "\u2060",  # WORD JOINER
    "\uFEFF",  # ZERO WIDTH NO-BREAK SPACE / BOM (remove even mid-text)
    "\u180E",  # MONGOLIAN VOWEL SEPARATOR (deprecated)
    "\u034F",  # COMBINING GRAPHEME JOINER
]))

# Map all Unicode spaces (and horizontal ASCII whitespace) to ASCII space.
# Newline is deliberately absent: line structure is handled separately.
_UNICODE_SPACES = [
    "\u00A0",  # NO-BREAK SPACE
    "\u1680",  # OGHAM SPACE MARK
    "\u2000", "\u2001", "\u2002", "\u2003", "\u2004", "\u2005",  # EN QUAD .. FOUR-PER-EM SPACE
    "\u2006", "\u2007", "\u2008", "\u2009", "\u200A",  # SIX-PER-EM SPACE .. HAIR SPACE
    "\u202F", "\u205F", "\u3000",  # narrow no-break, medium mathematical, ideographic spaces
    "\t", "\x0b", "\x0c",  # TAB, VT, FF
]
# Translation table for str.translate(): code point -> single ASCII space.
_SPACE_TO_ASCII = {ord(ch): " " for ch in _UNICODE_SPACES}
27+
28+
def normalize_spaces(
    text: str,
    collapse_internal_spaces: bool = True,
    tibetan_specific: bool = True,
) -> str:
    """
    Normalize spaces in text.

    Only the ASCII space (U+0020) and the newline character are considered;
    other whitespace is left untouched.

    Steps:
    1. Remove spaces next to newlines.
    2. Collapse multiple newlines to one.
    3. Collapse multiple spaces to one (if ``collapse_internal_spaces``).
    4. Apply Tibetan-specific space normalization rules (if ``tibetan_specific``).

    Tibetan-specific rules:
    - Remove space after tsheg (U+0F0B, U+0F0C, U+0FD2) if followed by
      initial letter (U+0F40-U+0F6C) or shad (U+0F0D-U+0F11)
    - Remove space between final letter (U+0F40-U+0FBC) and tsheg

    Args:
        text: Input string; an empty (or otherwise falsy) value yields "".
        collapse_internal_spaces: When true, runs of two or more spaces
            become a single space.
        tibetan_specific: When true, apply the tsheg/shad spacing rules above.

    Returns:
        The normalized string.
    """
    if not text:
        return ""

    s = text

    # 1) Remove spaces next to newlines.  This must happen BEFORE newline
    #    runs are collapsed: in "a \n \n b" the two newlines only become
    #    adjacent once the space between them is gone.
    s = re.sub(r"[ ]+\n", "\n", s)
    s = re.sub(r"\n[ ]+", "\n", s)

    # 2) Collapse multiple newlines
    s = re.sub(r"\n{2,}", "\n", s)

    # 3) Collapse space runs
    if collapse_internal_spaces:
        s = re.sub(r" {2,}", " ", s)

    # 4) Tibetan-specific space normalization (" +" so that the rules also
    #    apply when space runs were not collapsed above)
    if tibetan_specific:
        # Remove space after tsheg if followed by initial letter or shad
        s = re.sub(r"([\u0f0b\u0f0c\u0fd2]) +([\u0f40-\u0f6c\u0f0d-\u0f11])", r"\1\2", s)
        # Remove space between final letter and tsheg
        s = re.sub(r"([\u0f40-\u0fbc]) +([\u0f0b\u0f0c\u0fd2])", r"\1\2", s)

    return s
72+
def normalize_corpus(
    text: str,
    strip_control: bool = True,
    collapse_internal_spaces: bool = True,
) -> str:
    """
    General-purpose Unicode normalization.

    Steps:
    1. Normalize to NFC.
    2. Convert all line breaks to a single LF newline.
    3. Remove zero-width / invisible characters (incl. all BOMs).
    4. Map Unicode spaces and tabs to plain ASCII space.
    5. Optionally remove control characters (except newline).
    6. Normalize spaces (including Tibetan-specific rules).
    7. Apply Tibetan Unicode normalization.

    Keeps ZWJ/ZWNJ (joiners) intact.

    Args:
        text: Input string; an empty (or otherwise falsy) value yields "".
        strip_control: When true, drop every character whose Unicode general
            category starts with "C" (control, format, surrogate, private-use,
            unassigned), except newline, ZWNJ (U+200C) and ZWJ (U+200D).
        collapse_internal_spaces: Forwarded to ``normalize_spaces``.

    Returns:
        The normalized string.
    """
    if not text:
        return ""

    # 1) NFC normalization
    s = unicodedata.normalize("NFC", text)

    # 2) Normalize line breaks
    s = _LINEBREAKS_RE.sub("\n", s)

    # 3) Remove zero-width & BOM
    s = s.translate(_ZERO_WIDTH_STRIP)

    # 4) Normalize spaces to ASCII space
    s = s.translate(_SPACE_TO_ASCII)

    # 5) Optionally strip control characters (but keep newline).
    #    ZWNJ/ZWJ have general category Cf, so a bare "category starts with C"
    #    filter would delete them; they are exempted explicitly to honour the
    #    "keeps joiners intact" contract stated in the docstring.
    if strip_control:
        keep = ("\n", "\u200c", "\u200d")
        s = "".join(
            ch for ch in s
            if ch in keep or not unicodedata.category(ch).startswith("C")
        )

    # 6) Normalize spaces (Tibetan rules stay at normalize_spaces' default)
    s = normalize_spaces(s, collapse_internal_spaces=collapse_internal_spaces)

    # 7) Tibetan Unicode normalization
    #    NOTE(review): normalize_unicode comes from .unicode_normalization and
    #    is not visible here; the replacements below assume it leaves
    #    U+0F0C / U+0F0E in place for them to handle.
    s = normalize_unicode(s)
    # no graphical distinction between 0f0b and 0f0c
    s = s.replace("\u0f0c", "\u0f0b")
    # double shad is just two shad
    s = s.replace("\u0f0e", "\u0f0d\u0f0d")

    return s
125+
126+
def _run_sanity_checks() -> None:
    """Lightweight checks exercising each normalization rule.

    Raises:
        AssertionError: if any normalizer returns something other than the
            expected string.  Raised explicitly (not via ``assert``) so the
            checks still run under ``python -O``.
    """

    def _assert_equal(actual: str, expected: str, label: str) -> None:
        # Report both values with repr() so invisible characters are readable.
        if actual != expected:
            raise AssertionError(f"{label} failed: {actual!r} != {expected!r}")

    # normalize_spaces: collapse newlines/spaces and trim around newlines
    _assert_equal(
        normalize_spaces("a\n\n b \nc"),
        "a\nb\nc",
        "normalize_spaces basic spacing",
    )

    # normalize_spaces: Tibetan-specific spacing around tsheg and finals.
    # The input carries a space after the first tsheg and before the last one;
    # both must be removed.
    tibetan_sample = "\u0f0b \u0f40\u0f66 \u0f0b"  # tsheg, initial, final, tsheg
    _assert_equal(
        normalize_spaces(tibetan_sample),
        "\u0f0b\u0f40\u0f66\u0f0b",
        "normalize_spaces tibetan spacing",
    )

    # normalize_corpus: line breaks, zero-width strip, space mapping, control strip,
    # Tibetan Unicode tweaks (0f0c→0f0b, 0f0e→double shad)
    corpus_sample = "a\u00a0\u200bb\r\nc\u0f0c\u0f0e\u0001"
    _assert_equal(
        normalize_corpus(corpus_sample),
        "a b\nc\u0f0b\u0f0d\u0f0d",
        "normalize_corpus full pipeline",
    )
157+
158+
if __name__ == "__main__":
    # Script entry point: run the self-checks, then report success.  A failing
    # check raises AssertionError, so the message below is only printed when
    # every check passed.
    # NOTE(review): the module uses a relative import, so it presumably has to
    # be launched as `python -m <package>.<module>` rather than by file path.
    _run_sanity_checks()
    print("corpus_normalization sanity checks passed")
0 commit comments