Skip to content

Commit f269e72

Browse files
committed
add corpus normalization function
1 parent bfdb3cb commit f269e72

1 file changed

Lines changed: 161 additions & 0 deletions

File tree

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
import re
2+
import unicodedata
3+
from enum import Enum
4+
from .unicode_normalization import normalize_unicode
5+
6+
# Normalize all line breaks to '\n'
7+
_LINEBREAKS_RE = re.compile(r"\r\n?|\u0085|\u2028|\u2029")
8+
9+
# Zero-width and invisible characters to remove (includes BOM everywhere)
10+
_ZERO_WIDTH_STRIP = dict.fromkeys(map(ord, [
11+
"\u200B", # ZERO WIDTH SPACE
12+
"\u2060", # WORD JOINER
13+
"\uFEFF", # ZERO WIDTH NO-BREAK SPACE / BOM (remove even mid-text)
14+
"\u180E", # MONGOLIAN VOWEL SEPARATOR (deprecated)
15+
"\u034F", # COMBINING GRAPHEME JOINER
16+
]))
17+
18+
# Map all Unicode spaces (and horizontal ASCII whitespace) to ASCII space
19+
_UNICODE_SPACES = [
20+
"\u00A0", # NO-BREAK SPACE
21+
"\u1680", "\u2000", "\u2001", "\u2002", "\u2003", "\u2004",
22+
"\u2005", "\u2006", "\u2007", "\u2008", "\u2009", "\u200A",
23+
"\u202F", "\u205F", "\u3000", # narrow, medium, ideographic spaces
24+
"\t", "\x0b", "\x0c" # TAB, VT, FF
25+
]
26+
_SPACE_TO_ASCII = {ord(ch): " " for ch in _UNICODE_SPACES}
27+
28+
29+
def normalize_spaces(
    text: str,
    collapse_internal_spaces: bool = True,
    tibetan_specific: bool = True,
) -> str:
    """Clean up space and newline usage in *text*.

    Transformations, applied in order:
      1. runs of newlines become a single newline;
      2. spaces adjacent to a newline are dropped;
      3. runs of spaces become a single space (optional);
      4. Tibetan-specific fixes (optional):
         - a space after a tsheg (U+0F0B, U+0F0C, U+0FD2) is removed when
           followed by an initial letter (U+0F40-U+0F6C) or a shad
           (U+0F0D-U+0F11);
         - a space between a final letter (U+0F40-U+0FBC) and a tsheg is
           removed.

    NOTE: the step order matters — blank-line collapsing must run before
    the space trimming, so do not reorder.
    """
    if not text:
        return ""

    result = re.sub(r"\n{2,}", "\n", text)    # squeeze blank lines
    result = re.sub(r"[ ]+\n", "\n", result)  # drop spaces before a newline
    result = re.sub(r"\n[ ]+", "\n", result)  # drop spaces after a newline

    if collapse_internal_spaces:
        result = re.sub(r" {2,}", " ", result)

    if tibetan_specific:
        # Close the gap after a tsheg when an initial letter or shad follows.
        result = re.sub(
            "([\u0f0b\u0f0c\u0fd2]) +([\u0f40-\u0f6c\u0f0d-\u0f11])",
            r"\1\2",
            result,
        )
        # Close the gap between a final letter and the following tsheg.
        result = re.sub(
            "([\u0f40-\u0fbc]) +([\u0f0b\u0f0c\u0fd2])",
            r"\1\2",
            result,
        )

    return result
72+
73+
def normalize_corpus(
    text: str,
    strip_control: bool = True,
    collapse_internal_spaces: bool = True,
) -> str:
    """
    General-purpose Unicode normalization.

    Steps:
        1. Normalize to NFC.
        2. Convert all line breaks to '\n'.
        3. Remove zero-width / invisible characters (incl. all BOMs).
        4. Map Unicode spaces and tabs to plain ASCII space.
        5. Optionally remove control characters (except newline).
        6. Normalize spaces (including Tibetan-specific rules).
        7. Apply Tibetan Unicode normalization.

    Keeps ZWJ/ZWNJ (joiners) intact.

    Args:
        text: Input text; falsy input yields "".
        strip_control: When True, drop category-C characters other than
            '\n', ZWNJ and ZWJ.
        collapse_internal_spaces: Forwarded to normalize_spaces().

    Returns:
        The normalized text.
    """
    if not text:
        return ""

    # 1) Canonical composition so equivalent sequences compare equal.
    s = unicodedata.normalize("NFC", text)

    # 2) CRLF / CR / NEL / LS / PS -> '\n'
    s = _LINEBREAKS_RE.sub("\n", s)

    # 3) Remove zero-width & BOM characters everywhere.
    s = s.translate(_ZERO_WIDTH_STRIP)

    # 4) Map Unicode spaces and tabs to ASCII space.
    s = s.translate(_SPACE_TO_ASCII)

    # 5) Optionally strip control/format characters. BUGFIX: ZWNJ (U+200C)
    #    and ZWJ (U+200D) are category Cf, so the plain category-C filter
    #    used to delete them, contradicting the documented guarantee that
    #    joiners are kept. They are now exempted alongside '\n'.
    if strip_control:
        keep = {"\n", "\u200c", "\u200d"}  # newline, ZWNJ, ZWJ
        s = "".join(
            ch for ch in s
            if ch in keep or unicodedata.category(ch)[0] != "C"
        )

    # 6) Normalize spaces (Tibetan rules included).
    s = normalize_spaces(s, collapse_internal_spaces=collapse_internal_spaces)

    # 7) Tibetan Unicode normalization.
    s = normalize_unicode(s)
    # No graphical distinction between U+0F0C and U+0F0B.
    s = s.replace("\u0f0c", "\u0f0b")
    # Double shad (U+0F0E) is just two shad.
    s = s.replace("\u0f0e", "\u0f0d\u0f0d")

    return s
125+
126+
127+
def _run_sanity_checks() -> None:
    """Lightweight checks exercising each normalization rule."""

    def _expect(label: str, actual: str, expected: str) -> None:
        # Fail loudly with both values so a regression is easy to read.
        if actual != expected:
            raise AssertionError(f"{label} failed: {actual!r} != {expected!r}")

    # Plain spacing: blank-line collapse plus trimming around newlines.
    _expect(
        "normalize_spaces basic spacing",
        normalize_spaces("a\n\n b \n c"),
        "a\nb\nc",
    )

    # Tibetan spacing: tsheg/initial and final/tsheg gaps are closed.
    sample = "\u0f0b \u0f40 \u0f66 \u0f0b"  # tsheg, initial, final, tsheg
    _expect(
        "normalize_spaces tibetan spacing",
        normalize_spaces(sample),
        "\u0f0b\u0f40 \u0f66\u0f0b",
    )

    # Full pipeline: line breaks, zero-width strip, space mapping, control
    # strip, and the Tibetan tweaks (0f0c -> 0f0b, 0f0e -> double shad).
    _expect(
        "normalize_corpus full pipeline",
        normalize_corpus("a\u00a0\u200b b\r\nc\u0f0c\u0f0e\u0001"),
        "a b\nc\u0f0b\u0f0d\u0f0d",
    )


if __name__ == "__main__":
    _run_sanity_checks()
    print("corpus_normalization sanity checks passed")

0 commit comments

Comments
 (0)