We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 6ae4f14 commit 4540176 · Copy full SHA for 4540176
1 file changed
src/error_align/utils.py
@@ -1,10 +1,14 @@
1
+import unicodedata
2
from dataclasses import dataclass
3
from enum import IntEnum
4
from itertools import chain, combinations
5
6
import regex as re
7
from unidecode import unidecode
8
9
+# Translation table that deletes every Unicode non-spacing mark (category Mn): str.translate drops code points mapped to None
10
+_MN_TABLE = str.maketrans({cp: None for cp in range(0x110000) if unicodedata.category(chr(cp)) == "Mn"})
11
+
12
13
class OpType(IntEnum):
14
MATCH = 0
@@ -152,7 +156,7 @@ def basic_normalizer(text: str) -> str:
152
156
str: The normalized text.
153
157
154
158
"""
155
- return text.lower()
159
+ return text.lower().translate(_MN_TABLE)
160
161
162
def ensure_length_preservation(normalizer: callable) -> callable:
0 commit comments