Skip to content

Commit 4540176

Browse files
committed
Added a translate call to remove non-spacing Unicode marks (category Mn) after lower-casing
1 parent 6ae4f14 commit 4540176

1 file changed

Lines changed: 5 additions & 1 deletion

File tree

src/error_align/utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
1+
import unicodedata
12
from dataclasses import dataclass
23
from enum import IntEnum
34
from itertools import chain, combinations
45

56
import regex as re
67
from unidecode import unidecode
78

9+
# Build a translation table that maps every Mn (non-spacing mark) code point to None, so that str.translate() deletes those characters
10+
# dict.fromkeys() leaves every value as None, which str.translate() treats as "delete this character".
_MN_TABLE = str.maketrans(
    dict.fromkeys(
        codepoint
        for codepoint in range(0x110000)  # the full Unicode code space, U+0000..U+10FFFF
        if unicodedata.category(chr(codepoint)) == "Mn"
    )
)
11+
812

913
class OpType(IntEnum):
1014
MATCH = 0
@@ -152,7 +156,7 @@ def basic_normalizer(text: str) -> str:
152156
str: The normalized text.
153157
154158
"""
155-
return text.lower()
159+
return text.lower().translate(_MN_TABLE)
156160

157161

158162
def ensure_length_preservation(normalizer: callable) -> callable:

0 commit comments

Comments
 (0)