Skip to content

Commit d572efa

Browse files
committed
Simplified fix for characters whose lowercase form gains a non-spacing mark (U+0130)
1 parent 5a8fe71 commit d572efa

2 files changed

Lines changed: 14 additions & 7 deletions

File tree

src/error_align/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
1-
import unicodedata
21
from dataclasses import dataclass
32
from enum import IntEnum
43
from itertools import chain, combinations
54

65
import regex as re
76
from unidecode import unidecode
87

9-
# Build a translation table that maps all Mn (non-spacing mark) code points to None
10-
_MN_TABLE = str.maketrans({cp: None for cp in range(0x110000) if unicodedata.category(chr(cp)) == "Mn"})
11-
128

139
class OpType(IntEnum):
1410
MATCH = 0
@@ -153,7 +149,11 @@ def basic_tokenizer(text: str) -> list:
153149

154150

155151
def basic_normalizer(text: str) -> str:
156-
"""Default normalizer that only converts text to lowercase.
152+
"""Default normalizer that converts text to lowercase.
153+
154+
U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain
155+
'I' before lowercasing to prevent the length-expanding decomposition that
156+
Python's str.lower() would otherwise produce ('i' + combining dot above).
157157
158158
Args:
159159
text (str): The input text to normalize.
@@ -162,7 +162,7 @@ def basic_normalizer(text: str) -> str:
162162
str: The normalized text.
163163
164164
"""
165-
return text.lower().translate(_MN_TABLE)
165+
return text.replace("\u0130", "I").lower()
166166

167167

168168
def ensure_length_preservation(normalizer: callable) -> callable:

tests/test_default.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
99
from error_align.error_align import prepare_graph_metadata
1010
from error_align.graph_metadata import SubgraphMetadata
11-
from error_align.utils import Alignment, OpType, categorize_char, ensure_length_preservation
11+
from error_align.utils import Alignment, OpType, basic_normalizer, categorize_char, ensure_length_preservation
1212

1313

1414
def test_error_align() -> None:
@@ -232,3 +232,10 @@ def bad_normalizer(text: str) -> str:
232232
raise AssertionError("Expected ValueError for length mismatch.")
233233
except ValueError:
234234
pass
235+
236+
237+
def test_basic_normalizer_dotted_capital_i() -> None:
238+
"""Regression test: U+0130 (İ) must not expand length when lowercased."""
239+
result = basic_normalizer("İstanbul")
240+
assert result == "istanbul"
241+
assert len(result) == len("İstanbul")

0 commit comments

Comments
 (0)