Skip to content

Commit b1ac420

Browse files
authored
Fix: Unicode lower-casing does not preserve length (#19)
* Added a translate call to remove non-spacing Unicode characters after lower-casing
* Added fallback to False for characters that transliterate to the empty string in character classification utils functions
* Simplified fix for non-spacing lower-casing characters
* Added tests for empty unidecode chars
1 parent 6ae4f14 commit b1ac420

2 files changed

Lines changed: 37 additions & 5 deletions

File tree

src/error_align/utils.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,10 @@ def is_vowel(c: str) -> bool:
9393
9494
"""
9595
assert len(c) == 1, "Input must be a single character."
96-
return unidecode(c)[0] in "aeiouy"
96+
decode_char = unidecode(c)
97+
if len(decode_char) == 0:
98+
return False
99+
return decode_char[0] in "aeiouy"
97100

98101

99102
def is_consonant(c: str) -> bool:
@@ -107,7 +110,10 @@ def is_consonant(c: str) -> bool:
107110
108111
"""
109112
assert len(c) == 1, "Input must be a single character."
110-
return unidecode(c)[0] in "bcdfghjklmnpqrstvwxyz"
113+
decode_char = unidecode(c)
114+
if len(decode_char) == 0:
115+
return False
116+
return decode_char[0] in "bcdfghjklmnpqrstvwxyz"
111117

112118

113119
def categorize_char(c: str) -> int:
@@ -143,7 +149,11 @@ def basic_tokenizer(text: str) -> list:
143149

144150

145151
def basic_normalizer(text: str) -> str:
146-
"""Default normalizer that only converts text to lowercase.
152+
"""Default normalizer that converts text to lowercase.
153+
154+
U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain
155+
'I' before lowercasing to prevent the length-expanding decomposition that
156+
Python's str.lower() would otherwise produce ('i' + combining dot above).
147157
148158
Args:
149159
text (str): The input text to normalize.
@@ -152,7 +162,7 @@ def basic_normalizer(text: str) -> str:
152162
str: The normalized text.
153163
154164
"""
155-
return text.lower()
165+
return text.replace("\u0130", "I").lower()
156166

157167

158168
def ensure_length_preservation(normalizer: callable) -> callable:

tests/test_default.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,15 @@
88
from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
99
from error_align.error_align import prepare_graph_metadata
1010
from error_align.graph_metadata import SubgraphMetadata
11-
from error_align.utils import Alignment, OpType, categorize_char, ensure_length_preservation
11+
from error_align.utils import (
12+
Alignment,
13+
OpType,
14+
basic_normalizer,
15+
categorize_char,
16+
ensure_length_preservation,
17+
is_consonant,
18+
is_vowel,
19+
)
1220

1321

1422
def test_error_align() -> None:
@@ -232,3 +240,17 @@ def bad_normalizer(text: str) -> str:
232240
raise AssertionError("Expected ValueError for length mismatch.")
233241
except ValueError:
234242
pass
243+
244+
245+
def test_is_vowel_and_is_consonant_with_empty_unidecode() -> None:
246+
"""Regression test: characters that unidecode to '' must return False instead of crashing."""
247+
# U+0300 (combining grave accent) unidecodes to an empty string
248+
assert is_vowel("\u0300") is False
249+
assert is_consonant("\u0300") is False
250+
251+
252+
def test_basic_normalizer_dotted_capital_i() -> None:
253+
"""Regression test: U+0130 (İ) must not expand length when lowercased."""
254+
result = basic_normalizer("İstanbul")
255+
assert result == "istanbul"
256+
assert len(result) == len("İstanbul")

0 commit comments

Comments
 (0)