Fix: Unicode lower-casing does not preserve length (#19)

borgholt · web-flow · commit b1ac420d81a1 · 2026-04-02T11:23:13.000+02:00
* Added a translate call to remove non-spacing Unicode characters after lower-casing

* Added a fallback to False in the character-classification utility functions for characters that transliterate to the empty string

* Simplified fix for non-spacing lower-casing characters

* Added tests for empty unidecode chars
diff --git a/src/error_align/utils.py b/src/error_align/utils.py
@@ -93,7 +93,10 @@ def is_vowel(c: str) -> bool:
 
     """
     assert len(c) == 1, "Input must be a single character."
-    return unidecode(c)[0] in "aeiouy"
+    decode_char = unidecode(c)
+    if len(decode_char) == 0:
+        return False
+    return decode_char[0] in "aeiouy"
 
 
 def is_consonant(c: str) -> bool:
@@ -107,7 +110,10 @@ def is_consonant(c: str) -> bool:
 
     """
     assert len(c) == 1, "Input must be a single character."
-    return unidecode(c)[0] in "bcdfghjklmnpqrstvwxyz"
+    decode_char = unidecode(c)
+    if len(decode_char) == 0:
+        return False
+    return decode_char[0] in "bcdfghjklmnpqrstvwxyz"
 
 
 def categorize_char(c: str) -> int:
@@ -143,7 +149,11 @@ def basic_tokenizer(text: str) -> list:
 
 
 def basic_normalizer(text: str) -> str:
-    """Default normalizer that only converts text to lowercase.
+    """Default normalizer that converts text to lowercase.
+
+    U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain
+    'I' before lowercasing to prevent the length-expanding decomposition that
+    Python's str.lower() would otherwise produce ('i' + combining dot above).
 
     Args:
         text (str): The input text to normalize.
@@ -152,7 +162,7 @@ def basic_normalizer(text: str) -> str:
         str: The normalized text.
 
     """
-    return text.lower()
+    return text.replace("\u0130", "I").lower()
 
 
 def ensure_length_preservation(normalizer: callable) -> callable:
diff --git a/tests/test_default.py b/tests/test_default.py
@@ -8,7 +8,15 @@
 from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
 from error_align.error_align import prepare_graph_metadata
 from error_align.graph_metadata import SubgraphMetadata
-from error_align.utils import Alignment, OpType, categorize_char, ensure_length_preservation
+from error_align.utils import (
+    Alignment,
+    OpType,
+    basic_normalizer,
+    categorize_char,
+    ensure_length_preservation,
+    is_consonant,
+    is_vowel,
+)
 
 
 def test_error_align() -> None:
@@ -232,3 +240,17 @@ def bad_normalizer(text: str) -> str:
         raise AssertionError("Expected ValueError for length mismatch.")
     except ValueError:
         pass
+
+
+def test_is_vowel_and_is_consonant_with_empty_unidecode() -> None:
+    """Regression test: characters that unidecode to '' must return False instead of crashing."""
+    # U+0300 (combining grave accent) unidecodes to an empty string
+    assert is_vowel("\u0300") is False
+    assert is_consonant("\u0300") is False
+
+
+def test_basic_normalizer_dotted_capital_i() -> None:
+    """Regression test: U+0130 (İ) must not expand length when lowercased."""
+    result = basic_normalizer("İstanbul")
+    assert result == "istanbul"
+    assert len(result) == len("İstanbul")