Simplified fix for characters whose lowercase form adds a non-spacing mark (U+0130)

borgholt · borgholt · commit d572efaf2f8a · 2026-03-30T11:46:09.000+02:00
diff --git a/src/error_align/utils.py b/src/error_align/utils.py
@@ -1,14 +1,10 @@
-import unicodedata
 from dataclasses import dataclass
 from enum import IntEnum
 from itertools import chain, combinations
 
 import regex as re
 from unidecode import unidecode
 
-# Build a translation table that maps all Mn (non-spacing mark) code points to None
-_MN_TABLE = str.maketrans({cp: None for cp in range(0x110000) if unicodedata.category(chr(cp)) == "Mn"})
-
 
 class OpType(IntEnum):
     MATCH = 0
@@ -153,7 +149,11 @@ def basic_tokenizer(text: str) -> list:
 
 
 def basic_normalizer(text: str) -> str:
-    """Default normalizer that only converts text to lowercase.
+    """Default normalizer that converts text to lowercase.
+
+    U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain
+    'I' before lowercasing to prevent the length-expanding decomposition that
+    Python's str.lower() would otherwise produce ('i' + combining dot above).
 
     Args:
         text (str): The input text to normalize.
@@ -162,7 +162,7 @@ def basic_normalizer(text: str) -> str:
         str: The normalized text.
 
     """
-    return text.lower().translate(_MN_TABLE)
+    return text.replace("\u0130", "I").lower()
 
 
 def ensure_length_preservation(normalizer: callable) -> callable:
diff --git a/tests/test_default.py b/tests/test_default.py
@@ -8,7 +8,7 @@
 from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
 from error_align.error_align import prepare_graph_metadata
 from error_align.graph_metadata import SubgraphMetadata
-from error_align.utils import Alignment, OpType, categorize_char, ensure_length_preservation
+from error_align.utils import Alignment, OpType, basic_normalizer, categorize_char, ensure_length_preservation
 
 
 def test_error_align() -> None:
@@ -232,3 +232,10 @@ def bad_normalizer(text: str) -> str:
         raise AssertionError("Expected ValueError for length mismatch.")
     except ValueError:
         pass
+
+
+def test_basic_normalizer_dotted_capital_i() -> None:
+    """Regression test: U+0130 (İ) must not expand length when lowercased."""
+    result = basic_normalizer("İstanbul")
+    assert result == "istanbul"
+    assert len(result) == len("İstanbul")