diff --git a/src/error_align/utils.py b/src/error_align/utils.py
index 4917481..5b8a4ad 100644
--- a/src/error_align/utils.py
+++ b/src/error_align/utils.py
@@ -93,7 +93,11 @@ def is_vowel(c: str) -> bool:
 
     """
     assert len(c) == 1, "Input must be a single character."
-    return unidecode(c)[0] in "aeiouy"
+    # unidecode maps some characters (e.g. combining marks) to "", so guard the index.
+    decoded = unidecode(c)
+    if not decoded:
+        return False
+    return decoded[0] in "aeiouy"
 
 
 def is_consonant(c: str) -> bool:
@@ -107,7 +111,11 @@ def is_consonant(c: str) -> bool:
 
     """
     assert len(c) == 1, "Input must be a single character."
-    return unidecode(c)[0] in "bcdfghjklmnpqrstvwxyz"
+    # unidecode maps some characters (e.g. combining marks) to "", so guard the index.
+    decoded = unidecode(c)
+    if not decoded:
+        return False
+    return decoded[0] in "bcdfghjklmnpqrstvwxyz"
 
 
 def categorize_char(c: str) -> int:
@@ -143,7 +151,11 @@ def basic_tokenizer(text: str) -> list:
 
 
 def basic_normalizer(text: str) -> str:
-    """Default normalizer that only converts text to lowercase.
+    """Default normalizer that converts text to lowercase.
+
+    U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain
+    'I' before lowercasing to prevent the length-expanding decomposition that
+    Python's str.lower() would otherwise produce ('i' + combining dot above).
 
     Args:
         text (str): The input text to normalize.
@@ -152,7 +164,7 @@ def basic_normalizer(text: str) -> str:
         str: The normalized text.
 
     """
-    return text.lower()
+    return text.replace("\u0130", "I").lower()
 
 
 def ensure_length_preservation(normalizer: callable) -> callable:
diff --git a/tests/test_default.py b/tests/test_default.py
index 19e8239..d33952c 100644
--- a/tests/test_default.py
+++ b/tests/test_default.py
@@ -8,7 +8,15 @@
 from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
 from error_align.error_align import prepare_graph_metadata
 from error_align.graph_metadata import SubgraphMetadata
-from error_align.utils import Alignment, OpType, categorize_char, ensure_length_preservation
+from error_align.utils import (
+    Alignment,
+    OpType,
+    basic_normalizer,
+    categorize_char,
+    ensure_length_preservation,
+    is_consonant,
+    is_vowel,
+)
 
 
 def test_error_align() -> None:
@@ -232,3 +240,17 @@ def bad_normalizer(text: str) -> str:
         raise AssertionError("Expected ValueError for length mismatch.")
     except ValueError:
         pass
+
+
+def test_is_vowel_and_is_consonant_with_empty_unidecode() -> None:
+    """Regression test: characters that unidecode to '' must return False instead of crashing."""
+    # U+0300 (combining grave accent) unidecodes to an empty string
+    assert is_vowel("\u0300") is False
+    assert is_consonant("\u0300") is False
+
+
+def test_basic_normalizer_dotted_capital_i() -> None:
+    """Regression test: U+0130 (İ) must not expand length when lowercased."""
+    result = basic_normalizer("İstanbul")
+    assert result == "istanbul"
+    assert len(result) == len("İstanbul")