Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions src/error_align/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ def is_vowel(c: str) -> bool:

"""
assert len(c) == 1, "Input must be a single character."
return unidecode(c)[0] in "aeiouy"
decode_char = unidecode(c)
if len(decode_char) == 0:
return False
return decode_char[0] in "aeiouy"
Comment thread
borgholt marked this conversation as resolved.


def is_consonant(c: str) -> bool:
Expand All @@ -107,7 +110,10 @@ def is_consonant(c: str) -> bool:

"""
assert len(c) == 1, "Input must be a single character."
return unidecode(c)[0] in "bcdfghjklmnpqrstvwxyz"
decode_char = unidecode(c)
if len(decode_char) == 0:
return False
return decode_char[0] in "bcdfghjklmnpqrstvwxyz"
Comment thread
borgholt marked this conversation as resolved.


def categorize_char(c: str) -> int:
Expand Down Expand Up @@ -143,7 +149,11 @@ def basic_tokenizer(text: str) -> list:


def basic_normalizer(text: str) -> str:
"""Default normalizer that only converts text to lowercase.
"""Default normalizer that converts text to lowercase.

U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain
'I' before lowercasing to prevent the length-expanding decomposition that
Python's str.lower() would otherwise produce ('i' + combining dot above).

Args:
text (str): The input text to normalize.
Expand All @@ -152,7 +162,7 @@ def basic_normalizer(text: str) -> str:
str: The normalized text.

"""
return text.lower()
return text.replace("\u0130", "I").lower()
Comment thread
borgholt marked this conversation as resolved.


def ensure_length_preservation(normalizer: callable) -> callable:
Expand Down
24 changes: 23 additions & 1 deletion tests/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,15 @@
from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
from error_align.error_align import prepare_graph_metadata
from error_align.graph_metadata import SubgraphMetadata
from error_align.utils import Alignment, OpType, categorize_char, ensure_length_preservation
from error_align.utils import (
Alignment,
OpType,
basic_normalizer,
categorize_char,
ensure_length_preservation,
is_consonant,
is_vowel,
)


def test_error_align() -> None:
Expand Down Expand Up @@ -232,3 +240,17 @@ def bad_normalizer(text: str) -> str:
raise AssertionError("Expected ValueError for length mismatch.")
except ValueError:
pass


def test_is_vowel_and_is_consonant_with_empty_unidecode() -> None:
    """Regression test: a character that unidecodes to '' is neither vowel nor consonant.

    Both predicates must return False instead of crashing when the
    transliteration of the input character is empty.
    """
    # U+0300 (combining grave accent) unidecodes to an empty string.
    combining_grave = "\u0300"
    for predicate in (is_vowel, is_consonant):
        assert predicate(combining_grave) is False


def test_basic_normalizer_dotted_capital_i() -> None:
    """Regression test: normalizing text containing U+0130 (İ) must not change its length."""
    source = "İstanbul"
    normalized = basic_normalizer(source)
    # Check the exact lowercase form first, then that no extra code points were introduced.
    assert normalized == "istanbul"
    assert len(normalized) == len(source)
Loading