from dataclasses import dataclass
from enum import IntEnum
from itertools import chain, combinations

import regex as re
from unidecode import unidecode


class OpType(IntEnum):
    MATCH = 0
    INSERT = 1
    DELETE = 2
    SUBSTITUTE = 3


@dataclass
class Alignment:
    """Class representing a single alignment operation between a reference and a hypothesis segment."""

    op_type: OpType
    ref_slice: slice | None = None
    hyp_slice: slice | None = None
    ref: str | None = None
    hyp: str | None = None
    left_compound: bool = False
    right_compound: bool = False

    def __post_init__(self):
        if self.op_type == OpType.MATCH:
            if self.ref is None or self.hyp is None:
                raise ValueError("MATCH operation must have non-empty ref and hyp.")
            if self.left_compound or self.right_compound:
                raise ValueError("MATCH operation cannot have compound markers.")
        elif self.op_type == OpType.INSERT:
            if self.hyp is None or self.ref is not None:
                raise ValueError("INSERT operation must have non-empty hyp and empty ref.")
        elif self.op_type == OpType.DELETE:
            if self.hyp is not None or self.ref is None:
                raise ValueError("DELETE operation must have non-empty ref and empty hyp.")
        elif self.op_type == OpType.SUBSTITUTE:
            if self.ref is None or self.hyp is None:
                raise ValueError("SUBSTITUTE operation must have both ref and hyp.")

    @property
    def hyp_with_compound_markers(self) -> str | None:
        """Return the quoted hypothesis with compound markers if applicable."""
        if self.hyp is None:
            return None
        return f'{"-" if self.left_compound else ""}"{self.hyp}"{"-" if self.right_compound else ""}'

    def __repr__(self) -> str:
        if self.op_type == OpType.DELETE:
            return f'Alignment({self.op_type.name}: "{self.ref}")'
        if self.op_type == OpType.INSERT:
            return f"Alignment({self.op_type.name}: {self.hyp_with_compound_markers})"
        if self.op_type == OpType.SUBSTITUTE:
            return f'Alignment({self.op_type.name}: {self.hyp_with_compound_markers} -> "{self.ref}")'
        return f'Alignment({self.op_type.name}: "{self.hyp}" == "{self.ref}")'
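
# Illustrative usage sketch (not part of the original module): constructing a
# substitution and relying on the custom repr above, e.g.
#   >>> Alignment(OpType.SUBSTITUTE, ref="color", hyp="colour")
#   Alignment(SUBSTITUTE: "colour" -> "color")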


def op_type_powerset() -> chain:
    """Generate all possible combinations of operation types, except the empty set.

    Returns:
        Generator: All possible combinations of operation types.
    """
    op_types = list(OpType)
    op_combinations = [combinations(op_types, r) for r in range(1, len(op_types) + 1)]
    return chain.from_iterable(op_combinations)
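
# Illustrative (not part of the original module): with 4 operation types there
# are 2**4 - 1 = 15 non-empty combinations, e.g.
#   >>> combos = list(op_type_powerset())
#   >>> len(combos)
#   15
#   >>> combos[0]
#   (<OpType.MATCH: 0>,)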

START_DELIMITER = "<"
END_DELIMITER = ">"
DELIMITERS = {START_DELIMITER, END_DELIMITER}

OP_TYPE_MAP = {op_type.value: op_type for op_type in OpType}
OP_TYPE_COMBO_MAP = {i: op_types for i, op_types in enumerate(op_type_powerset())}
OP_TYPE_COMBO_MAP_INV = {v: k for k, v in OP_TYPE_COMBO_MAP.items()}

NUMERIC_TOKEN = r"\p{N}+([,.]\p{N}+)*(?=\s|$)"
STANDARD_TOKEN = r"[\p{L}\p{N}]+(['][\p{L}\p{N}]+)*'?"


def is_vowel(c: str) -> bool:
    """Check if the normalized character is a vowel.

    Args:
        c (str): The character to check.

    Returns:
        bool: True if the character is a vowel, False otherwise.
    """
    assert len(c) == 1, "Input must be a single character."
    decode_char = unidecode(c)
    if len(decode_char) == 0:
        return False
    return decode_char[0] in "aeiouy"


def is_consonant(c: str) -> bool:
    """Check if the normalized character is a consonant.

    Args:
        c (str): The character to check.

    Returns:
        bool: True if the character is a consonant, False otherwise.
    """
    assert len(c) == 1, "Input must be a single character."
    decode_char = unidecode(c)
    if len(decode_char) == 0:
        return False
    return decode_char[0] in "bcdfghjklmnpqrstvwxyz"
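
# Illustrative (not part of the original module): both predicates transliterate
# first, so accented characters are classified by their ASCII equivalent, e.g.
#   >>> is_vowel("é"), is_consonant("ç")
#   (True, True)
#   >>> is_vowel("y"), is_consonant("y")  # "y" appears in both character sets
#   (True, True)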


def categorize_char(c: str) -> int:
    """Categorize a character as a delimiter, consonant, vowel, or unvoiced character.

    Args:
        c (str): The character to categorize.

    Returns:
        int: The category code of the character (0: delimiter, 1: consonant, 2: vowel, 3: unvoiced).
    """
    if c in DELIMITERS:
        return 0
    if is_consonant(c):
        return 1
    if is_vowel(c):
        return 2
    return 3  # NOTE: Unvoiced characters (only apostrophes are expected by default).
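
# Illustrative (not part of the original module): one character per category,
# in the order the checks run, e.g.
#   >>> [categorize_char(c) for c in "<ba'"]
#   [0, 1, 2, 3]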


def basic_tokenizer(text: str) -> list:
    """Default tokenizer that splits text into numeric and standard word tokens using regex patterns.

    Args:
        text (str): The input text to tokenize.

    Returns:
        list: A list of regex Match objects, one per token.
    """
    return list(re.finditer(rf"({NUMERIC_TOKEN})|({STANDARD_TOKEN})", text, re.UNICODE | re.VERBOSE))
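
# Illustrative (not part of the original module): numbers with internal
# separators tokenize as a single NUMERIC_TOKEN when followed by whitespace
# or end of string, e.g.
#   >>> [m.group() for m in basic_tokenizer("It's 3.14 exactly")]
#   ["It's", '3.14', 'exactly']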


def basic_normalizer(text: str) -> str:
    """Default normalizer that converts text to lowercase.

    U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain
    'I' before lowercasing to prevent the length-expanding decomposition that
    Python's str.lower() would otherwise produce ('i' + combining dot above).

    Args:
        text (str): The input text to normalize.

    Returns:
        str: The normalized text.
    """
    return text.replace("\u0130", "I").lower()
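
# Illustrative (not part of the original module): without the replacement,
# lowercasing U+0130 expands one character into two, e.g.
#   >>> len("\u0130".lower()), len(basic_normalizer("\u0130"))
#   (2, 1)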


def ensure_length_preservation(normalizer: callable) -> callable:
    """Decorator to ensure that the normalizer preserves the length of the input text.

    Args:
        normalizer (callable): The normalizer function to wrap.

    Returns:
        callable: The wrapped normalizer function that preserves length.
    """

    def wrapper(text: str, *args, **kwargs) -> str:
        normalized = normalizer(text, *args, **kwargs)
        if len(normalized) != len(text):
            raise ValueError("Normalizer must preserve length.")
        return normalized

    return wrapper
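
# Illustrative (not part of the original module): the decorator rejects
# normalizers that change the number of characters, e.g.
#   >>> safe = ensure_length_preservation(basic_normalizer)
#   >>> safe("ABC")
#   'abc'
#   >>> ensure_length_preservation(str.strip)("  x ")  # raises ValueError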


def unpack_regex_match(tokenizer: callable) -> callable:
    """Unpack a regex Match object to extract the matched string.

    Args:
        tokenizer (callable): A function to tokenize the sequences. Must be regex-based and return Match objects.

    Returns:
        callable: A function that unpacks a list of Match objects into tuples of (matched string, span).
    """

    def wrapper(text: str, *args, **kwargs) -> list[tuple[str, tuple[int, int]]]:
        matches = tokenizer(text, *args, **kwargs)
        return [(match.group(), match.span()) for match in matches]

    return wrapper
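
# Illustrative (not part of the original module): pairing the default tokenizer
# with the unpacking wrapper yields (token, span) tuples, e.g.
#   >>> unpack_regex_match(basic_tokenizer)("ab cd")
#   [('ab', (0, 2)), ('cd', (3, 5))]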


def translate_slice(segment_slice: slice, index_map: list[int]) -> None | slice:
    """Translate a slice from the alignment sequence back to the original sequence.

    Args:
        segment_slice (slice): The slice in the alignment sequence.
        index_map (list[int]): The mapping from alignment indices to original sequence indices.

    Returns:
        None | slice: The translated slice in the original sequence, or None if no valid indices.
    """
    slice_indices = index_map[segment_slice]
    slice_indices = list(filter(lambda x: x >= 0, slice_indices))
    if len(slice_indices) == 0:
        return None
    start, end = int(slice_indices[0]), int(slice_indices[-1] + 1)
    return slice(start, end)
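
# Illustrative (not part of the original module): negative entries in the index
# map mark alignment positions with no counterpart in the original sequence and
# are skipped, e.g.
#   >>> translate_slice(slice(1, 4), [0, 1, -1, 2, 3])
#   slice(1, 3)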