|
1 | 1 | import logging |
2 | 2 |
|
3 | 3 | from importlib import import_module |
4 | | -from re import Pattern, compile |
| 4 | +from re import Pattern |
| 5 | +from regex import compile |
5 | 6 | from unicodedata import normalize as precomp_normalize |
6 | 7 |
|
7 | 8 | from scriptshifter.exceptions import BREAK, CONT |
|
13 | 14 |
|
14 | 15 | logger = logging.getLogger(__name__) |
15 | 16 |
|
16 | | -WORD_PTN = compile(r"\w") |
17 | | -WB_PTN = compile(r"\W") |
| 17 | +# Beginning-of-word pattern. |
| 18 | +BOW_PTN = compile(r"(?<=[\p{P}\p{Z}]|^)[\p{L}\p{M}\p{S}]") |
| 19 | +# End-of-word pattern. |
| 20 | +EOW_PTN = compile(r"[\p{L}\p{M}\p{S}](?=[\p{P}\p{Z}]|$)") |
18 | 21 |
|
19 | 22 |
|
20 | 23 | class Transliterator: |
@@ -107,33 +110,10 @@ def normalize_src(self): |
107 | 110 | for nk, nv in norm_rules.items(): |
108 | 111 | self.src = self.src.replace(nk, nv) |
109 | 112 |
|
110 | | - return self.run_hook("post_normalize") |
111 | | - |
112 | | - def cur_at_bow(self, cur=None): |
113 | | - """ |
114 | | - Check if cursor is at the beginning of a word. |
115 | | -
|
116 | | - @param cur(int): Position to check. By default, the current cursor. |
117 | | - """ |
118 | | - if cur is None: |
119 | | - cur = self.cur |
120 | | - return ( |
121 | | - self.cur == 0 |
122 | | - or WB_PTN.match(self.src[cur - 1]) |
123 | | - ) and WORD_PTN.match(self.src[cur]) |
124 | | - |
125 | | - def cur_at_eow(self, cur=None): |
126 | | - """ |
127 | | - Check if cursor is at the end of a word. |
| 113 | + self.bow_coords = {m.span()[0] for m in BOW_PTN.finditer(self.src)} |
| 114 | + self.eow_coords = {m.span()[0] for m in EOW_PTN.finditer(self.src)} |
128 | 115 |
|
129 | | - @param cur(int): Position to check. By default, the current cursor. |
130 | | - """ |
131 | | - if cur is None: |
132 | | - cur = self.cur |
133 | | - return ( |
134 | | - cur == len(self.src) - 1 |
135 | | - or WB_PTN.match(self.src[cur + 1]) |
136 | | - ) and WORD_PTN.match(self.src[cur]) |
| 116 | + return self.run_hook("post_normalize") |
137 | 117 |
|
138 | 118 |
|
139 | 119 | def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): |
@@ -209,11 +189,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): |
209 | 189 | ctx.cur_flags = 0 |
210 | 190 |
|
211 | 191 | # Look for a word boundary and flag the beginning/end of the word if found. |
212 | | - if ctx.cur_at_bow(): |
| 192 | + if ctx.cur in ctx.bow_coords: |
213 | 193 | # Beginning of word. |
214 | 194 | logger.debug(f"Beginning of word at position {ctx.cur}.") |
215 | 195 | ctx.cur_flags |= BOW |
216 | | - if ctx.cur_at_eow(): |
| 196 | + if ctx.cur in ctx.eow_coords: |
217 | 197 | # End of word. |
218 | 198 | logger.debug(f"End of word at position {ctx.cur}.") |
219 | 199 | ctx.cur_flags |= EOW |
@@ -319,7 +299,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): |
319 | 299 | # Can't rely on EOW flag, we must check on the last |
320 | 300 | # character of the potential match. |
321 | 301 | ctx.src_tk.flags & EOW |
322 | | - and not ctx.cur_at_eow(ctx.cur + step - 1) |
| 302 | + and ctx.cur + step - 1 not in ctx.eow_coords |
323 | 303 | ) |
324 | 304 | ): |
325 | 305 | continue |
|
0 commit comments