Skip to content

Commit 1e480d7

Browse files
authored
Merge pull request #208 from lcnetdev/precomp_tests
Precomp tests
2 parents 741e626 + aa924fb commit 1e480d7

6 files changed

Lines changed: 224 additions & 81 deletions

File tree

scriptshifter/trans.py

Lines changed: 96 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@
1717
logger = logging.getLogger(__name__)
1818

1919

20-
class Context:
20+
class Transliterator:
2121
"""
22-
Context used within the transliteration and passed to hook functions.
22+
Context carrying the state of transliteration process.
2323
2424
Use within a `with` block for proper cleanup.
2525
"""
@@ -35,6 +35,10 @@ def src(self):
3535
def src(self):
3636
raise NotImplementedError("Attribute is read-only.")
3737

38+
@property
39+
def cur_char(self):
40+
return self.src[self.cur]
41+
3842
def __init__(self, lang, src, t_dir, options={}):
3943
"""
4044
Initialize a context.
@@ -64,6 +68,72 @@ def __enter__(self):
6468
def __exit__(self, exc_type, exc_value, traceback):
6569
self.conn.close()
6670

71+
def run_hook(self, hname):
72+
ret = None
73+
for hook_def in self.hooks.get(hname, []):
74+
fn = getattr(
75+
import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
76+
hook_def["fn_name"]
77+
)
78+
ret = fn(self, **hook_def["kwargs"])
79+
if ret in (BREAK, CONT):
80+
# This will stop parsing hook functions and tell the caller to
81+
# break out of the outer loop or skip iteration.
82+
return ret
83+
84+
return ret
85+
86+
def normalize_src(self):
87+
"""
88+
Normalize source text according to rules.
89+
90+
NOTE: this manipulates the protected source attribute so it may not
91+
correspond to the originally provided source.
92+
"""
93+
# Normalize precomposed Unicode characters.
94+
#
95+
# In using diacritics, LC standards prefer the decomposed form
96+
# (combining diacritic + base character) to the pre-composed form
97+
# (single Unicode symbol for the letter with diacritic).
98+
#
99+
# Note: only safe for R2S.
100+
if self.t_dir == FEAT_R2S:
101+
logger.debug("Normalizing pre-composed symbols.")
102+
self._src = precomp_normalize("NFD", self.src)
103+
104+
norm_rules = get_lang_normalize(self.conn, self.lang_id)
105+
106+
for nk, nv in norm_rules.items():
107+
self._src = self.src.replace(nk, nv)
108+
109+
return self.run_hook("post_normalize")
110+
111+
def cur_at_bow(self, cur=None):
112+
"""
113+
Check if cursor is at the beginning of a word.
114+
115+
@param cur(int): Position to check. By default, the current cursor.
116+
"""
117+
if cur is None:
118+
cur = self.cur
119+
return (
120+
self.cur == 0
121+
or self.src[cur - 1] in WORD_BOUNDARY
122+
) and (self.src[cur] not in WORD_BOUNDARY)
123+
124+
def cur_at_eow(self, cur=None):
125+
"""
126+
Check if cursor is at the end of a word.
127+
128+
@param cur(int): Position to check. By default, the current cursor.
129+
"""
130+
if cur is None:
131+
cur = self.cur
132+
return (
133+
cur == len(self.src) - 1
134+
or self.src[cur + 1] in WORD_BOUNDARY
135+
) and (self.src[cur] not in WORD_BOUNDARY)
136+
67137

68138
def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
69139
"""
@@ -101,7 +171,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
101171

102172
src = src.strip()
103173
options["capitalize"] = capitalize
104-
with Context(lang, src, t_dir, options) as ctx:
174+
with Transliterator(lang, src, t_dir, options) as ctx:
105175

106176
if t_dir == FEAT_S2R and not ctx.general["has_s2r"]:
107177
raise NotImplementedError(
@@ -118,12 +188,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
118188

119189
# This hook may take over the whole transliteration process or delegate
120190
# it to some external process, and return the output string directly.
121-
if _run_hook("post_config", ctx) == BREAK:
191+
if ctx.run_hook("post_config") == BREAK:
122192
return getattr(ctx, "dest", ""), ctx.warnings
123193

124-
# _normalize_src returns the results of the post_normalize hook.
125-
if _normalize_src(
126-
ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
194+
# ctx.normalize_src returns the results of the post_normalize hook.
195+
if ctx.normalize_src() == BREAK:
127196
return getattr(ctx, "dest", ""), ctx.warnings
128197

129198
logger.debug(f"Normalized source: {ctx.src}")
@@ -137,21 +206,20 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
137206
# Reset cursor position flags.
138207
# Carry over extended "beginning of word" flag.
139208
ctx.cur_flags = 0
140-
cur_char = ctx.src[ctx.cur]
141209

142210
# Look for a word boundary and flag word beginning/end if found.
143-
if _is_bow(ctx.cur, ctx, WORD_BOUNDARY):
211+
if ctx.cur_at_bow():
144212
# Beginning of word.
145213
logger.debug(f"Beginning of word at position {ctx.cur}.")
146214
ctx.cur_flags |= BOW
147-
if _is_eow(ctx.cur, ctx, WORD_BOUNDARY):
215+
if ctx.cur_at_eow():
148216
# End of word.
149217
logger.debug(f"End of word at position {ctx.cur}.")
150218
ctx.cur_flags |= EOW
151219

152220
# This hook may skip the parsing of the current
153221
# token or exit the scanning loop altogether.
154-
hret = _run_hook("begin_input_token", ctx)
222+
hret = ctx.run_hook("begin_input_token")
155223
if hret == BREAK:
156224
logger.debug("Breaking text scanning from hook signal.")
157225
break
@@ -165,7 +233,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
165233
while True:
166234
ctx.ignoring = False
167235
for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id):
168-
hret = _run_hook("pre_ignore_token", ctx)
236+
hret = ctx.run_hook("pre_ignore_token")
169237
if hret == BREAK:
170238
break
171239
if hret == CONT:
@@ -187,7 +255,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
187255

188256
if _matching:
189257
# The position matches an ignore token.
190-
hret = _run_hook("on_ignore_match", ctx)
258+
hret = ctx.run_hook("on_ignore_match")
191259
if hret == BREAK:
192260
break
193261
if hret == CONT:
@@ -202,7 +270,6 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
202270
ctx.ignoring = False
203271
break
204272

205-
cur_char = ctx.src[ctx.cur]
206273
ctx.ignoring = True
207274
break
208275
# We looked through all ignore tokens, not found any. Move on.
@@ -221,7 +288,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
221288
ctx.match = False
222289

223290
for ctx.src_tk, ctx.dest_str in lang_map:
224-
hret = _run_hook("pre_tx_token", ctx)
291+
hret = ctx.run_hook("pre_tx_token")
225292
if hret == BREAK:
226293
break
227294
if hret == CONT:
@@ -237,7 +304,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
237304
# point value) than the current character, then break the loop
238305
# without a match, because we know there won't be any more
239306
# match due to the alphabetical ordering.
240-
if ctx.src_tk.content[0] > cur_char:
307+
if ctx.src_tk.content[0] > ctx.cur_char:
241308
logger.debug(
242309
f"{ctx.src_tk.content} is after "
243310
f"{ctx.src[ctx.cur:ctx.cur + step]}. "
@@ -247,11 +314,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
247314
# If src_tk has a WB flag but the token is not at WB, skip.
248315
if (
249316
(ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
250-
or
251-
# Can't rely on EOW flag, we must check on the last
252-
# character of the potential match.
253-
(ctx.src_tk.flags & EOW and not _is_eow(
254-
ctx.cur + step - 1, ctx, WORD_BOUNDARY))
317+
or (
318+
# Can't rely on EOW flag, we must check on the last
319+
# character of the potential match.
320+
ctx.src_tk.flags & EOW
321+
and not ctx.cur_at_eow(ctx.cur + step - 1)
322+
)
255323
):
256324
continue
257325

@@ -262,7 +330,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
262330
ctx.match = True
263331
# This hook may skip this token or break out of the token
264332
# lookup for the current position.
265-
hret = _run_hook("on_tx_token_match", ctx)
333+
hret = ctx.run_hook("on_tx_token_match")
266334
if hret == BREAK:
267335
break
268336
if hret == CONT:
@@ -300,17 +368,18 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
300368

301369
if ctx.match is False:
302370
delattr(ctx, "match")
303-
hret = _run_hook("on_no_tx_token_match", ctx)
371+
hret = ctx.run_hook("on_no_tx_token_match")
304372
if hret == BREAK:
305373
break
306374
if hret == CONT:
307375
continue
308376

309377
# No match found. Copy non-mapped character (one at a time).
310378
logger.info(
311-
f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
379+
f"Token {ctx.cur_char} "
380+
f"(\\u{hex(ord(ctx.cur_char))[2:]}) "
312381
f"at position {ctx.cur} is not mapped.")
313-
ctx.dest_ls.append(cur_char)
382+
ctx.dest_ls.append(ctx.cur_char)
314383
ctx.cur += 1
315384
else:
316385
delattr(ctx, "match")
@@ -320,7 +389,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
320389

321390
# This hook may take care of the assembly and cause the function to
322391
# return its own return value.
323-
hret = _run_hook("pre_assembly", ctx)
392+
hret = ctx.run_hook("pre_assembly")
324393
if hret is not None:
325394
return hret, ctx.warnings
326395

@@ -329,62 +398,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
329398

330399
# This hook may reassign the output string and/or cause the function to
331400
# return it immediately.
332-
hret = _run_hook("post_assembly", ctx)
401+
hret = ctx.run_hook("post_assembly")
333402
if hret is not None:
334403
return hret, ctx.warnings
335404

336405
# Strip multiple spaces and leading/trailing whitespace.
337406
ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
338407

339408
return ctx.dest, ctx.warnings
340-
341-
342-
def _normalize_src(ctx, norm_rules):
343-
"""
344-
Normalize source text according to rules.
345-
346-
NOTE: this manipulates the protected source attribute so it may not
347-
correspond to the originally provided source.
348-
"""
349-
# Normalize precomposed Unicode characters.
350-
#
351-
# In using diacritics, LC standards prefer the decomposed form (combining
352-
# diacritic + base character) to the pre-composed form (single Unicode
353-
# symbol for the letter with diacritic).
354-
#
355-
# Note: only safe for R2S.
356-
if ctx.t_dir == FEAT_R2S:
357-
logger.debug("Normalizing pre-composed symbols.")
358-
ctx._src = precomp_normalize("NFD", ctx.src)
359-
360-
for nk, nv in norm_rules.items():
361-
ctx._src = ctx.src.replace(nk, nv)
362-
363-
return _run_hook("post_normalize", ctx)
364-
365-
366-
def _is_bow(cur, ctx, word_boundary):
367-
return (cur == 0 or ctx.src[cur - 1] in word_boundary) and (
368-
ctx.src[cur] not in word_boundary)
369-
370-
371-
def _is_eow(cur, ctx, word_boundary):
372-
return (
373-
cur == len(ctx.src) - 1
374-
or ctx.src[cur + 1] in word_boundary
375-
) and (ctx.src[cur] not in word_boundary)
376-
377-
378-
def _run_hook(hname, ctx):
379-
ret = None
380-
for hook_def in ctx.hooks.get(hname, []):
381-
fn = getattr(
382-
import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
383-
hook_def["fn_name"])
384-
ret = fn(ctx, **hook_def["kwargs"])
385-
if ret in (BREAK, CONT):
386-
# This will stop parsing hook functions and tell the caller to
387-
# break out of the outer loop or skip iteration.
388-
return ret
389-
390-
return ret

0 commit comments

Comments
 (0)