Skip to content

Commit c5fa891

Browse files
committed
WIP precomposed tests.
1 parent 44b2e05 commit c5fa891

5 files changed

Lines changed: 176 additions & 51 deletions

File tree

scriptshifter/trans.py

Lines changed: 50 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,45 @@ def __enter__(self):
6464
def __exit__(self, exc_type, exc_value, traceback):
6565
self.conn.close()
6666

67+
def run_hook(self, hname):
68+
ret = None
69+
for hook_def in self.hooks.get(hname, []):
70+
fn = getattr(
71+
import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
72+
hook_def["fn_name"]
73+
)
74+
ret = fn(self, **hook_def["kwargs"])
75+
if ret in (BREAK, CONT):
76+
# This will stop running further hook functions and tell the caller to
77+
# break out of the outer loop or skip iteration.
78+
return ret
79+
80+
return ret
81+
82+
def normalize_src(self):
83+
"""
84+
Normalize source text according to rules.
85+
86+
NOTE: this manipulates the protected source attribute so it may not
87+
correspond to the originally provided source.
88+
"""
89+
norm_rules = get_lang_normalize(self.conn, self.lang_id)
90+
# Normalize precomposed Unicode characters.
91+
#
92+
# In using diacritics, LC standards prefer the decomposed form
93+
# (combining diacritic + base character) to the pre-composed form
94+
# (single Unicode symbol for the letter with diacritic).
95+
#
96+
# Note: only safe for R2S.
97+
if self.t_dir == FEAT_R2S:
98+
logger.debug("Normalizing pre-composed symbols.")
99+
self._src = precomp_normalize("NFD", self.src)
100+
101+
for nk, nv in norm_rules.items():
102+
self._src = self.src.replace(nk, nv)
103+
104+
return self.run_hook("post_normalize")
105+
67106

68107
def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
69108
"""
@@ -118,12 +157,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
118157

119158
# This hook may take over the whole transliteration process or delegate
120159
# it to some external process, and return the output string directly.
121-
if _run_hook("post_config", ctx) == BREAK:
160+
if ctx.run_hook("post_config") == BREAK:
122161
return getattr(ctx, "dest", ""), ctx.warnings
123162

124-
# _normalize_src returns the results of the post_normalize hook.
125-
if _normalize_src(
126-
ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
163+
# ctx.normalize_src returns the results of the post_normalize hook.
164+
if ctx.normalize_src() == BREAK:
127165
return getattr(ctx, "dest", ""), ctx.warnings
128166

129167
logger.debug(f"Normalized source: {ctx.src}")
@@ -151,7 +189,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
151189

152190
# This hook may skip the parsing of the current
153191
# token or exit the scanning loop altogether.
154-
hret = _run_hook("begin_input_token", ctx)
192+
hret = ctx.run_hook("begin_input_token")
155193
if hret == BREAK:
156194
logger.debug("Breaking text scanning from hook signal.")
157195
break
@@ -165,7 +203,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
165203
while True:
166204
ctx.ignoring = False
167205
for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id):
168-
hret = _run_hook("pre_ignore_token", ctx)
206+
hret = ctx.run_hook("pre_ignore_token")
169207
if hret == BREAK:
170208
break
171209
if hret == CONT:
@@ -187,7 +225,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
187225

188226
if _matching:
189227
# The position matches an ignore token.
190-
hret = _run_hook("on_ignore_match", ctx)
228+
hret = ctx.run_hook("on_ignore_match")
191229
if hret == BREAK:
192230
break
193231
if hret == CONT:
@@ -221,7 +259,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
221259
ctx.match = False
222260

223261
for ctx.src_tk, ctx.dest_str in lang_map:
224-
hret = _run_hook("pre_tx_token", ctx)
262+
hret = ctx.run_hook("pre_tx_token")
225263
if hret == BREAK:
226264
break
227265
if hret == CONT:
@@ -262,7 +300,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
262300
ctx.match = True
263301
# This hook may skip this token or break out of the token
264302
# lookup for the current position.
265-
hret = _run_hook("on_tx_token_match", ctx)
303+
hret = ctx.run_hook("on_tx_token_match")
266304
if hret == BREAK:
267305
break
268306
if hret == CONT:
@@ -300,7 +338,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
300338

301339
if ctx.match is False:
302340
delattr(ctx, "match")
303-
hret = _run_hook("on_no_tx_token_match", ctx)
341+
hret = ctx.run_hook("on_no_tx_token_match")
304342
if hret == BREAK:
305343
break
306344
if hret == CONT:
@@ -320,7 +358,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
320358

321359
# This hook may take care of the assembly and cause the function to
322360
# return its own return value.
323-
hret = _run_hook("pre_assembly", ctx)
361+
hret = ctx.run_hook("pre_assembly")
324362
if hret is not None:
325363
return hret, ctx.warnings
326364

@@ -329,7 +367,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
329367

330368
# This hook may reassign the output string and/or cause the function to
331369
# return it immediately.
332-
hret = _run_hook("post_assembly", ctx)
370+
hret = ctx.run_hook("post_assembly")
333371
if hret is not None:
334372
return hret, ctx.warnings
335373

@@ -339,30 +377,6 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
339377
return ctx.dest, ctx.warnings
340378

341379

342-
def _normalize_src(ctx, norm_rules):
343-
"""
344-
Normalize source text according to rules.
345-
346-
NOTE: this manipluates the protected source attribute so it may not
347-
correspond to the originally provided source.
348-
"""
349-
# Normalize precomposed Unicode characters.
350-
#
351-
# In using diacritics, LC standards prefer the decomposed form (combining
352-
# diacritic + base character) to the pre-composed form (single Unicode
353-
# symbol for the letter with diacritic).
354-
#
355-
# Note: only safe for R2S.
356-
if ctx.t_dir == FEAT_R2S:
357-
logger.debug("Normalizing pre-composed symbols.")
358-
ctx._src = precomp_normalize("NFD", ctx.src)
359-
360-
for nk, nv in norm_rules.items():
361-
ctx._src = ctx.src.replace(nk, nv)
362-
363-
return _run_hook("post_normalize", ctx)
364-
365-
366380
def _is_bow(cur, ctx, word_boundary):
367381
return (cur == 0 or ctx.src[cur - 1] in word_boundary) and (
368382
ctx.src[cur] not in word_boundary)
@@ -373,18 +387,3 @@ def _is_eow(cur, ctx, word_boundary):
373387
cur == len(ctx.src) - 1
374388
or ctx.src[cur + 1] in word_boundary
375389
) and (ctx.src[cur] not in word_boundary)
376-
377-
378-
def _run_hook(hname, ctx):
379-
ret = None
380-
for hook_def in ctx.hooks.get(hname, []):
381-
fn = getattr(
382-
import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
383-
hook_def["fn_name"])
384-
ret = fn(ctx, **hook_def["kwargs"])
385-
if ret in (BREAK, CONT):
386-
# This will stop parsing hooks functions and tell the caller to
387-
# break out of the outer loop or skip iteration.
388-
return ret
389-
390-
return ret
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
À / À, Á / Á, Â / Â, Ã / Ã, Ā / Ā, Ă / Ă, Ȧ / Ȧ, Ä / Ä, Å / Å, Æ / Æ, Ç / Ç, È / È, É / É, Ê / Ê, Ē / Ē, Ĕ / Ĕ, Ė / Ė, Ë / Ë, Ì / Ì, Í / Í, Î / Î, Ĩ / Ĩ, Ī / Ī, Ĭ / Ĭ, İ / İ, Ï / Ï, Ð / Ð, Ñ / Ñ, Ò / Ò, Ó / Ó, Ô / Ô, Õ / Õ, Ō / Ō, Ŏ / Ŏ, Ȯ / Ȯ, Ö / Ö, Ø / Ø, Ù / Ù, Ú / Ú, Û / Û, Ũ / Ũ, Ū / Ū, Ŭ / Ŭ, Ü / Ü, Ý / Ý, Ÿ / Ÿ, Þ / Þ, ß / ß, à / à, á / á, â / â, ã / ã, ā / ā, ă / ă, ä / ä, å / å, æ / æ, ç / ç, è / è, é / é, ê / ê, ē / ē, ĕ / ĕ, ė / ė, ë / ë, ì / ì, í / í, î / î, ī / ī, ĭ / ĭ, ï / ï, ð / ð, ñ / n, ò / ò, ó / ó, ô / ô, õ / õ, ō / ō, ŏ / ŏ, ȯ / ȯ, ö / ö, ø / ø, ù / ù, ú / ú, û / û, ū / ū, ŭ / ŭ, ü / ü, ý / ý, þ / þ, ÿ / ÿ, Ġ / Ġ, ġ / ġ, Ś / Ś, ś / ś, 

test/data/precomp_samples.csv

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
À,
2+
Á,
3+
Â,
4+
Ã,
5+
Ā,
6+
Ă,
7+
Ȧ,
8+
Ä,
9+
Å,
10+
Æ,Æ
11+
Ç,
12+
È,
13+
É,
14+
Ê,
15+
Ē,
16+
Ĕ,
17+
Ė,
18+
Ë,
19+
Ì,
20+
Í,
21+
Î,
22+
Ĩ,
23+
Ī,
24+
Ĭ,
25+
İ,
26+
Ï,
27+
Ð,Ð
28+
Ñ,
29+
Ò,
30+
Ó,
31+
Ô,
32+
Õ,
33+
Ō,
34+
Ŏ,
35+
Ȯ,
36+
Ö,
37+
Ø,Ø
38+
Ù,
39+
Ú,
40+
Û,
41+
Ũ,
42+
Ū,
43+
Ŭ,
44+
Ü,
45+
Ý,
46+
Ÿ,
47+
Þ,Þ
48+
ß,ß
49+
à,
50+
á,
51+
â,
52+
ã,
53+
ā,
54+
ă,
55+
ä,
56+
å,
57+
æ,æ
58+
ç,
59+
è,
60+
é,
61+
ê,
62+
ē,
63+
ĕ,
64+
ė,
65+
ë,
66+
ì,
67+
í,
68+
î,
69+
ī,
70+
ĭ,
71+
ï,
72+
ð,ð
73+
ñ,
74+
ò,
75+
ó,
76+
ô,
77+
õ,
78+
ō,
79+
ŏ,
80+
ȯ,
81+
ö,
82+
ø,ø
83+
ù,
84+
ú,
85+
û,
86+
ū,
87+
ŭ,
88+
ü,
89+
ý,
90+
þ,þ
91+
ÿ,
92+
Ġ,
93+
ġ,
94+
Ś,
95+
ś,
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from csv import reader
2+
from os import environ, path, unlink
3+
from unittest import TestCase
4+
5+
from scriptshifter.trans import Context, FEAT_R2S
6+
from scriptshifter.tables import init_db
7+
8+
from test import TEST_DATA_DIR
9+
10+
11+
def setUpModule():
12+
init_db()
13+
14+
15+
def tearDownModule():
16+
unlink(environ["TXL_DB_PATH"])
17+
18+
19+
class TestNormalization(TestCase):
20+
""" Source normalization tests. """
21+
22+
def test_norm_decompose_r2s(self):
23+
with open(path.join(
24+
TEST_DATA_DIR, "precomp_samples.csv"), newline="") as fh:
25+
data = reader(fh)
26+
27+
for precomp, decomp in data:
28+
with Context("rot3", precomp, FEAT_R2S, {}) as ctx:
29+
ctx.normalize_src()
30+
self.assertEqual(ctx.src, decomp)

0 commit comments

Comments
 (0)