WIP precomposed tests.

scossu · scossu · commit c5fa891622a8 · 2025-04-02T09:01:44.000-04:00
diff --git a/scriptshifter/trans.py b/scriptshifter/trans.py
@@ -64,6 +64,45 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self.conn.close()
 
+    def run_hook(self, hname):
+        ret = None
+        for hook_def in self.hooks.get(hname, []):
+            fn = getattr(
+                import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
+                hook_def["fn_name"]
+            )
+            ret = fn(self, **hook_def["kwargs"])
+            if ret in (BREAK, CONT):
+                # This will stop parsing hooks functions and tell the caller to
+                # break out of the outer loop or skip iteration.
+                return ret
+
+        return ret
+
+    def normalize_src(self):
+        """
+        Normalize source text according to rules.
+
+        NOTE: this manipulates the protected source attribute so it may not
+        correspond to the originally provided source.
+        """
+        norm_rules = get_lang_normalize(self.conn, self.lang_id)
+        # Normalize precomposed Unicode characters.
+        #
+        # In using diacritics, LC standards prefer the decomposed form
+        # (combining diacritic + base character) to the pre-composed form
+        # (single Unicode symbol for the letter with diacritic).
+        #
+        # Note: only safe for R2S.
+        if self.t_dir == FEAT_R2S:
+            logger.debug("Normalizing pre-composed symbols.")
+            self._src = precomp_normalize("NFD", self.src)
+
+        for nk, nv in norm_rules.items():
+            self._src = self.src.replace(nk, nv)
+
+        return self.run_hook("post_normalize")
+
 
 def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     """
@@ -118,12 +157,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
         # This hook may take over the whole transliteration process or delegate
         # it to some external process, and return the output string directly.
-        if _run_hook("post_config", ctx) == BREAK:
+        if ctx.run_hook("post_config") == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
-        # _normalize_src returns the results of the post_normalize hook.
-        if _normalize_src(
-                ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
+        # ctx.normalize_src returns the results of the post_normalize hook.
+        if ctx.normalize_src() == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
         logger.debug(f"Normalized source: {ctx.src}")
@@ -151,7 +189,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
             # This hook may skip the parsing of the current
             # token or exit the scanning loop altogether.
-            hret = _run_hook("begin_input_token", ctx)
+            hret = ctx.run_hook("begin_input_token")
             if hret == BREAK:
                 logger.debug("Breaking text scanning from hook signal.")
                 break
@@ -165,7 +203,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             while True:
                 ctx.ignoring = False
                 for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id):
-                    hret = _run_hook("pre_ignore_token", ctx)
+                    hret = ctx.run_hook("pre_ignore_token")
                     if hret == BREAK:
                         break
                     if hret == CONT:
@@ -187,7 +225,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
                     if _matching:
                         # The position matches an ignore token.
-                        hret = _run_hook("on_ignore_match", ctx)
+                        hret = ctx.run_hook("on_ignore_match")
                         if hret == BREAK:
                             break
                         if hret == CONT:
@@ -221,7 +259,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             ctx.match = False
 
             for ctx.src_tk, ctx.dest_str in lang_map:
-                hret = _run_hook("pre_tx_token", ctx)
+                hret = ctx.run_hook("pre_tx_token")
                 if hret == BREAK:
                     break
                 if hret == CONT:
@@ -262,7 +300,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                     ctx.match = True
                     # This hook may skip this token or break out of the token
                     # lookup for the current position.
-                    hret = _run_hook("on_tx_token_match", ctx)
+                    hret = ctx.run_hook("on_tx_token_match")
                     if hret == BREAK:
                         break
                     if hret == CONT:
@@ -300,7 +338,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
             if ctx.match is False:
                 delattr(ctx, "match")
-                hret = _run_hook("on_no_tx_token_match", ctx)
+                hret = ctx.run_hook("on_no_tx_token_match")
                 if hret == BREAK:
                     break
                 if hret == CONT:
@@ -320,7 +358,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
         # This hook may take care of the assembly and cause the function to
         # return its own return value.
-        hret = _run_hook("pre_assembly", ctx)
+        hret = ctx.run_hook("pre_assembly")
         if hret is not None:
             return hret, ctx.warnings
 
@@ -329,7 +367,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
         # This hook may reassign the output string and/or cause the function to
         # return it immediately.
-        hret = _run_hook("post_assembly", ctx)
+        hret = ctx.run_hook("post_assembly")
         if hret is not None:
             return hret, ctx.warnings
 
@@ -339,30 +377,6 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
         return ctx.dest, ctx.warnings
 
 
-def _normalize_src(ctx, norm_rules):
-    """
-    Normalize source text according to rules.
-
-    NOTE: this manipluates the protected source attribute so it may not
-    correspond to the originally provided source.
-    """
-    # Normalize precomposed Unicode characters.
-    #
-    # In using diacritics, LC standards prefer the decomposed form (combining
-    # diacritic + base character) to the pre-composed form (single Unicode
-    # symbol for the letter with diacritic).
-    #
-    # Note: only safe for R2S.
-    if ctx.t_dir == FEAT_R2S:
-        logger.debug("Normalizing pre-composed symbols.")
-        ctx._src = precomp_normalize("NFD", ctx.src)
-
-    for nk, nv in norm_rules.items():
-        ctx._src = ctx.src.replace(nk, nv)
-
-    return _run_hook("post_normalize", ctx)
-
-
 def _is_bow(cur, ctx, word_boundary):
     return (cur == 0 or ctx.src[cur - 1] in word_boundary) and (
             ctx.src[cur] not in word_boundary)
@@ -373,18 +387,3 @@ def _is_eow(cur, ctx, word_boundary):
         cur == len(ctx.src) - 1
         or ctx.src[cur + 1] in word_boundary
     ) and (ctx.src[cur] not in word_boundary)
-
-
-def _run_hook(hname, ctx):
-    ret = None
-    for hook_def in ctx.hooks.get(hname, []):
-        fn = getattr(
-                import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
-                hook_def["fn_name"])
-        ret = fn(ctx, **hook_def["kwargs"])
-        if ret in (BREAK, CONT):
-            # This will stop parsing hooks functions and tell the caller to
-            # break out of the outer loop or skip iteration.
-            return ret
-
-    return ret
diff --git a/test/data/precomp_samples-single_line.csv b/test/data/precomp_samples-single_line.csv
@@ -0,0 +1 @@
+À / À, Á / Á, Â / Â, Ã / Ã, Ā / Ā, Ă / Ă, Ȧ / Ȧ, Ä / Ä, Å / Å, Æ / Æ, Ç / Ç, È / È, É / É, Ê / Ê, Ē / Ē, Ĕ / Ĕ, Ė / Ė, Ë / Ë, Ì / Ì, Í / Í, Î / Î, Ĩ / Ĩ, Ī / Ī, Ĭ / Ĭ, İ / İ, Ï / Ï, Ð / Ð, Ñ / Ñ, Ò / Ò, Ó / Ó, Ô / Ô, Õ / Õ, Ō / Ō, Ŏ / Ŏ, Ȯ / Ȯ, Ö / Ö, Ø / Ø, Ù / Ù, Ú / Ú, Û / Û, Ũ / Ũ, Ū / Ū, Ŭ / Ŭ, Ü / Ü, Ý / Ý, Ÿ / Ÿ, Þ / Þ, ß / ß, à / à, á / á, â / â, ã / ã, ā / ā, ă / ă, ä / ä, å / å, æ / æ, ç / ç, è / è, é / é, ê / ê, ē / ē, ĕ / ĕ, ė / ė, ë / ë, ì / ì, í / í, î / î, ī / ī, ĭ / ĭ, ï / ï, ð / ð, ñ / n, ò / ò, ó / ó, ô / ô, õ / õ, ō / ō, ŏ / ŏ, ȯ / ȯ, ö / ö, ø / ø, ù / ù, ú / ú, û / û, ū / ū, ŭ / ŭ, ü / ü, ý / ý, þ / þ, ÿ / ÿ, Ġ / Ġ, ġ / ġ, Ś / Ś, ś / ś, 
diff --git a/test/data/precomp_samples.csv b/test/data/precomp_samples.csv
@@ -0,0 +1,95 @@
+À,À
+Á,Á
+Â,Â
+Ã,Ã
+Ā,Ā
+Ă,Ă
+Ȧ,Ȧ
+Ä,Ä
+Å,Å
+Æ,Æ
+Ç,Ç
+È,È
+É,É
+Ê,Ê
+Ē,Ē
+Ĕ,Ĕ
+Ė,Ė
+Ë,Ë
+Ì,Ì
+Í,Í
+Î,Î
+Ĩ,Ĩ
+Ī,Ī
+Ĭ,Ĭ
+İ,İ
+Ï,Ï
+Ð,Ð
+Ñ,Ñ
+Ò,Ò
+Ó,Ó
+Ô,Ô
+Õ,Õ
+Ō,Ō
+Ŏ,Ŏ
+Ȯ,Ȯ
+Ö,Ö
+Ø,Ø
+Ù,Ù
+Ú,Ú
+Û,Û
+Ũ,Ũ
+Ū,Ū
+Ŭ,Ŭ
+Ü,Ü
+Ý,Ý
+Ÿ,Ÿ
+Þ,Þ
+ß,ß
+à,à
+á,á
+â,â
+ã,ã
+ā,ā
+ă,ă
+ä,ä
+å,å
+æ,æ
+ç,ç
+è,è
+é,é
+ê,ê
+ē,ē
+ĕ,ĕ
+ė,ė
+ë,ë
+ì,ì
+í,í
+î,î
+ī,ī
+ĭ,ĭ
+ï,ï
+ð,ð
+ñ,ñ
+ò,ò
+ó,ó
+ô,ô
+õ,õ
+ō,ō
+ŏ,ŏ
+ȯ,ȯ
+ö,ö
+ø,ø
+ù,ù
+ú,ú
+û,û
+ū,ū
+ŭ,ŭ
+ü,ü
+ý,ý
+þ,þ
+ÿ,ÿ
+Ġ,Ġ
+ġ,ġ
+Ś,Ś
+ś,ś
diff --git a/test/unittest/test04_normalization.py b/test/unittest/test04_normalization.py
@@ -0,0 +1,30 @@
+from csv import reader
+from os import environ, path, unlink
+from unittest import TestCase
+
+from scriptshifter.trans import Context, FEAT_R2S
+from scriptshifter.tables import init_db
+
+from test import TEST_DATA_DIR
+
+
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
+
+
+class TestNormalization(TestCase):
+    """ Source normalization tests. """
+
+    def test_norm_decompose_r2s(self):
+        with open(path.join(
+                TEST_DATA_DIR, "precomp_samples.csv"), newline="") as fh:
+            data = reader(fh)
+
+            for precomp, decomp in data:
+                with Context("rot3", precomp, FEAT_R2S, {}) as ctx:
+                    ctx.normalize_src()
+                    self.assertEqual(ctx.src, decomp)
diff --git a/test/unittest/test05_rest_api.py b/test/unittest/test05_rest_api.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+À / À, Á / Á, Â / Â, Ã / Ã, Ā / Ā, Ă / Ă, Ȧ / Ȧ, Ä / Ä, Å / Å, Æ / Æ, Ç / Ç, È / È, É / É, Ê / Ê, Ē / Ē, Ĕ / Ĕ, Ė / Ė, Ë / Ë, Ì / Ì, Í / Í, Î / Î, Ĩ / Ĩ, Ī / Ī, Ĭ / Ĭ, İ / İ, Ï / Ï, Ð / Ð, Ñ / Ñ, Ò / Ò, Ó / Ó, Ô / Ô, Õ / Õ, Ō / Ō, Ŏ / Ŏ, Ȯ / Ȯ, Ö / Ö, Ø / Ø, Ù / Ù, Ú / Ú, Û / Û, Ũ / Ũ, Ū / Ū, Ŭ / Ŭ, Ü / Ü, Ý / Ý, Ÿ / Ÿ, Þ / Þ, ß / ß, à / à, á / á, â / â, ã / ã, ā / ā, ă / ă, ä / ä, å / å, æ / æ, ç / ç, è / è, é / é, ê / ê, ē / ē, ĕ / ĕ, ė / ė, ë / ë, ì / ì, í / í, î / î, ī / ī, ĭ / ĭ, ï / ï, ð / ð, ñ / n, ò / ò, ó / ó, ô / ô, õ / õ, ō / ō, ŏ / ŏ, ȯ / ȯ, ö / ö, ø / ø, ù / ù, ú / ú, û / û, ū / ū, ŭ / ŭ, ü / ü, ý / ý, þ / þ, ÿ / ÿ, Ġ / Ġ, ġ / ġ, Ś / Ś, ś / ś,
-Original file line number
+Diff line change
@@ @@ -0,0 +1,95 @@ @@
 +À,À
 +Á,Á
 +Â,Â
 +Ã,Ã
 +Ā,Ā
 +Ă,Ă
 +Ȧ,Ȧ
 +Ä,Ä
 +Å,Å
 +Æ,Æ
 +Ç,Ç
 +È,È
 +É,É
 +Ê,Ê
 +Ē,Ē
 +Ĕ,Ĕ
 +Ė,Ė
 +Ë,Ë
 +Ì,Ì
 +Í,Í
 +Î,Î
 +Ĩ,Ĩ
 +Ī,Ī
 +Ĭ,Ĭ
 +İ,İ
 +Ï,Ï
 +Ð,Ð
 +Ñ,Ñ
 +Ò,Ò
 +Ó,Ó
 +Ô,Ô
 +Õ,Õ
 +Ō,Ō
 +Ŏ,Ŏ
 +Ȯ,Ȯ
 +Ö,Ö
 +Ø,Ø
 +Ù,Ù
 +Ú,Ú
 +Û,Û
 +Ũ,Ũ
 +Ū,Ū
 +Ŭ,Ŭ
 +Ü,Ü
 +Ý,Ý
 +Ÿ,Ÿ
 +Þ,Þ
 +ß,ß
 +à,à
 +á,á
 +â,â
 +ã,ã
 +ā,ā
 +ă,ă
 +ä,ä
 +å,å
 +æ,æ
 +ç,ç
 +è,è
 +é,é
 +ê,ê
 +ē,ē
 +ĕ,ĕ
 +ė,ė
 +ë,ë
 +ì,ì
 +í,í
 +î,î
 +ī,ī
 +ĭ,ĭ
 +ï,ï
 +ð,ð
 +ñ,ñ
 +ò,ò
 +ó,ó
 +ô,ô
 +õ,õ
 +ō,ō
 +ŏ,ŏ
 +ȯ,ȯ
 +ö,ö
 +ø,ø
 +ù,ù
 +ú,ú
 +û,û
 +ū,ū
 +ŭ,ŭ
 +ü,ü
 +ý,ý
 +þ,þ
 +ÿ,ÿ
 +Ġ,Ġ
 +ġ,ġ
 +Ś,Ś
 +ś,ś