Skip to content

Commit f4813c7

Browse files
committed
Use Transliterator as transliteration state class.
1 parent c5fa891 commit f4813c7

2 files changed

Lines changed: 49 additions & 30 deletions

File tree

scriptshifter/trans.py

Lines changed: 47 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -17,9 +17,9 @@
1717
logger = logging.getLogger(__name__)
1818

1919

20-
class Context:
20+
class Transliterator:
2121
"""
22-
Context used within the transliteration and passed to hook functions.
22+
Context carrying the state of transliteration process.
2323
2424
Use within a `with` block for proper cleanup.
2525
"""
@@ -35,6 +35,10 @@ def src(self):
3535
def src(self):
3636
raise NotImplementedError("Attribute is read-only.")
3737

38+
@property
39+
def cur_char(self):
40+
return self.src[self.cur]
41+
3842
def __init__(self, lang, src, t_dir, options={}):
3943
"""
4044
Initialize a context.
@@ -86,7 +90,6 @@ def normalize_src(self):
8690
NOTE: this manipulates the protected source attribute so it may not
8791
correspond to the originally provided source.
8892
"""
89-
norm_rules = get_lang_normalize(self.conn, self.lang_id)
9093
# Normalize precomposed Unicode characters.
9194
#
9295
# In using diacritics, LC standards prefer the decomposed form
@@ -98,11 +101,39 @@ def normalize_src(self):
98101
logger.debug("Normalizing pre-composed symbols.")
99102
self._src = precomp_normalize("NFD", self.src)
100103

104+
norm_rules = get_lang_normalize(self.conn, self.lang_id)
105+
101106
for nk, nv in norm_rules.items():
102107
self._src = self.src.replace(nk, nv)
103108

104109
return self.run_hook("post_normalize")
105110

111+
def cur_at_bow(self, cur=None):
112+
"""
113+
Check if cursor is at the beginning of a word.
114+
115+
@param cur(int): Position to check. By default, the current cursor.
116+
"""
117+
if cur is None:
118+
cur = self.cur
119+
return (
120+
self.cur == 0
121+
or self.src[cur - 1] in WORD_BOUNDARY
122+
) and (self.src[cur] not in WORD_BOUNDARY)
123+
124+
def cur_at_eow(self, cur=None):
125+
"""
126+
Check if cursor is at the end of a word.
127+
128+
@param cur(int): Position to check. By default, the current cursor.
129+
"""
130+
if cur is None:
131+
cur = self.cur
132+
return (
133+
cur == len(self.src) - 1
134+
or self.src[cur + 1] in WORD_BOUNDARY
135+
) and (self.src[cur] not in WORD_BOUNDARY)
136+
106137

107138
def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
108139
"""
@@ -140,7 +171,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
140171

141172
src = src.strip()
142173
options["capitalize"] = capitalize
143-
with Context(lang, src, t_dir, options) as ctx:
174+
with Transliterator(lang, src, t_dir, options) as ctx:
144175

145176
if t_dir == FEAT_S2R and not ctx.general["has_s2r"]:
146177
raise NotImplementedError(
@@ -175,14 +206,13 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
175206
# Reset cursor position flags.
176207
# Carry over extended "beginning of word" flag.
177208
ctx.cur_flags = 0
178-
cur_char = ctx.src[ctx.cur]
179209

180210
# Look for a word boundary and flag word beginning/end it if found.
181-
if _is_bow(ctx.cur, ctx, WORD_BOUNDARY):
211+
if ctx.cur_at_bow():
182212
# Beginning of word.
183213
logger.debug(f"Beginning of word at position {ctx.cur}.")
184214
ctx.cur_flags |= BOW
185-
if _is_eow(ctx.cur, ctx, WORD_BOUNDARY):
215+
if ctx.cur_at_eow():
186216
# End of word.
187217
logger.debug(f"End of word at position {ctx.cur}.")
188218
ctx.cur_flags |= EOW
@@ -240,7 +270,6 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
240270
ctx.ignoring = False
241271
break
242272

243-
cur_char = ctx.src[ctx.cur]
244273
ctx.ignoring = True
245274
break
246275
# We looked through all ignore tokens, not found any. Move on.
@@ -275,7 +304,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
275304
# point value) than the current character, then break the loop
276305
# without a match, because we know there won't be any more
277306
# match due to the alphabetical ordering.
278-
if ctx.src_tk.content[0] > cur_char:
307+
if ctx.src_tk.content[0] > ctx.cur_char:
279308
logger.debug(
280309
f"{ctx.src_tk.content} is after "
281310
f"{ctx.src[ctx.cur:ctx.cur + step]}. "
@@ -285,11 +314,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
285314
# If src_tk has a WB flag but the token is not at WB, skip.
286315
if (
287316
(ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
288-
or
289-
# Can't rely on EOW flag, we must check on the last
290-
# character of the potential match.
291-
(ctx.src_tk.flags & EOW and not _is_eow(
292-
ctx.cur + step - 1, ctx, WORD_BOUNDARY))
317+
or (
318+
# Can't rely on EOW flag, we must check on the last
319+
# character of the potential match.
320+
ctx.src_tk.flags & EOW
321+
and not ctx.cur_at_eow(ctx.cur + step - 1)
322+
)
293323
):
294324
continue
295325

@@ -346,9 +376,10 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
346376

347377
# No match found. Copy non-mapped character (one at a time).
348378
logger.info(
349-
f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
379+
f"Token {ctx.cur_char} "
380+
f"(\\u{hex(ord(ctx.cur_char))[2:]}) "
350381
f"at position {ctx.cur} is not mapped.")
351-
ctx.dest_ls.append(cur_char)
382+
ctx.dest_ls.append(ctx.cur_char)
352383
ctx.cur += 1
353384
else:
354385
delattr(ctx, "match")
@@ -375,15 +406,3 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
375406
ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
376407

377408
return ctx.dest, ctx.warnings
378-
379-
380-
def _is_bow(cur, ctx, word_boundary):
381-
return (cur == 0 or ctx.src[cur - 1] in word_boundary) and (
382-
ctx.src[cur] not in word_boundary)
383-
384-
385-
def _is_eow(cur, ctx, word_boundary):
386-
return (
387-
cur == len(ctx.src) - 1
388-
or ctx.src[cur + 1] in word_boundary
389-
) and (ctx.src[cur] not in word_boundary)

test/unittest/test04_normalization.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
from os import environ, path, unlink
33
from unittest import TestCase
44

5-
from scriptshifter.trans import Context, FEAT_R2S
5+
from scriptshifter.trans import Transliterator, FEAT_R2S
66
from scriptshifter.tables import init_db
77

88
from test import TEST_DATA_DIR
@@ -25,6 +25,6 @@ def test_norm_decompose_r2s(self):
2525
data = reader(fh)
2626

2727
for precomp, decomp in data:
28-
with Context("rot3", precomp, FEAT_R2S, {}) as ctx:
28+
with Transliterator("rot3", precomp, FEAT_R2S, {}) as ctx:
2929
ctx.normalize_src()
3030
self.assertEqual(ctx.src, decomp)

0 commit comments

Comments (0)