1717logger = logging .getLogger (__name__ )
1818
1919
20- class Context :
20+ class Transliterator :
2121 """
22- Context used within the transliteration and passed to hook functions .
22+ Context carrying the state of transliteration process .
2323
2424 Use within a `with` block for proper cleanup.
2525 """
@@ -35,6 +35,10 @@ def src(self):
3535 def src (self ):
3636 raise NotImplementedError ("Attribute is read-only." )
3737
38+ @property
39+ def cur_char (self ):
40+ return self .src [self .cur ]
41+
3842 def __init__ (self , lang , src , t_dir , options = {}):
3943 """
4044 Initialize a context.
@@ -86,7 +90,6 @@ def normalize_src(self):
8690 NOTE: this manipulates the protected source attribute so it may not
8791 correspond to the originally provided source.
8892 """
89- norm_rules = get_lang_normalize (self .conn , self .lang_id )
9093 # Normalize precomposed Unicode characters.
9194 #
9295 # In using diacritics, LC standards prefer the decomposed form
@@ -98,11 +101,39 @@ def normalize_src(self):
98101 logger .debug ("Normalizing pre-composed symbols." )
99102 self ._src = precomp_normalize ("NFD" , self .src )
100103
104+ norm_rules = get_lang_normalize (self .conn , self .lang_id )
105+
101106 for nk , nv in norm_rules .items ():
102107 self ._src = self .src .replace (nk , nv )
103108
104109 return self .run_hook ("post_normalize" )
105110
def cur_at_bow(self, cur=None):
    """
    Check if cursor is at the beginning of a word.

    A position counts as the beginning of a word when the character at
    that position is not a word boundary, and the position is either the
    first one in the source or is preceded by a word boundary character.

    @param cur(int): Position to check. By default, the current cursor.

    @return bool: True if the checked position is at the beginning of a
    word.
    """
    if cur is None:
        cur = self.cur
    # Test the requested position rather than the live cursor: when an
    # explicit `cur` is given it may differ from `self.cur`, and comparing
    # the wrong one would let `cur - 1` wrap around to the last character.
    return (
        cur == 0
        or self.src[cur - 1] in WORD_BOUNDARY
    ) and (self.src[cur] not in WORD_BOUNDARY)
123+
def cur_at_eow(self, cur=None):
    """
    Tell whether a position is the last character of a word.

    The position qualifies when its own character is not a word boundary
    and it is either the final character of the source or is immediately
    followed by a word boundary character.

    @param cur(int): Position to check. By default, the current cursor.

    @return bool: True if the checked position is at the end of a word.
    """
    pos = self.cur if cur is None else cur
    text = self.src

    # Neither the last character nor followed by a boundary: not a word end.
    if pos != len(text) - 1 and text[pos + 1] not in WORD_BOUNDARY:
        return False

    # A boundary character itself is never the end of a word.
    return text[pos] not in WORD_BOUNDARY
136+
106137
107138def transliterate (src , lang , t_dir = "s2r" , capitalize = False , options = {}):
108139 """
@@ -140,7 +171,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
140171
141172 src = src .strip ()
142173 options ["capitalize" ] = capitalize
143- with Context (lang , src , t_dir , options ) as ctx :
174+ with Transliterator (lang , src , t_dir , options ) as ctx :
144175
145176 if t_dir == FEAT_S2R and not ctx .general ["has_s2r" ]:
146177 raise NotImplementedError (
@@ -175,14 +206,13 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
175206 # Reset cursor position flags.
176207 # Carry over extended "beginning of word" flag.
177208 ctx .cur_flags = 0
178- cur_char = ctx .src [ctx .cur ]
179209
180210 # Look for a word boundary and flag word beginning/end it if found.
181- if _is_bow ( ctx .cur , ctx , WORD_BOUNDARY ):
211+ if ctx .cur_at_bow ( ):
182212 # Beginning of word.
183213 logger .debug (f"Beginning of word at position { ctx .cur } ." )
184214 ctx .cur_flags |= BOW
185- if _is_eow ( ctx .cur , ctx , WORD_BOUNDARY ):
215+ if ctx .cur_at_eow ( ):
186216 # End of word.
187217 logger .debug (f"End of word at position { ctx .cur } ." )
188218 ctx .cur_flags |= EOW
@@ -240,7 +270,6 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
240270 ctx .ignoring = False
241271 break
242272
243- cur_char = ctx .src [ctx .cur ]
244273 ctx .ignoring = True
245274 break
246275 # We looked through all ignore tokens, not found any. Move on.
@@ -275,7 +304,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
275304 # point value) than the current character, then break the loop
276305 # without a match, because we know there won't be any more
277306 # match due to the alphabetical ordering.
278- if ctx .src_tk .content [0 ] > cur_char :
307+ if ctx .src_tk .content [0 ] > ctx . cur_char :
279308 logger .debug (
280309 f"{ ctx .src_tk .content } is after "
281310 f"{ ctx .src [ctx .cur :ctx .cur + step ]} . "
@@ -285,11 +314,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
285314 # If src_tk has a WB flag but the token is not at WB, skip.
286315 if (
287316 (ctx .src_tk .flags & BOW and not ctx .cur_flags & BOW )
288- or
289- # Can't rely on EOW flag, we must check on the last
290- # character of the potential match.
291- (ctx .src_tk .flags & EOW and not _is_eow (
292- ctx .cur + step - 1 , ctx , WORD_BOUNDARY ))
317+ or (
318+ # Can't rely on EOW flag, we must check on the last
319+ # character of the potential match.
320+ ctx .src_tk .flags & EOW
321+ and not ctx .cur_at_eow (ctx .cur + step - 1 )
322+ )
293323 ):
294324 continue
295325
@@ -346,9 +376,10 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
346376
347377 # No match found. Copy non-mapped character (one at a time).
348378 logger .info (
349- f"Token { cur_char } (\\ u{ hex (ord (cur_char ))[2 :]} ) "
379+ f"Token { ctx .cur_char } "
380+ f"(\\ u{ hex (ord (ctx .cur_char ))[2 :]} ) "
350381 f"at position { ctx .cur } is not mapped." )
351- ctx .dest_ls .append (cur_char )
382+ ctx .dest_ls .append (ctx . cur_char )
352383 ctx .cur += 1
353384 else :
354385 delattr (ctx , "match" )
@@ -375,15 +406,3 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
375406 ctx .dest = MULTI_WS_RE .sub (r"\1" , ctx .dest .strip ())
376407
377408 return ctx .dest , ctx .warnings
378-
379-
380- def _is_bow (cur , ctx , word_boundary ):
381- return (cur == 0 or ctx .src [cur - 1 ] in word_boundary ) and (
382- ctx .src [cur ] not in word_boundary )
383-
384-
385- def _is_eow (cur , ctx , word_boundary ):
386- return (
387- cur == len (ctx .src ) - 1
388- or ctx .src [cur + 1 ] in word_boundary
389- ) and (ctx .src [cur ] not in word_boundary )
0 commit comments