1717logger = logging .getLogger (__name__ )
1818
1919
20- class Context :
20+ class Transliterator :
2121 """
22- Context used within the transliteration and passed to hook functions .
22+ Context carrying the state of the transliteration process.
2323
2424 Use within a `with` block for proper cleanup.
2525 """
@@ -35,6 +35,10 @@ def src(self):
3535 def src (self ):
3636 raise NotImplementedError ("Attribute is read-only." )
3737
38+ @property
39+ def cur_char (self ):
40+ return self .src [self .cur ]
41+
3842 def __init__ (self , lang , src , t_dir , options = {}):
3943 """
4044 Initialize a context.
@@ -64,6 +68,72 @@ def __enter__(self):
6468 def __exit__ (self , exc_type , exc_value , traceback ):
6569 self .conn .close ()
6670
71+ def run_hook (self , hname ):
72+ ret = None
73+ for hook_def in self .hooks .get (hname , []):
74+ fn = getattr (
75+ import_module ("." + hook_def ["module_name" ], HOOK_PKG_PATH ),
76+ hook_def ["fn_name" ]
77+ )
78+ ret = fn (self , ** hook_def ["kwargs" ])
79+ if ret in (BREAK , CONT ):
80+ # This will stop parsing hooks functions and tell the caller to
81+ # break out of the outer loop or skip iteration.
82+ return ret
83+
84+ return ret
85+
86+ def normalize_src (self ):
87+ """
88+ Normalize source text according to rules.
89+
90+ NOTE: this manipluates the protected source attribute so it may not
91+ correspond to the originally provided source.
92+ """
93+ # Normalize precomposed Unicode characters.
94+ #
95+ # In using diacritics, LC standards prefer the decomposed form
96+ # (combining diacritic + base character) to the pre-composed form
97+ # (single Unicode symbol for the letter with diacritic).
98+ #
99+ # Note: only safe for R2S.
100+ if self .t_dir == FEAT_R2S :
101+ logger .debug ("Normalizing pre-composed symbols." )
102+ self ._src = precomp_normalize ("NFD" , self .src )
103+
104+ norm_rules = get_lang_normalize (self .conn , self .lang_id )
105+
106+ for nk , nv in norm_rules .items ():
107+ self ._src = self .src .replace (nk , nv )
108+
109+ return self .run_hook ("post_normalize" )
110+
111+ def cur_at_bow (self , cur = None ):
112+ """
113+ Check if cursor is at the beginning of a word.
114+
115+ @param cur(int): Position to check. By default, the current cursor.
116+ """
117+ if cur is None :
118+ cur = self .cur
119+ return (
120+ self .cur == 0
121+ or self .src [cur - 1 ] in WORD_BOUNDARY
122+ ) and (self .src [cur ] not in WORD_BOUNDARY )
123+
124+ def cur_at_eow (self , cur = None ):
125+ """
126+ Check if cursor is at the end of a word.
127+
128+ @param cur(int): Position to check. By default, the current cursor.
129+ """
130+ if cur is None :
131+ cur = self .cur
132+ return (
133+ cur == len (self .src ) - 1
134+ or self .src [cur + 1 ] in WORD_BOUNDARY
135+ ) and (self .src [cur ] not in WORD_BOUNDARY )
136+
67137
68138def transliterate (src , lang , t_dir = "s2r" , capitalize = False , options = {}):
69139 """
@@ -101,7 +171,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
101171
102172 src = src .strip ()
103173 options ["capitalize" ] = capitalize
104- with Context (lang , src , t_dir , options ) as ctx :
174+ with Transliterator (lang , src , t_dir , options ) as ctx :
105175
106176 if t_dir == FEAT_S2R and not ctx .general ["has_s2r" ]:
107177 raise NotImplementedError (
@@ -118,12 +188,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
118188
119189 # This hook may take over the whole transliteration process or delegate
120190 # it to some external process, and return the output string directly.
121- if _run_hook ("post_config" , ctx ) == BREAK :
191+ if ctx . run_hook ("post_config" ) == BREAK :
122192 return getattr (ctx , "dest" , "" ), ctx .warnings
123193
124- # _normalize_src returns the results of the post_normalize hook.
125- if _normalize_src (
126- ctx , get_lang_normalize (ctx .conn , ctx .lang_id )) == BREAK :
194+ # ctx.normalize_src returns the results of the post_normalize hook.
195+ if ctx .normalize_src () == BREAK :
127196 return getattr (ctx , "dest" , "" ), ctx .warnings
128197
129198 logger .debug (f"Normalized source: { ctx .src } " )
@@ -137,21 +206,20 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
137206 # Reset cursor position flags.
138207 # Carry over extended "beginning of word" flag.
139208 ctx .cur_flags = 0
140- cur_char = ctx .src [ctx .cur ]
141209
142210 # Look for a word boundary and flag word beginning/end it if found.
143- if _is_bow ( ctx .cur , ctx , WORD_BOUNDARY ):
211+ if ctx .cur_at_bow ( ):
144212 # Beginning of word.
145213 logger .debug (f"Beginning of word at position { ctx .cur } ." )
146214 ctx .cur_flags |= BOW
147- if _is_eow ( ctx .cur , ctx , WORD_BOUNDARY ):
215+ if ctx .cur_at_eow ( ):
148216 # End of word.
149217 logger .debug (f"End of word at position { ctx .cur } ." )
150218 ctx .cur_flags |= EOW
151219
152220 # This hook may skip the parsing of the current
153221 # token or exit the scanning loop altogether.
154- hret = _run_hook ("begin_input_token" , ctx )
222+ hret = ctx . run_hook ("begin_input_token" )
155223 if hret == BREAK :
156224 logger .debug ("Breaking text scanning from hook signal." )
157225 break
@@ -165,7 +233,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
165233 while True :
166234 ctx .ignoring = False
167235 for ctx .tk in get_lang_ignore (ctx .conn , ctx .lang_id ):
168- hret = _run_hook ("pre_ignore_token" , ctx )
236+ hret = ctx . run_hook ("pre_ignore_token" )
169237 if hret == BREAK :
170238 break
171239 if hret == CONT :
@@ -187,7 +255,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
187255
188256 if _matching :
189257 # The position matches an ignore token.
190- hret = _run_hook ("on_ignore_match" , ctx )
258+ hret = ctx . run_hook ("on_ignore_match" )
191259 if hret == BREAK :
192260 break
193261 if hret == CONT :
@@ -202,7 +270,6 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
202270 ctx .ignoring = False
203271 break
204272
205- cur_char = ctx .src [ctx .cur ]
206273 ctx .ignoring = True
207274 break
208275 # We looked through all ignore tokens, not found any. Move on.
@@ -221,7 +288,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
221288 ctx .match = False
222289
223290 for ctx .src_tk , ctx .dest_str in lang_map :
224- hret = _run_hook ("pre_tx_token" , ctx )
291+ hret = ctx . run_hook ("pre_tx_token" )
225292 if hret == BREAK :
226293 break
227294 if hret == CONT :
@@ -237,7 +304,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
237304 # point value) than the current character, then break the loop
238305 # without a match, because we know there won't be any more
239306 # match due to the alphabetical ordering.
240- if ctx .src_tk .content [0 ] > cur_char :
307+ if ctx .src_tk .content [0 ] > ctx . cur_char :
241308 logger .debug (
242309 f"{ ctx .src_tk .content } is after "
243310 f"{ ctx .src [ctx .cur :ctx .cur + step ]} . "
@@ -247,11 +314,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
247314 # If src_tk has a WB flag but the token is not at WB, skip.
248315 if (
249316 (ctx .src_tk .flags & BOW and not ctx .cur_flags & BOW )
250- or
251- # Can't rely on EOW flag, we must check on the last
252- # character of the potential match.
253- (ctx .src_tk .flags & EOW and not _is_eow (
254- ctx .cur + step - 1 , ctx , WORD_BOUNDARY ))
317+ or (
318+ # Can't rely on EOW flag, we must check on the last
319+ # character of the potential match.
320+ ctx .src_tk .flags & EOW
321+ and not ctx .cur_at_eow (ctx .cur + step - 1 )
322+ )
255323 ):
256324 continue
257325
@@ -262,7 +330,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
262330 ctx .match = True
263331 # This hook may skip this token or break out of the token
264332 # lookup for the current position.
265- hret = _run_hook ("on_tx_token_match" , ctx )
333+ hret = ctx . run_hook ("on_tx_token_match" )
266334 if hret == BREAK :
267335 break
268336 if hret == CONT :
@@ -300,17 +368,18 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
300368
301369 if ctx .match is False :
302370 delattr (ctx , "match" )
303- hret = _run_hook ("on_no_tx_token_match" , ctx )
371+ hret = ctx . run_hook ("on_no_tx_token_match" )
304372 if hret == BREAK :
305373 break
306374 if hret == CONT :
307375 continue
308376
309377 # No match found. Copy non-mapped character (one at a time).
310378 logger .info (
311- f"Token { cur_char } (\\ u{ hex (ord (cur_char ))[2 :]} ) "
379+ f"Token { ctx .cur_char } "
380+ f"(\\ u{ hex (ord (ctx .cur_char ))[2 :]} ) "
312381 f"at position { ctx .cur } is not mapped." )
313- ctx .dest_ls .append (cur_char )
382+ ctx .dest_ls .append (ctx . cur_char )
314383 ctx .cur += 1
315384 else :
316385 delattr (ctx , "match" )
@@ -320,7 +389,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
320389
321390 # This hook may take care of the assembly and cause the function to
322391 # return its own return value.
323- hret = _run_hook ("pre_assembly" , ctx )
392+ hret = ctx . run_hook ("pre_assembly" )
324393 if hret is not None :
325394 return hret , ctx .warnings
326395
@@ -329,62 +398,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
329398
330399 # This hook may reassign the output string and/or cause the function to
331400 # return it immediately.
332- hret = _run_hook ("post_assembly" , ctx )
401+ hret = ctx . run_hook ("post_assembly" )
333402 if hret is not None :
334403 return hret , ctx .warnings
335404
336405 # Strip multiple spaces and leading/trailing whitespace.
337406 ctx .dest = MULTI_WS_RE .sub (r"\1" , ctx .dest .strip ())
338407
339408 return ctx .dest , ctx .warnings
340-
341-
342- def _normalize_src (ctx , norm_rules ):
343- """
344- Normalize source text according to rules.
345-
346- NOTE: this manipluates the protected source attribute so it may not
347- correspond to the originally provided source.
348- """
349- # Normalize precomposed Unicode characters.
350- #
351- # In using diacritics, LC standards prefer the decomposed form (combining
352- # diacritic + base character) to the pre-composed form (single Unicode
353- # symbol for the letter with diacritic).
354- #
355- # Note: only safe for R2S.
356- if ctx .t_dir == FEAT_R2S :
357- logger .debug ("Normalizing pre-composed symbols." )
358- ctx ._src = precomp_normalize ("NFD" , ctx .src )
359-
360- for nk , nv in norm_rules .items ():
361- ctx ._src = ctx .src .replace (nk , nv )
362-
363- return _run_hook ("post_normalize" , ctx )
364-
365-
366- def _is_bow (cur , ctx , word_boundary ):
367- return (cur == 0 or ctx .src [cur - 1 ] in word_boundary ) and (
368- ctx .src [cur ] not in word_boundary )
369-
370-
371- def _is_eow (cur , ctx , word_boundary ):
372- return (
373- cur == len (ctx .src ) - 1
374- or ctx .src [cur + 1 ] in word_boundary
375- ) and (ctx .src [cur ] not in word_boundary )
376-
377-
378- def _run_hook (hname , ctx ):
379- ret = None
380- for hook_def in ctx .hooks .get (hname , []):
381- fn = getattr (
382- import_module ("." + hook_def ["module_name" ], HOOK_PKG_PATH ),
383- hook_def ["fn_name" ])
384- ret = fn (ctx , ** hook_def ["kwargs" ])
385- if ret in (BREAK , CONT ):
386- # This will stop parsing hooks functions and tell the caller to
387- # break out of the outer loop or skip iteration.
388- return ret
389-
390- return ret
0 commit comments