@@ -64,6 +64,45 @@ def __enter__(self):
6464 def __exit__ (self , exc_type , exc_value , traceback ):
6565 self .conn .close ()
6666
67+ def run_hook (self , hname ):
68+ ret = None
69+ for hook_def in self .hooks .get (hname , []):
70+ fn = getattr (
71+ import_module ("." + hook_def ["module_name" ], HOOK_PKG_PATH ),
72+ hook_def ["fn_name" ]
73+ )
74+ ret = fn (self , ** hook_def ["kwargs" ])
75+ if ret in (BREAK , CONT ):
76+ # This will stop running hook functions and tell the caller to
77+ # break out of the outer loop or skip iteration.
78+ return ret
79+
80+ return ret
81+
82+ def normalize_src (self ):
83+ """
84+ Normalize source text according to rules.
85+
86+ NOTE: this manipulates the protected source attribute so it may not
87+ correspond to the originally provided source.
88+ """
89+ norm_rules = get_lang_normalize (self .conn , self .lang_id )
90+ # Normalize precomposed Unicode characters.
91+ #
92+ # In using diacritics, LC standards prefer the decomposed form
93+ # (combining diacritic + base character) to the pre-composed form
94+ # (single Unicode symbol for the letter with diacritic).
95+ #
96+ # Note: only safe for R2S.
97+ if self .t_dir == FEAT_R2S :
98+ logger .debug ("Normalizing pre-composed symbols." )
99+ self ._src = precomp_normalize ("NFD" , self .src )
100+
101+ for nk , nv in norm_rules .items ():
102+ self ._src = self .src .replace (nk , nv )
103+
104+ return self .run_hook ("post_normalize" )
105+
67106
68107def transliterate (src , lang , t_dir = "s2r" , capitalize = False , options = {}):
69108 """
@@ -118,12 +157,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
118157
119158 # This hook may take over the whole transliteration process or delegate
120159 # it to some external process, and return the output string directly.
121- if _run_hook ("post_config" , ctx ) == BREAK :
160+ if ctx . run_hook ("post_config" ) == BREAK :
122161 return getattr (ctx , "dest" , "" ), ctx .warnings
123162
124- # _normalize_src returns the results of the post_normalize hook.
125- if _normalize_src (
126- ctx , get_lang_normalize (ctx .conn , ctx .lang_id )) == BREAK :
163+ # ctx.normalize_src returns the results of the post_normalize hook.
164+ if ctx .normalize_src () == BREAK :
127165 return getattr (ctx , "dest" , "" ), ctx .warnings
128166
129167 logger .debug (f"Normalized source: { ctx .src } " )
@@ -151,7 +189,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
151189
152190 # This hook may skip the parsing of the current
153191 # token or exit the scanning loop altogether.
154- hret = _run_hook ("begin_input_token" , ctx )
192+ hret = ctx . run_hook ("begin_input_token" )
155193 if hret == BREAK :
156194 logger .debug ("Breaking text scanning from hook signal." )
157195 break
@@ -165,7 +203,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
165203 while True :
166204 ctx .ignoring = False
167205 for ctx .tk in get_lang_ignore (ctx .conn , ctx .lang_id ):
168- hret = _run_hook ("pre_ignore_token" , ctx )
206+ hret = ctx . run_hook ("pre_ignore_token" )
169207 if hret == BREAK :
170208 break
171209 if hret == CONT :
@@ -187,7 +225,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
187225
188226 if _matching :
189227 # The position matches an ignore token.
190- hret = _run_hook ("on_ignore_match" , ctx )
228+ hret = ctx . run_hook ("on_ignore_match" )
191229 if hret == BREAK :
192230 break
193231 if hret == CONT :
@@ -221,7 +259,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
221259 ctx .match = False
222260
223261 for ctx .src_tk , ctx .dest_str in lang_map :
224- hret = _run_hook ("pre_tx_token" , ctx )
262+ hret = ctx . run_hook ("pre_tx_token" )
225263 if hret == BREAK :
226264 break
227265 if hret == CONT :
@@ -262,7 +300,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
262300 ctx .match = True
263301 # This hook may skip this token or break out of the token
264302 # lookup for the current position.
265- hret = _run_hook ("on_tx_token_match" , ctx )
303+ hret = ctx . run_hook ("on_tx_token_match" )
266304 if hret == BREAK :
267305 break
268306 if hret == CONT :
@@ -300,7 +338,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
300338
301339 if ctx .match is False :
302340 delattr (ctx , "match" )
303- hret = _run_hook ("on_no_tx_token_match" , ctx )
341+ hret = ctx . run_hook ("on_no_tx_token_match" )
304342 if hret == BREAK :
305343 break
306344 if hret == CONT :
@@ -320,7 +358,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
320358
321359 # This hook may take care of the assembly and cause the function to
322360 # return its own return value.
323- hret = _run_hook ("pre_assembly" , ctx )
361+ hret = ctx . run_hook ("pre_assembly" )
324362 if hret is not None :
325363 return hret , ctx .warnings
326364
@@ -329,7 +367,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
329367
330368 # This hook may reassign the output string and/or cause the function to
331369 # return it immediately.
332- hret = _run_hook ("post_assembly" , ctx )
370+ hret = ctx . run_hook ("post_assembly" )
333371 if hret is not None :
334372 return hret , ctx .warnings
335373
@@ -339,30 +377,6 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
339377 return ctx .dest , ctx .warnings
340378
341379
342- def _normalize_src (ctx , norm_rules ):
343- """
344- Normalize source text according to rules.
345-
346- NOTE: this manipluates the protected source attribute so it may not
347- correspond to the originally provided source.
348- """
349- # Normalize precomposed Unicode characters.
350- #
351- # In using diacritics, LC standards prefer the decomposed form (combining
352- # diacritic + base character) to the pre-composed form (single Unicode
353- # symbol for the letter with diacritic).
354- #
355- # Note: only safe for R2S.
356- if ctx .t_dir == FEAT_R2S :
357- logger .debug ("Normalizing pre-composed symbols." )
358- ctx ._src = precomp_normalize ("NFD" , ctx .src )
359-
360- for nk , nv in norm_rules .items ():
361- ctx ._src = ctx .src .replace (nk , nv )
362-
363- return _run_hook ("post_normalize" , ctx )
364-
365-
366380def _is_bow (cur , ctx , word_boundary ):
367381 return (cur == 0 or ctx .src [cur - 1 ] in word_boundary ) and (
368382 ctx .src [cur ] not in word_boundary )
@@ -373,18 +387,3 @@ def _is_eow(cur, ctx, word_boundary):
373387 cur == len (ctx .src ) - 1
374388 or ctx .src [cur + 1 ] in word_boundary
375389 ) and (ctx .src [cur ] not in word_boundary )
376-
377-
378- def _run_hook (hname , ctx ):
379- ret = None
380- for hook_def in ctx .hooks .get (hname , []):
381- fn = getattr (
382- import_module ("." + hook_def ["module_name" ], HOOK_PKG_PATH ),
383- hook_def ["fn_name" ])
384- ret = fn (ctx , ** hook_def ["kwargs" ])
385- if ret in (BREAK , CONT ):
386- # This will stop parsing hooks functions and tell the caller to
387- # break out of the outer loop or skip iteration.
388- return ret
389-
390- return ret
0 commit comments