|
2 | 2 |
|
3 | 3 | from importlib import import_module |
4 | 4 | from re import Pattern, compile |
| 5 | +from unicodedata import normalize as precomp_normalize |
5 | 6 |
|
6 | 7 | from scriptshifter.exceptions import BREAK, CONT |
7 | 8 | from scriptshifter.tables import ( |
8 | | - BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH, |
| 9 | + BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH, |
9 | 10 | get_connection, get_lang_dcap, get_lang_general, get_lang_hooks, |
10 | 11 | get_lang_ignore, get_lang_map, get_lang_normalize) |
11 | 12 |
|
@@ -345,6 +346,17 @@ def _normalize_src(ctx, norm_rules): |
345 | 346 | NOTE: this manipulates the protected source attribute so it may not |
346 | 347 | correspond to the originally provided source. |
347 | 348 | """ |
| 349 | + # Normalize precomposed Unicode characters. |
| 350 | + # |
| 351 | + # When using diacritics, LC standards prefer the decomposed form (base |
| 352 | + # character + combining diacritic) to the pre-composed form (single Unicode |
| 353 | + # code point for the letter with diacritic). |
| 354 | + # |
| 355 | + # Note: only safe for R2S. |
| 356 | + if ctx.t_dir == FEAT_R2S: |
| 357 | + logger.debug("Normalizing pre-composed symbols.") |
| 358 | + ctx._src = precomp_normalize("NFD", ctx.src) |
| 359 | + |
348 | 360 | for nk, nv in norm_rules.items(): |
349 | 361 | ctx._src = ctx.src.replace(nk, nv) |
350 | 362 |
|
|
0 commit comments