|
2 | 2 |
|
3 | 3 | from importlib import import_module |
4 | 4 | from re import Pattern, compile |
| 5 | +from unicodedata import normalize as precomp_normalize |
5 | 6 |
|
6 | 7 | from scriptshifter.exceptions import BREAK, CONT |
7 | 8 | from scriptshifter.tables import ( |
8 | | - BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH, |
| 9 | + BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH, |
9 | 10 | get_connection, get_lang_dcap, get_lang_general, get_lang_hooks, |
10 | 11 | get_lang_ignore, get_lang_map, get_lang_normalize) |
11 | 12 |
|
@@ -342,6 +343,13 @@ def _normalize_src(ctx, norm_rules): |
342 | 343 | NOTE: this manipulates the protected source attribute so it may not |
343 | 344 | correspond to the originally provided source. |
344 | 345 | """ |
| 346 | + # Normalize precomposed Unicode characters. |
| 347 | + # |
| 348 | + # In using diacritics, LC standards prefer the decomposed form (combining |
| 349 | + # diacritic + base character) to the pre-composed form (single Unicode |
| 350 | + # symbol for the letter with diacritic). |
| 351 | + ctx._src = precomp_normalize("NFD", ctx.src) |
| 352 | + |
345 | 353 | for nk, nv in norm_rules.items(): |
346 | 354 | ctx._src = ctx.src.replace(nk, nv) |
347 | 355 |
|
|
0 commit comments