Skip to content

Commit aefb899

Browse files
committed
Normalize precomposed Unicode characters.
1 parent 2489c54 commit aefb899

1 file changed

Lines changed: 9 additions & 1 deletion

File tree

scriptshifter/trans.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22

33
from importlib import import_module
44
from re import Pattern, compile
5+
from unicode_data import normalize as precomp_normalize
56

67
from scriptshifter.exceptions import BREAK, CONT
78
from scriptshifter.tables import (
8-
BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
9+
BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
910
get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
1011
get_lang_ignore, get_lang_map, get_lang_normalize)
1112

@@ -342,6 +343,13 @@ def _normalize_src(ctx, norm_rules):
342343
NOTE: this manipulates the protected source attribute so it may not
343344
correspond to the originally provided source.
344345
"""
346+
# Normalize precomposed Unicode characters.
347+
#
348+
# In using diacritics, LC standards prefer the decomposed form (base
348+
# character + combining diacritic) to the pre-composed form (single Unicode
350+
# symbol for the letter with diacritic).
351+
ctx._src = precomp_normalize("NFD", ctx.src)
352+
345353
for nk, nv in norm_rules.items():
346354
ctx._src = ctx.src.replace(nk, nv)
347355

0 commit comments

Comments
 (0)