Skip to content

Commit 5eea9a9

Browse files
authored
Merge pull request #189 from lcnetdev/decompose
Normalize precomposed Unicode characters.
2 parents b1b59aa + bc8533c commit 5eea9a9

1 file changed

Lines changed: 13 additions & 1 deletion

File tree

scriptshifter/trans.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22

33
from importlib import import_module
44
from re import Pattern, compile
5+
from unicodedata import normalize as precomp_normalize
56

67
from scriptshifter.exceptions import BREAK, CONT
78
from scriptshifter.tables import (
8-
BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
9+
BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
910
get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
1011
get_lang_ignore, get_lang_map, get_lang_normalize)
1112

@@ -345,6 +346,17 @@ def _normalize_src(ctx, norm_rules):
345346
NOTE: this manipulates the protected source attribute so it may not
346347
correspond to the originally provided source.
347348
"""
349+
# Normalize precomposed Unicode characters.
350+
#
351+
# In using diacritics, LC standards prefer the decomposed form (base
352+
# character + combining diacritic) to the pre-composed form (single Unicode
353+
# symbol for the letter with diacritic).
354+
#
355+
# Note: only safe for R2S.
356+
if ctx.t_dir == FEAT_R2S:
357+
logger.debug("Normalizing pre-composed symbols.")
358+
ctx._src = precomp_normalize("NFD", ctx.src)
359+
348360
for nk, nv in norm_rules.items():
349361
ctx._src = ctx.src.replace(nk, nv)
350362

0 commit comments

Comments
 (0)