Skip to content

Commit 1531278

Browse files
authored
Merge pull request #202 from lcnetdev/main
Update test branch
2 parents 0543e7c + 28fc69f commit 1531278

15 files changed

Lines changed: 727 additions & 530 deletions

File tree

doc/supported_scripts.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ third-party library.
6363
| [macedonian](../scriptshifter/tables/data/macedonian.yml) | Macedonian | Y | Y | stable |
6464
| [malayalam](../scriptshifter/tables/data/malayalam.yml) | Malayalam | Y | Y | | s-to-r lacks capitalization
6565
| [mansi_cyrillic](../scriptshifter/tables/data/mansi_cyrillic.yml) | Mansi (Cyrillic) | Y | Y | stable |
66-
| [marathi](../scriptshifter/tables/data/marathi.yml) | Marathi | Y | Y | | s-to-r lacks capitalization
66+
| [marathi](../scriptshifter/tables/data/marathi_devanagari.yml) | Marathi | Y | Y | | s-to-r lacks capitalization
6767
| [mari_cyrillic](../scriptshifter/tables/data/mari_cyrillic.yml) | Mari (Cyrillic) | Y | Y | stable |
6868
| [moldovan_cyrillic](../scriptshifter/tables/data/moldovan_cyrillic.yml) | Moldovan (Cyrillic) | Y | Y | stable |
6969
| [mongolian_cyrillic](../scriptshifter/tables/data/mongolian_cyrillic.yml) | Mongolian (Cyrillic) | Y | Y | stable |
@@ -86,13 +86,13 @@ third-party library.
8686
| [sanskrit_devanagari](../scriptshifter/tables/data/sanskrit_devanagari.yml) | Sanskrit (Devanagari) | Y | Y | | s-to-r lacks capitalization
8787
| [serbian](../scriptshifter/tables/data/serbian.yml) | Serbian | Y | Y | stable |
8888
| [shor_cyrillic](../scriptshifter/tables/data/shor_cyrillic.yml) | Shor (Cyrillic) | Y | Y | stable |
89-
| [sinhalese_sinhala](../scriptshifter/tables/data/sinhalese_sinhala.yml) | Sinhalese (Sinhala) | Y | Y | | s-to-r lacks capitalization
89+
| [sinhalese_sinhala](../scriptshifter/tables/data/sinhalese.yml) | Sinhalese (Sinhala) | Y | Y | | s-to-r lacks capitalization
9090
| [syriac_cyrillic](../scriptshifter/tables/data/syriac_cyrillic.yml) | Syriac (Cyrillic) | Y | Y | stable |
9191
| [tajik_cyrillic](../scriptshifter/tables/data/tajik_cyrillic.yml) | Tajik (Cyrillic) | Y | Y | stable |
9292
| [tamil](../scriptshifter/tables/data/tamil.yml) | Tamil | Y | Y | beta |
9393
| [tamil_brahmi](../scriptshifter/tables/data/tamil_brahmi.yml) | Tamil Brahmi | Y | Y | |
9494
| [tamil_extended](../scriptshifter/tables/data/tamil_extended.yml) | Tamil (extended) | Y | Y | |
95-
| [tatar-kryashen_cyrillic](../scriptshifter/tables/data/tatar.yml) | Tatar-Kryashen (Cyrillic) | Y | Y | stable |
95+
| [tatar-kryashen_cyrillic](../scriptshifter/tables/data/tatar-kryashen_cyrillic.yml) | Tatar-Kryashen (Cyrillic) | Y | Y | stable |
9696
| [tatar_cyrillic](../scriptshifter/tables/data/tatar_cyrillic.yml) | Tatar (Cyrillic) | Y | Y | stable |
9797
| [telugu](../scriptshifter/tables/data/telugu.yml) | Telugu | Y | Y | | s-to-r lacks capitalization
9898
| [thai](../scriptshifter/tables/data/thai.yml) | Thai | Y | Y | |

scriptshifter/hooks/general/__init__.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,25 @@
2727
logger = getLogger(__name__)
2828

2929

30+
def capitalize_pre_assembly(ctx):
    """
    Apply the user-selected capitalization to the unassembled token list.

    The mode is read from the "capitalize" key of `ctx.options`; the value
    returned by `_capitalize` is stored back into `ctx.dest_ls`.
    """
    tokens = ctx.dest_ls
    mode = ctx.options.get("capitalize")
    ctx.dest_ls = _capitalize(tokens, mode)
35+
36+
37+
def capitalize_post_assembly(ctx):
    """
    Return the assembled result string with user-selected capitalization.

    `ctx.dest` is split on single spaces, the resulting tokens are handed to
    `_capitalize` together with the "capitalize" option, and the tokens are
    joined back with single spaces. This function does not assign to
    `ctx.dest`; the capitalized string is returned to the caller.
    """
    words = _capitalize(ctx.dest.split(" "), ctx.options.get("capitalize"))
    return " ".join(words)
46+
47+
48+
3049
def normalize_spacing_post_assembly(ctx):
3150
"""
3251
Remove duplicate and unwanted whitespace around punctuation.
@@ -53,3 +72,18 @@ def normalize_spacing_post_assembly(ctx):
5372
# norm = NORM8_RE.sub(r"\1\2", norm)
5473

5574
return norm
75+
76+
77+
def _capitalize(src, which):
78+
"""
79+
Only capitalize first word and words preceded by space.
80+
81+
NOTE: this function is only used for capitalizing hook-generated
82+
transliterations, which are not normally processed. Double cap rules are
83+
not applicable here.
84+
"""
85+
if which == "first":
86+
ctx.dest_ls[0] = ctx.dest_ls[0].upper()
87+
88+
elif which == "all":
89+
ctx.dest_ls = [tk[0].upper() + tk[1:] for tk in ctx.dest_ls]

scriptshifter/hooks/hebrew/dicta_api.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from requests import post
44

55
from scriptshifter.exceptions import BREAK, UpstreamError
6-
from scriptshifter.tools import capitalize
6+
from scriptshifter.hooks.general import capitalize_post_assembly
77

88
EP = environ.get("TXL_DICTA_EP")
99
DEFAULT_GENRE = "rabbinic"
@@ -25,16 +25,8 @@ def s2r_post_config(ctx):
2525
except Exception:
2626
raise UpstreamError("Error received from Dicta service.")
2727

28-
rom = rsp.json().get("transliteration")
29-
30-
if rom:
31-
if ctx.options["capitalize"] == "all":
32-
rom = capitalize(rom)
33-
elif ctx.options["capitalize"] == "first":
34-
rom = rom[0].upper() + rom[1:]
35-
else:
36-
ctx.warnings.append("Upstream service returned empty result.")
37-
38-
ctx.dest = rom
28+
ctx.dest = rsp.json().get("transliteration")
29+
if ctx.dest:
30+
ctx.dest = capitalize_post_assembly(ctx)
3931

4032
return BREAK

scriptshifter/hooks/korean/romanizer.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
from scriptshifter.exceptions import BREAK
3030
from scriptshifter.hooks.korean import KCONF
31-
from scriptshifter.tools import capitalize
31+
from scriptshifter.hooks.general import capitalize_post_assembly
3232

3333

3434
PWD = path.dirname(path.realpath(__file__))
@@ -62,6 +62,12 @@ def s2r_nonames_post_config(ctx):
6262
ctx.dest, ctx.warnings = _romanize_nonames(
6363
ctx.src, ctx.options)
6464

65+
if ctx.dest:
66+
# FKR042: Capitalize all first letters
67+
# FKR043: Capitalize the first letter
68+
logger.debug(f"Before capitalization: {ctx.dest}")
69+
ctx.dest = capitalize_post_assembly(ctx)
70+
6571
return BREAK
6672

6773

@@ -74,6 +80,12 @@ def s2r_names_post_config(ctx):
7480
"""
7581
ctx.dest, ctx.warnings = _romanize_names(ctx.src, ctx.options)
7682

83+
if ctx.dest:
84+
# FKR042: Capitalize all first letters
85+
# FKR043: Capitalize the first letter
86+
logger.debug(f"Before capitalization: {ctx.dest}")
87+
ctx.dest = capitalize_post_assembly(ctx)
88+
7789
return BREAK
7890

7991

@@ -105,19 +117,9 @@ def _romanize_nonames(src, options):
105117

106118
rom = _romanize_oclc_auto(kor)
107119

108-
logger.debug(f"Before capitalization: {rom}")
109-
# FKR042: Capitalize all first letters
110-
if options["capitalize"] == "all":
111-
rom = capitalize(rom)
112-
# FKR043: Capitalize the first letter
113-
elif options["capitalize"] == "first":
114-
rom = rom[0].upper() + rom[1:]
115-
116120
# FKR044: Ambiguities
117121
ambi = re.sub("[,.\";: ]+", " ", rom)
118122

119-
# TODO Decide what to do with these. There is no facility for outputting
120-
# warnings or notes to the user yet.
121123
warnings = []
122124
_fkr_log(45)
123125
for exp, warn in KCONF["fkr045"].items():
@@ -308,10 +310,11 @@ def _kor_corp_name_rom(src):
308310
src = src[:-4]
309311
yu = "R"
310312

311-
rom_tok = []
312-
for tok in src.split(" "):
313-
rom_tok.append(_romanize_oclc_auto(tok))
314-
rom = capitalize(" ".join(rom_tok))
313+
rom_tok = [
314+
_romanize_oclc_auto(tok)
315+
for tok in src.split(" ")
316+
]
317+
rom = " ".join(rom_tok)
315318

316319
if chu == "L":
317320
rom = "(Chu) " + rom

scriptshifter/hooks/yiddish_/__init__.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,16 @@
1616
from yiddish import detransliterate, transliterate
1717

1818
from scriptshifter.exceptions import BREAK
19-
from scriptshifter.tools import capitalize
2019

2120

2221
def s2r_post_config(ctx):
2322
"""
2423
Script to Roman.
2524
"""
26-
rom = transliterate(
25+
ctx.dest = transliterate(
2726
ctx.src, loc=True,
2827
loshn_koydesh=ctx.options.get("loshn_koydesh"))
2928

30-
if ctx.options["capitalize"] == "all":
31-
rom = capitalize(rom)
32-
elif ctx.options["capitalize"] == "first":
33-
rom = rom[0].upper() + rom[1:]
34-
35-
ctx.dest = rom
36-
3729
return BREAK
3830

3931

scriptshifter/tables/data/_ignore_base.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,10 @@ roman_to_script:
3131
- "X{1,3}I{,3}"
3232
- "X{1,3}I(V|X)"
3333
- "X{1,3}VI{,3}"
34+
- "[\u2021$][0-9a-z] *"
3435

3536
script_to_roman:
3637
ignore:
3738
- " "
38-
39+
ignore_ptn:
40+
- "[\u2021$][0-9a-z] *"

0 commit comments

Comments
 (0)