Skip to content

Commit 2b9b5be

Browse files
committed
Merge branch 'main' into test
2 parents 00e89d6 + f4b7142 commit 2b9b5be

68 files changed

Lines changed: 1897 additions & 1497 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

doc/supported_scripts.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ third-party library.
6363
| [macedonian](../scriptshifter/tables/data/macedonian.yml) | Macedonian | Y | Y | stable |
6464
| [malayalam](../scriptshifter/tables/data/malayalam.yml) | Malayalam | Y | Y | | s-to-r lacks capitalization
6565
| [mansi_cyrillic](../scriptshifter/tables/data/mansi_cyrillic.yml) | Mansi (Cyrillic) | Y | Y | stable |
66-
| [marathi](../scriptshifter/tables/data/marathi.yml) | Marathi | Y | Y | | s-to-r lacks capitalization
66+
| [marathi](../scriptshifter/tables/data/marathi_devanagari.yml) | Marathi | Y | Y | | s-to-r lacks capitalization
6767
| [mari_cyrillic](../scriptshifter/tables/data/mari_cyrillic.yml) | Mari (Cyrillic) | Y | Y | stable |
6868
| [moldovan_cyrillic](../scriptshifter/tables/data/moldovan_cyrillic.yml) | Moldovan (Cyrillic) | Y | Y | stable |
6969
| [mongolian_cyrillic](../scriptshifter/tables/data/mongolian_cyrillic.yml) | Mongolian (Cyrillic) | Y | Y | stable |
@@ -86,13 +86,13 @@ third-party library.
8686
| [sanskrit_devanagari](../scriptshifter/tables/data/sanskrit_devanagari.yml) | Sanskrit (Devanagari) | Y | Y | | s-to-r lacks capitalization
8787
| [serbian](../scriptshifter/tables/data/serbian.yml) | Serbian | Y | Y | stable |
8888
| [shor_cyrillic](../scriptshifter/tables/data/shor_cyrillic.yml) | Shor (Cyrillic) | Y | Y | stable |
89-
| [sinhalese_sinhala](../scriptshifter/tables/data/sinhalese_sinhala.yml) | Sinhalese (Sinhala) | Y | Y | | s-to-r lacks capitalization
89+
| [sinhalese_sinhala](../scriptshifter/tables/data/sinhalese.yml) | Sinhalese (Sinhala) | Y | Y | | s-to-r lacks capitalization
9090
| [syriac_cyrillic](../scriptshifter/tables/data/syriac_cyrillic.yml) | Syriac (Cyrillic) | Y | Y | stable |
9191
| [tajik_cyrillic](../scriptshifter/tables/data/tajik_cyrillic.yml) | Tajik (Cyrillic) | Y | Y | stable |
9292
| [tamil](../scriptshifter/tables/data/tamil.yml) | Tamil | Y | Y | beta |
9393
| [tamil_brahmi](../scriptshifter/tables/data/tamil_brahmi.yml) | Tamil Brahmi | Y | Y | |
9494
| [tamil_extended](../scriptshifter/tables/data/tamil_extended.yml) | Tamil (extended) | Y | Y | |
95-
| [tatar-kryashen_cyrillic](../scriptshifter/tables/data/tatar.yml) | Tatar-Kryashen (Cyrillic) | Y | Y | stable |
95+
| [tatar-kryashen_cyrillic](../scriptshifter/tables/data/tatar-kryashen_cyrillic.yml) | Tatar-Kryashen (Cyrillic) | Y | Y | stable |
9696
| [tatar_cyrillic](../scriptshifter/tables/data/tatar_cyrillic.yml) | Tatar (Cyrillic) | Y | Y | stable |
9797
| [telugu](../scriptshifter/tables/data/telugu.yml) | Telugu | Y | Y | | s-to-r lacks capitalization
9898
| [thai](../scriptshifter/tables/data/thai.yml) | Thai | Y | Y | |

scriptshifter/hooks/general/__init__.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,24 @@
2727
logger = getLogger(__name__)
2828

2929

30+
def capitalize_pre_assembly(ctx):
    """
    Apply the user-selected capitalization to the unassembled token list.

    Reads the token list from `ctx.dest_ls` and the mode from the
    "capitalize" entry of `ctx.options` (None when the option is absent),
    then stores the capitalized list back on `ctx.dest_ls`. Nothing is
    returned.
    """
    tokens = ctx.dest_ls
    mode = ctx.options.get("capitalize")

    ctx.dest_ls = _capitalize(tokens, mode)
35+
36+
37+
def capitalize_post_assembly(ctx):
    """
    Return the assembled result string capitalized per user options.

    `ctx.dest` is split on single spaces, the tokens are capitalized
    according to the "capitalize" entry of `ctx.options` (None when the
    option is absent), and the tokens are joined back with single spaces.

    The capitalized string is returned; this function does not assign it to
    `ctx.dest`, so the caller has to store the return value.
    """
    tokens = ctx.dest.split(" ")
    mode = ctx.options.get("capitalize")

    return " ".join(_capitalize(tokens, mode))
46+
47+
3048
def normalize_spacing_post_assembly(ctx):
3149
"""
3250
Remove duplicate and unwanted whitespace around punctuation.
@@ -53,3 +71,21 @@ def normalize_spacing_post_assembly(ctx):
5371
# norm = NORM8_RE.sub(r"\1\2", norm)
5472

5573
return norm
74+
75+
76+
def _capitalize(src, which):
77+
"""
78+
capitalize first word only or all words.
79+
80+
NOTE: this function is only used for capitalizing hook-generated
81+
transliterations, which are not normally processed. Double cap rules are
82+
not applicable here.
83+
"""
84+
if which == "first":
85+
src[0] = src[0].capitalize()
86+
return src
87+
88+
if which == "all":
89+
return [tk[0].upper() + tk[1:] for tk in src]
90+
91+
return src

scriptshifter/hooks/hebrew/dicta_api.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from requests import post
44

55
from scriptshifter.exceptions import BREAK, UpstreamError
6-
from scriptshifter.tools import capitalize
6+
from scriptshifter.hooks.general import capitalize_post_assembly
77

88
EP = environ.get("TXL_DICTA_EP")
99
DEFAULT_GENRE = "rabbinic"
@@ -25,16 +25,8 @@ def s2r_post_config(ctx):
2525
except Exception:
2626
raise UpstreamError("Error received from Dicta service.")
2727

28-
rom = rsp.json().get("transliteration")
29-
30-
if rom:
31-
if ctx.options["capitalize"] == "all":
32-
rom = capitalize(rom)
33-
elif ctx.options["capitalize"] == "first":
34-
rom = rom[0].upper() + rom[1:]
35-
else:
36-
ctx.warnings.append("Upstream service returned empty result.")
37-
38-
ctx.dest = rom
28+
ctx.dest = rsp.json().get("transliteration")
29+
if ctx.dest:
30+
ctx.dest = capitalize_post_assembly(ctx)
3931

4032
return BREAK

scriptshifter/hooks/korean/romanizer.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
from scriptshifter.exceptions import BREAK
3030
from scriptshifter.hooks.korean import KCONF
31-
from scriptshifter.tools import capitalize
31+
from scriptshifter.hooks.general import capitalize_post_assembly
3232

3333

3434
PWD = path.dirname(path.realpath(__file__))
@@ -62,6 +62,12 @@ def s2r_nonames_post_config(ctx):
6262
ctx.dest, ctx.warnings = _romanize_nonames(
6363
ctx.src, ctx.options)
6464

65+
if ctx.dest:
66+
# FKR042: Capitalize all first letters
67+
# FKR043: Capitalize the first letter
68+
logger.debug(f"Before capitalization: {ctx.dest}")
69+
ctx.dest = capitalize_post_assembly(ctx)
70+
6571
return BREAK
6672

6773

@@ -74,6 +80,12 @@ def s2r_names_post_config(ctx):
7480
"""
7581
ctx.dest, ctx.warnings = _romanize_names(ctx.src, ctx.options)
7682

83+
if ctx.dest:
84+
# FKR042: Capitalize all first letters
85+
# FKR043: Capitalize the first letter
86+
logger.debug(f"Before capitalization: {ctx.dest}")
87+
ctx.dest = capitalize_post_assembly(ctx)
88+
7789
return BREAK
7890

7991

@@ -105,19 +117,9 @@ def _romanize_nonames(src, options):
105117

106118
rom = _romanize_oclc_auto(kor)
107119

108-
logger.debug(f"Before capitalization: {rom}")
109-
# FKR042: Capitalize all first letters
110-
if options["capitalize"] == "all":
111-
rom = capitalize(rom)
112-
# FKR043: Capitalize the first letter
113-
elif options["capitalize"] == "first":
114-
rom = rom[0].upper() + rom[1:]
115-
116120
# FKR044: Ambiguities
117121
ambi = re.sub("[,.\";: ]+", " ", rom)
118122

119-
# TODO Decide what to do with these. There is no facility for outputting
120-
# warnings or notes to the user yet.
121123
warnings = []
122124
_fkr_log(45)
123125
for exp, warn in KCONF["fkr045"].items():
@@ -308,10 +310,11 @@ def _kor_corp_name_rom(src):
308310
src = src[:-4]
309311
yu = "R"
310312

311-
rom_tok = []
312-
for tok in src.split(" "):
313-
rom_tok.append(_romanize_oclc_auto(tok))
314-
rom = capitalize(" ".join(rom_tok))
313+
rom_tok = [
314+
_romanize_oclc_auto(tok)
315+
for tok in src.split(" ")
316+
]
317+
rom = " ".join(rom_tok)
315318

316319
if chu == "L":
317320
rom = "(Chu) " + rom

scriptshifter/hooks/yiddish_/__init__.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,16 @@
1616
from yiddish import detransliterate, transliterate
1717

1818
from scriptshifter.exceptions import BREAK
19-
from scriptshifter.tools import capitalize
2019

2120

2221
def s2r_post_config(ctx):
2322
"""
2423
Script to Roman.
2524
"""
26-
rom = transliterate(
25+
ctx.dest = transliterate(
2726
ctx.src, loc=True,
2827
loshn_koydesh=ctx.options.get("loshn_koydesh"))
2928

30-
if ctx.options["capitalize"] == "all":
31-
rom = capitalize(rom)
32-
elif ctx.options["capitalize"] == "first":
33-
rom = rom[0].upper() + rom[1:]
34-
35-
ctx.dest = rom
36-
3729
return BREAK
3830

3931

scriptshifter/tables/data/_cyrillic_base.yml

Lines changed: 0 additions & 133 deletions
This file was deleted.

scriptshifter/tables/data/_ignore_base.yml

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,30 +9,37 @@ roman_to_script:
99
- "date of publication not identified"
1010
- "place of publication not identified"
1111
- "publisher not identified"
12+
- "and one other"
13+
- "et al."
14+
ignore_ptn:
15+
- "and ([a-z0-9]+ )?others"
16+
17+
# Incorrectly entered (but frequently found) Roman numerals.
1218
# NOTE There is ambiguity about ignoring these
1319
# words. Note that the single-character Roman
1420
# numerals are not included on purpose.
1521
# Ideally the source editors should use the
1622
# dedicated U+2160÷U+216F (uppercase Roman
1723
# numerals) and/or U+2170÷U+217F (lower case Roman
1824
# numerals) ranges to avoid this ambiguity.
19-
- "and one other"
20-
- "et al."
21-
ignore_ptn:
22-
- "and ([a-z0-9]+ )?others"
23-
- "I{2,3}"
24-
- "I(V|X)"
25-
- "LI{,3}"
26-
- "LI?(V|X)"
27-
- "L(V|X{1,3})I{,3}"
28-
- "LX{1,3}I?V"
29-
- "LX{1,3}VI{,3}"
30-
- "(V|X{1,3})I{,3}"
31-
- "X{1,3}I{,3}"
32-
- "X{1,3}I(V|X)"
33-
- "X{1,3}VI{,3}"
25+
- "\\bI{2,3}\\b"
26+
- "\\bI(V|X)\\b"
27+
- "\\bLI{,3}\\b"
28+
- "\\bLI?(V|X)\\b"
29+
- "\\bL(V|X{1,3})I{,3}\\b"
30+
- "\\bLX{1,3}I?V\\b"
31+
- "\\bLX{1,3}VI{,3}\\b"
32+
- "\\b(V|X{1,3})I{,3}\\b"
33+
- "\\bX{1,3}I{,3}\\b"
34+
- "\\bX{1,3}I(V|X)\\b"
35+
- "\\bX{1,3}VI{,3}\\b"
36+
37+
# MARC sub-field markers.
38+
- "\\b[\u2021$][0-9a-z]\\b"
3439

3540
script_to_roman:
3641
ignore:
3742
- " "
38-
43+
ignore_ptn:
44+
# MARC sub-field markers.
45+
- "\\b[\u2021$][0-9a-z]\\b"

scriptshifter/tables/data/abkhaz_cyrillic.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
general:
22
name: Abkhaz (Cyrillic)
33
parents:
4-
- _cyrillic_base
4+
- cyrillic_generic
55

66
roman_to_script:
77
map:

0 commit comments

Comments
 (0)