Skip to content

Commit 6cf69f1

Browse files
Merge branch 'main' into randy_2025_03_index
2 parents 1bb7ed3 + eb677d8 commit 6cf69f1

59 files changed

Lines changed: 1583 additions & 1168 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

doc/supported_scripts.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ third-party library.
6363
| [macedonian](../scriptshifter/tables/data/macedonian.yml) | Macedonian | Y | Y | stable |
6464
| [malayalam](../scriptshifter/tables/data/malayalam.yml) | Malayalam | Y | Y | | s-to-r lacks capitalization |
6565
| [mansi_cyrillic](../scriptshifter/tables/data/mansi_cyrillic.yml) | Mansi (Cyrillic) | Y | Y | stable |
66-
| [marathi](../scriptshifter/tables/data/marathi.yml) | Marathi | Y | Y | | s-to-r lacks capitalization
66+
| [marathi_devanagari](../scriptshifter/tables/data/marathi_devanagari.yml) | Marathi | Y | Y | | s-to-r lacks capitalization |
6767
| [mari_cyrillic](../scriptshifter/tables/data/mari_cyrillic.yml) | Mari (Cyrillic) | Y | Y | stable |
6868
| [moldovan_cyrillic](../scriptshifter/tables/data/moldovan_cyrillic.yml) | Moldovan (Cyrillic) | Y | Y | stable |
6969
| [mongolian_cyrillic](../scriptshifter/tables/data/mongolian_cyrillic.yml) | Mongolian (Cyrillic) | Y | Y | stable |
@@ -86,13 +86,13 @@ third-party library.
8686
| [sanskrit_devanagari](../scriptshifter/tables/data/sanskrit_devanagari.yml) | Sanskrit (Devanagari) | Y | Y | | s-to-r lacks capitalization |
8787
| [serbian](../scriptshifter/tables/data/serbian.yml) | Serbian | Y | Y | stable |
8888
| [shor_cyrillic](../scriptshifter/tables/data/shor_cyrillic.yml) | Shor (Cyrillic) | Y | Y | stable |
89-
| [sinhalese_sinhala](../scriptshifter/tables/data/sinhalese_sinhala.yml) | Sinhalese (Sinhala) | Y | Y | | s-to-r lacks capitalization
89+
| [sinhalese](../scriptshifter/tables/data/sinhalese.yml) | Sinhalese (Sinhala) | Y | Y | | s-to-r lacks capitalization |
9090
| [syriac_cyrillic](../scriptshifter/tables/data/syriac_cyrillic.yml) | Syriac (Cyrillic) | Y | Y | stable |
9191
| [tajik_cyrillic](../scriptshifter/tables/data/tajik_cyrillic.yml) | Tajik (Cyrillic) | Y | Y | stable |
9292
| [tamil](../scriptshifter/tables/data/tamil.yml) | Tamil | Y | Y | beta |
9393
| [tamil_brahmi](../scriptshifter/tables/data/tamil_brahmi.yml) | Tamil Brahmi | Y | Y | |
9494
| [tamil_extended](../scriptshifter/tables/data/tamil_extended.yml) | Tamil (extended) | Y | Y | |
95-
| [tatar-kryashen_cyrillic](../scriptshifter/tables/data/tatar.yml) | Tatar-Kryashen (Cyrillic) | Y | Y | stable |
95+
| [tatar-kryashen_cyrillic](../scriptshifter/tables/data/tatar-kryashen_cyrillic.yml) | Tatar-Kryashen (Cyrillic) | Y | Y | stable |
9696
| [tatar_cyrillic](../scriptshifter/tables/data/tatar_cyrillic.yml) | Tatar (Cyrillic) | Y | Y | stable |
9797
| [telugu](../scriptshifter/tables/data/telugu.yml) | Telugu | Y | Y | | s-to-r lacks capitalization |
9898
| [thai](../scriptshifter/tables/data/thai.yml) | Thai | Y | Y | |

scriptshifter/tables/data/_cyrillic_base.yml

Lines changed: 0 additions & 133 deletions
This file was deleted.

scriptshifter/tables/data/_ignore_base.yml

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,32 +9,37 @@ roman_to_script:
99
- "date of publication not identified"
1010
- "place of publication not identified"
1111
- "publisher not identified"
12+
- "and one other"
13+
- "et al."
14+
ignore_ptn:
15+
- "and ([a-z0-9]+ )?others"
16+
17+
# Incorrectly entered (but frequently found) Roman numerals.
1218
# NOTE There is ambiguity about ignoring these
1319
# words. Note that the single-character Roman
1420
# numerals are not included on purpose.
1521
# Ideally the source editors should use the
1622
# dedicated U+2160–U+216F (uppercase Roman
1723
# numerals) and/or U+2170–U+217F (lowercase Roman
1824
# numerals) ranges to avoid this ambiguity.
19-
- "and one other"
20-
- "et al."
21-
ignore_ptn:
22-
- "and ([a-z0-9]+ )?others"
23-
- "I{2,3}"
24-
- "I(V|X)"
25-
- "LI{,3}"
26-
- "LI?(V|X)"
27-
- "L(V|X{1,3})I{,3}"
28-
- "LX{1,3}I?V"
29-
- "LX{1,3}VI{,3}"
30-
- "(V|X{1,3})I{,3}"
31-
- "X{1,3}I{,3}"
32-
- "X{1,3}I(V|X)"
33-
- "X{1,3}VI{,3}"
34-
- "[\u2021$][0-9a-z] *"
25+
- "\\bI{2,3}\\b"
26+
- "\\bI(V|X)\\b"
27+
- "\\bLI{,3}\\b"
28+
- "\\bLI?(V|X)\\b"
29+
- "\\bL(V|X{1,3})I{,3}\\b"
30+
- "\\bLX{1,3}I?V\\b"
31+
- "\\bLX{1,3}VI{,3}\\b"
32+
- "\\b(V|X{1,3})I{,3}\\b"
33+
- "\\bX{1,3}I{,3}\\b"
34+
- "\\bX{1,3}I(V|X)\\b"
35+
- "\\bX{1,3}VI{,3}\\b"
36+
37+
# MARC sub-field markers.
38+
- "\\b[\u2021$][0-9a-z]\\b"
3539

3640
script_to_roman:
3741
ignore:
3842
- " "
3943
ignore_ptn:
40-
- "[\u2021$][0-9a-z] *"
44+
# MARC sub-field markers.
45+
- "\\b[\u2021$][0-9a-z]\\b"

scriptshifter/tables/data/abkhaz_cyrillic.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
general:
22
name: Abkhaz (Cyrillic)
33
parents:
4-
- _cyrillic_base
4+
- cyrillic_generic
55

66
roman_to_script:
77
map:

scriptshifter/tables/data/altai_cyrillic.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
general:
22
name: Altai (Cyrillic)
33
parents:
4-
- _cyrillic_base
4+
- cyrillic_generic
55

66
roman_to_script:
77
map:

scriptshifter/tables/data/arabic.yml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@ general:
1515
roman_to_script:
1616
map:
1717

18-
# Original table by David Bucknum
19-
# Last updated 25 January 2019
18+
# Original table by David Bucknum, 5 April 2010
19+
# Updated, 25 January 2019
2020
# Modified by WK with testing by Arabic Cat Staff LOC-CAIRO
2121
# Additional info from R. Vassie, [n.d.] "Marrying the Arabic and Latin
2222
# Scripts Conceptually"
23+
# Updated, 26 March 2025 by Randall K. Barry to reverse truncation marks for ScriptShifter
2324

2425

2526
# Punctuation marks:
@@ -135,11 +136,11 @@ roman_to_script:
135136
"fi\u0304-": "\u0641\u064A"
136137
"ka-": "\u0643"
137138

138-
# Vowels and vowel/consonant combinations
139+
# Vowels and vowel/consonant combinations - ta-marbutah at end of word
139140
"ah%": "\u0629"
140141
"at%": "\u0629"
141142

142-
# tanwin
143+
# tanwin at end of word
143144
"an%": "\u0627"
144145

145146
# ayn-alif combo
@@ -149,6 +150,8 @@ roman_to_script:
149150
"\u02BBA\u0304": "\u0639\u0627"
150151
"\u02BBa\u0304": "\u0639\u0627"
151152

153+
"\u02BBI\u0304Y": "\u0639\u064A"
154+
"\u02BBi\u0304y": "\u0639\u064A"
152155
"\u02BBI\u0304": "\u0639\u064A"
153156
"\u02BBi\u0304": "\u0639\u064A"
154157

scriptshifter/tables/data/azerbaijani_cyrillic.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
general:
22
name: Azerbaijani (Cyrillic)
33
parents:
4-
- _cyrillic_base
4+
- cyrillic_generic
55

66
roman_to_script:
77
map:

scriptshifter/tables/data/bashkir_cyrillic.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
general:
22
name: Bashkir (Cyrillic)
33
parents:
4-
- _cyrillic_base
4+
- cyrillic_generic
55

66
roman_to_script:
77
map:

scriptshifter/tables/data/belarusian.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
general:
22
name: Belarusian
33
parents:
4-
- _cyrillic_base
4+
- cyrillic_generic
55

66
roman_to_script:
77
map:
Lines changed: 18 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,38 @@
11
general:
22
name: Bulgarian
33
parents:
4-
- _cyrillic_base
4+
- cyrillic_generic
55

66
roman_to_script:
77
map:
8-
"G": "\u0413"
9-
"g": "\u0433"
10-
# this conversion shouldn't be needed, but does no harm
11-
"ZH": "\u0416"
12-
"Zh": "\u0416"
13-
"zh": "\u0436"
14-
"I\uFE20E\uFE21": "\u0462"
15-
# this conversion shouldn't be needed, but does no harm
16-
"I\uFE20e\uFE21": "\u0462"
17-
# this conversion shouldn't be needed, but does no harm
18-
# this conversion shouldn't be needed, but does no harm
19-
"I": "\u0418"
20-
"i\uFE20e\uFE21": "\u0463"
21-
"i": "\u0438"
22-
# this conversion shouldn't be needed, but does no harm
238
"SHT": "\u0429"
249
"Sht": "\u0429"
2510
"sht": "\u0449"
26-
"T\uFE20S\uFE21": "\u0426"
27-
# this conversion shouldn't be needed, but does no harm
28-
"T\uFE20s\uFE21": "\u0426"
29-
"t\uFE20s\uFE21": "\u0446"
30-
"U\u0310": "\u046A"
11+
"U\u0306": "\u042A"
12+
# Mapping from precomposed non-MARC-8 Latin equivalent
13+
"\u016C": "\u042A"
3114
"u\u0306": "\u044A"
15+
# Mapping from precomposed non-MARC-8 Latin equivalent
16+
"\u016D": "\u044A"
17+
"U\u0310": "\u046A"
3218
"u\u0310": "\u046B"
3319
# this conversion is ambiguous - \u042A is also theoretically possible
3420
"\u02BA": "\u044A"
21+
# upper case hard sign is unlikely to occur
22+
"\u02BA\u0332": "\u042A"
3523

3624
script_to_roman:
3725
map:
38-
"\u044C": ""
39-
"\u042C": ""
40-
"\u044A": ""
41-
"\u042A%": "" # Final
42-
"\u042A": "u\u0306"
43-
"\u0413": "G"
44-
"\u0433": "g"
45-
"\u0416": "Zh"
46-
"\u0436": "zh"
47-
"\u0462": "I\uFE20E\uFE21"
48-
"\u0418": "I"
49-
"\u0463": "i\uFE20e\uFE21"
50-
"\u0438": "i"
5126
"\u0429": "Sht"
27+
"\u042A": "U\u0306"
28+
# Capital letter hard sign at the end of a word (rare)
29+
"\u042A%": "\u02BA\u0332"
30+
"\u042C": "\u02B9\u0332"
5231
"\u0449": "sht"
53-
"\u0426": "T\uFE20S\uFE21"
54-
"\u0446": "t\uFE20s\uFE21"
32+
"\u044A": "u\u0306"
33+
# Small letter hard sign at the end of a word (rare)
34+
"\u044A%": "\u02BA"
35+
"\u044C": "\u02B9"
5536
"\u046A": "U\u0310"
5637
"\u046B": "u\u0310"
57-
"\u042A": "u\u016C"
58-
"\u044A": "u\u016D"
38+

0 commit comments

Comments
 (0)