Skip to content

Commit f4b7142

Browse files
committed
Merge branch 'main' of gh:lcnetdev/scriptshifter
2 parents a255131 + fd14c6a commit f4b7142

4 files changed

Lines changed: 304 additions & 9 deletions

File tree

scriptshifter/tables/data/arabic.yml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@ general:
1515
roman_to_script:
1616
map:
1717

18-
# Original table by David Bucknum
19-
# Last updated 25 January 2019
18+
# Original table by David Bucknum, 5 April 2010
19+
# Updated, 25 January 2019
2020
# Modified by WK with testing by Arabic Cat Staff LOC-CAIRO
2121
# Additional info from R. Vassie, [n.d.] "Marrying the Arabic and Latin
2222
# Scripts Conceptually"
23+
# Updated, 26 March 2025 by Randall K. Barry to reverse truncation marks for ScriptShifter
2324

2425

2526
# Punctuation marks:
@@ -135,11 +136,11 @@ roman_to_script:
135136
"fi\u0304-": "\u0641\u064A"
136137
"ka-": "\u0643"
137138

138-
# Vowels and vowel/consonant combinations
139+
# Vowels and vowel/consonant combinations - ta-marbutah at end of word
139140
"ah%": "\u0629"
140141
"at%": "\u0629"
141142

142-
# tanwin
143+
# tanwin at end of word
143144
"an%": "\u0627"
144145

145146
# ayn-alif combo
@@ -149,6 +150,8 @@ roman_to_script:
149150
"\u02BBA\u0304": "\u0639\u0627"
150151
"\u02BBa\u0304": "\u0639\u0627"
151152

153+
"\u02BBI\u0304Y": "\u0639\u064A"
154+
"\u02BBi\u0304y": "\u0639\u064A"
152155
"\u02BBI\u0304": "\u0639\u064A"
153156
"\u02BBi\u0304": "\u0639\u064A"
154157

scriptshifter/tables/data/macedonian.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ roman_to_script:
1414
"\u01F5": "\u0453"
1515
"g": "\u0433"
1616
"\u0110": "\u0402"
17-
"D\uFE20Z\u030C\uFE21": "\040F"
18-
"D\uFE20z\u030C\uFE21": "\040F"
19-
"d\uFE20Z\u030C\uFE21": "\040F"
17+
"D\uFE20Z\u030C\uFE21": "\u040F"
18+
"D\uFE20z\u030C\uFE21": "\u040F"
19+
"d\uFE20Z\u030C\uFE21": "\u040F"
2020
# Mapping from precomposed non-MARC-8 Latin equivalent
2121
"\u01C4": "\u040F"
2222
# Mapping from precomposed non-MARC-8 Latin equivalent
2323
"\u01C5": "\u040F"
24-
"d\uFE20z\u030C\uFE21": "\045F"
24+
"d\uFE20z\u030C\uFE21": "\u045F"
2525
# Mapping from precomposed non-MARC-8 Latin equivalent
2626
"\u01C6": "\u045F"
2727
"D\uFE20Z\uFE21": "\u0405"
Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
---
2+
general:
3+
name: Tod (Mongolian)
4+
parents:
5+
- _ignore_base
6+
case_sensitive: false
7+
8+
roman_to_script:
9+
10+
map:
11+
# Generates Narrow No-Break Space
12+
"\u002Daca": "\u202F\u1820\u1854\u1820"
13+
"\u002DA": "\u180E\u1820"
14+
"\u002Da": "\u180E\u1820"
15+
"A": "\u1820"
16+
"a": "\u1820"
17+
"\u002Dece": "\u202F\u1844\u1854\u1844"
18+
"\u002DE": "\u180E\u1844"
19+
"\u002De": "\u180E\u1844"
20+
# Generates Narrow No-Break Space
21+
"\u002D": "\u202F"
22+
"E": "\u1844"
23+
"e": "\u1844"
24+
"I": "\u1845"
25+
"i": "\u1845"
26+
"O\u0308": "\u1848"
27+
"o\u0308": "\u1848"
28+
"O": "\u1846"
29+
"o": "\u1846"
30+
"U\u0308": "\u1849"
31+
"u\u0308": "\u1849"
32+
"U": "\u1847"
33+
"u": "\u1847"
34+
# n followed by a g with dot
35+
"ng\u0307": "\u1828\u184E"
36+
# this conversion shouldn't be needed, but does no harm
37+
"nG": "\u184A"
38+
"ng": "\u184A"
39+
"N\u0303": "\u185B"
40+
"n\u0303": "\u185B"
41+
"N": "\u1828"
42+
"n": "\u1828"
43+
"B": "\u184B"
44+
"b": "\u184B"
45+
"P": "\u184C"
46+
"p": "\u184C"
47+
"Q": "\u184E"
48+
"q": "\u184E"
49+
"KH": "\u183B"
50+
"Kh": "\u183B"
51+
# this conversion shouldn't be needed, but does no harm
52+
"kH": "\u183B"
53+
"kh": "\u183B"
54+
"K\u0307": "\u1857"
55+
"k\u0307": "\u1857"
56+
"Ka": "\u1857\u1820"
57+
"ka": "\u1857\u1820"
58+
"Ke": "\u184D\u1844"
59+
"ke": "\u184D\u1844"
60+
"Ki": "\u184D\u1845"
61+
"ki": "\u184D\u1845"
62+
"Ko\u0308": "\u184D\u1848"
63+
"ko\u0308": "\u184D\u1848"
64+
"Ko": "\u1857\u1846"
65+
"ko": "\u1857\u1846"
66+
"Ku\u0308": "\u184D\u1849"
67+
"ku\u0308": "\u184D\u1849"
68+
"Ku": "\u1857\u1847"
69+
"ku": "\u1857\u1847"
70+
"K": "\u1857"
71+
"k": "\u1857"
72+
"G\u0307": "\u184E"
73+
"g\u0307": "\u184E"
74+
"G": "\u184E"
75+
"g": "\u184E"
76+
"M": "\u184F"
77+
"m": "\u184F"
78+
"LH": "\u1840"
79+
"Lh": "\u1840"
80+
# this conversion shouldn't be needed, but does no harm
81+
"lH": "\u1840"
82+
"lh": "\u1840"
83+
"L": "\u182F"
84+
"l": "\u182F"
85+
"TS\u0307": "\u1854"
86+
# this conversion shouldn't be needed, but does no harm
87+
"Ts\u0307": "\u1854"
88+
# this conversion shouldn't be needed, but does no harm
89+
"tS\u0307": "\u1854"
90+
"ts\u0307": "\u1854"
91+
"S\u0301": "\u1831"
92+
"s\u0301": "\u1831"
93+
"S": "\u1830"
94+
"s": "\u1830"
95+
"T": "\u1850"
96+
"t": "\u1850"
97+
"D": "\u1851"
98+
"d": "\u1851"
99+
"J\u0301": "\u185A"
100+
"j\u0301": "\u185A"
101+
"J": "\u1853"
102+
"j": "\u1853"
103+
"Y": "\u1855"
104+
"y": "\u1855"
105+
"V": "\u1856"
106+
"v": "\u1856"
107+
"W": "\u1856"
108+
"w": "\u1856"
109+
"F": "\u1839"
110+
"f": "\u1839"
111+
"Xa": "\u184D\u1820"
112+
"xa": "\u184D\u1820"
113+
"Xe": "\u184D\u1844"
114+
"xe": "\u184D\u1844"
115+
"Xi": "\u184D\u1845"
116+
"xi": "\u184D\u1845"
117+
"Xo\u0308": "\u184D\u1848"
118+
"xo\u0308": "\u184D\u1848"
119+
"Xo": "\u184D\u1846"
120+
"xo": "\u184D\u1846"
121+
"Xu\u0308": "\u184D\u1849"
122+
"xu\u0308": "\u184D\u1849"
123+
"Xu": "\u184D\u1847"
124+
"xu": "\u184D\u1847"
125+
"X": "\u184D"
126+
"x": "\u184D"
127+
"Z\u0301": "\u183F"
128+
"z\u0301": "\u183F"
129+
"ZR": "\u183F"
130+
# this conversion shouldn't be needed, but does no harm
131+
"Zr": "\u183F"
132+
# this conversion shouldn't be needed, but does no harm
133+
"zR": "\u183F"
134+
"zr": "\u183F"
135+
"R": "\u1837"
136+
"r": "\u1837"
137+
"ZH": "\u1841"
138+
"Zh": "\u1841"
139+
# this conversion shouldn't be needed, but does no harm
140+
"zH": "\u1841"
141+
"zh": "\u1841"
142+
"CH": "\u1842"
143+
"Ch": "\u1842"
144+
# this conversion shouldn't be needed, but does no harm
145+
"cH": "\u1842"
146+
"ch": "\u1842"
147+
"C\u0307": "\u1878"
148+
"c\u0307": "\u1878"
149+
"C\u0301": "\u183C"
150+
"c\u0301": "\u183C"
151+
"C": "\u1852"
152+
"c": "\u1852"
153+
"H": "\u183E"
154+
"h": "\u183E"
155+
"Z": "\u1834"
156+
"z": "\u1834"
157+
"...": "\u1801"
158+
"..": "\u1803"
159+
".": "\u180A"
160+
",": "\u1802"
161+
":": "\u1804"
162+
# Left pointing double angle quotation mark
163+
"\u003C\u003C": "\u300A"
164+
# Right pointing double angle quotation mark
165+
"\u003E\u003E": "\u300B"
166+
"0": "\u1810"
167+
"1": "\u1811"
168+
"2": "\u1812"
169+
"3": "\u1813"
170+
"4": "\u1814"
171+
"5": "\u1815"
172+
"6": "\u1816"
173+
"7": "\u1817"
174+
"8": "\u1818"
175+
"9": "\u1819"
176+
"\u0304": "\u1843"
177+
178+
script_to_roman:
179+
180+
map:
181+
"\u184E\u1820": "g\u0307a"
182+
"\u184E\u1846": "g\u0307o"
183+
"\u184E\u1847": "g\u0307u"
184+
"\u1820\u184E": "aq"
185+
"\u1846\u184E": "oq"
186+
"\u1847\u184E": "uq"
187+
"\u184E\u1844": "ge"
188+
"\u184E\u1845": "gi"
189+
"\u184E\u1848": "go\u0308"
190+
"\u184E\u1849": "gu\u0308"
191+
"\u1844\u184E": "eq"
192+
"\u1845\u184E": "iq"
193+
"\u1848\u184E": "o\u0308q"
194+
"\u1849\u184E": "u\u0308q"
195+
# NOTE(review): the seven "\u184D" + vowel keys below occur a second time
# later in this same map, and four of them carry different values there
# ("xe", "xi", "xo\u0308", "xu\u0308" instead of "ke", "ki", "ko\u0308",
# "ku\u0308"). Duplicate keys are invalid YAML, and most loaders silently
# keep the last occurrence, which would make the entries below dead.
# TODO confirm which set is intended and remove the other.
"\u184D\u1820": "xa"
196+
"\u184D\u1846": "xo"
197+
"\u184D\u1847": "xu"
198+
"\u184D\u1844": "ke"
199+
"\u184D\u1845": "ki"
200+
"\u184D\u1848": "ko\u0308"
201+
"\u184D\u1849": "ku\u0308"
202+
"\u180E\u1820": "\u002Da"
203+
"\u180E\u1844": "\u002De"
204+
# Same pattern as the two entries above: an escaped hyphen-minus (U+002D)
# followed by the vowel.
"\u180E\u1845": "\u002Di"
205+
"\u180E": "\u002D"
206+
"\u202F": "\u002D"
207+
"\u1801": "..."
208+
"\u1803": "."
209+
"\u1802": ","
210+
"\u1804": ":"
211+
"\u1810": "0"
212+
"\u1811": "1"
213+
"\u1812": "2"
214+
"\u1813": "3"
215+
"\u1814": "4"
216+
"\u1815": "5"
217+
"\u1816": "6"
218+
"\u1817": "7"
219+
"\u1818": "8"
220+
"\u1819": "9"
221+
"\u1820": "a"
222+
"\u1828": "n"
223+
"\u182F": "l"
224+
"\u1830": "s"
225+
"\u1831": "s\u0301"
226+
"\u1834": "z"
227+
"\u1837": "r"
228+
"\u1839": "f"
229+
"\u183C": "c\u0301"
230+
"\u183E": "h"
231+
"\u183F": "z\u0301"
232+
"\u1843": "\u0304"
233+
"\u1844": "e"
234+
"\u1845": "i"
235+
"\u1846": "o"
236+
"\u1847": "u"
237+
"\u1848": "o\u0308"
238+
"\u1849": "u\u0308"
239+
"\u184A": "ng"
240+
"\u184B": "b"
241+
"\u184C": "p"
242+
"\u184D\u1820": "xa"
243+
"\u184D\u1844": "xe"
244+
"\u184D\u1845": "xi"
245+
"\u184D\u1848": "xo\u0308"
246+
"\u184D\u1846": "xo"
247+
"\u184D\u1849": "xu\u0308"
248+
"\u184D\u1847": "xu"
249+
"\u184D": "q"
250+
"\u184E": "g"
251+
"\u184F": "m"
252+
"\u1850": "t"
253+
"\u1851": "d"
254+
"\u1852": "c"
255+
"\u1853": "j"
256+
"\u1854": "ts"
257+
"\u1855": "y"
258+
"\u1856": "v"
259+
"\u1857\u1820": "ka"
260+
"\u1857\u1844": "ke"
261+
"\u1857\u1845": "ki"
262+
"\u1857\u1846": "ko"
263+
"\u1857\u1847": "ku"
264+
"\u1857\u1848": "ko\u0308"
265+
"\u1857\u1849": "ku\u0308"
266+
"\u1857": "k"
267+
"\u1858": "g"
268+
"\u1859": "h"
269+
"\u185A": "j\u0301"
270+
# Reverse of the "N\u0303"/"n\u0303" entries in roman_to_script, which are
# the ones that produce U+185B ("k\u0307" maps to U+1857 there).
"\u185B": "n\u0303"
271+
"\u185C": "j"
272+
"\u00AB": "\u003C\u003C"
273+
"\u00BB": "\u003E\u003E"
274+
"\u300A": "\u0022"
275+
"\u300B": "\u0022"

scriptshifter/tables/index.yml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,16 @@ chuvash_cyrillic:
7272
marc_code: chv
7373
name: Chuvash (Cyrillic)
7474
cyrillic_generic:
75-
name: Cyrillic (generic)
75+
description: 'Multi-purpose transliteration for most languages that use the Cyrillic script:
76+
Abaza, Abkhaz, Adygei, Aisor, Altai, Avar, Azeri, Balkar, Bashkir, Belarusian, Bulgarian,
77+
Buryat, Chechen, Chukchi, Chuvash, Dargwa, Dungan, Eskimo, Even, Evenki, Gagauz, Ingush,
78+
Inuit, Kabardian, Kalmyk, Karachay, Karachay-Balkar, Karakalpak, Karelian, Khakass, Khanty,
79+
Komi, Komi-Permyak, Koryak, Kumyk, Lak, Lapp, Lezghian, Lithuanian, Macedonian, Mansi, Mari,
80+
Moldovan, Molodtsov, Mordvin, Nanai, Nenets, Nivkh, Nogai, Ossetic, Permyak, Romanian, Romany,
81+
Russian, Selkup, Serbian, Shor, Tabasaran, Tat, Tuva, Udekhe, Udmurt, Ukrainian, Yakut.'
82+
marc_code: abk, ady, alt, ava, bak, bel, bul, che, chm, chv, dar, ale, esk, kbd, xal, krc, kaa,
83+
krl, kom, kum, lez, lit, mac, nog, oss, rum, rom, sah, sel, srp, udm, ukr
84+
name: Cyrillic (Generic)
7685
devanagari:
7786
marc_code: hin, san
7887
name: Devanagari
@@ -169,6 +178,9 @@ macedonian:
169178
malayalam:
170179
marc_code: mal
171180
name: Malayalam
181+
manchu:
182+
marc_code: mnc
183+
name: Manchu
172184
mansi_cyrillic:
173185
name: Mansi (Cyrillic)
174186
marathi_devanagari:
@@ -268,6 +280,9 @@ thai:
268280
tibetan:
269281
marc_code: tib
270282
name: Tibetan
283+
tod_mongolian:
284+
marc_code: xal
285+
name: Tod Mongolian
271286
turkmen_cyrillic:
272287
marc_code: tuk
273288
name: Turkmen (Cyrillic)
@@ -300,3 +315,5 @@ yiddish:
300315
name: Yiddish
301316
yuit_cyrillic:
302317
name: Yuit (Cyrillic)
318+
319+

0 commit comments

Comments
 (0)