Skip to content

Commit 2489c54

Browse files
committed
Reverse % sign for word boundary.
1 parent ddb6ce3 commit 2489c54

8 files changed

Lines changed: 303 additions & 299 deletions

File tree

scriptshifter/tables/data/arabic.yml

Lines changed: 58 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
---
55
general:
66
name: Arabic
7-
description: Arabic R2S using a conversion table and S2R using a 3rd party library.
7+
description: >
8+
Arabic R2S using a conversion table and S2R using a 3rd party library.
89
case_sensitive: false
910

1011
parents:
@@ -16,17 +17,18 @@ roman_to_script:
1617

1718
# Original table by David Bucknum
1819
# Last updated 25 January 2019
19-
# Modified by WK with testing by Arabic Cat Staff LOC-CAIRO
20-
# Additional info from R. Vassie, [n.d.] "Marrying the Arabic and Latin Scripts Conceptually"
20+
# Modified by WK with testing by Arabic Cat Staff LOC-CAIRO
21+
# Additional info from R. Vassie, [n.d.] "Marrying the Arabic and Latin
22+
# Scripts Conceptually"
2123

2224

23-
# Punctuation marks:
25+
# Punctuation marks:
2426
"*": "\u066D"
2527
",": "\u060C"
2628
";": "\u061B"
2729
"?": "\u061F"
2830

29-
# Exceptions for specific words
31+
# Exceptions for specific words
3032
# Allah
3133
"Alla\u0304h": "\u0627\u0644\u0644\u0647"
3234

@@ -66,19 +68,19 @@ roman_to_script:
6668

6769
# "sh[dot below] as in "Ishaq"
6870

69-
"%sh\u0323%": "\u0633\u062D"
71+
"sh\u0323": "\u0633\u062D"
7072

7173
# "s[prime]h" combos
7274

73-
"%s\u02B9h%": "\u0633\u0647"
75+
"s\u02B9h": "\u0633\u0647"
7476

7577
# "th[dot below]"
7678

77-
"%th\u0323%": "\u062A\u062D"
79+
"th\u0323": "\u062A\u062D"
7880

79-
# dh[dot under]
81+
# dh[dot under]
8082

81-
"%dh\u0323%": "\u062F\u062D"
83+
"dh\u0323": "\u062F\u062D"
8284

8385
# La-hu
8486

@@ -96,7 +98,9 @@ roman_to_script:
9698
"mi\u02BEat": "\u0645\u0627\u0626\u0629"
9799
"mi\u02BCat": "\u0645\u0627\u0626\u0629"
98100

99-
# Numbers (I have set these to Hindi numbers. Note that Persian and Urdu will technically use \u06F0-06F9. This needs further discussion with PSD as RLIN21 used Hindi numbers, Connexion and Voyager does not.)
101+
# Numbers (I have set these to Hindi numbers. Note that Persian and Urdu
102+
# will technically use \u06F0-06F9. This needs further discussion with PSD
103+
# as RLIN21 used Hindi numbers, while Connexion and Voyager do not.)
100104

101105
# Edition statements with Latin number
102106
"al-T\u0323ab\u02BBah 1": "\u0627\u0644\u0637\u0628\u0639\u0629 1"
@@ -132,15 +136,15 @@ roman_to_script:
132136
"ka-": "\u0643"
133137

134138
# Vowels and vowel/consonant combinations
135-
"%ah": "\u0629"
136-
"%at": "\u0629"
139+
"ah%": "\u0629"
140+
"at%": "\u0629"
137141

138-
#tanwin
139-
"%an": "\u0627"
142+
# tanwin
143+
"an%": "\u0627"
140144

141145
# ayn-alif combo
142-
"%\u02BBa\u0304\u02BE": "\u0639\u0627\u0621"
143-
"%\u02BBa\u0304\u02BC": "\u0639\u0627\u0621"
146+
"\u02BBa\u0304\u02BE%": "\u0639\u0627\u0621"
147+
"\u02BBa\u0304\u02BC%": "\u0639\u0627\u0621"
144148

145149
"\u02BBA\u0304": "\u0639\u0627"
146150
"\u02BBa\u0304": "\u0639\u0627"
@@ -153,27 +157,27 @@ roman_to_script:
153157
"\u02BBU": "\u0639"
154158
"\u02BBu": "\u0639"
155159

156-
"\u02BBA%": "\u0639"
157-
#"\u02BBa%": "\u0639"
160+
"%\u02BBA": "\u0639"
161+
# "%\u02BBa": "\u0639"
158162

159163
# alif and hamzas for all occasions
160164

161-
# truncation necessary? It seems to work fine with.
165+
# truncation necessary? It seems to work fine with.
162166

163-
"%i\u0304\u02BEah": "\u064A\u0626\u0629"
164-
"%i\u0304\u02BCah": "\u064A\u0626\u0629"
167+
"i\u0304\u02BEah%": "\u064A\u0626\u0629"
168+
"i\u0304\u02BCah%": "\u064A\u0626\u0629"
165169

166-
"%i\u0304\u02BEat": "\u064A\u0626\u0629"
167-
"%i\u0304\u02BCat": "\u064A\u0626\u0629"
170+
"i\u0304\u02BEat%": "\u064A\u0626\u0629"
171+
"i\u0304\u02BCat%": "\u064A\u0626\u0629"
168172

169-
"%i\u02BEa\u0304": "\u0626\u0627"
170-
"%i\u02BCa\u0304": "\u0626\u0627"
173+
"i\u02BEa\u0304%": "\u0626\u0627"
174+
"i\u02BCa\u0304%": "\u0626\u0627"
171175

172-
"%i\u02BE": "\u0626"
173-
"%i\u02BC": "\u0626"
176+
"i\u02BE%": "\u0626"
177+
"i\u02BC%": "\u0626"
174178
"a\u0304\u02BEa\u0304": "\u0627\u0621\u0627"
175179
"a\u0304\u02BCa\u0304": "\u0627\u0621\u0627"
176-
180+
177181
"a\u02BE": "\u0623"
178182
"a\u02BC": "\u0623"
179183
"\u02BEi": "\u0626"
@@ -198,64 +202,66 @@ roman_to_script:
198202
"a\u0304\u02BEi": "\u0627\u0626"
199203
"a\u0304\u02BC": "\u0627\u0621"
200204
"a\u0304\u02BE": "\u0627\u0621"
201-
"A\u0304%": "\u0622"
202-
"a\u0304%": "\u0622"
205+
"%A\u0304": "\u0622"
206+
"%a\u0304": "\u0622"
203207
"A\u0304": "\u0627"
204208
"a\u0304": "\u0627"
205209

206-
# These next two lines were intended to convert to alif-ayn when it is at the beginning of a word, definite or indefinine (i.e. al-a[ayn]ma[macron]l or [space]a[ayn]ma[macron]l"
207-
"A\u02BB%": "\u0623\u0639"
208-
"a\u02BB%": "\u0623\u0639"
210+
# These next two lines were intended to convert to alif-ayn when it is at
211+
# the beginning of a word, definite or indefinite (i.e.
212+
# al-a[ayn]ma[macron]l or [space]a[ayn]ma[macron]l)
213+
"%A\u02BB": "\u0623\u0639"
214+
"%a\u02BB": "\u0623\u0639"
209215
"a\u02BB": "\u0639"
210216
"A\u0301": "\u0649"
211217
"a\u0301": "\u0649"
212218

213219
"ayy": "\u064A"
214-
"A%": "\u0623"
215-
"a%": "\u0627"
220+
"%A": "\u0623"
221+
"%a": "\u0627"
216222
"A": "\u0623"
217223
"a": ""
218224

219225
# I - Capital I at beginning of word is usually alif hamzah-below.
220226

221-
"%i\u0304": "\u064A"
227+
"i\u0304%": "\u064A"
222228
"i\u0304y": "\u064A"
223229
"iy": "\u064A"
224-
"I\u0304%": "\u0625\u064A"
230+
"%I\u0304": "\u0625\u064A"
225231
"i\u0304": "\u064A"
226-
"\u02BBI%": "\u0639"
232+
"%\u02BBI": "\u0639"
227233

228-
#"i\u02BB": "\u0625\u0639"
234+
# "i\u02BB": "\u0625\u0639"
229235

230236
"I\u02BE": "\u0627\u0626"
231237
"I\u02BC": "\u0627\u0626"
232238
"i\u02BE": "\u0626"
233239
"i\u02BC": "\u0627\u0626"
234240

235-
"I%": "\u0625"
236-
"i%": "\u0625"
241+
"%I": "\u0625"
242+
"%i": "\u0625"
237243
"I": "\u0625"
238244
"i": ""
239245

240-
# U
246+
# U
241247

242248
"u\u0304\u02BE": "\u0624"
243249
"u\u0304\u02BC": "\u0624"
244-
"U\u0304w%": "\u0623\u0648"
245-
"u\u0304w%": "\u0623\u0648"
246-
"U\u0304%": "\u0623\u0648"
247-
"u\u0304%": "\u0623\u0648"
250+
"%U\u0304w": "\u0623\u0648"
251+
"%u\u0304w": "\u0623\u0648"
252+
"%U\u0304": "\u0623\u0648"
253+
"%u\u0304": "\u0623\u0648"
248254
"u\u0304w": "\u0648"
249255
"u\u0304": "\u0648"
250256
"u\u02BE": "\u0624"
251257
"u\u02BC": "\u0624"
252258

253-
"U%": "\u0623"
254-
"u%": "\u0623"
259+
"%U": "\u0623"
260+
"%u": "\u0623"
255261
"U": "\u0623"
256262
"u": ""
257263

258-
# Consonants, with tashdid added
264+
# Consonants, with tashdid added
259265

260266
"B": "\u0628"
261267
"bb": "\u0628"
@@ -352,8 +358,8 @@ roman_to_script:
352358
# hamza - not romanized
353359
# "\u0621"
354360
# hamza (alone in final position)
355-
"%\u02BE": "\u0621"
356-
"%\u02BC": "\u0621"
361+
"\u02BE%": "\u0621"
362+
"\u02BC%": "\u0621"
357363

358364
# Do not know what, if anything, is needed here:
359365
# tatweel:
@@ -380,8 +386,6 @@ roman_to_script:
380386
# "\u0671"
381387

382388

383-
384-
385389
script_to_roman:
386390
hooks:
387391
post_config:

scriptshifter/tables/data/bulgarian.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ script_to_roman:
3838
"\u044C": ""
3939
"\u042C": ""
4040
"\u044A": ""
41-
"\u042A%": "u\u0306"
42-
"\u042A": ""
41+
"\u042A%": "" # Final
42+
"\u042A": "u\u0306"
4343
"\u0413": "G"
4444
"\u0433": "g"
4545
"\u0416": "Zh"

scriptshifter/tables/data/divehi_thaana.yml

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ roman_to_script:
3030
"h\u032E": "\u0781\u07B0"
3131

3232
# THAANA LETTER ALIFU FINAL WITH SUKUN (LOW LINE 0332)
33-
"%H\u0332": "\u0787\u07B0"
34-
"%h\u0332": "\u0787\u07B0"
33+
"H\u0332%": "\u0787\u07B0"
34+
"h\u0332%": "\u0787\u07B0"
3535
"H": "\u0780"
3636
"h": "\u0780"
3737
"S\u0301": "\u0781"
@@ -70,13 +70,13 @@ roman_to_script:
7070
"F": "\u078A"
7171
"ff": "\u0787\u07B0\u078A"
7272
"f": "\u078A"
73-
# THAANA LETTER "D/d" WITH DOT BELOW (0323)
73+
# THAANA LETTER "D/d" WITH DOT BELOW (0323)
7474
"D\u0323": "\u0791"
7575
"d\u0323": "\u0791"
7676
"D": "\u078B"
7777
"dd": "\u0787\u07B0\u078B"
7878
"d": "\u078B"
79-
# THAANA LETTER "T/t" WITH DOT BELOW (0323)
79+
# THAANA LETTER "T/t" WITH DOT BELOW (0323)
8080
"T\u0323": "\u0793"
8181
"t\u0323": "\u0793"
8282
"T\u0324T": "\u078C\u07B0\u078C"
@@ -169,7 +169,7 @@ roman_to_script:
169169
"ghgh": "\u0787\u07B0\u07A3"
170170
"gh": "\u07A3"
171171

172-
# THAANA EXTENSION FOR ARABIC LETTER QAAFU
172+
# THAANA EXTENSION FOR ARABIC LETTER QAAFU
173173
"Q": "\u07A4"
174174
"qq": "\u0787\u07B0\u07A4"
175175
"q": "\u07A4"
@@ -181,24 +181,24 @@ roman_to_script:
181181

182182
# INITIAL (AND UPPERCASE) VOWELS THAT CONVERT
183183
# TO ALIF FOLLOWED BY VOWEL (ALIF OMITTED IN ROMANIZATION)
184-
"A\u0304%": "\u0787\u07A7"
185-
"A%": "\u0787\u07A6"
184+
"%A\u0304": "\u0787\u07A7"
185+
"%A": "\u0787\u07A6"
186186
"\u0020a\u0304": "\u0020\u0787\u07A7"
187187
"\u0020a": "\u0020\u0787\u07A6"
188-
"E\u0304%": "\u0787\u07AD"
189-
"E%": "\u0787\u07AC"
188+
"%E\u0304": "\u0787\u07AD"
189+
"%E": "\u0787\u07AC"
190190
"\u0020e\u0304": "\u0020\u0787\u07AD"
191191
"\u0020e": "\u0020\u0787\u07AC"
192-
"I\u0304%": "\u0787\u07A9"
193-
"I%": "\u0787\u07A8"
192+
"%I\u0304": "\u0787\u07A9"
193+
"%I": "\u0787\u07A8"
194194
"\u0020i\u0304": "\u0020\u0787\u07A9"
195195
"\u0020i": "\u0020\u0787\u07A8"
196-
"O\u0304%": "\u0787\u07AF"
197-
"O%": "\u0787\u07AE"
196+
"%O\u0304": "\u0787\u07AF"
197+
"%O": "\u0787\u07AE"
198198
"\u0020o\u0304": "\u0020\u0787\u07AF"
199199
"\u0020o": "\u0020\u0787\u07AE"
200-
"U\u0304%": "\u0787\u07AB"
201-
"U%": "\u0787\u07AB"
200+
"%U\u0304": "\u0787\u07AB"
201+
"%U": "\u0787\u07AB"
202202
"\u0020u\u0304": "\u0020\u0787\u07AB"
203203
"\u0020u": "\u0020\u0787\u07AB"
204204

@@ -301,7 +301,7 @@ script_to_roman:
301301
"\u0787\u07B0\u078A": "ff"
302302
"\u078A": "f"
303303

304-
# THAANA LETTER "D/d" WITH DOT BELOW (0323)
304+
# THAANA LETTER "D/d" WITH DOT BELOW (0323)
305305
"\u0787\u07B0\u0791": "d\u0323d\u0323"
306306
"\u0791": "d\u0323"
307307

@@ -386,7 +386,7 @@ script_to_roman:
386386
"\u0787\u07B0\u07A3": "ghgh"
387387
"\u07A3": "gh"
388388

389-
# THAANA EXTENSION FOR ARABIC LETTER QAAFU
389+
# THAANA EXTENSION FOR ARABIC LETTER QAAFU
390390
"\u0787\u07B0\u07A4": "qq"
391391
"\u07A4": "q"
392392

0 commit comments

Comments
 (0)