Skip to content

Commit bf2cfab

Browse files
committed
Use Unicode classes for word boundary markers.
1 parent e4a21ee commit bf2cfab

3 files changed

Lines changed: 14 additions & 32 deletions

File tree

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ flask>=2.3,<3
55
flask-cors>=4.0,<5
66
python-dotenv>=1.0,<2
77
pyyaml>=6.0,<7
8+
regex>=2023.8.8
89
uwsgi>=2.0,<2.1
910
yiddish==0.0.21

scriptshifter/trans.py

Lines changed: 12 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import logging
22

33
from importlib import import_module
4-
from re import Pattern, compile
4+
from re import Pattern
5+
from regex import compile
56
from unicodedata import normalize as precomp_normalize
67

78
from scriptshifter.exceptions import BREAK, CONT
@@ -13,8 +14,10 @@
1314

1415
logger = logging.getLogger(__name__)
1516

16-
WORD_PTN = compile(r"\w")
17-
WB_PTN = compile(r"\W")
17+
# Beginning-of-word pattern.
18+
BOW_PTN = compile(r"(?<=[\p{P}\p{Z}]|^)[\p{L}\p{M}\p{S}]")
19+
# End-of-word pattern.
20+
EOW_PTN = compile(r"[\p{L}\p{M}\p{S}](?=[\p{P}\p{Z}]|$)")
1821

1922

2023
class Transliterator:
@@ -107,33 +110,10 @@ def normalize_src(self):
107110
for nk, nv in norm_rules.items():
108111
self.src = self.src.replace(nk, nv)
109112

110-
return self.run_hook("post_normalize")
111-
112-
def cur_at_bow(self, cur=None):
113-
"""
114-
Check if cursor is at the beginning of a word.
115-
116-
@param cur(int): Position to check. By default, the current cursor.
117-
"""
118-
if cur is None:
119-
cur = self.cur
120-
return (
121-
self.cur == 0
122-
or WB_PTN.match(self.src[cur - 1])
123-
) and WORD_PTN.match(self.src[cur])
124-
125-
def cur_at_eow(self, cur=None):
126-
"""
127-
Check if cursor is at the end of a word.
113+
self.bow_coords = {m.span()[0] for m in BOW_PTN.finditer(self.src)}
114+
self.eow_coords = {m.span()[0] for m in EOW_PTN.finditer(self.src)}
128115

129-
@param cur(int): Position to check. By default, the current cursor.
130-
"""
131-
if cur is None:
132-
cur = self.cur
133-
return (
134-
cur == len(self.src) - 1
135-
or WB_PTN.match(self.src[cur + 1])
136-
) and WORD_PTN.match(self.src[cur])
116+
return self.run_hook("post_normalize")
137117

138118

139119
def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
@@ -209,11 +189,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
209189
ctx.cur_flags = 0
210190

211191
# Look for a word boundary and, if found, flag it as the beginning and/or end of a word.
212-
if ctx.cur_at_bow():
192+
if ctx.cur in ctx.bow_coords:
213193
# Beginning of word.
214194
logger.debug(f"Beginning of word at position {ctx.cur}.")
215195
ctx.cur_flags |= BOW
216-
if ctx.cur_at_eow():
196+
if ctx.cur in ctx.eow_coords:
217197
# End of word.
218198
logger.debug(f"End of word at position {ctx.cur}.")
219199
ctx.cur_flags |= EOW
@@ -319,7 +299,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
319299
# Can't rely on EOW flag, we must check on the last
320300
# character of the potential match.
321301
ctx.src_tk.flags & EOW
322-
and not ctx.cur_at_eow(ctx.cur + step - 1)
302+
and ctx.cur + step - 1 not in ctx.eow_coords
323303
)
324304
):
325305
continue

test/data/script_samples/tibetan.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,4 @@
2525
"tibetan","Rgyud don rig-ʼdzin dgyes paʾi źal luṅ ","རྒྱུད་དོན་རིག་འཛིན་དགྱེས་པའི་ཞལ་ལུང་།","r2s","{""capitalize"": ""first""}","From Lobsang"
2626
"tibetan","Gnas brtan chen po bcu drug gi mchod pa rgyal bstan mdzad med nor bu zhes bya ba bzhugs so","གནས་བརྟན་ཆེན་པོ་བཅུ་དྲུག་གི་མཆོད་པ་རྒྱལ་བསྟན་མཛད་མེད་ནོར་བུ་ཞེས་བྱ་བ་བཞུགས་སོ།","r2s","{""capitalize"": ""first""}","From Lobsang"
2727
"tibetan","སྐྱབས་འགྲོ་ཡན་ལག་དྲུག་པ་བཤད་པའི་བསྐྱུད་བྱང་ཟིན་མ་ཊཱིཀ་བཞུགས་སོ།","skyabs ’gro yan lag drug pa bshad pa’i bskyud byang zin ma ṭīka bzhugs so","s2r",,"From Lobsang"
28+
"tibetan","བོད་ཀྱི་ཆོས་འབྱུང་དང་རྒྱལ་རབས་ཀྱི་སྐོར: སྦ་བཞེད་དེབ་ཐེར་དམར་པོ: དེབ་ཐེར་དཀར་པོ: གངས༌ཅན་བོད་ཀྱི་རྒྱལ་བསྟན་ཕྱི་མོ་སྔ་འགྱུར་རྙིང་མའི་བྱུང་བ་མདོ་ཙམ་བརྗོད་པ་པདྨ་དམ་རྭ་གའི་དོ་ཤལ་གཞོན་ནུ་དགྱེས་པའི་མགུལ་རྒྱན / སྦ༌གསལ༌སྣང, ་ཚལ༌པ་ཀུན༌དགའ༌རྡོ༌རྗེ, ་དགེ༌འདུན༌ཆོས༌འཕེལ, ་ཁམས༌སྤྲུལ་བསོད༌ནམས༌དོན༌གྲུབ / པའི་བེའུ་མིའི་དགེའི་ཤུའི་མོའི་ཐུའུ་གསོའི་པའོ","Bod kyi chos ʾbyung dang rgyal rabs kyi skor : Sba bzhed Deb ther dmar po : Deb ther dkar po : Gangs-can Bod kyi rgyal bstan phyi mo snga ʾgyur rnying maʾi byung ba mdo tsam brjod pa Padma dma rwa gaʾi do shal gzhon nu dgyes paʾi mgul rgyan / Sba-gsal-snang, Tshal-pa Kun-dgaʾ-rdo-rje, Dge-ʾdun-chos-ʾphel, Khams-sprul Bsod-nams-don-grub / paʼi beʼu miʼi dgeʼi shuʼi moʼi Thuʼu gsoʼi paʼo",,,

0 commit comments

Comments
 (0)