Skip to content

Commit f92ab8d

Browse files
authored
Merge pull request #250 from lcnetdev/tibetan
Update Tibetan; use Unicode word boundaries instead of hardcoded ones.
2 parents 4e53dd8 + 307421e commit f92ab8d

30 files changed

Lines changed: 14014 additions & 205 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -140,3 +140,4 @@ ext/arabic_rom/data
140140
scriptshifter/data/*.db
141141
!.keep
142142
VERSION
143+
.~lock.*

ext/arabic_rom

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,5 +5,6 @@ flask>=2.3,<3
55
flask-cors>=4.0,<5
66
python-dotenv>=1.0,<2
77
pyyaml>=6.0,<7
8+
regex>=2023.8.8
89
uwsgi>=2.0,<2.1
910
yiddish==0.0.21

scriptshifter/hooks/arabic/build_model.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,6 @@ python3 src/loc_transcribe.py predict mle dev --mle_model models/mle/size1.0.tsv
3838

3939
# Seq2Seq
4040
echo "Preparing Seq2seq."
41-
make prep_seq2seq
41+
python3 src/data/make_seq2seq_dataset.py -l ${SS_LANG}
4242
echo "Training models."
43-
python3 src/loc_transcribe.py train seq2seq --train --size {1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625}
43+
python3 src/data/make_seq2seq_dataset.py --size {1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625}

scriptshifter/tables/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,8 +47,6 @@
4747
)
4848
# Package path where hook functions are kept.
4949
HOOK_PKG_PATH = "scriptshifter.hooks"
50-
# Default characters defining a word boundary. This is configurable per-table.
51-
WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"
5250

5351
# Token word boundary marker. Used in maps to distinguish special
5452
# transliterations for initial, final, and standalone tokens.

0 commit comments

Comments
 (0)