Skip to content

Commit 90127b7

Browse files
authored
Merge pull request #245 from lcnetdev/dec_config
Decouple configs
2 parents d9a93a4 + b847826 commit 90127b7

4 files changed

Lines changed: 171 additions & 67 deletions

File tree

doc/config.md

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@ Configuration files, also called transliteration tables, are contained in the
1313
The configuration file names are key to most operations in the software. They
1414
are all-lowercase and use underscores to separate words, e.g.
1515
`church_slavonic`. They have the `.yml` extension and are written in the
16-
[YAML](https://yaml.org/) configuration language. Hence, a transliteration
17-
request to the `/trans` REST API endpoint providing `church_slavonic` as the
18-
transliteration language, uses the `church_slavonic.yml` configuration file.
16+
[YAML](https://yaml.org/) configuration language.
1917

2018
Other files are present in the `data` directory that are not exposed to the end
2119
user via Web UI or REST API. These files may be incomplete transliteration
@@ -35,10 +33,49 @@ transliteration table key names as described previously, and the values are
3533
key-value pairs which can have arbitrary contents. These contents are displayed
3634
to the user in the `/languages` API endpoint.
3735

38-
The only mandatory key for each key-value pair is `name`, which is the
39-
human-readable label that is displayed in the Web UI. Other keys, such as
40-
`description`, may be used to inform the user about the scope of a particular
41-
table.
36+
Each entry of the index file is made up of the following elements:
37+
38+
### `<entry_name>`
39+
40+
The key for the language/script. This is referred to in multiple places across
41+
the application, e.g. the `/trans/mongolian_cyrillic` API method transliterates
42+
a sentence using the `mongolian_cyrillic` index entry. By default, the
43+
`mongolian_cyrillic` entry in the index file (see below) uses the
44+
`mongolian_cyrillic.yml` configuration file in the `data/` folder. This can be
45+
overridden by the `conf` key (see below).
46+
47+
By convention, an entry name uses the name of the language, followed by the
48+
name of the script, separated by an underscore, *only if that language is known
49+
to exist in multiple scripts*. For example, `mongolian_cyrillic` is used for
50+
Mongolian written in Cyrillic, and `mongolian_mongol_bichig` for the native
51+
Mongol Bichig script; while `persian` is only found in Arabic script, so
52+
`arabic` is not appended to its entry name.
53+
54+
### `<entry_name>.name`
55+
56+
Human-readable label that is displayed in the Web UI. Mandatory.
57+
58+
### `<entry_name>.conf`
59+
60+
Override the default configuration file lookup. By default,
61+
the configuration file is inferred from the key name, e.g. `chinese` looks up
62+
`data/chinese.yml`. However, some entries in the index may not have a distinct
63+
configuration and reuse an existing configuration that works for that language.
64+
Several languages in the Cyrillic script use this method.
65+
66+
The value is the full file name relative to the `data/` directory, e.g.
67+
`cyrillic_generic.yml`.
68+
69+
### `<entry_name>.marc_code`
70+
71+
MARC code from the [MARC Standards Office registry
72+
](https://www.loc.gov/marc/languages/language_name.html). This may be used by
73+
external applications to more easily look up entries. Optional.
74+
75+
### `<entry_name>.description`
76+
77+
Additional description that may be used to inform the user about the scope of
78+
a particular table. Optional.
4279

4380
## Inheritance
4481

scriptshifter/tables/__init__.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@
6666

6767
logger = logging.getLogger(__name__)
6868

69+
tbl_index = None # Module-level index of all scripts.
70+
6971

7072
class Token(str):
7173
"""
@@ -165,11 +167,12 @@ def init_db():
165167
conn.executescript(fh.read())
166168

167169
# Populate tables.
170+
global tbl_index
168171
with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
169-
tlist = load(fh, Loader=Loader)
172+
tbl_index = load(fh, Loader=Loader)
170173
try:
171174
with conn:
172-
for tname, tdata in tlist.items():
175+
for tname, tdata in tbl_index.items():
173176
populate_table(conn, tname, tdata)
174177

175178
# If the DB already exists, it will be overwritten ONLY on success at
@@ -340,9 +343,14 @@ def load_table(tname):
340343
the language & script metadata and parsing rules.
341344
"""
342345

343-
fname = path.join(TABLE_DIR, tname + ".yml")
346+
try:
347+
fname = path.join(TABLE_DIR, tbl_index[tname]["conf"])
348+
except KeyError:
349+
# If no `conf` key is provided, use the conventional table name + .yml.
350+
fname = path.join(TABLE_DIR, tname + ".yml")
344351
if not access(fname, R_OK):
345-
raise ValueError(f"No transliteration table for {tname}!")
352+
raise ValueError(
353+
f"No transliteration table `{fname}` found for {tname}!")
346354

347355
with open(fname) as fh:
348356
tdata = load(fh, Loader=Loader)
@@ -400,7 +408,7 @@ def load_table(tname):
400408

401409
# Inherit normalization rules.
402410
for parent in parents:
403-
parent_langsec = load_table(parent)["script_to_roman"]
411+
parent_langsec = load_table(parent).get("script_to_roman", {})
404412
normalize |= parent_langsec.get("normalize", {})
405413

406414
for k, v in tdata["script_to_roman"].get("normalize", {}).items():

scriptshifter/tables/data/_ignore_base.yml

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,17 @@ roman_to_script:
2222
# dedicated U+2160÷U+216F (uppercase Roman
2323
# numerals) and/or U+2170÷U+217F (lower case Roman
2424
# numerals) ranges to avoid this ambiguity.
25-
- "\\bI{2,3}\\b"
26-
- "\\bI(V|X)\\b"
27-
- "\\bLI{,3}\\b"
28-
- "\\bLI?(V|X)\\b"
29-
- "\\bL(V|X{1,3})I{,3}\\b"
30-
- "\\bLX{1,3}I?V\\b"
31-
- "\\bLX{1,3}VI{,3}\\b"
32-
- "\\b(V|X{1,3})I{,3}\\b"
33-
- "\\bX{1,3}I{,3}\\b"
34-
- "\\bX{1,3}I(V|X)\\b"
35-
- "\\bX{1,3}VI{,3}\\b"
25+
- "I{2,3}\\b"
26+
- "I(V|X)\\b"
27+
- "LI{,3}\\b"
28+
- "LI?(V|X)\\b"
29+
- "L(V|X{1,3})I{,3}\\b"
30+
- "LX{1,3}I?V\\b"
31+
- "LX{1,3}VI{,3}\\b"
32+
- "(V|X{1,3})I{,3}\\b"
33+
- "X{1,3}I{,3}\\b"
34+
- "X{1,3}I(V|X)\\b"
35+
- "X{1,3}VI{,3}\\b"
3636

3737
# MARC sub-field markers.
38-
- "\\b[\u2021\u01C2\\$][0-9a-z]\\b"
39-
40-
script_to_roman:
41-
ignore_ptn:
42-
# MARC sub-field markers.
43-
- "\\b[\u2021\u01C2\\$][0-9a-z]\\b"
38+
- "[\u2021\u01C2\\$][0-9a-z]\\b"

0 commit comments

Comments
 (0)