-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmap-forms.py
More file actions
executable file
·102 lines (69 loc) · 2.35 KB
/
map-forms.py
File metadata and controls
executable file
·102 lines (69 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
# Creates a map.json.gz file, plus one map/<group>.json.gz file per word
# group, which map the inflected forms of words to their root form.
import os
import json
import glob
import gzip
# Characters allowed in a mapped word: digits, lower-case ASCII letters, a set
# of accented letters and the space. add_word() lower-cases each word and
# drops every character that is not listed here.
chars = '0123456789aáâåäbcdeéfghijklmnoóöõpqrsšștuüvwxyzž '
# Maps each normalised word to a list of [root, form] pairs. Filled by
# add_word() and written out at the end of the script.
word_map = {}
def get_group_name(word):
    """Return the name of the group a word is filed under.

    Words shorter than four characters are grouped by their first character
    alone. Longer words are grouped by their first four characters, with any
    space in that prefix replaced by an underscore.
    """
    if len(word) >= 4:
        prefix = word[:4]
        return prefix.replace(' ', '_')
    return word[0]
def save(path, entries):
    """Write entries to path as gzip-compressed, UTF-8 encoded JSON.

    Non-ASCII characters are stored literally instead of being escaped.
    """
    with gzip.open(path, mode='wt', encoding='utf-8') as out:
        out.write(json.dumps(entries, ensure_ascii=False))
def add_word(word, root, form):
    """Record in word_map that word is the given form of root.

    The word is lower-cased and every character not listed in chars is
    removed before it is used as a key. Nothing is stored when the cleaned
    word is empty or has fewer whitespace-separated parts than root. Each
    [root, form] pair is stored at most once per word.
    """
    kept = [ch for ch in word.lower() if ch in chars]
    word = ''.join(kept)
    if not word:
        return

    # Don't map incomplete multi-word inflections
    if len(word.split()) < len(root.split()):
        return

    pair = [root, form]
    mappings = word_map.setdefault(word, [])
    if pair not in mappings:
        mappings.append(pair)
def map_root(d, root):
    """Map the root word to itself.

    d is one definition entry. The form name recorded is 'inf1' when the
    entry's 'pos' value is 'verb', and 'nom-sg' otherwise.
    """
    if d.get('pos') == 'verb':
        form = 'inf1'
    else:
        form = 'nom-sg'
    add_word(root, root, form)
def _map_inflections(inflections, root):
    """Map every word of one inflection table to root.

    inflections maps a form name to either a single word or a list of
    words. Empty values are skipped.
    """
    # Map inflected forms
    for form, words in inflections.items():
        if not words:
            continue
        variants = [words] if isinstance(words, str) else words
        for variant in variants:
            add_word(variant, root, form)

    # Add plural form of past participle: the text that follows 'olemme ' in
    # the first 'perf-1pl' entry starting with it is mapped as 'past-part-pl'.
    prefix = 'olemme '
    candidates = inflections.get('perf-1pl')
    if not candidates:
        return
    if isinstance(candidates, str):
        candidates = [candidates]
    for candidate in candidates:
        if candidate and candidate.startswith(prefix):
            add_word(candidate[len(prefix):], root, 'past-part-pl')
            break
def map_inflections(d, root):
    """Map all inflections listed in definition entry d to root.

    The entry's 'inflections' value may be a single inflection table or a
    list/tuple of them. A list item that has a 'forms' key contributes the
    table stored under that key; any other item is used as the table itself.
    Entries without inflections are ignored.
    """
    inflections = d.get('inflections')
    if not inflections:
        return

    if not isinstance(inflections, (tuple, list)):
        _map_inflections(inflections, root)
        return

    for item in inflections:
        table = item['forms'] if 'forms' in item else item
        _map_inflections(table, root)
# Map inflected forms. The paths are sorted so that the order of the
# [root, form] pairs stored for a word does not depend on the order in which
# the file system happens to list the directory.
for path in sorted(glob.glob('dict/*.json')):
    # Read explicitly as UTF-8 rather than the platform's default encoding:
    # the words contain non-ASCII letters and the output is written as UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The root word is the file name without the 5-character 'dict/' prefix
    # and the 5-character '.json' suffix; underscores stand for spaces.
    root = path[5:-5].replace('_', ' ')
    for d in data['defs']:
        map_inflections(d, root)
        map_root(d, root)
# Save map
# Create the output directory. exist_ok=True replaces the separate
# "exists?" check, so there is no window between checking and creating.
os.makedirs('map', exist_ok=True)

# Collect the words of each group. Within a group the words keep the order in
# which they were added to word_map.
grouped = {}
for word, mappings in word_map.items():
    grouped.setdefault(get_group_name(word), {})[word] = mappings

# Write one file per group, in sorted group order.
for group in sorted(grouped):
    save('map/%s.json.gz' % group, grouped[group])

# Write the complete map as a single file as well.
save('map.json.gz', word_map)