Skip to content

Commit 2b17fbc

Browse files
committed
update: Rewrite update script
The new update script divides work into tasks scheduled across a constant number of processes, instead of statically assigning a single long-running task to each thread. This results in better CPU saturation. Database handles are no longer shared between threads; instead, the main thread commits the results of the other processes into the database. This trades locking on database access for serialization costs: since multiprocessing is used, values returned from futures are pickled.
1 parent eb36850 commit 2b17fbc

3 files changed

Lines changed: 370 additions & 13 deletions

File tree

elixir/data.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,16 @@
2121
import berkeleydb
2222
import re
2323
from . import lib
24+
from .lib import autoBytes
2425
import os
2526
import os.path
2627
import errno
2728

29+
# Cache size used by the update script for the largest databases. Tuple of (gigabytes, bytes).
# https://docs.oracle.com/database/bdb181/html/api_reference/C/dbset_cachesize.html
# https://docs.oracle.com/database/bdb181/html/programmer_reference/general_am_conf.html#am_conf_cachesize
CACHESIZE = (2, 0)

# Bytes pattern for one packed entry: digits, one word character, digits, one
# word character, then an optional trailing comma.
# Raw literals keep the backslash escapes for the regex engine and avoid the
# "invalid escape sequence" warning that non-raw literals trigger.
deflist_regex = re.compile(rb'(\d*)(\w)(\d*)(\w),?')
# Text pattern: a digit, a literal 'M', one or more digits, and a captured
# word character.
deflist_macro_regex = re.compile(r'\dM\d+(\w)')
3036

@@ -72,6 +78,14 @@ def iter(self, dummy=False):
7278
if dummy:
7379
yield maxId, None, None, None
7480

81+
def exists(self, idx, line_num):
    """Return True if the packed data holds an entry with this id and line.

    ``self.data`` is scanned with ``deflist_regex``; the first and third
    groups of every match are taken as the entry id and its line number.

    ``idx`` may be an int or an ASCII-digit ``bytes``/``str``; it is
    compared numerically. ``line_num`` is compared against the integer
    value of the stored line.
    """
    try:
        wanted_id = int(idx)
    except (TypeError, ValueError):
        # An id that is not a number cannot match any stored entry.
        return False

    for raw_id, _, raw_line, _ in deflist_regex.findall(self.data):
        # The regex yields bytes groups, so comparing them directly with an
        # integer id would never be equal; convert before comparing.
        # Both numeric groups are `\d*` and may be empty: skip such matches
        # instead of letting int() raise on an empty value.
        if not raw_id or not raw_line:
            continue
        if int(raw_id) == wanted_id and int(raw_line) == line_num:
            return True

    return False
88+
7589
def append(self, id, type, line, family):
7690
if type not in defTypeD:
7791
return
@@ -145,11 +159,14 @@ def pack(self):
145159
return self.data
146160

147161
class BsdDB:
148-
def __init__(self, filename, readonly, contentType, shared=False):
162+
def __init__(self, filename, readonly, contentType, shared=False, cachesize=None):
149163
self.filename = filename
150164
self.db = berkeleydb.db.DB()
151165
flags = berkeleydb.db.DB_THREAD if shared else 0
152166

167+
if cachesize is not None:
168+
self.db.set_cachesize(cachesize[0], cachesize[1])
169+
153170
if readonly:
154171
flags |= berkeleydb.db.DB_RDONLY
155172
self.db.open(filename, flags=flags)
@@ -159,40 +176,50 @@ def __init__(self, filename, readonly, contentType, shared=False):
159176
self.ctype = contentType
160177

161178
def exists(self, key):
    """Ask the underlying database whether ``key`` is present.

    The key is passed through ``autoBytes`` before the lookup and the
    result of ``self.db.exists`` is returned unchanged.
    """
    return self.db.exists(autoBytes(key))
164181

165182
def get(self, key):
    """Fetch the value stored under ``key``, wrapped by ``self.ctype``.

    The key is passed through ``autoBytes`` first. Returns None when the
    underlying database has no value for the key; otherwise returns
    ``self.ctype(raw_value)``.
    """
    raw = self.db.get(autoBytes(key))
    return None if raw is None else self.ctype(raw)
169189

170190
def get_keys(self):
    """Return whatever ``self.db.keys()`` reports for the wrapped database."""
    all_keys = self.db.keys()
    return all_keys
172192

173193
def put(self, key, val, sync=False):
    """Store ``val`` under ``key`` in the wrapped database.

    Both arguments are passed through ``autoBytes``. If the value is still
    not exactly ``bytes`` afterwards, its ``pack()`` method supplies the
    bytes to store. When ``sync`` is true the database is synced after the
    write.
    """
    raw_key = autoBytes(key)
    payload = autoBytes(val)
    # Exact type check (not isinstance): anything other than plain bytes is
    # expected to provide pack().
    if type(payload) is not bytes:
        payload = payload.pack()

    self.db.put(raw_key, payload)

    if not sync:
        return
    self.db.sync()
181201

202+
def sync(self):
    """Call ``sync()`` on the wrapped database handle."""
    handle = self.db
    handle.sync()
204+
182205
def close(self):
183206
self.db.close()
184207

185208
def __len__(self):
186209
return self.db.stat()["nkeys"]
187210

188211
class DB:
189-
def __init__(self, dir, readonly=True, dtscomp=False, shared=False):
212+
def __init__(self, dir, readonly=True, dtscomp=False, shared=False, update_cache=False):
190213
if os.path.isdir(dir):
191214
self.dir = dir
192215
else:
193216
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dir)
194217

195218
ro = readonly
219+
cachesize = None
220+
221+
if update_cache:
222+
cachesize = CACHESIZE
196223

197224
self.vars = BsdDB(dir + '/variables.db', ro, lambda x: int(x.decode()), shared=shared)
198225
# Key-value store of basic information
@@ -203,20 +230,20 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False):
203230
self.file = BsdDB(dir + '/filenames.db', ro, lambda x: x.decode(), shared=shared)
204231
# Map serial number to filename
205232
self.vers = BsdDB(dir + '/versions.db', ro, PathList, shared=shared)
206-
self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared)
233+
self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared, cachesize=cachesize)
207234
self.defs_cache = {}
208235
NOOP = lambda x: x
209236
self.defs_cache['C'] = BsdDB(dir + '/definitions-cache-C.db', ro, NOOP, shared=shared)
210237
self.defs_cache['K'] = BsdDB(dir + '/definitions-cache-K.db', ro, NOOP, shared=shared)
211238
self.defs_cache['D'] = BsdDB(dir + '/definitions-cache-D.db', ro, NOOP, shared=shared)
212239
self.defs_cache['M'] = BsdDB(dir + '/definitions-cache-M.db', ro, NOOP, shared=shared)
213240
assert sorted(self.defs_cache.keys()) == sorted(lib.CACHED_DEFINITIONS_FAMILIES)
214-
self.refs = BsdDB(dir + '/references.db', ro, RefList, shared=shared)
215-
self.docs = BsdDB(dir + '/doccomments.db', ro, RefList, shared=shared)
241+
self.refs = BsdDB(dir + '/references.db', ro, RefList, shared=shared, cachesize=cachesize)
242+
self.docs = BsdDB(dir + '/doccomments.db', ro, RefList, shared=shared, cachesize=cachesize)
216243
self.dtscomp = dtscomp
217244
if dtscomp:
218-
self.comps = BsdDB(dir + '/compatibledts.db', ro, RefList, shared=shared)
219-
self.comps_docs = BsdDB(dir + '/compatibledts_docs.db', ro, RefList, shared=shared)
245+
self.comps = BsdDB(dir + '/compatibledts.db', ro, RefList, shared=shared, cachesize=cachesize)
246+
self.comps_docs = BsdDB(dir + '/compatibledts_docs.db', ro, RefList, shared=shared, cachesize=cachesize)
220247
# Use a RefList in case there are multiple doc comments for an identifier
221248

222249
def close(self):

elixir/lib.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import sys
2222
import logging
2323
import subprocess, os
24+
from typing import List
2425

2526
logger = logging.getLogger(__name__)
2627

@@ -46,7 +47,7 @@ def run_cmd(*args, env=None):
4647
# Invoke ./script.sh with the given arguments
4748
# Returns the list of output lines
4849

49-
def scriptLines(*args, env=None):
50+
def scriptLines(*args, env=None) -> List[bytes]:
5051
p = script(*args, env=env)
5152
p = p.split(b'\n')
5253
del p[-1]

0 commit comments

Comments
 (0)