Skip to content

Commit ca8fb5f

Browse files
committed
add numpy memmap backend for bigwig
1 parent 7ae6ead commit ca8fb5f

2 files changed

Lines changed: 62 additions & 8 deletions

File tree

seqchromloader/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
def set_bigwig_backend(backend):
44
global BIGWIG_BACKEND
55

6-
assert backend in ['pyBigWig', 'pybigtools'], "Backend options are pyBigWig or pybigtools!"
6+
assert backend in ['pyBigWig', 'pybigtools', 'memmap'], "Backend options are pyBigWig, memmap or pybigtools!"
77

88
BIGWIG_BACKEND = backend

seqchromloader/utils.py

Lines changed: 61 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Utilities for iterating constructing data sets and iterating over
33
DNA sequence data.
44
"""
5+
import os
56
import math
67
import pandas as pd
78
import numpy as np
@@ -20,11 +21,47 @@
2021
logger = logging.getLogger(__name__)
2122

2223
class BigWig():
23-
def __init__(self, bw_path, backend='pyBigWig'):
24+
def __init__(self, bw_path, memmap_dir=None, backend='pyBigWig'):
2425
if backend == 'pyBigWig':
2526
self.bw = pyBigWig.open(bw_path)
26-
else:
27+
self._chroms = self.bw.chroms()
28+
elif backend == 'pybigtools':
2729
self.bw = pybigtools.open(bw_path)
30+
self._chroms = self.bw.chroms()
31+
elif backend == 'memmap':
32+
# load all chromosomes using np.memmap
33+
if memmap_dir is None:
34+
memmap_dir = bw_path + ".memmap"
35+
36+
self.memmap_dir = memmap_dir
37+
os.makedirs(self.memmap_dir, exist_ok=True)
38+
39+
# open bigwig via pybigtools
40+
self.bw = pybigtools.open(bw_path)
41+
42+
self.arrays = {}
43+
self._chroms = {}
44+
45+
# build memmap if not exists
46+
for chrom, length in self.bw.chroms().items():
47+
npy_path = os.path.join(self.memmap_dir, f"{chrom}.npy")
48+
49+
if not os.path.exists(npy_path):
50+
logger.info(f"[memmap] dumping {chrom}...")
51+
52+
# load full chromosome
53+
data = self.bw.values(chrom, 0, length, missing=0.0)
54+
55+
# save
56+
np.save(npy_path, data)
57+
58+
# load as memmap
59+
arr = np.load(npy_path, mmap_mode='r')
60+
self.arrays[chrom] = arr
61+
self._chroms[chrom] = arr.shape[0]
62+
else:
63+
raise Exception(f"Unknown Backend: {backend}")
64+
2865
self.backend = backend
2966

3067
def intervals(self, chrom):
@@ -36,17 +73,34 @@ def intervals(self, chrom):
3673
def stats(self, chrom, type='mean', exact=True):
3774
if self.backend == 'pyBigWig':
3875
return self.bw.stats(chrom, type=type, exact=exact)[0]
39-
else:
76+
elif self.backend == 'pybigtools':
4077
return self.bw.values(chrom, missing=np.nan, bins=1, exact=exact, summary='mean')[0].item()
78+
elif self.backend == 'memmap':
79+
arr = self.arrays[chrom]
80+
if type == 'mean':
81+
return float(np.mean(arr))
82+
elif type == 'max':
83+
return float(np.max(arr))
84+
elif type == 'min':
85+
return float(np.min(arr))
86+
else:
87+
raise NotImplementedError(f"stat {type} not implemented for memmap backend")
4188

4289
def values(self, chrom, start, end, missing=0):
4390
if self.backend == 'pyBigWig':
44-
return np.nan_to_num(self.bw.values(chrom, start, end)).astype(np.float32)
45-
else:
46-
return self.bw.values(chrom, start, end, missing=0.).astype(np.float32)
91+
return np.nan_to_num(self.bw.values(chrom, start, end), nan=missing).astype(np.float32)
92+
elif self.backend == 'pybigtools':
93+
return self.bw.values(chrom, start, end, missing=missing).astype(np.float32)
94+
elif self.backend == 'memmap':
95+
arr = self.arrays[chrom]
96+
97+
start = max(0, start)
98+
end = min(arr.shape[0], end)
99+
100+
return arr[start:end] # no copy
47101

48102
def chroms(self):
49-
return self.bw.chroms()
103+
return self._chroms
50104

51105
def close(self):
52106
self.bw.close()

0 commit comments

Comments
 (0)