22Utilities for iterating constructing data sets and iterating over
33DNA sequence data.
44"""
5+ import os
56import math
67import pandas as pd
78import numpy as np
2021logger = logging .getLogger (__name__ )
2122
2223class BigWig ():
23- def __init__ (self , bw_path , backend = 'pyBigWig' ):
24+ def __init__ (self , bw_path , memmap_dir = None , backend = 'pyBigWig' ):
2425 if backend == 'pyBigWig' :
2526 self .bw = pyBigWig .open (bw_path )
26- else :
27+ self ._chroms = self .bw .chroms ()
28+ elif backend == 'pybigtools' :
2729 self .bw = pybigtools .open (bw_path )
30+ self ._chroms = self .bw .chroms ()
31+ elif backend == 'memmap' :
32+ # load all chromosomes using np.memmap
33+ if memmap_dir is None :
34+ memmap_dir = bw_path + ".memmap"
35+
36+ self .memmap_dir = memmap_dir
37+ os .makedirs (self .memmap_dir , exist_ok = True )
38+
39+ # open bigwig via pybigtools
40+ self .bw = pybigtools .open (bw_path )
41+
42+ self .arrays = {}
43+ self ._chroms = {}
44+
45+ # build memmap if not exists
46+ for chrom , length in self .bw .chroms ().items ():
47+ npy_path = os .path .join (self .memmap_dir , f"{ chrom } .npy" )
48+
49+ if not os .path .exists (npy_path ):
50+ logger .info (f"[memmap] dumping { chrom } ..." )
51+
52+ # load full chromosome
53+ data = self .bw .values (chrom , 0 , length , missing = 0.0 )
54+
55+ # save
56+ np .save (npy_path , data )
57+
58+ # load as memmap
59+ arr = np .load (npy_path , mmap_mode = 'r' )
60+ self .arrays [chrom ] = arr
61+ self ._chroms [chrom ] = arr .shape [0 ]
62+ else :
63+ raise Exception (f"Unknown Backend: { backend } " )
64+
2865 self .backend = backend
2966
3067 def intervals (self , chrom ):
@@ -36,17 +73,34 @@ def intervals(self, chrom):
3673 def stats (self , chrom , type = 'mean' , exact = True ):
3774 if self .backend == 'pyBigWig' :
3875 return self .bw .stats (chrom , type = type , exact = exact )[0 ]
39- else :
76+ elif self . backend == 'pybigtools' :
4077 return self .bw .values (chrom , missing = np .nan , bins = 1 , exact = exact , summary = 'mean' )[0 ].item ()
78+ elif self .backend == 'memmap' :
79+ arr = self .arrays [chrom ]
80+ if type == 'mean' :
81+ return float (np .mean (arr ))
82+ elif type == 'max' :
83+ return float (np .max (arr ))
84+ elif type == 'min' :
85+ return float (np .min (arr ))
86+ else :
87+ raise NotImplementedError (f"stat { type } not implemented for memmap backend" )
4188
4289 def values (self , chrom , start , end , missing = 0 ):
4390 if self .backend == 'pyBigWig' :
44- return np .nan_to_num (self .bw .values (chrom , start , end )).astype (np .float32 )
45- else :
46- return self .bw .values (chrom , start , end , missing = 0. ).astype (np .float32 )
91+ return np .nan_to_num (self .bw .values (chrom , start , end ), nan = missing ).astype (np .float32 )
92+ elif self .backend == 'pybigtools' :
93+ return self .bw .values (chrom , start , end , missing = missing ).astype (np .float32 )
94+ elif self .backend == 'memmap' :
95+ arr = self .arrays [chrom ]
96+
97+ start = max (0 , start )
98+ end = min (arr .shape [0 ], end )
99+
100+ return arr [start :end ] # no copy
47101
48102 def chroms (self ):
49- return self .bw . chroms ()
103+ return self ._chroms
50104
51105 def close (self ):
52106 self .bw .close ()
0 commit comments