Skip to content

Commit 0275c41

Browse files
committed
add pybigtools backend
1 parent 5cfff78 commit 0275c41

4 files changed

Lines changed: 51 additions & 13 deletions

File tree

seqchromloader/loader.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import random
1111
import pysam
1212
import pyfaidx
13-
import pyBigWig
13+
import pybigtools
1414
import numpy as np
1515
import pandas as pd
1616
import webdataset as wds
@@ -19,7 +19,7 @@
1919
from torch.utils.data import Dataset, IterableDataset, DataLoader
2020
from pybedtools import BedTool
2121

22-
from seqchromloader import utils
22+
from seqchromloader import utils, config
2323

2424
logger = logging.getLogger(__name__)
2525

@@ -139,7 +139,7 @@ def initialize(self):
139139
# create the stream handler after child processes spawned to enable parallel reading
140140
# this function will be called by worker_init_function in DataLoader
141141
self.genome_pyfaidx = pyfaidx.Fasta(self.genome_fasta)
142-
self.bigwigs = [pyBigWig.open(bw) for bw in self.bigwig_filelist] if self.bigwig_filelist is not None else None
142+
self.bigwigs = [utils.BigWig(bw_path) for bw_path in self.bigwig_filelist] if self.bigwig_filelist is not None else None
143143
if self.target_bam is not None:
144144
if isinstance(self.target_bam, list):
145145
self.target_pysam = [pysam.AlignmentFile(b) for b in self.target_bam]

seqchromloader/utils.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,49 @@
88
import logging
99
import pysam
1010
import pyBigWig
11+
import pybigtools
1112
from Bio import motifs
1213
from pyfaidx import Fasta
1314
from multiprocessing import Pool
1415
from pybedtools import Interval, BedTool
1516
from pybedtools.helpers import chromsizes
1617

18+
from seqchromloader import config
19+
1720
logger = logging.getLogger(__name__)
1821

22+
class BigWig:
    """Uniform reader interface over the pyBigWig and pybigtools backends.

    The wrapped handle is stored in ``self.bw`` and the chosen backend name in
    ``self.backend``. Any ``backend`` value other than ``'pyBigWig'`` selects
    pybigtools.

    NOTE(review): this module imports ``config``, but the default backend is
    hard-coded in the signature below, so callers that do not pass ``backend``
    always get pyBigWig. Confirm whether the default should be read from
    ``config`` instead.
    """

    def __init__(self, bw_path, backend='pyBigWig'):
        """Open ``bw_path`` with the requested backend.

        :param bw_path: path handed to the backend's ``open`` function
        :param backend: ``'pyBigWig'`` to use pyBigWig, anything else to use
            pybigtools
        """
        if backend == 'pyBigWig':
            self.bw = pyBigWig.open(bw_path)
        else:
            self.bw = pybigtools.open(bw_path)
        self.backend = backend

    def __enter__(self):
        """Allow ``with BigWig(path) as bw:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying handle; exceptions are not suppressed."""
        self.close()
        return False

    def intervals(self, chrom):
        """Return the backend's interval records for ``chrom``.

        Delegates to ``intervals`` (pyBigWig) or ``records`` (pybigtools).
        NOTE(review): the two backends may return different container types;
        verify against callers that index or take ``len`` of the result.
        """
        if self.backend == 'pyBigWig':
            return self.bw.intervals(chrom)
        return self.bw.records(chrom)

    def stats(self, chrom, type='mean'):
        """Return a single whole-chromosome summary statistic for ``chrom``.

        :param chrom: chromosome name
        :param type: name of the summary statistic, forwarded to the backend
            (the parameter name shadows the builtin but is kept for
            backward compatibility)
        """
        if self.backend == 'pyBigWig':
            return self.bw.stats(chrom, type=type, exact=True)[0]
        # Forward the requested statistic instead of always asking for the mean.
        summary = self.bw.values(
            chrom, missing=np.nan, bins=1, exact=True, summary=type
        )
        return summary[0].item()

    def values(self, chrom, start, end, missing=0):
        """Return per-base values in ``[start, end)`` as a float32 array.

        :param chrom: chromosome name
        :param start: interval start passed to the backend
        :param end: interval end passed to the backend
        :param missing: value substituted where the file has no data
        :rtype: numpy.ndarray of dtype float32
        """
        if self.backend == 'pyBigWig':
            raw = self.bw.values(chrom, start, end)
            # pyBigWig reports uncovered bases as NaN; replace them with the
            # caller's fill value (0 by default, matching the old behaviour).
            return np.nan_to_num(raw, nan=missing).astype(np.float32)
        filled = self.bw.values(chrom, start, end, missing=float(missing))
        return filled.astype(np.float32)

    def chroms(self):
        """Return the backend's chromosome listing.

        Delegates to the wrapped handle; calling ``self.chroms()`` here would
        recurse without bound.
        """
        return self.bw.chroms()

    def close(self):
        """Close the underlying backend handle."""
        self.bw.close()
53+
1954
def get_genome_sizes(gs=None, genome=None, to_filter=None, to_keep=None):
2055
"""
2156
Loads the genome sizes file, filter or keep chromosomes
@@ -372,7 +407,7 @@ def compute_mean_std_bigwig(bigwig):
372407
:type bigwig: str
373408
:rtype: (mean, stddev)
374409
"""
375-
bw = pyBigWig.open(bigwig)
410+
bw = BigWig(bigwig)
376411

377412
# get chrom length list
378413
chroms = bw.chroms()
@@ -485,7 +520,7 @@ def extract_bw(chrom, start, end, strand, bigwigs):
485520
chroms_array = []
486521
try:
487522
for idx, bigwig in enumerate(bigwigs):
488-
c = (np.nan_to_num(bigwig.values(chrom, start, end))).astype(np.float32)
523+
c = bigwig.values(chrom, start, end)
489524
if strand=="-":
490525
c = c[::-1]
491526
chroms_array.append(c)
@@ -508,9 +543,9 @@ def extract_dnaOneHot(chrom, start, end, strand, genome_pyfaidx):
508543
def extract_single_target(chrom, start, end, strand, target):
509544
if isinstance(target, pysam.AlignmentFile):
510545
target_array = np.array(target.count(chrom, start, end), dtype=np.float32)[np.newaxis]
511-
elif isinstance(target, pyBigWig.pyBigWig):
546+
elif isinstance(target, BigWig):
512547
try:
513-
target_array = np.nan_to_num(target.values(chrom, start, end)).astype(np.float32)
548+
target_array = target.values(chrom, start, end)
514549
if strand=="-":
515550
target_array = target_array[::-1]
516551
except RuntimeError as e:

seqchromloader/writer.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,10 @@
1414

1515
import pyfaidx
1616
import pysam
17-
import pyBigWig
1817
import webdataset as wds
1918

20-
from . import utils
21-
from .loader import _SeqChromDatasetByWds
19+
from seqchromloader import utils
20+
from seqchromloader.loader import _SeqChromDatasetByWds
2221

2322
logger = logging.getLogger(__name__)
2423

@@ -148,17 +147,17 @@ def dump_data_webdataset_worker(coords,
148147
):
149148
#get handlers
150149
genome_pyfaidx = pyfaidx.Fasta(fasta)
151-
bigwigs = [pyBigWig.open(bw) for bw in bigwig_files] if bigwig_files is not None else None
150+
bigwigs = [utils.BigWig(bw) for bw in bigwig_files] if bigwig_files is not None else None
152151
if target_bam is not None:
153152
if isinstance(target_bam, list):
154153
target = [pysam.AlignmentFile(b) for b in target_bam]
155154
else:
156155
target = pysam.AlignmentFile(target_bam)
157156
elif target_bw is not None:
158157
if isinstance(target_bw, list):
159-
target = [pyBigWig.open(b) for b in target_bw]
158+
target = [utils.BigWig(b) for b in target_bw]
160159
else:
161-
target = pyBigWig.open(target_bw)
160+
target = utils.BigWig(target_bw)
162161
else:
163162
target = None
164163

tests/test_writer_loader.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
from pyfaidx import Fasta
1919
from pyjaspar import jaspardb
2020

21+
import seqchromloader
22+
23+
seqchromloader.config.set_bigwig_backend('pybigtools')
24+
2125
class Test(unittest.TestCase):
2226
def setUp(self) -> None:
2327
pass

0 commit comments

Comments
 (0)