Skip to content

Commit ea292f6

Browse files
committed
add: SeqChromLoaderByDataFrame
1 parent d1911c0 commit ea292f6

2 files changed

Lines changed: 40 additions & 9 deletions

File tree

seqchromloader/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .loader import SeqChromDatasetByBed, SeqChromDatasetByWds, SeqChromDataModule
1+
from .loader import SeqChromDatasetByDataFrame, SeqChromDatasetByBed, SeqChromDatasetByWds, SeqChromDataModule
22
from .writer import dump_data_webdataset

seqchromloader/loader.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,10 @@ def __iter__(self):
9393

9494
# Public name bound to the result of passing the private _SeqChromDatasetByWds class
# through seqChromLoaderCurry (both defined elsewhere in this module).
# NOTE(review): presumably the wrapper returns a loader factory rather than the raw class — confirm.
SeqChromDatasetByWds = seqChromLoaderCurry(_SeqChromDatasetByWds)
9595

96-
class _SeqChromDatasetByBed(Dataset):
96+
class _SeqChromDatasetByDataFrame(Dataset):
9797
"""
98-
:param bed: Bed file describing genomics regions to extract info from, every region has to be of the same length.
99-
:type bed: str
98+
:param dataframe: pandas DataFrame describing the genomic regions to extract info from; every region must have the same length.
99+
:type dataframe: pd.DataFrame
100100
:param genome_fasta: Genome fasta file.
101101
:type genome_fasta: str
102102
:param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
@@ -106,9 +106,15 @@ class _SeqChromDatasetByBed(Dataset):
106106
:param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
107107
:type transforms: dict of functions
108108
"""
109-
def __init__(self, bed, genome_fasta, bigwig_filelist:list, target_bam=None, transforms:dict=None, initialize_first=False):
110-
self.bed = pd.read_table(bed, header=None, names=['chrom', 'start', 'end', 'label', 'score', 'strand' ])
111-
109+
def __init__(self,
110+
dataframe: pd.DataFrame,
111+
genome_fasta: str,
112+
bigwig_filelist:list,
113+
target_bam=None,
114+
transforms:dict=None,
115+
initialize_first=False):
116+
117+
self.dataframe = dataframe
112118
self.genome_fasta = genome_fasta
113119
self.genome_pyfasta = None
114120
self.bigwig_filelist = bigwig_filelist
@@ -121,17 +127,18 @@ def __init__(self, bed, genome_fasta, bigwig_filelist:list, target_bam=None, tra
121127
if initialize_first: self.initialize()
122128

123129
def initialize(self):
130+
# create the stream handlers after the child processes are spawned to enable parallel reading
124131
# this function will be called by worker_init_function in DataLoader
125132
self.genome_pyfasta = pyfasta.Fasta(self.genome_fasta)
126133
self.bigwigs = [pyBigWig.open(bw) for bw in self.bigwig_filelist]
127134
if self.target_bam is not None:
128135
self.target_pysam = pysam.AlignmentFile(self.target_bam)
129136

130137
def __len__(self):
131-
return len(self.bed)
138+
return len(self.dataframe)
132139

133140
def __getitem__(self, idx):
134-
item = self.bed.iloc[idx,]
141+
item = self.dataframe.iloc[idx,]
135142
try:
136143
feature = utils.extract_info(
137144
item.chrom,
@@ -148,6 +155,30 @@ def __getitem__(self, idx):
148155
raise e
149156

150157
return feature['seq'], feature['chrom'], feature['target'], feature['label']
158+
159+
# Public name bound to the result of passing the private _SeqChromDatasetByDataFrame class
# through seqChromLoaderCurry (defined elsewhere in this module).
# NOTE(review): presumably the wrapper returns a loader factory rather than the raw class — confirm.
SeqChromDatasetByDataFrame = seqChromLoaderCurry(_SeqChromDatasetByDataFrame)
160+
161+
class _SeqChromDatasetByBed(_SeqChromDatasetByDataFrame):
    """
    Dataset whose regions are read from a bed file.

    The bed file is parsed into a pandas dataframe with the columns
    ``chrom, start, end, label, score, strand`` and then handed to
    :class:`_SeqChromDatasetByDataFrame`, which implements everything else.

    :param bed: Bed file describing genomic regions to extract info from; every region has to be of the same length.
    :type bed: str
    :param genome_fasta: Genome fasta file.
    :type genome_fasta: str
    :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
    :type bigwig_filelist: list of str or None
    :param target_bam: bam file to get # reads in each region
    :type target_bam: str or None
    :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
    :type transforms: dict of functions
    :param initialize_first: forwarded unchanged to the parent constructor, which calls ``initialize()`` right away when it is true
    :type initialize_first: bool
    """
    def __init__(self, bed: str, genome_fasta: str, bigwig_filelist:list, target_bam=None, transforms:dict=None, initialize_first=False):
        # index_col=False stops pandas from turning the leading columns into an
        # implicit index when the file carries more than the six named columns
        # (e.g. BED12 or narrowPeak); without it the named fields would be
        # shifted and `chrom`/`start`/`end` would hold the wrong data.
        dataframe = pd.read_table(
            bed,
            header=None,
            names=['chrom', 'start', 'end', 'label', 'score', 'strand'],
            index_col=False,
        )
        super().__init__(
            dataframe=dataframe,
            genome_fasta=genome_fasta,
            bigwig_filelist=bigwig_filelist,
            target_bam=target_bam,
            transforms=transforms,
            initialize_first=initialize_first,
        )
151182

152183
# Public name bound to the result of passing the private _SeqChromDatasetByBed class
# through seqChromLoaderCurry (defined elsewhere in this module).
# NOTE(review): presumably the wrapper returns a loader factory rather than the raw class — confirm.
SeqChromDatasetByBed = seqChromLoaderCurry(_SeqChromDatasetByBed)
153184

0 commit comments

Comments
 (0)