@@ -93,10 +93,10 @@ def __iter__(self):
9393
9494SeqChromDatasetByWds = seqChromLoaderCurry (_SeqChromDatasetByWds )
9595
96- class _SeqChromDatasetByBed (Dataset ):
96+ class _SeqChromDatasetByDataFrame (Dataset ):
9797 """
98- :param bed: Bed file describing genomics regions to extract info from, every region has to be of the same length.
99- :type bed: str
98+ :param dataframe: pandas DataFrame describing the genomic regions to extract info from; every region has to be of the same length.
99+ :type dataframe: pd.DataFrame
100100 :param genome_fasta: Genome fasta file.
101101 :type genome_fasta: str
102102 :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
@@ -106,9 +106,15 @@ class _SeqChromDatasetByBed(Dataset):
106106 :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
107107 :type transforms: dict of functions
108108 """
109- def __init__ (self , bed , genome_fasta , bigwig_filelist :list , target_bam = None , transforms :dict = None , initialize_first = False ):
110- self .bed = pd .read_table (bed , header = None , names = ['chrom' , 'start' , 'end' , 'label' , 'score' , 'strand' ])
111-
109+ def __init__ (self ,
110+ dataframe : pd .DataFrame ,
111+ genome_fasta : str ,
112+ bigwig_filelist :list ,
113+ target_bam = None ,
114+ transforms :dict = None ,
115+ initialize_first = False ):
116+
117+ self .dataframe = dataframe
112118 self .genome_fasta = genome_fasta
113119 self .genome_pyfasta = None
114120 self .bigwig_filelist = bigwig_filelist
@@ -121,17 +127,18 @@ def __init__(self, bed, genome_fasta, bigwig_filelist:list, target_bam=None, tra
121127 if initialize_first : self .initialize ()
122128
123129 def initialize (self ):
130+ # create the stream handlers only after the child processes are spawned, to enable parallel reading
124131 # this function will be called by worker_init_function in DataLoader
125132 self .genome_pyfasta = pyfasta .Fasta (self .genome_fasta )
126133 self .bigwigs = [pyBigWig .open (bw ) for bw in self .bigwig_filelist ]
127134 if self .target_bam is not None :
128135 self .target_pysam = pysam .AlignmentFile (self .target_bam )
129136
130137 def __len__ (self ):
131- return len (self .bed )
138+ return len (self .dataframe )
132139
133140 def __getitem__ (self , idx ):
134- item = self .bed .iloc [idx ,]
141+ item = self .dataframe .iloc [idx ,]
135142 try :
136143 feature = utils .extract_info (
137144 item .chrom ,
@@ -148,6 +155,30 @@ def __getitem__(self, idx):
148155 raise e
149156
150157 return feature ['seq' ], feature ['chrom' ], feature ['target' ], feature ['label' ]
158+
159+ SeqChromDatasetByDataFrame = seqChromLoaderCurry (_SeqChromDatasetByDataFrame )
160+
class _SeqChromDatasetByBed(_SeqChromDatasetByDataFrame):
    """
    Variant of the dataframe-backed dataset whose regions are loaded from a BED file.

    The file is parsed with ``pd.read_table`` without a header row and its columns are
    named *chrom, start, end, label, score, strand*. The resulting dataframe and every
    other argument are handed unchanged to the parent constructor.

    :param bed: Bed file describing genomics regions to extract info from, every region has to be of the same length.
    :type bed: str
    :param genome_fasta: Genome fasta file.
    :type genome_fasta: str
    :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
    :type bigwig_filelist: list of str or None
    :param target_bam: bam file to get # reads in each region
    :type target_bam: str or None
    :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
    :type transforms: dict of functions
    :param initialize_first: Forwarded to the parent constructor; when true, ``initialize()`` is called right away instead of being left to the DataLoader worker init function.
    :type initialize_first: bool
    """

    def __init__(self, bed: str, genome_fasta: str, bigwig_filelist: list, target_bam=None, transforms: dict = None, initialize_first=False):
        # The BED file carries no header line, so the column names are supplied here.
        column_names = ['chrom', 'start', 'end', 'label', 'score', 'strand']
        regions = pd.read_table(bed, header=None, names=column_names)

        # Everything except the region table is passed through untouched.
        super().__init__(
            dataframe=regions,
            genome_fasta=genome_fasta,
            bigwig_filelist=bigwig_filelist,
            target_bam=target_bam,
            transforms=transforms,
            initialize_first=initialize_first,
        )
151182
152183SeqChromDatasetByBed = seqChromLoaderCurry (_SeqChromDatasetByBed )
153184
0 commit comments