@@ -75,6 +75,8 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
7575 :type compress: boolean
7676 :param numProcessors: number of processors
7777 :type numProcessors: int
78+ :param DALI: set to True to write a dataset intended for NVIDIA DALI; all arrays are then saved as raw bytes, which discards the array shape information
79+ :type DALI: boolean
7880 """
7981
8082 # split coordinates and assign chunks to workers
@@ -90,10 +92,16 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
9092 target_bam = target_bam ,
9193 compress = compress ,
9294 outdir = outdir ,
93- transforms = transforms )
95+ transforms = transforms ,
96+ DALI = DALI )
97+
98+ count_of_digits = 0
99+ while num_chunks > 0 :
100+ num_chunks = int (num_chunks / 10 )
101+ count_of_digits += 1
94102
95103 pool = Pool (numProcessors )
96- res = pool .starmap_async (dump_data_worker_freeze , zip (chunks , [outprefix + "_" + str ( i ) for i in range (num_chunks )]))
104+ res = pool .starmap_async (dump_data_worker_freeze , zip (chunks , [outprefix + "_" + format ( i , f'0 { count_of_digits } d' ) for i in range (num_chunks )]))
97105 files = res .get ()
98106
99107 return files
@@ -105,7 +113,8 @@ def dump_data_webdataset_worker(coords,
105113 target_bam = None ,
106114 outdir = "dataset/" ,
107115 compress = True ,
108- transforms = None ):
116+ transforms = None ,
117+ DALI = False ):
109118 # get handlers
110119 genome_pyfasta = pyfasta .Fasta (fasta )
111120 bigwigs = [pyBigWig .open (bw ) for bw in bigwig_files ]
@@ -132,11 +141,17 @@ def dump_data_webdataset_worker(coords,
132141 )
133142 except utils .BigWigInaccessible as e :
134143 continue
135-
136- feature_dict ["seq.npy" ] = feature ['seq' ]
137- feature_dict ["chrom.npy" ] = feature ['chrom' ]
138- feature_dict ["target.npy" ] = feature ['target' ]
139- feature_dict ["label.npy" ] = feature ['label' ]
144+
145+ if not DALI :
146+ feature_dict ["seq.npy" ] = feature ['seq' ]
147+ feature_dict ["chrom.npy" ] = feature ['chrom' ]
148+ feature_dict ["target.npy" ] = feature ['target' ]
149+ feature_dict ["label.npy" ] = feature ['label' ]
150+ else :
151+ feature_dict ["seq.npy" ] = feature ['seq' ].tobytes ()
152+ feature_dict ["chrom.npy" ] = feature ['chrom' ].tobytes ()
153+ feature_dict ["target.npy" ] = feature ['target' ].tobytes ()
154+ feature_dict ["label.npy" ] = feature ['label' ].tobytes ()
140155
141156 sink .write (feature_dict )
142157
0 commit comments