Skip to content

Commit d91cc7a

Browse files
committed
add: option to create DALI compatible webdataset
1 parent 14e4662 commit d91cc7a

1 file changed

Lines changed: 23 additions & 8 deletions

File tree

seqchromloader/writer.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
7575
:type compress: boolean
7676
:param numProcessors: number of processors
7777
:type numProcessors: int
78+
:param DALI: Set to True to make the dataset usable with NVIDIA DALI; all arrays are then saved as raw bytes, which loses the array shape info
79+
:type DALI: boolean
7880
"""
7981

8082
# split coordinates and assign chunks to workers
@@ -90,10 +92,16 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
9092
target_bam=target_bam,
9193
compress=compress,
9294
outdir=outdir,
93-
transforms=transforms)
95+
transforms=transforms,
96+
DALI=DALI)
97+
98+
count_of_digits = 0
99+
while num_chunks > 0:
100+
num_chunks = int(num_chunks/10)
101+
count_of_digits += 1
94102

95103
pool = Pool(numProcessors)
96-
res = pool.starmap_async(dump_data_worker_freeze, zip(chunks, [outprefix + "_" + str(i) for i in range(num_chunks)]))
104+
res = pool.starmap_async(dump_data_worker_freeze, zip(chunks, [outprefix + "_" + format(i, f'0{count_of_digits}d') for i in range(num_chunks)]))
97105
files = res.get()
98106

99107
return files
@@ -105,7 +113,8 @@ def dump_data_webdataset_worker(coords,
105113
target_bam=None,
106114
outdir="dataset/",
107115
compress=True,
108-
transforms=None):
116+
transforms=None,
117+
DALI=False):
109118
# get handlers
110119
genome_pyfasta = pyfasta.Fasta(fasta)
111120
bigwigs = [pyBigWig.open(bw) for bw in bigwig_files]
@@ -132,11 +141,17 @@ def dump_data_webdataset_worker(coords,
132141
)
133142
except utils.BigWigInaccessible as e:
134143
continue
135-
136-
feature_dict["seq.npy"] = feature['seq']
137-
feature_dict["chrom.npy"] = feature['chrom']
138-
feature_dict["target.npy"] = feature['target']
139-
feature_dict["label.npy"] = feature['label']
144+
145+
if not DALI:
146+
feature_dict["seq.npy"] = feature['seq']
147+
feature_dict["chrom.npy"] = feature['chrom']
148+
feature_dict["target.npy"] = feature['target']
149+
feature_dict["label.npy"] = feature['label']
150+
else:
151+
feature_dict["seq.npy"] = feature['seq'].tobytes()
152+
feature_dict["chrom.npy"] = feature['chrom'].tobytes()
153+
feature_dict["target.npy"] = feature['target'].tobytes()
154+
feature_dict["label.npy"] = feature['label'].tobytes()
140155

141156
sink.write(feature_dict)
142157

0 commit comments

Comments
 (0)