Skip to content

Commit 40aea4f

Browse files
committed
add braceexpand option to simplify wds file list
1 parent 09fb8da commit 40aea4f

1 file changed

Lines changed: 10 additions & 2 deletions

File tree

seqchromloader/writer.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
5353
compress=True,
5454
numProcessors=1,
5555
transforms=None,
56+
braceexpand=True,
5657
DALI=False):
5758
"""
5859
Given coordinates dataframe, extract the sequence and chromatin signal, save in webdataset format
@@ -75,6 +76,8 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
7576
:type compress: boolean
7677
:param numProcessors: number of processors
7778
:type numProcessors: int
79+
:param braceexpand: if True, return the wds file list as a single brace-expansion pattern string instead of a list of file names
80+
:type braceexpand: boolean
7881
:param DALI: set to True if you want to use the dataset with NVIDIA DALI; all arrays are then saved as bytes, which loses the array shape info
7982
:type DALI: boolean
8083
"""
@@ -103,8 +106,13 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
103106
pool = Pool(numProcessors)
104107
res = pool.starmap_async(dump_data_worker_freeze, zip(chunks, [outprefix + "_" + format(i, f'0{count_of_digits}d') for i in range(num_chunks)]))
105108
files = res.get()
106-
107-
return files
109+
110+
if braceexpand:
111+
begin = f'0{count_of_digits}d'.format(0)
112+
end = f'0{count_of_digits}d'.format(range(num_chunks)[-1])
113+
return f"outprefix_{{{begin}...{end}}}.tar.gz" if compress else f"outprefix_{{{begin}...{end}}}.tar"
114+
else:
115+
return files
108116

109117
def dump_data_webdataset_worker(coords,
110118
outprefix,

0 commit comments

Comments
 (0)