|
17 | 17 | import pyBigWig |
18 | 18 | import webdataset as wds |
19 | 19 |
|
20 | | -from seqchromloader import utils |
| 20 | +from . import utils |
| 21 | +from .loader import _SeqChromDatasetByWds |
21 | 22 |
|
def convert_data_webdataset(wds_in, wds_out, transforms=None, compress=False):
    """
    Transform the provided webdataset and write the result to a new file.

    Every sample read from *wds_in* is passed through ``_SeqChromDatasetByWds``
    (which receives *transforms*) and written to *wds_out* under its original
    ``__key__``.

    :param wds_in: input webdataset file
    :type wds_in: string
    :param wds_out: output webdataset file
    :type wds_out: string
    :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
    :type transforms: dict of functions
    :param compress: whether to compress the output file
    :type compress: boolean
    """

    # keep_key=True makes the dataset yield the sample key as the first item,
    # so each output record can reuse the key of the record it came from.
    ds = _SeqChromDatasetByWds(wds_in, transforms=transforms, keep_key=True)

    # The context manager closes the writer even when reading or transforming
    # a sample raises, so the output file handle is never leaked.
    with wds.TarWriter(wds_out, compress=compress) as sink:
        for key, seq, chrom, target, label in ds:
            sink.write({
                "__key__": key,
                "seq.npy": seq,
                "chrom.npy": chrom,
                "target.npy": target,
                "label.npy": label,
            })
| 49 | + |
22 | 50 | def dump_data_webdataset(coords, genome_fasta, bigwig_filelist, |
23 | 51 | target_bam=None, |
24 | 52 | outdir="dataset/", outprefix="seqchrom", |
25 | 53 | compress=True, |
26 | 54 | numProcessors=1, |
27 | | - transforms=None): |
| 55 | + transforms=None, |
| 56 | + DALI=False): |
28 | 57 | """ |
29 | 58 | Given coordinates dataframe, extract the sequence and chromatin signal, save in webdataset format |
30 | 59 |
|
|
0 commit comments