|
| 1 | +import argparse |
| 2 | +import json |
| 3 | +import requests |
| 4 | + |
def getParams():
    """Parse and return the command-line arguments for this script."""
    parser = argparse.ArgumentParser(
        description='Retrieve 4D Nucleosome metadata from API for plotter.')

    # Required input/output file paths
    parser.add_argument(
        '-i', '--input', metavar='input_fn', required=True,
        help='the tab-delimited file with 4DNFI accessions of BAM files in the first column')
    parser.add_argument(
        '-o', '--output', metavar='json_fn', required=True,
        help='the output json filename')

    return parser.parse_args()
| 14 | + |
| 15 | + |
# Helper: 4DNFI to URL to payload
def fetch_data(url):
    """GET a 4DN API URL and return the decoded JSON payload as a dict.

    Raises requests.HTTPError on a non-2xx response and requests.Timeout
    if the server does not answer within the timeout.
    """
    # Force return from the server in JSON format
    headers = {'accept': 'application/json'}

    # GET the search result.
    # BUGFIX: add a timeout so a stalled server cannot hang the script
    # forever, and fail loudly on HTTP errors instead of handing an
    # error payload to callers that expect real metadata.
    response = requests.get(url, headers=headers, timeout=60)
    response.raise_for_status()

    # Extract the JSON response as a python dictionary
    return response.json()
| 27 | + |
| 28 | + |
# Main program which takes in input parameters
if __name__ == '__main__':

    # Get params
    args = getParams()

    # Parse list of accessions (first tab-delimited column of each line).
    # Use a context manager so the file is closed even on error.
    sample_list = []
    with open(args.input, 'r') as reader:
        for line in reader:
            token = line.strip().split('\t')[0]
            if token:  # skip blank lines
                sample_list.append(token)

    # Initialize metadata storage dict
    metadata = {}

    # Parse payload for each accession
    for bam_4DNFI in sample_list:
        # Get payload for accession
        url = 'https://data.4dnucleome.org/files-processed/%s/?format=json' % bam_4DNFI
        data = fetch_data(url)

        # Confirm payload accession matches what we asked for
        accession = data.get('accession', '4DNFIXXXXXXX').strip()
        if accession != bam_4DNFI:
            # BUGFIX: message previously said "ENCFF" (an ENCODE-style
            # accession); these are 4DNFI accessions.
            print("Error: mismatched 4DNFI accession (%s != %s)" % (accession, bam_4DNFI))
            continue
        experiments = data.get('experiments', [])
        track_facet_info = data.get("track_and_facet_info", None)

        # Get Library accession
        # Okay that it's None
        ENCLB = None

        # Get Experiment Accession (last experiment with '@id' wins)
        ENCSR = None
        for experiment in experiments:
            if '@id' in experiment:
                ENCSR = experiment['@id']
            else:
                print("No experiments or accession not in experiments")

        # Get Experiment-dependent info: biosource accession
        ENCBS = None
        for experiment in experiments:
            if 'biosample' in experiment:
                biosample = experiment['biosample']
                # .get() guards against a biosample without a biosource list
                for source in biosample.get("biosource", []):
                    if "@id" in source:
                        ENCBS = source["@id"]
                    else:
                        print("No biosource or ENCBS not in biosource")
            else:
                print("No experiments or biosample not in experiments")

        # Get Target
        target = None
        if track_facet_info is not None:
            target = track_facet_info.get("assay_info")
        else:
            print("No track_and_facet_info, can't find experiment_type")

        # Get Biosample name
        strain = None
        for experiment in experiments:
            if 'biosample' in experiment:
                biosample = experiment['biosample']
                for bio in biosample.get("biosource", []):
                    if "cell_line" in bio:
                        # BUGFIX: the term_name lookup used to run even when
                        # 'cell_line' was absent, raising NameError (or reusing
                        # a stale value from a previous biosource).
                        strain = bio["cell_line"]["term_name"]
                    else:
                        print("No biosource or cell_line not in biosource")
            else:
                print("No experiments or biosample not in experiments")

        # Get Treatment (N/A for now)

        # Get Assay
        assay_title = None
        if track_facet_info is not None:
            assay_title = track_facet_info.get("experiment_type")
        else:
            print("No track_and_facet_info, can't find experiment_type")

        # Get Read Info
        assembly = data.get("genome_assembly", None)
        file_size = data.get('file_size', None)

        # Get Total Reads
        # CUT&RUN doesn't have total reads
        total_reads = None
        if assay_title == "in situ Hi-C":
            # BUGFIX: default was [], but a list has no .get(); use {} so the
            # chained lookup is safe when 'quality_metric' is missing.
            quality_metric = data.get("quality_metric", {}) or {}
            quality_metric_summary = quality_metric.get("quality_metric_summary", [])
            for metric in quality_metric_summary:
                if metric.get("title") == "Total Reads":
                    total_reads = metric.get("value")
                    break
            else:
                # for-else: no metric titled "Total Reads" was found
                print("No quality_metric_summary, can't find total reads")

        # Get all Fastq's from the json (CUT&RUN only)
        fastq_list = []
        if assay_title == "CUT&RUN":
            for w in data.get('workflow_run_outputs', []):
                if "input_files" not in w:
                    print("input_files not in workflow_run_outputs section")
                    continue
                for input_file in w["input_files"]:
                    if "value" not in input_file:
                        print("value not in input_files section")
                        continue
                    value = input_file["value"]
                    if "@id" not in value:
                        print("@id not in value section")
                        continue
                    file_id = value["@id"]
                    if "/files-fastq/" in file_id:
                        # '@id' looks like /files-fastq/<ACCESSION>/ — the
                        # accession is the second-to-last path component.
                        fastq_list.append(file_id.split('/')[-2])

        # Fetch read length / run type for every fastq found above
        fastq_read_length_dict = {}
        fastq_run_type_dict = {}
        for fastq_acc in fastq_list:
            fastq_url = 'https://data.4dnucleome.org/files-fastq/%s/?format=json' % fastq_acc
            fastq_data = fetch_data(fastq_url)
            key = "/files-fastq/" + fastq_acc
            fastq_read_length_dict[key] = fastq_data.get("read_length", None)
            # NOTE(review): presence of the 'paired_end' key is taken to mean
            # pair-ended; verify against the 4DN schema — the key may exist
            # with a null value for single-end files.
            if "paired_end" in fastq_data:
                fastq_run_type_dict[key] = "pair-ended"
            else:
                fastq_run_type_dict[key] = "single-ended"

        # Get Read Length
        mapped_read_length = None
        if assay_title == "CUT&RUN":
            mapped_read_length = [fastq_read_length_dict]
        if mapped_read_length is None:
            mapped_read_length = "None"

        # Get Run Type
        # May need to double check if this is extracted from the right place
        mapped_run_type = None
        if assay_title == "CUT&RUN":
            mapped_run_type = [fastq_run_type_dict]
        if mapped_run_type is None:
            mapped_run_type = "None"

        # Future work: add audit information

        # Update metadata with new accession info
        metadata[accession] = {
            'ENCSR': str(ENCSR),
            'ENCLB': str(ENCLB),
            'target': str(target),
            'ENCBS': str(ENCBS),
            'strain': str(strain),
            'assay': str(assay_title),
            'assembly': str(assembly),
            'file_size': str(file_size),
            'total_reads': str(total_reads),
            'read_length': mapped_read_length,
            'run_type': mapped_run_type
        }

    # Write collected metadata to the output JSON file
    with open(args.output, "w") as outfile:
        json.dump(metadata, outfile, indent=4)
0 commit comments