Skip to content

Commit 536518f

Browse files
authored
Merge pull request #41 from CEGRcode/4dn
4dn metadata fetching support
2 parents 3e72df1 + 1dc085b commit 536518f

4 files changed

Lines changed: 355 additions & 0 deletions

File tree

scripts/README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,36 @@ optional arguments:
4646
-o json_fn, --output json_fn
4747
the output json filename
4848
```
49+
## get_metadata_from_4DNucleosome.py
4950

51+
Retrieves the following information keyed on the BAM file accession (4DNFIXXXXXXX) using the 4D Nucleome (4DN) data portal API.
52+
- experiment `@id` (e.g. `/experiments-hi-c/4DNEXXXXXXXX/`)
53+
- assay name
54+
- biosource `@id` (e.g. `/biosources/4DNSRXXXXXXX/`)
55+
- strain info, run type (single/paired end)
56+
- target ("None" if not applicable)
57+
- file size
58+
- total reads
59+
- read length
60+
- genome assembly
61+
62+
```
63+
usage: get_metadata_from_4DNucleosome.py [-h] -i input_fn -o json_fn
64+
65+
Retrieve 4D Nucleosome metadata from API for plotter.
66+
67+
optional arguments:
68+
-h, --help show this help message and exit
69+
-i input_fn, --input input_fn
70+
the tab-delimited file with 4DNFI accessions of BAM
71+
files in the first column
72+
-o json_fn, --output json_fn
73+
the output json filename
74+
```
5075

5176
## Run tests
5277
```
5378
python get_metadata_from_ENCODE.py -i testdata/encode_samples.txt -o testdata/encode_samples.json
5479
python get_metadata_from_TABfile.py -i testdata/samples.tab -o testdata/samples.json
80+
python3 get_metadata_from_4DNucleosome.py -i testdata/4dnucleosome_sample.txt -o testdata/4dnucleosome_sample.json
5581
```
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
import argparse
2+
import json
3+
import requests
4+
5+
def getParams():
    '''Build the command-line interface and return the parsed arguments.

    Both options are mandatory: -i/--input names the tab-delimited accession
    file and -o/--output names the JSON file to write.
    '''
    cli = argparse.ArgumentParser(
        description='Retrieve 4D Nucleosome metadata from API for plotter.',
    )

    cli.add_argument(
        '-i', '--input',
        metavar='input_fn',
        required=True,
        help='the tab-delimited file with 4DNFI accessions of BAM files in the first column',
    )
    cli.add_argument(
        '-o', '--output',
        metavar='json_fn',
        required=True,
        help='the output json filename',
    )

    return cli.parse_args()
14+
15+
16+
# Helper: 4DNFI to URL to payload
17+
def fetch_data(url, timeout=60):
    '''GET a URL and return the decoded JSON payload.

    url     -- the address to request
    timeout -- seconds to wait for the server before giving up; without it
               requests waits forever and a single stalled connection would
               hang the whole run

    Returns whatever response.json() decodes (a dict for the portal's item
    pages). Raises requests.exceptions.RequestException on connection errors
    or timeouts, and ValueError if the body is not valid JSON.
    '''
    # Force return from the server in JSON format
    headers = {'accept': 'application/json'}

    # GET the search result
    response = requests.get(url, headers=headers, timeout=timeout)

    # Extract the JSON response as a python dictionary
    return response.json()
27+
28+
29+
# Main program which takes in input parameters

# URL templates for the portal; %s is replaced with the file accession.
FILE_URL = 'https://data.4dnucleome.org/files-processed/%s/?format=json'
FASTQ_URL = 'https://data.4dnucleome.org/files-fastq/%s/?format=json'


def read_accessions(lines):
    '''Return the first tab-delimited column of every non-blank line.

    lines -- any iterable of text lines (an open file works)

    Blank lines are skipped so they do not turn into empty accessions and
    trigger requests for a malformed URL.
    '''
    accessions = []
    for line in lines:
        accession = line.strip().split('\t')[0].strip()
        if accession:
            accessions.append(accession)
    return accessions


def _iter_biosources(experiments):
    '''Yield each biosource dict found under experiment -> biosample -> biosource.'''
    for experiment in experiments:
        biosample = experiment.get('biosample') if isinstance(experiment, dict) else None
        if not isinstance(biosample, dict):
            print("No experiments or biosample not in experiments")
            continue
        for biosource in biosample.get("biosource") or []:
            if isinstance(biosource, dict):
                yield biosource


def _find_total_reads(data):
    '''Return the "Total Reads" value from the quality metric summary, or None.'''
    quality_metric = data.get("quality_metric")
    if not isinstance(quality_metric, dict):
        # Missing (or unexpanded) quality_metric: nothing to search.
        quality_metric = {}
    for metric in quality_metric.get("quality_metric_summary") or []:
        if metric.get("title") == "Total Reads":
            return metric.get("value")
    print("No quality_metric_summary, can't find total reads")
    return None


def _collect_fastq_accessions(data):
    '''Return the unique FASTQ accessions referenced by workflow_run_outputs.

    Order of first appearance is kept. Duplicates are dropped so each FASTQ
    is only fetched once.
    '''
    found = []
    for run in data.get('workflow_run_outputs') or []:
        if "input_files" not in run:
            print("input_files not in workflow_run_outputs section")
            continue
        for input_file in run["input_files"]:
            if "value" not in input_file:
                print("value not in input_files section")
                continue
            value = input_file["value"]
            if not isinstance(value, dict) or "@id" not in value:
                print("@id not in value section")
                continue
            file_id = value["@id"]
            if "/files-fastq/" in file_id:
                # Take the last path segment whether or not the @id ends in '/'.
                found.append(file_id.strip('/').split('/')[-1])
    return list(dict.fromkeys(found))


def build_record(data, fetch):
    '''Turn one processed-file payload into the metadata record for the plotter.

    data  -- the decoded JSON payload of a processed file
    fetch -- callable taking a URL and returning decoded JSON; used to look up
             each FASTQ file of a CUT&RUN sample

    Returns a dict with the keys ENCSR, ENCLB, target, ENCBS, strain, assay,
    assembly, file_size, total_reads, read_length and run_type. Scalar fields
    are stringified, so a missing value is the string "None". read_length and
    run_type are a one-element list holding a per-FASTQ dict for CUT&RUN and
    the string "None" for every other assay.
    '''
    experiments = data.get('experiments') or []
    track_facet_info = data.get("track_and_facet_info", None)

    # Get Library accession
    # Okay that it's None
    library_id = None

    # Get Experiment Accession (the last experiment carrying an @id wins)
    experiment_id = None
    for experiment in experiments:
        if isinstance(experiment, dict) and '@id' in experiment:
            experiment_id = experiment['@id']
        else:
            print("No experiments or accession not in experiments")

    # Get Biosource id and Biosample name in a single pass over the biosources
    biosource_id = None
    strain = None
    for biosource in _iter_biosources(experiments):
        if "@id" in biosource:
            biosource_id = biosource["@id"]
        else:
            print("No biosource or ENCBS not in biosource")
        cell_line = biosource.get("cell_line")
        if isinstance(cell_line, dict) and "term_name" in cell_line:
            strain = cell_line["term_name"]
        else:
            # Leave strain untouched rather than reading an unset/stale cell_line.
            print("No biosource or cell_line not in biosource")

    # Get Target and Assay
    target = None
    assay_title = None
    if track_facet_info is not None:
        target = track_facet_info.get("assay_info")
        assay_title = track_facet_info.get("experiment_type")
    else:
        print("No track_and_facet_info, can't find assay_info or experiment_type")

    # Get Treatment (N/A for now)

    # Get Read Info
    assembly = data.get("genome_assembly", None)
    file_size = data.get('file_size', None)

    # Get Total Reads
    # CUT&RUN doesn't have total reads
    total_reads = None
    if assay_title == "in situ Hi-C":
        total_reads = _find_total_reads(data)

    # Get Read Length and Run Type
    # May need to double check if this is extracted from the right place
    mapped_read_length = "None"
    mapped_run_type = "None"
    if assay_title == "CUT&RUN":
        read_lengths = {}
        run_types = {}
        for fastq in _collect_fastq_accessions(data):
            fastq_data = fetch(FASTQ_URL % fastq)
            key = "/files-fastq/" + fastq
            read_lengths[key] = fastq_data.get("read_length", None)
            if "paired_end" in fastq_data:
                run_types[key] = "pair-ended"
            else:
                run_types[key] = "single-ended"
        mapped_read_length = [read_lengths]
        mapped_run_type = [run_types]

    # Future work: add audit information

    return {
        'ENCSR': str(experiment_id),
        'ENCLB': str(library_id),
        'target': str(target),
        'ENCBS': str(biosource_id),
        'strain': str(strain),
        'assay': str(assay_title),
        'assembly': str(assembly),
        'file_size': str(file_size),
        'total_reads': str(total_reads),
        'read_length': mapped_read_length,
        'run_type': mapped_run_type,
    }


def main():
    '''Read the accession list, fetch each file's metadata and write the JSON.'''
    # Get params
    args = getParams()

    # Parse list of accessions
    with open(args.input, 'r') as reader:
        sample_list = read_accessions(reader)

    # Initialize metadata storage dict
    metadata = {}

    # Parse payload for each accession
    for bam_4DNFI in sample_list:
        # Get payload for accession
        data = fetch_data(FILE_URL % bam_4DNFI)

        # Confirm payload accession
        accession = data.get('accession', '4DNFIXXXXXXX').strip()
        if accession != bam_4DNFI:
            print("Error: mismatched 4DNFI (%s != %s)" % (accession, bam_4DNFI))
            continue

        # Update metadata with new accession info
        metadata[accession] = build_record(data, fetch_data)

    # Writing to sample.json
    with open(args.output, "w") as outfile:
        outfile.write(json.dumps(metadata, indent=4))


if __name__ == '__main__':
    main()
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
{
2+
"4DNFIK734P7Z": {
3+
"ENCSR": "/experiments-hi-c/4DNEXJCUBTM2/",
4+
"ENCLB": "None",
5+
"target": "Arima - A1, A2",
6+
"ENCBS": "/biosources/4DNSRCCM5D5D/",
7+
"strain": "HUES8",
8+
"assay": "in situ Hi-C",
9+
"assembly": "GRCh38",
10+
"file_size": "59552912307",
11+
"total_reads": "321592041",
12+
"read_length": "None",
13+
"run_type": "None"
14+
},
15+
"4DNFIKSORPB9": {
16+
"ENCSR": "/experiments-hi-c/4DNEXW6T5QSA/",
17+
"ENCLB": "None",
18+
"target": "HindIII",
19+
"ENCBS": "/biosources/4DNSRLAXYUCU/",
20+
"strain": "GM19204",
21+
"assay": "Dilution Hi-C",
22+
"assembly": "GRCh38",
23+
"file_size": "92909712242",
24+
"total_reads": "None",
25+
"read_length": "None",
26+
"run_type": "None"
27+
},
28+
"4DNFIP6DJ98P": {
29+
"ENCSR": "/experiments-repliseq/4DNEXOLHMWYM/",
30+
"ENCLB": "None",
31+
"target": "late fraction of 2 fractions",
32+
"ENCBS": "/biosources/4DNSRIOTVJ4X/",
33+
"strain": "pluripotent stem cell",
34+
"assay": "2-stage Repli-seq",
35+
"assembly": "GRCh38",
36+
"file_size": "606626982",
37+
"total_reads": "None",
38+
"read_length": "None",
39+
"run_type": "None"
40+
},
41+
"4DNFI66KS84H": {
42+
"ENCSR": "/experiments-repliseq/4DNEXOA9VFCD/",
43+
"ENCLB": "None",
44+
"target": "P2 of 16 fractions",
45+
"ENCBS": "/biosources/4DNSRJ3TG8FL/",
46+
"strain": "HCT116",
47+
"assay": "Multi-stage Repli-seq",
48+
"assembly": "GRCh38",
49+
"file_size": "1222308231",
50+
"total_reads": "None",
51+
"read_length": "None",
52+
"run_type": "None"
53+
},
54+
"4DNFI61TAGXP": {
55+
"ENCSR": "/experiments-seq/4DNEXHKQPX6M/",
56+
"ENCLB": "None",
57+
"target": "H2A.Z protein",
58+
"ENCBS": "/biosources/4DNSRV3SKQ8M/",
59+
"strain": "H1-hESC",
60+
"assay": "CUT&RUN",
61+
"assembly": "GRCh38",
62+
"file_size": "10229433099",
63+
"total_reads": "None",
64+
"read_length": [
65+
{
66+
"/files-fastq/4DNFIOXB4NOH": 25,
67+
"/files-fastq/4DNFIMTMXANT": 25,
68+
"/files-fastq/4DNFIHKEPRLT": 25,
69+
"/files-fastq/4DNFIW2Y8BBQ": 25,
70+
"/files-fastq/4DNFIABI5ARW": 25,
71+
"/files-fastq/4DNFI5TBKNYX": 25,
72+
"/files-fastq/4DNFITUXPJN2": 25,
73+
"/files-fastq/4DNFILMHOUZC": 25,
74+
"/files-fastq/4DNFIPSB3Z5A": 25,
75+
"/files-fastq/4DNFIZ9HJHMH": 25,
76+
"/files-fastq/4DNFIBNA7Y2C": 25,
77+
"/files-fastq/4DNFIT91ZD5W": 25,
78+
"/files-fastq/4DNFI7MS4DBN": 25,
79+
"/files-fastq/4DNFI2YHB4ZG": 25
80+
}
81+
],
82+
"run_type": [
83+
{
84+
"/files-fastq/4DNFIOXB4NOH": "pair-ended",
85+
"/files-fastq/4DNFIMTMXANT": "pair-ended",
86+
"/files-fastq/4DNFIHKEPRLT": "pair-ended",
87+
"/files-fastq/4DNFIW2Y8BBQ": "pair-ended",
88+
"/files-fastq/4DNFIABI5ARW": "pair-ended",
89+
"/files-fastq/4DNFI5TBKNYX": "pair-ended",
90+
"/files-fastq/4DNFITUXPJN2": "pair-ended",
91+
"/files-fastq/4DNFILMHOUZC": "pair-ended",
92+
"/files-fastq/4DNFIPSB3Z5A": "pair-ended",
93+
"/files-fastq/4DNFIZ9HJHMH": "pair-ended",
94+
"/files-fastq/4DNFIBNA7Y2C": "pair-ended",
95+
"/files-fastq/4DNFIT91ZD5W": "pair-ended",
96+
"/files-fastq/4DNFI7MS4DBN": "pair-ended",
97+
"/files-fastq/4DNFI2YHB4ZG": "pair-ended"
98+
}
99+
]
100+
},
101+
"4DNFICHIIXAT": {
102+
"ENCSR": "/experiments-damid/4DNEXJ6SOGOE/",
103+
"ENCLB": "None",
104+
"target": "LMNB1 protein",
105+
"ENCBS": "/biosources/4DNSRHGVFSRJ/",
106+
"strain": "RPE-hTERT",
107+
"assay": "pA-DamID",
108+
"assembly": "GRCh38",
109+
"file_size": "507523701",
110+
"total_reads": "None",
111+
"read_length": "None",
112+
"run_type": "None"
113+
}
114+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
4DNFIK734P7Z
2+
4DNFIKSORPB9
3+
4DNFIP6DJ98P
4+
4DNFI66KS84H
5+
4DNFI61TAGXP
6+
4DNFICHIIXAT

0 commit comments

Comments
 (0)