Merge pull request #41 from CEGRcode/4dn

owlang · web-flow · commit 536518f73807 · 2023-10-12T15:13:30.000-04:00
4dn metadata fetching support
diff --git a/scripts/README.md b/scripts/README.md
@@ -46,10 +46,36 @@ optional arguments:
   -o json_fn, --output json_fn
                         the output json filename
 ```
+## get_metadata_from_4DNucleosome.py
 
+Retrieves the following information keyed on the BAM file accession (4DNFIXXXXXXX) using the 4D Nucleosome API.
+- experiment accession (ENSRXXXXXX)
+- assay name
+- biosample accession (ENCBSXXXXXX)
+- strain info, run type (single/paired end)
+- target ("None" if not applicable)
+- file size
+- total reads
+- read length
+- genome assembly
+
+```
+usage: get_metadata_from_4DNucleosome.py [-h] -i input_fn -o json_fn
+
+Retrieve 4D Nucleosome metadata from API for plotter.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -i input_fn, --input input_fn
+                        the tab-delimited file with 4DNFI accessions of BAM
+                        files in the first column
+  -o json_fn, --output json_fn
+                        the output json filename
+```
 
 ## Run tests
 ```
 python get_metadata_from_ENCODE.py -i testdata/encode_samples.txt -o testdata/encode_samples.json
 python get_metadata_from_TABfile.py -i testdata/samples.tab -o testdata/samples.json
+python3 get_metadata_from_4DNucleosome.py -i testdata/4dnucleosome_sample.txt -o testdata/4dnucleosome_sample.json 
 ```
diff --git a/scripts/get_metadata_from_4DNucleosome.py b/scripts/get_metadata_from_4DNucleosome.py
@@ -0,0 +1,209 @@
+import argparse
+import json
+import requests
+
+def getParams():
+    '''Parse parameters from the command line'''
+    parser = argparse.ArgumentParser(description='Retrieve 4D Nucleosome metadata from API for plotter.')
+
+    parser.add_argument('-i','--input', metavar='input_fn', required=True, help='the tab-delimited file with 4DNFI accessions of BAM files in the first column')
+    parser.add_argument('-o','--output', metavar='json_fn', required=True, help='the output json filename')
+
+    args = parser.parse_args()
+    return(args)
+
+
+# Helper: 4DNFI to URL to payload
+def fetch_data(url):
+    # Force return from the server in JSON format
+    headers = {'accept': 'application/json'}
+
+    # GET the search result
+    response = requests.get(url, headers=headers)
+
+    # Extract the JSON response as a python dictionary
+    search_results = response.json()
+    return(search_results)
+
+
+# Main program which takes in input parameters
+if __name__ == '__main__':
+
+    # Get params
+    args = getParams()
+
+    # Parse list of accessions
+    sample_list = []
+    reader = open(args.input, 'r')
+    for line in reader:
+        sample_list.append(line.strip().split('\t')[0])
+    reader.close()
+
+    # Initialize metadata storage dict
+    metadata = {}
+
+    # Parse payload for each accession
+    for bam_4DNFI in sample_list:
+        # Get payload for accession
+        url = 'https://data.4dnucleome.org/files-processed/%s/?format=json' % bam_4DNFI
+        data = fetch_data(url)
+
+        # Confirm payload accession
+        accession = data.get('accession', '4DNFIXXXXXXX').strip()
+        if (accession != bam_4DNFI):
+            print("Error: mismatched ENCFF (%s != %s)" % (accession, bam_4DNFI))
+            continue
+        experiments = data.get('experiments', [])
+        track_facet_info = data.get("track_and_facet_info", None)
+
+
+
+        # Get Library accession
+        # Okay that it's None
+        ENCLB = None
+
+        # Get Experiment Accession
+        ENCSR = None
+        for experiment in experiments:
+            if '@id' in experiment:
+                ENCSR = experiment['@id']
+            else:
+                print("No experiments or accession not in experiments")
+
+        # Get Experiment-dependent info
+        ENCBS = None
+        for experiment in experiments:
+            if 'biosample' in experiment:
+                biosample = experiment['biosample']
+                biosource = biosample["biosource"]
+                for id in biosource:
+                    if "@id" in id:
+                        ENCBS = id["@id"]
+                    else:
+                        print("No biosource or ENCBS not in biosource")
+            else:
+                print("No experiments or biosample not in experiments")
+
+        # Get Target
+        target = None
+        if track_facet_info is not None:
+            target = track_facet_info["assay_info"]
+        else:
+            print("No track_and_facet_info, can't find experiment_type")
+
+        # Get Biosample name
+        strain = None
+        for experiment in experiments:
+            if 'biosample' in experiment:
+                biosample = experiment['biosample']
+                biosource = biosample["biosource"]
+                for bio in biosource:
+                    if "cell_line" in bio:
+                        cell_line = bio["cell_line"]
+                    else:
+                        print("No biosource or cell_line not in biosource")
+                strain = cell_line["term_name"]
+            else:
+                print("No experiments or biosample not in experiments")
+
+        # Get Treatment (N/A for now)
+
+        # Get Assay
+        assay_title = None
+        if track_facet_info is not None:
+            assay_title = track_facet_info["experiment_type"]
+        else:
+            print("No track_and_facet_info, can't find experiment_type")
+
+        # Get Read Info
+        assembly = data.get("genome_assembly", None)
+
+        file_size = data.get('file_size', None)
+
+        # Get Total Reads
+        # CUT&RUN doens't have total reads
+        total_reads = None 
+        if assay_title == "in situ Hi-C":
+            total_reads = None
+            quality_metric = data.get("quality_metric", [])
+            quality_metric_summary = quality_metric.get("quality_metric_summary", [])
+            for metric in quality_metric_summary:
+                if metric["title"] == "Total Reads":
+                    total_reads = metric["value"]
+                    break
+                else:
+                    print("No quality_metric_summary, can't find total reads")
+
+        # Get all Fastq's from the json
+        fastq_list = []
+        if assay_title == "CUT&RUN":
+            workflow_run_outputs = data.get('workflow_run_outputs', [])
+            for w in workflow_run_outputs:
+                if "input_files" in w:
+                    for input_file in w["input_files"]:
+                        if "value" in input_file:
+                            value = input_file["value"]
+                            if "@id" in value:
+                                id = value["@id"]
+                                if "/files-fastq/" in id:
+                                    parts = id.split('/')
+                                    fastq = parts[-2]   
+                                    fastq_list.append(fastq)
+                            else:
+                                print("@id not in value section")
+                        else:
+                            print("value not in input_files section")
+                else:
+                    print("input_files not in workflow_run_outputs section")
+        
+        fastq_read_length_dict = {}
+        fastq_run_type_dict = {}
+        for f in fastq_list:
+            fastq_url = 'https://data.4dnucleome.org/files-fastq/%s/?format=json' % f
+            fastq_data = fetch_data(fastq_url)
+            read_length = fastq_data.get("read_length", None)
+            key = "/files-fastq/" + f
+            fastq_read_length_dict[key] = read_length
+            if "paired_end" in fastq_data:
+                run_type = "pair-ended"
+            else:
+                run_type = "single-ended"
+            fastq_run_type_dict[key] = run_type
+
+        # Get Read Length
+        mapped_read_length = None  
+        if assay_title == "CUT&RUN":
+            mapped_read_length = [fastq_read_length_dict]
+        if mapped_read_length is None:
+            mapped_read_length = "None"
+
+        # Get Run Type
+        # May need to double check if this is extracted from the right place
+        mapped_run_type = None
+        if assay_title == "CUT&RUN":
+            mapped_run_type = [fastq_run_type_dict]
+        if mapped_run_type is None:
+            mapped_run_type = "None"
+
+        # Future work: add audit information
+
+        # Udate metadata with new accession info
+        metadata.update({
+            accession: {
+                'ENCSR': str(ENCSR),
+                'ENCLB': str(ENCLB),
+                'target': str(target),
+                'ENCBS': str(ENCBS),
+                'strain': str(strain),
+                'assay': str(assay_title),
+                'assembly': str(assembly),
+                'file_size': str(file_size),
+                'total_reads': str(total_reads),
+                'read_length': (mapped_read_length),
+                'run_type': (mapped_run_type)
+            }
+        })
+
+    # Writing to sample.json
+    with open(args.output, "w") as outfile:
+        outfile.write(json.dumps(metadata, indent=4))
diff --git a/scripts/testdata/4dnucleosome_sample.json b/scripts/testdata/4dnucleosome_sample.json
@@ -0,0 +1,114 @@
+{
+    "4DNFIK734P7Z": {
+        "ENCSR": "/experiments-hi-c/4DNEXJCUBTM2/",
+        "ENCLB": "None",
+        "target": "Arima - A1, A2",
+        "ENCBS": "/biosources/4DNSRCCM5D5D/",
+        "strain": "HUES8",
+        "assay": "in situ Hi-C",
+        "assembly": "GRCh38",
+        "file_size": "59552912307",
+        "total_reads": "321592041",
+        "read_length": "None",
+        "run_type": "None"
+    },
+    "4DNFIKSORPB9": {
+        "ENCSR": "/experiments-hi-c/4DNEXW6T5QSA/",
+        "ENCLB": "None",
+        "target": "HindIII",
+        "ENCBS": "/biosources/4DNSRLAXYUCU/",
+        "strain": "GM19204",
+        "assay": "Dilution Hi-C",
+        "assembly": "GRCh38",
+        "file_size": "92909712242",
+        "total_reads": "None",
+        "read_length": "None",
+        "run_type": "None"
+    },
+    "4DNFIP6DJ98P": {
+        "ENCSR": "/experiments-repliseq/4DNEXOLHMWYM/",
+        "ENCLB": "None",
+        "target": "late fraction of 2 fractions",
+        "ENCBS": "/biosources/4DNSRIOTVJ4X/",
+        "strain": "pluripotent stem cell",
+        "assay": "2-stage Repli-seq",
+        "assembly": "GRCh38",
+        "file_size": "606626982",
+        "total_reads": "None",
+        "read_length": "None",
+        "run_type": "None"
+    },
+    "4DNFI66KS84H": {
+        "ENCSR": "/experiments-repliseq/4DNEXOA9VFCD/",
+        "ENCLB": "None",
+        "target": "P2 of 16 fractions",
+        "ENCBS": "/biosources/4DNSRJ3TG8FL/",
+        "strain": "HCT116",
+        "assay": "Multi-stage Repli-seq",
+        "assembly": "GRCh38",
+        "file_size": "1222308231",
+        "total_reads": "None",
+        "read_length": "None",
+        "run_type": "None"
+    },
+    "4DNFI61TAGXP": {
+        "ENCSR": "/experiments-seq/4DNEXHKQPX6M/",
+        "ENCLB": "None",
+        "target": "H2A.Z protein",
+        "ENCBS": "/biosources/4DNSRV3SKQ8M/",
+        "strain": "H1-hESC",
+        "assay": "CUT&RUN",
+        "assembly": "GRCh38",
+        "file_size": "10229433099",
+        "total_reads": "None",
+        "read_length": [
+            {
+                "/files-fastq/4DNFIOXB4NOH": 25,
+                "/files-fastq/4DNFIMTMXANT": 25,
+                "/files-fastq/4DNFIHKEPRLT": 25,
+                "/files-fastq/4DNFIW2Y8BBQ": 25,
+                "/files-fastq/4DNFIABI5ARW": 25,
+                "/files-fastq/4DNFI5TBKNYX": 25,
+                "/files-fastq/4DNFITUXPJN2": 25,
+                "/files-fastq/4DNFILMHOUZC": 25,
+                "/files-fastq/4DNFIPSB3Z5A": 25,
+                "/files-fastq/4DNFIZ9HJHMH": 25,
+                "/files-fastq/4DNFIBNA7Y2C": 25,
+                "/files-fastq/4DNFIT91ZD5W": 25,
+                "/files-fastq/4DNFI7MS4DBN": 25,
+                "/files-fastq/4DNFI2YHB4ZG": 25
+            }
+        ],
+        "run_type": [
+            {
+                "/files-fastq/4DNFIOXB4NOH": "pair-ended",
+                "/files-fastq/4DNFIMTMXANT": "pair-ended",
+                "/files-fastq/4DNFIHKEPRLT": "pair-ended",
+                "/files-fastq/4DNFIW2Y8BBQ": "pair-ended",
+                "/files-fastq/4DNFIABI5ARW": "pair-ended",
+                "/files-fastq/4DNFI5TBKNYX": "pair-ended",
+                "/files-fastq/4DNFITUXPJN2": "pair-ended",
+                "/files-fastq/4DNFILMHOUZC": "pair-ended",
+                "/files-fastq/4DNFIPSB3Z5A": "pair-ended",
+                "/files-fastq/4DNFIZ9HJHMH": "pair-ended",
+                "/files-fastq/4DNFIBNA7Y2C": "pair-ended",
+                "/files-fastq/4DNFIT91ZD5W": "pair-ended",
+                "/files-fastq/4DNFI7MS4DBN": "pair-ended",
+                "/files-fastq/4DNFI2YHB4ZG": "pair-ended"
+            }
+        ]
+    },
+    "4DNFICHIIXAT": {
+        "ENCSR": "/experiments-damid/4DNEXJ6SOGOE/",
+        "ENCLB": "None",
+        "target": "LMNB1 protein",
+        "ENCBS": "/biosources/4DNSRHGVFSRJ/",
+        "strain": "RPE-hTERT",
+        "assay": "pA-DamID",
+        "assembly": "GRCh38",
+        "file_size": "507523701",
+        "total_reads": "None",
+        "read_length": "None",
+        "run_type": "None"
+    }
+}
diff --git a/scripts/testdata/4dnucleosome_sample.txt b/scripts/testdata/4dnucleosome_sample.txt
@@ -0,0 +1,6 @@
+4DNFIK734P7Z
+4DNFIKSORPB9
+4DNFIP6DJ98P
+4DNFI66KS84H
+4DNFI61TAGXP
+4DNFICHIIXAT