Skip to content

Commit f2d39c9

Browse files
author
Will Trimble
committed
Merge WT branch, update to Argparse
2 parents f559d32 + 8aab53a commit f2d39c9

8 files changed

Lines changed: 249 additions & 298 deletions

File tree

LICENSE

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
BSD 2-Clause License
2+
3+
Copyright (c) 2018, University of Chicago and Argonne National Laboratory
4+
All rights reserved.
5+
6+
Redistribution and use in source and binary forms, with or without
7+
modification, are permitted provided that the following conditions are met:
8+
9+
* Redistributions of source code must retain the above copyright notice, this
10+
list of conditions and the following disclaimer.
11+
12+
* Redistributions in binary form must reproduce the above copyright notice,
13+
this list of conditions and the following disclaimer in the documentation
14+
and/or other materials provided with the distribution.
15+
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mglib/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@
99
API_URL = "https://api.mg-rast.org/"
1010
SHOCK_URL = "https://shock.mg-rast.org/"
1111
AUTH_LIST = "Jared Bischof, Travis Harrison, Folker Meyer, Tobias Paczian, Andreas Wilke"
12-
SEARCH_FIELDS = ["function", "organism", "md5", "name", "metadata", "biome", "feature", "material", "country", "location", "longitude", "latitude", "created", "env_package_type", "project_id", "project_name", "PI_firstname", "PI_lastname", "sequence_type", "seq_method", "collection_date"]
12+
SEARCH_FIELDS = ["all", "name", "investigation_type", "biome", "feature", "material", "continent", "country", "location", "longitude", "latitude", "created_on", "env_package", "project_id", "project_name", "PI_firstname", "PI_lastname", "sequence_type", "seq_meth", "collection_date", "bp_count_raw", "sequence_count_raw", "average_length_raw", "taxonomy", "function"]

mglib/mglib.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import json
99
import string
1010
import random
11+
import hashlib
1112
import subprocess
1213
try:
1314
from StringIO import StringIO
@@ -100,20 +101,24 @@ def obj_from_url(url, auth=None, data=None, debug=False, method=None):
100101
return obj
101102

102103
# print to file results of MG-RAST or Shock API
def file_from_url(url, handle, auth=None, data=None, debug=False, sha1=False):
    """Stream the body of an API call to *handle* in 8 KiB chunks.

    handle -- writable text file object (chunks are decoded as utf-8)
    sha1   -- when True, fold every chunk into a SHA-1 checksum
    Returns the hex SHA-1 digest (of nothing when sha1 is False).
    """
    stream = body_from_url(url, 'text/plain', auth=auth, data=data, debug=debug)
    digest = hashlib.sha1()
    chunk = stream.read(8192)
    while chunk:
        if sha1:
            digest.update(chunk)
        handle.write(chunk.decode('utf8'))
        chunk = stream.read(8192)
    return digest.hexdigest()
110115

111116
# print to stdout results of MG-RAST API
112117
def stdout_from_url(url, auth=None, data=None, debug=False):
113118
file_from_url(url, sys.stdout, auth=auth, data=data, debug=debug)
114119

115120
# return python struct from JSON output of asynchronous MG-RAST API
116-
def async_rest_api(url, auth=None, data=None, debug=False, delay=15):
121+
def async_rest_api(url, auth=None, data=None, debug=False, delay=60):
117122
try:
118123
parameters = parse_qs(url.split("?")[1])
119124
assert "asynchronous" in parameters, "Must specify asynchronous=1 for asynchronous call!"
@@ -123,10 +128,9 @@ def async_rest_api(url, auth=None, data=None, debug=False, delay=15):
123128
# If "status" is not present, or if "status" is somehow not "submitted"
124129
# assume this is not an asynchronous call and it's done.
125130
if ('status' in submit) and (submit['status'] == 'done') and ('url' in submit):
126-
return obj_from_url(submit["url"], debug=debug)
131+
return submit['data']
127132
if not (('status' in submit) and (submit['status'] == 'submitted') and ('url' in submit)):
128-
return(submit)
129-
# sys.stderr.write("ERROR: return data invalid format\n:%s"%json.dumps(submit))
133+
return submit
130134
result = obj_from_url(submit['url'], debug=debug)
131135
try:
132136
while result['status'] != 'done':
@@ -250,15 +254,36 @@ def metadata_from_biom(biom, term):
250254
vals.append(value)
251255
return vals
252256

257+
# turn profile format BIOM into matrix format, use only abundances
def profile_to_matrix(p):
    """Convert an MG-RAST profile-format BIOM object into an abundance matrix BIOM.

    p -- BIOM dict.  If the first column id is not 'abundance' the object is
         not a profile export and is returned unchanged.

    The abundance column is relabeled with the object's own id, the matrix
    element metadata is set, sparse data is densified, and any extra stat
    columns (e-value, identity, ...) are trimmed away so only abundances
    remain.  Mutates *p* in place and returns it.
    """
    if p['columns'][0]['id'] != 'abundance':
        # not a profile
        return p
    # extra columns beyond abundance must be dropped after any densify
    trim = len(p['columns']) > 1
    p['columns'][0]['id'] = p['id']
    p['matrix_element_type'] = 'int'
    p['matrix_element_value'] = 'abundance'
    p['date'] = time.strftime("%Y-%m-%d %H:%M:%S")
    if p['matrix_type'] == 'sparse':
        p['data'] = sparse_to_dense(p['data'], p['shape'][0], p['shape'][1])
    if trim:
        p['columns'] = p['columns'][:1]
        for i in range(len(p['rows'])):
            p['data'][i] = p['data'][i][:1]
    return p
274+
253275
# merge two BIOM objects
254276
def merge_biom(b1, b2):
255277
"""input: 2 biom objects of same 'type', 'matrix_element_type', and 'matrix_element_value'
256278
return: merged biom object, duplicate columns skipped, duplicate rows added"""
257-
# hack for using in loop when one oif 2 is empty
279+
# hack for using in loop when one of 2 is empty
258280
if b1 and (not b2):
259281
return b1
260282
if b2 and (not b1):
261283
return b2
284+
# transform profile BIOM from UI export into matrix BIOM
285+
b1 = profile_to_matrix(b1)
286+
b2 = profile_to_matrix(b2)
262287
# validate
263288
if not (b1 and b2 and (b1['type'] == b2['type']) and (b1['matrix_element_type'] == b2['matrix_element_type']) and (b1['matrix_element_value'] == b2['matrix_element_value'])):
264289
sys.stderr.write("The inputed biom objects are not compatable for merging\n")
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
#!/usr/bin/env python
2+
3+
import sys
4+
import os
5+
import json
6+
import yaml
7+
import shutil
8+
import hashlib
9+
from optparse import OptionParser
10+
from prettytable import PrettyTable
11+
from mglib import VERSION, get_auth_token, AUTH_LIST, API_URL, obj_from_url, file_from_url, random_str
12+
13+
VERSION = 'alpha'
14+
15+
prehelp = """
16+
NAME
17+
mg-export-research-object
18+
19+
VERSION
20+
%s
21+
22+
SYNOPSIS
23+
mg-export-research-object [ --help, --user <user>, --passwd <password>, --token <oAuth token>, --metagenome <metagenome id>, --dir <directory name> --list <list manifest>]
24+
25+
DESCRIPTION
26+
Retrieve metagenome research object.
27+
Note: This is an alpha version and currently does not produce a full Research Object.
28+
"""
29+
30+
posthelp = """
31+
Output
32+
List available files in manifest.
33+
OR
34+
Download research object from manifest.
35+
36+
EXAMPLES
37+
mg-export-research-object --metagenome mgm4441680.3 --list
38+
39+
SEE ALSO
40+
-
41+
42+
AUTHORS
43+
%s
44+
"""
45+
46+
def my_unicode_repr(self, data):
    """YAML representer hook: emit a unicode value as a plain utf-8 string node."""
    encoded = data.encode('utf-8')
    return self.represent_str(encoded)
48+
49+
def edit_input(text, mg):
    """Rewrite a CWL job-input YAML template with a metagenome's pipeline parameters.

    text -- YAML text of the job-input template
    mg   -- metagenome object from the MG-RAST API; must provide 'job_id',
            'id', and 'pipeline_parameters' (assumed schema -- TODO confirm
            against the API response)
    Returns the updated YAML document as a string.
    """
    # safe_load: the template only needs plain scalars/maps, and safe_load
    # refuses arbitrary-object tags (yaml.load on external text is unsafe)
    info = yaml.safe_load(text)
    param = mg['pipeline_parameters']
    info['jobid'] = int(mg['job_id'])
    info['sequences']['path'] = "../data/"+mg['id']+".050.upload."+param['file_type']
    # each key is optional in the template; only overwrite what is present
    if 'filterLn' in info:
        info['filterLn'] = param['filter_ln'] == "yes"
    if 'filterAmbig' in info:
        info['filterAmbig'] = param['filter_ambig'] == "yes"
    if 'deviation' in info:
        info['deviation'] = float(param['filter_ln_mult'])
    if 'maxAmbig' in info:
        info['maxAmbig'] = int(param['max_ambig'])
    if 'derepPrefix' in info:
        # prefix length only applies when dereplication is enabled
        if param['dereplicate'] == 'yes':
            info['derepPrefix'] = int(param['prefix_length'])
        else:
            info['derepPrefix'] = 0
    if 'minQual' in info:
        info['minQual'] = int(param['min_qual'])
    if 'maxLqb' in info:
        info['maxLqb'] = int(param['max_lqb'])

    try:
        # Python 2 only: dump unicode values as plain utf-8 strings
        yaml.representer.Representer.add_representer(unicode, my_unicode_repr)
    except NameError:
        # Python 3: str is already unicode, no custom representer needed
        pass
    return yaml.dump(info, allow_unicode=True, default_flow_style=False)
74+
75+
def main(args):
    """Export an MG-RAST metagenome research object.

    Lists the files in the research-object manifest (--list) or downloads
    the manifest, the CWL pipeline files, and all data aggregates into
    --dir, writing BagIt-style manifest-sha1.txt / tagmanifest-sha1.txt
    checksum files alongside them.

    Returns 0 on success, 1 on bad arguments.
    """
    import subprocess  # local import: only needed for the git clone below
    OptionParser.format_description = lambda self, formatter: self.description
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage='', description=prehelp%VERSION, epilog=posthelp%AUTH_LIST)
    parser.add_option("", "--url", dest="url", default=API_URL, help="MG-RAST API url")
    parser.add_option("", "--user", dest="user", default=None, help="OAuth username")
    parser.add_option("", "--passwd", dest="passwd", default=None, help="OAuth password")
    parser.add_option("", "--token", dest="token", default=None, help="OAuth token")
    parser.add_option("", "--metagenome", dest="metagenome", default=None, help="metagenome ID")
    parser.add_option("", "--dir", dest="dir", default=".", help="directory to export to")
    parser.add_option("", "--list", dest="list", action="store_true", default=False, help="list files in manifest")

    # get inputs
    (opts, args) = parser.parse_args()
    if not opts.metagenome:
        sys.stderr.write("ERROR: a metagenome id is required\n")
        return 1
    if not os.path.isdir(opts.dir):
        sys.stderr.write("ERROR: dir '%s' does not exist\n"%opts.dir)
        return 1

    # get auth
    token = get_auth_token(opts)

    # get mg info
    mg = obj_from_url(opts.url+'/metagenome/'+opts.metagenome, auth=token)

    # get manifest
    data = obj_from_url(opts.url+'/researchobject/manifest/'+opts.metagenome, auth=token)

    # just list the manifest contents and exit
    if opts.list:
        pt = PrettyTable(["File Name", "Folder", "Media Type"])
        for info in data["aggregates"]:
            pt.add_row([info["bundledAs"]["filename"], info["bundledAs"]["folder"], info["mediatype"]])
        pt.align = "l"
        print(pt)
        return 0

    # get cwl files from the pipeline repo; list-form call stays correct
    # even when --dir contains spaces or shell metacharacters (os.system
    # with a concatenated command string did not)
    temp_name = random_str(10)
    pipeline_dir = os.path.join(opts.dir, temp_name)
    subprocess.call(["git", "clone", "https://github.com/MG-RAST/pipeline.git", pipeline_dir])

    # download manifest; sha1s collects [hexdigest, relative path] pairs
    sha1s = []
    base = data["@context"][0]["@base"].strip('/')
    manifest_dir = os.path.join(opts.dir, base)
    if not os.path.isdir(manifest_dir):
        os.mkdir(manifest_dir)
    data_str = json.dumps(data)
    with open(os.path.join(manifest_dir, data["manifest"]), 'w') as mh:
        mh.write(data_str)
    # .encode: hashlib requires bytes under Python 3
    sha1s.append([ hashlib.sha1(data_str.encode('utf-8')).hexdigest(), os.path.join(base, data["manifest"]) ])

    # download aggregates
    for info in data["aggregates"]:
        sys.stdout.write("Downloading %s ... "%(info["bundledAs"]["filename"]))
        folder = info["bundledAs"]["folder"].strip('/')
        folder_dir = os.path.join(opts.dir, folder)
        if not os.path.isdir(folder_dir):
            os.mkdir(folder_dir)
        if "githubusercontent" in info["uri"]:
            # CWL files come from the cloned pipeline repo, not the API
            pos = info["uri"].find("CWL")
            src = os.path.join(pipeline_dir, info["uri"][pos:])
            dst = os.path.join(folder_dir, info["bundledAs"]["filename"])
            with open(src, 'r') as sh:
                text = sh.read().replace('../Inputs/', '').replace('../Tools/', '').replace('../Workflows/', '')
            if dst.endswith('job.yaml'):
                text = edit_input(text, mg)
            with open(dst, 'w') as dh:
                dh.write(text)
            sha1s.append([ hashlib.sha1(text.encode('utf-8')).hexdigest(), os.path.join(folder, info["bundledAs"]["filename"]) ])
        else:
            with open(os.path.join(folder_dir, info["bundledAs"]["filename"]), 'w') as fh:
                s1 = file_from_url(info["uri"], fh, auth=token, sha1=True)
            sha1s.append([ s1, os.path.join(folder, info["bundledAs"]["filename"]) ])
        sys.stdout.write("Done\n")

    # output sha1: data/ files go to the payload manifest, the rest to the
    # tag manifest (BagIt convention)
    sha1s.sort(key=lambda x: x[1])
    with open(os.path.join(opts.dir, "manifest-sha1.txt"), 'w') as mansha1, \
         open(os.path.join(opts.dir, "tagmanifest-sha1.txt"), 'w') as tagsha1:
        for s1 in sha1s:
            if s1[1].startswith('data'):
                mansha1.write("%s\t%s\n"%(s1[0], s1[1]))
            else:
                tagsha1.write("%s\t%s\n"%(s1[0], s1[1]))

    # cleanup the temporary pipeline clone
    shutil.rmtree(pipeline_dir)

    return 0
170+
171+
172+
if __name__ == "__main__":
    # propagate main()'s status code so shell callers see failures
    sys.exit(main(sys.argv))

0 commit comments

Comments
 (0)