Skip to content

Commit 30b482f

Browse files
author
Daniel Abercrombie
committed
Merge branch 'master' into quick-fix
Conflicts: workflowwebtools/__init__.py workflowwebtools/predict/evaluate.py
2 parents d0a81f9 + 95c4141 commit 30b482f

10 files changed

Lines changed: 284 additions & 54 deletions

File tree

setup.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,18 @@
2222
'cherrypy<18.0.0',
2323
'mako',
2424
'numpy>=1.6.1',
25-
'scipy>=0.19.1',
25+
'scipy==1.1.0',
2626
'sklearn',
2727
'passlib>=1.6',
2828
'bcrypt',
2929
'pyOpenSSL',
3030
'pyyaml',
3131
'validators',
3232
'tabulate',
33-
'pymongo<3.5.0'
33+
'pymongo<3.5.0',
34+
'cx_Oracle',
35+
'pandas',
36+
'keras',
37+
'tensorflow'
3438
]
3539
)

workflowwebtools/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@
44
:author: Daniel Abercrombie <dabercro@mit.edu>
55
"""
66

7-
__version__ = '0.7.3'
7+
__version__ = '0.9.2'
88

99
__all__ = []

workflowwebtools/classifyerrors.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88

99
import re
1010

11+
from collections import defaultdict
12+
1113
from .procedures import PROCEDURES
12-
from .globalerrors import check_session
1314

14-
def classifyerror(errorcode, workflow, session=None):
15+
def classifyerror(errorcode, workflow):
1516
"""
1617
Return the most relevant characteristics of an error code for this session.
1718
This will include things like:
@@ -24,8 +25,7 @@ def classifyerror(errorcode, workflow, session=None):
2425
More error types should be added to this function as needed
2526
2627
:param int errorcode: The error code that we want to classify
27-
:param str workflow: the workflow that we want to get the errors from
28-
:param cherrypy.Session session: Is the user's cherrypy session
28+
:param workflowinfo.WorkflowInfo workflow: the workflow that we want to get the errors from
2929
:returns: A tuple of strings describing the key characteristics of the errorcode.
3030
These strings are good for printing directly in web browsers.
3131
The first string is the types of errors reported with this error code.
@@ -36,7 +36,7 @@ def classifyerror(errorcode, workflow, session=None):
3636

3737
procedure = PROCEDURES.get(errorcode, {})
3838

39-
logs = check_session(session).get_workflow(workflow).get_explanation(str(errorcode))
39+
logs = workflow.get_explanation(str(errorcode))
4040

4141
error_re = re.compile(r'[\w\s]+ \(Exit code: (\d+)\)')
4242
error_types = {}
@@ -79,26 +79,29 @@ def classifyerror(errorcode, workflow, session=None):
7979
additional_actions_string.replace(' |br| |br| ', '<br>'))
8080

8181

82-
def get_max_errorcode(workflow):
    """
    Get the errorcode with the most errors for a session

    :param workflowinfo.WorkflowInfo workflow: the workflow that we want to get the errors from
    :returns: The error code that appears most often for this workflow
    :rtype: int
    """

    totals = defaultdict(int)

    # Tally every reported error count, keyed by the integer error code.
    # 'NotReported' entries are folded into the sentinel code -1.
    for counts_by_code in workflow.get_errors(True).values():
        for code_str, site_counts in counts_by_code.items():
            code = -1 if code_str == 'NotReported' else int(code_str)
            totals[code] += sum(site_counts.values())

    # Pick the code with the highest tally; 0 when nothing beats the start value
    best_code = 0
    best_count = 0
    for code, count in totals.items():
        if count > best_count:
            best_code = code
            best_count = count

    return best_code

workflowwebtools/default/config.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,4 @@ cluster:
8181
cache_refresh:
8282
errors: 345600
8383
workspace: '.'
84+
refresh_period: 15

workflowwebtools/errorutils.py

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,33 @@
1414

1515
import validators
1616
import cherrypy
17+
import cx_Oracle
1718

1819
from cmstoolbox import sitereadiness
1920
from cmstoolbox.webtools import get_json
2021

2122
from . import workflowinfo
2223
from . import serverconfig
2324

25+
def errors_from_list(workflows):
    """
    :param list workflows: A list of workflows that are in assistance-manual
    :returns: The errors for the workflows
    :rtype: dict
    """
    output = {}

    for name in workflows:
        # Expand each workflow to the full set of workflows sharing its prep ID
        # and merge the errors of every one of them into a single dictionary
        prep_id = workflowinfo.WorkflowInfo(name).get_prep_id()
        for wkf in set(workflowinfo.PrepIDInfo(prep_id).get_workflows()):
            output.update(
                workflowinfo.WorkflowInfo(wkf).get_errors(get_unreported=True)
            )

    return output
42+
43+
2444
def open_location(data_location):
2545
"""
2646
This function assumes that the contents of the location is in JSON format.
@@ -30,8 +50,19 @@ def open_location(data_location):
3050
:returns: information in the JSON file
3151
:rtype: dict
3252
"""
53+
config_dict = serverconfig.config_dict()
54+
55+
if 'oracle' in config_dict:
56+
oracle_db_conn = cx_Oracle.connect(*config_dict['oracle']) # pylint:disable=c-extension-no-member
57+
oracle_cursor = oracle_db_conn.cursor()
58+
oracle_cursor.execute(
59+
"SELECT NAME FROM CMS_UNIFIED_ADMIN.workflow WHERE lower(STATUS) LIKE '%manual%'")
60+
wkfs = [row for row, in oracle_cursor]
61+
oracle_db_conn.close()
62+
cherrypy.log('Number of workflows from database: %i' % len(wkfs))
63+
return errors_from_list(wkfs)
64+
3365
raw = None
34-
indict = {}
3566

3667
if os.path.isfile(data_location):
3768
with open(data_location, 'r') as input_file:
@@ -41,7 +72,7 @@ def open_location(data_location):
4172
components = urlparse.urlparse(data_location)
4273

4374
# Anything we need for the Shibboleth cookie could be in the config file
44-
cookie_stuff = serverconfig.config_dict()['data']
75+
cookie_stuff = config_dict['data']
4576

4677
raw = get_json(components.netloc, components.path,
4778
use_https=True,
@@ -56,16 +87,10 @@ def open_location(data_location):
5687
if not (keys and isinstance(raw[keys[0]], list)):
5788
return raw
5889

59-
for workflow, statuses in raw.iteritems():
60-
if True in ['manual' in status for status in statuses]:
61-
base = workflowinfo.WorkflowInfo(workflow)
62-
prep_id = base.get_prep_id()
63-
for wkf in set(workflowinfo.PrepIDInfo(prep_id).get_workflows()):
64-
indict.update(
65-
workflowinfo.WorkflowInfo(wkf).get_errors(get_unreported=True)
66-
)
67-
68-
return indict
90+
return errors_from_list([
91+
workflow for workflow, statuses in raw.iteritems()
92+
if True in ['manual' in status for status in statuses]
93+
])
6994

7095

7196
def get_list_info(status_list):

workflowwebtools/globalerrors.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,8 @@ def check_session(session, can_refresh=False):
357357
GLOBAL_LOCK.release()
358358

359359
# If session ErrorInfo is old, set up another connection
360-
if can_refresh and theinfo.timestamp < time.time() - 60*30:
360+
if can_refresh and theinfo.timestamp < time.time() - \
361+
60*serverconfig.config_dict()['refresh_period']:
361362
theinfo.teardown()
362363
theinfo.setup()
363364

Lines changed: 175 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,189 @@
1+
# pylint: disable=missing-docstring, too-complex, invalid-name, too-many-branches, too-many-locals
2+
13
"""
24
A module that evaluates a model and returns the prediction
35
"""
46

5-
#from cmstoolbox import sitereadiness
7+
import os
8+
import random
9+
import itertools
10+
11+
import numpy as np
12+
import pandas as pd
13+
import keras as K
14+
15+
16+
def modified_site_name(site):
    """
    Drop the last underscore-separated token of a site name,
    e.g. 'T1_US_FNAL_Disk' -> 'T1_US_FNAL'.
    """
    leading_tokens = site.split('_')[:-1]
    # Rebuild with trailing separators, then strip them all off the end,
    # which also collapses any empty trailing tokens
    rebuilt = ''.join(token + '_' for token in leading_tokens)
    return rebuilt.rstrip('_')
23+
24+
def build_table(df, template_table):
    """
    Fill a copy of *template_table* with the error counts in *df*.

    :param dict df: maps exit codes (as strings) to ``{site name: count}``
        dictionaries for one workflow
    :param pandas.DataFrame template_table: zeroed table whose columns are the
        site names known from training and whose index is exit codes
    :returns: a filled copy of the template; sites unknown to the training data
        are mapped to a proxy site of the same tier
    :rtype: pandas.DataFrame
    """
    sparse_df = template_table.copy()

    # Bucket the training-time site names by their tier digit
    # (second character of names like 'T2_CH_CERN'); 'NA' is skipped
    tier0_sites, tier1_sites, tier2_sites, tier3_sites = [], [], [], []
    for i in sparse_df.keys():
        if i != 'NA':
            if i[1] == '0':
                tier0_sites.append(i)
            elif i[1] == '1':
                tier1_sites.append(i)
            elif i[1] == '2':
                tier2_sites.append(i)
            elif i[1] == '3':
                tier3_sites.append(i)

    n0, n1, n2, n3 = len(tier0_sites), len(tier1_sites), len(tier2_sites), len(tier3_sites)
    for exit_code, site_dict in zip(df.keys(), df.values()):
        exit_code = int(exit_code)
        for site, count in site_dict.items():

            chosen_site = None
            site_present_in_training_data = site in sparse_df.keys()
            if not site_present_in_training_data:

                # First try the site name with its last token stripped
                # (e.g. '..._Disk' suffixes) before falling back to a proxy
                site = modified_site_name(site)
                cond = site in sparse_df.keys()
                if cond:
                    chosen_site = site

                print "Detected a site %s which was not present in the training dataset" % site
                print "We would use a proxy site for this based on whether it is T1, T2 or T3"
                tier = site.split("_")[0][1]
                if chosen_site is None:
                    # Pick a random known site of the same tier as a proxy;
                    # NOTE(review): unknown tiers are silently dropped (continue)
                    if tier == '1':
                        chosen_num = random.randint(0, n1-1)
                        chosen_site = tier1_sites[chosen_num]
                    elif tier == '2':
                        chosen_num = random.randint(0, n2-1)
                        chosen_site = tier2_sites[chosen_num]
                        print 'The chosen site is ', chosen_site
                    elif tier == '3':
                        chosen_num = random.randint(0, n3-1)
                        chosen_site = tier3_sites[chosen_num]
                        print 'The chosen site is ', chosen_site
                    elif tier == '0':
                        chosen_num = random.randint(0, n0-1)
                        chosen_site = tier0_sites[chosen_num]
                        print 'The chosen site is ', chosen_site
                    else:
                        continue
            if chosen_site is None:
                chosen_site = site

            # Guard against NaN counts or NaN cells; otherwise record the count
            if np.isnan(count) or np.isnan(sparse_df.loc[exit_code, chosen_site]):
                sparse_df.loc[exit_code, chosen_site] = 0
            else:
                sparse_df.loc[exit_code, chosen_site] = count

    return sparse_df
83+
84+
85+
def list_of_sites(x):
    """Return the site-name keys of each entry, or ['NA'] when *x* is empty."""
    per_entry_keys = [entry.keys() for entry in x]
    return per_entry_keys if per_entry_keys else ['NA']
87+
88+
def build_table_flatten(x):
    """Flatten the table column by column into one flat list of cell values."""
    return [cell for column in x for cell in x[column]]
97+
98+
99+
def pred(errors):
    """
    Run the trained Keras model over the error tables of a set of workflows.

    :param list errors: a list of dicts, each mapping workflow name to its
        ``{exit code: {site: count}}`` error dictionary
    :returns: one predicted action string per workflow row, or ``['TBD']``
        when the model files are not available locally
    :rtype: list
    """
    # Needs all of these files to be local
    for filename in ['sparse_table.csv', 'actionfile.txt', 'my_model.h5']:
        if not os.path.exists(filename):
            return ['TBD']

    # Flatten the input into (workflow, error_dict) rows of a DataFrame
    df = pd.DataFrame(columns=('workflow', 'errors'))
    base_data = []
    for i in errors:
        it = i.items()
        base_data.extend(it)

    for i, dat in enumerate(base_data):
        workflow, error_dict = dat[0], dat[1]

        # 'NotReported' is encoded as exit code -1, matching the training data
        if 'NotReported' in error_dict:
            error_dict[-1] = error_dict.pop('NotReported')

        df.loc[i] = [workflow, error_dict]

    # Zeroed template with the training-time (exit code x site) layout
    template_table = pd.read_csv("sparse_table.csv").set_index("Unnamed: 0")
    template_table[:] = 0
    df['errors_sites_exit_codes'] = df['errors'].apply(lambda x: x.keys() if x else ['0'])

    df['errors_sites_dict'] = df['errors'].apply(lambda x: x.values() if x else [{'NA': 0}])

    df['errors_sites_list'] = df['errors_sites_dict'].apply(list_of_sites)

    list2d = df['errors_sites_exit_codes'].tolist()

    # NOTE(review): sites_exit_codes and site_names are computed but never
    # used below -- presumably left over from the training notebook
    sites_exit_codes = sorted(set(list(itertools.chain.from_iterable(list2d))), key=int)
    sites_exit_codes = [str(x) for x in sites_exit_codes]

    list2d_step1 = df['errors_sites_list'].tolist()
    list2d_step2 = list(itertools.chain.from_iterable(list2d_step1))
    site_names = sorted(set(list(itertools.chain.from_iterable(list2d_step2))))
    site_names = [str(x) for x in site_names]

    # Build one filled table per workflow and flatten it into a feature vector
    df['table_sites'] = df['errors'].apply(
        lambda x: build_table(x, template_table))
    df['table_sites_flatten'] = df['table_sites'].apply(build_table_flatten)
    x_dataframe = df.loc[:, "table_sites_flatten"]
    x_matrix = x_dataframe.values
    feature_size = len(x_matrix[0])
    res = []
    # clip_length equals feature_size here, so clipping is effectively a no-op
    clip_length = len(x_matrix[0])
    for i in x_matrix:
        i_clipped = i[:clip_length]
        res.extend(i_clipped)

    # Reshape to (n_workflows, feature_size) and drop rows containing NaN
    res = np.asarray(res).reshape(-1, feature_size)
    mask = ~np.any(pd.isnull(res), axis=1)
    res = res[mask]

    # One-hot-style prediction: round, then take the argmax per row
    model = K.models.load_model('my_model.h5')
    predicted_actions_encoded = model.predict(np.array(np.asfarray(res)))
    predicted_actions_encoded = np.round(predicted_actions_encoded)

    # Map encoded positions back to action names via the tab/space-delimited
    # actionfile.txt ("<action> <code>" pairs)
    action_code_dictionary = {}
    a = np.genfromtxt("actionfile.txt", delimiter='\t', dtype=str)
    b = list(i.split(' ') for i in a)
    for i in b:

        action_code_dictionary[int(i[1])] = i[0]

    predicted_actions = []
    for i in predicted_actions_encoded:
        pos = np.argmax(i)

        # -1 marks predictions outside the known action codes
        if pos in action_code_dictionary:
            predicted_actions.append(action_code_dictionary[pos])
        else:
            predicted_actions.append(-1)

    # Free the TF/Keras session so repeated calls do not leak graph state
    K.backend.clear_session()

    return predicted_actions
7176

8177

9-
def predict(wf_obj):
    """
    Takes the errors for a workflow and makes an action prediction

    :param workflowwebtool.workflowinfo.WorkflowInfo wf_obj:
        The WorkflowInfo object that we want to perform a prediction on
    :returns: Prediction results to be passed back to a browser
    :rtype: dict
    """

    workflow_errors = wf_obj.get_errors(True)
    predicted_action = pred([workflow_errors])[0]

    return {'Action': predicted_action}

0 commit comments

Comments
 (0)