Merge pull request #180 from WengLab-InformaticsResearch/node_norm_update

CaseyTa · web-flow · commit 2055f4e56840 · 2023-08-11T11:43:50.000-07:00
Node norm update
diff --git a/cohd/biolink_mapper.py b/cohd/biolink_mapper.py
@@ -705,9 +705,9 @@ def build_mappings() -> Tuple[str, int]:
         total_errors = 0
         max_total_errors = 10
         max_tries = 2
-        omop_labels = dict()
-        lookup_responses = dict()
-        potential_curies = list()
+        string_sim_criteria = 0.9
+        string_match_count = 0
+        params = list()
         for r in missing_ingredient_concepts:
             if total_errors >= max_total_errors:
                 logging.error(f'Biolink Mapper Max Total Errors')
@@ -718,19 +718,46 @@ def build_mappings() -> Tuple[str, int]:
             while tries <= max_tries:
                 try:
                     omop_id = r['concept_id']
-                    concept_name = r['concept_name']
-                    omop_labels[omop_id] = concept_name
+                    concept_name = r['concept_name'].lower()
 
                     # Lookup
-                    j = SriNameResolution.name_lookup(concept_name)
+                    j = SriNameResolution.name_lookup(concept_name, biolink_type='ChemicalEntity', timeout=20)
                     if j is None:
                         logging.error(f'Biolink Mapper SRI Lookup Error: {omop_id} - {concept_name}')
                         total_errors += 1
                     else:
                         if len(j) > 0:
-                            # Collect the responses
-                            lookup_responses[omop_id] = j
-                            potential_curies.extend(j.keys())
+                            # Check if any of the labels match well enough
+                            # SRI returns CURIEs in best-match order, so iterate in that order and take the first match
+                            found_match = None
+                            for node in j:
+                                label = node['label']
+                                string_similarity = difflib.SequenceMatcher(None, concept_name, label.lower()).ratio()
+                                if string_similarity > string_sim_criteria:
+                                    found_match = node
+                                    break
+
+                            # If none of the labels matched well, check the synonyms
+                            if not found_match:
+                                for node in j:
+                                    for syn in node['synonyms']:
+                                        string_similarity = difflib.SequenceMatcher(None, concept_name, syn.lower()).ratio()
+                                        if string_similarity > string_sim_criteria:
+                                            found_match = node
+                                            break
+
+                                    if found_match:
+                                        break
+
+                            if found_match:
+                                curie = found_match['curie']
+                                label = found_match['label']
+                                categories = json.dumps(found_match['types'])                                
+                                provenance = f'(OMOP:{omop_id})-[SRI Name Resolution]-({curie})'
+                                params.extend([omop_id, curie, label, categories, provenance, True, 99,
+                                               string_similarity])
+                                string_match_count += 1
+
                             break
                         else:
                             logging.info(f'Biolink Mapper - No Match: {omop_id} - {concept_name}')
@@ -740,47 +767,6 @@ def build_mappings() -> Tuple[str, int]:
 
                 tries += 1
 
-        # Call SRI Node Normalizer to get categories for all potential CURIEs
-        potential_curies = list(set(potential_curies))
-        normalized_nodes = SriNodeNormalizer.get_normalized_nodes(potential_curies, 60)
-
-        # For each search result, find the first result that is a biolink:ChemicalEntity and high string similarity
-        string_sim_criteria = 0.9
-        string_match_count = 0
-        params = list()
-        chemical_descendants = bm_toolkit.get_descendants('biolink:ChemicalEntity', reflexive=True, formatted=True)
-        for omop_id, lookup_response in lookup_responses.items():
-            omop_label = omop_labels[omop_id].lower()
-            # CURIEs are in order of best match, according to SRI, so use this order to find the first match
-            found_match = False
-            for curie, labels in lookup_response.items():
-                # Check if the categories of the CURIE include biolink:ChemicalEntity
-                normalized_node = normalized_nodes.get(curie)
-                if normalized_node is None:
-                    continue
-                is_chemical_descendant = False
-                categories = normalized_node.categories
-                for category in categories:
-                    if category in chemical_descendants:
-                        is_chemical_descendant = True
-                        break
-                if not is_chemical_descendant:
-                    continue
-
-                # Check if any of the labels match well enough
-                for label in labels:
-                    string_similarity = difflib.SequenceMatcher(None, omop_label, label.lower()).ratio()
-                    if string_similarity > string_sim_criteria:
-                        found_match = True
-                        categories = json.dumps(categories)
-                        provenance = f'(OMOP:{omop_id})-[SRI Name Resolution]-({curie})'
-                        params.extend([omop_id, curie, label, categories, provenance, True, 99, string_similarity])
-                        string_match_count += 1
-                        break
-
-                if found_match:
-                    break
-
         # Name lookup can take a while, reconnect to SQL server
         conn = sql_connection()
         cur = conn.cursor()
diff --git a/cohd/scheduled_tasks.py b/cohd/scheduled_tasks.py
@@ -14,9 +14,10 @@ def task_build_cache():
 scheduler = BackgroundScheduler()
 scheduler.add_job(func=BiolinkConceptMapper.prefetch_mappings, trigger='cron', hour=6)
 
-# Schedule a task to build the cache every first Saturday of the month (in ITRB-CI and Dev only)
+# Schedule a task to build the cache on the first Saturday of every month.
+# This runs in each ITRB environment and on the prod instance on the TReK server.
 deployment_env = app.config.get('DEPLOYMENT_ENV', 'dev').lower()
-if False:    
+if deployment_env in ('itrb-ci', 'itrb-test', 'itrb-prod', 'prod'):    
     scheduler.add_job(func=task_build_cache, trigger='cron', day='1st sat', hour=0)    
     logging.info(f'Background task scheduled to build Biolink mappings (env: {deployment_env})')
 else:
diff --git a/cohd/translator/sri_name_resolution.py b/cohd/translator/sri_name_resolution.py
@@ -8,9 +8,9 @@
 class SriNameResolution:
     # server_url = url= 'https://name-resolution-sri.renci.org/'
 
-    server_url_default = 'https://name-lookup.transltr.io'
+    server_url_default = 'http://name-resolution-sri-dev.apps.renci.org/'
     server_urls = {
-        'dev': 'https://name-resolution-sri.renci.org',
+        'dev': 'http://name-resolution-sri-dev.apps.renci.org/',
         'ITRB-CI': 'https://name-lookup.ci.transltr.io',
         'ITRB-TEST': 'https://name-lookup.test.transltr.io',
         'ITRB-PROD': 'https://name-lookup.transltr.io'
@@ -22,14 +22,17 @@ class SriNameResolution:
     logging.info(f'Deployment environment "{deployment_env}" --> using Node Resolution @ {server_url}')
 
     @staticmethod
-    def name_lookup(text, offset=0, limit=10):
+    def name_lookup(text, offset=0, limit=10, biolink_type=None, only_prefixes=None, timeout=_TIMEOUT):
         """ Lookup CURIEs by name using SRI Name Resolution service
 
         Parameters
         ----------
         text - name to search for
-        offset - ???
+        offset - The number of results to skip. Can be used to page through the results of a query.
         limit - max number of search results
+        biolink_type - The Biolink type to filter to (with or without the biolink: prefix), e.g. biolink:Disease or
+                       Disease
+        only_prefixes - Pipe-separated, case-sensitive list of prefixes to filter to, e.g. MONDO|EFO
 
         Returns
         -------
@@ -43,10 +46,15 @@ def name_lookup(text, offset=0, limit=10):
             'offset': offset,
             'limit': limit
         }
+        if biolink_type is not None:
+            params['biolink_type'] = biolink_type
+        if only_prefixes is not None:
+            params['only_prefixes'] = only_prefixes
+
         try:
-            response = requests.post(url, params=params, timeout=SriNameResolution._TIMEOUT)
+            response = requests.post(url, params=params, timeout=timeout)
         except requests.exceptions.Timeout:
-            logging.error(f'SRI Name Resolution timed out after {SriNameResolution._TIMEOUT} sec\n'
+            logging.error(f'SRI Name Resolution timed out after {timeout} sec\n'
                           f'Posted params:\n{json.dumps(params)}'
                           )
             return None