Skip to content

Commit 2055f4e

Browse files
authored
Merge pull request #180 from WengLab-InformaticsResearch/node_norm_update
Node norm update
2 parents db31bcc + 647770e commit 2055f4e

3 files changed

Lines changed: 53 additions & 58 deletions

File tree

cohd/biolink_mapper.py

Lines changed: 36 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -705,9 +705,9 @@ def build_mappings() -> Tuple[str, int]:
705705
total_errors = 0
706706
max_total_errors = 10
707707
max_tries = 2
708-
omop_labels = dict()
709-
lookup_responses = dict()
710-
potential_curies = list()
708+
string_sim_criteria = 0.9
709+
string_match_count = 0
710+
params = list()
711711
for r in missing_ingredient_concepts:
712712
if total_errors >= max_total_errors:
713713
logging.error(f'Biolink Mapper Max Total Errors')
@@ -718,19 +718,46 @@ def build_mappings() -> Tuple[str, int]:
718718
while tries <= max_tries:
719719
try:
720720
omop_id = r['concept_id']
721-
concept_name = r['concept_name']
722-
omop_labels[omop_id] = concept_name
721+
concept_name = r['concept_name'].lower()
723722

724723
# Lookup
725-
j = SriNameResolution.name_lookup(concept_name)
724+
j = SriNameResolution.name_lookup(concept_name, biolink_type='ChemicalEntity', timeout=20)
726725
if j is None:
727726
logging.error(f'Biolink Mapper SRI Lookup Error: {omop_id} - {concept_name}')
728727
total_errors += 1
729728
else:
730729
if len(j) > 0:
731-
# Collect the responses
732-
lookup_responses[omop_id] = j
733-
potential_curies.extend(j.keys())
730+
# Check if any of the labels match well enough
731+
# CURIEs are ordered by best match according to SRI; use this order to find the first match
732+
found_match = None
733+
for node in j:
734+
label = node['label']
735+
string_similarity = difflib.SequenceMatcher(None, concept_name, label.lower()).ratio()
736+
if string_similarity > string_sim_criteria:
737+
found_match = node
738+
break
739+
740+
# If none of the labels matched well, check the synonyms
741+
if not found_match:
742+
for node in j:
743+
for syn in node['synonyms']:
744+
string_similarity = difflib.SequenceMatcher(None, concept_name, syn.lower()).ratio()
745+
if string_similarity > string_sim_criteria:
746+
found_match = node
747+
break
748+
749+
if found_match:
750+
break
751+
752+
if found_match:
753+
curie = found_match['curie']
754+
label = found_match['label']
755+
categories = json.dumps(found_match['types'])
756+
provenance = f'(OMOP:{omop_id})-[SRI Name Resolution]-({curie})'
757+
params.extend([omop_id, curie, label, categories, provenance, True, 99,
758+
string_similarity])
759+
string_match_count += 1
760+
734761
break
735762
else:
736763
logging.info(f'Biolink Mapper - No Match: {omop_id} - {concept_name}')
@@ -740,47 +767,6 @@ def build_mappings() -> Tuple[str, int]:
740767

741768
tries += 1
742769

743-
# Call SRI Node Normalizer to get categories for all potential CURIEs
744-
potential_curies = list(set(potential_curies))
745-
normalized_nodes = SriNodeNormalizer.get_normalized_nodes(potential_curies, 60)
746-
747-
# For each search result, find the first result that is a biolink:ChemicalEntity and high string similarity
748-
string_sim_criteria = 0.9
749-
string_match_count = 0
750-
params = list()
751-
chemical_descendants = bm_toolkit.get_descendants('biolink:ChemicalEntity', reflexive=True, formatted=True)
752-
for omop_id, lookup_response in lookup_responses.items():
753-
omop_label = omop_labels[omop_id].lower()
754-
# CURIEs are in order of best match, according to SRI, so use this order to find the first match
755-
found_match = False
756-
for curie, labels in lookup_response.items():
757-
# Check if the categories of the CURIE include biolink:ChemicalEntity
758-
normalized_node = normalized_nodes.get(curie)
759-
if normalized_node is None:
760-
continue
761-
is_chemical_descendant = False
762-
categories = normalized_node.categories
763-
for category in categories:
764-
if category in chemical_descendants:
765-
is_chemical_descendant = True
766-
break
767-
if not is_chemical_descendant:
768-
continue
769-
770-
# Check if any of the labels match well enough
771-
for label in labels:
772-
string_similarity = difflib.SequenceMatcher(None, omop_label, label.lower()).ratio()
773-
if string_similarity > string_sim_criteria:
774-
found_match = True
775-
categories = json.dumps(categories)
776-
provenance = f'(OMOP:{omop_id})-[SRI Name Resolution]-({curie})'
777-
params.extend([omop_id, curie, label, categories, provenance, True, 99, string_similarity])
778-
string_match_count += 1
779-
break
780-
781-
if found_match:
782-
break
783-
784770
# Name lookup can take a while, reconnect to SQL server
785771
conn = sql_connection()
786772
cur = conn.cursor()

cohd/scheduled_tasks.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@ def task_build_cache():
1414
scheduler = BackgroundScheduler()
1515
scheduler.add_job(func=BiolinkConceptMapper.prefetch_mappings, trigger='cron', hour=6)
1616

17-
# Schedule a task to build the cache every first Saturday of the month (in ITRB-CI and Dev only)
17+
# Schedule a task to build the cache every first Saturday of the month
18+
# Perform in each ITRB environment and on the prod instance on TReK server
1819
deployment_env = app.config.get('DEPLOYMENT_ENV', 'dev').lower()
19-
if False:
20+
if deployment_env in ('itrb-ci', 'itrb-test', 'itrb-prod', 'prod'):
2021
scheduler.add_job(func=task_build_cache, trigger='cron', day='1st sat', hour=0)
2122
logging.info(f'Background task scheduled to build Biolink mappings (env: {deployment_env})')
2223
else:

cohd/translator/sri_name_resolution.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
class SriNameResolution:
99
# server_url = url= 'https://name-resolution-sri.renci.org/'
1010

11-
server_url_default = 'https://name-lookup.transltr.io'
11+
server_url_default = 'http://name-resolution-sri-dev.apps.renci.org/'
1212
server_urls = {
13-
'dev': 'https://name-resolution-sri.renci.org',
13+
'dev': 'http://name-resolution-sri-dev.apps.renci.org/',
1414
'ITRB-CI': 'https://name-lookup.ci.transltr.io',
1515
'ITRB-TEST': 'https://name-lookup.test.transltr.io',
1616
'ITRB-PROD': 'https://name-lookup.transltr.io'
@@ -22,14 +22,17 @@ class SriNameResolution:
2222
logging.info(f'Deployment environment "{deployment_env}" --> using Node Resolution @ {server_url}')
2323

2424
@staticmethod
25-
def name_lookup(text, offset=0, limit=10):
25+
def name_lookup(text, offset=0, limit=10, biolink_type=None, only_prefixes=None, timeout=_TIMEOUT):
2626
""" Lookup CURIEs by name using SRI Name Resolution service
2727
2828
Parameters
2929
----------
3030
text - name to search for
31-
offset - ???
31+
offset - The number of results to skip. Can be used to page through the results of a query.
3232
limit - max number of search results
33+
biolink_type - The Biolink type to filter to (with or without the biolink: prefix), e.g. biolink:Disease or
34+
Disease
35+
only_prefixes - Pipe-separated, case-sensitive list of prefixes to filter to, e.g. MONDO|EFO
3336
3437
Returns
3538
-------
@@ -43,10 +46,15 @@ def name_lookup(text, offset=0, limit=10):
4346
'offset': offset,
4447
'limit': limit
4548
}
49+
if biolink_type is not None:
50+
params['biolink_type'] = biolink_type
51+
if only_prefixes is not None:
52+
params['only_prefixes'] = only_prefixes
53+
4654
try:
47-
response = requests.post(url, params=params, timeout=SriNameResolution._TIMEOUT)
55+
response = requests.post(url, params=params, timeout=timeout)
4856
except requests.exceptions.Timeout:
49-
logging.error(f'SRI Name Resolution timed out after {SriNameResolution._TIMEOUT} sec\n'
57+
logging.error(f'SRI Name Resolution timed out after {timeout} sec\n'
5058
f'Posted params:\n{json.dumps(params)}'
5159
)
5260
return None

0 commit comments

Comments
 (0)