Skip to content

Commit f6c53f3

Browse files
committed
Update biolink_mapper for new Name Resolution service
1 parent fb16381 commit f6c53f3

2 files changed

Lines changed: 39 additions & 53 deletions

File tree

cohd/biolink_mapper.py

Lines changed: 36 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -704,9 +704,9 @@ def build_mappings() -> Tuple[str, int]:
704704
total_errors = 0
705705
max_total_errors = 10
706706
max_tries = 2
707-
omop_labels = dict()
708-
lookup_responses = dict()
709-
potential_curies = list()
707+
string_sim_criteria = 0.9
708+
string_match_count = 0
709+
params = list()
710710
for r in missing_ingredient_concepts:
711711
if total_errors >= max_total_errors:
712712
logging.error(f'Biolink Mapper Max Total Errors')
@@ -717,19 +717,46 @@ def build_mappings() -> Tuple[str, int]:
717717
while tries <= max_tries:
718718
try:
719719
omop_id = r['concept_id']
720-
concept_name = r['concept_name']
721-
omop_labels[omop_id] = concept_name
720+
concept_name = r['concept_name'].lower()
722721

723722
# Lookup
724-
j = SriNameResolution.name_lookup(concept_name)
723+
j = SriNameResolution.name_lookup(concept_name, biolink_type='ChemicalEntity', timeout=20)
725724
if j is None:
726725
logging.error(f'Biolink Mapper SRI Lookup Error: {omop_id} - {concept_name}')
727726
total_errors += 1
728727
else:
729728
if len(j) > 0:
730-
# Collect the responses
731-
lookup_responses[omop_id] = j
732-
potential_curies.extend(j.keys())
729+
# Check if any of the labels match well enough
730+
# CURIEs are in order of best match, according to SRI, so use this order to find the 1st match
731+
found_match = None
732+
for node in j:
733+
label = node['label']
734+
string_similarity = difflib.SequenceMatcher(None, concept_name, label.lower()).ratio()
735+
if string_similarity > string_sim_criteria:
736+
found_match = node
737+
break
738+
739+
# If none of the labels matched well, check the synonyms
740+
if not found_match:
741+
for node in j:
742+
for syn in node['synonyms']:
743+
string_similarity = difflib.SequenceMatcher(None, concept_name, syn.lower()).ratio()
744+
if string_similarity > string_sim_criteria:
745+
found_match = node
746+
break
747+
748+
if found_match:
749+
break
750+
751+
if found_match:
752+
curie = found_match['curie']
753+
label = found_match['label']
754+
categories = json.dumps(found_match['types'])
755+
provenance = f'(OMOP:{omop_id})-[SRI Name Resolution]-({curie})'
756+
params.extend([omop_id, curie, label, categories, provenance, True, 99,
757+
string_similarity])
758+
string_match_count += 1
759+
733760
break
734761
else:
735762
logging.info(f'Biolink Mapper - No Match: {omop_id} - {concept_name}')
@@ -739,47 +766,6 @@ def build_mappings() -> Tuple[str, int]:
739766

740767
tries += 1
741768

742-
# Call SRI Node Normalizer to get categories for all potential CURIEs
743-
potential_curies = list(set(potential_curies))
744-
normalized_nodes = SriNodeNormalizer.get_normalized_nodes(potential_curies, 60)
745-
746-
# For each search result, find the first result that is a biolink:ChemicalEntity and high string similarity
747-
string_sim_criteria = 0.9
748-
string_match_count = 0
749-
params = list()
750-
chemical_descendants = bm_toolkit.get_descendants('biolink:ChemicalEntity', reflexive=True, formatted=True)
751-
for omop_id, lookup_response in lookup_responses.items():
752-
omop_label = omop_labels[omop_id].lower()
753-
# CURIEs are in order of best match, according to SRI, so use this order to find the first match
754-
found_match = False
755-
for curie, labels in lookup_response.items():
756-
# Check if the categories of the CURIE include biolink:ChemicalEntity
757-
normalized_node = normalized_nodes.get(curie)
758-
if normalized_node is None:
759-
continue
760-
is_chemical_descendant = False
761-
categories = normalized_node.categories
762-
for category in categories:
763-
if category in chemical_descendants:
764-
is_chemical_descendant = True
765-
break
766-
if not is_chemical_descendant:
767-
continue
768-
769-
# Check if any of the labels match well enough
770-
for label in labels:
771-
string_similarity = difflib.SequenceMatcher(None, omop_label, label.lower()).ratio()
772-
if string_similarity > string_sim_criteria:
773-
found_match = True
774-
categories = json.dumps(categories)
775-
provenance = f'(OMOP:{omop_id})-[SRI Name Resolution]-({curie})'
776-
params.extend([omop_id, curie, label, categories, provenance, True, 99, string_similarity])
777-
string_match_count += 1
778-
break
779-
780-
if found_match:
781-
break
782-
783769
# Name lookup can take a while, reconnect to SQL server
784770
conn = sql_connection()
785771
cur = conn.cursor()

cohd/translator/sri_name_resolution.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class SriNameResolution:
2222
logging.info(f'Deployment environment "{deployment_env}" --> using Node Resolution @ {server_url}')
2323

2424
@staticmethod
25-
def name_lookup(text, offset=0, limit=10, biolink_type=None, only_prefixes=None):
25+
def name_lookup(text, offset=0, limit=10, biolink_type=None, only_prefixes=None, timeout=_TIMEOUT):
2626
""" Lookup CURIEs by name using SRI Name Resolution service
2727
2828
Parameters
@@ -52,9 +52,9 @@ def name_lookup(text, offset=0, limit=10, biolink_type=None, only_prefixes=None)
5252
params['only_prefixes'] = only_prefixes
5353

5454
try:
55-
response = requests.post(url, params=params, timeout=SriNameResolution._TIMEOUT)
55+
response = requests.post(url, params=params, timeout=timeout)
5656
except requests.exceptions.Timeout:
57-
logging.error(f'SRI Name Resolution timed out after {SriNameResolution._TIMEOUT} sec\n'
57+
logging.error(f'SRI Name Resolution timed out after {timeout} sec\n'
5858
f'Posted params:\n{json.dumps(params)}'
5959
)
6060
return None

0 commit comments

Comments
 (0)