@@ -704,9 +704,9 @@ def build_mappings() -> Tuple[str, int]:
704704 total_errors = 0
705705 max_total_errors = 10
706706 max_tries = 2
707- omop_labels = dict ()
708- lookup_responses = dict ()
709- potential_curies = list ()
707+ string_sim_criteria = 0.9
708+ string_match_count = 0
709+ params = list ()
710710 for r in missing_ingredient_concepts :
711711 if total_errors >= max_total_errors :
712712 logging .error (f'Biolink Mapper Max Total Errors' )
@@ -717,19 +717,46 @@ def build_mappings() -> Tuple[str, int]:
717717 while tries <= max_tries :
718718 try :
719719 omop_id = r ['concept_id' ]
720- concept_name = r ['concept_name' ]
721- omop_labels [omop_id ] = concept_name
720+ concept_name = r ['concept_name' ].lower ()
722721
723722 # Lookup
724- j = SriNameResolution .name_lookup (concept_name )
723+ j = SriNameResolution .name_lookup (concept_name , biolink_type = 'ChemicalEntity' , timeout = 20 )
725724 if j is None :
726725 logging .error (f'Biolink Mapper SRI Lookup Error: { omop_id } - { concept_name } ' )
727726 total_errors += 1
728727 else :
729728 if len (j ) > 0 :
730- # Collect the responses
731- lookup_responses [omop_id ] = j
732- potential_curies .extend (j .keys ())
729+ # Check if any of the labels match well enough
730+ # CURIEs are in order of best match, according to SRI, use this order to find the 1st match
731+ found_match = None
732+ for node in j :
733+ label = node ['label' ]
734+ string_similarity = difflib .SequenceMatcher (None , concept_name , label .lower ()).ratio ()
735+ if string_similarity > string_sim_criteria :
736+ found_match = node
737+ break
738+
739+ # If none of the labels matched well, check the synonyms
740+ if not found_match :
741+ for node in j :
742+ for syn in node ['synonyms' ]:
743+ string_similarity = difflib .SequenceMatcher (None , concept_name , syn .lower ()).ratio ()
744+ if string_similarity > string_sim_criteria :
745+ found_match = node
746+ break
747+
748+ if found_match :
749+ break
750+
751+ if found_match :
752+ curie = found_match ['curie' ]
753+ label = found_match ['label' ]
754+ categories = json .dumps (found_match ['types' ])
755+ provenance = f'(OMOP:{ omop_id } )-[SRI Name Resolution]-({ curie } )'
756+ params .extend ([omop_id , curie , label , categories , provenance , True , 99 ,
757+ string_similarity ])
758+ string_match_count += 1
759+
733760 break
734761 else :
735762 logging .info (f'Biolink Mapper - No Match: { omop_id } - { concept_name } ' )
@@ -739,47 +766,6 @@ def build_mappings() -> Tuple[str, int]:
739766
740767 tries += 1
741768
742- # Call SRI Node Normalizer to get categories for all potential CURIEs
743- potential_curies = list (set (potential_curies ))
744- normalized_nodes = SriNodeNormalizer .get_normalized_nodes (potential_curies , 60 )
745-
746- # For each search result, find the first result that is a biolink:ChemicalEntity and high string similarity
747- string_sim_criteria = 0.9
748- string_match_count = 0
749- params = list ()
750- chemical_descendants = bm_toolkit .get_descendants ('biolink:ChemicalEntity' , reflexive = True , formatted = True )
751- for omop_id , lookup_response in lookup_responses .items ():
752- omop_label = omop_labels [omop_id ].lower ()
753- # CURIEs are in order of best match, according to SRI, so use this order to find the first match
754- found_match = False
755- for curie , labels in lookup_response .items ():
756- # Check if the categories of the CURIE include biolink:ChemicalEntity
757- normalized_node = normalized_nodes .get (curie )
758- if normalized_node is None :
759- continue
760- is_chemical_descendant = False
761- categories = normalized_node .categories
762- for category in categories :
763- if category in chemical_descendants :
764- is_chemical_descendant = True
765- break
766- if not is_chemical_descendant :
767- continue
768-
769- # Check if any of the labels match well enough
770- for label in labels :
771- string_similarity = difflib .SequenceMatcher (None , omop_label , label .lower ()).ratio ()
772- if string_similarity > string_sim_criteria :
773- found_match = True
774- categories = json .dumps (categories )
775- provenance = f'(OMOP:{ omop_id } )-[SRI Name Resolution]-({ curie } )'
776- params .extend ([omop_id , curie , label , categories , provenance , True , 99 , string_similarity ])
777- string_match_count += 1
778- break
779-
780- if found_match :
781- break
782-
783769 # Name lookup can take a while, reconnect to SQL server
784770 conn = sql_connection ()
785771 cur = conn .cursor ()
0 commit comments