@@ -705,9 +705,9 @@ def build_mappings() -> Tuple[str, int]:
705705 total_errors = 0
706706 max_total_errors = 10
707707 max_tries = 2
708- omop_labels = dict ()
709- lookup_responses = dict ()
710- potential_curies = list ()
708+ string_sim_criteria = 0.9
709+ string_match_count = 0
710+ params = list ()
711711 for r in missing_ingredient_concepts :
712712 if total_errors >= max_total_errors :
713713 logging .error (f'Biolink Mapper Max Total Errors' )
@@ -718,19 +718,46 @@ def build_mappings() -> Tuple[str, int]:
718718 while tries <= max_tries :
719719 try :
720720 omop_id = r ['concept_id' ]
721- concept_name = r ['concept_name' ]
722- omop_labels [omop_id ] = concept_name
721+ concept_name = r ['concept_name' ].lower ()
723722
724723 # Lookup
725- j = SriNameResolution .name_lookup (concept_name )
724+ j = SriNameResolution .name_lookup (concept_name , biolink_type = 'ChemicalEntity' , timeout = 20 )
726725 if j is None :
727726 logging .error (f'Biolink Mapper SRI Lookup Error: { omop_id } - { concept_name } ' )
728727 total_errors += 1
729728 else :
730729 if len (j ) > 0 :
731- # Collect the responses
732- lookup_responses [omop_id ] = j
733- potential_curies .extend (j .keys ())
730+ # Check if any of the labels match well enough
731+ # CURIEs are in order of best match, according to SRI, use this order to find the 1st match
732+ found_match = None
733+ for node in j :
734+ label = node ['label' ]
735+ string_similarity = difflib .SequenceMatcher (None , concept_name , label .lower ()).ratio ()
736+ if string_similarity > string_sim_criteria :
737+ found_match = node
738+ break
739+
740+ # If none of the labels matched well, check the synonyms
741+ if not found_match :
742+ for node in j :
743+ for syn in node ['synonyms' ]:
744+ string_similarity = difflib .SequenceMatcher (None , concept_name , syn .lower ()).ratio ()
745+ if string_similarity > string_sim_criteria :
746+ found_match = node
747+ break
748+
749+ if found_match :
750+ break
751+
752+ if found_match :
753+ curie = found_match ['curie' ]
754+ label = found_match ['label' ]
755+ categories = json .dumps (found_match ['types' ])
756+ provenance = f'(OMOP:{ omop_id } )-[SRI Name Resolution]-({ curie } )'
757+ params .extend ([omop_id , curie , label , categories , provenance , True , 99 ,
758+ string_similarity ])
759+ string_match_count += 1
760+
734761 break
735762 else :
736763 logging .info (f'Biolink Mapper - No Match: { omop_id } - { concept_name } ' )
@@ -740,47 +767,6 @@ def build_mappings() -> Tuple[str, int]:
740767
741768 tries += 1
742769
743- # Call SRI Node Normalizer to get categories for all potential CURIEs
744- potential_curies = list (set (potential_curies ))
745- normalized_nodes = SriNodeNormalizer .get_normalized_nodes (potential_curies , 60 )
746-
747- # For each search result, find the first result that is a biolink:ChemicalEntity and high string similarity
748- string_sim_criteria = 0.9
749- string_match_count = 0
750- params = list ()
751- chemical_descendants = bm_toolkit .get_descendants ('biolink:ChemicalEntity' , reflexive = True , formatted = True )
752- for omop_id , lookup_response in lookup_responses .items ():
753- omop_label = omop_labels [omop_id ].lower ()
754- # CURIEs are in order of best match, according to SRI, so use this order to find the first match
755- found_match = False
756- for curie , labels in lookup_response .items ():
757- # Check if the categories of the CURIE include biolink:ChemicalEntity
758- normalized_node = normalized_nodes .get (curie )
759- if normalized_node is None :
760- continue
761- is_chemical_descendant = False
762- categories = normalized_node .categories
763- for category in categories :
764- if category in chemical_descendants :
765- is_chemical_descendant = True
766- break
767- if not is_chemical_descendant :
768- continue
769-
770- # Check if any of the labels match well enough
771- for label in labels :
772- string_similarity = difflib .SequenceMatcher (None , omop_label , label .lower ()).ratio ()
773- if string_similarity > string_sim_criteria :
774- found_match = True
775- categories = json .dumps (categories )
776- provenance = f'(OMOP:{ omop_id } )-[SRI Name Resolution]-({ curie } )'
777- params .extend ([omop_id , curie , label , categories , provenance , True , 99 , string_similarity ])
778- string_match_count += 1
779- break
780-
781- if found_match :
782- break
783-
784770 # Name lookup can take a while, reconnect to SQL server
785771 conn = sql_connection ()
786772 cur = conn .cursor ()
0 commit comments