3535LOWERCASE_ALL_NAMES = True
3636
3737
38- def convert_synonyms_to_sapbert (synonym_filename , sapbert_filename_gzipped ):
38+ def convert_synonyms_to_sapbert (synonym_filename_gz , sapbert_filename_gzipped ):
3939 """
4040 Convert a synonyms file to the training format for SAPBERT (https://github.com/RENCI-NER/sapbert).
4141
4242 Based on the code in https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207
4343
44- :param synonym_filename : The compendium file to convert.
44+ :param synonym_filename_gz : The gzipped synonyms file to convert.
4545 :param sapbert_filename_gzipped: The SAPBERT training file to generate.
4646 """
4747
48- logger .info (f"convert_synonyms_to_sapbert({ synonym_filename } , { sapbert_filename_gzipped } )" )
48+ logger .info (f"convert_synonyms_to_sapbert({ synonym_filename_gz } , { sapbert_filename_gzipped } )" )
4949
5050 # For now, the simplest way to identify the DrugChemicalConflated file is by name.
5151 # In this case we still generate DrugChemicalConflated.txt.gz, but we also generate
5252 # DrugChemicalConflatedSmaller.txt.gz, which ignores cliques whose preferred label is
5353 # longer than config['demote_labels_longer_than'].
5454 generate_smaller_filename = None
55- if GENERATE_DRUG_CHEMICAL_SMALLER_FILE and synonym_filename .endswith ('/DrugChemicalConflated.txt' ):
55+ if GENERATE_DRUG_CHEMICAL_SMALLER_FILE and synonym_filename_gz .endswith ('/DrugChemicalConflated.txt.gz ' ):
5656 generate_smaller_filename = sapbert_filename_gzipped .replace ('.txt.gz' , 'Smaller.txt.gz' )
5757
5858 # Make the output directories if they don't exist.
@@ -67,7 +67,7 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
6767 count_entry = 0
6868 count_training_rows = 0
6969 count_smaller_rows = 0
70- with open (synonym_filename , "r " , encoding = "utf-8" ) as synonymf , gzip .open (sapbert_filename_gzipped , "wt" , encoding = "utf-8" ) as sapbertf :
70+ with gzip . open (synonym_filename_gz , "rt " , encoding = "utf-8" ) as synonymf , gzip .open (sapbert_filename_gzipped , "wt" , encoding = "utf-8" ) as sapbertf :
7171 for input_line in synonymf :
7272 count_entry += 1
7373 entry = json .loads (input_line )
@@ -136,14 +136,14 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
136136 generate_smaller_file .write (line )
137137 count_smaller_rows += 1
138138
139- logger .info (f"Converted { synonym_filename } to SAPBERT training file { synonym_filename } : " +
139+ logger .info (f"Converted { synonym_filename_gz } to SAPBERT training file { synonym_filename_gz } : " +
140140 f"read { count_entry } entries and wrote out { count_training_rows } training rows." )
141141
142142 # Close SmallerFile if needed.
143143 if generate_smaller_file :
144144 generate_smaller_file .close ()
145145 percentage = count_smaller_rows / float (count_training_rows ) * 100
146- logger .info (f"Converted { synonym_filename } to smaller SAPBERT training file { generate_smaller_filename } : " +
146+ logger .info (f"Converted { synonym_filename_gz } to smaller SAPBERT training file { generate_smaller_filename } : " +
147147 f"read { count_entry } entries and wrote out { count_smaller_rows } training rows ({ percentage :.2f} %)." )
148148
149149
0 commit comments