Skip to content

Commit 3fe7526

Browse files
authored
Merge pull request #434 from TranslatorSRI/fix-synonyms-compression
Actually running the synonym compression code in Babel v1.10.0 (#424) revealed that it wasn't working properly. This PR should fix that.
2 parents 5c5b1d6 + b752845 commit 3fe7526

13 files changed

Lines changed: 78 additions & 57 deletions

Snakefile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,12 @@ rule clean_downloads:
7474
shell:
7575
"rm -rf {params.dir}/*"
7676

77+
# Sometimes a synonyms file is available as a .gz file, but not as the .txt file itself.
78+
# This rule is here so that Snakemake knows how to uncompress it if needed.
79+
rule uncompress_synonym_file:
80+
input:
81+
config['output_directory'] + '/synonyms/{synonym_file}.txt.gz'
82+
output:
83+
config['output_directory'] + '/synonyms/{synonym_file}.txt'
84+
shell:
85+
'gunzip {input} -c > {output}'

src/createcompendia/leftover_umls.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from src.categories import ACTIVITY, AGENT, DEVICE, DRUG, FOOD, SMALL_MOLECULE, PHYSICAL_ENTITY, PUBLICATION, PROCEDURE
1313

1414

15-
def write_leftover_umls(compendia, mrconso, mrsty, synonyms, umls_compendium, umls_synonyms, report, done, biolink_version):
15+
def write_leftover_umls(compendia, mrconso, mrsty, synonyms, umls_compendium, umls_synonyms, report, biolink_version):
1616
"""
1717
Search for "leftover" UMLS concepts, i.e. those that are defined and valid in MRCONSO but are not
1818
mapped to a concept in Babel.
@@ -26,12 +26,11 @@ def write_leftover_umls(compendia, mrconso, mrsty, synonyms, umls_compendium, um
2626
:param umls_compendium: The UMLS compendium file to write out.
2727
:param umls_synonyms: The synonyms file to generate for this compendium.
2828
:param report: The report file to write out.
29-
:param done: The done file to write out.
3029
:return: Nothing.
3130
"""
3231

3332
logging = Logger()
34-
logging.info(f"write_leftover_umls({compendia}, {mrconso}, {mrsty}, {synonyms}, {umls_compendium}, {umls_synonyms}, {report}, {done})")
33+
logging.info(f"write_leftover_umls({compendia}, {mrconso}, {mrsty}, {synonyms}, {umls_compendium}, {umls_synonyms}, {report}, {biolink_version})")
3534

3635
# For now, we have many more UMLS entities in MRCONSO than in the compendia, so
3736
# we'll make an in-memory list of those first. Once that flips, this should be
@@ -244,9 +243,4 @@ def umls_type_to_biolink_type(umls_tui):
244243
logging.info(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.")
245244
reportf.write(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.\n")
246245

247-
248-
# Write out `done` file.
249-
with open(done, 'w') as outf:
250-
outf.write(f"done\n{datetime.now()}\n")
251-
252-
logging.info("Complete")
246+
logging.info("Complete")

src/exporters/duckdb_exporters.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def export_compendia_to_parquet(compendium_filename, clique_parquet_filename, du
9797
)
9898

9999

100-
def export_synonyms_to_parquet(synonyms_filename, duckdb_filename, synonyms_parquet_filename):
100+
def export_synonyms_to_parquet(synonyms_filename_gz, duckdb_filename, synonyms_parquet_filename):
101101
"""
102102
Export a synonyms file to a DuckDB directory.
103103
@@ -115,7 +115,7 @@ def export_synonyms_to_parquet(synonyms_filename, duckdb_filename, synonyms_parq
115115

116116
with setup_duckdb(duckdb_filename) as db:
117117
# Step 1. Load the entire synonyms file.
118-
synonyms_jsonl = db.read_json(synonyms_filename, format='newline_delimited')
118+
synonyms_jsonl = db.read_json(synonyms_filename_gz, format='newline_delimited')
119119

120120
# Step 2. Create a Cliques table with all the cliques from this file.
121121
#db.sql("CREATE TABLE Cliques (curie TEXT PRIMARY KEY, label TEXT, clique_identifier_count INT, biolink_type TEXT)")

src/exporters/sapbert.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,24 @@
3535
LOWERCASE_ALL_NAMES = True
3636

3737

38-
def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
38+
def convert_synonyms_to_sapbert(synonym_filename_gz, sapbert_filename_gzipped):
3939
"""
4040
Convert a synonyms file to the training format for SAPBERT (https://github.com/RENCI-NER/sapbert).
4141
4242
Based on the code in https://github.com/TranslatorSRI/babel-validation/blob/f21b1b308e54ec0af616f2c24f7e2738ac4c261c/src/main/scala/org/renci/babel/utils/converter/Converter.scala#L107-L207
4343
44-
:param synonym_filename: The compendium file to convert.
44+
:param synonym_filename_gz: The gzipped synonyms file to convert.
4545
:param sapbert_filename_gzipped: The SAPBERT training file to generate.
4646
"""
4747

48-
logger.info(f"convert_synonyms_to_sapbert({synonym_filename}, {sapbert_filename_gzipped})")
48+
logger.info(f"convert_synonyms_to_sapbert({synonym_filename_gz}, {sapbert_filename_gzipped})")
4949

5050
# For now, the simplest way to identify the DrugChemicalConflated file is by name.
5151
# In this case we still generate DrugChemicalConflated.txt, but we also generate
5252
# DrugChemicalConflatedSmaller.txt, which ignores cliques whose preferred label is
5353
# longer than config['demote_labels_longer_than'].
5454
generate_smaller_filename = None
55-
if GENERATE_DRUG_CHEMICAL_SMALLER_FILE and synonym_filename.endswith('/DrugChemicalConflated.txt'):
55+
if GENERATE_DRUG_CHEMICAL_SMALLER_FILE and synonym_filename_gz.endswith('/DrugChemicalConflated.txt.gz'):
5656
generate_smaller_filename = sapbert_filename_gzipped.replace('.txt.gz', 'Smaller.txt.gz')
5757

5858
# Make the output directories if they don't exist.
@@ -67,7 +67,7 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
6767
count_entry = 0
6868
count_training_rows = 0
6969
count_smaller_rows = 0
70-
with open(synonym_filename, "r", encoding="utf-8") as synonymf, gzip.open(sapbert_filename_gzipped, "wt", encoding="utf-8") as sapbertf:
70+
with gzip.open(synonym_filename_gz, "rt", encoding="utf-8") as synonymf, gzip.open(sapbert_filename_gzipped, "wt", encoding="utf-8") as sapbertf:
7171
for input_line in synonymf:
7272
count_entry += 1
7373
entry = json.loads(input_line)
@@ -136,14 +136,14 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
136136
generate_smaller_file.write(line)
137137
count_smaller_rows += 1
138138

139-
logger.info(f"Converted {synonym_filename} to SAPBERT training file {synonym_filename}: " +
139+
logger.info(f"Converted {synonym_filename_gz} to SAPBERT training file {synonym_filename_gz}: " +
140140
f"read {count_entry} entries and wrote out {count_training_rows} training rows.")
141141

142142
# Close SmallerFile if needed.
143143
if generate_smaller_file:
144144
generate_smaller_file.close()
145145
percentage = count_smaller_rows / float(count_training_rows) * 100
146-
logger.info(f"Converted {synonym_filename} to smaller SAPBERT training file {generate_smaller_filename}: " +
146+
logger.info(f"Converted {synonym_filename_gz} to smaller SAPBERT training file {generate_smaller_filename}: " +
147147
f"read {count_entry} entries and wrote out {count_smaller_rows} training rows ({percentage:.2f}%).")
148148

149149

src/snakefiles/cell_line.snakefile

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import src.datahandlers.clo as clo
22
import src.createcompendia.cell_line as cell_line
33
import src.assess_compendia as assessments
4+
import src.snakefiles.util as util
45

56
### Cell Line
67

@@ -26,7 +27,7 @@ rule cell_line_compendia:
2627
icrdf_filename=config['download_directory']+'/icRDF.tsv',
2728
output:
2829
config['output_directory']+"/compendia/CellLine.txt",
29-
config['output_directory']+"/synonyms/CellLine.txt"
30+
temp(config['output_directory']+"/synonyms/CellLine.txt")
3031
run:
3132
cell_line.build_compendia(input.ids,input.icrdf_filename)
3233

@@ -49,9 +50,11 @@ rule check_cell_line:
4950
rule cell_line:
5051
input:
5152
config['output_directory']+'/reports/cell_line_completeness.txt',
52-
config['output_directory'] + "/synonyms/CellLine.txt",
53-
config['output_directory'] + "/reports/CellLine.txt"
53+
config['output_directory'] + "/reports/CellLine.txt",
54+
cell_line_synonyms=config['output_directory'] + "/synonyms/CellLine.txt",
5455
output:
56+
config['output_directory'] + "/synonyms/CellLine.txt.gz",
5557
x=config['output_directory']+'/reports/cell_line_done'
56-
shell:
57-
"echo 'done' >> {output.x}"
58+
run:
59+
util.gzip_files([input.cell_line_synonyms])
60+
util.write_done(output.x)

src/snakefiles/drugchemical.snakefile

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,20 +56,17 @@ rule drugchemical_conflated_synonyms:
5656
input:
5757
drugchemical_conflation=[config['output_directory']+'/conflation/DrugChemical.txt'],
5858
chemical_compendia=expand("{do}/compendia/{co}", do=config['output_directory'], co=config['chemical_outputs']),
59-
chemical_synonyms=expand("{do}/synonyms/{co}", do=config['output_directory'], co=config['chemical_outputs']),
59+
chemical_synonyms_gz=expand("{do}/synonyms/{co}.gz", do=config['output_directory'], co=config['chemical_outputs']),
6060
output:
61-
drugchemical_conflated=config['output_directory']+'/synonyms/DrugChemicalConflated.txt',
61+
drugchemical_conflated_gz=config['output_directory']+'/synonyms/DrugChemicalConflated.txt.gz',
6262
run:
63-
synonymconflation.conflate_synonyms(input.chemical_synonyms, input.chemical_compendia, input.drugchemical_conflation, output.drugchemical_conflated)
63+
synonymconflation.conflate_synonyms(input.chemical_synonyms_gz, input.chemical_compendia, input.drugchemical_conflation, output.drugchemical_conflated_gz)
6464

6565
rule drugchemical:
6666
input:
6767
config['output_directory']+'/conflation/DrugChemical.txt',
68-
config['output_directory']+'/synonyms/DrugChemicalConflated.txt',
69-
chemical_synonyms=expand("{do}/synonyms/{co}", do=config['output_directory'], co=config['chemical_outputs']),
68+
config['output_directory']+'/synonyms/DrugChemicalConflated.txt.gz',
7069
output:
71-
chemical_synonyms_gzipped=expand("{do}/synonyms/{co}.gz", do=config['output_directory'], co=config['chemical_outputs']),
72-
x=config['output_directory']+'/reports/drugchemical_done'
70+
done=config['output_directory']+'/reports/drugchemical_done'
7371
run:
74-
util.gzip_files(input.chemical_synonyms)
75-
util.write_done(output.x)
72+
util.write_done(output.done)

src/snakefiles/duckdb.snakefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ rule export_all_synonyms_to_duckdb:
4444
# Generic rule for generating the Parquet files for a particular synonyms file.
4545
rule export_synonyms_to_duckdb:
4646
input:
47-
synonyms_file=config['output_directory'] + "/synonyms/{filename}.txt",
47+
synonyms_file=config['output_directory'] + "/synonyms/{filename}.txt.gz",
4848
output:
4949
duckdb_filename=temp(config['output_directory'] + "/duckdb/duckdbs/filename={filename}/synonyms.duckdb"),
5050
synonyms_parquet_filename=config['output_directory'] + "/duckdb/parquet/filename={filename}/Synonyms.parquet",

src/snakefiles/exports.snakefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ rule export_all_to_sapbert_training:
4949
# Generic rule for generating the SAPBERT training data for a particular synonyms file.
5050
rule generate_sapbert_training_data:
5151
input:
52-
synonym_file=config['output_directory'] + "/synonyms/{filename}",
52+
synonym_file_gz=config['output_directory'] + "/synonyms/{filename}.gz",
5353
output:
5454
sapbert_training_data_file=config['output_directory'] + "/sapbert-training-data/{filename}.gz",
5555
run:
56-
sapbert.convert_synonyms_to_sapbert(input.synonym_file, output.sapbert_training_data_file)
56+
sapbert.convert_synonyms_to_sapbert(input.synonym_file_gz, output.sapbert_training_data_file)

src/snakefiles/leftover_umls.snakefile

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,15 @@ rule leftover_umls:
3131
umls_compendium = config['output_directory'] + "/compendia/umls.txt",
3232
umls_synonyms = temp(config['output_directory'] + "/synonyms/umls.txt"),
3333
report = config['output_directory'] + "/reports/umls.txt",
34-
done = config['output_directory'] + "/reports/umls_done"
3534
run:
36-
write_leftover_umls(input.input_compendia, input.mrconso, input.mrsty, input.synonyms, output.umls_compendium, output.umls_synonyms, output.report, output.done, config['biolink_version'])
35+
write_leftover_umls(input.input_compendia, input.mrconso, input.mrsty, input.synonyms, output.umls_compendium, output.umls_synonyms, output.report, config['biolink_version'])
3736

3837
rule compress_umls:
3938
input:
4039
umls_synonyms = config['output_directory'] + "/synonyms/umls.txt",
4140
output:
4241
umls_synonyms_gzipped = config['output_directory'] + "/synonyms/umls.txt.gz",
42+
done = config['output_directory'] + "/reports/umls_done",
4343
run:
44-
util.gzip_files([input.umls_synonyms])
44+
util.gzip_files([input.umls_synonyms])
45+
util.write_done(output.done)

src/snakefiles/publications.snakefile

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import src.createcompendia.publications as publications
22
import src.assess_compendia as assessments
3+
from src.snakefiles import util
34

45
### PubMed
56

@@ -54,6 +55,8 @@ rule generate_pubmed_compendia:
5455
icrdf_filename=config['download_directory'] + '/icRDF.tsv',
5556
output:
5657
publication_compendium = config['output_directory'] + '/compendia/Publication.txt',
58+
# The Publication synonyms file we generate is empty, but we still need to generate one.
59+
publication_synonyms_gz = config['output_directory'] + '/synonyms/Publication.txt.gz',
5760
run:
5861
publications.generate_compendium(
5962
[input.pmid_doi_concord_file],
@@ -62,6 +65,10 @@ rule generate_pubmed_compendia:
6265
output.publication_compendium,
6366
input.icrdf_filename
6467
)
68+
# generate_compendium() will generate an (empty) Publication.txt file, but we need
69+
# to compress it.
70+
publication_synonyms = os.path.splitext(output.publication_synonyms_gz)[0]
71+
util.gzip_files([publication_synonyms])
6572

6673
rule check_publications_completeness:
6774
input:
@@ -82,8 +89,7 @@ rule check_publications:
8289
rule publications:
8390
input:
8491
config['output_directory']+'/reports/publication_completeness.txt',
85-
# No synonyms for Publication.txt yet.
86-
# synonyms=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['publication_outputs']),
92+
synonyms = expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['publication_outputs']),
8793
reports = expand("{od}/reports/{ap}",od=config['output_directory'], ap = config['publication_outputs'])
8894
output:
8995
x=config['output_directory']+'/reports/publications_done'

0 commit comments

Comments
 (0)