Skip to content

Commit bb81b4e

Browse files
committed
feat(file_server): download pdf files via http instead of ssh
- changed ssh_ parameters (host, user, password) to file_server_
- added functions to download pdf files and to list supp files dirs from http server
- updated tests
- updated README
1 parent 9186d06 commit bb81b4e

10 files changed

Lines changed: 98 additions & 70 deletions

File tree

README.md

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,24 @@ from wbtools.literature.corpus import CorpusManager
1818
paper_id = "00050564"
1919
cm = CorpusManager()
2020
cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost",
21-
paper_ids=[paper_id], ssh_host="ssh_host", ssh_user="ssh_user", ssh_passwd="ssh_passwd")
21+
paper_ids=[paper_id], file_server_host="file_server_base_url", file_server_user="username",
22+
file_server_passwd="password")
2223
sentences = cm.get_paper(paper_id).get_text_docs(split_sentences=True)
2324
```
2425

25-
### Get the latest papers (up to 50) added to WormBase or modified in the last month
26+
### Get the latest papers (up to 50) added to WormBase or modified in the last 30 days
2627

2728
```python
2829
from wbtools.literature.corpus import CorpusManager
2930
import datetime
3031

32+
one_month_ago = (datetime.datetime.now() - datetime.timedelta(days=30)).strftime("%m/%d/%Y")
33+
3134
cm = CorpusManager()
3235
cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost",
33-
from_date=datetime.datetime.now(), max_num_papers=50, ssh_host="ssh_host", ssh_user="ssh_user",
34-
ssh_passwd="ssh_passwd")
36+
from_date=one_month_ago, max_num_papers=50,
37+
file_server_host="file_server_base_url", file_server_user="username",
38+
file_server_passwd="password")
3539
paper_ids = [paper.paper_id for paper in cm.get_all_papers()]
3640
```
3741

@@ -43,8 +47,8 @@ import datetime
4347

4448
cm = CorpusManager()
4549
cm.load_from_wb_database(db_name="wb_dbname", db_user="wb_dbuser", db_password="wb_dbpasswd", db_host="wb_dbhost",
46-
from_date=datetime.datetime.now(), max_num_papers=50, must_be_autclass_flagged=True,
47-
exclude_pap_types=['Review'], exclude_temp_pdf=True, ssh_host="ssh_host", ssh_user="ssh_user",
48-
ssh_passwd="ssh_passwd")
50+
max_num_papers=50, must_be_autclass_flagged=True, exclude_pap_types=['Review'],
51+
exclude_temp_pdf=True, file_server_host="file_server_base_url",
52+
file_server_user="username", file_server_passwd="password")
4953
paper_ids = [paper.paper_id for paper in cm.get_all_papers()]
5054
```

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="wbtools",
8-
version="1.2.16",
8+
version="1.3.0",
99
author="Valerio Arnaboldi",
1010
author_email="valearna@caltech.edu",
1111
description="Interface to WormBase (www.wormbase.org) curation data, including literature management and NLP "

tests/db/test_generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def test_get_curated_variations(self):
2424
curated_variations = self.db_manager.get_curated_variations(exclude_id_used_as_name=True)
2525
allele_regex = ALL_VAR_REGEX.format(designations=self.db_manager.get_allele_designations())
2626
for variation in curated_variations:
27-
self.assertTrue(re.match(allele_regex, variation))
27+
self.assertTrue(re.match(allele_regex, variation.lower()))
2828

2929
def test_entity_name_id_maps(self):
3030
gene_name_id_map = self.db_manager.get_gene_name_id_map()

tests/lib/entity_extraction/test_email_addresses.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ def test_get_email_addresses_from_paper(self):
2323
cm.load_from_wb_database(db_name=config["wb_database"]["db_name"], db_user=config["wb_database"]["db_user"],
2424
db_password=config["wb_database"]["db_password"],
2525
db_host=config["wb_database"]["db_host"],
26-
ssh_user=tazendra_config["ssh"]["ssh_user"],
27-
ssh_passwd=tazendra_config["ssh"]["ssh_password"],
26+
file_server_user=tazendra_config["file_server"]["user"],
27+
file_server_passwd=tazendra_config["file_server"]["password"],
2828
paper_ids=['00062455'])
2929
email_addresses = get_email_addresses_from_text(cm.get_paper('00062455').get_text_docs(
3030
include_supplemental=False, return_concatenated=True))

tests/lib/test_text_preprocessing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ def test_sectioning_cell_template(self):
3030
cm.load_from_wb_database(db_name=config["wb_database"]["db_name"], db_user=config["wb_database"]["db_user"],
3131
db_password=config["wb_database"]["db_password"],
3232
db_host=config["wb_database"]["db_host"],
33-
ssh_user=tazendra_config["ssh"]["ssh_user"],
34-
ssh_passwd=tazendra_config["ssh"]["ssh_password"],
33+
file_server_user=tazendra_config["file_server"]["user"],
34+
file_server_passwd=tazendra_config["file_server"]["password"],
3535
paper_ids=['00059375'])
3636
fulltext = cm.get_paper('00059375').get_text_docs(remove_sections=[PaperSections.REFERENCES],
3737
must_be_present=[PaperSections.METHOD, PaperSections.RESULTS])

tests/literature/test_corpus.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,15 +63,15 @@ def test_load_from_wb_database(self):
6363
cm.load_from_wb_database(db_name=db_config["wb_database"]["db_name"], db_user=db_config["wb_database"]["db_user"],
6464
db_password=db_config["wb_database"]["db_password"],
6565
db_host=db_config["wb_database"]["db_host"],
66-
ssh_user=tazendra_config["ssh"]["ssh_user"],
67-
ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2)
66+
file_server_user=tazendra_config["file_server"]["user"],
67+
file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2)
6868
self.assertTrue(cm.size() == 2)
6969
cm.load_from_wb_database(db_name=db_config["wb_database"]["db_name"],
7070
db_user=db_config["wb_database"]["db_user"],
7171
db_password=db_config["wb_database"]["db_password"],
7272
db_host=db_config["wb_database"]["db_host"],
73-
ssh_user=tazendra_config["ssh"]["ssh_user"],
74-
ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2,
73+
file_server_user=tazendra_config["file_server"]["user"],
74+
file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2,
7575
exclude_temp_pdf=True)
7676
self.assertFalse(any([paper.is_temp() for paper in cm.get_all_papers()]))
7777

@@ -83,8 +83,8 @@ def test_load_supplemental(self):
8383
db_user=db_config["wb_database"]["db_user"],
8484
db_password=db_config["wb_database"]["db_password"],
8585
db_host=db_config["wb_database"]["db_host"],
86-
ssh_user=tazendra_config["ssh"]["ssh_user"],
87-
ssh_passwd=tazendra_config["ssh"]["ssh_password"], paper_ids=["00062512"])
86+
file_server_user=tazendra_config["file_server"]["user"],
87+
file_server_passwd=tazendra_config["file_server"]["password"], paper_ids=["00062512"])
8888
self.assertTrue(len(cm.get_paper("00062512").supplemental_docs) > 0)
8989

9090
@unittest.skipIf(not os.path.exists(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data",
@@ -97,8 +97,8 @@ def test_load_from_wb_database_afp(self):
9797
db_user=db_config["wb_database"]["db_user"],
9898
db_password=db_config["wb_database"]["db_password"],
9999
db_host=db_config["wb_database"]["db_host"],
100-
ssh_user=tazendra_config["ssh"]["ssh_user"],
101-
ssh_passwd=tazendra_config["ssh"]["ssh_password"], max_num_papers=2,
100+
file_server_user=tazendra_config["file_server"]["user"],
101+
file_server_passwd=tazendra_config["file_server"]["password"], max_num_papers=2,
102102
load_curation_info=True, load_afp_info=True,
103103
exclude_temp_pdf=True, exclude_afp_processed=True, must_be_autclass_flagged=True)
104104
self.assertFalse(any([paper.afp_processed for paper in cm.get_all_papers()]))

tests/literature/test_paper.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,13 @@ def test_extract_all_email_addresses_from_text(self):
7171
"local_config", "db.cfg")), "Test DB config file not present")
7272
def test_pdf2txt_conversion(self):
7373
config = read_db_config()
74-
ssh_config = read_tazendra_config()
74+
file_server_config = read_tazendra_config()
7575
db_manager = WBPaperDBManager(
7676
dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"],
7777
password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"])
78-
paper = WBPaper(paper_id="00003969", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"],
79-
ssh_passwd=ssh_config["ssh"]["ssh_password"])
78+
paper = WBPaper(paper_id="00003969", db_manager=db_manager,
79+
file_server_user=file_server_config["file_server"]["user"],
80+
file_server_passwd=file_server_config["file_server"]["password"])
8081
paper.load_text_from_pdf_files_in_db()
8182
fulltext = paper.get_text_docs()
8283
self.assertGreater(len(fulltext), 0)
@@ -86,12 +87,13 @@ def test_pdf2txt_conversion(self):
8687
"local_config", "db.cfg")), "Test DB config file not present")
8788
def test_pdf_table_conversion(self):
8889
config = read_db_config()
89-
ssh_config = read_tazendra_config()
90+
file_server_config = read_tazendra_config()
9091
db_manager = WBPaperDBManager(
9192
dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"],
9293
password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"])
93-
paper = WBPaper(paper_id="00059755", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"],
94-
ssh_passwd=ssh_config["ssh"]["ssh_password"])
94+
paper = WBPaper(paper_id="00059755", db_manager=db_manager,
95+
file_server_user=file_server_config["file_server"]["user"],
96+
file_server_passwd=file_server_config["file_server"]["password"])
9597
paper.load_text_from_pdf_files_in_db()
9698
fulltext = paper.get_text_docs()
9799
self.assertTrue(fulltext)
@@ -100,12 +102,13 @@ def test_pdf_table_conversion(self):
100102
"local_config", "db.cfg")), "Test DB config file not present")
101103
def test_tokenize_sentences_with_tables(self):
102104
config = read_db_config()
103-
ssh_config = read_tazendra_config()
105+
file_server_config = read_tazendra_config()
104106
db_manager = WBPaperDBManager(
105107
dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"],
106108
password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"])
107-
paper = WBPaper(paper_id="00003969", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"],
108-
ssh_passwd=ssh_config["ssh"]["ssh_password"])
109+
paper = WBPaper(paper_id="00003969", db_manager=db_manager,
110+
file_server_user=file_server_config["file_server"]["user"],
111+
file_server_passwd=file_server_config["file_server"]["password"])
109112
paper.load_text_from_pdf_files_in_db()
110113
sentences = paper.get_text_docs(split_sentences=True)
111114
self.assertGreater(len(sentences), 0)
@@ -114,12 +117,13 @@ def test_tokenize_sentences_with_tables(self):
114117
"local_config", "db.cfg")), "Test DB config file not present")
115118
def test_two_cols_conversion(self):
116119
config = read_db_config()
117-
ssh_config = read_tazendra_config()
120+
file_server_config = read_tazendra_config()
118121
db_manager = WBPaperDBManager(
119122
dbname=config["wb_database"]["db_name"], user=config["wb_database"]["db_user"],
120123
password=config["wb_database"]["db_password"], host=config["wb_database"]["db_host"])
121-
paper = WBPaper(paper_id="00055367", db_manager=db_manager, ssh_user=ssh_config["ssh"]["ssh_user"],
122-
ssh_passwd=ssh_config["ssh"]["ssh_password"])
124+
paper = WBPaper(paper_id="00055367", db_manager=db_manager,
125+
file_server_user=file_server_config["file_server"]["user"],
126+
file_server_passwd=file_server_config["file_server"]["password"])
123127
paper.load_text_from_pdf_files_in_db()
124128
fulltext = paper.get_text_docs()
125129
self.assertTrue(fulltext)

wbtools/lib/scraping.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import re
44
import ssl
5+
import tempfile
56
import urllib.request
67
from typing import List
78

@@ -64,3 +65,27 @@ def get_curated_papers(datatype, tazendra_user, tazendra_password) -> List[str]:
6465
if m:
6566
curated_papers = curated_papers | set(m.group(1).split())
6667
return list(curated_papers)
68+
69+
70+
def get_supp_file_names_from_paper_dir(paper_sup_dir_url, user, password):
71+
request = urllib.request.Request(paper_sup_dir_url)
72+
base64string = base64.b64encode(bytes('%s:%s' % (user, password), 'ascii'))
73+
request.add_header("Authorization", "Basic %s" % base64string.decode('utf-8'))
74+
supp_files = set()
75+
with urllib.request.urlopen(request) as response:
76+
res = response.read().decode("utf8")
77+
m = re.findall('.*alt="\[ \]"></td><td><a href="([^"]+)">.*', res)
78+
if m:
79+
supp_files = set(m)
80+
return list(supp_files)
81+
82+
83+
def download_pdf_file_from_url(url, user, password):
84+
tmp_file = tempfile.NamedTemporaryFile()
85+
request = urllib.request.Request(url)
86+
base64string = base64.b64encode(bytes('%s:%s' % (user, password), 'ascii'))
87+
request.add_header("Authorization", "Basic %s" % base64string.decode('utf-8'))
88+
with urllib.request.urlopen(request) as response:
89+
with open(tmp_file.name, 'wb') as tmp_file_stream:
90+
tmp_file_stream.write(response.read())
91+
return tmp_file

wbtools/literature/corpus.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ def load_from_dir_with_txt_files(self, dir_path: str):
5858
paper.add_file(dir_path=dir_path, filename=f, remote_file=False, pdf=False)
5959

6060
def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db_host: str,
61-
ssh_host: str = 'tazendra.caltech.edu', ssh_user: str = None, ssh_passwd: str = None,
61+
file_server_host: str = 'https://tazendra.caltech.edu/~acedb/daniel/',
62+
file_server_user: str = None, file_server_passwd: str = None,
6263
paper_ids: list = None,
6364
from_date: str = None, load_pdf_files: bool = True, load_bib_info: bool = True,
6465
load_curation_info: bool = True, load_afp_info: bool = False, max_num_papers: int = None,
@@ -74,9 +75,9 @@ def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db
7475
db_user (str): database user
7576
db_password (str): database password
7677
db_host (str): database host
77-
ssh_host (str): host where to fetch the files via ssh
78-
ssh_user (str): ssh user to fetch pdf files
79-
ssh_passwd (str): ssh password to fetch pdf files
78+
file_server_host (str): base URL of the file server from which the files are fetched
79+
file_server_user (str): user required to log in to web form
80+
file_server_passwd (str): password to fetch pdf files from web form
8081
paper_ids (list): optional list of paper ids to be fetched
8182
from_date (str): load papers added or modified from the specified date (only if paper_ids is not provided)
8283
load_pdf_files (bool): load pdf files using file server credentials
@@ -119,8 +120,8 @@ def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db
119120
exclude_no_author_email else []
120121

121122
for paper_id in paper_ids:
122-
paper = WBPaper(paper_id=paper_id, ssh_host=ssh_host, ssh_user=ssh_user,
123-
ssh_passwd=ssh_passwd, db_manager=main_db_manager.paper)
123+
paper = WBPaper(paper_id=paper_id, file_server_host=file_server_host, file_server_user=file_server_user,
124+
file_server_passwd=file_server_passwd, db_manager=main_db_manager.paper)
124125
if exclude_afp_processed and paper_id in afp_processed_ids:
125126
logger.info("Skipping paper already processed by AFP")
126127
continue

0 commit comments

Comments
 (0)