Skip to content

Commit e8e9105

Browse files
committed
fix(email_extraction): remove extra dots at the end of email addresses + extract some email addresses split across two lines
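- strip addresses of extra "."
- replace "@\n" with "@" in the fulltext before extraction, to include addresses that are broken across two lines right after the "@", as in the following case: "you@" at the end of one line followed by "email.com" on the next line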
1 parent 1251a73 commit e8e9105

3 files changed

Lines changed: 25 additions & 4 deletions

File tree

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="wbtools",
8-
version="1.2.14",
8+
version="1.2.15",
99
author="Valerio Arnaboldi",
1010
author_email="valearna@caltech.edu",
1111
description="Interface to WormBase (www.wormbase.org) curation data, including literature management and NLP "

tests/lib/entity_extraction/test_email_addresses.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import unittest
2+
import os
23

4+
from tests.config_reader import read_db_config, read_tazendra_config
35
from wbtools.lib.nlp.entity_extraction.email_addresses import get_email_addresses_from_text
6+
from wbtools.literature.corpus import CorpusManager
47

58

69
class TestEmailAddresses(unittest.TestCase):
@@ -11,6 +14,24 @@ def test_get_email_addresses_from_text(self):
1114
addr = get_email_addresses_from_text(text)
1215
self.assertEqual(len(addr), 2)
1316

17+
@unittest.skipIf(not os.path.exists(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..", "data",
18+
"local_config", "db.cfg")), "Test DB config file not present")
19+
def test_get_email_addresses_from_paper(self):
20+
config = read_db_config()
21+
tazendra_config = read_tazendra_config()
22+
cm = CorpusManager()
23+
cm.load_from_wb_database(db_name=config["wb_database"]["db_name"], db_user=config["wb_database"]["db_user"],
24+
db_password=config["wb_database"]["db_password"],
25+
db_host=config["wb_database"]["db_host"],
26+
ssh_user=tazendra_config["ssh"]["ssh_user"],
27+
ssh_passwd=tazendra_config["ssh"]["ssh_password"],
28+
paper_ids=['00062455'])
29+
email_addresses = get_email_addresses_from_text(cm.get_paper('00062455').get_text_docs(
30+
include_supplemental=False, return_concatenated=True))
31+
email_addresses_in_wb = cm.get_paper('00062455').get_authors_with_email_address_in_wb()
32+
self.assertEqual(len(email_addresses), 3)
33+
self.assertGreaterEqual(len(email_addresses_in_wb), 2)
34+
1435

1536
if __name__ == '__main__':
1637
unittest.main()

wbtools/lib/nlp/entity_extraction/email_addresses.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44

55

66
def get_email_addresses_from_text(text):
7-
all_addresses = re.findall(EMAIL_ADDRESS_REGEX, text)
7+
all_addresses = re.findall(EMAIL_ADDRESS_REGEX, text.replace('@\n', '@'))
88
if not all_addresses:
99
text = text.replace(". ", ".")
10-
all_addresses = re.findall(EMAIL_ADDRESS_REGEX, text)
11-
all_addresses = [address.strip(".") for address in all_addresses]
10+
all_addresses = re.findall(EMAIL_ADDRESS_REGEX, text.replace('@\n', '@'))
11+
all_addresses = [address.strip(".") for address in all_addresses]
1212
added_addresses = set()
1313
return_addresses = []
1414
for address in all_addresses:

0 commit comments

Comments
 (0)