Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
36d56f2
text tokenizer returns positions of tokens
whalebot-helmsman Sep 21, 2017
2d4d2ef
update tests
whalebot-helmsman Sep 21, 2017
80658ca
separate statement for every action
whalebot-helmsman Sep 21, 2017
c52e449
comma preserving test
whalebot-helmsman Sep 21, 2017
8178776
too many tokens around
whalebot-helmsman Sep 21, 2017
51c0932
encode in indices instead of entities
whalebot-helmsman Sep 21, 2017
1a667ec
handle empty lists
whalebot-helmsman Sep 21, 2017
24465b1
pass token length and position from TextToken to HtmlToken
whalebot-helmsman Sep 21, 2017
06befbb
letter perfect detokenization
whalebot-helmsman Sep 22, 2017
e5730b2
do not cleanup tokenized tree by default, separate method for tree cl…
Sep 25, 2017
e340444
update tests for separate tree cleaning
Sep 25, 2017
89673c1
update tests for correct punctuation positions
Sep 25, 2017
7c45984
correct length for replaced quotes
Sep 25, 2017
46fc4df
pep8
Sep 29, 2017
388170e
comma at line end, not start
Sep 29, 2017
71caf61
one join instead of many additions, don't be Shlemiel the painter
Sep 29, 2017
37d7470
correct formatting
Sep 29, 2017
e93c6dc
add clarification
Sep 29, 2017
e02c275
fix typo
Sep 29, 2017
f26569f
pep8
Sep 29, 2017
d1aecbb
preserve tokenize method for compatibility
Sep 29, 2017
35a9d88
function to reduce code in tests
Sep 29, 2017
9033188
remove test for nltk tokenizer
Sep 29, 2017
c14f363
test our behaviour, which differs from the original treebank tokenizer
Sep 29, 2017
a071cd4
remove useless conversion
Sep 29, 2017
a33f564
rename method to avoid confusion with nltk tokenize_span method
Sep 29, 2017
75a9698
remove brittle tests
Sep 29, 2017
4729323
small benchmark for html tokenizer
Sep 29, 2017
943a44e
Revert "remove brittle tests"
whalebot-helmsman Oct 2, 2017
ba7d6fe
move brittle tests to pytest xfail
whalebot-helmsman Oct 2, 2017
b72bcc1
expect behaviour of nltk tokenizer
whalebot-helmsman Oct 2, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 24 additions & 21 deletions webstruct/text_tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ class WordTokenizer(object):

>>> from nltk.tokenize.treebank import TreebankWordTokenizer # doctest: +SKIP
>>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'''
>>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP
>>> TreebankWordTokenizer().span_tokenize(s) # doctest: +SKIP
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

span_tokenize returns a different kind of output; this could be a good time to either fix these tests or remove them

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com']
>>> WordTokenizer().tokenize(s)
>>> WordTokenizer().span_tokenize(s)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in nltk, span_tokenize returns (start, end) tuples; it would be better to either implement a compatible API or use a different name.

[TextToken(chars='Good', position=0, length=4),
TextToken(chars='muffins', position=5, length=7),
TextToken(chars='cost', position=13, length=4),
Expand All @@ -27,25 +27,25 @@ class WordTokenizer(object):
TextToken(chars='muffins@gmail.com', position=44, length=17)]

>>> s = '''Shelbourne Road,'''
>>> WordTokenizer().tokenize(s)
>>> WordTokenizer().span_tokenize(s)
[TextToken(chars='Shelbourne', position=0, length=10),
TextToken(chars='Road', position=11, length=4),
TextToken(chars=',', position=15, length=1)]

>>> s = '''population of 100,000'''
>>> WordTokenizer().tokenize(s)
>>> WordTokenizer().span_tokenize(s)
[TextToken(chars='population', position=0, length=10),
TextToken(chars='of', position=11, length=2),
TextToken(chars='100,000', position=14, length=7)]

>>> s = '''Hello|World'''
>>> WordTokenizer().tokenize(s)
>>> WordTokenizer().span_tokenize(s)
[TextToken(chars='Hello', position=0, length=5),
TextToken(chars='|', position=5, length=1),
TextToken(chars='World', position=6, length=5)]

>>> s2 = '"We beat some pretty good teams to get here," Slocum said.'
>>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE
>>> WordTokenizer().span_tokenize(s2) # doctest: +NORMALIZE_WHITESPACE
[TextToken(chars='``', position=0, length=1),
TextToken(chars='We', position=1, length=2),
TextToken(chars='beat', position=4, length=4),
Expand All @@ -65,7 +65,7 @@ class WordTokenizer(object):
... cliche-ridden, \"Touched by an
... Angel\" (a show creator John Masius
... worked on) wanna-be if she didn't.'''
>>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE
>>> WordTokenizer().span_tokenize(s3) # doctest: +NORMALIZE_WHITESPACE
[TextToken(chars='Well', position=0, length=4),
TextToken(chars=',', position=4, length=1),
TextToken(chars='we', position=6, length=2),
Expand Down Expand Up @@ -97,28 +97,28 @@ class WordTokenizer(object):
TextToken(chars="didn't", position=133, length=6),
TextToken(chars='.', position=139, length=1)]

>>> WordTokenizer().tokenize('"')
>>> WordTokenizer().span_tokenize('"')
[TextToken(chars='``', position=0, length=1)]

>>> WordTokenizer().tokenize('" a')
>>> WordTokenizer().span_tokenize('" a')
[TextToken(chars='``', position=0, length=1),
TextToken(chars='a', position=2, length=1)]

Some issues:

>>> WordTokenizer().tokenize("Phone:855-349-1914") # doctest: +SKIP
>>> WordTokenizer().span_tokenize("Phone:855-349-1914") # doctest: +SKIP
['Phone', ':', '855-349-1914']

>>> WordTokenizer().tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP
>>> WordTokenizer().span_tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP
['Copyright', '\xc2\xa9', '2014', 'Wall', 'Decor', 'and', 'Home', 'Accents', '.', 'All', 'Rights', 'Reserved', '.']

>>> WordTokenizer().tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP
>>> WordTokenizer().span_tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP
['Powai', 'Campus', ',', 'Mumbai", "-", "400077']

>>> WordTokenizer().tokenize("1 5858/ 1800") # doctest: +SKIP
>>> WordTokenizer().span_tokenize("1 5858/ 1800") # doctest: +SKIP
['1', '5858', '/', '1800']

>>> WordTokenizer().tokenize("Saudi Arabia-") # doctest: +SKIP
>>> WordTokenizer().span_tokenize("Saudi Arabia-") # doctest: +SKIP
['Saudi', 'Arabia', '-']

"""
Expand All @@ -140,17 +140,17 @@ class WordTokenizer(object):

open_quotes = re.compile(r'(^|[\s(\[{<])"')

def _tokenize(self, text):
def _span_tokenize(self, text):
# this one cannot be placed in the loop because it requires
# position check (beginning of the string) or previous char value
quote = self.open_quotes.search(text)
if quote is not None:
end = quote.end() - 1
for t in self._tokenize(text[:end]):
for t in self._span_tokenize(text[:end]):
yield t
yield TextToken(chars='``', position=end, length=1)
shift = end + 1
for t in self._tokenize(text[shift:]):
for t in self._span_tokenize(text[shift:]):
yield TextToken(chars=t.chars,
position=t.position + shift,
length=t.length)
Expand Down Expand Up @@ -185,13 +185,16 @@ def _tokenize(self, text):
break
i += shift

def span_tokenize(self, text):
    """Tokenize *text* into TextToken objects, skipping empty tokens.

    Delegates the real work to the ``_span_tokenize`` generator and
    keeps only tokens whose ``chars`` is non-empty.
    """
    result = []
    for token in self._span_tokenize(text):
        if not token.chars:
            continue
        result.append(token)
    return result

def tokenize(self, text):
    """Return only the token strings (legacy API kept for compatibility).

    Positions and lengths are discarded; use ``span_tokenize`` to get
    the full TextToken tuples.
    """
    # The superseded implementation (iterating self._tokenize directly)
    # is removed: a second return after the first one was unreachable
    # dead code.
    return [t.chars for t in self.span_tokenize(text)]


class DefaultTokenizer(WordTokenizer):
def tokenize(self, text):
tokens = super(DefaultTokenizer, self).tokenize(text)
def span_tokenize(self, text):
tokens = super(DefaultTokenizer, self).span_tokenize(text)
# remove standalone commas and semicolons
# as they broke tag sets,
# e.g. PERSON->FUNCTION in case "PERSON, FUNCTION"
Expand All @@ -205,4 +208,4 @@ def tokenize(self, text):
return [t for t in tokens if t.chars not in {',', ';'}]


# Module-level convenience callable. Only the final binding is kept:
# the earlier ``DefaultTokenizer().tokenize`` assignment was dead code,
# immediately overwritten by this one.
tokenize = DefaultTokenizer().span_tokenize