-
Notifications
You must be signed in to change notification settings - Fork 59
Letter-precise html tokenization #49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
36d56f2
2d4d2ef
80658ca
c52e449
8178776
51c0932
1a667ec
24465b1
06befbb
e5730b2
e340444
89673c1
7c45984
46fc4df
388170e
71caf61
37d7470
e93c6dc
e02c275
f26569f
d1aecbb
35a9d88
9033188
c14f363
a071cd4
a33f564
75a9698
4729323
943a44e
ba7d6fe
b72bcc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,18 +16,22 @@ | |
| import six | ||
| from six.moves import zip | ||
|
|
||
| from lxml.etree import XPathEvaluator, Comment | ||
| from lxml.etree import Comment, iterwalk | ||
|
|
||
| from webstruct.sequence_encoding import IobEncoder | ||
| from webstruct.text_tokenizers import tokenize | ||
| from webstruct.text_tokenizers import tokenize, TextToken | ||
| from webstruct.utils import ( | ||
| replace_html_tags, | ||
| kill_html_tags, | ||
| smart_join, | ||
| ) | ||
|
|
||
|
|
||
| _HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail') | ||
| _HtmlToken = namedtuple('HtmlToken', ['index', | ||
| 'tokens', | ||
| 'elem', | ||
| 'is_tail', | ||
| 'position', | ||
| 'length']) | ||
|
|
||
|
|
||
| class HtmlToken(_HtmlToken): | ||
|
|
@@ -41,6 +45,8 @@ class HtmlToken(_HtmlToken): | |
| * :attr:`elem` is the current html block (as lxml's Element) - most | ||
| likely you want :attr:`parent` instead of it | ||
| * :attr:`is_tail` flag indicates that token belongs to element tail | ||
| * :attr:`position` is the position of the token start in the parent text | ||
| * :attr:`length` is the length of the token in the parent text | ||
|
|
||
| Computed properties: | ||
|
|
||
|
|
@@ -64,8 +70,10 @@ def root(self): | |
| return self.elem.getroottree() | ||
|
|
||
| def __repr__(self): | ||
| return "HtmlToken(token=%r, parent=%r, index=%s)" % ( | ||
| self.token, self.parent, self.index | ||
| return ("HtmlToken(" | ||
| "token=%r, parent=%r, index=%s, position=%d, length=%d" | ||
| ")") % ( | ||
| self.token, self.parent, self.index, self.position, self.length | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -85,7 +93,8 @@ class HtmlTokenizer(object): | |
| ---------- | ||
|
|
||
| tagset : set, optional | ||
| A set of entity types to keep. If not passed, all entity types are kept. | ||
| A set of entity types to keep. | ||
| If not passed, all entity types are kept. | ||
| Use this argument to discard some entity types from training data. | ||
| sequence_encoder : object, optional | ||
| Sequence encoder object. If not passed, | ||
|
|
@@ -142,7 +151,7 @@ def tokenize_single(self, tree): | |
| >>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>") | ||
| >>> html_tokens, tags = html_tokenizer.tokenize_single(tree) | ||
| >>> html_tokens | ||
| [HtmlToken(token='hello', parent=<Element p at ...>, index=0), HtmlToken...] | ||
| [HtmlToken(token='hello', parent=<Element p at ...>, index=0, ...), HtmlToken...] | ||
| >>> tags | ||
| ['O', 'B-PER', 'I-PER', 'B-PER', 'O'] | ||
| >>> for tok, iob_tag in zip(html_tokens, tags): | ||
|
|
@@ -180,6 +189,8 @@ def detokenize_single(self, html_tokens, tags): | |
| Build annotated ``lxml.etree.ElementTree`` from | ||
| ``html_tokens`` (a list of :class:`.HtmlToken` instances) | ||
| and ``tags`` (a list of their tags). | ||
| **ATTENTION**: ``html_tokens`` should be tokenized from a tree | ||
| without annotation tags | ||
|
|
||
| Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__`` | ||
| text tokens (this is the format :mod:`webstruct.loaders` use). | ||
|
|
@@ -190,9 +201,7 @@ def detokenize_single(self, html_tokens, tags): | |
| if not html_tokens: | ||
| return None | ||
|
|
||
| orig_tree = html_tokens[0].root | ||
| tree = copy.deepcopy(orig_tree) | ||
| xpatheval = XPathEvaluator(tree) | ||
| tree = html_tokens[0].root | ||
|
|
||
| # find starts/ends of token groups | ||
| token_groups = self.sequence_encoder.group(zip(html_tokens, tags)) | ||
|
|
@@ -206,30 +215,47 @@ def detokenize_single(self, html_tokens, tags): | |
| pos += n_tokens | ||
|
|
||
| # mark starts/ends with special tokens | ||
| data = zip(html_tokens, tags, range(len(html_tokens))) | ||
| keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail) | ||
| data = [(s, True) for s in starts] | ||
| data.extend((s, False) for s in ends) | ||
| keyfunc = lambda rec: (id(html_tokens[rec[0]].elem), html_tokens[rec[0]].is_tail) | ||
| data.sort(key=keyfunc) | ||
|
|
||
| for (orig_elem, is_tail), g in groupby(data, keyfunc): | ||
| for (_, is_tail), g in groupby(data, keyfunc): | ||
| g = list(g) | ||
| fix = False | ||
| tokens = g[0][0].tokens[:] | ||
| for token, tag, token_idx in g: | ||
| if token_idx in starts: | ||
| text = ' __START_%s__ %s' % (tag[2:], tokens[token.index]) | ||
| tokens[token.index] = text | ||
| fix = True | ||
| if token_idx in ends: | ||
| text = '%s __END_%s__ ' % (tokens[token.index], tag[2:]) | ||
| tokens[token.index] = text | ||
| fix = True | ||
|
|
||
| if fix: | ||
| xpath = orig_tree.getpath(orig_elem) | ||
| elem = xpatheval(xpath)[0] | ||
| if is_tail: | ||
| elem.tail = smart_join(tokens) | ||
| g.sort(key=lambda t: (html_tokens[t[0]].position, not t[1])) | ||
|
|
||
| if not g: | ||
| continue | ||
|
|
||
| elem = html_tokens[g[0][0]].elem | ||
|
|
||
| pos_in_source = 0 | ||
| source = elem.text | ||
| if is_tail: | ||
| source = elem.tail | ||
|
|
||
| modded = '' | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please use a list here, and join it at the end? The current code avoids O(N^2) behavior only on CPython; on other implementations repeated string concatenation is quadratic. |
||
|
|
||
| for idx, is_starts in g: | ||
| token = html_tokens[idx] | ||
| tag = tags[idx] | ||
| modded = modded + source[pos_in_source:token.position] | ||
| pos_in_source = token.position | ||
| if is_starts: | ||
| patch = ' __START_%s__ ' % (tag[2:],) | ||
| modded = modded + patch | ||
| else: | ||
| elem.text = smart_join(tokens) | ||
| end_in_source = pos_in_source + token.length | ||
| modded = modded + source[pos_in_source:end_in_source] | ||
| pos_in_source = pos_in_source + token.length | ||
| patch = ' __END_%s__ ' % (tag[2:],) | ||
| modded = modded + patch | ||
|
|
||
| modded = modded + source[pos_in_source:] | ||
| if is_tail: | ||
| elem.tail = modded | ||
| else: | ||
| elem.text = modded | ||
|
|
||
| return tree | ||
|
|
||
|
|
@@ -245,18 +271,35 @@ def _process_tree(self, tree): | |
| return | ||
|
|
||
| head_tokens, head_tags = self._tokenize_and_split(tree.text) | ||
| char_tokens = [t.chars for t in head_tokens] | ||
| for index, (token, tag) in enumerate(zip(head_tokens, head_tags)): | ||
| yield HtmlToken(index, head_tokens, tree, False), tag | ||
| yield HtmlToken(index, | ||
| char_tokens, | ||
| tree, | ||
| False, | ||
| token.position, | ||
| token.length), tag | ||
|
|
||
| for child in tree: # where is my precious "yield from"? | ||
| for html_token, tag in self._process_tree(child): | ||
| yield html_token, tag | ||
|
|
||
| tail_tokens, tail_tags = self._tokenize_and_split(tree.tail) | ||
| char_tokens = [t.chars for t in tail_tokens] | ||
| for index, (token, tag) in enumerate(zip(tail_tokens, tail_tags)): | ||
| yield HtmlToken(index, tail_tokens, tree, True), tag | ||
| yield HtmlToken(index, | ||
| char_tokens, | ||
| tree, | ||
| True, | ||
| token.position, | ||
| token.length), tag | ||
|
|
||
| def cleanup_tree(self, tree): | ||
| cleaned = copy.deepcopy(tree) | ||
| for _, elem in iterwalk(cleaned): | ||
| self._cleanup_elem(elem) | ||
|
|
||
| self._cleanup_elem(tree) | ||
| return cleaned | ||
|
|
||
| def _cleanup_elem(self, elem): | ||
| """ Remove special tokens from elem """ | ||
|
|
@@ -266,16 +309,23 @@ def _cleanup_elem(self, elem): | |
| elem.tail = self._tag_re.sub("", elem.tail) | ||
|
|
||
| def _tokenize_and_split(self, text): | ||
| input_tokens = self._limit_tags(self.text_tokenize_func(text or '')) | ||
| input_tokens = map(six.text_type, input_tokens) | ||
| return self.sequence_encoder.encode_split(input_tokens) | ||
| text = text or '' | ||
| input_tokens = [t for t in self.text_tokenize_func(text)] | ||
| input_tokens = self._limit_tags(input_tokens) | ||
| input_tokens = [TextToken(chars=six.text_type(t.chars), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unicode doesn't look right here; if t.chars is unicode (str in Python 3) then conversion is not needed; if t.chars is bytes, then conversion should use a proper encoding, not sys.getdefaultencoding() which is often ascii
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This conversion is the same as it was before
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One of tests awaits unicode
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All real encoding/decoding is handled by lxml. lxml uses utf-8 as its internal representation. I think we can add a test with real unicode and safely remove this conversion
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I finally recalled why do we have such code! In Python 2.x lxml returns bytes for ASCII-only data and unicode for non-ascii data; this code ensures everything is unicode. It is only active for ascii-only bytes in Python 2.x, and no-op in all other cases, so it works as intended. Sorry for a false alarm. |
||
| position=t.position, | ||
| length=t.length) for t in input_tokens] | ||
| chains = self.sequence_encoder.encode(t.chars for t in input_tokens) | ||
| chains = self.sequence_encoder.from_indicies(chains, input_tokens) | ||
| chains = [l for l in chains] | ||
| return self.sequence_encoder.split(chains) | ||
|
|
||
| def _limit_tags(self, input_tokens): | ||
| if self.tagset is None: | ||
| return input_tokens | ||
|
|
||
| proc = self.sequence_encoder.token_processor | ||
| token_classes = [proc.classify(tok) for tok in input_tokens] | ||
| token_classes = [proc.classify(tok.chars) for tok in input_tokens] | ||
| return [ | ||
| tok for (tok, (typ, value)) in zip(input_tokens, token_classes) | ||
| if not (typ in {'start', 'end'} and value not in self.tagset) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,23 +11,31 @@ class IobEncoder(object): | |
|
|
||
| >>> iob_encoder = IobEncoder() | ||
| >>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"] | ||
| >>> iob_encoder.encode(input_tokens) | ||
| >>> [p for p in IobEncoder.from_indicies(iob_encoder.encode(input_tokens), input_tokens)] | ||
| [('John', 'B-PER'), ('said', 'O')] | ||
|
|
||
| Get the result in another format using ``encode_split`` method:: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| >>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"] | ||
| >>> tokens, tags = iob_encoder.encode_split(input_tokens) | ||
| >>> tokens = iob_encoder.encode(input_tokens) | ||
| >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens)] | ||
| >>> tokens, tags = iob_encoder.split(tokens) | ||
| >>> tokens, tags | ||
| (['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O']) | ||
|
|
||
| Note that IobEncoder is stateful. This means you can encode incomplete | ||
| stream and continue the encoding later:: | ||
|
|
||
| >>> iob_encoder = IobEncoder() | ||
| >>> iob_encoder.encode(["__START_PER__", "John"]) | ||
| >>> input_tokens_partial = ["__START_PER__", "John"] | ||
| >>> tokens = iob_encoder.encode(input_tokens_partial) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. .encode method no longer returns tokens, so it could be better to rename the variable. |
||
| >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens_partial)] | ||
| >>> tokens | ||
| [('John', 'B-PER')] | ||
| >>> iob_encoder.encode(["Mayer", "__END_PER__", "said"]) | ||
| >>> input_tokens_partial = ["Mayer", "__END_PER__", "said"] | ||
| >>> tokens = iob_encoder.encode(input_tokens_partial) | ||
| >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens_partial)] | ||
| >>> tokens | ||
| [('Mayer', 'I-PER'), ('said', 'O')] | ||
|
|
||
| To reset internal state, use ``reset method``:: | ||
|
|
@@ -36,7 +44,7 @@ class IobEncoder(object): | |
|
|
||
| Group results to entities:: | ||
|
|
||
| >>> iob_encoder.group(iob_encoder.encode(input_tokens)) | ||
| >>> iob_encoder.group([p for p in IobEncoder.from_indicies(iob_encoder.encode(input_tokens), input_tokens)]) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This pattern is repeated in test cases only, we can define a function in tests code. |
||
| [(['hello'], 'O'), (['John', 'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')] | ||
|
|
||
| Input token stream is processed by ``InputTokenProcessor()`` by default; | ||
|
|
@@ -53,7 +61,7 @@ def reset(self): | |
| self.tag = 'O' | ||
|
|
||
| def iter_encode(self, input_tokens): | ||
| for token in input_tokens: | ||
| for number, token in enumerate(input_tokens): | ||
| token_type, value = self.token_processor.classify(token) | ||
|
|
||
| if token_type == 'start': | ||
|
|
@@ -68,7 +76,7 @@ def iter_encode(self, input_tokens): | |
| self.tag = "O" | ||
|
|
||
| elif token_type == 'token': | ||
| yield token, self.tag | ||
| yield number, self.tag | ||
| if self.tag[0] == 'B': | ||
| self.tag = "I" + self.tag[1:] | ||
|
|
||
|
|
@@ -81,13 +89,14 @@ def iter_encode(self, input_tokens): | |
| def encode(self, input_tokens): | ||
| return list(self.iter_encode(input_tokens)) | ||
|
|
||
| def encode_split(self, input_tokens): | ||
| """ The same as ``encode``, but returns ``(tokens, tags)`` tuple """ | ||
| res = self.encode(input_tokens) | ||
| if not res: | ||
| return (), () | ||
| tokens, tags = zip(*res) | ||
| return list(tokens), list(tags) | ||
| def split(self, tokens): | ||
| """ split ``[(token, tag)]`` to ``([token], [tags])`` tuple """ | ||
| return [t[0] for t in tokens], [t[1] for t in tokens] | ||
|
|
||
| @classmethod | ||
| def from_indicies(Cls, indicies, input_tokens): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| for idx, tag in indicies: | ||
| yield input_tokens[idx], tag | ||
|
|
||
| @classmethod | ||
| def group(cls, data, strict=False): | ||
|
|
@@ -186,4 +195,3 @@ def classify(self, token): | |
|
|
||
| # regular token | ||
| return 'token', token | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's clarify if we're talking about bytes positions or unicode positions