-
Notifications
You must be signed in to change notification settings - Fork 59
Letter-precise html tokenization #49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
36d56f2
2d4d2ef
80658ca
c52e449
8178776
51c0932
1a667ec
24465b1
06befbb
e5730b2
e340444
89673c1
7c45984
46fc4df
388170e
71caf61
37d7470
e93c6dc
e02c275
f26569f
d1aecbb
35a9d88
9033188
c14f363
a071cd4
a33f564
75a9698
4729323
943a44e
ba7d6fe
b72bcc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,18 +16,22 @@ | |
| import six | ||
| from six.moves import zip | ||
|
|
||
| from lxml.etree import XPathEvaluator, Comment | ||
| from lxml.etree import Comment, iterwalk | ||
|
|
||
| from webstruct.sequence_encoding import IobEncoder | ||
| from webstruct.text_tokenizers import tokenize | ||
| from webstruct.text_tokenizers import tokenize, TextToken | ||
| from webstruct.utils import ( | ||
| replace_html_tags, | ||
| kill_html_tags, | ||
| smart_join, | ||
| ) | ||
|
|
||
|
|
||
| _HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail') | ||
| _HtmlToken = namedtuple('HtmlToken', ['index', | ||
| 'tokens', | ||
| 'elem', | ||
| 'is_tail', | ||
| 'position', | ||
| 'length']) | ||
|
|
||
|
|
||
| class HtmlToken(_HtmlToken): | ||
|
|
@@ -41,6 +45,8 @@ class HtmlToken(_HtmlToken): | |
| * :attr:`elem` is the current html block (as lxml's Element) - most | ||
| likely you want :attr:`parent` instead of it | ||
| * :attr:`is_tail` flag indicates that token belongs to element tail | ||
| * :attr:`position` is the position of the token start in the parent text | ||
| * :attr:`length` is the length of the token in the parent text | ||
|
|
||
| Computed properties: | ||
|
|
||
|
|
@@ -64,8 +70,10 @@ def root(self): | |
| return self.elem.getroottree() | ||
|
|
||
| def __repr__(self): | ||
| return "HtmlToken(token=%r, parent=%r, index=%s)" % ( | ||
| self.token, self.parent, self.index | ||
| return ("HtmlToken(" | ||
| "token=%r, parent=%r, index=%s, position=%d, length=%d" | ||
| ")") % ( | ||
| self.token, self.parent, self.index, self.position, self.length | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -85,7 +93,8 @@ class HtmlTokenizer(object): | |
| ---------- | ||
|
|
||
| tagset : set, optional | ||
| A set of entity types to keep. If not passed, all entity types are kept. | ||
| A set of entity types to keep. | ||
| If not passed, all entity types are kept. | ||
| Use this argument to discard some entity types from training data. | ||
| sequence_encoder : object, optional | ||
| Sequence encoder object. If not passed, | ||
|
|
@@ -142,7 +151,7 @@ def tokenize_single(self, tree): | |
| >>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>") | ||
| >>> html_tokens, tags = html_tokenizer.tokenize_single(tree) | ||
| >>> html_tokens | ||
| [HtmlToken(token='hello', parent=<Element p at ...>, index=0), HtmlToken...] | ||
| [HtmlToken(token='hello', parent=<Element p at ...>, index=0, ...), HtmlToken...] | ||
| >>> tags | ||
| ['O', 'B-PER', 'I-PER', 'B-PER', 'O'] | ||
| >>> for tok, iob_tag in zip(html_tokens, tags): | ||
|
|
@@ -180,6 +189,8 @@ def detokenize_single(self, html_tokens, tags): | |
| Build annotated ``lxml.etree.ElementTree`` from | ||
| ``html_tokens`` (a list of :class:`.HtmlToken` instances) | ||
| and ``tags`` (a list of their tags). | ||
| **ATTENTION**: ``html_tokens`` should be tokenized from a tree | ||
| without annotation tags | ||
|
|
||
| Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__`` | ||
| text tokens (this is the format :mod:`webstruct.loaders` use). | ||
|
|
@@ -190,9 +201,7 @@ def detokenize_single(self, html_tokens, tags): | |
| if not html_tokens: | ||
| return None | ||
|
|
||
| orig_tree = html_tokens[0].root | ||
| tree = copy.deepcopy(orig_tree) | ||
| xpatheval = XPathEvaluator(tree) | ||
| tree = html_tokens[0].root | ||
|
|
||
| # find starts/ends of token groups | ||
| token_groups = self.sequence_encoder.group(zip(html_tokens, tags)) | ||
|
|
@@ -206,30 +215,47 @@ def detokenize_single(self, html_tokens, tags): | |
| pos += n_tokens | ||
|
|
||
| # mark starts/ends with special tokens | ||
| data = zip(html_tokens, tags, range(len(html_tokens))) | ||
| keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail) | ||
| data = [(s, True) for s in starts] | ||
| data.extend((s, False) for s in ends) | ||
| keyfunc = lambda rec: (id(html_tokens[rec[0]].elem), html_tokens[rec[0]].is_tail) | ||
| data.sort(key=keyfunc) | ||
|
|
||
| for (orig_elem, is_tail), g in groupby(data, keyfunc): | ||
| for (_, is_tail), g in groupby(data, keyfunc): | ||
| g = list(g) | ||
| fix = False | ||
| tokens = g[0][0].tokens[:] | ||
| for token, tag, token_idx in g: | ||
| if token_idx in starts: | ||
| text = ' __START_%s__ %s' % (tag[2:], tokens[token.index]) | ||
| tokens[token.index] = text | ||
| fix = True | ||
| if token_idx in ends: | ||
| text = '%s __END_%s__ ' % (tokens[token.index], tag[2:]) | ||
| tokens[token.index] = text | ||
| fix = True | ||
|
|
||
| if fix: | ||
| xpath = orig_tree.getpath(orig_elem) | ||
| elem = xpatheval(xpath)[0] | ||
| if is_tail: | ||
| elem.tail = smart_join(tokens) | ||
| g.sort(key=lambda t: (html_tokens[t[0]].position, not t[1])) | ||
|
|
||
| if not g: | ||
| continue | ||
|
|
||
| elem = html_tokens[g[0][0]].elem | ||
|
|
||
| pos_in_source = 0 | ||
| source = elem.text | ||
| if is_tail: | ||
| source = elem.tail | ||
|
|
||
| modded = '' | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please use a list here, and join it at the end? The current code avoids O(N^2) behavior only on CPython; on other implementations repeated string concatenation is quadratic. |
||
|
|
||
| for idx, is_starts in g: | ||
| token = html_tokens[idx] | ||
| tag = tags[idx] | ||
| modded = modded + source[pos_in_source:token.position] | ||
| pos_in_source = token.position | ||
| if is_starts: | ||
| patch = ' __START_%s__ ' % (tag[2:],) | ||
| modded = modded + patch | ||
| else: | ||
| elem.text = smart_join(tokens) | ||
| end_in_source = pos_in_source + token.length | ||
| modded = modded + source[pos_in_source:end_in_source] | ||
| pos_in_source = pos_in_source + token.length | ||
| patch = ' __END_%s__ ' % (tag[2:],) | ||
| modded = modded + patch | ||
|
|
||
| modded = modded + source[pos_in_source:] | ||
| if is_tail: | ||
| elem.tail = modded | ||
| else: | ||
| elem.text = modded | ||
|
|
||
| return tree | ||
|
|
||
|
|
@@ -245,18 +271,35 @@ def _process_tree(self, tree): | |
| return | ||
|
|
||
| head_tokens, head_tags = self._tokenize_and_split(tree.text) | ||
| char_tokens = [t.chars for t in head_tokens] | ||
| for index, (token, tag) in enumerate(zip(head_tokens, head_tags)): | ||
| yield HtmlToken(index, head_tokens, tree, False), tag | ||
| yield HtmlToken(index, | ||
| char_tokens, | ||
| tree, | ||
| False, | ||
| token.position, | ||
| token.length), tag | ||
|
|
||
| for child in tree: # where is my precious "yield from"? | ||
| for html_token, tag in self._process_tree(child): | ||
| yield html_token, tag | ||
|
|
||
| tail_tokens, tail_tags = self._tokenize_and_split(tree.tail) | ||
| char_tokens = [t.chars for t in tail_tokens] | ||
| for index, (token, tag) in enumerate(zip(tail_tokens, tail_tags)): | ||
| yield HtmlToken(index, tail_tokens, tree, True), tag | ||
| yield HtmlToken(index, | ||
| char_tokens, | ||
| tree, | ||
| True, | ||
| token.position, | ||
| token.length), tag | ||
|
|
||
| def cleanup_tree(self, tree): | ||
| cleaned = copy.deepcopy(tree) | ||
| for _, elem in iterwalk(cleaned): | ||
| self._cleanup_elem(elem) | ||
|
|
||
| self._cleanup_elem(tree) | ||
| return cleaned | ||
|
|
||
| def _cleanup_elem(self, elem): | ||
| """ Remove special tokens from elem """ | ||
|
|
@@ -266,16 +309,23 @@ def _cleanup_elem(self, elem): | |
| elem.tail = self._tag_re.sub("", elem.tail) | ||
|
|
||
| def _tokenize_and_split(self, text): | ||
| input_tokens = self._limit_tags(self.text_tokenize_func(text or '')) | ||
| input_tokens = map(six.text_type, input_tokens) | ||
| return self.sequence_encoder.encode_split(input_tokens) | ||
| text = text or '' | ||
| input_tokens = [t for t in self.text_tokenize_func(text)] | ||
| input_tokens = self._limit_tags(input_tokens) | ||
| input_tokens = [TextToken(chars=six.text_type(t.chars), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unicode doesn't look right here; if t.chars is unicode (str in Python 3) then conversion is not needed; if t.chars is bytes, then conversion should use a proper encoding, not sys.getdefaultencoding() which is often ascii
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This conversion is the same as it was before
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One of tests awaits unicode
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All real encoding/decoding is handled by lxml. lxml uses utf-8 as its internal representation. I think we can add a test with real unicode and safely remove this conversion
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I finally recalled why do we have such code! In Python 2.x lxml returns bytes for ASCII-only data and unicode for non-ascii data; this code ensures everything is unicode. It is only active for ascii-only bytes in Python 2.x, and no-op in all other cases, so it works as intended. Sorry for a false alarm. |
||
| position=t.position, | ||
| length=t.length) for t in input_tokens] | ||
| chains = self.sequence_encoder.encode(t.chars for t in input_tokens) | ||
| chains = self.sequence_encoder.from_indicies(chains, input_tokens) | ||
| chains = [l for l in chains] | ||
| return self.sequence_encoder.split(chains) | ||
|
|
||
| def _limit_tags(self, input_tokens): | ||
| if self.tagset is None: | ||
| return input_tokens | ||
|
|
||
| proc = self.sequence_encoder.token_processor | ||
| token_classes = [proc.classify(tok) for tok in input_tokens] | ||
| token_classes = [proc.classify(tok.chars) for tok in input_tokens] | ||
| return [ | ||
| tok for (tok, (typ, value)) in zip(input_tokens, token_classes) | ||
| if not (typ in {'start', 'end'} and value not in self.tagset) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,23 +11,31 @@ class IobEncoder(object): | |
|
|
||
| >>> iob_encoder = IobEncoder() | ||
| >>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"] | ||
| >>> iob_encoder.encode(input_tokens) | ||
| >>> [p for p in IobEncoder.from_indicies(iob_encoder.encode(input_tokens), input_tokens)] | ||
| [('John', 'B-PER'), ('said', 'O')] | ||
|
|
||
| Get the result in another format using ``encode_split`` method:: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| >>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"] | ||
| >>> tokens, tags = iob_encoder.encode_split(input_tokens) | ||
| >>> tokens = iob_encoder.encode(input_tokens) | ||
| >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens)] | ||
| >>> tokens, tags = iob_encoder.split(tokens) | ||
| >>> tokens, tags | ||
| (['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O']) | ||
|
|
||
| Note that IobEncoder is stateful. This means you can encode incomplete | ||
| stream and continue the encoding later:: | ||
|
|
||
| >>> iob_encoder = IobEncoder() | ||
| >>> iob_encoder.encode(["__START_PER__", "John"]) | ||
| >>> input_tokens_partial = ["__START_PER__", "John"] | ||
| >>> tokens = iob_encoder.encode(input_tokens_partial) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. .encode method no longer returns tokens, so it could be better to rename the variable. |
||
| >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens_partial)] | ||
| >>> tokens | ||
| [('John', 'B-PER')] | ||
| >>> iob_encoder.encode(["Mayer", "__END_PER__", "said"]) | ||
| >>> input_tokens_partial = ["Mayer", "__END_PER__", "said"] | ||
| >>> tokens = iob_encoder.encode(input_tokens_partial) | ||
| >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens_partial)] | ||
| >>> tokens | ||
| [('Mayer', 'I-PER'), ('said', 'O')] | ||
|
|
||
| To reset internal state, use ``reset method``:: | ||
|
|
@@ -36,7 +44,7 @@ class IobEncoder(object): | |
|
|
||
| Group results to entities:: | ||
|
|
||
| >>> iob_encoder.group(iob_encoder.encode(input_tokens)) | ||
| >>> iob_encoder.group([p for p in IobEncoder.from_indicies(iob_encoder.encode(input_tokens), input_tokens)]) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This pattern is repeated in test cases only, we can define a function in tests code. |
||
| [(['hello'], 'O'), (['John', 'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')] | ||
|
|
||
| Input token stream is processed by ``InputTokenProcessor()`` by default; | ||
|
|
@@ -53,7 +61,7 @@ def reset(self): | |
| self.tag = 'O' | ||
|
|
||
| def iter_encode(self, input_tokens): | ||
| for token in input_tokens: | ||
| for number, token in enumerate(input_tokens): | ||
| token_type, value = self.token_processor.classify(token) | ||
|
|
||
| if token_type == 'start': | ||
|
|
@@ -68,7 +76,7 @@ def iter_encode(self, input_tokens): | |
| self.tag = "O" | ||
|
|
||
| elif token_type == 'token': | ||
| yield token, self.tag | ||
| yield number, self.tag | ||
| if self.tag[0] == 'B': | ||
| self.tag = "I" + self.tag[1:] | ||
|
|
||
|
|
@@ -81,13 +89,14 @@ def iter_encode(self, input_tokens): | |
| def encode(self, input_tokens): | ||
| return list(self.iter_encode(input_tokens)) | ||
|
|
||
| def encode_split(self, input_tokens): | ||
| """ The same as ``encode``, but returns ``(tokens, tags)`` tuple """ | ||
| res = self.encode(input_tokens) | ||
| if not res: | ||
| return (), () | ||
| tokens, tags = zip(*res) | ||
| return list(tokens), list(tags) | ||
| def split(self, tokens): | ||
| """ split ``[(token, tag)]`` to ``([token], [tags])`` tuple """ | ||
| return [t[0] for t in tokens], [t[1] for t in tokens] | ||
|
|
||
| @classmethod | ||
| def from_indicies(Cls, indicies, input_tokens): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| for idx, tag in indicies: | ||
| yield input_tokens[idx], tag | ||
|
|
||
| @classmethod | ||
| def group(cls, data, strict=False): | ||
|
|
@@ -186,4 +195,3 @@ def classify(self, token): | |
|
|
||
| # regular token | ||
| return 'token', token | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's clarify if we're talking about bytes positions or unicode positions