-
Notifications
You must be signed in to change notification settings - Fork 139
Expand file tree
/
Copy path__init__.py
More file actions
85 lines (73 loc) · 2.73 KB
/
__init__.py
File metadata and controls
85 lines (73 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import jpype
import requests
import socket
import charade
import threading
# Set the default timeout (in seconds) applied to newly created sockets.
# This is a process-wide setting, not limited to this module.
socket.setdefaulttimeout(15)

# Module-level lock shared by every Extractor; the constructor holds it while
# it resolves the boilerpipe extractor class.
lock = threading.Lock()

# Java classes resolved once, at import time, by fully-qualified name.
# NOTE(review): presumably the JVM has to be started (with the boilerpipe jars
# on its classpath) before this module is imported -- confirm where that happens.
InputSource = jpype.JClass('org.xml.sax.InputSource')
StringReader = jpype.JClass('java.io.StringReader')
HTMLHighlighter = jpype.JClass('de.l3s.boilerpipe.sax.HTMLHighlighter')
BoilerpipeSAXInput = jpype.JClass('de.l3s.boilerpipe.sax.BoilerpipeSAXInput')
class Extractor(object):
    """
    Extract text. Constructor takes 'extractor' as a keyword argument,
    being one of the boilerpipe extractors:
    - DefaultExtractor
    - ArticleExtractor
    - ArticleSentencesExtractor
    - KeepEverythingExtractor
    - KeepEverythingWithMinKWordsExtractor
    - LargestContentExtractor
    - NumWordsRulesExtractor
    - CanolaExtractor

    The document to process is passed as one of two keyword arguments:
    - url:  the page is fetched with an HTTP GET (10 second timeout) using
            the class-level ``headers``; the decoded response body is used.
    - html: the markup itself, either as text or as an encoded byte string
            (byte strings are decoded with a detected encoding).
    When both are given and non-empty, ``url`` wins.

    Raises ValueError (a subclass of Exception) when neither ``url`` nor
    ``html`` is given, or when the given values are empty.
    """
    extractor = None  # boilerpipe extractor singleton selected in __init__
    source = None     # text document built from ``data`` and processed
    data = None       # the input markup, always held as text
    headers = {'User-Agent': 'Mozilla/5.0'}  # sent with the ``url`` request

    def __init__(self, extractor='DefaultExtractor', **kwargs):
        """Load the document, pick the extractor and run it on the document."""
        if kwargs.get('url'):
            response = requests.request(
                'GET', kwargs['url'], headers=self.headers, timeout=10)
            self.data = response.text
        elif kwargs.get('html'):
            self.data = self._to_text(kwargs['html'])
        else:
            raise ValueError('No text or url provided')

        # make it thread-safe: when other threads are alive, make sure the
        # current one is attached to the JVM before calling into Java.
        if threading.active_count() > 1 and not jpype.isThreadAttachedToJVM():
            jpype.attachThreadToJVM()

        # The lock is taken only around the lookup it protects. Using ``with``
        # guarantees it is released exactly when it was acquired, so a failure
        # while attaching the thread can no longer trigger a release of a lock
        # that was never taken (which would hide the original error).
        with lock:
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors." + extractor).INSTANCE

        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)

    @staticmethod
    def _to_text(data):
        """Return ``data`` as text, decoding byte strings if necessary."""
        text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3
        if isinstance(data, text_type):
            return data
        # The detector reports ``None`` when it cannot guess an encoding;
        # fall back to UTF-8 instead of passing ``None`` as the codec name.
        encoding = charade.detect(data)['encoding'] or 'utf-8'
        return text_type(data, encoding)

    def getText(self):
        """Return the content of the processed document."""
        return self.source.getContent()

    def getHTML(self):
        """Return the result of running an extracting HTMLHighlighter over
        the processed document and the original markup."""
        highlighter = HTMLHighlighter.newExtractingInstance()
        return highlighter.process(self.source, self.data)

    def getTitle(self):
        """Return the title of the processed document."""
        return self.source.getTitle()

    def getImages(self):
        """Return the images found by boilerpipe's ImageExtractor.

        The Java results are sorted with ``java.util.Collections.sort``
        (their natural ordering) and converted to a list of dicts with the
        keys 'src', 'width', 'height', 'alt' and 'area'.
        """
        image_extractor = jpype.JClass(
            "de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
        images = image_extractor.process(self.source, self.data)
        jpype.java.util.Collections.sort(images)
        return [
            {
                'src': image.getSrc(),
                'width': image.getWidth(),
                'height': image.getHeight(),
                'alt': image.getAlt(),
                'area': image.getArea(),
            }
            for image in images
        ]