Skip to content

Commit 5d81c12

Browse files
gabrielgradinaru
authored and pgaref committed
Added some error handling (#25)
* Proxy timeout - error handling * Mocked proxy websites in tests
1 parent 5621d8a commit 5d81c12

8 files changed

Lines changed: 215 additions & 31 deletions

File tree

http_request_randomizer/requests/parsers/FreeProxyParser.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,12 +10,12 @@
1010

1111

1212
class FreeProxyParser(UrlParser):
13-
def __init__(self, web_url):
14-
UrlParser.__init__(self, web_url)
13+
def __init__(self, web_url, timeout=None):
14+
UrlParser.__init__(self, web_url, timeout)
1515

1616
def parse_proxyList(self):
1717
curr_proxy_list = []
18-
content = requests.get(self.get_URl()).content
18+
content = requests.get(self.get_URl(), timeout=self.timeout).content
1919
soup = BeautifulSoup(content, "html.parser")
2020
table = soup.find("table", attrs={"class": "display fpltable"})
2121

@@ -25,7 +25,8 @@ def parse_proxyList(self):
2525
datasets = []
2626
for row in table.find_all("tr")[1:]:
2727
dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
28-
datasets.append(dataset)
28+
if dataset:
29+
datasets.append(dataset)
2930

3031
for dataset in datasets:
3132
# Check Field[0] for tags and field[1] for values!

http_request_randomizer/requests/parsers/ProxyForEuParser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010

1111

1212
class ProxyForEuParser(UrlParser):
13-
def __init__(self, web_url, bandwithdh=None):
14-
UrlParser.__init__(self, web_url, bandwithdh)
13+
def __init__(self, web_url, bandwithdh=None, timeout=None):
14+
UrlParser.__init__(self, web_url, bandwithdh, timeout)
1515

1616
def parse_proxyList(self):
1717
curr_proxy_list = []
18-
content = requests.get(self.get_URl()).content
18+
content = requests.get(self.get_URl(), timeout=self.timeout).content
1919
soup = BeautifulSoup(content, "html.parser")
2020
table = soup.find("table", attrs={"class": "proxy_list"})
2121

http_request_randomizer/requests/parsers/RebroWeeblyParser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@
1010

1111

1212
class RebroWeeblyParser(UrlParser):
13-
def __init__(self, web_url):
13+
def __init__(self, web_url, timeout=None):
1414
self.top_proxy_path = "proxy-list.html"
1515
self.txt_proxy_path = "txt-lists.html"
16-
UrlParser.__init__(self, web_url)
16+
UrlParser.__init__(self, web_url, timeout)
1717

1818
def parse_proxyList(self, use_top15k=False):
1919
curr_proxy_list = []
20-
content = requests.get(self.get_URl()+"/"+self.top_proxy_path).content
20+
content = requests.get(self.get_URl()+"/"+self.top_proxy_path, timeout=self.timeout).content
2121
soup = BeautifulSoup(content, "html.parser")
2222
table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"}).find('font', attrs={
2323
'color': '#33a27f'})

http_request_randomizer/requests/parsers/SamairProxyParser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010

1111

1212
class SamairProxyParser(UrlParser):
13-
def __init__(self, web_url):
14-
UrlParser.__init__(self, web_url)
13+
def __init__(self, web_url, timeout=None):
14+
UrlParser.__init__(self, web_url, timeout)
1515

1616
def parse_proxyList(self):
1717
curr_proxy_list = []
18-
content = requests.get(self.get_URl()).content
18+
content = requests.get(self.get_URl(), timeout=self.timeout).content
1919
soup = BeautifulSoup(content, "html.parser")
2020
# css provides the port number so we reverse it
2121
# for href in soup.findAll('link'):

http_request_randomizer/requests/parsers/UrlParser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ class UrlParser(object):
1515
minimum_bandwidth_in_KBs (to avoid straggling proxies when having the extra info from proxy provider)
1616
"""
1717

18-
def __init__(self, web_url, bandwidthKBs=None):
18+
def __init__(self, web_url, bandwidthKBs=None, timeout=None):
1919
self.url = web_url
20+
self.timeout = timeout
2021
if bandwidthKBs is not None:
2122
self.minimum_bandwidth_in_KBs = bandwidthKBs
2223
else:

http_request_randomizer/requests/proxy/requestProxy.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828

2929
class RequestProxy:
30-
def __init__(self, web_proxy_list=[], sustain=False):
30+
def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
3131
self.userAgent = UserAgentManager()
3232
self.logger = logging.getLogger()
3333
self.logger.addHandler(handler)
@@ -37,10 +37,10 @@ def __init__(self, web_proxy_list=[], sustain=False):
3737
# Each of the classes below implements a specific URL Parser
3838
#####
3939
parsers = list([])
40-
parsers.append(FreeProxyParser('http://free-proxy-list.net'))
41-
parsers.append(ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0))
42-
parsers.append(RebroWeeblyParser('http://rebro.weebly.com'))
43-
# parsers.append(SamairProxyParser('http://samair.ru/proxy/time-01.htm'))
40+
parsers.append(FreeProxyParser('http://free-proxy-list.net', timeout=timeout))
41+
parsers.append(ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0, timeout=timeout))
42+
parsers.append(RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout))
43+
parsers.append(SamairProxyParser('http://samair.ru/proxy/time-01.htm', timeout=timeout))
4444

4545
self.logger.debug("=== Initialized Proxy Parsers ===")
4646
for i in range(len(parsers)):
@@ -51,7 +51,10 @@ def __init__(self, web_proxy_list=[], sustain=False):
5151
self.parsers = parsers
5252
self.proxy_list = web_proxy_list
5353
for i in range(len(parsers)):
54-
self.proxy_list += parsers[i].parse_proxyList()
54+
try:
55+
self.proxy_list += parsers[i].parse_proxyList()
56+
except ReadTimeout:
57+
self.logger.warn("Proxy Parser: '{}' TimedOut!".format(parsers[i].url))
5558
self.current_proxy = self.randomize_proxy()
5659

5760
def set_logger_level(self, level):

tests/mocks.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
from httmock import urlmatch
2+
3+
4+
free_proxy_expected = ['http://138.197.136.46:3128', 'http://177.207.75.227:8080']
5+
proxy_for_eu_expected = ['http://107.151.136.222:80', 'http://37.187.253.39:8115']
6+
rebro_weebly_expected = ['http://213.149.105.12:8080', 'http://119.188.46.42:8080']
7+
samair_expected = ['http://191.252.61.28:80', 'http://167.114.203.141:8080']
8+
9+
@urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$')
10+
def free_proxy_mock(url, request):
11+
return """<table border="0" cellpadding="0" cellspacing="0" class="display fpltable"
12+
id="proxylisttable">\n
13+
<thead>\n
14+
<tr>\n
15+
<th>IP Address</th>
16+
\n
17+
<th>Port</th>
18+
\n
19+
<th>Code</th>
20+
\n
21+
<th>Country</th>
22+
\n
23+
<th>Anonymity</th>
24+
\n
25+
<th>Google</th>
26+
\n
27+
<th>Https</th>
28+
\n
29+
<th>Last Checked</th>
30+
\n
31+
</tr>
32+
\n
33+
</thead>
34+
\n
35+
<tbody>
36+
<tr>
37+
<td>138.197.136.46</td>
38+
<td>3128</td>
39+
<td>CA</td>
40+
<td>Canada</td>
41+
<td>anonymous</td>
42+
<td>no</td>
43+
<td>no</td>
44+
<td>7 seconds ago</td>
45+
</tr>
46+
\n
47+
<tr>
48+
<td>177.207.75.227</td>
49+
<td>8080</td>
50+
<td>BR</td>
51+
<td>Brazil</td>
52+
<td>transparent</td>
53+
<td>no</td>
54+
<td>no</td>
55+
<td>2 hours 21 minutes ago</td>
56+
</tr>
57+
\n
58+
</tbody>
59+
\n
60+
<tfoot>\n
61+
<tr>\n
62+
<th class="input"><input type="text"/></th>
63+
\n
64+
<th></th>
65+
\n
66+
<th></th>
67+
\n
68+
<th></th>
69+
\n
70+
<th></th>
71+
\n
72+
<th></th>
73+
\n
74+
<th></th>
75+
\n
76+
<th></th>
77+
\n
78+
</tr>
79+
\n
80+
</tfoot>
81+
\n
82+
</table>"""
83+
84+
85+
@urlmatch(netloc=r'(.*\.)?proxyfor\.eu')
86+
def proxy_for_eu_mock(url, request):
87+
return """<table class="proxy_list">
88+
<tr>
89+
<th>IP</th>
90+
<th>Port</th>
91+
<th>Country</th>
92+
<th>Anon</th>
93+
<th>Speed</th>
94+
<th> Check</th>
95+
<th>Cookie/POST</th>
96+
</tr>
97+
<tr>
98+
<td>107.151.136.222</td>
99+
<td>80</td>
100+
<td>United States</td>
101+
<td>HIGH</td>
102+
<td>1.643</td>
103+
<td>2016-04-12 17:02:43</td>
104+
<td>Yes/Yes</td>
105+
</tr>
106+
<tr>
107+
<td>37.187.253.39</td>
108+
<td>8115</td>
109+
<td>France</td>
110+
<td>HIGH</td>
111+
<td>12.779</td>
112+
<td>2016-04-12 14:36:18</td>
113+
<td>Yes/Yes</td>
114+
</tr>
115+
</table>"""
116+
117+
118+
@urlmatch(netloc=r'(.*\.)?rebro\.weebly\.com$')
119+
def rebro_weebly_mock(url, request):
120+
return """<div class="paragraph" style="text-align:left;"><strong><font color="#3ab890"
121+
size="3"><font
122+
color="#d5d5d5">IP:Port</font></font></strong><br/><font
123+
size="2"><strong><font color="#33a27f">213.149.105.12:8080<br/>119.188.46.42:8080</font></strong></font><br/><span></span>
124+
</div>"""
125+
126+
127+
@urlmatch(netloc=r'(.*\.)?www.samair.ru')
128+
def samair_mock(url, request):
129+
return """<table id="proxylist">\n
130+
<tr class="list_sorted">\n
131+
<th><a href="http://samair.ru/proxy/ip-address-01.htm"
132+
title="Proxy List sorted by ip address">IP address</a></th>
133+
\n
134+
<th><a href="http://samair.ru/proxy/proxy-01.htm"
135+
title="Proxy List sorted by anonymity level">Anonymity level</a>
136+
</th>
137+
\n
138+
<th><a href="http://samair.ru/proxy/time-01.htm"
139+
title="Proxy List sorted by updated time">Checked time</a></th>
140+
\n
141+
<th><a href="http://samair.ru/proxy/type-01.htm"
142+
title="Proxy list sorted by country">Country</a></th>
143+
\n
144+
<th><dfn title="City or State\\Region ">City</dfn></th>
145+
\n
146+
<th><dfn title="Internet Service Provider">ISP</dfn></th>
147+
\n
148+
</tr>
149+
\n
150+
<tr class="elite">
151+
<td>191.252.61.28:80</td>
152+
<td>high-anonymous</td>
153+
<td>Apr-18, 17:18</td>
154+
<td>Brazil</td>
155+
<td>S\xe3o Jos\xe9 Dos Campos</td>
156+
<td><dfn title="Locaweb Servi\xe7os de Internet S/A">Locaweb
157+
Servi\xe7o...</dfn></td>
158+
</tr>
159+
\n
160+
<tr class="transp">
161+
<td>167.114.203.141:8080</td>
162+
<td>transparent</td>
163+
<td>Apr-18, 13:22</td>
164+
<td>Canada</td>
165+
<td>Montr\xe9al (QC)</td>
166+
<td>OVH Hosting</td>
167+
</tr>
168+
\n
169+
</table>"""

tests/test_providers.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@
33
import unittest
44
import sys
55
import os
6+
from httmock import HTTMock
67

78
sys.path.insert(0, os.path.abspath('.'))
89

10+
from tests.mocks import free_proxy_mock, proxy_for_eu_mock, rebro_weebly_mock, samair_mock
11+
from tests.mocks import free_proxy_expected, proxy_for_eu_expected, rebro_weebly_expected, samair_expected
912
from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
1013
from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
1114
from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
@@ -15,23 +18,30 @@
1518

1619

1720
class TestProxyProviders(unittest.TestCase):
18-
# def setUp(self):
1921

2022
def test_FreeProxyParser(self):
21-
proxy_provider = FreeProxyParser('http://free-proxy-list.net')
22-
proxy_provider.parse_proxyList()
23+
with HTTMock(free_proxy_mock):
24+
proxy_provider = FreeProxyParser('http://free-proxy-list.net')
25+
proxy_list = proxy_provider.parse_proxyList()
26+
self.assertEqual(proxy_list, free_proxy_expected)
2327

2428
def test_ProxyForEuParser(self):
25-
proxy_provider = ProxyForEuParser('http://proxyfor.eu/geo.php')
26-
proxy_provider.parse_proxyList()
29+
with HTTMock(proxy_for_eu_mock):
30+
proxy_provider = ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0)
31+
proxy_list = proxy_provider.parse_proxyList()
32+
self.assertEqual(proxy_list, proxy_for_eu_expected)
2733

2834
def test_RebroWeeblyParser(self):
29-
proxy_provider = RebroWeeblyParser('http://rebro.weebly.com')
30-
proxy_provider.parse_proxyList()
31-
32-
# def test_SemairProxyParser(self):
33-
# proxy_provider = SamairProxyParser('http://www.samair.ru/proxy/time-01.htm')
34-
# proxy_provider.parse_proxyList()
35+
with HTTMock(rebro_weebly_mock):
36+
proxy_provider = RebroWeeblyParser('http://rebro.weebly.com')
37+
proxy_list = proxy_provider.parse_proxyList()
38+
self.assertEqual(proxy_list, rebro_weebly_expected)
39+
40+
def test_SemairProxyParser(self):
41+
with HTTMock(samair_mock):
42+
proxy_provider = SamairProxyParser('http://www.samair.ru/proxy/time-01.htm')
43+
proxy_list = proxy_provider.parse_proxyList()
44+
self.assertEqual(proxy_list, samair_expected)
3545

3646

3747
if __name__ == '__main__':

0 commit comments

Comments
 (0)