Skip to content

Commit ad7d4c7

Browse files
author
pgaref
committed
Merge remote-tracking branch 'origin/master' into develop
2 parents d47eb00 + 5d81c12 commit ad7d4c7

9 files changed

Lines changed: 233 additions & 45 deletions

File tree

http_request_randomizer/requests/parsers/FreeProxyParser.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010

1111

1212
class FreeProxyParser(UrlParser):
13-
def __init__(self, web_url):
14-
UrlParser.__init__(self, web_url)
13+
def __init__(self, web_url, timeout=None):
14+
UrlParser.__init__(self, web_url, timeout)
1515

1616
def parse_proxyList(self):
1717
curr_proxy_list = []
18-
content = requests.get(self.get_URl()).content
18+
content = requests.get(self.get_URl(), timeout=self.timeout).content
1919
soup = BeautifulSoup(content, "html.parser")
2020
table = soup.find("table", attrs={"class": "display fpltable"})
2121

@@ -25,7 +25,8 @@ def parse_proxyList(self):
2525
datasets = []
2626
for row in table.find_all("tr")[1:]:
2727
dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
28-
datasets.append(dataset)
28+
if dataset:
29+
datasets.append(dataset)
2930

3031
for dataset in datasets:
3132
# Check Field[0] for tags and field[1] for values!

http_request_randomizer/requests/parsers/ProxyForEuParser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010

1111

1212
class ProxyForEuParser(UrlParser):
13-
def __init__(self, web_url, bandwithdh=None):
14-
UrlParser.__init__(self, web_url, bandwithdh)
13+
def __init__(self, web_url, bandwithdh=None, timeout=None):
14+
UrlParser.__init__(self, web_url, bandwithdh, timeout)
1515

1616
def parse_proxyList(self):
1717
curr_proxy_list = []
18-
content = requests.get(self.get_URl()).content
18+
content = requests.get(self.get_URl(), timeout=self.timeout).content
1919
soup = BeautifulSoup(content, "html.parser")
2020
table = soup.find("table", attrs={"class": "proxy_list"})
2121

http_request_randomizer/requests/parsers/RebroWeeblyParser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@
1010

1111

1212
class RebroWeeblyParser(UrlParser):
13-
def __init__(self, web_url):
13+
def __init__(self, web_url, timeout=None):
1414
self.top_proxy_path = "proxy-list.html"
1515
self.txt_proxy_path = "txt-lists.html"
16-
UrlParser.__init__(self, web_url)
16+
UrlParser.__init__(self, web_url, timeout)
1717

1818
def parse_proxyList(self, use_top15k=False):
1919
curr_proxy_list = []
20-
content = requests.get(self.get_URl()+"/"+self.top_proxy_path).content
20+
content = requests.get(self.get_URl()+"/"+self.top_proxy_path, timeout=self.timeout).content
2121
soup = BeautifulSoup(content, "html.parser")
2222
table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"}).find('font', attrs={
2323
'color': '#33a27f'})

http_request_randomizer/requests/parsers/SamairProxyParser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010

1111

1212
class SamairProxyParser(UrlParser):
13-
def __init__(self, web_url):
14-
UrlParser.__init__(self, web_url)
13+
def __init__(self, web_url, timeout=None):
14+
UrlParser.__init__(self, web_url, timeout)
1515

1616
def parse_proxyList(self):
1717
curr_proxy_list = []
18-
content = requests.get(self.get_URl()).content
18+
content = requests.get(self.get_URl(), timeout=self.timeout).content
1919
soup = BeautifulSoup(content, "html.parser")
2020
# css provides the port number so we reverse it
2121
# for href in soup.findAll('link'):

http_request_randomizer/requests/parsers/UrlParser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ class UrlParser(object):
1515
minimum_bandwidth_in_KBs (to avoid straggling proxies when having the extra info from proxy provider)
1616
"""
1717

18-
def __init__(self, web_url, bandwidthKBs=None):
18+
def __init__(self, web_url, bandwidthKBs=None, timeout=None):
1919
self.url = web_url
20+
self.timeout = timeout
2021
if bandwidthKBs is not None:
2122
self.minimum_bandwidth_in_KBs = bandwidthKBs
2223
else:

http_request_randomizer/requests/proxy/requestProxy.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828

2929
class RequestProxy:
30-
def __init__(self, web_proxy_list=[], sustain=False):
30+
def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
3131
self.userAgent = UserAgentManager()
3232
self.logger = logging.getLogger()
3333
self.logger.addHandler(handler)
@@ -37,10 +37,10 @@ def __init__(self, web_proxy_list=[], sustain=False):
3737
# Each of the classes below implements a specific URL Parser
3838
#####
3939
parsers = list([])
40-
parsers.append(FreeProxyParser('http://free-proxy-list.net'))
41-
parsers.append(ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0))
42-
parsers.append(RebroWeeblyParser('http://rebro.weebly.com'))
43-
# parsers.append(SamairProxyParser('http://samair.ru/proxy/time-01.htm'))
40+
parsers.append(FreeProxyParser('http://free-proxy-list.net', timeout=timeout))
41+
parsers.append(ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0, timeout=timeout))
42+
parsers.append(RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout))
43+
parsers.append(SamairProxyParser('http://samair.ru/proxy/time-01.htm', timeout=timeout))
4444

4545
self.logger.debug("=== Initialized Proxy Parsers ===")
4646
for i in range(len(parsers)):
@@ -51,7 +51,10 @@ def __init__(self, web_proxy_list=[], sustain=False):
5151
self.parsers = parsers
5252
self.proxy_list = web_proxy_list
5353
for i in range(len(parsers)):
54-
self.proxy_list += parsers[i].parse_proxyList()
54+
try:
55+
self.proxy_list += parsers[i].parse_proxyList()
56+
except ReadTimeout:
57+
self.logger.warn("Proxy Parser: '{}' TimedOut!".format(parsers[i].url))
5558
self.current_proxy = self.randomize_proxy()
5659

5760
def set_logger_level(self, level):
@@ -83,7 +86,11 @@ def randomize_proxy(self):
8386
def generate_proxied_request(self, url, method="GET", params={}, data={}, headers={}, req_timeout=30):
8487
try:
8588
random.shuffle(self.proxy_list)
86-
req_headers = dict(params.items() + self.generate_random_request_headers().items())
89+
# req_headers = dict(params.items() + self.generate_random_request_headers().items())
90+
91+
req_headers = dict(params.items())
92+
req_headers_random = dict(self.generate_random_request_headers().items())
93+
req_headers.update(req_headers_random)
8794

8895
if not self.sustain:
8996
self.randomize_proxy()
@@ -100,7 +107,7 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
100107
raise ConnectionError("HTTP Response [403] - Permission denied error")
101108
elif request.status_code == 503:
102109
raise ConnectionError("HTTP Response [503] - Service unavailable error")
103-
print 'RR Status {}'.format(request.status_code)
110+
print('RR Status {}'.format(request.status_code))
104111
return request
105112
except ConnectionError:
106113
try:
@@ -132,19 +139,19 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
132139

133140
start = time.time()
134141
req_proxy = RequestProxy()
135-
print "Initialization took: {0} sec".format((time.time() - start))
136-
print "Size : ", len(req_proxy.get_proxy_list())
137-
print " ALL = ", req_proxy.get_proxy_list()
142+
print("Initialization took: {0} sec".format((time.time() - start)))
143+
print("Size: {0}".format(len(req_proxy.get_proxy_list())))
144+
print("ALL = {0} ".format(req_proxy.get_proxy_list()))
138145

139146
test_url = 'http://ipv4.icanhazip.com'
140147

141148
while True:
142149
start = time.time()
143150
request = req_proxy.generate_proxied_request(test_url)
144-
print "Proxied Request Took: {0} sec => Status: {1}".format((time.time() - start), request.__str__())
151+
print("Proxied Request Took: {0} sec => Status: {1}".format((time.time() - start), request.__str__()))
145152
if request is not None:
146-
print "\t Response: ip={0}".format(u''.join(request.text).encode('utf-8'))
147-
print "Proxy List Size: ", len(req_proxy.get_proxy_list())
153+
print("\t Response: ip={0}".format(u''.join(request.text).encode('utf-8')))
154+
print("Proxy List Size: {0}".format(len(req_proxy.get_proxy_list())))
148155

149-
print"-> Going to sleep.."
156+
print("-> Going to sleep..")
150157
time.sleep(10)

http_request_randomizer/requests/useragent/userAgent.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ def get_len_user_agent(self):
4040

4141
if __name__ == '__main__':
4242
ua = UserAgentManager()
43-
print "Number of User Agent headers: " + str(ua.get_len_user_agent)
44-
print "First User Agent in file: " + ua.get_first_user_agent()
45-
print "Last User Agent in file: " + ua.get_last_user_agent()
46-
print "If you want one random header for a request, you may use the following header:\n"
47-
print "User-Agent: " + ua.get_random_user_agent() + "\n"
43+
print("Number of User Agent headers: {0}".format(str(ua.get_len_user_agent)))
44+
print("First User Agent in file: {0}".format(ua.get_first_user_agent()))
45+
print("Last User Agent in file: {0}".format(ua.get_last_user_agent()))
46+
print("If you want one random header for a request, you may use the following header:\n")
47+
print("User-Agent: " + ua.get_random_user_agent() + "\n")

tests/mocks.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
from httmock import urlmatch
2+
3+
4+
free_proxy_expected = ['http://138.197.136.46:3128', 'http://177.207.75.227:8080']
5+
proxy_for_eu_expected = ['http://107.151.136.222:80', 'http://37.187.253.39:8115']
6+
rebro_weebly_expected = ['http://213.149.105.12:8080', 'http://119.188.46.42:8080']
7+
samair_expected = ['http://191.252.61.28:80', 'http://167.114.203.141:8080']
8+
9+
@urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$')
10+
def free_proxy_mock(url, request):
11+
return """<table border="0" cellpadding="0" cellspacing="0" class="display fpltable"
12+
id="proxylisttable">\n
13+
<thead>\n
14+
<tr>\n
15+
<th>IP Address</th>
16+
\n
17+
<th>Port</th>
18+
\n
19+
<th>Code</th>
20+
\n
21+
<th>Country</th>
22+
\n
23+
<th>Anonymity</th>
24+
\n
25+
<th>Google</th>
26+
\n
27+
<th>Https</th>
28+
\n
29+
<th>Last Checked</th>
30+
\n
31+
</tr>
32+
\n
33+
</thead>
34+
\n
35+
<tbody>
36+
<tr>
37+
<td>138.197.136.46</td>
38+
<td>3128</td>
39+
<td>CA</td>
40+
<td>Canada</td>
41+
<td>anonymous</td>
42+
<td>no</td>
43+
<td>no</td>
44+
<td>7 seconds ago</td>
45+
</tr>
46+
\n
47+
<tr>
48+
<td>177.207.75.227</td>
49+
<td>8080</td>
50+
<td>BR</td>
51+
<td>Brazil</td>
52+
<td>transparent</td>
53+
<td>no</td>
54+
<td>no</td>
55+
<td>2 hours 21 minutes ago</td>
56+
</tr>
57+
\n
58+
</tbody>
59+
\n
60+
<tfoot>\n
61+
<tr>\n
62+
<th class="input"><input type="text"/></th>
63+
\n
64+
<th></th>
65+
\n
66+
<th></th>
67+
\n
68+
<th></th>
69+
\n
70+
<th></th>
71+
\n
72+
<th></th>
73+
\n
74+
<th></th>
75+
\n
76+
<th></th>
77+
\n
78+
</tr>
79+
\n
80+
</tfoot>
81+
\n
82+
</table>"""
83+
84+
85+
@urlmatch(netloc=r'(.*\.)?proxyfor\.eu')
86+
def proxy_for_eu_mock(url, request):
87+
return """<table class="proxy_list">
88+
<tr>
89+
<th>IP</th>
90+
<th>Port</th>
91+
<th>Country</th>
92+
<th>Anon</th>
93+
<th>Speed</th>
94+
<th> Check</th>
95+
<th>Cookie/POST</th>
96+
</tr>
97+
<tr>
98+
<td>107.151.136.222</td>
99+
<td>80</td>
100+
<td>United States</td>
101+
<td>HIGH</td>
102+
<td>1.643</td>
103+
<td>2016-04-12 17:02:43</td>
104+
<td>Yes/Yes</td>
105+
</tr>
106+
<tr>
107+
<td>37.187.253.39</td>
108+
<td>8115</td>
109+
<td>France</td>
110+
<td>HIGH</td>
111+
<td>12.779</td>
112+
<td>2016-04-12 14:36:18</td>
113+
<td>Yes/Yes</td>
114+
</tr>
115+
</table>"""
116+
117+
118+
@urlmatch(netloc=r'(.*\.)?rebro\.weebly\.com$')
119+
def rebro_weebly_mock(url, request):
120+
return """<div class="paragraph" style="text-align:left;"><strong><font color="#3ab890"
121+
size="3"><font
122+
color="#d5d5d5">IP:Port</font></font></strong><br/><font
123+
size="2"><strong><font color="#33a27f">213.149.105.12:8080<br/>119.188.46.42:8080</font></strong></font><br/><span></span>
124+
</div>"""
125+
126+
127+
@urlmatch(netloc=r'(.*\.)?www.samair.ru')
128+
def samair_mock(url, request):
129+
return """<table id="proxylist">\n
130+
<tr class="list_sorted">\n
131+
<th><a href="http://samair.ru/proxy/ip-address-01.htm"
132+
title="Proxy List sorted by ip address">IP address</a></th>
133+
\n
134+
<th><a href="http://samair.ru/proxy/proxy-01.htm"
135+
title="Proxy List sorted by anonymity level">Anonymity level</a>
136+
</th>
137+
\n
138+
<th><a href="http://samair.ru/proxy/time-01.htm"
139+
title="Proxy List sorted by updated time">Checked time</a></th>
140+
\n
141+
<th><a href="http://samair.ru/proxy/type-01.htm"
142+
title="Proxy list sorted by country">Country</a></th>
143+
\n
144+
<th><dfn title="City or State\\Region ">City</dfn></th>
145+
\n
146+
<th><dfn title="Internet Service Provider">ISP</dfn></th>
147+
\n
148+
</tr>
149+
\n
150+
<tr class="elite">
151+
<td>191.252.61.28:80</td>
152+
<td>high-anonymous</td>
153+
<td>Apr-18, 17:18</td>
154+
<td>Brazil</td>
155+
<td>S\xe3o Jos\xe9 Dos Campos</td>
156+
<td><dfn title="Locaweb Servi\xe7os de Internet S/A">Locaweb
157+
Servi\xe7o...</dfn></td>
158+
</tr>
159+
\n
160+
<tr class="transp">
161+
<td>167.114.203.141:8080</td>
162+
<td>transparent</td>
163+
<td>Apr-18, 13:22</td>
164+
<td>Canada</td>
165+
<td>Montr\xe9al (QC)</td>
166+
<td>OVH Hosting</td>
167+
</tr>
168+
\n
169+
</table>"""

0 commit comments

Comments (0)