Skip to content

Commit b875087

Browse files
author
pgaref
committed
Added pagination parser for Samair/Preempt proxy, more work on ProxyObject #30, some log housekeeping
1 parent 37a844a commit b875087

8 files changed

Lines changed: 69 additions & 44 deletions

File tree

http_request_randomizer/requests/parsers/FreeProxyParser.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,15 @@ def parse_proxyList(self):
3636
datasets.append(dataset)
3737

3838
for dataset in datasets:
39-
proxy_obj = self.createProxyObject(dataset)
39+
proxy_obj = self.create_proxy_object(dataset)
4040
# Make sure it is a Valid Proxy Address
41-
if UrlParser.valid_ip_port(proxy_obj.getAddress()):
41+
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
4242
curr_proxy_list.append(proxy_obj)
43-
proxy_obj.print_everything()
4443
else:
45-
logger.debug("Address with Invalid format: {}".format(proxy_obj.getAddress()))
46-
# print "{0:<10}: {1}".format(field[0], field[1])
47-
# print "ALL: ", curr_proxy_list
48-
44+
logger.debug("Proxy Invalid: {}".format(dataset))
4945
return curr_proxy_list
5046

51-
def createProxyObject(self, dataset):
47+
def create_proxy_object(self, dataset):
5248
# Check Field[0] for tags and field[1] for values!
5349
ip = ""
5450
port = None
@@ -58,7 +54,7 @@ def createProxyObject(self, dataset):
5854
if field[0] == 'IP Address':
5955
# Make sure it is a Valid IP
6056
ip = field[1].strip() # String strip()
61-
# TODO @pgaref: Duplicate code!!!
57+
# Make sure it is a Valid IP
6258
if not UrlParser.valid_ip(ip):
6359
logger.debug("IP with Invalid format: {}".format(ip))
6460
return None

http_request_randomizer/requests/parsers/ProxyForEuParser.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,15 @@ def parse_proxyList(self):
3535
datasets.append(dataset)
3636

3737
for dataset in datasets:
38-
3938
# Avoid Straggler proxies and make sure it is a Valid Proxy Address
40-
proxy_obj = self.createProxyObject(dataset)
41-
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.getAddress()):
39+
proxy_obj = self.create_proxy_object(dataset)
40+
if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
4241
curr_proxy_list.append(proxy_obj)
43-
proxy_obj.print_everything()
44-
# print "{0:<10}: {1}".format(field[0], field[1])
45-
# print "ALL: ", curr_proxy_list
42+
else:
43+
logger.debug("Proxy Invalid: {}".format(dataset))
4644
return curr_proxy_list
4745

48-
def createProxyObject(self, dataset):
46+
def create_proxy_object(self, dataset):
4947
ip = ""
5048
port = None
5149
anonymity = AnonymityLevel.UNKNOWN
@@ -55,10 +53,10 @@ def createProxyObject(self, dataset):
5553
# Discard slow proxies! Speed is in KB/s
5654
if field[0] == 'Speed':
5755
if float(field[1]) < self.get_min_bandwidth():
56+
logger.debug("Proxy with low bandwidth: {}".format(float(field[1])))
5857
return None
5958
if field[0] == 'IP':
6059
ip = field[1].strip() # String strip()
61-
# TODO @pgaref : Dupicate code?
6260
# Make sure it is a Valid IP
6361
if not UrlParser.valid_ip(ip):
6462
logger.debug("IP with Invalid format: {}".format(ip))

http_request_randomizer/requests/parsers/RebroWeeblyParser.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ def parse_proxyList(self, use_top15k=False):
3131
# Parse Top Proxy List page
3232
for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
3333
# Make sure it is a Valid Proxy Address
34-
if UrlParser.valid_ip_port(row):
35-
proxy_obj = self.createProxyObject(row)
34+
proxy_obj = self.create_proxy_object(row)
35+
if proxy_obj is not None and UrlParser.valid_ip_port(row):
3636
curr_proxy_list.append(proxy_obj)
3737
else:
38-
logger.debug("Address with Invalid format: {}".format(row))
38+
logger.debug("Proxy Invalid: {}".format(row))
3939
# Usually these proxies are stale
4040
if use_top15k:
4141
# Parse 15k Nodes Text file (named *-all-*.txt)
@@ -49,15 +49,18 @@ def parse_proxyList(self, use_top15k=False):
4949
more_content = requests.get(self.get_URl() + self.txt_proxy_path).text
5050
for proxy_address in more_content.split():
5151
if UrlParser.valid_ip_port(proxy_address):
52-
proxy_obj = self.createProxyObject(row)
52+
proxy_obj = self.create_proxy_object(row)
5353
curr_proxy_list.append(proxy_obj)
54-
5554
return curr_proxy_list
5655

57-
def createProxyObject(self, dataset):
56+
def create_proxy_object(self, dataset):
5857
# Provider specific code
5958
dataset = dataset.strip() # String strip()
6059
ip = dataset.split(":")[0]
60+
# Make sure it is a Valid IP
61+
if not UrlParser.valid_ip(ip):
62+
logger.debug("IP with Invalid format: {}".format(ip))
63+
return None
6164
port = dataset.split(":")[1]
6265
# TODO: Parse extra tables and combine data - Provider seems to be out-of-date
6366
country = "Unknown"

http_request_randomizer/requests/parsers/SamairProxyParser.py

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@ def __init__(self, id, web_url, timeout=None):
1919
def parse_proxyList(self):
2020
curr_proxy_list = []
2121
# Parse all proxy pages -> format: /list/{num}.htm
22-
# TODO @pgaref: get the pageRange from the 'pagination' table
23-
for page in range(1, 21):
24-
response = requests.get("{0}{num:02d}.htm".format(self.get_URl(), num=page), timeout=self.timeout)
22+
# Get the pageRange from the 'pagination' table
23+
page_set = self.get_pagination_set()
24+
logger.debug("Pages: {}".format(page_set))
25+
for page in page_set:
26+
response = requests.get("{0}{1}".format(self.get_URl(), page), timeout=self.timeout)
2527
if not response.ok:
2628
# Could not parse ANY page - Let user know
2729
if not curr_proxy_list:
@@ -50,20 +52,42 @@ def parse_proxyList(self):
5052
for row in table.find_all("tr")[1:]:
5153
td_row = row.find("td")
5254
# curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
55+
proxy_obj = self.create_proxy_object(row)
5356
# Make sure it is a Valid Proxy Address
54-
if UrlParser.valid_ip_port(td_row.text):
55-
proxy_obj = self.createProxyObject(row)
56-
proxy_obj.print_everything()
57+
if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
5758
curr_proxy_list.append(proxy_obj)
5859
else:
59-
logger.debug("Address with Invalid format: {}".format(td_row.text))
60+
logger.debug("Proxy Invalid: {}".format(td_row.text))
6061
return curr_proxy_list
6162

62-
def createProxyObject(self, row):
63+
def get_pagination_set(self):
64+
response = requests.get(self.get_URl(), timeout=self.timeout)
65+
page_set = set()
66+
# Could not parse pagination page - Let user know
67+
if not response.ok:
68+
logger.warn("Proxy Provider url failed: {}".format(self.get_URl()))
69+
return page_set
70+
content = response.content
71+
soup = BeautifulSoup(content, "html.parser")
72+
for ultag in soup.find_all('ul', {'class': 'pagination'}):
73+
for litag in ultag.find_all('li'):
74+
page_ref = litag.a.get('href').decode('utf-8')
75+
# Skip current page '/list'
76+
if page_ref.endswith(('htm', 'html')):
77+
page_set.add(page_ref)
78+
else:
79+
page_set.add("")
80+
return page_set
81+
82+
def create_proxy_object(self, row):
6383
for td_row in row.findAll("td"):
6484
if td_row.attrs['data-label'] == 'IP:port ':
6585
text = td_row.text.strip()
6686
ip = text.split(":")[0]
87+
# Make sure it is a Valid IP
88+
if not UrlParser.valid_ip(ip):
89+
logger.debug("IP with Invalid format: {}".format(ip))
90+
return None
6791
port = text.split(":")[1]
6892
elif td_row.attrs['data-label'] == 'Anonymity Type: ':
6993
anonymity = AnonymityLevel(td_row.text.strip())
@@ -73,4 +97,4 @@ def createProxyObject(self, row):
7397

7498
def __str__(self):
7599
return "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
76-
.format(self.url, self.minimum_bandwidth_in_KBs)
100+
.format(self.url, self.minimum_bandwidth_in_KBs)

http_request_randomizer/requests/proxy/ProxyObject.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,20 @@ def __init__(self, source, ip, port, anonymity_level, country=None, protocols=[
2121
self.protocols = protocols
2222
self.tunnel = tunnel
2323

24-
def getAddress(self):
24+
def get_address(self):
2525
return "{0}:{1}".format(self.ip, self.port)
2626

2727
def __str__(self):
2828
""" Method is heavily used for Logging - make sure we have a readable output
2929
3030
:return: The address representation of the proxy
3131
"""
32-
return "{0} | {1}".format(self.getAddress(), self.source)
32+
return "{0} | {1}".format(self.get_address(), self.source)
3333

34-
def print_everything(self):
35-
print("Address: {0} | Src: {1} | | Country: {2} | Anonymity: {3} | Protoc: {4} | Tunnel: {5}" \
36-
.format(self.getAddress(), self.source, self.country, self.anonymity_level, self.protocols,
37-
self.tunnel))
34+
def to_str(self):
35+
return "Address: {0} | Src: {1} | | Country: {2} | Anonymity: {3} | Protoc: {4} | Tunnel: {5}"\
36+
.format(self.get_address(), self.source, self.country, self.anonymity_level, self.protocols,
37+
self.tunnel)
3838

3939

4040
class AnonymityLevel(MultiValueEnum):

http_request_randomizer/requests/proxy/requestProxy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
100100
headers.update(req_headers)
101101

102102
self.logger.debug("Using proxy: {0}".format(str(self.current_proxy)))
103-
request = requests.request(method, url, proxies={"http": self.current_proxy.getAddress()},
103+
request = requests.request(method, url, proxies={"http": self.current_proxy.get_address()},
104104
headers=headers, data=data, params=params, timeout=req_timeout)
105105
# Avoid HTTP request errors
106106
if request.status_code == 409:
@@ -151,7 +151,7 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
151151
req_proxy = RequestProxy()
152152
print("Initialization took: {0} sec".format((time.time() - start)))
153153
print("Size: {0}".format(len(req_proxy.get_proxy_list())))
154-
print("ALL = {0} ".format(req_proxy.get_proxy_list()))
154+
print("ALL = {0} ".format(map(lambda x: x.get_address(), req_proxy.get_proxy_list())))
155155

156156
test_url = 'http://ipv4.icanhazip.com'
157157

tests/mocks.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,10 @@ def samair_mock(url, request):
142142
\n
143143
</tr>
144144
\n
145+
<div id="navbar">
146+
<ul class="pagination"><li class="active"><a href="/list/">1</a></li><li><a href="02.htm">2</a></li></ul>
147+
</div>
148+
\n
145149
<tr class="anon">
146150
<td data-label="IP:port ">191.252.61.28:80</td>
147151
<td data-label="Anonymity Type: ">high-anonymous</td>

tests/test_providers.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_FreeProxyParser(self):
2525
proxy_list = proxy_provider.parse_proxyList()
2626
proxy_list_addr = []
2727
for proxy in proxy_list:
28-
proxy_list_addr.append(proxy.getAddress())
28+
proxy_list_addr.append(proxy.get_address())
2929
self.assertEqual(proxy_list_addr, free_proxy_expected)
3030

3131
def test_ProxyForEuParser(self):
@@ -34,7 +34,7 @@ def test_ProxyForEuParser(self):
3434
proxy_list = proxy_provider.parse_proxyList()
3535
proxy_list_addr = []
3636
for proxy in proxy_list:
37-
proxy_list_addr.append(proxy.getAddress())
37+
proxy_list_addr.append(proxy.get_address())
3838
self.assertEqual(proxy_list_addr, proxy_for_eu_expected)
3939

4040
def test_RebroWeeblyParser(self):
@@ -43,7 +43,7 @@ def test_RebroWeeblyParser(self):
4343
proxy_list = proxy_provider.parse_proxyList()
4444
proxy_list_addr = []
4545
for proxy in proxy_list:
46-
proxy_list_addr.append(proxy.getAddress())
46+
proxy_list_addr.append(proxy.get_address())
4747
self.assertEqual(proxy_list_addr, rebro_weebly_expected)
4848

4949
def test_SemairProxyParser(self):
@@ -52,7 +52,7 @@ def test_SemairProxyParser(self):
5252
proxy_list = proxy_provider.parse_proxyList()
5353
proxy_list_addr = []
5454
for proxy in proxy_list:
55-
proxy_list_addr.append(proxy.getAddress())
55+
proxy_list_addr.append(proxy.get_address())
5656
for item in samair_expected:
5757
self.assertTrue(item in proxy_list_addr)
5858

0 commit comments

Comments
 (0)