
Commit 9cc1278

Merge pull request #2 from ieguiguren/samair
Samair
2 parents dc09188 + d618713

2 files changed

Lines changed: 32 additions & 0 deletions


README.md

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ The project code in this repository is crawling three different public proxy web
 * http://proxyfor.eu/geo.php
 * http://free-proxy-list.net
 * http://rebro.weebly.com/proxy-list.html
+* http://www.samair.ru/proxy/time-01.htm

 After collecting the proxy data and filtering the slowest ones it is randomly selecting one of them to query the target url.
 The request timeout is configured at 30 seconds and if the proxy fails to return a response it is deleted from the application proxy list.
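
The two sentences above describe the whole request flow: choose a proxy at random, try it with a 30-second timeout, and delete it from the pool if it fails. A minimal sketch of that loop (illustrative only, the function name is made up here; the project's real entry point is generate_proxied_request, shown in the next file):

import random
import requests

def fetch_via_random_proxy(url, proxy_list, timeout=30):
    # Keep drawing random proxies until one answers; evict the ones that fail.
    while proxy_list:
        proxy = random.choice(proxy_list)  # entries look like 'http://1.2.3.4:8080'
        try:
            return requests.get(url, proxies={"http": proxy}, timeout=timeout)
        except requests.exceptions.RequestException:
            proxy_list.remove(proxy)  # failed or timed out, drop it from the pool
    raise RuntimeError("no working proxies left")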

project/http/requests/proxy/requestProxy.py

Lines changed: 31 additions & 0 deletions
@@ -22,6 +22,8 @@ def __init__(self, web_proxy_list=[]):
         self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php', 100.0)
         self.proxy_list += self.freeProxy_url_parser('http://free-proxy-list.net')
         self.proxy_list += self.weebly_url_parser('http://rebro.weebly.com/proxy-list.html')
+        self.proxy_list += self.samair_url_parser('http://www.samair.ru/proxy/time-01.htm')
+

     def get_proxy_list(self):
         return self.proxy_list
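
With this change the constructor seeds one pool from all four sources. A hypothetical usage sketch; the class name RequestProxy and the import path are assumptions based on the file path, neither appears in this diff:

from project.http.requests.proxy.requestProxy import RequestProxy

req_proxy = RequestProxy()  # assumed class name, inferred from requestProxy.py
print(len(req_proxy.get_proxy_list()))  # proxies gathered from all four parsers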
@@ -127,6 +129,34 @@ def weebly_url_parser(self, web_url):
             curr_proxy_list.append(proxy.__str__())
         return curr_proxy_list

+    def samair_url_parser(self, web_url, speed_in_KBs=100.0):
+        curr_proxy_list = []
+        content = requests.get(web_url).content
+        soup = BeautifulSoup(content, "html.parser")
+        # css provides the port number so we reverse it
+        for href in soup.findAll('link'):
+            if '/styles/' in href.get('href'):
+                style = "http://www.samair.ru" + href.get('href')
+                break
+        css = requests.get(style).content.split('\n')
+        css.pop()
+        ports = {}
+        for l in css:
+            p = l.split(' ')
+            key = p[0].split(':')[0][1:]
+            value = p[1].split('\"')[1]
+            ports[key] = value
+
+        table = soup.find("table", attrs={"id": "proxylist"})
+
+        # The first tr contains the field names.
+        headings = [th.get_text() for th in table.find("tr").find_all("th")]
+
+        for row in table.find_all("span")[1:]:
+            curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
+
+        return curr_proxy_list
+
     def generate_proxied_request(self, url, params={}, req_timeout=30):
         #if len(self.proxy_list) < 2:
         #    self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php')
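
The samair.ru listing hides each port: the IP is printed in a span and the port is injected by a CSS :after rule keyed on the span's class (this is what the "css provides the port number so we reverse it" comment refers to), so the parser fetches the stylesheet and builds a class-to-port map before walking the table. A standalone sketch of that mapping step, assuming rules shaped like .r8d:after {content:"3128"} (the shape is inferred from the string splitting above, not documented anywhere):

def parse_port_css(css_text):
    # Turns '.r8d:after {content:"3128"}' into {'r8d': '3128'}.
    ports = {}
    for line in css_text.splitlines():
        if not line.strip():
            continue
        selector, rule = line.split(' ', 1)  # '.r8d:after' and '{content:"3128"}'
        key = selector.split(':')[0][1:]     # class name, leading dot removed
        ports[key] = rule.split('"')[1]      # the quoted port number
    return ports

Two caveats in the committed version: headings is computed but never used, and style is only assigned inside the loop, so a page without a matching /styles/ link would raise an UnboundLocalError at the requests.get(style) call.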
@@ -137,6 +167,7 @@ def generate_proxied_request(self, url, params={}, req_timeout=30):
         request = None
         try:
             rand_proxy = random.choice(self.proxy_list)
+            print "Next proxy: " + str(rand_proxy)
             request = requests.get(test_url, proxies={"http": rand_proxy},
                     headers=req_headers, timeout=req_timeout)
         except ConnectionError:
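
One note on the added log line: print "Next proxy: ..." uses Python 2 statement syntax, which is a SyntaxError on Python 3, so this commit keeps the module Python 2 only. A version-neutral spelling would be:

print("Next proxy: " + str(rand_proxy))  # statement on Python 2, function call on Python 3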

0 commit comments
