
Commit 9cc1278

Merge pull request #2 from ieguiguren/samair
Samair
2 parents dc09188 + d618713

2 files changed

Lines changed: 32 additions & 0 deletions


README.md

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ The project code in this repository is crawling three different public proxy web
 * http://proxyfor.eu/geo.php
 * http://free-proxy-list.net
 * http://rebro.weebly.com/proxy-list.html
+* http://www.samair.ru/proxy/time-01.htm

 After collecting the proxy data and filtering the slowest ones it is randomly selecting one of them to query the target url.
 The request timeout is configured at 30 seconds and if the proxy fails to return a response it is deleted from the application proxy list.
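
The two sentences above describe the whole request flow: choose a proxy at random, try it with a 30-second timeout, and delete it from the pool if it fails. A minimal sketch of that loop (illustrative only, the function name is made up here; the project's real entry point is generate_proxied_request, shown in the next file):

import random
import requests

def fetch_via_random_proxy(url, proxy_list, timeout=30):
    # Keep drawing random proxies until one answers; evict the ones that fail.
    while proxy_list:
        proxy = random.choice(proxy_list)  # entries look like 'http://1.2.3.4:8080'
        try:
            return requests.get(url, proxies={"http": proxy}, timeout=timeout)
        except requests.exceptions.RequestException:
            proxy_list.remove(proxy)  # failed or timed out, drop it from the pool
    raise RuntimeError("no working proxies left")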

project/http/requests/proxy/requestProxy.py

Lines changed: 31 additions & 0 deletions
@@ -22,6 +22,8 @@ def __init__(self, web_proxy_list=[]):
         self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php', 100.0)
         self.proxy_list += self.freeProxy_url_parser('http://free-proxy-list.net')
         self.proxy_list += self.weebly_url_parser('http://rebro.weebly.com/proxy-list.html')
+        self.proxy_list += self.samair_url_parser('http://www.samair.ru/proxy/time-01.htm')
+

     def get_proxy_list(self):
         return self.proxy_list
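
With this change the constructor seeds one pool from all four sources. A hypothetical usage sketch; the class name RequestProxy and the import path are assumptions based on the file path, neither appears in this diff:

from project.http.requests.proxy.requestProxy import RequestProxy

req_proxy = RequestProxy()  # assumed class name, inferred from requestProxy.py
print(len(req_proxy.get_proxy_list()))  # proxies gathered from all four parsers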
@@ -127,6 +129,34 @@ def weebly_url_parser(self, web_url):
             curr_proxy_list.append(proxy.__str__())
         return curr_proxy_list

+    def samair_url_parser(self, web_url, speed_in_KBs=100.0):
+        curr_proxy_list = []
+        content = requests.get(web_url).content
+        soup = BeautifulSoup(content, "html.parser")
+        # css provides the port number so we reverse it
+        for href in soup.findAll('link'):
+            if '/styles/' in href.get('href'):
+                style = "http://www.samair.ru" + href.get('href')
+                break
+        css = requests.get(style).content.split('\n')
+        css.pop()
+        ports = {}
+        for l in css:
+            p = l.split(' ')
+            key = p[0].split(':')[0][1:]
+            value = p[1].split('\"')[1]
+            ports[key] = value
+
+        table = soup.find("table", attrs={"id": "proxylist"})
+
+        # The first tr contains the field names.
+        headings = [th.get_text() for th in table.find("tr").find_all("th")]
+
+        for row in table.find_all("span")[1:]:
+            curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
+
+        return curr_proxy_list
+
     def generate_proxied_request(self, url, params={}, req_timeout=30):
         #if len(self.proxy_list) < 2:
         #    self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php')
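
The samair.ru listing hides each port: the IP is printed in a span and the port is injected by a CSS :after rule keyed on the span's class (this is what the "css provides the port number so we reverse it" comment refers to), so the parser fetches the stylesheet and builds a class-to-port map before walking the table. A standalone sketch of that mapping step, assuming rules shaped like .r8d:after {content:"3128"} (the shape is inferred from the string splitting above, not documented anywhere):

def parse_port_css(css_text):
    # Turns '.r8d:after {content:"3128"}' into {'r8d': '3128'}.
    ports = {}
    for line in css_text.splitlines():
        if not line.strip():
            continue
        selector, rule = line.split(' ', 1)  # '.r8d:after' and '{content:"3128"}'
        key = selector.split(':')[0][1:]     # class name, leading dot removed
        ports[key] = rule.split('"')[1]      # the quoted port number
    return ports

Two caveats in the committed version: headings is computed but never used, and style is only assigned inside the loop, so a page without a matching /styles/ link would raise an UnboundLocalError at the requests.get(style) call.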
@@ -137,6 +167,7 @@ def generate_proxied_request(self, url, params={}, req_timeout=30):
         request = None
         try:
             rand_proxy = random.choice(self.proxy_list)
+            print "Next proxy: " + str(rand_proxy)
             request = requests.get(test_url, proxies={"http": rand_proxy},
                     headers=req_headers, timeout=req_timeout)
         except ConnectionError:
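
One note on the added log line: print "Next proxy: ..." uses Python 2 statement syntax, which is a SyntaxError on Python 3, so this commit keeps the module Python 2 only. A version-neutral spelling would be:

print("Next proxy: " + str(rand_proxy))  # statement on Python 2, function call on Python 3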

0 commit comments
