
Commit 70f1602
1 parent 9fdf00c commit 70f1602
2 files changed: 15 additions & 1 deletion


README.md

Lines changed: 3 additions & 1 deletion

@@ -21,10 +21,12 @@ Surprisingly, the only thing that tells a server the application triggered the r
 
 ## The source code
 
-The project code in this repository is crawling two different public proxy websites:
+The project code in this repository is crawling three different public proxy websites:
 * http://proxyfor.eu/geo.php
 * http://free-proxy-list.net
+* http://rebro.weebly.com/proxy-list.html
 
 After collecting the proxy data and filtering the slowest ones it is randomly selecting one of them to query the target url.
 The request timeout is configured at 30 seconds and if the proxy fails to return a response it is deleted from the application proxy list.
 I have to mention that for each request a different agent header is used. The different headers are stored in the **/data/user_agents.txt** file which contains around 900 different agents.
+
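The loop the README describes — choose a random proxy, attach a random User-Agent header, fire the request with a 30-second timeout, and delete any proxy that fails to respond — can be sketched roughly as below. This is a minimal stand-alone sketch, not the project's actual API: `proxied_get` and its parameters are illustrative names.

```python
import random
import requests

def proxied_get(url, proxy_list, user_agents, req_timeout=30):
    """Sketch: fetch `url` through a randomly chosen proxy with a random
    User-Agent header, dropping any proxy that fails to respond in time."""
    while proxy_list:
        proxy = random.choice(proxy_list)
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            return requests.get(url, headers=headers,
                                proxies={"http": proxy, "https": proxy},
                                timeout=req_timeout)
        except requests.exceptions.RequestException:
            proxy_list.remove(proxy)  # failed proxy: delete it from the list
    raise RuntimeError("no working proxies left")
```

Because `proxy_list` is mutated in place, repeated calls naturally shrink the pool down to proxies that actually answer, which matches the "deleted from the application proxy list" behavior the README mentions.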

project/http/requests/proxy/requestProxy.py

Lines changed: 12 additions & 0 deletions
@@ -21,6 +21,7 @@ def __init__(self, web_proxy_list=[]):
         self.proxy_list = web_proxy_list
         self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php', 100.0)
         self.proxy_list += self.freeProxy_url_parser('http://free-proxy-list.net')
+        self.proxy_list += self.weebly_url_parser('http://rebro.weebly.com/proxy-list.html')
 
     def get_proxy_list(self):
         return self.proxy_list

@@ -115,6 +116,17 @@ def freeProxy_url_parser(self, web_url):
         #print "ALL: ", curr_proxy_list
         return curr_proxy_list
 
+    def weebly_url_parser(self, web_url):
+        curr_proxy_list = []
+        content = requests.get(web_url).content
+        soup = BeautifulSoup(content, "html.parser")
+        table = soup.find("div", attrs={"class": "paragraph", 'style': "text-align:left;"}).find('font', attrs={'color': '#33a27f'})
+
+        for row in [x for x in table.contents if getattr(x, 'name', None) != 'br']:
+            proxy = "http://" + row
+            curr_proxy_list.append(proxy.__str__())
+        return curr_proxy_list
+
     def generate_proxied_request(self, url, params={}, req_timeout=30):
         #if len(self.proxy_list) < 2:
         #    self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php')
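The new `weebly_url_parser` relies on a BeautifulSoup detail: `table.contents` interleaves plain text nodes (the proxy addresses) with `<br>` tags, and tags carry a `name` attribute while text nodes generally do not, so `getattr(x, 'name', None) != 'br'` keeps only the addresses. A stdlib-only sketch of that filtering step — the `Tag` class below is a hypothetical stand-in for BeautifulSoup tag objects, not part of the project:

```python
class Tag:
    """Hypothetical stand-in for a BeautifulSoup tag, which carries a `name`."""
    def __init__(self, name):
        self.name = name

# Shape of `table.contents`: address strings interleaved with <br> tags.
contents = ["1.2.3.4:8080", Tag("br"), "5.6.7.8:3128", Tag("br")]

# Plain strings have no `name`, so the getattr default filters out only <br> tags.
rows = [x for x in contents if getattr(x, "name", None) != "br"]
proxies = ["http://" + str(row) for row in rows]
print(proxies)  # → ['http://1.2.3.4:8080', 'http://5.6.7.8:3128']
```

This selector is tightly coupled to the page's markup (`div.paragraph` with an inline style, a `<font color="#33a27f">` wrapper), so the parser will break silently if rebro.weebly.com restyles its proxy list.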
