Skip to content

Commit a946e7c

Browse files
committed
Hotfix for SamairProxyParser -- TODO: 1) parse all site pages 2) add checks for when the proxy page's HTML structure changes
1 parent 5b62ef8 commit a946e7c

1 file changed

Lines changed: 16 additions & 14 deletions

File tree

project/http/requests/parsers/samairproxyParser.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,24 +15,26 @@ def parse_proxyList(self):
1515
content = requests.get(self.get_URl()).content
1616
soup = BeautifulSoup(content, "html.parser")
1717
# css provides the port number so we reverse it
18-
for href in soup.findAll('link'):
19-
if '/styles/' in href.get('href'):
20-
style = "http://www.samair.ru" + href.get('href')
21-
break
22-
css = requests.get(style).content.split('\n')
23-
css.pop()
24-
ports = {}
25-
for l in css:
26-
p = l.split(' ')
27-
key = p[0].split(':')[0][1:]
28-
value = p[1].split('\"')[1]
29-
ports[key] = value
18+
# for href in soup.findAll('link'):
19+
# if '/styles/' in href.get('href'):
20+
# style = "http://www.samair.ru" + href.get('href')
21+
# break
22+
# css = requests.get(style).content.split('\n')
23+
# css.pop()
24+
# ports = {}
25+
# for l in css:
26+
# p = l.split(' ')
27+
# key = p[0].split(':')[0][1:]
28+
# value = p[1].split('\"')[1]
29+
# ports[key] = value
3030

3131
table = soup.find("table", attrs={"id": "proxylist"})
3232
# The first tr contains the field names.
3333
headings = [th.get_text() for th in table.find("tr").find_all("th")]
34-
for row in table.find_all("span")[1:]:
35-
curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
34+
for row in table.find_all("tr")[1:]:
35+
td_row = row.find("td")
36+
# curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
37+
curr_proxy_list.append('http://' +td_row.text)
3638

3739
return curr_proxy_list
3840

0 commit comments

Comments (0)