|
| 1 | +import logging |
| 2 | + |
| 3 | +import requests |
| 4 | +from bs4 import BeautifulSoup |
| 5 | + |
| 6 | +from http_request_randomizer.requests.parsers.UrlParser import UrlParser |
| 7 | +from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol |
| 8 | + |
| 9 | +logger = logging.getLogger(__name__) |
| 10 | +__author__ = 'pgaref' |
| 11 | + |
| 12 | + |
class SslProxyParser(UrlParser):
    """Scrapes an ssl-proxies style provider page (table#proxylisttable) into ProxyObject entries."""

    def __init__(self, id, web_url, timeout=None):
        UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)

    def parse_proxyList(self):
        """Fetch the provider page and return a list of validated ProxyObject entries.

        Returns an empty list when the request fails or parsing raises;
        all failures are logged rather than propagated.
        """
        curr_proxy_list = []
        try:
            response = requests.get(self.get_url(), timeout=self.timeout)
            if not response.ok:
                logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
                return []

            content = response.content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("table", attrs={"id": "proxylisttable"})

            # The first tr contains the field names.
            headings = [th.get_text() for th in table.find("tr").find_all("th")]

            datasets = []
            # Skip the heading row and the trailing (footer) row.
            for row in table.find_all("tr")[1:-1]:
                # Materialize the pairs: a bare zip() iterator is always truthy
                # (making the emptiness check below dead) and is exhausted after
                # one pass, so the "Proxy Invalid" log used to print an empty
                # iterator repr instead of the row contents.
                dataset = list(zip(headings, (td.get_text() for td in row.find_all("td"))))
                if dataset:
                    datasets.append(dataset)

            for dataset in datasets:
                proxy_obj = self.create_proxy_object(dataset)
                # Make sure it is a Valid Proxy Address
                if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                    curr_proxy_list.append(proxy_obj)
                else:
                    logger.debug("Proxy Invalid: {}".format(dataset))
        except AttributeError as e:
            logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
        except KeyError as e:
            logger.error("Provider {0} failed with Key error: {1}".format(self.id, e))
        except Exception as e:
            logger.error("Provider {0} failed with Unknown error: {1}".format(self.id, e))
        # NOTE: the original returned from a `finally` block, which silently
        # swallows even non-Exception signals (KeyboardInterrupt, SystemExit).
        # Returning here preserves the normal-path behavior without that trap.
        return curr_proxy_list

    def create_proxy_object(self, dataset):
        """Build a ProxyObject from a list of (heading, cell-text) pairs.

        Returns None when the 'IP Address' field fails basic IP validation.
        """
        ip = ""
        port = None
        anonymity = AnonymityLevel.UNKNOWN
        country = None
        protocols = []
        for heading, raw_value in dataset:
            value = raw_value.strip()
            if heading == 'IP Address':
                ip = value
                # Make sure it is a Valid IP
                if not UrlParser.valid_ip(ip):
                    logger.debug("IP with Invalid format: {}".format(ip))
                    return None
            elif heading == 'Port':
                port = value
            elif heading == 'Anonymity':
                anonymity = AnonymityLevel.get(value)
            elif heading == 'Country':
                country = value
            elif heading == 'Https':
                # Providers marking 'Https: yes' serve both plain HTTP and HTTPS.
                if value.lower() == 'yes':
                    protocols.extend([Protocol.HTTP, Protocol.HTTPS])
                elif value.lower() == 'no':
                    protocols.append(Protocol.HTTP)
        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity,
                           country=country, protocols=protocols)

    def __str__(self):
        return "{0} parser of '{1}' with required bandwidth: '{2}' KBs" \
            .format(self.id, self.url, self.minimum_bandwidth_in_KBs)
0 commit comments