@@ -19,9 +19,11 @@ def __init__(self, id, web_url, timeout=None):
1919 def parse_proxyList (self ):
2020 curr_proxy_list = []
2121 # Parse all proxy pages -> format: /list/{num}.htm
22- # TODO @pgaref: get the pageRange from the 'pagination' table
23- for page in range (1 , 21 ):
24- response = requests .get ("{0}{num:02d}.htm" .format (self .get_URl (), num = page ), timeout = self .timeout )
22+ # Get the pageRange from the 'pagination' table
23+ page_set = self .get_pagination_set ()
24+ logger .debug ("Pages: {}" .format (page_set ))
25+ for page in page_set :
26+ response = requests .get ("{0}{1}" .format (self .get_URl (), page ), timeout = self .timeout )
2527 if not response .ok :
2628 # Could not parse ANY page - Let user know
2729 if not curr_proxy_list :
@@ -50,20 +52,42 @@ def parse_proxyList(self):
5052 for row in table .find_all ("tr" )[1 :]:
5153 td_row = row .find ("td" )
5254 # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
55+ proxy_obj = self .create_proxy_object (row )
5356 # Make sure it is a Valid Proxy Address
54- if UrlParser .valid_ip_port (td_row .text ):
55- proxy_obj = self .createProxyObject (row )
56- proxy_obj .print_everything ()
57+ if proxy_obj is not None and UrlParser .valid_ip_port (td_row .text ):
5758 curr_proxy_list .append (proxy_obj )
5859 else :
59- logger .debug ("Address with Invalid format : {}" .format (td_row .text ))
60+ logger .debug ("Proxy Invalid: {}" .format (td_row .text ))
6061 return curr_proxy_list
6162
62- def createProxyObject (self , row ):
def get_pagination_set(self):
    """Collect the page references listed in the provider's pagination bar.

    Fetches the provider base URL (``self.get_URl()``) using ``self.timeout``
    and scans every ``<ul class="pagination">`` element for ``<li>`` items.

    Returns:
        set: Page references found in the pagination links. Links whose
        ``href`` ends with ``htm``/``html`` are added as-is; every other
        item (e.g. the current page) is represented by the empty string,
        which presumably makes the caller fetch the base URL itself.
        An empty set is returned when the base URL could not be fetched.
    """
    response = requests.get(self.get_URl(), timeout=self.timeout)
    page_set = set()
    # Could not fetch the pagination page - let the user know
    if not response.ok:
        logger.warning("Proxy Provider url failed: {}".format(self.get_URl()))
        return page_set

    soup = BeautifulSoup(response.content, "html.parser")
    for ultag in soup.find_all('ul', {'class': 'pagination'}):
        for litag in ultag.find_all('li'):
            # An <li> may carry no <a> (or an <a> without href); treat it
            # like any other non-page entry instead of crashing.
            anchor = litag.a
            page_ref = anchor.get('href') if anchor is not None else None
            # Attribute values are normally text already; only decode when
            # raw bytes are actually handed back.
            if isinstance(page_ref, bytes):
                page_ref = page_ref.decode('utf-8')
            if page_ref and page_ref.endswith(('htm', 'html')):
                page_set.add(page_ref)
            else:
                # Not a numbered page link (e.g. the current page '/list'):
                # record the base page itself.
                page_set.add("")
    return page_set
81+
82+ def create_proxy_object (self , row ):
6383 for td_row in row .findAll ("td" ):
6484 if td_row .attrs ['data-label' ] == 'IP:port ' :
6585 text = td_row .text .strip ()
6686 ip = text .split (":" )[0 ]
87+ # Make sure it is a Valid IP
88+ if not UrlParser .valid_ip (ip ):
89+ logger .debug ("IP with Invalid format: {}" .format (ip ))
90+ return None
6791 port = text .split (":" )[1 ]
6892 elif td_row .attrs ['data-label' ] == 'Anonymity Type: ' :
6993 anonymity = AnonymityLevel (td_row .text .strip ())
@@ -73,4 +97,4 @@ def createProxyObject(self, row):
7397
def __str__(self):
    """Return a one-line description with the parser's URL and minimum bandwidth."""
    template = "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs"
    return template.format(self.url, self.minimum_bandwidth_in_KBs)
0 commit comments