File tree Expand file tree Collapse file tree
proxypool/crawlers/public Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ import time
2+ from retrying import RetryError
3+ from loguru import logger
4+ from proxypool .schemas .proxy import Proxy
5+ from proxypool .crawlers .base import BaseCrawler
6+ import json
7+
8+ BASE_URL = 'https://www.docip.net/data/free.json?t={date}'
9+
10+
11+
class DocipCrawler(BaseCrawler):
    """
    Docip crawler, https://www.docip.net/data/free.json
    """
    # The endpoint expects today's date (YYYYMMDD) as a query parameter;
    # evaluated once at import time, so a long-running process keeps the
    # start-up date — acceptable for this free feed.
    urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))]

    def parse(self, html):
        """
        Parse the JSON response body and yield proxies.

        :param html: raw response text, expected to be JSON with a
            top-level ``data`` list of ``{"ip": ..., "port": ...}`` items
        :return: generator of Proxy objects
        """
        # Keep the try body minimal: only decoding/shape errors are expected.
        try:
            result = json.loads(html)
            proxy_list = result['data']
        except json.JSONDecodeError:
            # Use the project-wide loguru logger (already imported at file
            # level) instead of a bare print, for consistent log output.
            logger.error('docip response is not valid JSON')
            return
        except (KeyError, TypeError):
            logger.error('docip response JSON missing expected "data" list')
            return
        for proxy_item in proxy_list:
            host = proxy_item['ip']
            port = proxy_item['port']
            yield Proxy(host=host, port=port)
33+
34+
if __name__ == '__main__':
    # Manual smoke test: crawl the source and print each proxy found.
    for fetched in DocipCrawler().crawl():
        print(fetched)
Original file line number Diff line number Diff line change 1+ from pyquery import PyQuery as pq
2+ from proxypool .schemas .proxy import Proxy
3+ from proxypool .crawlers .base import BaseCrawler
4+ from loguru import logger
5+
6+ BASE_URL = 'https://ip.uqidata.com/free/index.html'
7+
8+
class UqidataCrawler(BaseCrawler):
    """
    Uqidata crawler, https://ip.uqidata.com/free/index.html
    """
    urls = [BASE_URL]
    # Marked ignored by default — presumably the site's obfuscation makes
    # it unreliable; confirm with the scheduler before enabling.
    ignore = True

    @staticmethod
    def encode(input_str):
        """
        Decode the site's obfuscated port value.

        Each letter of the class-name token maps to a digit via its index
        in "ABCDEFGHIZ" (unknown letters map to -1, matching the original
        ``str.find`` behaviour); the concatenated digits form a number
        that is shifted right by 3 bits to recover the real port.

        :param input_str: letter-encoded token from the td's class attr
        :return: decoded port as an int

        NOTE: declared as @staticmethod — the original body took no
        ``self`` and only worked when called through the class; this makes
        instance calls (``self.encode(...)``) safe as well, while the
        existing ``UqidataCrawler.encode(...)`` call sites are unchanged.
        """
        digits = ''.join(str("ABCDEFGHIZ".find(ch)) for ch in input_str)
        return int(digits) >> 0x03

    def parse(self, html):
        """
        Parse the html page to get proxies.

        :param html: page html text
        :return: generator of Proxy objects
        """
        doc = pq(html)
        # Rows 1-2 are headers; data rows start at the third tr.
        trs = doc('#main_container .inner table tbody tr:nth-child(n+3)').items()
        for tr in trs:
            ip_html = tr('td.ip').find("*").items()
            host = ''
            for i in ip_html:
                # The cell is padded with decoy elements hidden via
                # display:none and empty nodes — skip both.
                if i.attr('style') is not None and 'none' in i.attr('style'):
                    continue
                if i.text() == '':
                    continue
                host += i.text()

            # The port is carried in the second class name of td.port.
            port_code = tr('td.port').attr('class').split(' ')[1]
            port = UqidataCrawler.encode(port_code)
            yield Proxy(host=host, port=port)
44+
45+
if __name__ == '__main__':
    # Manual smoke test: crawl the source and print each proxy found.
    for fetched in UqidataCrawler().crawl():
        print(fetched)
You can’t perform that action at this time.
0 commit comments