-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Expand file tree
/
Copy pathip3366.py
More file actions
32 lines (25 loc) · 865 Bytes
/
ip3366.py
File metadata and controls
32 lines (25 loc) · 865 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from proxypool.crawlers.base import BaseCrawler
from proxypool.schemas.proxy import Proxy
import re
MAX_PAGE = 5
BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'
class IP3366Crawler(BaseCrawler):
"""
ip3366 crawler, http://www.ip3366.net/
"""
urls = [BASE_URL.format(page=i) for i in range(1, 8)]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
# \s * 匹配空格,起到换行作用
re_ip_address = ip_address.findall(html)
for address, port in re_ip_address:
proxy = Proxy(host=address.strip(), port=int(port.strip()))
yield proxy
if __name__ == '__main__':
crawler = IP3366Crawler()
for proxy in crawler.run():
print(proxy)