-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlpjob_crawler.py
More file actions
69 lines (56 loc) · 2.43 KB
/
nlpjob_crawler.py
File metadata and controls
69 lines (56 loc) · 2.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
from lxml import etree
import re
import csv
# Number of listing pages fetched for each category (pages are numbered from 1).
PAGE_NUM = 10

# Browser-like request headers sent with every HTTP request made by this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/55.0.2883.87 Safari/537.36'
    ),
}
def get_category_url(url):
    """Crawl every job category linked from the index page into its own CSV.

    The categories are industry areas such as artificial intelligence or
    natural language processing. For each category link found in the
    ``categs-nav`` block of the page at *url*, a file ``./<category>.csv`` is
    created (overwriting any existing file), a header row is written, and the
    first ``PAGE_NUM`` listing pages are scraped into it via ``get_job_info``.

    Args:
        url: Address of the index page that carries the category navigation.

    Raises:
        requests.exceptions.RequestException: if the index page cannot be
            fetched (including a timeout).
    """
    from urllib.parse import urljoin

    # A timeout keeps an unresponsive server from hanging the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(response.text)
    category_urls = selector.xpath('//div[@id="categs-nav"]/ul/li/a/@href')
    for category_url in category_urls:
        # Resolve relative links against the index page; absolute links are
        # returned unchanged by urljoin.
        category_url = urljoin(url, category_url)
        # The last path segment names the category, whether or not the link
        # ends with a trailing slash.
        category = category_url.rstrip("/").split("/")[-1]
        filename = "./" + category + ".csv"
        # The context manager closes the file even if scraping a page raises.
        with open(filename, 'wt', newline='', encoding='utf-8') as fp:
            writer = csv.writer(fp)
            writer.writerow(('title', 'type', 'company', 'city', 'time', 'url'))
            for i in range(1, PAGE_NUM + 1):
                page_url = category_url + "?p=" + str(i)
                get_job_info(page_url, writer)
def _split_company_city(row_text):
    """Split a listing's flattened text into a ``(company, city)`` pair.

    Two layouts are handled: ``"... at <company> in <city>"`` and, when no
    standalone "in" is present, ``"... at <company>, <city>"``. The words
    "at" and "in" are only treated as separators when surrounded by
    whitespace, so names such as "Data Corp" or "Beijing" are not cut apart.
    NOTE(review): assumes the page renders these separators with whitespace
    around them - confirm against the live markup.

    Returns:
        A ``(company, city)`` tuple, or ``None`` if neither layout matches.
    """
    # Everything after the last standalone "at" is the company/location part.
    tail = re.split(r'\s+at\s+', row_text)[-1].strip()
    # The greedy first group makes the split happen at the LAST standalone "in".
    match = re.match(r'(.*)\s+in\s+(.*)', tail, re.DOTALL)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    parts = tail.split(',')
    if len(parts) < 2:
        return None
    return parts[0].strip(), parts[1].strip()


def get_job_info(url, writer):
    """Scrape one listing page and write one CSV row per job found.

    Each written row is ``(title, type, company, city, time, url)``, where
    ``type`` is the job category shown on the listing (e.g. full-time).
    The caller decides how many pages to crawl; this function handles
    exactly one page.

    Rows that lack any of the expected elements, or whose company/city text
    cannot be parsed, are skipped so that one malformed entry does not drop
    the rest of the page.

    Args:
        url: Address of the listing page to scrape.
        writer: A ``csv.writer``-like object exposing ``writerow``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including a timeout).
    """
    # A timeout keeps an unresponsive server from hanging the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(response.text)
    job_rows = selector.xpath('//*[@id="job-listings"]/div[@class]')
    for job_row in job_rows:
        try:
            # The leading "." keeps each query inside the current row; a bare
            # "//" would search the whole document and return the first job
            # on the page for every row.
            posted = job_row.xpath('.//span[@class="time-posted"]/text()')[0].strip()
            title = job_row.xpath('.//span[@class="row-info"]/a/@title')[0].strip()
            job_url = job_row.xpath('.//span[@class="row-info"]/a/@href')[0].strip()
            job_type = job_row.xpath('.//span[@class="row-info"]/img/@alt')[0].strip()
            info = job_row.xpath('.//span[@class="row-info"]')[0]
        except IndexError:
            # Not a job row (or an incomplete one): skip it, keep the others.
            continue
        company_city = _split_company_city(info.xpath('string(.)'))
        if company_city is None:
            continue
        company, city = company_city
        writer.writerow((title, job_type, company, city, posted, job_url))
if __name__ == "__main__":
    # Script entry point: start the crawl from the site's job index page.
    url = "http://www.nlpjob.com/jobs/"
    get_category_url(url)