爬取Boss上有关Python的工作。
网址链接
https://www.zhipin.com/wapi/zpgeek/search/joblist.json?scene=1&query=python&city=100010000&experience=&payType=&partTime=&degree=&industry=&scale=&stage=&position=&jobType=&salary=&multiBusinessDistrict=&multiSubway=&page=1&pageSize=30
• 导包
import re
import csv
import requests
• 创建csv文件
# Column headers of the output CSV, in the order they are written.
name = ['岗位', '薪资', '经验', '学位']
# Open the output file for writing as UTF-8; newline='' leaves line-ending
# handling to the csv module.
file = open('Boss-Python.csv', mode='w', newline='', encoding='utf8')
# Wrap the file object in a DictWriter keyed by the headers above.
w = csv.DictWriter(file, fieldnames=name)
# Emit the header row.
w.writeheader()
• 放入链接,设置请求头
# Job-search endpoint: query=python, first page, 30 results per page.
# The empty `degree=` filter must be spelled out in full; an HTML-entity
# mis-decoding (`&deg` -> `°`) had corrupted it into `°ree=`.
url = 'https://www.zhipin.com/wapi/zpgeek/search/joblist.json?scene=1&query=python&city=100010000&experience=&payType=&partTime=&degree=&industry=&scale=&stage=&position=&jobType=&salary=&multiBusinessDistrict=&multiSubway=&page=1&pageSize=30'
# Request headers sent with the query.
# NOTE(review): the Cookie below is a hard-coded session value; replace it
# with the cookie of your own logged-in session before running.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58",
    "Cookie": "Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1688023782; __zp_seo_uuid__=7514828d-1776-458f-8520-f0876c1becaa; __g=-; wd_guid=d8bedee0-773c-4b5f-9f5e-7367d7ed657c; historyState=state; _bl_uid=90lRajRtgCbtj4s5Ibg0pX64hb3X; wt2=DJW03yxuNmXOxCfdDdBjUdTnrUxQHEXCBhpsTfurRCVthAanOA0f-5DtGX4zFf0ist63D_2TZudvG8K8Nby4T0w~~; wbg=0; __l=r=https%3A%2F%2Fcn.bing.com%2F&l=%2Fwww.zhipin.com%2Fweb%2Fgeek%2Fjob%3Fquery%3Djava%26city%3D100010000%26page%3D1&s=3&g=&friend_source=0&s=3&friend_source=0; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1688026856; __zp_stoken__=1a7beZzc4OnoODkpuJnJ4V0VXZzdVU30lF1MDJiw6EE4nPnR7KSMxM3I6LS9fRWp5Hh9KezVcLm9%2FRF1VLjhbc2UzY2lZS10QNlx8GgsiFF9zeA5KPyVOemwjOlpeREkyXCZ4ADVWDRg2BUY%3D; __c=1688023784; __a=12723853.1688023784..1688023784.26.1.26.26; geek_zp_token=V1RN8uGOf02V1tVtRvxxUYKi617jPexi4~",
}
# Fetch the response body as text. The timeout keeps the script from
# hanging forever if the server never answers.
html = requests.get(url, headers=head, timeout=10).text
注:cookie需自己进入Boss网站登录,然后复制自己的cookie
若出现行为异常报错,则需要刷新页面重新复制cookie
• 运用正则匹配,获取需要的数据
# Pattern that pulls the fields of one job posting out of the raw response
# text. Each value is captured in a named group; the `<name>` part of every
# `(?P<name>...)` group had been lost, which made the pattern invalid.
# The fields must appear in this order in the text for a match to succeed.
# re.S lets `.` also match newlines.
obj = re.compile(r'"bossName":"(?P<bossname>.*?)".*?'
                 r'"jobName":"(?P<jobname>.*?)".*?'
                 r'"salaryDesc":"(?P<salary>.*?)".*?'
                 r'"jobExperience":"(?P<experience>.*?)".*?'
                 r'"jobDegree":"(?P<degree>.*?)"', re.S)
# Lazy iterator over every match found in the downloaded text.
result = obj.finditer(html)
• 循环输出结果,并写入csv文件,最后关闭文件
# Write one CSV row per match. try/finally guarantees the file is closed
# even if reading a group or writing a row raises.
try:
    for i in result:
        jobname = i.group('jobname')
        salary = i.group('salary')
        experience = i.group('experience')
        degree = i.group('degree')
        w.writerow({'岗位': jobname, '薪资': salary, '经验': experience, '学位': degree})
finally:
    file.close()
• 运行结果如下
来看看完整代码
import re
import csv
import requests
# Job-search endpoint: query=python, first page, 30 results per page.
# The empty `degree=` filter must be spelled out in full; an HTML-entity
# mis-decoding (`&deg` -> `°`) had corrupted it into `°ree=`.
url = 'https://www.zhipin.com/wapi/zpgeek/search/joblist.json?scene=1&query=python&city=100010000&experience=&payType=&partTime=&degree=&industry=&scale=&stage=&position=&jobType=&salary=&multiBusinessDistrict=&multiSubway=&page=1&pageSize=30'
# Request headers sent with the query.
# NOTE(review): the Cookie below is a hard-coded session value; replace it
# with the cookie of your own logged-in session before running.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58",
    "Cookie": "Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1688023782; __zp_seo_uuid__=7514828d-1776-458f-8520-f0876c1becaa; __g=-; wd_guid=d8bedee0-773c-4b5f-9f5e-7367d7ed657c; historyState=state; _bl_uid=90lRajRtgCbtj4s5Ibg0pX64hb3X; wt2=DJW03yxuNmXOxCfdDdBjUdTnrUxQHEXCBhpsTfurRCVthAanOA0f-5DtGX4zFf0ist63D_2TZudvG8K8Nby4T0w~~; wbg=0; __l=r=https%3A%2F%2Fcn.bing.com%2F&l=%2Fwww.zhipin.com%2Fweb%2Fgeek%2Fjob%3Fquery%3Djava%26city%3D100010000%26page%3D1&s=3&g=&friend_source=0&s=3&friend_source=0; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1688026856; __zp_stoken__=1a7beZzc4OnoODlAANWALV0VXZzdzTQcVKF4DJiw6EE1PJFMQJSMxM3I6WxIqfDl5Hh9KezVcLm9%2FQDZVLSw%2FTh4KZVtyXVYHQ2JqGgsiFF9zeA4ZBlBzDGwjOlpeSEkyXCZ4ADVWDRg2BUY%3D; __c=1688023784; __a=12723853.1688023784..1688023784.27.1.27.27; geek_zp_token=V1RN8uGOf02V1tVtRvxxUYLS226zzfzSw~",
}
# Fetch the response body as text. The timeout keeps the script from
# hanging forever if the server never answers.
html = requests.get(url, headers=head, timeout=10).text
# Echo the raw response so an error payload is visible when nothing matches.
print(html)

# Pattern that pulls the fields of one job posting out of the raw response
# text. Each value is captured in a named group; the `<name>` part of every
# `(?P<name>...)` group had been lost, which made the pattern invalid.
# re.S lets `.` also match newlines.
obj = re.compile(r'"bossName":"(?P<bossname>.*?)".*?'
                 r'"jobName":"(?P<jobname>.*?)".*?'
                 r'"salaryDesc":"(?P<salary>.*?)".*?'
                 r'"jobExperience":"(?P<experience>.*?)".*?'
                 r'"jobDegree":"(?P<degree>.*?)"', re.S)
result = obj.finditer(html)

# Column headers of the output CSV, in the order they are written.
name = ['岗位', '薪资', '经验', '学位']
# The file is opened only after the download succeeded, and the `with`
# block closes it even if writing a row raises.
with open('Boss-Python.csv', encoding='utf8', newline='', mode='w') as file:
    # DictWriter maps each row dict onto the header columns.
    w = csv.DictWriter(file, name)
    w.writeheader()
    for i in result:
        jobname = i.group('jobname')
        salary = i.group('salary')
        experience = i.group('experience')
        degree = i.group('degree')
        w.writerow({'岗位': jobname, '薪资': salary, '经验': experience, '学位': degree})
完成啦,希望有帮助到大家,有疑问或问题也可在评论区留言~