__author__ = '田明博'
__date__ = '2019/10/20 8:39'
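# Scrapes Python job listings from lagou.com with Selenium: it pages through
# the search results, opens each posting's detail page in a new window,
# parses the fields with lxml, and appends each record to test.csv.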
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import time
import csv
class LagouSpider(object):
    def __init__(self):
        # Path to the local geckodriver binary (machine-specific)
        self.driver = webdriver.Firefox(executable_path=r'D:\BaiduNetdiskDownload\geckodriver.exe')
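        # Note: executable_path is the Selenium 3 call style this script was
        # written against. On Selenium 4+ (an assumption about your installed
        # version) the equivalent would be roughly:
        #   from selenium.webdriver.firefox.service import Service
        #   self.driver = webdriver.Firefox(service=Service(r'D:\BaiduNetdiskDownload\geckodriver.exe'))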
        # Entry point: first page of the "python" job search results
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    def run(self):
        self.driver.get(self.url)  # load the first page of job listings
        self.pre_work()  # write the CSV header row first
        while True:
            source = self.driver.page_source  # source of the current results page
            self.parse_position(source)  # parse each job's URL on this page and visit its detail page
            # Wait for the next-page button to finish loading before clicking it
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="pager_container"]//span[@action="next"]')))
            # The next-page button
            next_tag = self.driver.find_element_by_xpath('//div[@class="pager_container"]//span[@action="next"]')
            # When the button is greyed out there is no next page and its class contains 'pager_next_disabled'
            if 'pager_next_disabled' in next_tag.get_attribute('class'):
                break
            else:
                # There is a next page: scroll the button into view
                # (clicking an off-screen element raises an error), then click it
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                next_tag.click()
                time.sleep(1)
    def parse_position(self, source):
        '''
        :param source: source of one results page
        :return:
        '''
        html = etree.HTML(source)  # parse with lxml so we can run XPath queries
        position_links = html.xpath('//a[@class="position_link"]/@href')  # URL of each job posting
        for link in position_links:  # visit each posting's detail page
            self.get_detail_source(link)
    def get_detail_source(self, link):
        # Open the detail page in a new window
        self.driver.execute_script("window.open('%s')" % link)
        # Switch the driver to the new window's handle
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Explicit wait: only continue once the company info has loaded
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, '//h4[@class="company"]')))
        # Source of the detail page
        source = self.driver.page_source
        self.parse_detail_position(source)  # parse the detail page
        self.driver.close()  # close the current detail window
        self.driver.switch_to.window(self.driver.window_handles[0])  # switch back to the job list window
    def parse_detail_position(self, source):
        '''
        :param source: source of a job detail page
        :return:
        '''
        html = etree.HTML(source)
        # Parse the job fields
        company = html.xpath('//h4[@class="company"]/text()')[0]  # company name
        position = html.xpath('//h1[@class="name"]/text()')[0]  # job title
        job_infos = html.xpath('//dd[@class="job_request"]//span')
        salary = job_infos[0].xpath('.//text()')[0].strip()  # salary
        city = job_infos[1].xpath('.//text()')[0].replace('/', '').strip()  # city
        work_years = job_infos[2].xpath('.//text()')[0].strip()  # required experience
        education = job_infos[3].xpath('.//text()')[0].strip()  # education level
        require = job_infos[4].xpath('.//text()')[0].strip()  # job requirement
        desc = ''.join(html.xpath('//dd[@class="job_bt"]//text()')).replace('\n', '').strip()  # job description
        position_info = {
            'company': company,
            'position': position,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'require': require,
            'desc': desc
        }
        headers = ['company', 'position', 'salary', 'city', 'work_years', 'education', 'require', 'desc']
        # Append this record to the CSV file
        with open('test.csv', 'a+', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writerow(position_info)  # write the position_info dict as one row
        time.sleep(1)
    def pre_work(self):
        headers = ['company', 'position', 'salary', 'city', 'work_years', 'education', 'require', 'desc']
        with open('test.csv', 'a+', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writeheader()  # write the CSV header row before any data
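    # Note: test.csv is opened in append mode, so every run appends another
    # header row. A minimal guard (a sketch, assuming the standard library's
    # os module is imported) would be to check os.path.exists('test.csv')
    # before calling pre_work and skip the header if the file already exists.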
if __name__ == '__main__':
    test = LagouSpider()
    test.run()