Scraping Lagou with Selenium

__author__ = '田明博'
__date__ = '2019/10/20 8:39'

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import time
import csv


class Lagou_spyder(object):
    def __init__(self):
        self.driver = webdriver.Firefox(executable_path=r'D:\BaiduNetdiskDownload\geckodriver.exe')
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='

    def run(self):
        self.driver.get(self.url)  # open the first page of job listings
        self.pre_work()  # write the CSV header row first
        while True:
            source = self.driver.page_source  # source of the current listing page
            self.parse_position(source)  # parse the listings, collect each job's URL, visit its detail page

            # wait until the "next page" button is present in the DOM
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="pager_container"]//span[@action="next"]')))

            # locate the "next page" button
            next_tag = self.driver.find_element_by_xpath('//div[@class="pager_container"]//span[@action="next"]')
            # when the button is greyed out (no more pages), its class contains 'pager_next_disabled'
            if 'pager_next_disabled' in next_tag.get_attribute('class'):
                break
            else:
                # there is a next page: scroll the button into view first
                # (clicking an off-screen element raises an error), then click it
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                next_tag.click()
            time.sleep(1)

    def parse_position(self, source):
        '''
        :param source: source of one listing page
        :return:
        '''
        html = etree.HTML(source)  # build an lxml tree for XPath queries
        position_links = html.xpath('//a[@class="position_link"]/@href')  # extract each job posting's URL
        for link in position_links:  # open each job's detail page
            self.get_detail_source(link)

    def get_detail_source(self, link):
        # open the detail page in a new browser tab
        self.driver.execute_script("window.open('%s')" % link)
        # switch the driver to the new tab; window_handles[1] assumes the new
        # tab comes after the listing tab, which holds for geckodriver here
        self.driver.switch_to.window(self.driver.window_handles[1])
        # explicit wait: continue only after the company info has loaded
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, '//h4[@class="company"]')))

        # source of the detail page
        source = self.driver.page_source
        self.parse_detail_position(source)  # parse the detail page
        self.driver.close()  # close the current detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # switch back to the listing page

    def parse_detail_position(self, source):
        '''
        :param source: source of a detail page
        :return:
        '''
        html = etree.HTML(source)
        # parse the job information
        company = html.xpath('//h4[@class="company"]/text()')[0]  # company
        position = html.xpath('//h1[@class="name"]/text()')[0]  # position
        job_infos = html.xpath('//dd[@class="job_request"]//span')
        salary = job_infos[0].xpath('.//text()')[0].strip()  # salary
        city = job_infos[1].xpath('.//text()')[0].replace('/', '').strip()  # city
        work_years = job_infos[2].xpath('.//text()')[0].strip()  # required work experience
        education = job_infos[3].xpath('.//text()')[0].strip()  # education level
        require = job_infos[4].xpath('.//text()')[0].strip()  # job requirement
        desc = ''.join(html.xpath('//dd[@class="job_bt"]//text()')).replace('\n', '').strip()
        position_info = {
            'company': company,
            'position': position,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'require': require,
            'desc': desc
        }
        headers = ['company', 'position', 'salary', 'city', 'work_years', 'education', 'require', 'desc']

        # append one row to the CSV file
        with open('test.csv', 'a+', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writerow(position_info)  # write the position_info dict as one row
        time.sleep(1)

    def pre_work(self):
        headers = ['company', 'position', 'salary', 'city', 'work_years', 'education', 'require', 'desc']
        with open('test.csv', 'a+', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, headers)
            # write the CSV header first; note that because the file is opened in
            # append mode, re-running the script adds a second header row
            writer.writeheader()


if __name__ == '__main__':
    test = Lagou_spyder()
    test.run()
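
As a side note, if you don't need to watch the browser while the spider runs, Firefox can be started headless. This is a minimal sketch of the option swap, not part of the original script; it assumes Selenium 3.8+ with the same geckodriver path used above:

from selenium import webdriver

options = webdriver.FirefoxOptions()
options.headless = True  # run Firefox without a visible window
driver = webdriver.Firefox(options=options,
                           executable_path=r'D:\BaiduNetdiskDownload\geckodriver.exe')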

Rows written to the CSV file:

[Screenshot: job rows written to test.csv]
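
To sanity-check the output without opening the file in a spreadsheet, the rows can be read back with csv.DictReader. A minimal sketch, assuming the test.csv produced above:

import csv

with open('test.csv', encoding='utf-8', newline='') as fp:
    reader = csv.DictReader(fp)
    for row in reader:
        print(row['company'], row['position'], row['salary'])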
