Python爬虫:从登录开始,一条龙爬取拉勾职位信息(selenium + chrome)

直接上代码。写作不易,给个好评吧。

import re
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait


class LagouSpider(object):
    """Selenium-driven crawler that logs in to Lagou and collects job postings.

    Typical use is ``login()`` -> ``mainPage()`` -> ``run()``.  Every parsed
    posting is appended to ``self.info`` as a dict with the keys ``name``,
    ``salary`` and ``city``.
    """

    # Pauses (seconds) that throttle the crawl; override on the instance to tune.
    page_delay = 4      # after clicking "next page", before reading the new list
    link_delay = 2      # between two consecutive detail pages
    detail_delay = 3    # how long a detail window stays open before it is closed
    # Upper bound (seconds) for explicit waits on page elements.
    wait_timeout = 10

    # XPath selectors used by the crawler, kept in one place.
    _LOGIN_FORM = "//div[@data-view='passwordLogin']"
    _SEARCH_INPUT = "//div[@id='search_box']//input[@id='search_input']"
    _SEARCH_BUTTON = "//div[@id='search_box']//input[@id='search_button']"
    _NEXT_BUTTON = "//div[@class='pager_container']/span[last()]"
    _JOB_NAME = "//span[@class='name']"

    def __init__(self, driver=None):
        """Create the spider.

        Args:
            driver: an existing Selenium WebDriver to reuse.  When omitted a
                new ``webdriver.Chrome()`` is started, as before.
        """
        self.driver = webdriver.Chrome() if driver is None else driver
        # Login page URL.
        self.loginUrl = 'https://passport.lagou.com/login/login.html'
        # Collected job records.
        self.info = []

    def _wait_for(self, xpath, timeout=None):
        """Block until an element matching *xpath* is present and return it.

        Raises:
            TimeoutException: if nothing matches within *timeout* seconds
                (``self.wait_timeout`` when *timeout* is None).
        """
        if timeout is None:
            timeout = self.wait_timeout
        return WebDriverWait(driver=self.driver, timeout=timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )

    @staticmethod
    def _first_text(node):
        """Return the first text fragment under *node*, stripped, or ''."""
        fragments = node.xpath(".//text()")
        return fragments[0].strip() if fragments else ""

    @staticmethod
    def _clean_city(text):
        """Remove whitespace and '/' separators from a raw city string."""
        return re.sub(r"[\s/]", "", text)

    def login(self, username='******', password='******'):
        """Open the login page, fill in the password-login form and submit it.

        Args:
            username: account name typed into the user-name box.
            password: password typed into the password box.
            The defaults are the placeholder values the script shipped with;
            pass real credentials to actually sign in.
        """
        self.driver.get(self.loginUrl)
        form = self._LOGIN_FORM

        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # was removed in Selenium 4.3.
        user_box = self.driver.find_element(
            By.XPATH, form + "//div[@data-propertyname='username']/input")
        user_box.clear()
        user_box.send_keys(username)

        password_box = self.driver.find_element(
            By.XPATH, form + "//div[@data-propertyname='password']/input")
        password_box.clear()
        password_box.send_keys(password)

        self.driver.find_element(
            By.XPATH, form + "//div[@data-propertyname='submit']/input").click()

    def mainPage(self, keyword='python'):
        """Type *keyword* into the home-page search box and start the search.

        Waits up to ``self.wait_timeout`` seconds for the search box to appear
        (i.e. for the post-login page to load).

        Raises:
            TimeoutException: if the search box never shows up.
        """
        search_box = self._wait_for(self._SEARCH_INPUT)
        search_box.clear()
        search_box.send_keys(keyword)
        self.driver.find_element(By.XPATH, self._SEARCH_BUTTON).click()

    def run(self):
        """Crawl every result page, following "next page" until it is disabled.

        For each page the current page source is handed to
        ``parse_list_page``.  The loop stops when the pager's last ``span``
        carries the ``pager_next_disabled`` class, or when no pager shows up
        within ``self.wait_timeout`` seconds.
        """
        while True:
            self.parse_list_page(self, self.driver.page_source)

            try:
                next_btn = self._wait_for(self._NEXT_BUTTON)
            except TimeoutException:
                print("No pager found; stopping.")
                break

            # A greyed-out button means this was the last page.  The original
            # code did `pass` here and therefore re-crawled it forever.
            if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                break

            # Click through JavaScript so overlays cannot intercept the click.
            self.driver.execute_script("arguments[0].click();", next_btn)
            # Give the next page time to render before its source is read.
            time.sleep(self.page_delay)

    # NOTE: the three methods below take the instance twice: `delf` is the
    # implicitly bound instance, `self` is the instance passed explicitly by
    # the caller, i.e. `spider.parse_list_page(spider, source)`.  Python only
    # requires ONE instance parameter; the odd signature is kept solely so
    # existing callers keep working.  Only the explicit `self` is used.
    def parse_list_page(delf, self, source):
        """Extract the detail links from a result page and visit each one.

        Args:
            self: the spider instance to operate on (passed explicitly).
            source: HTML source of a search-result page.
        """
        html = etree.HTML(source)
        if html is None:
            return
        # Detail-page links of every position listed on this page.
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(self, link)
            time.sleep(self.link_delay)

    def request_detail_page(delf, self, url):
        """Open *url* in a new window, parse it, close it, return to the list.

        Args:
            self: the spider instance to operate on (passed explicitly).
            url: address of a job detail page.
        """
        print(url)
        list_handle = self.driver.current_window_handle
        known_handles = set(self.driver.window_handles)

        # Pass the URL as a script argument instead of formatting it into the
        # JavaScript text, so quotes inside the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0]);", url)

        new_handles = [h for h in self.driver.window_handles
                       if h not in known_handles]
        if not new_handles:
            print("Could not open a new window for %s" % url)
            return

        # Switch to the window that was just opened so the list page is kept.
        self.driver.switch_to.window(new_handles[-1])
        try:
            try:
                # The new window starts loading asynchronously; wait for the
                # job title before reading the page source.
                self._wait_for(self._JOB_NAME)
            except TimeoutException:
                print("Detail page did not load in time: %s" % url)
            self.parse_detail_page(self, self.driver.page_source)
            time.sleep(self.detail_delay)
        finally:
            # Always close the detail window and go back to the list window,
            # even when parsing fails.
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    def parse_detail_page(delf, self, source):
        """Parse one detail page and append its record to ``self.info``.

        Pages that lack the expected nodes are skipped with a message instead
        of raising ``IndexError``.

        Args:
            self: the spider instance to operate on (passed explicitly).
            source: HTML source of a job detail page.
        """
        html = etree.HTML(source)
        if html is None:
            print("Skipping detail page: empty document")
            return

        names = html.xpath("//span[@class='name']/text()")
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        if not names or len(job_request_spans) < 2:
            print("Skipping detail page: expected fields not found")
            return

        detail_info = {
            'name': names[0],
            'salary': self._first_text(job_request_spans[0]),
            'city': self._clean_city(self._first_text(job_request_spans[1])),
        }
        self.info.append(detail_info)
        # Print only the new record; dumping the whole list each time makes
        # the output grow quadratically.
        print(detail_info)
        print("=" * 40)

if __name__ == '__main__':
    # Script entry point: log in, run the search, then crawl all result pages.
    spider = LagouSpider()
    try:
        spider.login()
        spider.mainPage()
        spider.run()
    finally:
        # Always shut the browser down, even if the crawl fails part-way;
        # otherwise the Chrome/chromedriver processes are left running.
        spider.driver.quit()

你可能感兴趣的:(python爬虫)