前程无忧数据的爬取

本文爬取前程无忧(51job)的职位数据,用到的知识点是 lxml 和 selenium 两个模块。目前只是简单地爬取了一页数据,后续还会继续更新。

import re
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class JobSpider(object):
    """Scrape one 51job search-results page and every detail page it links to.

    A Chrome WebDriver loads each page; the rendered HTML is parsed with
    lxml and searched with regular expressions.
    """

    # Placeholder only: the real browser is created per instance in
    # __init__, so defining/importing this class does not launch Chrome and
    # separate spiders never share a driver that has already been quit.
    driver = None

    def __init__(self):
        """Store the search-results URL and start a Chrome WebDriver."""
        self.url = (
            'https://search.51job.com/list/040000%252C020000,000000,0000,00,9,99,python,2,1.html'
            '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
            '&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1'
            '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
            '&specialarea=00&from=&welfare='
        )
        self.driver = webdriver.Chrome()

    def run(self):
        """Entry point: load the search page, process it, then close the browser."""
        try:
            self.driver.get(self.url)
            source = self.driver.page_source
            self.select_page(source)
            time.sleep(2)
        finally:
            # Always release the browser, even if scraping raised.
            self.driver.quit()

    def select_page(self, source):
        """Parse the search page and visit each detail-page link found in it.

        Args:
            source: HTML text of the search-results page.
        """
        page_html = etree.HTML(source)
        self.parse_home(page_html)
        # Per the original author's note these hrefs are the detail-page
        # links; job title and company name are read from the detail page
        # rather than from this listing.
        htmls = page_html.xpath('/html/body/div[2]/div[4]/div/p/span/a/@href')
        for html in htmls:
            self.requests_page(html)
            time.sleep(2)  # pause between detail-page requests

    def parse_home(self, page_html):
        """Run the listing regex over the serialized search page.

        Args:
            page_html: lxml element tree of the search-results page.

        Returns:
            The list produced by ``re.findall`` (three groups per match;
            the original author intended location, salary and publish date).
        """
        # Regexes need text, so serialize the tree back to a string.
        html_str = etree.tostring(page_html, encoding='utf-8').decode('utf-8')
        # NOTE(review): this pattern looks like it lost its HTML tag
        # delimiters when the post was extracted; as written every group
        # matches the empty string. Kept unchanged - restore the real
        # delimiters from the live page markup before relying on the result.
        contents = re.findall(r'(.*?).*?(.*?).*?(.*?)', html_str, re.DOTALL)
        return contents

    def requests_page(self, html):
        """Load one detail page in the browser and hand its HTML to parse_page.

        Args:
            html: URL of the detail page.
        """
        self.driver.get(html)
        source = self.driver.page_source
        self.parse_page(source)

    def parse_page(self, source):
        """Run the job-requirements regex over a detail page.

        Args:
            source: HTML text of the detail page.

        Returns:
            The list of captured strings produced by ``re.findall``.
        """
        html = etree.HTML(source)
        # Serialize the tree to a string so a regular expression can be used.
        htmls = etree.tostring(html, encoding='utf-8').decode('utf-8')
        # XPaths the original author noted for the job title and company name:
        #   /html/body/div[3]/div[2]/div[2]/div/div[1]/h1/@title
        #   /html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title
        # NOTE(review): the original literal was split across three lines
        # (a syntax error) and presumably had HTML tags around the group
        # that were lost in extraction. It is reconstructed here as the
        # literal newline-delimited pattern - TODO confirm the intended
        # delimiters against the detail-page markup.
        yaoqius = re.findall(r'\n(.*?)\n', htmls, re.DOTALL)  # job requirements
        # The author's follow-up step stripped tags from each match with
        # re.sub('<.*?>', '', yaoqiu) before printing it.
        return yaoqius


if __name__ == '__main__':
    spider = JobSpider()
    spider.run()

你可能感兴趣的:(爬虫)