毕业设计 基于Python的BOSS直聘数据可视化系统

可运行的完整项目,如有需要可私信联系

爬虫部分

import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import pandas as pd
import os
import django
from selenium.webdriver.chrome.service import Service
# Bootstrap Django for this standalone script: name the settings module
# (only if the environment has not already set one) and initialise Django.
# This has to run before the `myApp.models` import that follows.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'boss直聘数据可视化分析.settings')
django.setup()
from myApp.models import *
class spider(object):
    def __init__(self,type,page):
        self.type = type
        self.page = page
        self.spiderUrl = "https://www.zhipin.com/web/geek/job?query=%s&city=100010000&page=%s"

    def startBrower(self):
        """Launch a Chrome instance and return its WebDriver handle.

        The driver binary is looked up as ``chromedriver.exe`` relative to the
        current working directory. The caller owns the returned browser.
        """
        # The driver path is passed through a Service object rather than the
        # older ``executable_path=`` keyword.
        return webdriver.Chrome(service=Service("chromedriver.exe"))

    def main(self,**info):
        if info['page'] < self.page:return
        brower = self.startBrower()
        print('页表页面URL:' + self.spiderUrl % (self.type,self.page))
        brower.get(self.spiderUrl % (self.type,self.page))
        time.sleep(15)
        # return
        # //*[@id="wrap"]/div[2]/div[2]/div/div[1]/div[1]/ul
        job_list = brower.find_elements(by=By.XPATH, value="//ul[@class='job-list-box']/li")
        for index,job in enumerate(job_list):
            try:
                print("爬取的是第 %d 条" % (index + 1))
                jobData = []
                # title  工作名字
                title = job.find_element(by=By.XPATH,
                                         value=".//div[contains(@class,'job-title')]/span[@class='job-name']").text
                # address  地址
                addresses = job.find_element(by=By.XPATH,
                                           value=".//div[contains(@class,'job-title')]//span[@class='job-area']").text.split(
                    '·')
                address = addresses[0]
                # dist 行政区
                if len(addresses) != 1:dist = addresses[1]
                else: dist = ''
                # type  工作类型
                type = self.type
               # // *[ @ id = "wrap"] / div[2] / div[2] / div / div[1] / div[1] / ul / li[5] / div[1] / div / div[2] / ul
                tag_list = job.find_elements(by=By.XPATH,
                                             value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li")
                if len(tag_list) == 2:
                    educational = job.find_element(by=By.XPATH,
                                                   value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li[2]&

你可能感兴趣的:(python,信息可视化,开发语言)