Hands-On Big-Data Analysis of Job Postings (Data Analyst Roles)

1. Scraping the Project Data

To reflect the multi-source, heterogeneous nature of the data, this project scrapes job postings from three sites: Zhaopin (智联), Lagou (拉勾网), and 51Job. This write-up is mainly a record of my learning; if you want the project files, they are in my personal resources section, and your support is much appreciated!

The project is written in Python and runs in Jupyter. It is not a big project, so I cut a few corners.

Code for scraping Zhaopin: the code below worked until January 2, 2024. Zhaopin has since rebuilt its static pages as JS-rendered dynamic pages, so the scraper code needs updating (a possible workaround is sketched after get_html below).

import requests
from lxml import etree
import numpy as np
import pandas as pd
import time 

# Ask for the city to scrape and how many result pages to fetch
def get_city_page():
    city = input('Enter the city to scrape (e.g. 上海): ')
    page = int(input('Enter the number of pages to scrape: '))
    return city, page

# Build the list of search-result URLs
def create_url_lis(city, page):
    # Mapping from city name to Zhaopin's city code
    city_code_dict = {
        '上海':538, '沈阳':599, '济南':702, '青岛':703, '苏州':639, '无锡':636, '宁波':654, 
        '郑州':719, '长沙':749, '福州':681, 
    }
    url_lis = []
    city_code = city_code_dict[city]
    for p in range(page):
        url = 'https://sou.zhaopin.com/?jl={}&kw={}&p={}'.format(city_code,'数据分析',p+1)
        url_lis.append(url)
    return url_lis
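
# For example, with the mapping above, create_url_lis('上海', 2) builds:
#   https://sou.zhaopin.com/?jl=538&kw=数据分析&p=1
#   https://sou.zhaopin.com/?jl=538&kw=数据分析&p=2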

# Fetch the page source for a URL
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
        'Cookie': 'paste-your-own-cookie-here',  # Zhaopin needs a valid session cookie; copy one from your browser after logging in
    }
    html = requests.get(url, headers=headers).text
    return html
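
# NOTE: Since Zhaopin switched to JS-rendered pages (see above), plain requests
# no longer returns the job list. A minimal sketch of one possible workaround
# using Selenium -- an assumption on my part, not the original author's fix;
# it requires the selenium package and a matching Chrome driver:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_html_js(url):
    options = Options()
    options.add_argument('--headless=new')  # run Chrome without a window
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(3)  # crude wait for the JS-rendered job list to load
        return driver.page_source
    finally:
        driver.quit()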

# Parse the page source and extract the fields of interest
def transform_html(response):
    """
    Parse the page source, extract the fields of interest,
    and return them as a DataFrame.
    response: the fetched page source
    """
    # Fields: job title, salary, location, experience, education,
    # job tags, company name, company type, company size
    html = etree.HTML(response)
    # Job titles
    job = html.xpath('//span[@class="iteminfo__line1__jobname__name"]/@title')
    # Salary ranges, stripped of surrounding whitespace
    salary = html.xpath('//p[@class="iteminfo__line2__jobdesc__salary"]/text()')
    salary = [s.strip() for s in salary]
    # Location, experience, and education (up to three <li> items per job card)
    location, experience, education = [], [], []
    require = html.xpath('//ul[@class="iteminfo__line2__jobdesc__demand"]')
    for req in require:
        items = req.xpath('.//li[@class="iteminfo__line2__jobdesc__demand__item"]/text()')
        # Pad missing items with NaN so the columns stay aligned
        location.append(items[0] if len(items) > 0 else np.nan)
        experience.append(items[1] if len(items) > 1 else np.nan)
        education.append(items[2] if len(items) > 2 else np.nan)

    # Job tags (welfare labels), joined into one comma-separated string per job
    job_tag = []
    job_tag_lis = html.xpath('//div[@class="iteminfo__line3__welfare"]')
    for tag in job_tag_lis:
        tag_info = tag.xpath('.//div[@class="iteminfo__line3__welfare__item"]/text()')
        job_tag.append(','.join(tag_info))

    # Company names
    company_name = html.xpath('//span[@class="iteminfo__line1__compname__name"]/text()')

    # Company type and size (two <span> items per job card)
    company_type = []
    company_size = []
    company_detail = html.xpath('//div[@class="iteminfo__line2__compdesc"]')
    for company in company_detail:
        items = company.xpath('.//span[@class="iteminfo__line2__compdesc__item"]/text()')
        company_type.append(items[0] if len(items) > 0 else np.nan)
        company_size.append(items[1] if len(items) > 1 else np.nan)
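    # The post is cut off at this point. The docstring says the function
    # returns a DataFrame, so the following completion is a reconstruction
    # based on the columns collected above (it assumes every job card
    # yielded one entry per field, so the lists are equally long).
    df = pd.DataFrame({
        'job': job, 'salary': salary, 'location': location,
        'experience': experience, 'education': education,
        'job_tag': job_tag, 'company_name': company_name,
        'company_type': company_type, 'company_size': company_size,
    })
    return df

The driver code tying these functions together is also missing from the truncated post. A minimal sketch under the same assumptions (the CSV filename and the 2-second delay are my own choices):

if __name__ == '__main__':
    city, page = get_city_page()
    url_lis = create_url_lis(city, page)
    df_lis = []
    for url in url_lis:
        df_lis.append(transform_html(get_html(url)))
        time.sleep(2)  # small delay between pages to avoid hammering the site
    result = pd.concat(df_lis, ignore_index=True)
    result.to_csv('zhaopin_{}.csv'.format(city), index=False, encoding='utf-8-sig')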
