A crawler (also called a web spider or web robot) automatically fetches, or "crawls", information from the internet; at its core it is just a piece of code.
Any high-level language can implement a crawler; it is not limited to Python.
The code simulates a browser sending HTTP or HTTPS requests to a server, then processes the server's response to extract the desired data.
The basic workflow has three steps: fetch the data, parse it, and store it.
Using the urllib standard library
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
__author__ = '陈爽'
# Import the request module from urllib, used to send HTTP/HTTPS requests
from urllib import request

# Fetch the data
def get_data():
    url = 'https://search.51job.com/list/160500,000000,0000,00,9,99,java%25E5%25BC%2580%25E5%258F%2591,2,1.html'
    # Build the request headers; a browser User-Agent helps avoid being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
    }
    # Wrap the URL and headers in a Request object
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    # print(type(response))  # <class 'http.client.HTTPResponse'>
    # print(response.getcode())
    # print(response.info())
    if response.getcode() == 200:
        data = response.read()  # read the response body
        # print(type(data))  # <class 'bytes'>
        data = str(data, encoding='gbk')  # decode bytes to str (this page uses gbk)
        # print(data)
        # Write the data to a file
        with open('index.html', mode='w', encoding='gbk') as f:
            f.write(data)

if __name__ == '__main__':
    get_data()
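The gbk encoding above is hard-coded for this particular page. When the encoding is not known in advance, it can usually be read from the Content-Type response header instead; a minimal sketch (the helper name and the utf-8 fallback are assumptions, not part of the original script):

from urllib import request

def fetch_text(url, fallback='utf-8'):
    # hypothetical helper for illustration
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = request.urlopen(req)
    # get_content_charset() extracts the charset from the Content-Type
    # header and returns None when the server does not declare one
    charset = response.headers.get_content_charset() or fallback
    return response.read().decode(charset)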
There are three ways to look up data in the parsed document (find, find_all, and select), demonstrated below.
pip install beautifulsoup4
from bs4 import BeautifulSoup

# Process and parse the data
def parse_data():
    with open('index.html', mode='r', encoding='gbk') as f:
        html = f.read()
    # Create a BeautifulSoup instance to parse the HTML
    bs = BeautifulSoup(html, 'html.parser')
    '''
    Looking up data:
    1. find('name')  returns the first matching tag
    2. find_all()    returns all matching tags
    3. select()      uses CSS selectors
    '''
    # div = bs.find('div')
    # print(div, type(div))  # <class 'bs4.element.Tag'>
    # metas = bs.find_all('meta')        # returns a result set
    # print(bs.find_all(id='hello'))     # look up by id, returns a result set
    # print(bs.find_all(class_='cauc'))  # look up by class (note the underscore in class_)
    # print(bs.select('#hello'))
    # print(bs.select('p#world span'))
    # Get the text of a tag, including the text of all its descendants
    value = bs.select('#hello')[0].get_text().strip()
    print(len(value))
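To make the difference between find, find_all, and select concrete, here is a self-contained sketch against a tiny hand-written snippet (the markup, id, and class names are invented for illustration):

from bs4 import BeautifulSoup

html = '''
<div id="hello" class="cauc"><span>Hi</span> there</div>
<div class="cauc">second</div>
'''
bs = BeautifulSoup(html, 'html.parser')

print(bs.find('div'))                             # first matching <div> only
print(len(bs.find_all(class_='cauc')))            # 2: all tags with class "cauc"
print(bs.select('#hello span')[0].text)           # 'Hi': CSS descendant selector
print(bs.select('#hello')[0].get_text().strip())  # 'Hi there': tag text plus descendant text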
Install the pymysql module (pip install pymysql) and import it.
import pymysql

def save_to_mysql(data):
    config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',
        'password': 'root',
        'database': 'python',
        'charset': 'utf8',
    }
    conn = pymysql.connect(**config)
    cursor = conn.cursor()
    # Named placeholders (%(name)s) are filled from each dict in data
    sql = '''
        insert into t_job
            (title, company, address, salary, pubDate)
        values
            (%(title)s, %(company)s, %(address)s, %(salary)s, %(pubDate)s)
    '''
    cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
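A usage sketch: executemany expects an iterable of dicts whose keys match the named placeholders, so parsed job records can be passed in directly. This assumes the t_job table already exists in the python database; the sample row below is invented:

# hypothetical sample row; real data would come from parsing index.html
jobs = [
    {'title': 'Java开发工程师', 'company': '某公司', 'address': '广州',
     'salary': '1-1.5万/月', 'pubDate': '06-15'},
]
save_to_mysql(jobs)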
Install the openpyxl module (pip install openpyxl) and import Workbook from it.
from openpyxl import Workbook

def save_to_Excel(data):
    # Create a workbook
    wbook = Workbook()
    # Create a worksheet named 招聘信息 (job listings) at index 0
    sheet = wbook.create_sheet('招聘信息', 0)
    # Write the header row (job title, company, location, salary, post date),
    # then one row per record
    sheet.append(['职位名', '公司名', '工作地点', '薪资', '发布时间'])
    for item in data:
        row = [item['title'], item['company'], item['address'], item['salary'], item['pubDate']]
        sheet.append(row)
    # Save the workbook to disk
    wbook.save('51job.xlsx')
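To check the result, the file can be read back with openpyxl's load_workbook; a minimal sketch:

from openpyxl import load_workbook

wbook = load_workbook('51job.xlsx')
sheet = wbook['招聘信息']
# values_only=True yields plain cell values instead of Cell objects
for row in sheet.iter_rows(values_only=True):
    print(row)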
Another example: Douban's movie API returns JSON rather than HTML, so json.loads does the parsing instead of BeautifulSoup.

from urllib import request
import json

# Fetch the data
def get_data():
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=500&page_start=0'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    if response.getcode() == 200:
        result = response.read()
        # print(type(result))  # <class 'bytes'>
        return result

# Process the data
def parse_data(html):
    # Convert the JSON string into a dict
    data = json.loads(html)
    # print(type(data), data)
    movies = data['subjects']
    for movie in movies:
        print(movie['title'], movie['rate'])

if __name__ == '__main__':
    parse_data(get_data())
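The page_limit and page_start query parameters suggest the endpoint is paginated; a hedged sketch of collecting every page (the stop condition, an empty subjects list, is an assumption based on the parameter names):

from urllib import request
import json

def get_page(page_start, page_limit=20):
    url = ('https://movie.douban.com/j/search_subjects'
           '?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank'
           '&page_limit={}&page_start={}'.format(page_limit, page_start))
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with request.urlopen(req) as response:
        return json.loads(response.read())['subjects']

movies = []
start = 0
while True:
    page = get_page(start)
    if not page:  # assumed end-of-data marker
        break
    movies.extend(page)
    start += len(page)
print(len(movies))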
Steps:
1. Fetch the data
2. Process the data (clean it)
3. Store the data
4. Visualize the data (see the sketch below)
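Step 4 is not implemented in the code above; a minimal sketch using matplotlib (the library choice is an assumption, and the two movie dicts are invented sample data in the shape returned by the Douban API):

import matplotlib.pyplot as plt

# sample data shaped like the Douban API's 'subjects' entries
movies = [{'title': '电影A', 'rate': '8.7'}, {'title': '电影B', 'rate': '9.1'}]

titles = [m['title'] for m in movies]
rates = [float(m['rate']) for m in movies]  # ratings arrive as strings

plt.bar(titles, rates)
plt.ylabel('rating')
plt.title('Douban movie ratings')
plt.show()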