利用MySQL数据库,可以轻松地管理爬虫所得的海量数据。对MySQL以及pymysql不太熟悉的读者,可以先学习MySQL教程和pymysql教程。
本文以https://www.bitpush.news/covid19/为例:
打开页面,按F12(或在页面上点击右键并选择“检查”)打开开发者工具,找到所需爬取的数据在HTML中的位置:
def getdata():
    """Scrape the COVID-19 table and return its columns as three lists.

    Returns:
        A tuple ``(titles, confirms, deaths)``; each element is a list with
        the text of the cells matched for that column, in page order.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the scraper from hanging forever on a stalled server.
    res = requests.get("https://www.bitpush.news/covid19/", timeout=30)
    # Fail loudly instead of parsing an error page into empty columns.
    res.raise_for_status()
    bs = BeautifulSoup(res.text, features="lxml")

    def column_text(css_class):
        # The .text attribute yields a cell's text with the tags stripped.
        return [cell.text for cell in bs.find_all(class_=css_class)]

    titles = column_text('table_card_cell_col_0 table_card_cell_stringwithicon_type')
    confirms = column_text('table_card_cell_col_1 table_card_cell_int_type')
    deaths = column_text('table_card_cell_col_4 table_card_cell_int_type')
    # Returning several values at once packs them into a single tuple.
    return titles, confirms, deaths
初始化:
# Most basic way to open a connection; fill in your own password and the
# name of a database you have already created.
conn = pymysql.connect(user='root', password='', database='')
cur = conn.cursor()  # cursor object used to run SQL statements
# Create the coronavirus table with three varchar(20) columns: title,
# confirm and death. The table only needs to be created once, so
# IF NOT EXISTS lets this code be re-run without failing when the table
# is already there.
cur.execute('create table if not exists coronavirus (title varchar(20), confirm varchar(20),death varchar(20))')
存储部分:
def save_to_MySQL(title, confirm, death):
    """Insert one row into the coronavirus table and commit it.

    Args:
        title: value for the ``title`` column.
        confirm: value for the ``confirm`` column.
        death: value for the ``death`` column.
    """
    insert_sql = "insert into coronavirus (title,confirm,death) VALUES (%s,%s,%s)"
    row = (title, confirm, death)
    # The values are passed separately from the SQL text rather than being
    # formatted into it.
    cur.execute(insert_sql, row)
    # Commit on the cursor's connection so the insert is sent to MySQL.
    cur.connection.commit()
import requests
from bs4 import BeautifulSoup
import pymysql
# Most basic way to open a connection; fill in your own password and the
# name of a database you have already created.
conn = pymysql.connect(user='root', password='', database='')
cur = conn.cursor()  # cursor object used to run SQL statements
# Create the coronavirus table with three varchar(20) columns: title,
# confirm and death. The table only needs to be created once, so
# IF NOT EXISTS lets the script be re-run without failing when the table
# is already there.
cur.execute('create table if not exists coronavirus (title varchar(20), confirm varchar(20),death varchar(20))')
def getdata():
    """Scrape the COVID-19 table and return its columns as three lists.

    Returns:
        A tuple ``(titles, confirms, deaths)``; each element is a list with
        the text of the cells matched for that column, in page order.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the scraper from hanging forever on a stalled server.
    res = requests.get("https://www.bitpush.news/covid19/", timeout=30)
    # Fail loudly instead of parsing an error page into empty columns.
    res.raise_for_status()
    bs = BeautifulSoup(res.text, features="lxml")

    def column_text(css_class):
        # The .text attribute yields a cell's text with the tags stripped.
        return [cell.text for cell in bs.find_all(class_=css_class)]

    titles = column_text('table_card_cell_col_0 table_card_cell_stringwithicon_type')
    confirms = column_text('table_card_cell_col_1 table_card_cell_int_type')
    deaths = column_text('table_card_cell_col_4 table_card_cell_int_type')
    # Returning several values at once packs them into a single tuple.
    return titles, confirms, deaths
def save_to_MySQL(title, confirm, death):
    """Insert one row into the coronavirus table and commit it.

    Args:
        title: value for the ``title`` column.
        confirm: value for the ``confirm`` column.
        death: value for the ``death`` column.
    """
    insert_sql = "insert into coronavirus (title,confirm,death) VALUES (%s,%s,%s)"
    row = (title, confirm, death)
    # The values are passed separately from the SQL text rather than being
    # formatted into it.
    cur.execute(insert_sql, row)
    # Commit on the cursor's connection so the insert is sent to MySQL.
    cur.connection.commit()
if __name__ == '__main__':
    try:
        titles, confirms, deaths = getdata()
        # The original loop stored rows 1..101 (index 0 is skipped,
        # presumably a header cell; verify against the page). Clamp the
        # upper bound to the shortest column so a page with fewer rows
        # does not raise IndexError.
        stop = min(102, len(titles), len(confirms), len(deaths))
        for i in range(1, stop):
            print(i)
            print(titles[i], confirms[i], deaths[i])
            save_to_MySQL(titles[i], confirms[i], deaths[i])
    finally:
        # Close the cursor and the connection even if scraping or an
        # insert fails part-way through.
        cur.close()
        conn.close()