新型肺炎数据爬取与分析实战

文章目录

      • 导包
      • 爬取数据
      • 数据清洗
      • 数据分析
      • 数据可视化
      • DataFrame数据存入MySQL数据库
      • 将数据存成CSV/XLSX 文件
          • DataFrame数据存成csv文件
          • DataFrame数据存成xlsx文件

导包

import requests
import json
import pandas as pd

爬取数据

# Fetch the epidemic data.
# json.loads() parses a JSON string (for example one line read from a file
# with readline()) into Python objects.
# Known endpoints:
#   https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5
#   https://view.inews.qq.com/g2/getOnsInfo?name=disease_other
def getData():
    """Download and decode the ``disease_h5`` payload.

    The response body is a JSON object whose ``data`` field is itself a
    JSON-encoded string, so the payload is decoded twice.

    Returns:
        The decoded ``data`` payload, or ``None`` when the server does not
        answer with HTTP 200.

    Raises:
        requests.RequestException: on connection errors or when the server
            does not respond within the timeout.
    """
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional argument of
    # requests.get() is ``params``, which would append the user-agent to the
    # query string instead of sending it as a request header.
    # The timeout keeps the call from blocking forever on a stalled server.
    r = requests.get(url, headers=headers, timeout=10)
    if r.status_code != 200:
        return None
    return json.loads(json.loads(r.text)['data'])
# Fetch the payload once and keep it for all later steps.
data_dict = getData()
data_dict
# lastUpdateTime: time of the last update   chinaTotal: cumulative values   chinaAdd: daily increase, same fields as chinaTotal
# confirm: confirmed   heal: recovered   dead: deaths   nowConfirm: currently confirmed   suspect: suspected   nowSevere: severe cases
data_dict.keys()
# Example output:
#dict_keys(['lastUpdateTime', 'chinaTotal', 'chinaAdd', 'isShowAdd', 'showAddSwitch', 'areaTree', 'chinaDayList', 'chinaDayAddList', 'dailyNewAddHistory', 'dailyHistory', 'wuhanDayList', 'articleList'])

数据清洗

# Province-level entries are the children of the first areaTree node
# (presumably China -- confirm against the payload).
provinces = data_dict['areaTree'][0]['children']
# Print the name of every province.
for province in provinces:
    print(province['name'])
# Print today's figures for every province.
for province in provinces:
    print(province['today'])
# Print the cumulative figures for every province.
for province in provinces:
    print(province['total'])
# Build one flat record per province (later turned into a DataFrame).
# Each record is a copy of the 'total' dict, so adding the 'name' key does
# not modify the dictionaries stored inside data_dict.
province_list = [
    {**province['total'], 'name': province['name']}
    for province in provinces
]
province_list

# Columns kept for the analysis, selected by column name.
wanted_columns = ['confirm', 'dead', 'deadRate', 'heal', 'healRate', 'name', 'suspect']
# Dtypes to drop; select_dtypes takes include= (keep) or exclude= (drop).
unwanted_dtypes = ['bool']

province_df = pd.DataFrame(data=province_list)
province_df
# Keep only the columns of interest.
province_df = province_df[wanted_columns]
province_df
# Drop any column whose dtype is boolean.
province_df = province_df.select_dtypes(exclude=unwanted_dtypes)
province_df

数据分析

# Turn the province names and the cumulative confirmed counts into plain lists.
province_name = province_df['name'].tolist()
province_confirm = province_df['confirm'].tolist()

# zip() walks both lists in lockstep and yields one (name, confirmed) tuple
# per province.
for pair in zip(province_name, province_confirm):
    print(pair)

数据可视化

# pyecharts provides the map chart used for the visualisation.
from pyecharts.charts import Map
from pyecharts import options as opts  # colours and titles

# (province name, confirmed count) pairs used as the map data.
map_data = list(zip(province_name, province_confirm))

# Create the map and fill in the data.
china_map = Map()
china_map.add('全国疫情分布', map_data, 'china')

# Render the map inline in the notebook.
china_map.render_notebook()

新型肺炎数据爬取与分析实战_第1张图片

# Colour configuration: must be a list of dicts, one per value band.
# Closed bands are given as (min, max, colour); the top band has no upper bound.
bounded_bands = [
    (1, 9, '#FFE0E0'),
    (10, 99, '#FFC0C0'),
    (100, 499, '#FF9090'),
    (500, 999, '#FF6060'),
    (1000, 9999, '#FF3030'),
]
pieces = [
    {'min': low, 'max': high, 'color': colour}
    for low, high, colour in bounded_bands
]
pieces.append({'min': 10000, 'color': '#DD0000'})


# Create the map and fill in the data.
china_map = Map()
china_map.add('全国疫情分布', list(zip(province_name, province_confirm)), 'china')
# Title plus a piecewise visual map that applies the colour bands above.
title_options = opts.TitleOpts(title='中国加油!武汉加油!')
visual_map_options = opts.VisualMapOpts(is_piecewise=True, pieces=pieces)
china_map.set_global_opts(title_opts=title_options, visualmap_opts=visual_map_options)
# Render the map inline in the notebook.
china_map.render_notebook()

新型肺炎数据爬取与分析实战_第2张图片

DataFrame数据存入MySQL数据库

from sqlalchemy import create_engine
# Database connection URL (MySQL through the pymysql driver).
conn = "mysql+pymysql://root:@127.0.0.1:3306/feiyanInfo?charset=utf8"
# DataFrame.to_sql() is documented to take a SQLAlchemy engine/connection,
# so build the engine explicitly from the URL instead of passing the string.
engine = create_engine(conn)
# Nationwide totals for the current day. Work on a copy so that adding the
# 'date' key does not modify data_dict['chinaTotal'].
total = dict(data_dict['chinaTotal'])
# Keep only the date part (text before the first whitespace) of the timestamp.
total['date'] = data_dict['lastUpdateTime'].split()[0]
# One-row frame; built from a mixed Series, so every column starts as object dtype.
total_df = pd.DataFrame(pd.Series(total)).T
total_df
# Type conversion. Writing a converted column back with
# ``total_df.loc[:, col] = ...`` can leave the original object dtype in place
# (recent pandas versions write into the existing column), so convert with
# astype() and rebind the frame instead.
count_columns = total_df.columns.drop('date')
total_df = total_df.astype({column: 'int32' for column in count_columns})
total_df['date'] = pd.to_datetime(total_df['date'])
# Show the resulting dtypes as a quick sanity check.
total_df.info(verbose=False)
# Append the row to the china_total table (created on first use).
total_df.to_sql('china_total', engine, index=False, if_exists='append')

将数据存成CSV/XLSX 文件

# Country-level data: one record per top-level areaTree entry.
update_time = data_dict['lastUpdateTime']
# Each record is a copy of the entry's 'total' dict extended with the daily
# new confirmed count, the name and the update time; copying keeps the
# dictionaries inside data_dict unchanged.
country_list = [
    {
        **country['total'],
        'add_confirm': country['today']['confirm'],
        'name': country['name'],
        'date': update_time,
    }
    for country in data_dict['areaTree']
]
country_df = pd.DataFrame(country_list)
country_df
# Drop boolean columns.
country_df = country_df.select_dtypes(exclude=['bool'])
# Convert the rate columns to float. astype() with a rebind is used because
# assigning through ``.loc[:, cols] = ...`` can leave the columns' previous
# dtype in place.
country_df = country_df.astype({'deadRate': 'float32', 'healRate': 'float32'})
# Convert the timestamp column to datetime.
country_df['date'] = pd.to_datetime(country_df['date'])
country_df
DataFrame数据存成csv文件
# Save the country-level table to country_df.csv. No index argument is
# given here, unlike the xlsx export of city_df which passes index=False.
country_df.to_csv('country_df.csv')
# City-level data for every province under the first areaTree entry.
update_time = data_dict['lastUpdateTime']
city_list = []
for pro in data_dict['areaTree'][0]['children']:
    for city in pro['children']:
        # Copy the city's 'total' dict so the extra keys are not written
        # back into data_dict.
        city_list.append({
            **city['total'],
            'add_confirm': city['today']['confirm'],
            'city_name': city['name'],
            'province_name': pro['name'],
            'date': update_time,
        })
city_df = pd.DataFrame(city_list)
# Drop boolean columns.
city_df = city_df.select_dtypes(exclude=['bool'])
# Convert the rate columns to float. astype() with a rebind is used because
# assigning through ``.loc[:, cols] = ...`` can leave the columns' previous
# dtype in place.
city_df = city_df.astype({'deadRate': 'float32', 'healRate': 'float32'})
# Convert the timestamp column to datetime.
city_df['date'] = pd.to_datetime(city_df['date'])
city_df
DataFrame数据存成xlsx文件
# Save the city-level table to city_df.xlsx on a sheet named 'city',
# without writing the row index.
city_df.to_excel('city_df.xlsx',sheet_name='city',index =False)

你知道的越多,你不知道的越多。
有道无术,术尚可求,有术无道,止于术。
如有其它问题,欢迎大家留言,我们一起讨论,一起学习,一起进步

你可能感兴趣的:(大数据学习,人工智能)