Scraping World Air Pollution: historical Air Quality Index data with Python

1.1 Site analysis (packet capture)

Open the developer tools in Chrome (or Firefox), refresh the page while the Network panel is recording, and locate the responses that carry the historical data and the idx of every station.

1.1.1 The historical-data packet (GET request)

Chongqing Ankang: https://api.waqi.info/api/attsse/9239/yd.json
Guiyang Ma'an: https://api.waqi.info/api/attsse/1368/yd.json

Comparing the two stations shows that the only part of the URL that changes is the number, i.e. the idx. Opening the packet in Chrome or Postman shows that the data is split by month (one data block per month) and that the payload inside each block is encrypted.
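As a quick check, the stream can be fetched directly with requests; a minimal sketch, printing only the first few event blocks of the Chongqing Ankang station above (still encrypted):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('https://api.waqi.info/api/attsse/9239/yd.json', headers=headers)
# The body looks like a server-sent-event stream: blocks separated by
# blank lines, one encrypted block per month of history.
for block in resp.text.split('\n\n')[:4]:
    print(block[:100])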

1.1.2 The idx of every station worldwide (POST request)

URL: https://api.waqi.info/mapq2/bounds

Inspecting this packet the same way as in 1.1.1 shows that the data key of the response dict maps to the idx of each station.
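A minimal sketch of that request, using the same form fields the full script below sends; only the first entry of data is printed:

import requests

payload = {
    'bounds': "-306.21093750000006,-62.10388252289787,306.5625,78.42019327591201",
    'country': "",
    'inc': "placeholders",
    'viewer': "webgl",
    'zoom': 2,
}
resp = requests.post('https://api.waqi.info/mapq2/bounds', data=payload,
                     headers={'User-Agent': 'Mozilla/5.0'})
print(resp.json()['data'][0])  # each entry of 'data' carries one station's idx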

1.1.3 The packet with the decryption functions

URL: https://aqicn.org/webapp/dist/historic-module-dyn.2b2626b6ef49374f9dcd.js

By setting breakpoints on the site and stepping through, you can watch the encrypted payload pass through a handful of functions in this file and come out as the per-day values of each indicator for one month.
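Copy those functions into a local js file and they can be driven from Python with the PyExecJS library; a minimal sketch, assuming the extracted decoder is saved as ./static/js/test.js with an entry function named s (the names the full script below uses):

import execjs

with open('./static/js/test.js', 'r', encoding='utf-8') as f:
    ctx = execjs.compile(f.read())

# encrypted_item: one 'msg' block taken from the stream in 1.1.1
decoded = ctx.call('s', encrypted_item)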

1.2 Crawling, decrypting and cleaning the data, then writing it to CSV files

  • Get the idx of every station with a requests POST, then loop over them, substituting each idx into the URL below and running the following steps.
  • Treat the idx in the station URL as a variable, e.g. https://api.waqi.info/api/attsse/{idx}/yd.json.
  • Fetch the station's raw data with a requests GET; put the decryption functions (Figures 2.3 and 2.4) into one js file and call them through the execjs library to decrypt the raw data. The result is a deeply nested structure of dicts and lists, from which the needed fields are extracted by key.
  • Merge the series along the time axis into a 2-D array of the form [[time, pm2.5, pm10, O3, NO2, SO2, CO], ...] (a toy example follows this list).
  • Split city on its separators into a country/region hierarchy, replace the characters that Windows file names do not allow with a regex, create the directory path with the os module (e.g. data/空气污染指数历史数据/CN/Guangdong/东莞/东城主山.csv), and finally write the data to a CSV at that path with pandas.
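A toy example of the merged array (the values here are made up; a reading that is missing for a day stays an empty string, exactly as the code below produces):

data_list = [
    ['时间', 'pm25', 'pm10', 'O3', 'NO2', 'SO2', 'CO'],
    ['2022-01-01T00:00:00.000Z', 55, 70, 30, 12, 4, 6],
    ['2022-01-02T00:00:00.000Z', 48, '', 28, 10, 3, 5],  # no PM10 reading that day
]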

Full project: a Python crawler plus a Flask-based interactive dashboard: https://download.csdn.net/download/weixin_66397563/87651644?spm=1001.2014.3001.5503

Note: this article is for learning purposes only; do not use it for anything illegal. Contact me for removal in case of infringement.

import requests
import json
import subprocess
from functools import partial
import time
import os
import pandas as pd
import re
import random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.7 Safari/537.36'
}

# Force utf-8 for the subprocess calls PyExecJS makes: the Windows
# default encoding is gbk, which breaks the Node round trip.
subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")

import execjs


def get_idx():
    """
    :return: the idx of every station worldwide, as a list of strings
    """
    url = 'https://api.waqi.info/mapq2/bounds'
    data = {
        'bounds': "-306.21093750000006,-62.10388252289787,306.5625,78.42019327591201",
        'country': "",
        'inc': "placeholders",
        'viewer': "webgl",
        'zoom': 2
    }
    resp = requests.post(url, data=data, headers=headers).text
    result = []
    for station in json.loads(resp)["data"]:
        idx = station['idx']
        if idx.isdigit():  # skip placeholder entries whose idx is not numeric
            result.append(idx)
    return result


def get_py_json(url):
    '''
    :param url: one station's attsse url
    :return: the encrypted monthly payloads extracted from the event stream
    '''
    resp = requests.get(url, headers=headers).text
    lis = resp.split('\n\n')  # the stream separates events with blank lines
    result = []
    for i in range(1, len(lis) - 1, 2):
        st = lis[i][18:]  # drop the fixed event prefix before the JSON body
        dic = json.loads(st)
        if 'msg' in dic:
            result.append(dic["msg"])
    if result:
        return result[1:]  # drop the first msg block, which holds no month data
    return result


def get_js_function(js_path, func_name, func_args):
    '''
    :param js_path: js file holding the decryption functions
    :param func_name: name of the js function to call
    :param func_args: the encrypted data
    :return: the decrypted data
    '''
    with open(js_path, 'r', encoding="utf-8") as f:
        ctx = execjs.compile(f.read())
    return ctx.call(func_name, func_args)


def get_decode_data(encrypted_list):
    '''
    :param encrypted_list: the encrypted monthly payloads
    :return: the decrypted data
    '''
    items = []
    for item in encrypted_list:
        if item["ps"]:
            data = get_js_function('./static/js/test.js', 's', item)
            items.append(data)
    return items


def get_index_data(items):
    '''
    :param items: the decrypted data
    :return: the per-pollutant series merged along the time axis, plus the city name
    '''
    pollutants = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
    series = {name: [] for name in pollutants}  # pollutant -> [(timestamp, value), ...]
    time_time = []
    city = items[0].get('source').get('city').get('name')
    for item in items:
        for spec in item["species"]:
            name = spec['name']
            if name in series:
                for j in spec['values']:
                    d = j["t"]["d"]
                    time_time.append(d)
                    series[name].append((d, j['v']))

    # Deduplicate the timestamps, sort them chronologically, then rebuild
    # them in the stream's original "YYYY-MM-DDT00:00:00.000Z" format.
    date_time = sorted(time.strptime(i[:10], "%Y-%m-%d") for i in set(time_time))
    data_1 = [time.strftime("%Y-%m-%d", i) + 'T00:00:00.000Z' for i in date_time]

    head_list = ['时间', 'pm25', 'pm10', 'O3', 'NO2', 'SO2', 'CO']
    data_list = [head_list]
    for i in data_1:
        row = [i]
        for name in pollutants:
            value = ''  # a reading missing for that day stays an empty string
            for d, v in series[name]:
                if d == i:
                    value = v
            row.append(value)
        data_list.append(row)
    return data_list, city


def write_file(city, data_list):
    # Replace characters that Windows file names do not allow (plus spaces,
    # dots and full-width punctuation) with '_', then treat ":" as the
    # hierarchy separator: country/region/city/station.
    lst = re.sub(r'[?、 .╲*"<>|,]', '_', city).replace(":", "/").split("/")
    city_1 = '/'
    for i in range(0, len(lst) - 1):
        city_1 += lst[i] + '/'
    path = "data/空气污染指数历史数据" + city_1
    if not os.path.isdir(path):
        os.makedirs(path)
    file_name = path + lst[-1] + '.csv'
    print(file_name)
    pd.DataFrame(data_list).to_csv(file_name, encoding='utf-8', index=False, header=False)


def main():
    print("Air爬虫启动")
    pid = os.getpid()
    print("pid:", pid)
    os.makedirs('./data', exist_ok=True)
    # Record the pid so the crawler process can be stopped from outside.
    with open("./data/air_pid.txt", "w") as f:
        f.write(str(pid))
    for _ in range(100000):
        idx_list = get_idx()
        for i in idx_list:
            url = f'https://api.waqi.info/api/attsse/{i}/yd.json'
            # url.txt records finished stations, so the crawler can resume
            # after a restart without re-downloading them.
            if not os.path.exists('./data/url.txt'):
                open('./data/url.txt', 'w', encoding='utf-8').close()
            with open('./data/url.txt', 'r', encoding='utf-8') as f:
                crawled = f.read().splitlines()
            if url not in crawled:
                try:
                    encryption_list = get_py_json(url)
                    decode_data = get_decode_data(encryption_list)
                    data_list1, city = get_index_data(decode_data)
                    print(city)
                    write_file(city, data_list1)
                    with open('./data/url.txt', 'a', encoding='utf-8') as f:
                        f.write(f'{url}\n')
                except Exception as e:
                    print(f"请求错误:{e}")
                    time.sleep(5)
        time.sleep(random.randint(5, 10))


if __name__ == '__main__':
    main()

