基于LSTM的气候变化预测源码

import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tensorflow.keras import Model
from tensorflow.keras.layers import Input,LSTM,Dropout,Dense,Activation,Conv1D,AveragePooling1D,Bidirectional,Add
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from sklearn.preprocessing import LabelEncoder

# # 转换流水线
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
pd.set_option('display.max_rows', None)#显示全部行
pd.set_option('display.max_columns', None)#显示全部列
# 加载数据
WEATHER_PATH = './datasets'
def load_weather_data(weather_path):
    csv_path = os.path.join(weather_path,"weather.csv")
    return pd.read_csv(csv_path)
#查看数据
weather = load_weather_data(WEATHER_PATH)
print(weather.columns)
print(weather.info())
print(weather.describe())


# 数据初步清理
# 处理时间属性
def Clean_weather(weather):
    # print(weather["cloud"].describe())
    date = pd.to_datetime(weather['date'].apply(lambda x: str(x))) #取值替换将其转换为字符串
    weather['year'] =date.dt.year  #提取年
    weather['month'] = date.dt.month  #提取月
    weather['week'] = date.dt.weekofyear  #提取周
    weather['quarter'] = date.dt.to_period('Q').astype('str')[:-2].apply(lambda x:x[-1]).astype('int') #将日期以季度为单位
    weather['day'] = date.dt.dayofyear #提取天
    # 将无效数据替换成NAN
    weather[weather==999999] = np.NaN
    weather[weather==999990] = np.NaN
    # 去掉wind_direction和phenomenon属性
    #weather = weather.drop(["wind_direction","phenomenon"],axis=1)
    return weather
weather = Clean_weather(weather)
weather.to_csv('./datasets/clean_weather.csv',index=False)

# 数据可视化
plt.rcParams['font.family'] = ['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号

# 绘制某个属性关于季度和地理位置的变化情况
def plot_of_quarter_city(X,attribe):
    # 将属性按照季度和城市分组
    df_group = X[attribe].groupby([X['quarter'],X['city']]).mean()
    # 对多级索引中的city进行重命名
    #df_group = df_group.rename({'重庆市':'chongqing','北京市':'beijing','上海市':'shanghai'},axis='index')
    # 将series 按照city展开成一个表格
    pop_df = df_group.unstack()
    return pop_df

print(weather.describe())

# 每个城市在每个季度的可见度
df_vb_group1 = plot_of_quarter_city(weather,'visibility')
# 对多级索引重新命名
df_vb_group1.plot(kind='bar')
plt.title("可见度变化情况")
plt.show()

# 绘制降雨量
df_rain20_group = plot_of_quarter_city(weather,'rain20')
#df_rain20_group.plot(kind='bar')
plt.title("20时降雨量")
df_rain08_group = plot_of_quarter_city(weather,'rain08')
df_rain08_group.plot(kind='bar')
plt.title("08时降雨量")
plt.show()

# 温度变化情况
df_tempature_group = plot_of_quarter_city(weather,'temperature')
df_tempature_group.plot(kind='bar')
plt.title("温度变化情况")
plt.show()

# 湿度变化情况
df_humidity_group = plot_of_quarter_city(weather,'humidity')
df_humidity_group.plot(kind='bar')
plt.title("湿度变化情况")
plt.show()

# 气压变化情况
df_pressure_group = plot_of_quarter_city(weather,'pressure')
df_pressure_group.plot(kind='bar')
plt.title("气压变化情况")
plt.show()

# 风速变化情况
df_windspeed_group = plot_of_quarter_city(weather,'wind_speed')
df_windspeed_group.plot(kind='bar')
plt.title("风速变化情况")
plt.show()

# 云量变化情况
df_cloud_group = plot_of_quarter_city(weather,'cloud')
df_cloud_group.plot(kind='bar')
plt.title("云量变化情况")
plt.show()

# 各个属性相互之间的相关性
from pandas.plotting import scatter_matrix
attributes = ['wind_speed','temperature', 'humidity', 'rain20', 'cloud', 'visibility']
scatter_matrix(weather[attributes],figsize=(12,8))
plt.show()

#生成数据集

## 标签具有单个属性
# 选择窗口 单个属性
def windows_select(data_perpared,feature_size,squence_length,temper_dim):
    features = []
    labels = []
    for i in range(data_perpared.shape[0]-squence_length):
        x = np.array(data_perpared[i:i+squence_length,:]).flatten()
        y = data_perpared[i+squence_length,temper_dim] # 第temper_dim维是温度
        features.append(x)
        labels.append(y)
    features = np.array(features)
    labels = np.array(labels)
    return features,labels

def shuffle_data(features,labels): #将序列中的所有数据随机排序
    # shuffle data
    shuffle_indicies = np.random.permutation(features.shape[0])
    features = features[shuffle_indicies,:]
    labels = labels[shuffle_indicies]
    return features,labels

# 数据预处理
def get_data(data_perpared,is_joint=False,squence_length=20,feature_size=7,temper_dim=2,train_ratio=0.8,validate_ratio=0.1):
    if not is_joint:
        data_perpared = data_perpared[:,:feature_size] # 取前面7维的数据
        features,labels =windows_select(data_perpared,feature_size,squence_length,temper_dim)
        features,labels = shuffle_data(features,labels)
    else:
        data_perpared[0] = data_perpared[0][:,:feature_size]
        features,labels = windows_select(data_perpared[0],feature_size,squence_length,temper_dim)
        for data in data_perpared[1:]:
            data = data[:,:feature_size] # 取前面7维的数据
            # 对一个地级市内的数据进行分组
            f,l =windows_select(data,feature_size,squence_length,temper_dim)
            features = np.vstack((features,f)) # 垂直拼接
            labels = np.hstack((labels,l))  # 水平拼接
        features,labels = shuffle_data(features,labels)
    print("features.shape",features.shape)
    print("labels.shape",labels.shape)
    # train samples
    train_row = round(features.shape[0]*train_ratio)
    validate_num = round(features.shape[0]*validate_ratio)
    test_num = features.shape[0]-train_row-validate_num
    x_train = np.reshape(features[:train_row,:],(train_row,squence_length,feature_size))
    y_train = np.reshape(labels[:train_row],(train_row,-1))
    # validation samples
    x_val = np.reshape(features[train_row:train_row+validate_num,:],(validate_num,squence_length,feature_size))
    y_val = np.reshape(labels[train_row:train_row+validate_num],(validate_num,-1))
    # test samples
    x_test = np.reshape(features[train_row+validate_num:,:],(test_num,squence_length,feature_size))
    y_test = np.reshape(labels[train_row+validate_num:],(test_num,-1))
    print("train_samples:",x_train.shape,y_train.shape)
    print("validate_samples:",x_val.shape,y_val.shape)
    print("test_samples:",x_test.shape,y_test.shape)
    return (x_train,y_train,x_val,y_val,x_test,y_test)

### 标签具有多个属性
# 选择窗口 多个属性
def windows_select_multi(data_perpared,feature_size,squence_length,temper_dim):
    features = []
    labels = []
    for i in range(data_perpared.shape[0]-squence_length):
        x = np.array(data_perpared[i:i+squence_length,:]).flatten()
        y = data_perpared[i+squence_length,:temper_dim] # 前temper_dim维
        features.append(x)
        labels.append(y)
    features = np.array(features)
    labels = np.array(labels)
    return features,labels

def shuffle_data_multi(features,labels):
    # shuffle data
    shuffle_indicies = np.random.permutation(features.shape[0])
    features = features[shuffle_indicies,:]
    labels = labels[shuffle_indicies,:]
    return features,labels


# 数据预处理
def get_data_multi(data_perpared,is_joint=False,squence_length=20,feature_size=26,temper_dim=11,train_ratio=0.8,validate_ratio=0.1):
    # data_perpared 是二维数据
    if not is_joint:
        data_perpared = data_perpared[:,:feature_size] # 取前面7维的数据
        features,labels =windows_select_multi(data_perpared,feature_size,squence_length,temper_dim)
        features,labels = shuffle_data_multi(features,labels)
    # data_perpared 是三维数据,(county_num,feature_size,temper_dim)
    else:
        data_perpared[0] = data_perpared[0][:,:feature_size]
        features,labels = windows_select_multi(data_perpared[0],feature_size,squence_length,temper_dim)
        for data in data_perpared[1:]:
            data = data[:,:feature_size] # 取前面7维的数据
            # 对一个地级市内的数据进行分组
            f,l =windows_select_multi(data,feature_size,squence_length,temper_dim)
            features = np.vstack((features,f)) # 垂直拼接
            labels = np.vstack((labels,l))  # 垂直拼接
        features,labels = shuffle_data_multi(features,labels)
    print("features.shape",features.shape)
    print("labels.shape",labels.shape)
    # train samples
    train_row = round(features.shape[0]*train_ratio)
    validate_num = round(features.shape[0]*validate_ratio)
    test_num = features.shape[0]-train_row-validate_num
    x_train = np.reshape(features[:train_row,:],(train_row,squence_length,feature_size))
    y_train = np.reshape(labels[:train_row,:],(train_row,temper_dim))
    # validation samples
    x_val = np.reshape(features[train_row:train_row+validate_num,:],(validate_num,squence_length,feature_size))
    y_val = np.reshape(labels[train_row:train_row+validate_num,:],(validate_num,temper_dim))
    # test samples
    x_test = np.reshape(features[train_row+validate_num:,:],(test_num,squence_length,feature_size))
    y_test = np.reshape(labels[train_row+validate_num:,:],(test_num,temper_dim))
    print("train_samples:",x_train.shape,y_train.shape)
    print("validate_samples:",x_val.shape,y_val.shape)
    print("test_samples:",x_test.shape,y_test.shape)
    return (x_train,y_train,x_val,y_val,x_test,y_test)

# 模型设计
# 多对一
def BLSTM_model(input_shape):
    x_in = Input(input_shape,name='input')
    x = Conv1D(64,2,padding='same',name='conv1')(x_in)
    x = AveragePooling1D(2,name='apool1')(x)
    x1 = Bidirectional(LSTM(80,go_backwards=False),name='forward_lstm')(x)
    x1 = Dropout(0.3,name='drop1')(x1)
    x2 = Bidirectional(LSTM(80,go_backwards=True),name='backward_lstm')(x)
    x2 = Dropout(0.3,name='drop2')(x2)
    x = Add(name='add')([x1,x2])
    x = Dense(100,activation='relu',name='dense1')(x)
    x = Dropout(0.3,name='drop3')(x)
    x = Dense(50,activation='relu',name='dense2')(x)  # 70
    x = Dropout(0.2,name='drop4')(x)
    x = Dense(10,activation='relu',name='dense3')(x)  # 20
    x = Dropout(0.1,name='drop5')(x)
    x = Dense(1,activation='sigmoid',name='dense4')(x)  #linear
    return Model(x_in,x,name='BLSTM')
# 多对多
def BLSTM_model_multi(input_shape,classes):
    x_in = Input(input_shape,name='input')
    x = Conv1D(64,2,padding='same',name='conv1')(x_in)
    x = AveragePooling1D(2,name='apool1')(x)
    x1 = Bidirectional(LSTM(80,go_backwards=False),name='forward_lstm')(x)
    x1 = Dropout(0.3,name='drop1')(x1)
    x2 = Bidirectional(LSTM(80,go_backwards=True),name='backward_lstm')(x)
    x2 = Dropout(0.3,name='drop2')(x2)
    x = Add(name='add')([x1,x2])
    x = Dense(100,activation='relu',name='dense1')(x)
    x = Dropout(0.3,name='drop3')(x)
    x = Dense(50,activation='relu',name='dense2')(x)  # 70
    x = Dropout(0.2,name='drop4')(x)
    x = Dense(10,activation='relu',name='dense3')(x)  # 20
    x = Dropout(0.1,name='drop5')(x)
    x = Dense(classes,activation='sigmoid',name='dense4')(x)  #linear
    return Model(x_in,x,name='BLSTM')

#回归模型的训练和测试
# 评价指标
def evaluate_metrics(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print("mse: ", mse)
    print("mae: ", mae)
    print('rmse: ', rmse)
    print("r2_score:", r2)


model_path = './checkpoint/weights-temp_acc=0.97.h5'


# 测试模型
def test(model, x_test, y_test):
    model.load_weights(model_path)
    # predict
    y_pred = model.predict(x_test)
    # 评价指标
    evaluate_metrics(y_test, y_pred)
    # plot
    y_pred = y_pred.flatten()
    y_test = y_test.flatten()
    return y_pred, y_test

# 训练和预测
def train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,is_train=True):
    input_shape = (x_train.shape[1],x_train.shape[2])
    # train model
    model = BLSTM_model(input_shape)
    model.compile(loss="mse",optimizer=Adam())
    print(model.summary())
    if is_train:
        cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5',monitor='val_loss',
                                  save_best_only=True,mode='min',period=20)
        history = model.fit(x_train,y_train,batch_size=32,epochs=100,
                            validation_data=(x_val,y_val),shuffle=True,
                           callbacks=[cb_ckpt,
                            EarlyStopping(monitor='val_loss',patience=10)])
        plt.plot(history.history['loss'],label='train')
        plt.plot(history.history['val_loss'],label='test')
        plt.legend()
        plt.title("损失值对比图")
        plt.show()
    else:
        y_pred,y_test = test(model,x_test,y_test)
        return y_pred,y_test

# 在本数据集上测试回归模型 (预测温度)
num_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy="median")),# 处理缺失值
    ('std_scaler',MinMaxScaler()),# 最大最小归一化
])
# # 对weather 进行数据清理
num_weather = weather.drop(["date",'city','county','station','year','phenomenon'],axis=1)
num_attribs = list(num_weather.columns)
# 处理数值和文本属性的流水线
full_pipeline = ColumnTransformer([
    ("num",num_pipeline,num_attribs),
    ("city",OneHotEncoder(),['city']),
])

data_perpared = full_pipeline.fit_transform(weather)
weather_clean = pd.DataFrame(data_perpared,columns=num_attribs+['shanghai','beijing', 'chongqin'])
print(weather_clean.columns)
weather_clean[['date','county']] = weather[['date','county']]

def generate_features(weather_clean):
    # 按照时间和地区进行排序
    sortweather = weather_clean.sort_values(by=['county','date'],ascending=(False,True))
    data_perpared_countys = []
    # 按照地级市进行分组
    for name,group in sortweather.groupby('county'):
        group = group.drop(['date','county'],axis=1)
        group_array = np.array(group)
        data_perpared_countys.append(group_array)
    return data_perpared_countys

data_perpared_countys = generate_features(weather_clean)
print(len(data_perpared_countys))


# 训练模型
# data_perpared_countys:处理后气象数据(数组),squence_length:窗口大小,temper_dim:温度的维度, feature_size: 特征数
# load data
x_train,y_train,x_val,y_val,x_test,y_test = get_data(data_perpared_countys,True,30,26,2)
#train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,is_train=True)

# 测试模型
model_path = './checkpoint/weights-temp_acc=0.97.h5'
print(data_perpared_countys[0].shape)
y_pred,y_test = train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,False)

# 绘制温度变化图形
# # 保留温度的最大最小值
temper_max = np.max(weather['temperature'])
temper_min = np.min(weather['temperature'])
# 还原温度值
y_pred_orignal = np.array(y_pred)*(temper_max-temper_min)+temper_min
y_test_orignal = np.array(y_test)*(temper_max-temper_min)+temper_min

evaluate_metrics(y_pred_orignal,y_test_orignal)

fig = plt.figure(figsize=(12,8))
plt.plot(y_test,label='y_test',alpha = 0.3)
plt.plot(y_pred,label='y_pred',alpha = 0.3)
plt.title("温度值对比")
plt.legend()
plt.show()
fig1 = plt.figure(figsize=(12,8))
plt.plot(y_pred_orignal,label='y_pred',alpha = 0.3)
plt.plot(y_test_orignal,label='y_test',alpha = 0.3)
plt.title("原始温度值对比")
plt.legend()
plt.show()



# 预测可见度
corr_matrix = weather_clean.corr()
print(corr_matrix['visibility'].sort_values(ascending=False)) #各种天气情况可见度排名

## 训练模型
# load data
x_train,y_train,x_val,y_val,x_test,y_test = get_data(data_perpared_countys,True,30,26,7)
#train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,is_train=True)

## 测试模型
model_path = './checkpoint/weights-visibility_acc=0.78.h5'
y_pred,y_test = train_and_predict(x_train,y_train,x_val,y_val,x_test,y_test,False)

# 绘制可见度变化图形
# # 保留可见度的最大最小值
visbi_max = np.max(weather['visibility'])
visbi_min = np.min(weather['visibility'])
# 还原温度值
y_pred_orignal = np.array(y_pred)*(visbi_max-visbi_min)+visbi_min
y_test_orignal = np.array(y_test)*(visbi_max-visbi_min)+visbi_min

evaluate_metrics(y_pred_orignal,y_test_orignal)

fig = plt.figure(figsize=(12,8))
plt.plot(y_test,label='y_test',alpha = 0.3)
plt.plot(y_pred,label='y_pred',alpha = 0.3)
plt.title("可见度对比")
plt.legend()
plt.show()

fig1 = plt.figure(figsize=(12,8))
plt.plot(y_pred_orignal,label='y_pred',alpha = 0.3)
plt.plot(y_test_orignal,label='y_test',alpha = 0.3)
plt.title("原始可见度对比")
plt.legend()
plt.show()



# 预测多个数值属性
print(weather_clean.columns)

# 训练和预测
# data_perpared:处理后气象数据(数组),squence_length:窗口大小,temper_dim:温度的维度, feature_size: 特征数
def train_and_predict_multi(x_train, y_train, x_val, y_val, x_test, y_test, is_train=True):
    input_shape = (x_train.shape[1], x_train.shape[2])
    # train model
    model = BLSTM_model_multi(input_shape, y_train.shape[1])
    model.compile(loss="mse", optimizer=Adam())
    print(model.summary())
    if is_train:
        cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss',
                                  save_best_only=True, mode='min', period=20)
        history = model.fit(x_train, y_train, batch_size=32, epochs=100,
                            validation_data=(x_val, y_val), shuffle=True,
                            callbacks=[cb_ckpt,
                                       EarlyStopping(monitor='val_loss', patience=10)])
        plt.plot(history.history['loss'], label='train')
        plt.plot(history.history['val_loss'], label='test')
        plt.title("损失值对比图")
        plt.legend()
        plt.show()
    else:
        y_pred, y_test = test(model, x_test, y_test)
        return y_pred, y_test

# load data
x_train,y_train,x_val,y_val,x_test,y_test = get_data_multi(data_perpared_countys,True,30,26,7)
# 训练模型
#  train_and_predict_multi(x_train,y_train,x_val,y_val,x_test,y_test,True)

# 测试模型
model_path = './checkpoint/weights-multi_acc=0.54.h5'
print(data_perpared_countys[0].shape)
y_pred,y_test =  train_and_predict_multi(x_train,y_train,x_val,y_val,x_test,y_test,False)
#多分类模型的训练和测试
# 调整列的顺序
weather_clean_class = weather_clean[['sunny', 'cloudy', 'rain', 'fog', 'haze', 'dust',
       'thunder', 'lightning','snow', 'hail', 'wind','pressure', 'wind_speed', 'temperature', 'humidity', 'rain20', 'rain08',
       'cloud', 'visibility','month', 'week','quarter', 'day', 'shanghai', 'beijing', 'chongqin', 'date', 'county']]

# 生成特征列表
data_perpared_countys = generate_features(weather_clean_class)
classes = ['sunny', 'cloudy', 'rain', 'fog', 'haze', 'dust','thunder', 'lightning','snow', 'hail', 'wind']

# 测试模型
model_path = ''
def test_classify(model,x_test,y_test,classes):
    model.load_weights(model_path)
    y_pred = model.predict(x_test)
    rocauc = metrics.roc_auc_score(y_test,y_pred)
    prauc = metrics.average_precision_score(y_test,y_pred,average='macro')
    print(f'ROC-AUC score={rocauc:.6f}')
    print(f'Prauc score={prauc:.6f}')
    y_prod = (y_pred > 0.5).astype(np.float32)
    acc = metrics.accuracy_score(y_test,y_prod)
    f1 = metrics.f1_score(y_test,y_prod,average='samples')
    print(f'acc score={acc:.6f}')
    print(f'f1 score={f1:.6f}')
    # 计算每个类的准确率
    for i,cls in enumerate(classes):
        cls_rocauc = metrics.roc_auc_score(y_test[:,i],y_pred[:,i])
        cls_prauc = metrics.average_precision_score(y_test[:,i],y_pred[:,i])
        cls_acc = metrics.accuracy_score(y_test[:,i],y_prod[:,i])
        cls_f1 = metrics.f1_score(y_test[:,i],y_prod[:,i])
        print(f"[{i:2} {cls:10}] rocauc={cls_rocauc:.4f} prauc={cls_prauc:.4f} acc={cls_acc:4f} f1={cls_f1:.4f}")
    return y_pred,y_test


# 训练和预测
# data_perpared:处理后气象数据(数组),squence_length:窗口大小,temper_dim:气象的维度, feature_size: 特征数
def train_and_predict_classify(x_train,y_train,x_val,y_val,x_test,y_test,classes,is_train=True):
#     input_shape = (squence_length,feature_size)
    input_shape = (x_train.shape[1],x_train.shape[2])
    # train model
    model = BLSTM_model_multi(input_shape,y_train.shape[1])
    model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['accuracy'])
    print(model.summary())
    if is_train:
        cb_ckpt = ModelCheckpoint('./weights.{epoch:02d}-{val_loss:.2f}.h5',monitor='val_loss',
                                  save_best_only=True,mode='min',period=20)
        history = model.fit(x_train,y_train,batch_size=32,epochs=100,
                            validation_data=(x_val,y_val),shuffle=True,
                           callbacks=[cb_ckpt,
                            EarlyStopping(monitor='val_loss',patience=10)])
        plt.plot(history.history['loss'],label='train')
        plt.plot(history.history['val_loss'],label='test')
        plt.title("损失值对比图")
        plt.legend()
        plt.show()
    else:
        y_pred,y_test = test_classify(model,x_test,y_test,classes)
        return y_pred,y_test


# load data
x_train,y_train,x_val,y_val,x_test,y_test = get_data_multi(data_perpared_countys,True,30,26,11)
# 训练模型
#  train_and_predict_classify(x_train,y_train,x_val,y_val,x_test,y_test,True)

model_path = './checkpoint/weights_classify_auc=0.85.h5'
y_pred,y_test = train_and_predict_classify(x_train,y_train,x_val,y_val,x_test,y_test,classes,False)

你可能感兴趣的:(lstm,python)