Hyperparameter optimization, model training, and SHAP explanation for SVR, AdaBoost, MLP, GBDT, XGBoost, LightGBM, and random forest models

  • Import the required libraries and preprocess the data
  • Hyperparameter optimization
  • Split the train/test sets and run the SHAP explanation

Import the required libraries and preprocess the data

import numpy as np               # NumPy: multidimensional arrays and matrix operations
import pandas as pd              # Pandas: data handling and analysis
import matplotlib.pyplot as plt  # Matplotlib: plotting
from sklearn.model_selection import train_test_split, cross_val_score, KFold  # data splitting and cross-validation
from sklearn.preprocessing import StandardScaler      # feature standardization
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error  # regression evaluation metrics
import warnings                  # control warning output
warnings.filterwarnings('ignore')  # suppress warnings

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

Read the data

data = pd.read_excel('data.xlsx')  # read the Excel file into a DataFrame

Handle missing values and outliers

# Use isna() (or its alias isnull()) to flag missing values
missing_values = data.isna()

# Count the missing values in each column
missing_counts = missing_values.sum()
print(missing_counts)
data = data.fillna(0)  # fill missing values with 0

# Replace the -9999 sentinel values in each column with that column's mean
for i in data.columns:
    a = data.loc[data[i] != -9999, i].mean()
    data.loc[data[i] == -9999, i] = a
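An equivalent, more idiomatic pandas version of the sentinel handling is sketched below. Note that, unlike the two steps above, it imputes column means for genuine missing values as well as for the -9999 sentinel, which may or may not be what you want:

cleaned = data.replace(-9999, np.nan)                      # treat the sentinel as missing
cleaned = cleaned.fillna(cleaned.mean(numeric_only=True))  # impute column means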

Data preparation and standardization

X = data[['Y', 'GDP', 'GYQY', 'POPD', 'YJDG', 'RSEI', 'DEM', 'Aspect',
       'MRVBF', 'TPI', 'SC', 'SM', 'GPP']]  # keep the target plus the selected feature columns
X.describe()  # descriptive statistics: mean, standard deviation, min, max, etc.
X_train = X.drop(['Y'], axis=1)        # everything except the target becomes the feature matrix
y_train = np.array(data['Y'].copy())   # the target variable Y becomes y_train

# Standardization
standarder = StandardScaler()                  # scaler for the features
X_train = standarder.fit_transform(X_train)
y_standarder = StandardScaler()                # separate scaler for the target variable
y_train = np.expand_dims(y_train, axis=1)      # StandardScaler expects a 2-D array
y_train = y_standarder.fit_transform(y_train)  # standardize the training target
y_train = y_train.squeeze(axis=1)              # back to a 1-D array
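A quick optional sanity check that the target scaler round-trips (a sketch, assuming the preprocessing above has already run):

y_back = y_standarder.inverse_transform(y_train.reshape(-1, 1)).squeeze(axis=1)
assert np.allclose(y_back, data['Y'].to_numpy())  # inverse_transform recovers the cleaned targets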

Hyperparameter optimization

First, define the objective function

def objective(params):
    
    model = AdaBoostRegressor()  # fallback if no model tag matches
    if params['model_name'] == 'RF':
        del params['model_name']  # remove the tag before passing the rest to the model
        model = RandomForestRegressor(**params, random_state=42)
        
    elif params['model_name'] == 'SVR':
        del params['model_name']
        model = SVR(kernel='rbf', **params)
        
    elif params['model_name'] == 'ADABOOST':
        del params['model_name']
        model = AdaBoostRegressor(**params)
        
    elif params['model_name'] == 'MLP':
        del params['model_name']
        model = MLPRegressor(**params)
        
    elif params['model_name'] == 'GBDT':
        del params['model_name']
        model = GradientBoostingRegressor(**params)

    elif params['model_name'] == 'LIGHTGBM':
        del params['model_name']
        model = LGBMRegressor(**params)
    elif params['model_name'] == 'XGBOOST':
        del params['model_name']
        model = XGBRegressor(**params)
    # 5-fold cross-validation
    folds = 5
    mse_test = 0
    mae_test = 0
    r2_test = 0
    kfold = KFold(n_splits=folds, shuffle=True, random_state=5421)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
        x_tra, y_trn, x_val, y_val = X_train[trn_idx], y_train[trn_idx], X_train[val_idx], y_train[val_idx]
        model.fit(x_tra, y_trn)
        test_pred = model.predict(x_val)

        # accumulate per-fold metrics; they are averaged after the loop
        mse_test += mean_squared_error(y_true=y_val, y_pred=test_pred)
        mae_test += mean_absolute_error(y_true=y_val, y_pred=test_pred)
        r2_test += r2_score(y_true=y_val, y_pred=test_pred)

    mse_test /= folds
    mae_test /= folds
    r2_test /= folds

    # collect the cross-validation results in a DataFrame (uncomment the print to inspect them)
    result_df = pd.DataFrame({'RMSE': [mse_test**0.5],
                          'MAE': [mae_test],
                          'R2': [r2_test],
                          }, index=['CV'])
#     print(result_df)
    return {'loss': mse_test, 'status': STATUS_OK}  # loss is the mean CV MSE; STATUS_OK marks a successful trial
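Note that objective deletes the 'model_name' key from the dict it receives, so pass a copy if you want to reuse a parameter set. As a quick smoke test (a sketch; the parameter values below are illustrative, not tuned):

sample_params = {'model_name': 'RF', 'n_estimators': 100, 'max_depth': 10,
                 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 5}
print(objective(dict(sample_params)))  # e.g. {'loss': <mean CV MSE>, 'status': 'ok'}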

Define the optimizim function

def optimizim(objective, space):
    trials = Trials()  # records the parameters and result of every evaluation
    best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=1,    # 1 is only a quick demo; increase for a real search
            trials=trials)  # run the optimization, logging into the Trials object
    params = space_eval(space, best)  # recover the best parameter set from the search space
    print(f'Best parameters: {params}')
    model_name = params['model_name']
    del params['model_name']  # the model tag is no longer needed when building the model
    return params, model_name
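Once the search spaces are defined below, a single space can be tuned on its own (hypothetical usage; space_RF is defined in the next section):

params, model_name = optimizim(objective, space_RF)
print(model_name, params)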

Define the eva function

def eva(model, params, is_train):
    # rebuild the model with the best parameters found
    model = model.set_params(**params)
    model.fit(X_train, y_train)
    
    # predict
    train_pred = model.predict(X_train)
    if not is_train:
        train_pre = y_standarder.inverse_transform(train_pred.reshape(-1, 1))  # back to the original scale
        data['pred'] = train_pre
        data.to_excel(str(model.__class__.__name__) + '_predictions.xlsx')
        result_plot(y_train, train_pred, model)
        
    mse_train = mean_squared_error(y_true=y_train, y_pred=train_pred)
    mae_train = mean_absolute_error(y_true=y_train, y_pred=train_pred)
    r2_train = r2_score(y_true=y_train, y_pred=train_pred)

    result_df = pd.DataFrame({'RMSE': [mse_train**0.5],
                          'MAE': [mae_train],
                          'R2': [r2_train],
                          }, index=['evaluation'])
    print(result_df)
    return r2_train

Define the search spaces and the model-selection function

clf1 = SVR()
clf2 = MLPRegressor()
clf3 = RandomForestRegressor()
clf4 = AdaBoostRegressor(random_state=123)
clf5 = GradientBoostingRegressor(criterion='friedman_mse')
clf6 = LGBMRegressor()
clf7 = XGBRegressor()
# Define a search-space dict for each model
space_RF = {
    'model_name': hp.choice('model_name', ['RF']),  # model tag
    'n_estimators': hp.choice('n_estimators', range(10, 500, 1)),
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    'max_features': hp.choice('max_features', range(1, X_train.shape[1]))
}
space_SVR = {
    'model_name': hp.choice('model_name', ['SVR']),  # model tag
    'C': hp.choice('C', np.arange(0.1, 10, 0.1)),
    'gamma': hp.choice('gamma', np.arange(0.1, 50)),
    'epsilon': hp.choice('epsilon', np.arange(0, 1, 0.1)),
}

space_ADABOOST = {
    'model_name': hp.choice('model_name', ['ADABOOST']),  # model tag; must match the check in objective()
    'n_estimators': hp.choice('n_estimators', np.arange(10, 300, 10)),
    'learning_rate': hp.choice('learning_rate', np.arange(0.1, 5, 0.1)),
}

space_MLP = {
    'model_name': hp.choice('model_name', ['MLP']),  # model tag
    'batch_size': hp.choice('batch_size', np.arange(5, 30, 5)),
    'max_iter': hp.choice('max_iter', np.arange(50, 500, 50)),
}
space_GBDT = {
    'model_name': hp.choice('model_name', ['GBDT']),  # model tag
    'n_estimators': hp.choice('n_estimators', range(10, 500, 1)),
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    'learning_rate': hp.choice('learning_rate', np.arange(0.1, 2, 0.1)),
    'subsample': hp.choice('subsample', np.arange(0.1, 1, 0.1)),
}

space_LIGHTGBM = {
    'model_name': hp.choice('model_name', ['LIGHTGBM']),  # model tag
    'n_estimators': hp.choice('n_estimators', range(10, 500, 1)),
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'num_leaves': hp.choice('num_leaves', range(2, 10)),
    # Optional GPU settings:
    # 'device': hp.choice('device', ['gpu']),
    # 'gpu_platform_id': hp.choice('gpu_platform_id', ['0']),
    # 'gpu_device_id': hp.choice('gpu_device_id', ['0']),
}

space_XGBOOST = {
    'model_name': hp.choice('model_name', ['XGBOOST']),  # model tag
    'n_estimators': hp.choice('n_estimators', range(10, 500, 1)),
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
    'gamma': hp.choice('gamma', np.arange(0.2, 1, 0.1)),
    # Optional GPU settings:
    # 'tree_method': hp.choice('tree_method', ['gpu_hist']),
    # 'gpu_id': hp.choice('gpu_id', [0]),
}
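To see what a single draw from one of these spaces looks like, hyperopt can sample a space directly (a quick sketch; the printed values will vary from run to run):

from hyperopt.pyll.stochastic import sample
print(sample(space_RF))  # e.g. {'max_depth': 17, 'max_features': 4, ..., 'model_name': 'RF'}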
space = [space_SVR, space_MLP, space_RF, space_ADABOOST, space_GBDT, space_LIGHTGBM, space_XGBOOST]
clfs = [clf1, clf2, clf3, clf4, clf5, clf6, clf7]



def model_select(clfs, spaces, is_train):
    model_best_name = ''
    r2 = 0
    for clf, space in zip(clfs, spaces):  # parameter renamed to avoid shadowing the global `space` list
        r2_train, model_best, _ = train(clf, space, is_train)
        if r2_train > r2:
            model_best_name = model_best
            r2 = r2_train
            
    print(f'Best model this round: {model_best_name}')
    print(f'Its R2 this round: {r2}')
    return model_best_name, r2

Select the overall best model by majority vote

results = []
for i in range(10):
    result, r2 = model_select(clfs, space, True)
    results.append(result)
results_order = pd.Series(results).value_counts().to_dict()
best_model_last = list(results_order.keys())[0]  # the most frequent winner across the 10 rounds
print(f'The best model is {best_model_last}')
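For reference, value_counts() sorts by descending frequency, so the first key of results_order is the modal winner, e.g.:

# Illustrative only:
# pd.Series(['RF', 'GBDT', 'RF']).value_counts().to_dict()  # -> {'RF': 2, 'GBDT': 1}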

Split the train/test sets and run the SHAP explanation

data_train, data_test = train_test_split(X, test_size=24, random_state=2)  # hold out 24 samples for testing
pd.merge(data['FID'], data_train, left_index=True, right_index=True, how='right').to_excel('./train_data.xlsx', index=None)
pd.merge(data['FID'], data_test, left_index=True, right_index=True, how='right').to_excel('./test_data.xlsx', index=None)
print(len(data_train))  # number of training samples
print(len(data_test))   # number of test samples

X_train_SHAP = data_train.drop(['Y'], axis=1)    # features of the training split
y_train_SHAP = np.array(data_train['Y'].copy())  # target of the training split
features = X_train_SHAP.columns.to_list()        # feature names

X_test_SHAP = data_test.drop(['Y'], axis=1)      # features of the test split
y_test_SHAP = np.array(data_test['Y'].copy())    # target of the test split

standarder = StandardScaler()                    # scaler for the features
X_train_SHAP = standarder.fit_transform(X_train_SHAP)  # fit on the training split only
X_test_SHAP = pd.DataFrame(standarder.transform(X_test_SHAP), columns=features)  # transform the test split, keeping feature names for the SHAP plots

y_standarder = StandardScaler()                  # separate scaler for the target
y_train_SHAP = np.expand_dims(y_train_SHAP, axis=1)  # StandardScaler expects 2-D input
y_test_SHAP = np.expand_dims(y_test_SHAP, axis=1)
y_train_SHAP = y_standarder.fit_transform(y_train_SHAP)
y_test_SHAP = y_standarder.transform(y_test_SHAP)    # transform with the training-set statistics
y_train_SHAP = y_train_SHAP.squeeze(axis=1)          # back to 1-D
y_test_SHAP = y_test_SHAP.squeeze(axis=1)

Pass the best model to SHAP for explanation

import shap
shap.initjs()
if best_model_last == 'GBDT':
    _, _, best_params = train(GradientBoostingRegressor(), space_GBDT, True)
    model = GradientBoostingRegressor()
    
    # apply the best parameters found earlier
    model.set_params(**best_params)
    model.fit(X_train_SHAP, y_train_SHAP)
    
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)  # dependence plot for the GDP feature

    
elif best_model_last == 'RF':  # the tag returned by optimizim is 'RF', matching space_RF
    _, _, best_params = train(RandomForestRegressor(), space_RF, True)
    model = RandomForestRegressor()
    
    # apply the best parameters found earlier
    model.set_params(**best_params)
    model.fit(X_train_SHAP, y_train_SHAP)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)  # dependence plot for the GDP feature
    
elif best_model_last == 'LIGHTGBM':
    _, _, best_params = train(LGBMRegressor(), space_LIGHTGBM, True)
    model = LGBMRegressor()
    
    # apply the best parameters found earlier
    model.set_params(**best_params)
    model.fit(X_train_SHAP, y_train_SHAP)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)  # dependence plot for the GDP feature

elif best_model_last == 'XGBOOST':
    _, _, best_params = train(XGBRegressor(), space_XGBOOST, True)
    model = XGBRegressor()
    
    # apply the best parameters found earlier
    model.set_params(**best_params)
    model.fit(X_train_SHAP, y_train_SHAP)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)  # dependence plot for the GDP feature
    
    
elif best_model_last == 'MLP':
    _, _, best_params = train(MLPRegressor(), space_MLP, True)
    model = MLPRegressor()
    
    # apply the best parameters found earlier
    model.set_params(**best_params)
    model.fit(X_train_SHAP, y_train_SHAP)
    X_train_summary = shap.kmeans(X_train_SHAP, 10)  # k-means background summary keeps KernelExplainer tractable
    def predict_fn(X):
        return model.predict(X)

    explainer = shap.KernelExplainer(predict_fn, X_train_summary)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)  # dependence plot for the GDP feature
    
elif best_model_last == 'SVR':
    _, _, best_params = train(SVR(), space_SVR, True)
    model = SVR()
    
    # apply the best parameters found earlier
    model.set_params(**best_params)
    model.fit(X_train_SHAP, y_train_SHAP)
    X_train_summary = shap.kmeans(X_train_SHAP, 10)  # k-means background summary keeps KernelExplainer tractable
    def predict_fn(X):
        return model.predict(X)

    explainer = shap.KernelExplainer(predict_fn, X_train_summary)
    shap_values = explainer.shap_values(X_test_SHAP)
    shap.summary_plot(shap_values, X_test_SHAP)
    shap.dependence_plot("GDP", shap_values, X_test_SHAP)  # dependence plot for the GDP feature
    
shap.force_plot(explainer.expected_value, shap_values, X_test_SHAP)  # force plot for whichever model was selected above

Note: the original code omits the train function called throughout, as well as the result_plot helper used inside eva. A minimal sketch of both, inferred from their call sites (train is expected to return (r2, model_name, best_params)), follows:
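
def train(clf, space, is_train):
    # Inferred wiring: tune the space, then refit and evaluate with eva()
    params, model_name = optimizim(objective, space)
    r2_train = eva(clf, params, is_train)
    return r2_train, model_name, params

def result_plot(y_true, y_pred, model):
    # Hypothetical helper: predicted-vs-observed scatter with a 1:1 reference line
    plt.figure(figsize=(5, 5))
    plt.scatter(y_true, y_pred, s=10)
    lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
    plt.plot(lims, lims, 'r--')  # 1:1 line
    plt.xlabel('Observed (standardized)')
    plt.ylabel('Predicted (standardized)')
    plt.title(model.__class__.__name__)
    plt.show()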
