Python 训练营打卡 Day14


@浙大疏锦行

  1. 补全剩余的几个图

  2. 尝试确定一下shap各个绘图函数对于每一个参数的尺寸要求,如shap.force_plot力图中的数据需要满足什么形状。

  3. 确定分类问题和回归问题的数据分别要整理成什么形状,才能满足上述各绘图函数的尺寸要求;分类采取信贷数据集,回归采取单车数据集。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
 
# Global plotting/runtime setup: pick a font that can render Chinese labels,
# keep the minus sign displayable with that font, and silence all warnings.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # SimHei: HeiTi font commonly found on Windows
    'axes.unicode_minus': False,    # render the minus sign correctly
})
warnings.filterwarnings("ignore")
 
# Load the raw dataset.
data = pd.read_csv('data.csv')

# Names of the string-typed (object dtype) columns, captured before encoding.
discrete_features = data.select_dtypes(include=['object']).columns.tolist()

# Home Ownership: label-encode each category as an integer code 1..4.
home_ownership_mapping = dict(zip(
    ['Own Home', 'Rent', 'Have Mortgage', 'Home Mortgage'],
    range(1, 5),
))
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)

# Years in current job: label-encode as codes 1..11, in the order listed.
years_in_job_mapping = dict(zip(
    [
        '< 1 year',
        '1 year',
        '2 years',
        '3 years',
        '4 years',
        '5 years',
        '6 years',
        '7 years',
        '8 years',
        '9 years',
        '10+ years',
    ],
    range(1, 12),
))
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# Purpose: one-hot encode.
data = pd.get_dummies(data, columns=['Purpose'])
data2 = pd.read_csv("data.csv")  # fresh copy of the raw file, used to compare column names
# Columns that exist after encoding but not in the raw file are the newly
# created one-hot columns.
list_final = [col for col in data.columns if col not in data2.columns]
for col in list_final:
    data[col] = data[col].astype(int)  # cast each one-hot column to int

# Term: map to 0/1, then rename the column to say what 1 stands for.
term_mapping = {'Short Term': 0, 'Long Term': 1}
data['Term'] = data['Term'].map(term_mapping)
data = data.rename(columns={'Term': 'Long Term'})
 
# Fill missing values in every int64/float64 column with that column's most
# frequent value (mode).
# NOTE(review): the original comment said "median", but the code uses the
# mode; the mode is kept here so results do not change -- confirm the intent.
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    # Assign the filled column back instead of calling
    # `data[feature].fillna(..., inplace=True)`: that form modifies a
    # temporary Series (chained assignment), which pandas warns about and
    # which leaves `data` unchanged when Copy-on-Write is enabled.
    data[feature] = data[feature].fillna(mode_value)
 
# Separate the target column from the features and hold out 20% for testing.
y = data['Credit Default']  # label
X = data.drop(columns=['Credit Default'])  # features
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a default random forest; the timer covers both fitting and prediction.
start_time = time.time()
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
end_time = time.time()

# Report timing, per-class metrics and the confusion matrix on the test set.
print(f"训练与预测耗时: {end_time - start_time:.4f} 秒")
print(
    "\n默认随机森林 在测试集上的分类报告:",
    classification_report(y_test, rf_pred),
    sep="\n",
)
print(
    "默认随机森林 在测试集上的混淆矩阵:",
    confusion_matrix(y_test, rf_pred),
    sep="\n",
)
 
# Compute SHAP values for the test set.
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test)

# Depending on the installed shap version, a classifier yields either a list
# holding one (n_samples, n_features) array per class, or a single
# (n_samples, n_features, n_classes) array. Normalize to the per-class list
# so that `shap_values[k]` always means "contributions towards class k".
if not isinstance(shap_values, list):
    shap_values = np.asarray(shap_values)
    if shap_values.ndim == 3:
        shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
    else:
        shap_values = [shap_values]

# Plain list of names: some plot functions reject a pandas Index.
feature_names = X_test.columns.tolist()

# Explain class index 1, as the original plots did (falls back to 0 if the
# explainer produced a single output).
class_idx = min(1, len(shap_values) - 1)
class_shap_values = shap_values[class_idx]                            # (n_samples, n_features)
class_base_value = np.atleast_1d(explainer.expected_value)[class_idx]  # scalar

# Global feature importance (a per-class list gives the multi-class bar summary).
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

# Dependence plot: feature value vs. SHAP value for one feature.
# Expects a 2-D (n_samples, n_features) SHAP matrix plus the matching features.
shap.dependence_plot('Years in current job', class_shap_values, X_test, feature_names=feature_names)

# Force plot for the first test sample: scalar base value, 1-D SHAP vector and
# 1-D feature vector. matplotlib=True renders a figure; without it the call
# only returns a JS object that is never displayed here.
shap.force_plot(
    class_base_value,
    class_shap_values[0, :],
    X_test.iloc[0, :],
    feature_names=feature_names,
    matplotlib=True,
)

# Decision plot for the same sample (same shapes as the force plot).
shap.decision_plot(class_base_value, class_shap_values[0, :], X_test.iloc[0, :], feature_names=feature_names)

# The `shap.plots.*` API takes an Explanation object rather than a raw array,
# so wrap the class-specific values together with base values and features.
explanation = shap.Explanation(
    values=class_shap_values,
    base_values=np.full(class_shap_values.shape[0], class_base_value),
    data=X_test.values,
    feature_names=feature_names,
)

# Waterfall plot: a single-sample Explanation.
shap.plots.waterfall(explanation[0], max_display=10)

# Heatmap: a multi-sample Explanation.
shap.plots.heatmap(explanation)

# Bar-type summary plot: mean |SHAP| per feature for the selected class.
# NOTE(review): reconstructed from a dangling "= 'bar';" fragment in the
# original text -- confirm this is the plot that was intended.
shap.summary_plot(class_shap_values, X_test, feature_names=feature_names, plot_type='bar')

你可能感兴趣的:(Python打卡训练,python,机器学习)