# Key points: NumPy fundamentals.
import numpy as np

# Creating NumPy arrays: a 1-D array from a flat list and a 2-D array from
# nested lists (each inner list becomes one row).
a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([[1, 2, 3], [4, 5, 6]])
# A bare `a.shape` expression only displays a value in a notebook/REPL, so
# print it explicitly to make the script show the shape as well.
print(a.shape)
print(b)
# All-zero matrix with 2 rows and 3 columns.
zeros = np.zeros(shape=(2, 3))
print(zeros)
# All-one matrix with 4 rows and 2 columns.
ones = np.ones(shape=(4, 2))
print(ones)
# One-dimensional array of three ones, i.e. shape (3,).
ones1 = np.ones(3)
ones1  # notebook-style display; produces no output when run as a script
# Sequential array. The stop value is exclusive, so this holds 1 through 9
# (10 itself is not included).
arange = np.arange(1, 10, 1)
arange  # notebook-style display; produces no output when run as a script
# Random arrays
# randint's signature is (low, high, size) with `high` exclusive. The previous
# call randint(2, 2) gave low == high and raised "ValueError: low >= high".
# A 2x2 array is produced here to match the other examples in this section;
# the value range [0, 10) is an arbitrary choice for the demonstration.
c = np.random.randint(0, 10, size=(2, 2))
print(c)
import random

# A single float in [0, 1) from the standard-library generator.
d = random.random()
print(d)
# NumPy equivalents: rand() draws uniform samples from [0, 1). With no
# arguments it returns one float; with dimensions it returns an array.
e = np.random.rand()
f = np.random.rand(3, 2)
print(e)
print(f)
# randn() draws from the standard normal distribution (mean 0, variance 1).
g = np.random.randn(2, 2)
print(g)
# Fix the seed so the generated scores are reproducible between runs.
np.random.seed(42)
# Draw 10 Chinese-language scores from a normal distribution (mean 75,
# standard deviation 10) and round them to one decimal place.
raw_scores = np.random.normal(loc=75, scale=10, size=10)
chinese_scores = np.round(raw_scores, 1)
# Locate the highest and lowest scores. The positions are zero-based array
# indices, and that is also what the messages below display.
max_index = chinese_scores.argmax()
min_index = chinese_scores.argmin()
max_scores = chinese_scores[max_index]
min_scores = chinese_scores[min_index]
print(f"10个人的成绩:{chinese_scores}")
print(f"最高分: {max_scores} (第{max_index}个学生)")
print(f"最低分: {min_scores} (第{min_index}个学生)")
# Iterating over an array
scores = np.array([5, 9, 9, 11, 11, 13, 15, 19])
scores += 1  # in-place broadcast: every element is increased by 1
# Accumulate element by element to demonstrate iteration. The accumulator is
# named `total` rather than `sum` so the built-in sum() is not shadowed for
# the rest of the script.
total = 0
for score in scores:
    total += score
print(total)
# Array arithmetic on two 3x2 integer matrices
a = np.arange(1, 7).reshape(3, 2)    # [[1, 2], [3, 4], [5, 6]]
b = np.arange(7, 13).reshape(3, 2)   # [[7, 8], [9, 10], [11, 12]]
print(a)
print(b)
# Element-wise sum.
elementwise_sum = a + b
print(elementwise_sum)
# Element-wise difference.
elementwise_diff = a - b
print(elementwise_diff)
# Element-wise (Hadamard) product; this is not matrix multiplication.
elementwise_prod = a * b
print(elementwise_prod)
# Element-wise true division; the result is a float array.
elementwise_quot = a / b
print(elementwise_quot)
# Matrix product of a (3x2) with the transpose of b (2x3), giving 3x3.
h = a.dot(b.T)
print(h)
# The @ operator computes the same 3x3 matrix product. As a bare expression
# it is only displayed in a notebook and has no effect in a script.
a @ b.T
# Array indexing
# One-dimensional array: [0 1 2 3 4 5 6 7 8 9]
arr1d = np.arange(0, 10)
arr1d  # notebook-style display; produces no output when run as a script
# 1. First element.
print(arr1d[0])
# 2. Last element; index -1 counts from the end of the array.
print(arr1d[-1])
# 3. Elements at indices 3, 5 and 8. Indexing with a list of integers picks
#    several elements at once: arr1d[[index1, index2, ...]].
picked = [3, 5, 8]
print(arr1d[picked])
# 4. Elements at indices 2 through 5. The slice end (6) is excluded: slices
#    include the left bound and exclude the right bound.
print(arr1d[2:6])
# 5. Everything from the start up to, but not including, index 6: [:stop].
print(arr1d[:6])
# 6. Everything from index 2 to the end: [start:].
print(arr1d[2:])
# All elements.
print(arr1d[:])
# 7. Elements at the even indices (0, 2, 4, 6, 8), using a slice with a step:
#    [start:stop:step].
print(arr1d[::2])
# 4x4 matrix holding 1..16 in row-major order.
arr2d = np.arange(1, 17).reshape(4, 4)
# Row 0 (all columns).
print(arr2d[0, :])
# Column 1 (all rows).
print(arr2d[:, 1])
# Single element at row 2, column 1.
print(arr2d[2, 1])
# New array made of rows 0 and 2. A list of integers works as the row index:
# arr[[row1, row2, ...], :]
wanted_rows = [0, 2]
print(arr2d[wanted_rows, :])
# New array made of columns 1 and 3.
wanted_cols = [1, 3]
print(arr2d[:, wanted_cols])
# 2x2 sub-matrix covering rows 1-2 and columns 1-2 (slice ends are exclusive).
print(arr2d[1:3, 1:3])
# 3-D array of shape (3, 4, 5) holding 0..59.
arr3d = np.arange(60).reshape(3, 4, 5)
# The whole 4x5 matrix at index 1 along the first axis.
print(arr3d[1, :, :])
# The first two rows of that matrix.
print(arr3d[1, 0:2, :])
# Column 3 of those two rows. Slicing with 3:4 (rather than plain 3) keeps
# the last axis, so the result has shape (2, 1).
print(arr3d[1, 0:2, 3:4])
#SHAP值
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
from sklearn.metrics import make_scorer,accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
import warnings #用于忽略警告信息
# Silence every warning so library notices do not clutter the output.
warnings.filterwarnings(action="ignore")
# Matplotlib settings for Chinese text in figures.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # SimHei: a CJK font commonly present on Windows
    'axes.unicode_minus': False,    # render the minus sign correctly
})
# Load the raw data set.
data = pd.read_csv('data.csv')
# Names of the string-typed (object dtype) columns.
discrete_features = data.select_dtypes(include=['object']).columns.tolist()
# Label encoding: each listed column has its text categories replaced by the
# integer codes below.
mappings = {
    "Years in current job": {
        "< 1 year": 0,
        "1 year": 1,
        "2 years": 2,
        "3 years": 3,
        "4 years": 4,
        "5 years": 5,
        "6 years": 6,
        "7 years": 7,
        "8 years": 8,
        "9 years": 9,
        "10+ years": 10,
    },
    "Home Ownership": {
        "Home Mortgage": 0,
        "Rent": 1,
        "Own Home": 2,
        "Have Mortgage": 3,
    },
    "Term": {
        "Short Term": 0,
        "Long Term": 1,
    },
}
# Apply every mapping to its column. Series.map turns values that are absent
# from the mapping into NaN.
for column, mapping in mappings.items():
    data[column] = data[column].map(mapping)
# One-hot encode the Purpose column. The generated indicator columns may be
# boolean, so they are converted to integers below.
data = pd.get_dummies(data, columns=['Purpose'])
# Re-read the raw file; it is used only to compare column names.
data2 = pd.read_csv('data.csv')
# Columns that exist after encoding but not in the raw file are exactly the
# new one-hot feature columns.
list_final = [column for column in data.columns if column not in data2.columns]
# Cast each one-hot column from bool to int.
for column in list_final:
    data[column] = data[column].astype(int)
# Split the columns by dtype: float64/int64 columns are treated as continuous,
# every other column as discrete.
continuous_features = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
discrete_features = data.select_dtypes(exclude=['float64', 'int64']).columns.tolist()

# Fill missing values in discrete features with the mode.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data[feature]: that chained in-place form is
# deprecated in pandas and has no effect on `data` under Copy-on-Write, and
# the warning about it would be hidden because warnings are silenced above.
for feature in discrete_features:
    if data[feature].isnull().any():
        mode_value = data[feature].mode()[0]
        data[feature] = data[feature].fillna(mode_value)

# Fill missing values in continuous features with the median.
for feature in continuous_features:
    if data[feature].isnull().any():
        median_value = data[feature].median()
        data[feature] = data[feature].fillna(median_value)

# Rescale every continuous column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# and continuous_features holds every int64/float64 column, which would
# include the 'Credit Default' label if it is numeric. Confirm this is
# intended.
st = MinMaxScaler()
data[continuous_features] = st.fit_transform(data[continuous_features])
from sklearn.model_selection import train_test_split
# Features: every column except the label.
x = data.drop(columns=['Credit Default'])
# Label column.
y = data['Credit Default']
# Split 80% / 20% into training and test sets; the fixed random_state makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Random forest baseline with default hyper-parameters
print("--- 1. 默认参数随机森林 (训练集 -> 测试集) ---")
# The time module is used to record how long training plus prediction takes,
# which gives later readers a rough idea of the expected duration.
import time

start_time = time.time()  # wall-clock time before training
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train)      # train on the training set
rf_pred = rf_model.predict(x_test)  # predict on the test set
end_time = time.time()    # wall-clock time after prediction
elapsed = end_time - start_time
print(f"训练与预测时间:{elapsed:.4f}秒")

# Per-class report followed by the confusion matrix.
print("\n随机森林 分类报告:")
print(classification_report(y_test, rf_pred))
print("随机森林 混淆矩阵:")
print(confusion_matrix(y_test, rf_pred))

# Summary metrics on the test set.
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)
print("随机森林 模型评估指标:")
for label, value in (
    ("准确率", rf_accuracy),
    ("精确率", rf_precision),
    ("召回率", rf_recall),
    ("F1 值", rf_f1),
):
    print(f"{label}: {value:.4f}")
import shap
import matplotlib.pyplot as plt
# Build a SHAP explainer for the trained random forest.
explainer = shap.TreeExplainer(rf_model)
# Compute SHAP values on the test set: each feature's contribution to each
# sample's prediction. This step is slow.
shap_values = explainer.shap_values(x_test)
# Older SHAP releases return a list with one (n_samples, n_features) array
# per class, while newer releases return a single ndarray laid out as
# (n_samples, n_features, n_classes). Stack the list form into that 3-D
# layout so the indexing below works with either return type.
if isinstance(shap_values, list):
    shap_values = np.stack(shap_values, axis=-1)
print(shap_values)
# Everything for the first test sample: features x classes.
print(shap_values[0, :, :])
print(shap_values[0, :, :].shape)
# Overall shape of the SHAP array.
print(shap_values.shape)
# Contributions toward the first class: samples x features.
print(shap_values[:, :, 0])