# 导入所需要的包
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from D11 import discrete_features, home_ownership_mapping
# Configure matplotlib so that Chinese text renders correctly in figures.
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign displayable once a CJK font is active
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei, a CJK font commonly available on Windows
# Load the dataset and take a first look at it.
data = pd.read_csv("data.csv")
print("数据基本信息:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
data.info()
print("数据前5行预览:")
print(data.head())  # recorded run: [5 rows x 18 columns]
# Start with the object (string) columns: the models only accept numeric
# input, so every string feature has to be converted first.
discrete_features = list(data.select_dtypes(include=["object"]).columns)
print(discrete_features)  # recorded run: ['Home Ownership', 'Years in current job', 'Purpose', 'Term']
# Inspect the values of each string column to pick a suitable encoding.
# 'Home Ownership' comes first.
print(data["Home Ownership"].value_counts())
# Recorded output:
#   Home Ownership
#   Home Mortgage    3637
#   Rent             3204
#   Own Home          647
#   Have Mortgage      12
#   Name: count, dtype: int64
#
# The categories are treated as ordered, so an ordinal (label) encoding is used.
# Ranks 1..4 follow: Own Home < Rent < Have Mortgage < Home Mortgage.
home_ownership_mapping = {
    category: rank
    for rank, category in enumerate(
        ("Own Home", "Rent", "Have Mortgage", "Home Mortgage"), start=1
    )
}
data["Home Ownership"] = data["Home Ownership"].map(home_ownership_mapping)
# Show the first five encoded values.
print(data["Home Ownership"].head())
# Recorded output:
#   0    1
#   1    1
#   2    4
#   3    1
#   4    2
#   Name: Home Ownership, dtype: int64
# Inspect the values of 'Years in current job'.
print(data["Years in current job"].value_counts())
# Recorded output:
#   Years in current job
#   10+ years    2332
#   2 years       705
#   3 years       620
#   < 1 year      563
#   5 years       516
#   1 year        504
#   4 years       469
#   6 years       426
#   7 years       396
#   8 years       339
#   9 years       259
#   Name: count, dtype: int64
#
# Ordinal (label) encoding by number of years.
# The top bucket in the data is spelled "10+ years". The key must match that
# spelling exactly, otherwise Series.map() turns the most frequent category
# into NaN and the later mode-based imputation overwrites it with another value.
years_in_current_job_mapping = {
    "< 1 year": 0,
    "1 year": 1,
    "2 years": 2,
    "3 years": 3,
    "4 years": 4,
    "5 years": 5,
    "6 years": 6,
    "7 years": 7,
    "8 years": 8,
    "9 years": 9,
    "10+ years": 10,
}
data["Years in current job"] = data["Years in current job"].map(years_in_current_job_mapping)
# Show the first five encoded values.
# NOTE: a run recorded before the "10+ years" key was corrected showed NaN in
# rows 0 and 1. Any NaN that remains now should be a value that is missing in
# the raw data; missing values are filled in the imputation step further down.
print(data["Years in current job"].head())
# Inspect the values of 'Purpose'.
print(data["Purpose"].value_counts())
# Recorded output:
#   Purpose
#   debt consolidation      5944
#   other                    665
#   home improvements        412
#   business loan            129
#   buy a car                 96
#   medical bills             71
#   major purchase            40
#   take a trip               37
#   buy house                 34
#   small business            26
#   wedding                   15
#   moving                    11
#   educational expenses      10
#   vacation                   8
#   renewable energy           2
#   Name: count, dtype: int64
#
# One-hot encode 'Purpose'.
#   data       - the whole DataFrame
#   columns    - the columns to encode
#   drop_first - drop the first generated indicator column
#   dtype      - "int" makes the indicator columns integers instead of booleans
#                (method 1 for the bool -> int conversion)
data = pd.get_dummies(data, columns=['Purpose'], drop_first=True, dtype="int")
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
# Method 2 for the bool -> int conversion: find the columns created by the
# encoding by comparing against the columns of the raw file, then cast them.
# Because dtype="int" was already passed above, the cast leaves values unchanged.
data_columns = data.columns.tolist()
data2 = pd.read_csv("data.csv")
data2_columns = data2.columns.tolist()
# Columns present after encoding but absent from the raw file.
new_columns_list = [column for column in data_columns if column not in data2_columns]
# Cast every newly created indicator column to int.
for column in new_columns_list:
    data[column] = data[column].astype(int)
# Show the structure of the processed data.
data.info()
# Inspect the values of 'Term'.
print(data["Term"].value_counts())
# Recorded output:
#   Term
#   Short Term    5556
#   Long Term     1944
#   Name: count, dtype: int64
#
# Only two categories, so a 0/1 encoding is enough.
Term_mapping = {"Short Term": 0, "Long Term": 1}
data["Term"] = data["Term"].map(Term_mapping)
# Show the first five encoded values.
print(data["Term"].head())
# Recorded output:
#   0    0
#   1    1
#   2    0
#   3    0
#   4    0
#   Name: Term, dtype: int64
#
# Rename the column to 'Long Term' so that the value 1 reads as "is long term".
# Notes on DataFrame.rename:
#   columns - dict {old name: new name} or a function applied to every column name
#   index   - same, but for the row labels
#   axis    - which axis a positional mapper targets (0/'index' or 1/'columns');
#             not needed when the index= / columns= keywords are used
#   inplace - False by default: a renamed copy is returned and the original is
#             left untouched; True modifies the DataFrame in place
data = data.rename(columns={"Term": "Long Term"})
# Impute missing values.
# Number of missing values per column before imputation.
print(data.isnull().sum())
# Walk over every column; columns without missing values are skipped, the
# others are filled with their mode.
for column in data.columns:
    if not data[column].isnull().any():
        continue
    mode = data[column].mode()[0]  # mode() may return several values; take the first
    data[column] = data[column].fillna(mode)
# Number of missing values per column after imputation.
print(data.isnull().sum())  # recorded run: all zeros
# Modelling: split the data into a training set and a test set.
from sklearn.model_selection import train_test_split
y = data["Credit Default"]  # target column
X = data.drop(columns=["Credit Default"])  # every remaining column is a feature
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shapes of the four resulting objects.
print(f"训练集特征的形状{X_train.shape},训练集标签的形状{y_train.shape},测试集特征的形状{X_test.shape},测试集标签的形状{y_test.shape}")
# Recorded run: train features (6000, 30), train labels (6000,),
# test features (1500, 30), test labels (1500,)
# 模型训练与评估
"""
三行经典代码
1.模型实例化
2.模型训练(代入数据集)
3.模型预测(代入测试集)
测试集的预测值和测试中的真实值进行对比,得到混淆矩阵
基于混淆矩阵,计算准确率、召回率、F1值,这些都是固定阈值的评估指标
AUC是基于不同阈值得到的不同的混淆矩阵,然后计算每个阈值对应的FPR和TPR,将这些点连成线,最后求曲线下的面积,得到AUC值
"""
from sklearn.svm import SVC # support vector machine classifier
from sklearn.neighbors import KNeighborsClassifier # k-nearest-neighbours classifier
from sklearn.linear_model import LogisticRegression # logistic regression classifier
import xgboost as xgb # XGBoost classifier
import lightgbm as lgb # LightGBM classifier
from sklearn.ensemble import RandomForestClassifier # random forest classifier
from catboost import CatBoostClassifier # CatBoost classifier
from sklearn.tree import DecisionTreeClassifier # decision tree classifier
from sklearn.naive_bayes import GaussianNB # Gaussian naive Bayes classifier
from sklearn.metrics import classification_report, confusion_matrix # classification report and confusion matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # metrics for evaluating classifier performance
import warnings # used to silence warning messages
warnings.filterwarnings("ignore") # suppress all warnings for the rest of the script
# Support vector machine (SVM).
# A fixed random_state keeps the run reproducible; every other hyperparameter
# stays at its default.
svm_model = SVC(random_state=42)
# Train on the training set, then predict the test set (fit() returns the estimator).
svm_pred = svm_model.fit(X_train, y_train).predict(X_test)
print("SVM分类报告:")
print(classification_report(y_test, svm_pred))  # arguments: true labels, predicted labels
print("SVM混淆矩阵:")
print(confusion_matrix(y_test, svm_pred))
# Evaluation metrics, computed in this order:
#   accuracy  - share of all samples that were predicted correctly
#   precision - share of predicted positives that are truly positive
#   recall    - share of true positives that were predicted positive
#   F1        - harmonic mean of precision and recall
# Precision, recall and F1 are reported for the positive class by default.
svm_accuracy, svm_precision, svm_recall, svm_f1 = (
    metric(y_test, svm_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("SVM性能指标:")
print(f"准确率:{svm_accuracy:.4f}")
print(f"精确率:{svm_precision:.4f}")
print(f"召回率:{svm_recall:.4f}")
print(f"F1分数:{svm_f1:.4f}")
# K-nearest neighbours (KNN), default hyperparameters.
knn_model = KNeighborsClassifier()
# Train on the training set, then predict the test set (fit() returns the estimator).
knn_pred = knn_model.fit(X_train, y_train).predict(X_test)
print("KNN分类报告:")
print(classification_report(y_test, knn_pred))
print("KNN混淆矩阵:")
print(confusion_matrix(y_test, knn_pred))
# Accuracy, precision, recall and F1, in that order.
knn_accuracy, knn_precision, knn_recall, knn_f1 = (
    metric(y_test, knn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("KNN模型评估指标")
print(f"准确率:{knn_accuracy:.4f}")
print(f"精确率:{knn_precision:.4f}")
print(f"召回率:{knn_recall:.4f}")
print(f"F1分数:{knn_f1:.4f}")
# Logistic regression, seeded with random_state=42 for reproducibility.
logreg_model = LogisticRegression(random_state=42)
# Train on the training set, then predict the test set (fit() returns the estimator).
logreg_pred = logreg_model.fit(X_train, y_train).predict(X_test)
print("LogReg逻辑回归分类报告:")
print(classification_report(y_test, logreg_pred))
print("LogReg逻辑回归混淆矩阵:")
print(confusion_matrix(y_test, logreg_pred))
# Accuracy, precision, recall and F1, in that order.
logreg_accuracy, logreg_precision, logreg_recall, logreg_f1 = (
    metric(y_test, logreg_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("LogReg逻辑回归模型评估指标:")
print(f"准确率:{logreg_accuracy:.4f}")
print(f"精确率:{logreg_precision:.4f}")
print(f"召回率:{logreg_recall:.4f}")
print(f"F1分数:{logreg_f1:.4f}")
# Naive Bayes.
# GaussianNB is the naive Bayes variant that assumes each feature follows a
# Gaussian (normal) distribution, which suits continuous features. It applies
# Bayes' theorem and predicts from the class-conditional probabilities.
nb_model = GaussianNB()
# Train on the training set, then predict the test set (fit() returns the estimator).
nb_pred = nb_model.fit(X_train, y_train).predict(X_test)
print("Naive Bayes贝叶斯分类报告:")
print(classification_report(y_test, nb_pred))
print("Naive Bayes贝叶斯混淆矩阵:")
print(confusion_matrix(y_test, nb_pred))
# Accuracy, precision, recall and F1, in that order.
nb_accuracy, nb_precision, nb_recall, nb_f1 = (
    metric(y_test, nb_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("Naive Bayes")
print(f"准确率:{nb_accuracy:.4f}")
print(f"精确率:{nb_precision:.4f}")
print(f"召回率:{nb_recall:.4f}")
print(f"F1分数:{nb_f1:.4f}")
# Decision tree, seeded with random_state=42 for reproducibility.
dt_model = DecisionTreeClassifier(random_state=42)
# Train on the training set, then predict the test set (fit() returns the estimator).
dt_pred = dt_model.fit(X_train, y_train).predict(X_test)
print("DecisionTree决策树分类报告:")
print(classification_report(y_test, dt_pred))
print("DecisionTree决策树混淆矩阵:")
print(confusion_matrix(y_test, dt_pred))
# Accuracy, precision, recall and F1, in that order.
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    metric(y_test, dt_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("DecisionTree决策树模型评估指标:")
print(f"准确率:{dt_accuracy:.4f}")
print(f"精确率:{dt_precision:.4f}")
print(f"召回率:{dt_recall:.4f}")
print(f"F1分数:{dt_f1:.4f}")
# Random forest, seeded with random_state=42 for reproducibility.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)  # train on the training set
rf_pred = rf_model.predict(X_test)  # predict the test set
print("RandomForest随机森林分类报告:")
print(classification_report(y_test, rf_pred))
# The heading below previously contained a typo ("随即森丽"); it now reads
# "随机森林" (random forest), matching the other headings of this model.
print("RandomForest随机森林混淆矩阵:")
print(confusion_matrix(y_test, rf_pred))
rf_accuracy = accuracy_score(y_test, rf_pred)  # accuracy
rf_precision = precision_score(y_test, rf_pred)  # precision
rf_recall = recall_score(y_test, rf_pred)  # recall
rf_f1 = f1_score(y_test, rf_pred)  # F1 score
print("RandomForest随机森林评估指标")
print(f"准确率:{rf_accuracy:.4f}")
print(f"精确率:{rf_precision:.4f}")
print(f"召回率:{rf_recall:.4f}")
print(f"F1分数:{rf_f1:.4f}")
# XGBoost, seeded with random_state=42 for reproducibility.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)  # train on the training set
xgb_pred = xgb_model.predict(X_test)  # predict the test set
# Each heading is printed on its own line, followed by the report / matrix.
print("XGBoost分类报告:", classification_report(y_test, xgb_pred), sep="\n")
print("XGBoost混淆矩阵:", confusion_matrix(y_test, xgb_pred), sep="\n")
# Accuracy, precision, recall and F1, in that order.
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
    accuracy_score(y_test, xgb_pred),
    precision_score(y_test, xgb_pred),
    recall_score(y_test, xgb_pred),
    f1_score(y_test, xgb_pred),
)
print("XGB模型评估指标:")
print(f"准确率:{xgb_accuracy:.4f}")
print(f"精确率:{xgb_precision:.4f}")
print(f"召回率:{xgb_recall:.4f}")
print(f"F1分数:{xgb_f1:.4f}")
# LightGBM, seeded with random_state=42 for reproducibility.
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)  # train on the training set
lgb_pred = lgb_model.predict(X_test)  # predict the test set
# Each heading is printed on its own line, followed by the report / matrix.
print("LightGBM分类报告:", classification_report(y_test, lgb_pred), sep="\n")
print("LightGBM混淆矩阵:", confusion_matrix(y_test, lgb_pred), sep="\n")
# Accuracy, precision, recall and F1, in that order.
lgb_accuracy, lgb_precision, lgb_recall, lgb_f1 = (
    accuracy_score(y_test, lgb_pred),
    precision_score(y_test, lgb_pred),
    recall_score(y_test, lgb_pred),
    f1_score(y_test, lgb_pred),
)
print("LightGBM模型评估指标:")
print(f"准确率:{lgb_accuracy:.4f}")
print(f"精确率:{lgb_precision:.4f}")
print(f"召回率:{lgb_recall:.4f}")
print(f"F1分数:{lgb_f1:.4f}")
