步骤 | 内容 |
---|---|
第1天 | 高级数据操作(索引、透视、变形) |
第2天 | 缺失值和异常值处理 |
第3天 | 多表合并与连接 |
第4天 | 特征工程(编码、归一化、时间) |
第5天 | 数据集拆分(训练集 / 测试集) |
第6天 | 逻辑回归模型构建与评估 |
第7天 | 多模型对比评估(今天) |
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split; fit() hands back the fitted
# estimator, so construction and training are chained into one expression.
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test split.
y_pred_tree = tree.predict(X_test)
from sklearn.metrics import classification_report

# Print one per-class precision/recall/F1 report per model, in the same order
# as before: logistic regression first, then the decision tree.
for heading, predicted in (
    (" Logistic 回归:", y_pred_log),
    (" 决策树模型:", y_pred_tree),
):
    print(heading)
    print(classification_report(y_test, predicted))
import matplotlib.pyplot as plt
# accuracy_score is called below, but none of the snippets shown before this
# one import it, so import it here to keep the snippet runnable.
from sklearn.metrics import accuracy_score

# Bar chart comparing the test-set accuracy of the two fitted models.
models = ["Logistic", "DecisionTree"]
accuracies = [
    accuracy_score(y_test, y_pred_log),
    accuracy_score(y_test, y_pred_tree),
]
plt.bar(models, accuracies, color=["skyblue", "lightgreen"])
plt.title("模型准确率对比")
plt.ylabel("Accuracy")
plt.show()
compare_models.py
读取本周生成的训练 / 测试数据
同时训练逻辑回归与决策树模型
输出各自的评估指标(Accuracy、Precision、Recall、F1)
(可选)将结果写入一个 CSV 或图表可视化
思考不同模型优劣,以及如何选择合适模型
# compare_models.py
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
accuracy_score,
classification_report,
confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Font setup so the Chinese titles/labels render: 'Arial Unicode MS' is the
# font the author recommends for Mac users; unicode_minus is switched off.
plt.rcParams.update({
    'font.family': 'Arial Unicode MS',
    'axes.unicode_minus': False,
})
# 1. Load the training and test splits from CSV.
data_dir = "data/model"

# Feature matrices stay as DataFrames.
X_train, X_test = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ("X_train.csv", "X_test.csv")
)

# Label files are flattened to 1-D arrays, the shape fit() and the metric
# functions are given below.
y_train, y_test = (
    pd.read_csv(os.path.join(data_dir, csv_name)).values.ravel()
    for csv_name in ("y_train.csv", "y_test.csv")
)
# 2./3. Create both classifiers and train each on the training split.
# fit() returns the fitted estimator, so each model is built and trained in
# a single expression (logistic regression first, then the tree).
log_model = LogisticRegression().fit(X_train, y_train)
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# 4. Predict labels for the test split with each fitted model.
y_pred_log, y_pred_tree = (
    fitted.predict(X_test) for fitted in (log_model, tree_model)
)
# 5. Evaluation: print a heading followed by the per-class report for each
# model. sep="\n" puts the report on the line after its heading.
print(" Logistic 回归评估报告:", classification_report(y_test, y_pred_log), sep="\n")
print("\n 决策树评估报告:", classification_report(y_test, y_pred_tree), sep="\n")

# 6. Overall accuracy of each model, reused by the chart and the CSV summary.
acc_log, acc_tree = (
    accuracy_score(y_test, predicted) for predicted in (y_pred_log, y_pred_tree)
)
# 7. Confusion matrices, drawn side by side as annotated heatmaps.
plt.figure(figsize=(10, 4))

# One entry per panel: (predictions, colour map, title).
panels = (
    (y_pred_log, "Blues", "Logistic 回归 - 混淆矩阵"),
    (y_pred_tree, "Greens", "决策树 - 混淆矩阵"),
)
for slot, (predicted, palette, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    # labels=[0, 1] fixes the row/column order so it matches the tick labels.
    matrix = confusion_matrix(y_test, predicted, labels=[0, 1])
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap=palette,
        xticklabels=["0", "1"],
        yticklabels=["0", "1"],
    )
    plt.title(heading)
    plt.xlabel("预测", fontproperties="Arial Unicode MS")
    plt.ylabel("真实", fontproperties="Arial Unicode MS")

plt.tight_layout()
plt.show()
# 8. Bar chart of the two accuracies.
bar_labels = ["Logistic", "Decision Tree"]
bar_heights = [acc_log, acc_tree]
bar_colors = ["skyblue", "lightgreen"]

plt.figure(figsize=(5, 4))
plt.bar(bar_labels, bar_heights, color=bar_colors)
plt.title("模型准确率对比")
plt.ylabel("Accuracy")
# Show the full 0-1 range, with dashed horizontal grid lines.
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# 9. Summarise the accuracies in a small table and save it as CSV.
results_df = pd.DataFrame(
    [("Logistic", acc_log), ("Decision Tree", acc_tree)],
    columns=["模型", "准确率"],
)

# Create the output directory if it does not exist yet, then write the file
# without the index column.
os.makedirs("data/result", exist_ok=True)
results_df.to_csv("data/result/model_comparison.csv", index=False)
print("\n✅ 对比结果已保存:data/result/model_comparison.csv")
结果输出:
Logistic 回归评估报告:
precision recall f1-score support
0 1.00 1.00 1.00 7
1 1.00 1.00 1.00 13
accuracy 1.00 20
macro avg 1.00 1.00 1.00 20
weighted avg 1.00 1.00 1.00 20
决策树评估报告:
precision recall f1-score support
0 1.00 1.00 1.00 7
1 1.00 1.00 1.00 13
accuracy 1.00 20
macro avg 1.00 1.00 1.00 20
weighted avg 1.00 1.00 1.00 20
✅ 对比结果已保存:data/result/model_comparison.csv
data/result/model_comparison.csv:
PS:
可以使用下面的代码生成训练/测试集:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
# Build a reproducible toy dataset: `size` rows with a random integer score
# drawn from [40, 100) and a random gender.
np.random.seed(42)
size = 100
df = pd.DataFrame({
    "成绩": np.random.randint(40, 100, size=size),
    "性别": np.random.choice(["男", "女"], size=size),
})

# Derived columns: z-scored score, pass/fail flag (score >= 60) and a
# one-hot encoding of gender.
df["成绩_标准化"] = (df["成绩"] - df["成绩"].mean()) / df["成绩"].std()
df["是否及格_数值"] = (df["成绩"] >= 60).astype(int)
df["性别_男"] = (df["性别"] == "男").astype(int)
df["性别_女"] = (df["性别"] == "女").astype(int)

# Features and label.
# The label column "是否及格_数值" must NOT be part of X: including the target
# among the features leaks the answer to the model, so every classifier scores
# perfectly and the comparison becomes meaningless.
X = df[["成绩_标准化", "性别_男", "性别_女"]]
y = df["是否及格_数值"]
# Split into train and test sets (20% test), with a fixed seed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Write each split to its own CSV under data/model, without the index column.
os.makedirs("data/model", exist_ok=True)
for split, destination in (
    (X_train, "data/model/X_train.csv"),
    (X_test, "data/model/X_test.csv"),
    (y_train, "data/model/y_train.csv"),
    (y_test, "data/model/y_test.csv"),
):
    split.to_csv(destination, index=False)