# 仔细回顾一下之前21天的内容,没跟上进度的同学补一下进度。
# 作业:
# 自行学习参考如何使用kaggle平台,写下使用注意点,并对下述比赛提交代码
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
from sklearn.metrics import make_scorer,accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
import warnings #用于忽略警告信息
warnings.filterwarnings("ignore") # 忽略所有警告信息
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Figure resolution for all subsequent plots.
plt.rcParams['figure.dpi'] = 300
# Use a Chinese-capable font so the CJK labels below render correctly.
# NOTE(review): SimHei ships with Windows; on other systems substitute a
# locally installed CJK font or the labels will show as boxes.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Load the Kaggle Titanic data from hard-coded local paths.
# NOTE(review): absolute Windows paths — adjust per machine / use a config.
train_data = pd.read_csv(r'C:\Users\Administrator\Desktop\python60-days-challenge-master\python60-days-challenge-master\titanic\train.csv')
test_data = pd.read_csv(r'C:\Users\Administrator\Desktop\python60-days-challenge-master\python60-days-challenge-master\titanic\test.csv')
train_data.head()
train_data.info()
# --- Data cleaning & encoding ---
# Drop identifier-like / high-cardinality columns with no direct signal.
train_data = train_data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Split columns by dtype so each group gets an appropriate imputation.
continuous_features = train_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
discrete_features = train_data.select_dtypes(exclude=['float64', 'int64']).columns.tolist()

# Discrete features: impute with the mode.
# FIX: `train_data[feature].fillna(..., inplace=True)` operates on a column
# selection and is deprecated chained assignment under pandas copy-on-write
# (it stops modifying the frame in pandas 3.x) — assign the result back.
for feature in discrete_features:
    if train_data[feature].isnull().sum() > 0:
        mode_value = train_data[feature].mode()[0]
        train_data[feature] = train_data[feature].fillna(mode_value)

# Continuous features: impute with the median (robust to fare/age outliers).
for feature in continuous_features:
    if train_data[feature].isnull().sum() > 0:
        median_value = train_data[feature].median()
        train_data[feature] = train_data[feature].fillna(median_value)
train_data.info()

# One-hot encode the unordered categorical features.
# Sex has exactly two levels, so drop_first=True leaves a single 0/1 column.
train_data["Sex"] = pd.get_dummies(train_data["Sex"], dtype=int, drop_first=True)
# Embarked gets one indicator column per port, replacing the original column.
train_data = pd.concat(
    [train_data.drop("Embarked", axis=1),
     pd.get_dummies(train_data["Embarked"], prefix="Embarked", dtype=int, drop_first=False)],
    axis=1)
train_data.head()
# --- Train/test split ---
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
y = train_data['Survived']
x = train_data.drop(['Survived'], axis=1)
# Hold out 20% of rows for evaluation; fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)
import time

# --- Baseline: random forest with default hyperparameters ---
start_time = time.time()
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)
end_time = time.time()
print(f"训练与预测时间:{end_time-start_time:.4f}秒")

# Per-class report and confusion matrix on the held-out split.
print("\n随机森林 分类报告:")
print(classification_report(y_test, rf_pred))
print("随机森林 混淆矩阵:")
print(confusion_matrix(y_test, rf_pred))

# Headline metrics, kept for comparison against the tuned model later on.
rf_accuracy_1 = accuracy_score(y_test, rf_pred)
rf_precision_1 = precision_score(y_test, rf_pred)
rf_recall_1 = recall_score(y_test, rf_pred)
rf_f1_1 = f1_score(y_test, rf_pred)
print("随机森林 模型评估指标:")
print(f"准确率: {rf_accuracy_1:.4f}")
print(f"精确率: {rf_precision_1:.4f}")
print(f"召回率: {rf_recall_1:.4f}")
print(f"F1 值: {rf_f1_1:.4f}")
# --- Hyperparameter tuning via grid search ---
# SMOTE oversampling balances the classes on the TRAINING split only;
# the test split keeps its natural class distribution.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

from sklearn.model_selection import GridSearchCV
# Hyperparameter grid to search exhaustively.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# 5-fold cross-validated search, parallelised across all cores, scored by
# accuracy; GridSearchCV refits the best model on the full training data.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy')
start_time = time.time()
grid_search.fit(x_train_smote, y_train_smote)
end_time = time.time()
print(f"网格搜索耗时: {end_time - start_time:.4f} 秒")
print("最佳参数: ", grid_search.best_params_)  # winning parameter combination

# Evaluate the tuned estimator on the held-out test split.
best_model = grid_search.best_estimator_
best_pred = best_model.predict(x_test)
print("\n网格搜索优化后的随机森林 在测试集上的分类报告:")
print(classification_report(y_test, best_pred))
print("网格搜索优化后的随机森林 在测试集上的混淆矩阵:")
# FIX: the confusion matrix was accidentally printed twice — print it once.
print(confusion_matrix(y_test, best_pred))
rf_accuracy = accuracy_score(y_test, best_pred)
rf_precision = precision_score(y_test, best_pred)
rf_recall = recall_score(y_test, best_pred)
rf_f1 = f1_score(y_test, best_pred)
print("随机森林 模型评估指标:")
print(f"准确率: {rf_accuracy:.4f}")
print(f"精确率: {rf_precision:.4f}")
print(f"召回率: {rf_recall:.4f}")
print(f"F1 值: {rf_f1:.4f}")
# --- Feature engineering for exploratory analysis ---
# Family size = siblings/spouses + parents/children + the passenger.
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
# Bucket ages into labelled bands.
train_data['AgeGroup'] = pd.cut(train_data['Age'],
                                bins=[0, 12, 18, 30, 50, 100],
                                labels=['儿童', '青少年', '青年', '中年', '老年'])
# Quartile-based fare bands.
train_data['FareGroup'] = pd.qcut(train_data['Fare'], q=4,
                                  labels=['非常低', '低', '高', '非常高'])
# Show the columns (e.g. to check whether Embarked still exists).
print("数据列名:", train_data.columns.tolist())

# Survival rate per group, expressed as a percentage: mean of the 0/1
# Survived flag, rounded to 4 decimals, times 100.
def _survival_pct(key):
    # one-line helper — keeps the five groupby calls below identical in shape
    return train_data.groupby(key)['Survived'].mean().round(4) * 100

sex_survival = _survival_pct('Sex')
age_survival = _survival_pct('AgeGroup')
class_survival = _survival_pct('Pclass')
family_survival = _survival_pct('FamilySize')
fare_survival = _survival_pct('FareGroup')
# --- Visualisation: survival rate by feature (3x2 grid of bar charts) ---
def _survival_barplot(position, series, title, xlabel, palette):
    """Draw one survival-rate bar chart into subplot `position` of the 3x2
    grid, annotating each bar with its percentage value."""
    plt.subplot(3, 2, position)
    sns.barplot(x=series.index, y=series.values, palette=palette)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('幸存率(%)')
    for i, v in enumerate(series.values):
        plt.text(i, v + 1, f'{v:.1f}%', ha='center')

plt.figure(figsize=(36, 30))
# The six panels below were six near-identical stanzas; factored into the
# helper above — same plots, same labels, same order.
_survival_barplot(1, sex_survival, '性别与幸存率', '性别', ['#FF6B8B', '#40A9FF'])
_survival_barplot(2, age_survival, '年龄组与幸存率', '年龄组', 'Set3')
_survival_barplot(3, class_survival, '船舱等级与幸存率', '船舱等级', 'Blues_d')
_survival_barplot(4, family_survival, '家庭大小与幸存率', '家庭大小', 'Greens_d')
# Embarked was replaced by one-hot columns earlier in the script, so this
# branch normally falls through to the placeholder panel.
if 'Embarked' in train_data.columns:
    embarked_survival = train_data.groupby('Embarked')['Survived'].mean().round(4) * 100
    _survival_barplot(5, embarked_survival, '登船港口与幸存率', '登船港口', 'Oranges_d')
else:
    plt.subplot(3, 2, 5)
    plt.text(0.5, 0.5, 'Embarked列不存在', ha='center', va='center', fontsize=12)
    plt.axis('off')
_survival_barplot(6, fare_survival, '票价分组与幸存率', '票价分组', 'Purples_d')
plt.tight_layout()
plt.show()
# --- Correlation matrix analysis ---
# Encode the remaining categorical variables numerically before .corr().
df_encoded = train_data.copy()
# FIX: Sex was already one-hot encoded to 0/1 earlier in the script, so an
# unconditional .map({'male': 0, 'female': 1}) turns every value into NaN
# and erases Sex from the correlation matrix. Only map when the column
# still holds the raw string labels.
if df_encoded['Sex'].dtype == object:
    df_encoded['Sex'] = df_encoded['Sex'].map({'male': 0, 'female': 1})
# Embarked only exists if it was not one-hot encoded away earlier.
if 'Embarked' in df_encoded.columns:
    df_encoded['Embarked'] = df_encoded['Embarked'].astype('category').cat.codes
df_encoded['AgeGroup'] = df_encoded['AgeGroup'].astype('category').cat.codes
df_encoded['FareGroup'] = df_encoded['FareGroup'].astype('category').cat.codes
# Build the column list from what actually exists in the frame.
columns_to_use = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
                  'Fare', 'FamilySize', 'AgeGroup', 'FareGroup']
if 'Embarked' in df_encoded.columns:
    columns_to_use.append('Embarked')
if 'Cabin' in df_encoded.columns:  # Cabin is dropped earlier; guard kept for safety
    columns_to_use.append('Cabin')
correlation = df_encoded[columns_to_use].corr()
# Heatmap of pairwise Pearson correlations.
plt.figure(figsize=(24, 20))
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm',
            square=True, linewidths=.5, cbar_kws={"shrink": .8})
plt.title('幸存率与各特征的相关性热力图')
plt.tight_layout()
plt.show()
# @浙大疏锦行