Python training camp check-in record, day 22

Review day

Carefully go back over the material from the previous 21 days; anyone who has fallen behind should use today to catch up.

Homework:

Learn on your own how to use the Kaggle platform, write down the points to watch out for when using it, and submit code for the competition below.
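A few usage notes, based on the standard Kaggle workflow (written down here as part of the homework): you must join the competition and accept its rules before you can submit; inside a Kaggle notebook the competition data is mounted read-only under /kaggle/input/<competition-name>/, so file paths differ from a local run; to use the kaggle command-line tool locally, the API token (kaggle.json downloaded from the account settings page) goes into ~/.kaggle/; and for Titanic the submission is a CSV with exactly two columns, PassengerId and Survived. A sketch of building that file from the model trained below is shown right after the grid-search section.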

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier  # random forest classifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score  # metrics for evaluating classifier performance
from sklearn.metrics import classification_report, confusion_matrix  # classification report and confusion matrix
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings  # used to silence warning messages
warnings.filterwarnings("ignore")  # ignore all warnings
# figure resolution
plt.rcParams['figure.dpi'] = 300
# font settings so that non-ASCII plot labels render correctly
# change this to a font available on your system (SimHei ships with Windows)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# load the data
train_data = pd.read_csv(r'C:\Users\Administrator\Desktop\python60-days-challenge-master\python60-days-challenge-master\titanic\train.csv')
test_data = pd.read_csv(r'C:\Users\Administrator\Desktop\python60-days-challenge-master\python60-days-challenge-master\titanic\test.csv')
train_data.head()
train_data.info()
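# PassengerId, Name and Ticket are identifier-like columns, and Cabin is mostly missing, so they are dropped up front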
train_data = train_data.drop(columns=["PassengerId","Name","Ticket","Cabin"])
# fill in missing values
continuous_features = train_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
discrete_features = train_data.select_dtypes(exclude=['float64', 'int64']).columns.tolist()
# discrete features: fill with the mode
for feature in discrete_features:
    if train_data[feature].isnull().sum() > 0:
        mode_value = train_data[feature].mode()[0]
        train_data[feature] = train_data[feature].fillna(mode_value)
# continuous features: fill with the median
for feature in continuous_features:
    if train_data[feature].isnull().sum() > 0:
        median_value = train_data[feature].median()
        train_data[feature] = train_data[feature].fillna(median_value)
train_data.info()
# one-hot encode the unordered categorical features
# Sex becomes a single 0/1 column (1 = male), equivalent to pd.get_dummies(..., drop_first=True)
train_data["Sex"] = train_data["Sex"].map({"male": 1, "female": 0})
train_data = pd.concat([train_data.drop("Embarked", axis=1), pd.get_dummies(train_data["Embarked"], prefix="Embarked", dtype=int, drop_first=False)], axis=1)
train_data.head()
# split into training and test sets
x = train_data.drop(['Survived'], axis=1)  # features; axis=1 drops a column
y = train_data['Survived']  # label
# 80/20 train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)  # 80% train, 20% test
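# note: train_test_split also accepts stratify=y, which keeps the survived/not-survived ratio the same in both splits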
import time
# random forest baseline
start_time = time.time()
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)
end_time = time.time()
print(f"Training and prediction time: {end_time - start_time:.4f} s")
print("\nRandom forest classification report:")
print(classification_report(y_test, rf_pred))
print("Random forest confusion matrix:")
print(confusion_matrix(y_test, rf_pred))

rf_accuracy_1 = accuracy_score(y_test, rf_pred)
rf_precision_1 = precision_score(y_test, rf_pred)
rf_recall_1 = recall_score(y_test, rf_pred)
rf_f1_1 = f1_score(y_test, rf_pred)
print("Random forest evaluation metrics:")
print(f"Accuracy:  {rf_accuracy_1:.4f}")
print(f"Precision: {rf_precision_1:.4f}")
print(f"Recall:    {rf_recall_1:.4f}")
print(f"F1 score:  {rf_f1_1:.4f}")
# hyperparameter tuning with grid search
# SMOTE oversampling to balance the classes in the training split
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
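# illustrative sanity check (not in the original run): compare the class balance before and after resampling
print("class counts before SMOTE:", y_train.value_counts().to_dict())
print("class counts after SMOTE: ", y_train_smote.value_counts().to_dict())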
# parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# build the grid-search object
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),  # random forest classifier
                           param_grid=param_grid,  # parameter grid
                           cv=5,  # 5-fold cross-validation
                           n_jobs=-1,  # use all available CPU cores
                           scoring='accuracy')  # accuracy as the selection score
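# note: the grid has 3 * 4 * 3 * 3 = 108 parameter combinations, so 5-fold CV fits 540 models in total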
 
start_time = time.time()
# run the grid search on the (resampled) training set
grid_search.fit(x_train_smote, y_train_smote)  # model construction and training are handled inside the grid-search object
end_time = time.time()

print(f"Grid search time: {end_time - start_time:.4f} s")
print("Best parameters: ", grid_search.best_params_)  # best_params_ holds the best parameter combination

# predict with the best model
best_model = grid_search.best_estimator_  # best fitted model
best_pred = best_model.predict(x_test)  # predict on the held-out test split

print("\nGrid-search-tuned random forest, classification report on the test split:")
print(classification_report(y_test, best_pred))
print("Grid-search-tuned random forest, confusion matrix on the test split:")
print(confusion_matrix(y_test, best_pred))
rf_accuracy = accuracy_score(y_test, best_pred)
rf_precision = precision_score(y_test, best_pred)
rf_recall = recall_score(y_test, best_pred)
rf_f1 = f1_score(y_test, best_pred)
print("Tuned random forest evaluation metrics:")
print(f"Accuracy:  {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall:    {rf_recall:.4f}")
print(f"F1 score:  {rf_f1:.4f}")
# new feature: family size (siblings/spouses + parents/children + the passenger)
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1

# age groups
train_data['AgeGroup'] = pd.cut(train_data['Age'], bins=[0, 12, 18, 30, 50, 100],
                        labels=['Child', 'Teen', 'Young adult', 'Middle-aged', 'Senior'])

# fare groups
train_data['FareGroup'] = pd.qcut(train_data['Fare'], q=4, labels=['Very low', 'Low', 'High', 'Very high'])
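# note: pd.qcut splits on quartiles, so each of the four FareGroup bins holds roughly the same number of passengers,
# whereas pd.cut above uses fixed age boundaries and its bins can be very unequal in size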

# print the column names to check whether the raw Embarked column still exists
print("Columns:", train_data.columns.tolist())

# Part 1: survival rate by feature
# 1. survival rate by sex (Sex is already encoded as 0/1 here, 1 = male)
sex_survival = train_data.groupby('Sex')['Survived'].mean().round(4) * 100

# 2. survival rate by age group
age_survival = train_data.groupby('AgeGroup')['Survived'].mean().round(4) * 100

# 3. survival rate by passenger class
class_survival = train_data.groupby('Pclass')['Survived'].mean().round(4) * 100

# 4. survival rate by family size
family_survival = train_data.groupby('FamilySize')['Survived'].mean().round(4) * 100

# 5. survival rate by fare group
fare_survival = train_data.groupby('FareGroup')['Survived'].mean().round(4) * 100

# Part 2: visualisation
plt.figure(figsize=(36, 30))

# 1. survival rate by sex
plt.subplot(3, 2, 1)
sns.barplot(x=sex_survival.index, y=sex_survival.values, palette=['#FF6B8B', '#40A9FF'])
plt.title('Survival rate by sex (0 = female, 1 = male)')
plt.xlabel('Sex')
plt.ylabel('Survival rate (%)')
for i, v in enumerate(sex_survival.values):
    plt.text(i, v + 1, f'{v:.1f}%', ha='center')

# 2. survival rate by age group
plt.subplot(3, 2, 2)
sns.barplot(x=age_survival.index, y=age_survival.values, palette='Set3')
plt.title('Survival rate by age group')
plt.xlabel('Age group')
plt.ylabel('Survival rate (%)')
for i, v in enumerate(age_survival.values):
    plt.text(i, v + 1, f'{v:.1f}%', ha='center')

# 3. survival rate by passenger class
plt.subplot(3, 2, 3)
sns.barplot(x=class_survival.index, y=class_survival.values, palette='Blues_d')
plt.title('Survival rate by passenger class')
plt.xlabel('Pclass')
plt.ylabel('Survival rate (%)')
for i, v in enumerate(class_survival.values):
    plt.text(i, v + 1, f'{v:.1f}%', ha='center')

# 4. survival rate by family size
plt.subplot(3, 2, 4)
sns.barplot(x=family_survival.index, y=family_survival.values, palette='Greens_d')
plt.title('Survival rate by family size')
plt.xlabel('Family size')
plt.ylabel('Survival rate (%)')
for i, v in enumerate(family_survival.values):
    plt.text(i, v + 1, f'{v:.1f}%', ha='center')

# 5. survival rate by port of embarkation (only if the raw Embarked column still exists)
if 'Embarked' in train_data.columns:
    embarked_survival = train_data.groupby('Embarked')['Survived'].mean().round(4) * 100
    plt.subplot(3, 2, 5)
    sns.barplot(x=embarked_survival.index, y=embarked_survival.values, palette='Oranges_d')
    plt.title('Survival rate by port of embarkation')
    plt.xlabel('Port of embarkation')
    plt.ylabel('Survival rate (%)')
    for i, v in enumerate(embarked_survival.values):
        plt.text(i, v + 1, f'{v:.1f}%', ha='center')
else:
    # Embarked was replaced by one-hot columns earlier, so this branch is taken and the subplot is left empty
    plt.subplot(3, 2, 5)
    plt.text(0.5, 0.5, 'Embarked column not present', ha='center', va='center', fontsize=12)
    plt.axis('off')

# 6. survival rate by fare group
plt.subplot(3, 2, 6)
sns.barplot(x=fare_survival.index, y=fare_survival.values, palette='Purples_d')
plt.title('Survival rate by fare group')
plt.xlabel('Fare group')
plt.ylabel('Survival rate (%)')
for i, v in enumerate(fare_survival.values):
    plt.text(i, v + 1, f'{v:.1f}%', ha='center')

plt.tight_layout()
plt.show()

# Part 3: correlation matrix
# encode the remaining categorical variables
df_encoded = train_data.copy()
# Sex was already mapped to 0/1 (1 = male) during preprocessing, so it needs no further encoding here

# encode Embarked only if the raw column still exists (it was one-hot encoded earlier)
if 'Embarked' in df_encoded.columns:
    df_encoded['Embarked'] = df_encoded['Embarked'].astype('category').cat.codes

df_encoded['AgeGroup'] = df_encoded['AgeGroup'].astype('category').cat.codes
df_encoded['FareGroup'] = df_encoded['FareGroup'].astype('category').cat.codes

# build the correlation matrix, skipping columns that no longer exist
columns_to_use = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
                 'Fare', 'FamilySize', 'AgeGroup', 'FareGroup']
if 'Embarked' in df_encoded.columns:
    columns_to_use.append('Embarked')
if 'Cabin' in df_encoded.columns:  # Cabin was dropped at the start, so this check also fails
    columns_to_use.append('Cabin')

correlation = df_encoded[columns_to_use].corr()

# heatmap
plt.figure(figsize=(24, 20))
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm',
            square=True, linewidths=.5, cbar_kws={"shrink": .8})
plt.title('Correlation between survival and the other features')
plt.tight_layout()
plt.show()

 @浙大疏锦行
