Python训练营-Day9

知识点见示例代码
  • 字典的简单介绍
  • 标签编码;连续特征的处理:归一化和标准化
  • import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import pandas as pd
    # Load the raw table, then inspect its column summary and leading rows.
    df = pd.read_csv('data.csv')
    df.info()
    df.head()

    # Encoding tables: column name -> {raw category string: integer code}.
    mappings = {
        "Years in current job": {
            "< 1 year": 0,
            "1 year": 1,
            "2 years": 2,
            "3 years": 3,
            "4 years": 4,
            "5 years": 5,
            "6 years": 6,
            "7 years": 7,
            "8 years": 8,
            "9 years": 9,
            "10+ years": 10,
        },
        "Home Ownership": {
            "Home Mortgage": 0,
            "Rent": 1,
            "Own Home": 2,
            "Have Mortgage": 3,
        },
    }

    # Replace each listed column by its integer codes. Series.map turns any
    # value that has no entry in the table into NaN.
    for column_name, code_table in mappings.items():
        df[column_name] = df[column_name].map(code_table)
    df.head()
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    
    # Columns included in the pairwise correlation analysis.
    continuous_features = [
        'Annual Income',
        'Years in current job',
        'Tax Liens',
        'Number of Open Accounts',
        'Years of Credit History',
        'Maximum Open Credit',
        'Number of Credit Problems',
        'Months since last delinquent',
        'Bankruptcies',
        'Current Loan Amount',
        'Current Credit Balance',
        'Monthly Debt',
        'Credit Score',
    ]

    # Pairwise correlation of the selected columns (DataFrame.corr defaults).
    correlation_matrix = df[continuous_features].corr()

    # Render the matrix as an annotated heatmap on a fresh 12x10 figure.
    plt.figure(figsize=(12, 10))
    heatmap_ax = sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        square=True,
    )
    heatmap_ax.set_title('Correlation Matrix of Continuous Features')
    plt.show()
    # Hand-written version: draw one boxplot per feature on a 2x2 grid,
    # repeating the same three calls for every grid cell.
    features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    # Feature 0 -> top-left cell.
    i=0
    feature=features[i]
    # Missing values are dropped before plotting.
    axes[0,0].boxplot(df[feature].dropna())
    axes[0,0].set_title(f'boxplot of {feature}')
    axes[0,0].set_ylabel(feature)
    # Feature 1 -> top-right cell.
    i=1
    feature=features[i]
    axes[0,1].boxplot(df[feature].dropna())
    axes[0,1].set_title(f'boxplot of {feature}')
    axes[0,1].set_ylabel(feature)
    # Feature 2 -> bottom-left cell.
    i=2
    feature=features[i]
    axes[1,0].boxplot(df[feature].dropna())
    axes[1,0].set_title(f'boxplot of {feature}')
    axes[1,0].set_ylabel(feature)
    # Feature 3 -> bottom-right cell.
    i=3
    feature=features[i]
    axes[1,1].boxplot(df[feature].dropna())
    axes[1,1].set_title(f'boxplot of {feature}')
    axes[1,1].set_ylabel(feature)
    # Adjust subplot spacing, then display the figure.
    plt.tight_layout()
    plt.show()
    # Index-loop version: one boxplot per feature on a 2x2 grid.
    features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    # The explicit index is kept here to show how a flat position maps onto
    # the grid: row = i // 2, column = i % 2.
    for i in range(len(features)):
        feature = features[i]
        ax = axes[i // 2, i % 2]
        # Missing values are dropped before plotting.
        ax.boxplot(df[feature].dropna())
        ax.set_title(f'boxplot of {feature}')
        ax.set_ylabel(feature)
    # Adjust subplot spacing so titles and labels do not overlap, then display.
    plt.tight_layout()
    plt.show()
    features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']

    # enumerate yields (index, item) pairs, so no manual indexing is needed.
    for i, feature in enumerate(features):
        print(f'索引: {i}, 特征: {feature}')

    # Same 2x2 boxplot grid, driven by enumerate.
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    for i, feature in enumerate(features):
        # Flat position i maps to row i // 2, column i % 2.
        ax = axes[i // 2, i % 2]
        # Missing values are dropped before plotting.
        ax.boxplot(df[feature].dropna())
        ax.set_title(f'boxplot of {feature}')
        ax.set_ylabel(feature)
    # Adjust subplot spacing so titles and labels do not overlap, then display.
    plt.tight_layout()
    plt.show()

作业:运用上述知识处理心脏病数据集的特征,一次性用所有处理方式完成预处理,并尝试手动实现。

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Exploratory pass over the heart-disease table: missing-value counts,
# per-column boxplots, and a correlation heatmap.
df = pd.read_csv('heart.csv')

# Check for missing values. Printed explicitly so the counts are visible
# when this runs as a script, not only as the last expression of a notebook cell.
print(df.isnull().sum())

features = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target']

# One boxplot per column on a 4x4 grid, filled row by row.
fig, axes = plt.subplots(4, 4, figsize=(20, 20))
flat_axes = axes.ravel()
for ax, feature in zip(flat_axes, features):
    # Drop missing values so a NaN cannot blank out the box.
    ax.boxplot(df[feature].dropna())
    ax.set_title(f'boxplot of {feature}')
    ax.set_ylabel(feature)
# The grid has 16 cells but only 14 columns are plotted; hide the leftovers.
for unused_ax in flat_axes[len(features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

# Pairwise correlation of the same columns, drawn as an annotated heatmap.
correlation_matrix = df[features].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Correlation Matrix Of Heart Disease Dataset')
plt.show()

你可能感兴趣的:(python,开发语言)