第三十一天打卡

@浙大疏锦行

今日的示例代码包含2个部分

 

notebook文件夹内的ipynb文件,介绍下今天的思路

项目文件夹中其他部分:拆分后的信贷项目,学习下如何拆分的,未来你看到的很多大项目都是类似的拆分方法

知识点回顾

 

规范的文件命名

规范的文件夹管理

机器学习项目的拆分

编码格式和类型注解

作业:尝试针对之前的心脏病项目ipynb,将它按照今天的示例项目整理成规范的形式,思考下哪些部分可以未来复用。

import os

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, recall_score

import shap

import matplotlib.pyplot as plt

import seaborn as sns

 

# Step 1: create the project directory skeleton.
# Layout: data/{raw,processed}, src/{data,models,visualization}, notebooks.
project_root = '/mnt/heart_disease_prediction'
data_dir = os.path.join(project_root, 'data')
raw_data_dir = os.path.join(data_dir, 'raw')
processed_data_dir = os.path.join(data_dir, 'processed')
src_dir = os.path.join(project_root, 'src')
data_code_dir = os.path.join(src_dir, 'data')
models_code_dir = os.path.join(src_dir, 'models')
visualization_code_dir = os.path.join(src_dir, 'visualization')
notebooks_dir = os.path.join(project_root, 'notebooks')

dirs = [
    project_root, data_dir, raw_data_dir, processed_data_dir,
    src_dir, data_code_dir, models_code_dir, visualization_code_dir,
    notebooks_dir
]
for d in dirs:
    # exist_ok=True replaces the original exists()-then-makedirs() pair:
    # it is idempotent and avoids the check-then-create race condition.
    # (os.makedirs also creates any missing intermediate directories.)
    os.makedirs(d, exist_ok=True)

 

# Step 2: stage the raw dataset under data/raw/.
# NOTE(review): the original comment said "move" (移动) but the code copies;
# copy semantics are preserved here — confirm which was intended.
import shutil

source_csv = '/mnt/heart.csv'
shutil.copy(source_csv, raw_data_dir)

 

# Step 3: extract the data-preprocessing code into src/data/preprocessing.py.
# The template below is written to disk verbatim; its functions are placeholders
# ('categorical_column_1' etc.) meant to be adapted to the real dataset.
preprocessing_code = """
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def handle_missing_values(data):
    # 处理缺失值的代码逻辑
    return data.ffill() # 示例,具体根据实际情况调整

def encode_features(data):
    # 标签编码
    label_encoder = LabelEncoder()
    # 假设某些列需要标签编码,这里以示例说明
    data['categorical_column_1'] = label_encoder.fit_transform(data['categorical_column_1'])

    # 独热编码
    onehot_encoder = OneHotEncoder()
    # 假设某些列需要独热编码,这里以示例说明
    encoded = onehot_encoder.fit_transform(data[['categorical_column_2']]).toarray()
    encoded_df = pd.DataFrame(encoded, columns=onehot_encoder.get_feature_names_out(['categorical_column_2']))
    data = pd.concat([data.reset_index(drop=True), encoded_df], axis=1)
    data = data.drop(['categorical_column_2'], axis=1)

    return data

def split_dataset(data, target_column):
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
"""
preprocessing_file_path = os.path.join(data_code_dir, 'preprocessing.py')
# The generated module contains non-ASCII (Chinese) comments, so force UTF-8
# instead of the platform default encoding, which raises UnicodeEncodeError
# on e.g. Windows cp1252. Also: fillna(method='ffill') is deprecated in
# pandas 2.x, replaced above with the equivalent DataFrame.ffill().
with open(preprocessing_file_path, 'w', encoding='utf-8') as file:
    file.write(preprocessing_code)

 

# Step 4: extract the feature-engineering code into src/data/feature_engineering.py.
feature_engineering_code = """
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def process_continuous_features(data):
    # 连续特征处理逻辑,如标准化等
    for col in data.select_dtypes(include=['float64', 'int64']).columns:
        data[col] = (data[col] - data[col].mean()) / data[col].std()
    return data

def analyze_feature_importance(X_train, y_train):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
    return feature_importance
"""
feature_engineering_file_path = os.path.join(data_code_dir, 'feature_engineering.py')
# Generated file contains Chinese comments; write as UTF-8 explicitly so the
# script does not crash on platforms whose default encoding cannot encode them.
with open(feature_engineering_file_path, 'w', encoding='utf-8') as file:
    file.write(feature_engineering_code)

 

# Step 5: extract the model-training code into src/models/train.py.
train_code = """
import shap
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, y_train):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    return model

def explain_model(model, X_train):
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    return shap_values
"""
train_file_path = os.path.join(models_code_dir, 'train.py')
# Explicit UTF-8 for consistency with the other generated source files.
with open(train_file_path, 'w', encoding='utf-8') as file:
    file.write(train_code)

 

# Step 6: extract the model-evaluation code into src/models/evaluate.py.
evaluate_code = """
from sklearn.metrics import accuracy_score, recall_score

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    return accuracy, recall
"""
evaluate_file_path = os.path.join(models_code_dir, 'evaluate.py')
# Explicit UTF-8 for consistency with the other generated source files.
with open(evaluate_file_path, 'w', encoding='utf-8') as file:
    file.write(evaluate_code)

 

# Step 7: extract the visualization code into src/visualization/plots.py.
# (seaborn is imported in the template but unused there; kept as-is to avoid
# changing the generated file's content beyond formatting.)
plots_code = """
import matplotlib.pyplot as plt
import seaborn as sns
import shap

def plot_feature_importance(feature_importance):
    plt.figure(figsize=(10, 6))
    feature_importance.nlargest(10).plot(kind='barh')
    plt.title('Top 10 Feature Importance')
    plt.show()

def plot_shap_values(shap_values, X_train):
    shap.summary_plot(shap_values, X_train)
"""
plots_file_path = os.path.join(visualization_code_dir, 'plots.py')
# Explicit UTF-8 so the write cannot fail under a non-UTF-8 default locale.
with open(plots_file_path, 'w', encoding='utf-8') as file:
    file.write(plots_code)

 

# Step 8: assemble the example driver code and save it to
# notebooks/model_development.ipynb.
# Fixes vs. the original: (1) the triple-quoted string was never terminated,
# which made the whole script a syntax error; (2) the content was never
# actually written to disk despite the step's stated purpose; (3) the embedded
# code used `pd` without importing pandas; (4) raw markdown is not a valid
# .ipynb, so the code is wrapped in a minimal nbformat-4 JSON document that
# Jupyter can open.
import json

notebook_content = """from src.data.preprocessing import handle_missing_values, encode_features, split_dataset
from src.data.feature_engineering import process_continuous_features, analyze_feature_importance
from src.models.train import train_random_forest, explain_model
from src.models.evaluate import evaluate_model
from src.visualization.plots import plot_feature_importance, plot_shap_values
import pandas as pd

# 加载数据
data = pd.read_csv('../data/raw/heart.csv')

# 数据预处理
data = handle_missing_values(data)
data = encode_features(data)

# 特征工程
data = process_continuous_features(data)
X_train, X_test, y_train, y_test = split_dataset(data, 'target_column') # 替换 target_column 为实际列名
feature_importance = analyze_feature_importance(X_train, y_train)

# 模型训练
model = train_random_forest(X_train, y_train)
shap_values = explain_model(model, X_train)

# 模型评估
accuracy, recall = evaluate_model(model, X_test, y_test)
print(f"Accuracy: {accuracy}, Recall: {recall}")

# 可视化
plot_feature_importance(feature_importance)
plot_shap_values(shap_values, X_train)
"""

# Minimal valid nbformat-4 notebook with a single code cell.
notebook_json = {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": notebook_content.splitlines(keepends=True),
        }
    ],
    "metadata": {},
    "nbformat": 4,
    "nbformat_minor": 5,
}
notebook_file_path = os.path.join(notebooks_dir, 'model_development.ipynb')
with open(notebook_file_path, 'w', encoding='utf-8') as file:
    json.dump(notebook_json, file, ensure_ascii=False, indent=1)

# 你可能感兴趣的:(python打卡,python)