@浙大疏锦行
Today's example code has two parts:
The ipynb file inside the notebook folder, which walks through today's ideas.
Everything else in the project folder: the credit project after splitting. Study how the split was done; many large projects you encounter later follow a similar decomposition.
Knowledge review:
Standardized file naming
Standardized folder management
How to split up a machine learning project
Encoding formats and type annotations (a short sketch follows the assignment below)
Assignment: take the earlier heart-disease project ipynb and reorganize it into the standardized form of today's example project, and think about which parts could be reused in the future.
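On the last knowledge point above, here is a minimal sketch of what "encoding formats and type annotations" mean in practice (the functions are hypothetical, not part of the example project): declare file encodings explicitly instead of relying on platform defaults, and annotate signatures so the expected types are documented and machine-checkable.
# -*- coding: utf-8 -*-
import pandas as pd


def load_dataset(path: str) -> pd.DataFrame:
    # Read a CSV with an explicit encoding instead of the platform default
    return pd.read_csv(path, encoding='utf-8')


def train_fraction(n_train: int, n_total: int) -> float:
    # Annotations document the contract: two ints in, one float out
    return n_train / n_total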
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
import shap
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: create the project structure
project_root = '/mnt/heart_disease_prediction'
data_dir = os.path.join(project_root, 'data')
raw_data_dir = os.path.join(data_dir, 'raw')
processed_data_dir = os.path.join(data_dir, 'processed')
src_dir = os.path.join(project_root, 'src')
data_code_dir = os.path.join(src_dir, 'data')
models_code_dir = os.path.join(src_dir, 'models')
visualization_code_dir = os.path.join(src_dir, 'visualization')
notebooks_dir = os.path.join(project_root, 'notebooks')
dirs = [
project_root, data_dir, raw_data_dir, processed_data_dir,
src_dir, data_code_dir, models_code_dir, visualization_code_dir,
notebooks_dir
]
for d in dirs:
    os.makedirs(d, exist_ok=True)  # exist_ok=True makes re-runs a no-op
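# A common convention, and the layout the loop above produces:
#
# heart_disease_prediction/
# ├── data/
# │   ├── raw/            # immutable input files
# │   └── processed/      # derived, reproducible datasets
# ├── src/
# │   ├── data/           # preprocessing and feature engineering
# │   ├── models/         # training and evaluation
# │   └── visualization/  # plotting helpers
# └── notebooks/          # exploratory and orchestration notebooks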
# Step 2: organize the data
# Assume heart.csv sits under /mnt; copy it into data/raw/
import shutil

shutil.copy('/mnt/heart.csv', raw_data_dir)
# Step 3: extract data-preprocessing code into src/data/preprocessing.py
preprocessing_code = """
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


def handle_missing_values(data):
    # Forward-fill missing values (an example strategy; adjust to the actual data)
    return data.ffill()


def encode_features(data):
    # Label-encode an assumed example column
    label_encoder = LabelEncoder()
    data['categorical_column_1'] = label_encoder.fit_transform(data['categorical_column_1'])
    # One-hot encode an assumed example column
    onehot_encoder = OneHotEncoder()
    encoded = onehot_encoder.fit_transform(data[['categorical_column_2']]).toarray()
    encoded_df = pd.DataFrame(encoded, columns=onehot_encoder.get_feature_names_out(['categorical_column_2']))
    data = pd.concat([data.reset_index(drop=True), encoded_df], axis=1)
    data = data.drop(['categorical_column_2'], axis=1)
    return data


def split_dataset(data, target_column):
    # Hold out 20% of the rows for testing, with a fixed seed for reproducibility
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
"""
preprocessing_file_path = os.path.join(data_code_dir, 'preprocessing.py')
with open(preprocessing_file_path, 'w', encoding='utf-8') as file:
    file.write(preprocessing_code)
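# Since preprocessing_code is plain Python source, we can exec it for a quick
# smoke test before the package imports are wired up (a sketch only, on a tiny
# synthetic frame whose columns mirror the placeholders in preprocessing.py):
_ns = {}
exec(preprocessing_code, _ns)
_demo = pd.DataFrame({
    'categorical_column_1': ['a', 'b', 'a'],
    'categorical_column_2': ['x', 'y', 'x'],
    'age': [40.0, None, 61.0],
})
_demo = _ns['handle_missing_values'](_demo)
_demo = _ns['encode_features'](_demo)
print(_demo.head())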
# Step 4: extract feature-engineering code into src/data/feature_engineering.py
feature_engineering_code = """
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


def process_continuous_features(data):
    # Standardize continuous features with a z-score; note that in practice the
    # target column should be excluded and the statistics fit on the training
    # split only (see the sketch after this module)
    for col in data.select_dtypes(include=['float64', 'int64']).columns:
        data[col] = (data[col] - data[col].mean()) / data[col].std()
    return data


def analyze_feature_importance(X_train, y_train):
    # Fit a random forest and return its per-feature importances as a Series
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
    return feature_importance
"""
feature_engineering_file_path = os.path.join(data_code_dir, 'feature_engineering.py')
with open(feature_engineering_file_path, 'w', encoding='utf-8') as file:
    file.write(feature_engineering_code)
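# The z-score loop above computes statistics over the whole frame; the safer
# pattern fits the scaler on the training split only and reuses it on the test
# split, avoiding leakage. A minimal sketch with sklearn's StandardScaler
# (function and argument names are illustrative):
from sklearn.preprocessing import StandardScaler


def scale_train_test(X_train, X_test, continuous_cols):
    # Fit scaling statistics on the training split only, then apply to both
    scaler = StandardScaler()
    X_train, X_test = X_train.copy(), X_test.copy()
    X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols])
    X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])
    return X_train, X_test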
# Step 5: extract model-training code into src/models/train.py
train_code = """
import shap
from sklearn.ensemble import RandomForestClassifier


def train_random_forest(X_train, y_train):
    # Fit a random forest classifier with default hyperparameters
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    return model


def explain_model(model, X_train):
    # Compute SHAP values for the training set with a tree explainer
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    return shap_values
"""
train_file_path = os.path.join(models_code_dir, 'train.py')
with open(train_file_path, 'w', encoding='utf-8') as file:
    file.write(train_code)
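# One caveat worth knowing (version-dependent, so treat this as an assumption
# about your shap release): for classifiers, TreeExplainer.shap_values may
# return one array per class. A small helper to keep the positive class:
def positive_class_shap(shap_values):
    # Older shap releases return a per-class list; keep the values for class 1
    if isinstance(shap_values, list):
        return shap_values[1]
    return shap_values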
# Step 6: extract model-evaluation code into src/models/evaluate.py
evaluate_code = """
from sklearn.metrics import accuracy_score, recall_score


def evaluate_model(model, X_test, y_test):
    # Accuracy plus recall; recall_score defaults to binary targets,
    # which fits the 0/1 heart-disease label
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    return accuracy, recall
"""
evaluate_file_path = os.path.join(models_code_dir, 'evaluate.py')
with open(evaluate_file_path, 'w', encoding='utf-8') as file:
    file.write(evaluate_code)
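# If richer diagnostics are wanted later, sklearn's classification_report
# bundles precision, recall, and F1 per class; an optional sketch, not part
# of the module written above:
from sklearn.metrics import classification_report


def full_report(model, X_test, y_test) -> str:
    # Text summary of per-class precision/recall/F1 and support counts
    return classification_report(y_test, model.predict(X_test))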
# Step 7: extract visualization code into src/visualization/plots.py
plots_code = """
import matplotlib.pyplot as plt
import seaborn as sns
import shap


def plot_feature_importance(feature_importance):
    # Horizontal bar chart of the ten most important features
    plt.figure(figsize=(10, 6))
    feature_importance.nlargest(10).plot(kind='barh')
    plt.title('Top 10 Feature Importance')
    plt.show()


def plot_shap_values(shap_values, X_train):
    # SHAP summary plot of per-feature contribution distributions
    shap.summary_plot(shap_values, X_train)
"""
plots_file_path = os.path.join(visualization_code_dir, 'plots.py')
with open(plots_file_path, 'w', encoding='utf-8') as file:
    file.write(plots_code)
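# For the notebook's `from src.data.preprocessing import ...` lines to resolve,
# the src packages need __init__.py markers and the project root must be on
# sys.path (handled inside the notebook below). A minimal sketch; empty files
# are enough as package markers:
for pkg_dir in [src_dir, data_code_dir, models_code_dir, visualization_code_dir]:
    with open(os.path.join(pkg_dir, '__init__.py'), 'w', encoding='utf-8') as f:
        f.write('')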
# Step 8: write the orchestration code into notebooks/model_development.ipynb
notebook_content = """
```python
import sys
sys.path.append('..')  # make the project root importable from notebooks/

import pandas as pd

from src.data.preprocessing import handle_missing_values, encode_features, split_dataset
from src.data.feature_engineering import process_continuous_features, analyze_feature_importance
from src.models.train import train_random_forest, explain_model
from src.models.evaluate import evaluate_model
from src.visualization.plots import plot_feature_importance, plot_shap_values

# Load the data
data = pd.read_csv('../data/raw/heart.csv')

# Preprocessing
data = handle_missing_values(data)
data = encode_features(data)

# Feature engineering
data = process_continuous_features(data)
X_train, X_test, y_train, y_test = split_dataset(data, 'target_column')  # replace 'target_column' with the actual label column
feature_importance = analyze_feature_importance(X_train, y_train)

# Model training
model = train_random_forest(X_train, y_train)
shap_values = explain_model(model, X_train)

# Model evaluation
accuracy, recall = evaluate_model(model, X_test, y_test)
print(f"Accuracy: {accuracy}, Recall: {recall}")

# Visualization
plot_feature_importance(feature_importance)
plot_shap_values(shap_values, X_train)
```
"""
notebook_file_path = os.path.join(notebooks_dir, 'model_development.ipynb')
with open(notebook_file_path, 'w', encoding='utf-8') as file:
    file.write(notebook_content)
# Note: a real .ipynb is JSON under the hood; this plain-text write is only a
# stand-in for the cell contents (nbformat or jupytext would build a genuine notebook).
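With the split in place, the reuse question from the assignment has a concrete answer: preprocessing.py, evaluate.py, and plots.py are written against generic DataFrames and models, so when you reorganize the heart-disease notebook, most of src/ can carry over unchanged; only the orchestration notebook and the dataset-specific column names need rewriting.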