Python打卡训练营day23——2025.05.12

作业:

梳理整个流程各步骤的先后顺序,尝试构建一个适用于各类机器学习任务的通用 pipeline。

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
 
# 1. 数据加载与预处理
def load_data(path, target_column='target_column'):
    """Load a CSV file and split it into features and target.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path, URL or
            file-like object).
        target_column: Name of the column holding the label. Defaults to
            ``'target_column'`` so existing single-argument calls keep
            working.

    Returns:
        A ``(X, y)`` tuple where ``X`` is the DataFrame without the target
        column and ``y`` is the target column as a Series.

    Raises:
        KeyError: If ``target_column`` is not a column of the loaded data.
    """
    df = pd.read_csv(path)

    # Fail early with a message that lists what is actually available,
    # instead of the less helpful error raised from inside ``drop``.
    if target_column not in df.columns:
        raise KeyError(
            f"target column {target_column!r} not found; "
            f"available columns: {list(df.columns)}"
        )

    y = df[target_column]
    X = df.drop(columns=[target_column])
    return X, y
 
# 2. 定义数据预处理流水线
def create_preprocessing_pipeline(X):
    """Build a ColumnTransformer for the numeric and categorical columns of X.

    Numeric columns are median-imputed and standardized; categorical columns
    are imputed with the most frequent value and one-hot encoded.

    Args:
        X: DataFrame whose column dtypes decide which transformer each
            column is routed to.

    Returns:
        A ``(preprocessor, numerical_features, categorical_features)`` tuple:
        the unfitted ``ColumnTransformer`` and the two lists of column names
        it was configured with.

    Note:
        Columns that are neither numeric nor object/category (for example
        bool or datetime columns) are not listed in either transformer, so
        they are left to the ColumnTransformer's ``remainder`` default.
    """
    # 'number' matches every numeric dtype (int32, float32, uint8, ...), not
    # only int64/float64, so narrower numeric columns are no longer skipped.
    # timedelta64 is a NumPy integer subtype, so it is excluded explicitly.
    numerical_features = X.select_dtypes(
        include='number', exclude='timedelta'
    ).columns.tolist()
    categorical_features = X.select_dtypes(
        include=['object', 'category']
    ).columns.tolist()

    # Numeric branch: fill gaps with the median, then standardize.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    # handle_unknown='ignore' keeps transform from failing on categories
    # that were not present when the encoder was fitted.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route each column group to its branch.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
        ])

    return preprocessor, numerical_features, categorical_features
 
# 3. 创建完整的机器学习Pipeline
def create_full_pipeline(preprocessor, k=10, classifier=None):
    """Assemble preprocessing, feature selection and a classifier into one Pipeline.

    Args:
        preprocessor: Transformer used as the first step (for example the
            ColumnTransformer built for the training data).
        k: Value passed to ``SelectKBest(k=...)``. Defaults to 10, the value
            that was previously hard-coded.
        classifier: Estimator used as the final step. When ``None`` a
            ``RandomForestClassifier(random_state=42)`` is created, matching
            the previous behavior.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'preprocessor'``,
        ``'feature_selection'`` and ``'classifier'``.
    """
    if classifier is None:
        classifier = RandomForestClassifier(random_state=42)

    # The step names are part of the contract: hyperparameter grids address
    # them via the ``<step>__<param>`` convention.
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_classif, k=k)),
        ('classifier', classifier),
    ])
 
# 4. 超参数优化
def perform_hyperparameter_tuning(pipeline, X_train, y_train,
                                  param_grid=None, cv=5, scoring='accuracy'):
    """Grid-search the pipeline's hyperparameters and return the best estimator.

    Args:
        pipeline: Estimator to tune. With the default grid it must expose the
            steps ``'feature_selection'`` and ``'classifier'``.
        X_train: Training features.
        y_train: Training labels.
        param_grid: Grid passed to ``GridSearchCV``. When ``None`` the
            original default grid is used (SelectKBest ``k`` plus random
            forest ``n_estimators`` and ``max_depth``).
        cv: Cross-validation setting passed to ``GridSearchCV``
            (default: 5 folds).
        scoring: Metric passed to ``GridSearchCV`` (default: ``'accuracy'``).

    Returns:
        ``best_estimator_`` of the fitted grid search.
    """
    if param_grid is None:
        # Default search space. NOTE: the ``k`` candidates assume the
        # preprocessed data has at least 15 features; pass a custom grid
        # for narrower datasets.
        param_grid = {
            'feature_selection__k': [5, 10, 15],
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
        }

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        n_jobs=-1,  # use every available CPU core
        scoring=scoring,
    )

    # Run the exhaustive search over the grid.
    grid_search.fit(X_train, y_train)

    print("最佳参数组合:", grid_search.best_params_)
    print("最佳交叉验证分数:", grid_search.best_score_)

    return grid_search.best_estimator_
 
# 5. 模型评估
def evaluate_model(best_model, X_test, y_test):
    """Print a classification report for ``best_model`` on held-out data.

    Args:
        best_model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True labels compared against the predictions.
    """
    predicted = best_model.predict(X_test)

    print("模型评估报告:")
    summary = classification_report(y_test, predicted)
    print(summary)

    # Further metrics (ROC curve, confusion matrix, ...) can be added here.
 
# 主函数:执行完整的Pipeline流程
def main(data_path="your_data.csv"):
    """Run the whole workflow: load, split, preprocess, tune and evaluate.

    Args:
        data_path: Location of the CSV file to load. Defaults to the
            placeholder ``"your_data.csv"`` that was previously hard-coded.

    Returns:
        The best estimator found by the hyperparameter search, so callers
        can persist or reuse it.
    """
    # 1. Load the data.
    X, y = load_data(data_path)

    # 2. Hold out 20% of the rows for the final evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 3. Build the preprocessing step from the training columns only; the
    #    returned feature-name lists are not needed here.
    preprocessor, _, _ = create_preprocessing_pipeline(X_train)

    # 4. Assemble the full pipeline.
    pipeline = create_full_pipeline(preprocessor)

    # 5. Tune hyperparameters on the training split.
    best_model = perform_hyperparameter_tuning(pipeline, X_train, y_train)

    # 6. Evaluate on the held-out split.
    evaluate_model(best_model, X_test, y_test)

    # 7. Model persistence: code to save the best model can be added here.
    return best_model
 
# Run the end-to-end workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

@浙大疏锦行

你可能感兴趣的:(python,开发语言)