Assignment:
Sort out the order of all the steps and see whether a general-purpose pipeline that suits most machine-learning tasks can be built.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
# 1. Data loading and basic preprocessing
def load_data(path):
    """Load the data and split it into features X and target y."""
    df = pd.read_csv(path)
    X = df.drop('target_column', axis=1)  # replace 'target_column' with the actual target name
    y = df['target_column']
    return X, y
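# Example usage (the file and column names are placeholders carried over from
# the original write-up, not real data):
#   X, y = load_data("your_data.csv")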
# 2. Define the preprocessing pipeline
def create_preprocessing_pipeline(X):
    """Build the preprocessing pipeline, handling numerical and categorical features separately."""
    # Automatically detect numerical and categorical features
    # (np.number covers all numeric dtypes, not just int64/float64, so e.g.
    # int32 columns are not silently dropped by the ColumnTransformer)
    numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    # Numerical branch
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # impute missing values with the median
        ('scaler', StandardScaler())                    # standardize features
    ])
    # Categorical branch
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # impute missing values with the mode
        ('onehot', OneHotEncoder(handle_unknown='ignore'))     # one-hot encode; ignore unseen categories
    ])
    # Combine the two branches
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor, numerical_features, categorical_features
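# A small helper to sanity-check what the ColumnTransformer produces -- our own
# sketch, not part of the original assignment; get_feature_names_out requires
# scikit-learn >= 1.0:
def inspect_preprocessor(preprocessor, X):
    """Fit the preprocessor alone and print the shape and names of its output."""
    preprocessor.fit(X)
    print("Transformed shape:", preprocessor.transform(X).shape)
    print("Output feature names:", preprocessor.get_feature_names_out())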
# 3. Build the full machine-learning Pipeline
def create_full_pipeline(preprocessor):
    """Build the complete Pipeline: preprocessing, feature selection, and the model."""
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),                                   # data preprocessing
        ('feature_selection', SelectKBest(score_func=f_classif, k=10)),  # feature selection
        ('classifier', RandomForestClassifier(random_state=42))          # model (swappable; see the sketch below)
    ])
    return pipeline
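# The classifier step is deliberately swappable, as noted above. For example,
# the same Pipeline with logistic regression instead of a random forest -- a
# sketch; its hyperparameters would need their own param_grid entries:
def create_logreg_pipeline(preprocessor):
    from sklearn.linear_model import LogisticRegression
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
        ('classifier', LogisticRegression(max_iter=1000))
    ])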
# 4. Hyperparameter tuning
def perform_hyperparameter_tuning(pipeline, X_train, y_train):
    """Run a grid search over the whole Pipeline (a randomized variant is sketched below)."""
    # Parameter space to search (step-name prefixes address steps inside the Pipeline)
    param_grid = {
        'feature_selection__k': [5, 10, 15],         # number of features to keep
        'classifier__n_estimators': [50, 100, 200],  # number of trees in the forest
        'classifier__max_depth': [None, 10, 20]      # maximum tree depth
    }
    # Grid search with cross-validation
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,               # 5-fold cross-validation
        n_jobs=-1,          # use all CPU cores
        scoring='accuracy'  # evaluation metric (swappable for other metrics)
    )
    # Run the search
    grid_search.fit(X_train, y_train)
    print("Best parameter combination:", grid_search.best_params_)
    print("Best cross-validation score:", grid_search.best_score_)
    return grid_search.best_estimator_
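# When the grid grows large, RandomizedSearchCV is a common drop-in alternative
# that samples a fixed number of candidates instead of trying them all. A
# sketch using the same step names as above (n_iter and the distributions are
# illustrative choices of our own):
def perform_randomized_tuning(pipeline, X_train, y_train, n_iter=10):
    from sklearn.model_selection import RandomizedSearchCV
    param_distributions = {
        'feature_selection__k': [5, 10, 15],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20]
    }
    search = RandomizedSearchCV(
        pipeline, param_distributions, n_iter=n_iter,
        cv=5, n_jobs=-1, scoring='accuracy', random_state=42
    )
    search.fit(X_train, y_train)
    return search.best_estimator_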
# 5. Model evaluation
def evaluate_model(best_model, X_test, y_test):
    """Evaluate the tuned model on the held-out test set."""
    # Predict on the test set
    y_pred = best_model.predict(X_test)
    # Print the evaluation report
    print("Model evaluation report:")
    print(classification_report(y_test, y_pred))
    # Other metrics such as the ROC curve or a confusion matrix can be added; see the sketch below
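# A sketch of the extra metrics mentioned above (our own addition): the
# confusion matrix always applies, while ROC AUC via predict_proba assumes a
# binary target.
def extended_evaluation(best_model, X_test, y_test):
    from sklearn.metrics import confusion_matrix, roc_auc_score
    y_pred = best_model.predict(X_test)
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    if hasattr(best_model, "predict_proba") and np.unique(y_test).size == 2:
        # column 1 holds the positive-class probability in the binary case
        y_proba = best_model.predict_proba(X_test)[:, 1]
        print("ROC AUC:", roc_auc_score(y_test, y_proba))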
# Main function: run the full Pipeline workflow
def main():
    # 1. Load the data
    X, y = load_data("your_data.csv")  # replace with the actual data path
    # 2. Split into training and test sets
    # (stratify=y keeps the class proportions identical in both splits)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # 3. Create the preprocessing pipeline
    preprocessor, numerical_features, categorical_features = create_preprocessing_pipeline(X_train)
    # 4. Build the full Pipeline
    pipeline = create_full_pipeline(preprocessor)
    # 5. Hyperparameter tuning
    best_model = perform_hyperparameter_tuning(pipeline, X_train, y_train)
    # 6. Model evaluation
    evaluate_model(best_model, X_test, y_test)
    # 7. Model persistence: save the best model to a file for later use
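    # A minimal joblib sketch (the file name is an assumed placeholder): the
    # saved object bundles preprocessing, feature selection and the classifier,
    # so the reloaded Pipeline can predict on raw feature frames directly.
    import joblib
    joblib.dump(best_model, "best_model.joblib")
    # later: model = joblib.load("best_model.joblib"); model.predict(new_X)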
if __name__ == "__main__":
    main()
@浙大疏锦行