Python打卡:day23

作业:

整理全部预处理与建模逻辑的先后顺序,尝试构建一个适用于各类机器学习任务的通用 pipeline。


def create_general_pipeline(model, ordinal_features=None, ordinal_categories=None, 
                            nominal_features=None, continuous_features=None):
    """Build a scikit-learn Pipeline that preprocesses features and fits ``model``.

    Each feature group gets its own sub-pipeline, and only the groups that
    are actually supplied are registered in the ``ColumnTransformer``:

    * ordinal: most-frequent imputation, then ``OrdinalEncoder`` (categories
      unseen at fit time are encoded as ``-1``).
    * nominal: most-frequent imputation, then dense one-hot encoding
      (unseen categories are ignored).
    * continuous: most-frequent imputation, then standardization.

    Columns of the input that belong to none of the supplied groups are
    dropped (the ``ColumnTransformer`` default ``remainder='drop'``).
    If no group is supplied at all, the data is passed to ``model`` unchanged.

    Args:
        model: The final estimator placed after preprocessing.
        ordinal_features: Column selection for ordered categorical features,
            or ``None``/empty to skip this group.
        ordinal_categories: Category orderings, one list per ordinal feature,
            in the same order as ``ordinal_features``. When omitted, the
            encoder infers categories from the training data
            (``categories='auto'``), i.e. in sorted order, which may not be
            the semantically meaningful order.
        nominal_features: Column selection for unordered categorical
            features, or ``None``/empty to skip this group.
        continuous_features: Column selection for numeric features, or
            ``None``/empty to skip this group.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline`` with the steps
        ``'preprocessor'`` and ``'model'``.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
    from sklearn.impute import SimpleImputer

    def _is_given(value):
        # Explicit None/length check: plain truthiness raises for NumPy
        # arrays and pandas Index objects with more than one element.
        if value is None:
            return False
        try:
            return len(value) > 0
        except TypeError:
            # Slices, callables and scalars have no length; treat as given.
            return True

    # Only supplied groups are registered. A ColumnTransformer entry whose
    # column selection is None makes fit() fail with "No valid specification
    # of the columns".
    transformers = []

    if _is_given(ordinal_features):
        categories = ordinal_categories if _is_given(ordinal_categories) else 'auto'
        ordinal_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(categories=categories,
                                       handle_unknown='use_encoded_value',
                                       unknown_value=-1)),
        ])
        transformers.append(('ordinal', ordinal_transformer, ordinal_features))

    if _is_given(nominal_features):
        nominal_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ])
        transformers.append(('nominal', nominal_transformer, nominal_features))

    if _is_given(continuous_features):
        continuous_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('scaler', StandardScaler()),
        ])
        transformers.append(('continuous', continuous_transformer, continuous_features))

    if transformers:
        preprocessor = ColumnTransformer(transformers=transformers)
    else:
        # Nothing to preprocess: keep the step name but pass data through.
        preprocessor = 'passthrough'

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])
 
# Usage example
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Assumes the features, labels and feature lists already exist:
# X, y, ordinal_features, ordinal_categories, nominal_features, continuous_features

model = RandomForestClassifier(random_state=42)
pipeline = create_general_pipeline(
    model,
    ordinal_features=ordinal_features,
    ordinal_categories=ordinal_categories,
    nominal_features=nominal_features,
    continuous_features=continuous_features,
)

# Hold out 20% of the data as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit preprocessing and model together on the training split
pipeline.fit(X_train, y_train)

# Predict on the held-out split
y_pred = pipeline.predict(X_test)

# Report per-class precision / recall / F1
print(classification_report(y_test, y_pred))

@浙大疏锦行

你可能感兴趣的:(python打卡,python,开发语言)