sklearn pipeline 实现多个模型统一调参

实现多模型统一调参

解决问题:在复现GBDT+LR的经典结构的时候,发现需要对两个模型一起进行调参,网上找不到相关代码,研究之后实现LGB + LR的统一调参

需写2个自定义管道流的类来完成
LgbmPipeline类中还包含一步转换:将LGB预测得到的叶子节点索引转换为one-hot矩阵,作为下一步LR的输入特征

import numpy as np
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression

class LgbmPipeline(BaseEstimator, TransformerMixin):
    """Pipeline step that fits a LightGBM classifier and emits one-hot leaf features.

    ``fit`` trains an ``LGBMClassifier``; ``transform`` asks it for the leaf
    index each sample lands in for every tree (``pred_leaf=True``) and encodes
    those indices as a one-hot matrix meant to be fed to a downstream linear
    model (the GBDT + LR structure).

    Args:
        num_leaves: Passed to ``LGBMClassifier`` and also used as the width of
            each tree's one-hot slot in the transformed matrix.
        n_estimators: Passed to ``LGBMClassifier``.
        other_params: Mapping of additional keyword arguments forwarded to
            ``LGBMClassifier``. ``None`` is treated as an empty mapping.
    """

    def __init__(self, num_leaves, n_estimators, other_params):
        # Stored unmodified so that BaseEstimator.get_params / set_params
        # (and therefore GridSearchCV) can read and override them.
        self.num_leaves = num_leaves
        self.n_estimators = n_estimators
        self.other_params = other_params

    def fit(self, X, y=None):
        """Fit the underlying LightGBM classifier on ``X`` / ``y`` and return self."""
        extra_params = self.other_params or {}
        self.lgb = LGBMClassifier(
            num_leaves=self.num_leaves,
            n_estimators=self.n_estimators,
            **extra_params
        )
        self.lgb.fit(X, y)
        return self

    def transform(self, X):
        """Return the one-hot encoded leaf indices of ``X``.

        Returns:
            An ``int8`` array with one row per sample and
            ``n_columns * num_leaves`` columns, where ``n_columns`` is the
            second dimension of the ``pred_leaf`` output. Column
            ``j * num_leaves + leaf`` is 1 when the sample falls in ``leaf``
            for column ``j``.

        Raises:
            Any exception raised by the underlying model or by the encoding
            is propagated; it is no longer printed and replaced by a ``None``
            return value, which hid the real error from the pipeline.
        """
        leaf_index = np.asarray(self.lgb.predict(X, pred_leaf=True))
        n_rows, n_trees = leaf_index.shape
        # Kept as instance attributes because the original implementation
        # exposed them after every transform call.
        self.lgb_feature_rows_len = n_rows
        self.lgb_feature_columns_len = n_trees

        one_hot = np.zeros((n_rows, n_trees * self.num_leaves), dtype=np.int8)
        # Each tree owns a contiguous slot of `num_leaves` columns; the offset
        # of the slot plus the leaf index gives the hot column for that tree.
        hot_columns = np.arange(n_trees) * self.num_leaves + leaf_index
        one_hot[np.arange(n_rows)[:, None], hot_columns] = 1
        return one_hot

class LrPipeline(BaseEstimator, TransformerMixin):
    """Pipeline step wrapping ``LogisticRegression`` with tunable hyper-parameters.

    Intended as the final step of a pipeline: besides ``fit`` it exposes
    ``predict`` and ``predict_proba`` so that the pipeline (and label-based
    scorers such as recall) can obtain predictions from it.

    Args:
        class_weight: Forwarded to ``LogisticRegression(class_weight=...)``.
        penalty: Forwarded to ``LogisticRegression(penalty=...)``.
    """

    def __init__(self, class_weight, penalty):
        self.class_weight = class_weight
        self.penalty = penalty

    def fit(self, X, y=None):
        """Fit the logistic regression on ``X`` / ``y`` and return self."""
        lr_kwargs = {"class_weight": self.class_weight, "penalty": self.penalty}
        if self.penalty == 'l1':
            # On scikit-learn releases whose default solver is 'lbfgs', an L1
            # penalty is rejected; 'liblinear' supports it. The solver is left
            # at the library default for every other penalty.
            lr_kwargs["solver"] = "liblinear"
        self.lr = LogisticRegression(**lr_kwargs)
        self.lr.fit(X, y)
        # Expose the fitted class labels on the wrapper itself.
        self.classes_ = self.lr.classes_
        return self

    def transform(self, X):
        """Return the class probabilities for ``X`` (same as ``predict_proba``)."""
        return self.predict_proba(X)

    def predict_proba(self, X):
        """Return the fitted model's class probabilities for ``X``."""
        return self.lr.predict_proba(X)

    def predict(self, X):
        """Return the fitted model's predicted class labels for ``X``."""
        return self.lr.predict(X)

管道流实现fit与调用, X_train_tr为ndarray格式的数据

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Extra keyword arguments forwarded to LGBMClassifier through
# LgbmPipeline(other_params=...). `num_leaves` and `n_estimators` are left out
# on purpose: they are passed to LgbmPipeline explicitly so they can be tuned.
# NOTE(review): LightGBM documents `min_child_weight` as an alias of
# `min_sum_hessian_in_leaf`; both are set here with different values —
# confirm which one is intended.
lgb_params = dict(
    task='train',
    boosting_type='gbdt',  # gradient boosted decision trees as the base algorithm
    objective='binary',
    metric='auc',  # evaluation metric
    max_bin=255,  # larger values: more accurate but slower
    learning_rate=0.1,  # learning rate
    max_depth=-1,  # limiting depth helps on small datasets; < 0 means no limit
    feature_fraction=0.8,  # guards against overfitting
    bagging_freq=5,  # guards against overfitting
    bagging_fraction=0.8,  # guards against overfitting
    min_data_in_leaf=21,  # guards against overfitting
    min_sum_hessian_in_leaf=3.0,  # guards against overfitting
    min_child_weight=0.1,
    lambda_l1=0.2,
    lambda_l2=20,
    is_unbalance=True,
)

# Two-step pipeline: LightGBM leaf one-hot features first, then logistic
# regression on top. The step names are the prefixes used when addressing
# hyper-parameters as "<step>__<param>".
model_pipeline = Pipeline(
    steps=[
        (
            "lgb_matrix",
            LgbmPipeline(num_leaves=32, n_estimators=50, other_params=lgb_params),
        ),
        (
            "lr",
            LrPipeline(class_weight={0: 1, 1: 5}, penalty='l1'),
        ),
    ]
)

调参过程

from sklearn.model_selection import GridSearchCV

# Search space for the whole pipeline. Keys follow the "<step>__<param>"
# convention, so one grid tunes both the LightGBM step and the LR step.
param_grid = {
    "lgb_matrix__num_leaves": [32],
    "lgb_matrix__n_estimators": [50],
    "lr__class_weight": [
        {0: 1, 1: 1},
        {0: 1, 1: 4},
        {0: 1, 1: 7},
    ],
    "lr__penalty": ['l1', 'l2'],
}

# Exhaustive search over `param_grid`, 2-fold cross-validation, scored by
# recall, using all available cores.
grid_search_gbdt_lr_pipeline = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=2,
    n_jobs=-1,
    verbose=1,
)

# Run the hyper-parameter search.
grid_search_gbdt_lr_pipeline.fit(X_train_tr, y_train)

# Keep the best pipeline found and report its score and parameters.
grid_search_gbdt_lr_best_1 = grid_search_gbdt_lr_pipeline.best_estimator_
print(
    "Best: %f using %s"
    % (
        grid_search_gbdt_lr_pipeline.best_score_,
        grid_search_gbdt_lr_pipeline.best_params_,
    )
)

你可能感兴趣的:(机器学习)