# Data processing and analysis
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime
import os
import glob
# sklearn models
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as gbm
from sklearn.ensemble import GradientBoostingClassifier as gbdt
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier
# sklearn feature engineering, data preparation, and evaluation
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_validate, KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import BernoulliRBM
from sklearn.datasets import make_blobs
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, ClassifierMixin
from sklearn import clone
# Keras data preparation
from keras.models import load_model
from keras.utils import to_categorical
# Keras neural network
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Table display options
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 30)
Using TensorFlow backend.
The data lives in a CSV file under 10 MB, so pandas is the main processing tool.
Basic preprocessing: the numeric variables are standardized, and the categorical variables are encoded numerically. Here columns 0 (times pregnant) and 7 (age) are one-hot encoded with get_dummies, which names each dummy column prefix_value (e.g. 0_3 is 1 when column 0 equals 3).
path = r'C:/Users/Administrator/Documents/ls/data/pima-indians-diabetes.data.csv'
data_set = pd.read_csv(filepath_or_buffer=path, encoding='utf-8', sep=',', index_col=False, header=None)
use_data = pd.get_dummies(data_set, columns=[0,7])
use_data.head()
|  | 1 | 2 | 3 | 4 | 5 | 6 | 8 | 0_0 | 0_1 | 0_2 | 0_3 | 0_4 | 0_5 | 0_6 | 0_7 | ... | 7_58 | 7_59 | 7_60 | 7_61 | 7_62 | 7_63 | 7_64 | 7_65 | 7_66 | 7_67 | 7_68 | 7_69 | 7_70 | 7_72 | 7_81 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

5 rows × 76 columns
# Standardize the numeric columns, fitting the scaler on the training set only
def normalize_cols(train_X, test_X, transform_cols=list(range(0, 6))):
    # After dropping the target, positions 0-5 of X hold the six numeric
    # features (original columns 1-6)
    scaler = StandardScaler()
    scaler.fit(train_X[:, transform_cols])
    train_X[:, transform_cols] = scaler.transform(train_X[:, transform_cols])
    test_X[:, transform_cols] = scaler.transform(test_X[:, transform_cols])
    return train_X, test_X
target_col = 8
input_cols = [c for c in use_data.columns if c != target_col]
X = use_data.loc[:, input_cols].astype(np.float32).values
y = use_data.loc[:, target_col].astype(str).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test = normalize_cols(X_train, X_test)
def acc_cv(model, train_x=X_train, train_y=y_train, n_folds=5, test_x=X_test, test_y=y_test):
    clf = make_pipeline(model)
    # Pass the KFold object itself as cv; calling get_n_splits() would reduce
    # it to a plain integer and discard the shuffling
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    acc_score = cross_val_score(clf, train_x, train_y, scoring="accuracy", cv=kf)
    print('*' * 50)
    print('train_accuracy: %.5f' % acc_score.mean())
    clf.fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    print('test_accuracy: %.5f' % accuracy_score(y_pred=y_pred, y_true=test_y))
    print('*' * 50)
    return acc_score
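acc_cv is defined here but not actually called in this section. A minimal usage sketch, with GaussianNB as an illustrative stand-in for any model imported above:
# Illustrative call only; any classifier imported above would work
scores = acc_cv(GaussianNB())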
class StackingAverageModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=123)
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)), dtype=np.float64)
        for i, model in enumerate(self.base_models):
            # KFold.split yields train/holdout index pairs; each clone is
            # trained on the train part and predicts its own holdout part
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        # The meta-model trains on the out-of-fold predictions of the base models
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        # For each base model, take the majority vote across its fold clones
        meta_features = np.column_stack([
            stats.mode(np.column_stack([model.predict(X) for model in base_models]), axis=1)[0]
            for base_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)
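StackingAverageModels is likewise defined but never exercised below. A hypothetical usage sketch; the base/meta model choices here are illustrative, not results from the original run:
# Illustrative only: three sklearn base models under a logistic-regression meta-model
stack_sklearn = StackingAverageModels(
    base_models=[DT(max_depth=4, random_state=123), GaussianNB(), KNN(n_neighbors=5)],
    meta_model=LR(random_state=123),
    n_folds=5)
stack_sklearn.fit(X_train, y_train)
print('stacked accuracy: %.5f' % accuracy_score(y_true=y_test, y_pred=stack_sklearn.predict(X_test)))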
Next, a class implementing the stacking approach where the base models are neural networks and the meta-model is another classifier. It covers loading the saved models, building the meta-level training set, training the meta-model, and producing the final predictions.
class StackingAverageModels_build2():
    '''
    First layer: the sub-models are neural networks.
    Second layer: the meta-model is any other classifier.
    '''
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.doc_dir = None
        self.members = None
        self.n_models = None
        self.meta_model = None

    def load_all_models(self, n_models, doc_dir):
        all_models = list()
        for i in range(n_models):
            filename = os.path.join(doc_dir, 'model_' + str(i + 1) + '.h5')
            model = load_model(filename)
            all_models.append(model)
            print('>loaded %s' % filename)
        self.members = all_models
        self.doc_dir = doc_dir
        self.n_models = n_models
        return all_models

    def stacked_dataset(self, inputX):
        '''
        First-layer models: predict on inputX and collect the outputs
        '''
        stackX = None
        for model in self.members:
            # Predict with each saved network
            y_pred = model.predict(inputX, verbose=0)
            # Stack predictions into [rows, probabilities, members]
            if stackX is None:
                stackX = y_pred
            else:
                stackX = np.dstack((stackX, y_pred))
        # Flatten predictions to [rows, members * probabilities]
        stackX = stackX.reshape((stackX.shape[0], stackX.shape[1] * stackX.shape[2]))
        return stackX

    def fit_stacked_model(self, meta_model):
        '''
        Second-layer model: train on the first layer's predictions
        return: the fitted meta-model
        '''
        inputX = self.X_test
        inputy = self.y_test
        # Build the meta-level training set
        stackedX = self.stacked_dataset(inputX)
        # Train the second-layer model
        meta_model.fit(stackedX, inputy)
        self.meta_model = meta_model
        return meta_model

    def stacked_prediction(self):
        '''
        Final predictions from the second-layer model
        '''
        # Build the meta-level feature set
        stackedX = self.stacked_dataset(self.X_test)
        # Predict with the fitted meta-model
        model = self.meta_model
        y_pred = model.predict(stackedX)
        return y_pred
# DNN built with the Keras Sequential API
def DNN_base_v1(X_train, y_train):
    model = models.Sequential()
    model.add(layers.Dense(96, activation='elu', kernel_regularizer=regularizers.l2(0.005),
                           input_shape=(X_train.shape[1],)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64, activation='elu', kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='elu', kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='elu', kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizers.Adadelta(), loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=1200, batch_size=50, validation_split=0.2, verbose=0, shuffle=True)
    # model.evaluate returns [loss, accuracy]
    results_train = model.evaluate(X_train, y_train)
    print('accuracy: %s' % results_train)
    return model
def DNN_fit_and_save(X_train, y_train, doc_dir, model_numbers):
    if not os.path.exists(doc_dir):
        os.makedirs(doc_dir)
    for i in range(model_numbers):
        model = DNN_base_v1(X_train, y_train)
        filename = os.path.join(doc_dir, 'model_' + str(i + 1) + '.h5')
        model.save(filename)
        print('>save %s' % filename)
doc_dir = r'C:\Users\Administrator\Documents\ls\tmp_models'
DNN_fit_and_save(X_train, y_train, doc_dir, 5)
614/614 [==============================] - 0s 86us/step
accuracy: [0.50532664577036801, 0.78175895784887506]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_1.h5
614/614 [==============================] - 0s 80us/step
accuracy: [0.50161608306126793, 0.78664495133421708]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_2.h5
614/614 [==============================] - 0s 86us/step
accuracy: [0.49014993210957181, 0.78827361582933109]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_3.h5
614/614 [==============================] - 0s 124us/step
accuracy: [0.50730565588326715, 0.77850162885864704]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_4.h5
614/614 [==============================] - 0s 83us/step
accuracy: [0.5090954220256122, 0.77198697087819101]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_5.h5
lr = LR(random_state=123, verbose=0)
svm_clf2 = SVC(kernel='rbf', class_weight='balanced', random_state=123)
dt = DT(max_depth=4,random_state=123)
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
rdf = RandomForestClassifier(random_state=123)
gbm_sklearn_model = gbdt(random_state=123)
xgb_model = xgb.XGBClassifier(seed=123)
gbm_model = gbm.LGBMClassifier(random_state=123)
Different meta-models: logistic regression, SVM, decision tree, XGBoost, LightGBM, GBDT, and so on. Each is combined with the neural-network base models into a stacked ensemble; with every meta-model left at its default parameters, we compare how the resulting stacks perform.
aa = StackingAverageModels_build2(X_train, y_train, X_test, y_test)
aa.load_all_models(doc_dir=r'C:\Users\Administrator\Documents\ls\tmp_models', n_models=5)
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_1.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_2.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_3.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_4.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_5.h5
aa.fit_stacked_model(meta_model=lr)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.80519480519480524
aa.fit_stacked_model(meta_model=dt)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.85064935064935066
aa.fit_stacked_model(meta_model=svm_clf2)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.7857142857142857
aa.fit_stacked_model(meta_model=xgb_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.9285714285714286
aa.fit_stacked_model(meta_model=rdf)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.96753246753246758
aa.fit_stacked_model(meta_model=gbm_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.88961038961038963
aa.fit_stacked_model(meta_model=gbm_sklearn_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
1.0
Conclusion: combining tree-based meta-models with neural-network base models works well here, and is worth trying on classification tasks. One caveat: fit_stacked_model trains the meta-model on X_test, and stacked_prediction is then scored on that same X_test, so these numbers (especially GBDT's perfect 1.0) are optimistic; a disjoint holdout would give a fairer comparison.
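A minimal sketch of that fairer setup, assuming we simply split the existing held-out data in half (the split and the fresh GBDT meta-model are illustrative, not part of the original run):
# Hypothetical leakage-free variant: meta-train and meta-evaluate on disjoint halves
X_meta, X_eval, y_meta, y_eval = train_test_split(X_test, y_test, test_size=0.5, random_state=123)
fair = StackingAverageModels_build2(X_train, y_train, X_meta, y_meta)
fair.load_all_models(n_models=5, doc_dir=doc_dir)
fair.fit_stacked_model(meta_model=gbdt(random_state=123))
y_pred = fair.meta_model.predict(fair.stacked_dataset(X_eval))
print('holdout accuracy: %.5f' % accuracy_score(y_true=y_eval, y_pred=y_pred))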
results = [0.80519480519480524, 0.85064935064935066, 0.7857142857142857, 0.9285714285714286, 0.96753246753246758, 0.88961038961038963,1.0]
results = [float('%.4f' %(i)) for i in results]
model_name = ['LR', 'DT', 'SVM', 'Xgb', 'RDF', 'Lgbm', 'GBDT']
df_plot = pd.DataFrame({'model':model_name, 'accuracy':results})
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
# Plot first, then set the labels; barplot would otherwise overwrite them
sns.barplot(x='model', y='accuracy', data=df_plot, ax=ax)
ax.set_xlabel('model', fontsize=20)
ax.set_ylabel('accuracy', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()