A learning curve shows how a model's scores on the training set and on the validation set change as the size of the training set varies.
In other words, the number of training samples is plotted on the x-axis, and the score on the training and cross-validation sets (e.g., accuracy) on the y-axis.
A learning curve helps us judge which state the model is currently in: overfitting (high variance) or underfitting (high bias).
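Before defining the full plotting helper below, here is a minimal sketch of what sklearn's learning_curve itself returns (illustrative only; load_iris and LogisticRegression are stand-ins, not part of the original code):
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
import numpy as np

X_demo, y_demo = load_iris(return_X_y=True)
sizes, train_scores, val_scores = learning_curve(
    LogisticRegression(max_iter=1000), X_demo, y_demo,
    cv=5, shuffle=True, random_state=0,
    train_sizes=np.linspace(0.1, 1.0, 5))
print(sizes.shape)         # (5,)   absolute training-set sizes actually used
print(train_scores.shape)  # (5, 5) one row per size, one column per CV fold
print(val_scores.shape)    # (5, 5) scores on each held-out fold
These per-fold score matrices are exactly what the helper below averages and plots.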
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np
# Plot the learning curve to assess the model's state (underfitting vs. overfitting)
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot the learning curve of the given data on the given model.

    Parameters
    ----------
    estimator : the classifier to use
    title : plot title
    X : input features, as a numpy array
    y : target vector
    ylim : tuple (ymin, ymax) setting the lower and upper limits of the y-axis
    cv : number of folds used for cross-validation; one fold serves as the
         validation set and the remaining n-1 folds as training data
         (None falls back to scikit-learn's default splitting)
    """
    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Shade one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid(True)
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target
import xgboost as xgb
# Gradient-boosted tree classifier; the hyperparameters below are the values used here
xgb_model = xgb.XGBClassifier(objective='binary:hinge',
                              nthread=16,
                              booster='gbtree',
                              n_estimators=500,
                              learning_rate=0.05,
                              max_depth=9,
                              subsample=0.8,
                              colsample_bytree=0.8)
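Before running the relatively expensive learning-curve sweep, an optional sanity check (a sketch, not in the original code) with cross_val_score confirms that this configuration trains and scores as expected:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))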
plot_learning_curve(xgb_model, 'Learning Curve', X, y, ylim=None, cv=6,
                    train_sizes=np.linspace(.1, 1.0, 5))
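The plot is usually read qualitatively, but as a rough optional heuristic the same diagnosis can be read off the numbers at the largest training size: a high training score with a much lower cross-validation score points to high variance (overfitting), while two low, nearly equal scores point to high bias (underfitting). The sketch below reuses xgb_model, X, y and the imports defined above and is not part of the original code:
sizes, tr, va = learning_curve(xgb_model, X, y, cv=6,
                               train_sizes=np.linspace(.1, 1.0, 5))
tr_mean, va_mean = tr.mean(axis=1), va.mean(axis=1)
gap = tr_mean[-1] - va_mean[-1]   # train/CV gap at the largest training size
print("final training score        : %.3f" % tr_mean[-1])
print("final cross-validation score: %.3f" % va_mean[-1])
print("gap                         : %.3f" % gap)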