This prediction task is based on the marketing campaigns of a Portuguese banking institution. The campaigns were conducted mainly by phone: a bank agent contacted each client at least once to find out whether the client was willing to subscribe to the bank's product (a term deposit). The task is a binary classification problem: predict whether a client will subscribe to the product.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn import ensemble
from sklearn import model_selection
from sklearn import multiclass
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import roc_curve, auc,roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from scipy.stats import chi2_contingency
train_set = pd.read_csv('train_set.csv')
test_set = pd.read_csv('test_set.csv')
print(train_set.shape)
print(test_set.shape)
#(25317, 18)
#(10852, 17)
#train_set.info()
#test_set.info()
Data fields
ID: unique client identifier
age: client age
job: type of job
marital: marital status
education: education level
default: whether the client has a credit default record
balance: average yearly account balance
housing: whether the client has a housing loan
loan: whether the client has a personal loan
contact: contact communication type
day: day of the month of the last contact
month: month of the last contact
duration: duration of the last contact
campaign: number of contacts with this client during the current campaign
pdays: number of days since the client was last contacted in a previous campaign (999 means never contacted before)
previous: number of contacts with this client before the current campaign
poutcome: outcome of the previous marketing campaign
y: target, whether the client will subscribe to a term deposit
test_set = test_set.drop_duplicates()
train_set = train_set.drop_duplicates()
print(train_set.shape)
print(test_set.shape)
#(25317, 18)
#(10852, 17)
# The data contain no duplicate rows
# There are no NA values, but there are 'unknown' values
train_set.isin(['unknown']).mean()*100
#job:0.643836,education:4.206660,contact:28.759332,poutcome:81.672394
test_set.isin(['unknown']).mean()*100
#job:0.552893,education:4.128271,contact:28.676742,poutcome:81.800590
# Fill the 'unknown' values in job, education and contact with the mode; drop poutcome because too many of its values are missing
train_set.drop(['poutcome'],inplace=True,axis=1)
train_set['job'].replace('unknown', train_set['job'].mode()[0], inplace=True)
train_set['education'].replace('unknown', train_set['education'].mode()[0], inplace=True)
train_set['contact'].replace('unknown', train_set['contact'].mode()[0], inplace=True)
test_set.drop(['poutcome'],inplace=True,axis=1)
test_set['job'].replace('unknown', test_set['job'].mode()[0], inplace=True)
test_set['education'].replace('unknown', test_set['education'].mode()[0], inplace=True)
test_set['contact'].replace('unknown', test_set['contact'].mode()[0], inplace=True)
# Categorical (discrete) columns
object_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact','month']
# Numeric (continuous) columns
num_columns = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous','day']
def barplot(x, y, **kwargs):
    sns.barplot(x=x, y=y)
    plt.xticks(rotation=45)
f = pd.melt(train_set, value_vars=object_columns, id_vars='y')
g = sns.FacetGrid(f, col="variable", col_wrap=3, sharex=False, sharey=False, height=5)
g = g.map(barplot, "value", 'y')
job: retirees and students are the most likely to subscribe to a term deposit, followed by the unemployed and management; blue-collar workers are the least willing
marital: single clients are more willing to subscribe than divorced or married ones
education: willingness to subscribe decreases from tertiary to secondary to primary education
default: clients with no default record are about twice as likely to subscribe as those with a default record
housing: clients without a housing loan are more than twice as likely to subscribe as those with one
loan: clients without a personal loan are more likely to subscribe
contact: whether the client was reached by cellular or by telephone has no obvious effect on the decision to subscribe
month: the subscription rate is highest for contacts made in December, October, September and March
g = sns.FacetGrid(train_set, col='marital', height=5)
g.map(sns.barplot, 'default', 'y', 'education')
g.add_legend()
Clients who are single, have no default record and have tertiary education are relatively likely to subscribe to a term deposit.
Clients who are divorced, have a default record and have tertiary education are relatively unlikely to subscribe.
def barplot(x, y, **kwargs):
    sns.barplot(x=x, y=y)
    plt.xticks(rotation=90)
g = sns.FacetGrid(train_set, col='education', col_order=['primary', 'secondary', 'tertiary'], height=5)
g.map(barplot, 'job', 'y')
As the education level rises, retirees and blue-collar workers become more willing to subscribe to a term deposit, while students become less willing.
f = pd.melt(train_set, value_vars=num_columns ,id_vars = 'y')
g = sns.FacetGrid(f, col="variable", col_wrap=3, sharex=False, sharey=False, height=5, hue='y')
g = g.map(sns.distplot,"value",bins=20)
g.add_legend()
age: clients under 30 are relatively willing to subscribe, willingness drops between 30 and 60, and clients over 60 are clearly the most likely to subscribe to a term deposit
duration: the subscription rate rises once the last contact lasts longer than roughly 300 seconds
campaign: more contacts during the current campaign do not noticeably raise the subscription rate
pdays: the subscription rate is noticeably higher when about 140-160 or 180-200 days have passed since the last contact of the previous campaign
previous: more contacts before the current campaign do not noticeably raise the subscription rate and may even backfire
day: there are three subscription peaks within each month, and clients are least likely to subscribe on the 23rd-25th of the month
sns.countplot(train_set['y'])
# The class distribution is imbalanced; oversampling can be used to address this
# Descriptive statistics
train_set.describe()
test_set.describe()
# Distribution of the numeric features (boxplots)
boxplot_features = ['balance', 'duration', 'campaign', 'pdays', 'previous']
f = pd.melt(train_set, value_vars=boxplot_features, id_vars=['y'])
g = sns.FacetGrid(f, col="variable", col_wrap=3, sharex=False, sharey=False, height=5)
g = g.map(sns.boxplot, "value")
The boxplots show a large number of extreme values. Dropping them would distort the original data, so the numeric features are instead binned later.
# Inspect the values of the categorical variables
for column in object_columns:
    print(column, ': ', train_set[column].unique())
#job : ['management' 'technician' 'admin.' 'services' 'retired' 'student' 'blue-collar' 'entrepreneur' 'housemaid' 'self-employed' 'unemployed']
#marital : ['married' 'divorced' 'single']
#education : ['tertiary' 'primary' 'secondary']
#default : ['no' 'yes']
#housing : ['yes' 'no']
#loan : ['no' 'yes']
#contact : ['cellular' 'telephone']
#month : ['may' 'apr' 'jul' 'jun' 'nov' 'aug' 'jan' 'feb' 'dec' 'oct' 'sep' 'mar']
# Construct quarter and half-year features from the contact month
def quarter(data):
    if data in ['jan', 'feb', 'mar']:
        return 'Q1'
    elif data in ['apr', 'may', 'jun']:
        return 'Q2'
    elif data in ['jul', 'aug', 'sep']:
        return 'Q3'
    else:
        return 'Q4'
def halfyear(data):
    if data in ['jan', 'feb', 'mar', 'apr', 'may', 'jun']:
        return 'H1'
    else:
        return 'H2'
Encode the discrete features and bin the continuous ones. Common binning methods include equal-frequency, equal-width, clustering-based, chi-square and minimum-entropy binning. Here qcut and cut are applied selectively according to each feature's distribution, and the WOE and IV values are used to decide the number of bins.
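For reference, writing good_i and bad_i for the number of y = 1 and y = 0 samples in bin i, and G and B for their overall totals, the quantities computed by the CalcWOE function below are
WOE_i = ln((good_i / G) / (bad_i / B))
IV = Σ_i (good_i / G − bad_i / B) · WOE_i
A larger total IV indicates a more predictive binning.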
def CalcWOE(df, col, target):
    '''
    df: dataframe
    col: a column that has already been binned; compute the WOE of each bin and the total IV
    target: target column with 0/1 values
    return: the WOE of each bin and the total IV
    '''
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].count() - df.groupby([col])[target].sum()
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    regroup['good'] = regroup['total'] - regroup['bad']
    G = N - B
    regroup['bad_pcnt'] = regroup['bad'].map(lambda x: x * 1.0 / B)
    regroup['good_pcnt'] = regroup['good'].map(lambda x: x * 1.0 / G)
    regroup['WOE'] = regroup.apply(
        lambda x: np.log(x.good_pcnt * 1.0 / x.bad_pcnt), axis=1)
    WOE_dict = regroup[[col, 'WOE']].set_index(col).to_dict(orient='index')
    IV = regroup.apply(
        lambda x:
        (x.good_pcnt - x.bad_pcnt) * np.log(x.good_pcnt * 1.0 / x.bad_pcnt),
        axis=1)
    IV_SUM = sum(IV)
    return {'WOE': WOE_dict, 'IV_sum': IV_SUM, 'IV': IV}
# Check whether the bad rate (and hence the WOE) is monotone across bins
def BadRateMonotone(df, sortByVar, target):
    # df[sortByVar] has already been binned
    df2 = df.sort_values(by=[sortByVar])
    total = df2.groupby([sortByVar])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df2.groupby([sortByVar])[target].count() - df2.groupby(
        [sortByVar])[target].sum()
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    combined = zip(regroup['total'], regroup['bad'])
    badRate = [x[1] * 1.0 / x[0] for x in combined]
    badRateMonotone = [
        badRate[i] < badRate[i + 1] for i in range(len(badRate) - 1)
    ]
    # Monotone when every consecutive comparison goes in the same direction
    return len(set(badRateMonotone)) == 1
def num_band(df, columns, target, min_num, max_num):
    # For each column, try bin counts from min_num to max_num - 1 and record the IV and monotonicity
    result = []
    for col in columns:
        for i in range(min_num, max_num):
            try:
                df['band'] = pd.cut(df[col], i)
                WOE_IV = CalcWOE(df, 'band', target)
                T_F = BadRateMonotone(df, 'band', target)
                result.append([col, i, WOE_IV['IV_sum'], T_F])
            except:
                continue
    return pd.DataFrame(result, columns=['column', 'num', 'IV_sum', 'T_F'])
num_band(train_set, num_columns, 'y', 2, 10)
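For reference, the table returned by num_band could be filtered to keep only the monotone binnings and rank them by IV; a minimal sketch (band_result is an illustrative name):
band_result = num_band(train_set, num_columns, 'y', 2, 10)
# Keep only bin counts with a monotone bad rate, then sort by IV within each column
band_result[band_result['T_F']].sort_values(['column', 'IV_sum'], ascending=[True, False])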
# Choose the number of bins for which the bad rate/WOE is monotone and the IV is largest
for dataset in [train_set]:
    dataset['balanceBin'] = pd.qcut(dataset['balance'], 5)
    dataset['ageBin'] = pd.cut(dataset['age'].astype(int), [0, 30, 60, 100])
    dataset['quarter'] = dataset['month'].map(quarter)
    dataset['halfyear'] = dataset['month'].map(halfyear)
    dataset['dayBin'] = pd.cut(dataset['day'], 2)
    dataset['durationBin'] = pd.qcut(dataset['duration'], 9)
    dataset['campaignBin'] = pd.qcut(dataset['campaign'], 2)
    dataset['pdaysBin'] = pd.cut(dataset['pdays'], 9)
    dataset['previousBin'] = pd.cut(dataset['previous'], 9)
    dataset['all_previous'] = dataset['campaign'] + dataset['previous']
    dataset['all_previousBin'] = pd.cut(dataset['all_previous'], 2)
For each binned feature, compute the largest share that any single bin takes of the whole sample. If one bin accounts for more than 95% of the data, the feature is severely skewed and should be dropped; by this rule previousBin and all_previousBin are removed.
def MaximumBinPcnt(df, col):
    N = df.shape[0]
    total = df.groupby([col])[col].count()
    pcnt = total * 1.0 / N
    return max(pcnt)
discrete_columns = [
'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
'month', 'y', 'balanceBin', 'ageBin', 'quarter', 'halfyear', 'dayBin',
'durationBin', 'campaignBin', 'pdaysBin', 'previousBin', 'all_previousBin'
]
for column in discrete_columns:
    print(column, ':', MaximumBinPcnt(train_set, column))
Label-encode the categorical and binned features.
label = LabelEncoder()
for dataset in [train_set]:
    dataset['job_Code'] = label.fit_transform(dataset['job'])
    dataset['ageBin_Code'] = label.fit_transform(dataset['ageBin'])
    dataset['marital_Code'] = label.fit_transform(dataset['marital'])
    dataset['education_Code'] = label.fit_transform(dataset['education'])
    dataset['default_Code'] = label.fit_transform(dataset['default'])
    dataset['balanceBin_Code'] = label.fit_transform(dataset['balanceBin'])
    dataset['housing_Code'] = label.fit_transform(dataset['housing'])
    dataset['loan_Code'] = label.fit_transform(dataset['loan'])
    dataset['contact_Code'] = label.fit_transform(dataset['contact'])
    dataset['dayBin_Code'] = label.fit_transform(dataset['dayBin'])
    dataset['month_Code'] = label.fit_transform(dataset['month'])
    dataset['durationBin_Code'] = label.fit_transform(dataset['durationBin'])
    dataset['campaignBin_Code'] = label.fit_transform(dataset['campaignBin'])
    dataset['pdaysBin_Code'] = label.fit_transform(dataset['pdaysBin'])
    dataset['quarter_Code'] = label.fit_transform(dataset['quarter'])
    dataset['halfyear_Code'] = label.fit_transform(dataset['halfyear'])
Use a chi-square test of independence to screen the discrete (encoded) features.
columns_train_data_x = [
    'job_Code', 'ageBin_Code', 'marital_Code', 'education_Code',
    'default_Code', 'balanceBin_Code', 'housing_Code', 'loan_Code',
    'contact_Code', 'dayBin_Code', 'month_Code', 'durationBin_Code',
    'campaignBin_Code', 'pdaysBin_Code', 'quarter_Code', 'halfyear_Code'
]
Target = ['y']
for col in columns_train_data_x:
    obs = pd.crosstab(train_set['y'],
                      train_set[col],
                      rownames=['y'],
                      colnames=[col])
    chi2, p, dof, expect = chi2_contingency(obs)
    print("{} chi-square test p-value: {:.4f}".format(col, p))
# All features have chi-square p-values well below 0.01, so all of them are kept
# Use Spearman rank correlation to check the pairwise correlation between features
def correlation_heatmap(df):
    _, ax = plt.subplots(figsize=(20, 16))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        df.corr('spearman'),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 10}
    )
    plt.title('Spearman Correlation of Features', y=1.05, size=15)
correlation_heatmap(train_set[columns_train_data_x + Target])
quarter_Code and halfyear_Code are strongly correlated, so only halfyear_Code is kept.
columns_train_data_x = [
'job_Code', 'ageBin_Code', 'marital_Code', 'education_Code',
'default_Code', 'balanceBin_Code', 'housing_Code', 'loan_Code',
'contact_Code', 'dayBin_Code', 'month_Code', 'durationBin_Code',
'campaignBin_Code', 'pdaysBin_Code', 'halfyear_Code'
]
train_data_x = train_set[columns_train_data_x]
train_data_y = train_set['y']
train_data_x = pd.get_dummies(train_data_x , columns=columns_train_data_x)
Oversample the training samples. Besides plain SMOTE, other resampling methods include BorderlineSMOTE, ADASYN and SMOTETomek (a sketch of swapping one in follows the code below).
# Oversample the minority class with SMOTE
train_data_x,train_data_y = SMOTE().fit_resample(train_data_x,train_data_y)
train_data_y.value_counts()
#0 22356
#1 22356
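If one of the alternative samplers mentioned above were used instead of plain SMOTE, only the sampler object would change, since they all expose the same fit_resample interface; a minimal sketch (not run here, names are illustrative):
from imblearn.over_sampling import BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTETomek
# SMOTETomek combines SMOTE oversampling with Tomek-link cleaning;
# BorderlineSMOTE() and ADASYN() are drop-in alternatives
sampler = SMOTETomek(random_state=0)
train_data_x_res, train_data_y_res = sampler.fit_resample(train_data_x, train_data_y)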
GBDT (Gradient Boosting Decision Tree) is an ensemble learning algorithm that is widely used in industry.
Here GBDT is tuned directly with a grid search. If the dataset were much larger, a greedy (stepwise) tuning strategy could be used instead, although it only reaches a local optimum (a sketch follows).
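A minimal sketch of that greedy, stepwise approach; the parameter ranges here are illustrative and are not the grid actually used below:
# Greedy (stepwise) tuning sketch: fix most parameters, tune one group at a time,
# and carry the best values forward
greedy_params = dict(learning_rate=0.1, max_features='sqrt', random_state=0)
# Step 1: tune the number of trees with everything else fixed
step1 = GridSearchCV(ensemble.GradientBoostingClassifier(**greedy_params),
                     param_grid={'n_estimators': [100, 200, 300]},
                     cv=3, scoring='roc_auc')
step1.fit(train_data_x, train_data_y)
greedy_params['n_estimators'] = step1.best_params_['n_estimators']
# Step 2: tune tree complexity with the best n_estimators from step 1
step2 = GridSearchCV(ensemble.GradientBoostingClassifier(**greedy_params),
                     param_grid={'max_depth': [3, 5, 7], 'min_samples_split': [2, 50, 200]},
                     cv=3, scoring='roc_auc')
step2.fit(train_data_x, train_data_y)
greedy_params.update(step2.best_params_)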
GBDT_n_estimators = [120, 300]
GBDT_learning_rate = [0.001, 0.01]
GBDT_max_features = ['sqrt']
GBDT_max_depth = [3, 5, 8]
#GBDT_min_samples_split = [1, 2, 5, 10, 15, 100]
#GBDT_min_samples_leaf = [1, 2, 5, 10]
#GBDT_subsample = [0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_grid = {
'n_estimators': GBDT_n_estimators,
'learning_rate': GBDT_learning_rate,
'max_features': GBDT_max_features,
'max_depth': GBDT_max_depth,
#'min_samples_split': GBDT_min_samples_split,
#'min_samples_leaf': GBDT_min_samples_leaf,
#'subsample': GBDT_subsample
}
cv_split = model_selection.ShuffleSplit(n_splits=10,
test_size=.3,
random_state=0)
model_tunning = GridSearchCV(ensemble.GradientBoostingClassifier(),
param_grid=param_grid,
cv=cv_split,
scoring='roc_auc')
model_tunning.fit(train_data_x, train_data_y)
print('Best score:', model_tunning.best_score_)  # highest cross-validated score
print('Best parameters:', model_tunning.best_params_)
print('Best estimator:', model_tunning.best_estimator_)
best_model = model_tunning.best_estimator_
Common model evaluation tools include the confusion matrix, precision, recall, the ROC curve and the AUC value.
# Confusion matrix
x_train, x_test, y_train, y_test = model_selection.train_test_split(
train_data_x, train_data_y, train_size=.7)
model = ensemble.GradientBoostingClassifier(n_estimators=300,
learning_rate=0.1,
max_features='sqrt',
max_depth=7,
min_samples_split=500,
min_samples_leaf=60,
subsample=1)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
cm = confusion_matrix(y_test, y_predict)
ConfusionMatrixDisplay(cm).plot(cmap='Blues')
from sklearn.metrics import accuracy_score, precision_score,recall_score
print(accuracy_score(y_test, y_predict))  # accuracy
print(precision_score(y_test, y_predict, average='weighted'))  # weighted precision
print(recall_score(y_test, y_predict))  # recall
score = cross_val_score(model, train_data_x, train_data_y, cv=5, n_jobs=1, scoring='roc_auc')
score.mean()
# Final cross-validated ROC AUC: 0.9830339716115262
ROC curve
#ROC/AUC
model.fit(x_train, y_train)
def get_rocauc(y, X, clf):
    FPR, recall, thresholds = roc_curve(y, clf.predict_proba(X)[:, 1], pos_label=1)
    area = roc_auc_score(y, clf.predict_proba(X)[:, 1])
    # Pick the threshold that maximises recall - FPR (Youden's J statistic)
    maxindex = (recall - FPR).tolist().index(max(recall - FPR))
    threshold = thresholds[maxindex]
    plt.figure()
    plt.plot(FPR, recall, color='red', label='ROC curve (area = %0.2f)' % area)
    plt.plot([0, 1], [0, 1], color='black', linestyle='--')
    plt.scatter(FPR[maxindex], recall[maxindex], c='black', s=30)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('Recall')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc='lower right')
    plt.show()
    return threshold
threshold = get_rocauc(y_test, x_test, model)
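The threshold returned above (the cut-off maximising recall − FPR) can then be applied to the predicted probabilities in place of the default 0.5 used by model.predict; a minimal sketch:
# Classify as positive when the predicted probability exceeds the custom threshold
proba = model.predict_proba(x_test)[:, 1]
y_predict_threshold = (proba >= threshold).astype(int)
print(classification_report(y_test, y_predict_threshold))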