新人赛地址
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# Load the training and test tables from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./train_set.csv", "./test_set.csv")
)
# Print the dtype and non-null count of every training column.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 25317 non-null int64
1 age 25317 non-null int64
2 job 25317 non-null object
3 marital 25317 non-null object
4 education 25317 non-null object
5 default 25317 non-null object
6 balance 25317 non-null int64
7 housing 25317 non-null object
8 loan 25317 non-null object
9 contact 25317 non-null object
10 day 25317 non-null int64
11 month 25317 non-null object
12 duration 25317 non-null int64
13 campaign 25317 non-null int64
14 pdays 25317 non-null int64
15 previous 25317 non-null int64
16 poutcome 25317 non-null object
17 y 25317 non-null int64
dtypes: int64(9), object(9)
memory usage: 3.5+ MB
NO | 字段名称 | 数据类型 | 字段描述 |
---|---|---|---|
1 | ID | Int | 客户唯一标识 |
2 | age | Int | 客户年龄 |
3 | job | String | 客户的职业 |
4 | marital | String | 婚姻状况 |
5 | education | String | 受教育水平 |
6 | default | String | 是否有违约记录 |
7 | balance | Int | 每年账户的平均余额 |
8 | housing | String | 是否有住房贷款 |
9 | loan | String | 是否有个人贷款 |
10 | contact | String | 与客户联系的沟通方式 |
11 | day | Int | 最后一次联系的时间(几号) |
12 | month | String | 最后一次联系的时间(月份) |
13 | duration | Int | 最后一次联系的交流时长 |
14 | campaign | Int | 在本次活动中,与该客户交流过的次数 |
15 | pdays | Int | 距离上次活动最后一次联系该客户,过去了多久(999表示没有联系过) |
16 | previous | Int | 在本次活动之前,与该客户交流过的次数 |
17 | poutcome | String | 上一次活动的结果 |
18 | y | Int | 预测客户是否会订购定期存款业务 |
# Rank the numeric columns by the absolute value of their correlation with the
# target `y`. The frame is narrowed to numeric dtypes first: `train` also holds
# text columns, and newer pandas versions raise on them inside corr() instead
# of silently skipping them.
train.select_dtypes(include='number').corr()['y'].abs().sort_values(ascending=False)
y 1.000000
ID 0.556627
duration 0.394746
pdays 0.107565
previous 0.088337
campaign 0.075173
balance 0.057564
day 0.031886
age 0.029916
Name: y, dtype: float64
# Split the columns of `train` by dtype: text (object) columns vs. the rest.
s = (train.dtypes == 'object')
object_col = list(s[s].index)
object_col
# Take the remaining columns in the DataFrame's own order. Building this list
# from a set difference made the subplot order change from run to run.
num_col = list(s[~s].index)

# One distribution plot per non-object column, laid out on a 3x3 grid.
plt.figure(figsize=(25,22))
for (i,col) in enumerate(num_col):
    plt.subplot(3,3,i+1)
    # NOTE: seaborn has deprecated distplot in favour of histplot/displot; it
    # is kept here so the cell still runs on older seaborn installations.
    sns.distplot(train[col]) # kde=False hides the density curve
    plt.xlabel(col,size=20)
plt.show()
y 标签的比例:
len(train[train['y']==1])/len(train['y'])
0.11695698542481336
只有 11% 的人会购买
# Target vector first, then the feature tables: both drop the identifier
# column, and the training features also drop the target itself.
y_train = train['y']
X_train = train.drop(columns=['ID', 'y'])
X_test = test.drop(columns=['ID'])
def num_cat_splitor(X_train):
    """Split the columns of a DataFrame into non-object and object columns.

    Args:
        X_train: DataFrame whose columns are to be classified by dtype.

    Returns:
        A ``(num_cols, object_cols)`` tuple of column-name lists. Both lists
        follow the column order of ``X_train``.
    """
    is_object = (X_train.dtypes == 'object')
    object_cols = list(is_object[is_object].index)
    # Select by mask rather than by set difference: a set has no stable order,
    # so the numeric feature order would differ between interpreter runs.
    num_cols = list(is_object[~is_object].index)
    return num_cols, object_cols
num_cols, object_cols = num_cat_splitor(X_train)
# List the distinct values of every text column, training table first and
# test table second, so the two sets of categories can be compared.
for col in object_col:
    for frame in (train, test):
        print(col, sorted(frame[col].unique()))
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks the given columns out of a DataFrame.

    The selected columns are handed on as a plain array.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns named in ``attribute_names`` as an array."""
        selected = X[self.attribute_names]
        return selected.to_numpy()
import inspect

# OneHotEncoder's dense-output switch is called `sparse` in older scikit-learn
# releases and `sparse_output` in newer ones, where the old name is rejected.
# Pick whichever keyword the installed version accepts.
_onehot_dense_kw = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)

# Numeric branch: select the numeric columns and standardise them.
# No imputer step is used; train.info() reports no nulls in the training table.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_cols)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the text columns and one-hot encode them into a
# dense array. Categories not seen during fit are encoded as all zeros.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(object_cols)),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore',
                                  **{_onehot_dense_kw: False})),
])

# Concatenate the outputs of the two branches side by side.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
X_prepared = full_pipeline.fit_transform(X_train)
from sklearn.ensemble import RandomForestClassifier

# Chain preprocessing and the classifier so that each cross-validation fold
# fits the scaler and encoder on its own training part only.
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('forst_reg', RandomForestClassifier(random_state=0))
])
# NOTE(review): the printed category lists suggest about 51 encoded columns
# (7 numeric + 44 one-hot), so max_features of 55 and 65 probably exceed the
# feature count and those candidates would fail to fit. Confirm and trim.
param_grid = [{
    'forst_reg__n_estimators' : [50,100, 150, 200,250,300,330,350],
    'forst_reg__max_features':[45,50, 55, 65]
}]
grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=7,
                                scoring='roc_auc', verbose=2, n_jobs=-1)
grid_search_prep.fit(X_train,y_train)
grid_search_prep.best_params_
final_model = grid_search_prep.best_estimator_

# The model is selected on ROC-AUC, a ranking metric, so submit the predicted
# probability of the positive class (y == 1). Hard 0/1 labels from predict()
# discard the ranking and lower the AUC.
y_pred_test = final_model.predict_proba(X_test)[:, 1]

# Write the submission file: one row per test ID with its predicted score.
result = pd.DataFrame()
result['ID'] = test['ID']
result['pred'] = y_pred_test
result.to_csv('buy_product_pred.csv',index=False)
auc 得分:0.72439844
def split_age(data):
    """Add an ``age_`` column holding the age bracket of every row.

    Brackets: ``A`` for age < 25, ``B`` for < 35, ``C`` for < 45, ``D`` for
    < 55, ``E`` for < 65 and ``F`` for everything else.

    The column is computed in one vectorised step. The former row-by-row
    ``data['age_'][i] = ...`` was a chained assignment: it was slow, it is not
    guaranteed to write into ``data``, and it required a 0..n-1 index and an
    existing ``age_`` column.

    Args:
        data: DataFrame with a numeric ``age`` column. It is modified in
            place; an existing ``age_`` column is overwritten.

    Returns:
        The same ``data`` object, with the ``age_`` column set.
    """
    age = data['age']
    # np.select takes the first matching condition, so the order of the
    # thresholds reproduces the original if/elif chain.
    upper_bounds = [25, 35, 45, 55, 65]
    conditions = [age < bound for bound in upper_bounds]
    brackets = np.select(conditions, ['A', 'B', 'C', 'D', 'E'], default='F')
    # Store as object dtype so the column is still treated as a text column.
    data['age_'] = pd.Series(brackets, index=data.index, dtype=object)
    return data
# Create the placeholder bracket column on both tables, then fill it in.
train['age_'], test['age_'] = '', ''
train, test = split_age(train), split_age(test)
# Optional sanity check of the bracket sizes: test['age_'].value_counts()

# Rebuild the feature tables with the raw age replaced by its bracket.
X_train = train.drop(columns=['ID', 'y', 'age'])
X_test = test.drop(columns=['ID', 'age'])
几乎没有效果:得分,0.7199633663512787
# Print the sorted distinct values of each text column, once for the
# training table and once for the test table.
for col in object_col:
    for frame in (train, test):
        levels = sorted(frame[col].unique())
        print(col, levels)
job ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown']
job ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown']
marital ['divorced', 'married', 'single']
marital ['divorced', 'married', 'single']
education ['primary', 'secondary', 'tertiary', 'unknown']
education ['primary', 'secondary', 'tertiary', 'unknown']
default ['no', 'yes']
default ['no', 'yes']
housing ['no', 'yes']
housing ['no', 'yes']
loan ['no', 'yes']
loan ['no', 'yes']
contact ['cellular', 'telephone', 'unknown']
contact ['cellular', 'telephone', 'unknown']
month ['apr', 'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep']
month ['apr', 'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep']
poutcome ['failure', 'other', 'success', 'unknown']
poutcome ['failure', 'other', 'success', 'unknown']
发现有 unknown,检查其比例
# Percentage of cells in each column that hold the literal 'unknown'.
train.isin(['unknown']).mean().mul(100)
ID 0.000000
age 0.000000
job 0.643836
marital 0.000000
education 4.206660
default 0.000000
balance 0.000000
housing 0.000000
loan 0.000000
contact 28.759332 # 缺失较多
day 0.000000
month 0.000000
duration 0.000000
campaign 0.000000
pdays 0.000000
previous 0.000000
poutcome 81.672394 # 缺失较多
y 0.000000
age_ 0.000000
dtype: float64
# Frequency of each contact channel in the training table.
train.loc[:, 'contact'].value_counts()
cellular 16391
unknown 7281
telephone 1645
Name: contact, dtype: int64
# Replace the 'unknown' contact values with 'cellular'.
# A single .loc assignment writes into `train` itself; the chained form
# train['contact'][mask] = ... may assign to a temporary copy instead.
train.loc[train['contact'] == 'unknown', 'contact'] = 'cellular'
poutcome:缺失较多,丢弃
train['poutcome'].value_counts()
unknown 20677
failure 2735
other 1070
success 835
Name: poutcome, dtype: int64
再次提交:得分,0.7028823729532243,更差了
求大佬分享下做法,学习一下。