#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 2017-04-10
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fclusterdata
from scipy.sparse import coo_matrix, bmat
from sklearn import preprocessing
PLOT = False  # whether to draw plots
N = 1  # number of results to output
data = pd.read_csv(r'E:\7 Python\data\t2.csv')
print(data.head())
print(data.info())
print(data.columns)
data.columns = ['ID', 'FAULT_TYPE_3', 'work_months', 'save_months',
                'FAULT_MONTH', 'INST_MONTH', 'PS_MONTH', 'SYNC_ORG_NO',
                'SPEC_CODE', 'COMM_MODE', 'ARRIVE_BATCH_NO', 'MANUFACTURER',
                'TYPE_CODE', 'EXCHG_TYPE_CODE', 'LC_FLAG', 'TL_SHARE_FLAG',
                'MP_CAP', 'TRADE_CODE', 'ELEC_TYPE_CODE', 'RUN_CAP',
                'CUST_STATUS_CODE', 'TRANSFER_CODE', 'TMP_FLAG']
# plot the fault distribution
print(data['FAULT_TYPE_3'].value_counts())
fig, axis0 = plt.subplots(1, 1)
sns.countplot(x='FAULT_TYPE_3', data=data, ax=axis0)
# plt.show()
# SYNC_ORG_NO
print(data['SYNC_ORG_NO'].describe())
# plot
def plot_fun(name_fea, name_fault, figsize=None, fontsize=None):
    fig, axis1 = plt.subplots(1, 1, figsize=figsize)
    sns.countplot(x=name_fea, data=data, ax=axis1)
    fig, axis2 = plt.subplots(1, 1, figsize=figsize)
    c = data[name_fea].value_counts()
    s = c.cumsum() / c.sum()
    axis2.plot(np.arange(s.shape[0]) + 1, s.values * 100)
    axis2.set_title('percent of %s' % name_fea)
    fig, axis3 = plt.subplots(1, 1, figsize=figsize)
    sns.countplot(x=name_fea, hue=name_fault, data=data, ax=axis3)
    plt.legend(loc=2)
    fig, axis4 = plt.subplots(1, 1, figsize=figsize)
    sns.countplot(x=name_fault, hue=name_fea, data=data, ax=axis4)
    plt.legend(loc=2, fontsize=fontsize)
    # calculate similarity score: clustermap of per-category fault ratios
    fault_num1 = data.groupby([name_fault, name_fea])[data.columns[0]].count().unstack()
    ratio = fault_num1 / fault_num1.sum()
    g1 = sns.clustermap(ratio,
                        cmap=plt.get_cmap('RdBu'),
                        vmax=1,
                        vmin=-1,
                        linewidth=0,
                        figsize=(10, 10),
                        row_cluster=False,
                        col_cluster=False)
    plt.title('fault ratio')
# clustering function: merge categories whose fault-ratio profiles are similar
def cluster_encoding(name):
    global data
    fault_num = data.groupby(['FAULT_TYPE_3', name])[data.columns[0]].count().unstack()
    MAN_ratio = (fault_num / fault_num.sum()).fillna(0)  # missing (fault, category) pairs are zero counts
    MAN_ratio_T = MAN_ratio.T
    clusters = fclusterdata(np.array(MAN_ratio_T), 1)
    print(clusters.shape)
    clusters_mapping = {label: idx for label, idx in zip(MAN_ratio.columns, clusters)}
    data[name] = data[name].map(clusters_mapping)
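# Usage sketch (not executed in this run; the distance threshold of 1 passed to
# fclusterdata above is an assumption, using scipy's default 'inconsistent'
# criterion). A high-cardinality column could be collapsed before one-hot
# encoding like so:
# cluster_encoding('MANUFACTURER')
# print(len(data['MANUFACTURER'].value_counts()))  # far fewer cluster labels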
# encoding function: label-encode a categorical column in place
def onehot_pre(name):
    global data
    le = preprocessing.LabelEncoder()
    le.fit(data[name])
    cat_name = list(le.classes_)
    data[name] = le.transform(data[name])
    return cat_name
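# Usage sketch (not called in this script): label-encode a column in place and
# keep the original category names for later inspection, e.g.:
# trade_names = onehot_pre('TRADE_CODE')
# print(trade_names[:5])  # first few original category codes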
# plot
if PLOT:
    plot_fun('SYNC_ORG_NO', 'FAULT_TYPE_3')
# get_dummies
# SYNC_ORG_dummies = pd.get_dummies(data['SYNC_ORG_NO'],prefix='SYNC_ORG_NO')
SYNC_ORG_dummies = coo_matrix(pd.get_dummies(data['SYNC_ORG_NO'],prefix='SYNC_ORG_NO'))
# # per-province fault statistics for SYNC_ORG_NO
# print data['SYNC_ORG_NO'].describe()
# #plot
# #get_dummies
# ORG_freq = data['SYNC_ORG_NO'].value_counts().index[data['SYNC_ORG_NO'].value_counts().values<100]
# # data['SYNC_ORG_NO'] = data['SYNC_ORG_NO'].replace(ORG_freq.values, 0)  # raises an error
# if PLOT:
#     plot_fun('SYNC_ORG_NO', 'FAULT_TYPE_3', figsize=(20, 6), fontsize=0.1)
#
# ORG_dummies = coo_matrix(pd.get_dummies(data['SYNC_ORG_NO']))  # convert to a sparse matrix
# ORG_dummies.drop(['33101', '33407', '33411'], axis=1, inplace=True)
# print ORG_dummies.shape
#
#SPEC_CODE
print(data['SPEC_CODE'].describe())
print(data['SPEC_CODE'].value_counts())
# if PLOT:
#     plot_fun('SPEC_CODE', 'FAULT_TYPE_3')
# spec_freq = data['SPEC_CODE'].value_counts().index[data['SPEC_CODE'].value_counts().values<100]
# #spec_mapping = {label:idx for label,idx in zip(spec_freq, np.zeros(len(spec_freq)))}
# print spec_freq.values
# data['SPEC_CODE'].value_counts()
# data['SPEC_CODE'] = data['SPEC_CODE'].replace(spec_freq.values, 0)
# print data['SPEC_CODE'].value_counts()
#get_dummies
# SPEC_dummies = pd.get_dummies(data['SPEC_CODE'],prefix='SPEC_CODE')
SPEC_dummies = coo_matrix(pd.get_dummies(data['SPEC_CODE'],prefix='SPEC_CODE'))
#MANUFACTURER
print(data['MANUFACTURER'].value_counts())
print(len(data['MANUFACTURER'].value_counts()))
# spec_freq = data['MANUFACTURER'].value_counts().index[data['MANUFACTURER'].value_counts().values<500]
# data['MANUFACTURER'] = data['MANUFACTURER'].replace(spec_freq.values, 0)
# print len(data['MANUFACTURER'].value_counts())
# # plot
# if PLOT:
#     plot_fun('MANUFACTURER', 'FAULT_TYPE_3', figsize=(20, 6), fontsize=1)
# print len(data['MANUFACTURER'].value_counts())
#get_dummies
# MAN_dummies = pd.get_dummies(data['MANUFACTURER'],prefix='MANUFACTURER')
MAN_dummies = coo_matrix(pd.get_dummies(data['MANUFACTURER'],prefix='MANUFACTURER'))
#COMM_MODE
print(data['COMM_MODE'].value_counts())
# if PLOT:
#     plot_fun('COMM_MODE', 'FAULT_TYPE_3')
#
# COMM_freq = data['COMM_MODE'].value_counts().index[data['COMM_MODE'].value_counts().values<100]
# data['COMM_MODE'] = data['COMM_MODE'].replace(COMM_freq.values, 0)
# COMM_dummies = pd.get_dummies(data['COMM_MODE'], prefix='COMM_MODE')
COMM_dummies = coo_matrix(pd.get_dummies(data['COMM_MODE'],prefix='COMM_MODE'))  # convert to a sparse matrix
# fault month
data['FAULT_MONTH'] = pd.Categorical(data['FAULT_MONTH'], ordered=True)
if PLOT:
    m1 = data.groupby(['FAULT_MONTH', 'FAULT_TYPE_3']).size().unstack().reindex(
        index=np.arange(data.FAULT_MONTH.min(), data.FAULT_MONTH.max() + 1)).fillna(0)
    m1.plot(kind='bar', figsize=(12, 12), subplots=True)
    plot_fun('FAULT_MONTH', 'FAULT_TYPE_3', fontsize=1)
#get_dummies
# FAUMON_dummies = pd.get_dummies(data['FAULT_MONTH'],prefix='FAULT_MONTH')
FAUMON_dummies = coo_matrix(pd.get_dummies(data['FAULT_MONTH'],prefix='FAULT_MONTH'))
# installation month
data['INST_MONTH'] = pd.Categorical(data['INST_MONTH'], ordered=True)
if PLOT:
    m1 = data.groupby(['INST_MONTH', 'FAULT_TYPE_3']).size().unstack().reindex(
        index=np.arange(data.INST_MONTH.min(), data.INST_MONTH.max() + 1)).fillna(0)
    m1.plot(kind='bar', figsize=(12, 12), subplots=True)
    plot_fun('INST_MONTH', 'FAULT_TYPE_3', fontsize=1)
#get_dummies
# INSMON_dummies = pd.get_dummies(data['INST_MONTH'],prefix='INST_MONTH')
INSMON_dummies = coo_matrix(pd.get_dummies(data['INST_MONTH'],prefix='INST_MONTH'))
# storage time (months in stock)
# # extract the number; negative values become NaN
# data['save_months'] = data['save_months'].str.extract('(?P<num>\d+)')
if PLOT:
    c1 = data.groupby(['save_months']).size()
    c1.plot(kind='bar', figsize=(12, 6))
    c2 = data.groupby(['save_months', 'FAULT_TYPE_3']).size().unstack().reindex(
        index=np.arange(data.save_months.min(), data.save_months.max() + 1)).fillna(0)
    c2.plot(kind='bar', figsize=(12, 12), subplots=True)
    c3 = data.groupby(['save_months', 'SYNC_ORG_NO']).size().unstack().reindex(
        index=np.arange(data.save_months.min(), data.save_months.max() + 1)).fillna(0)
    c3.plot(kind='bar', figsize=(12, 12), subplots=True)
# # normalization
# min_max_scaler = preprocessing.MinMaxScaler()
# data['save_months'] = min_max_scaler.fit_transform(data['save_months'])
#
# save_dummies = pd.get_dummies(data['save_months'],prefix='save_months')
save_dummies = coo_matrix(pd.get_dummies(data['save_months'],prefix='save_months'))
# working duration (months in service)
# # extract the number; negative values become NaN
# data['work_months'] = data['work_months'].str.extract('(?P<num>\d+)')
if PLOT:
    c1 = data.groupby(['work_months']).size()
    c1.plot(kind='bar', figsize=(12, 6))
    c2 = data.groupby(['work_months', 'FAULT_TYPE_3']).size().unstack().reindex(
        index=np.arange(data.work_months.min(), data.work_months.max() + 1)).fillna(0)
    c2.plot(kind='bar', figsize=(12, 12), subplots=True)
    c3 = data.groupby(['work_months', 'SYNC_ORG_NO']).size().unstack().reindex(
        index=np.arange(data.work_months.min(), data.work_months.max() + 1)).fillna(0)
    c3.plot(kind='bar', figsize=(12, 12), subplots=True)
# normalization
# min_max_scaler = preprocessing.MinMaxScaler()
# data['work_months'] = min_max_scaler.fit_transform(data['work_months'])
# work_dummies = pd.get_dummies(data['work_months'],prefix='work_months')
work_dummies = coo_matrix(pd.get_dummies(data['work_months'],prefix='work_months'))
#ARRIVE_BATCH_NO
print(len(data['ARRIVE_BATCH_NO'].value_counts()))
#cluster_encoding('ARRIVE_BATCH_NO')
arr_freq = data['ARRIVE_BATCH_NO'].value_counts().index[data['ARRIVE_BATCH_NO'].value_counts().values<300]
data['ARRIVE_BATCH_NO'] = data['ARRIVE_BATCH_NO'].replace(arr_freq.values, 0)
print(len(data['ARRIVE_BATCH_NO'].value_counts()))
#plot
if PLOT:
    plot_fun('ARRIVE_BATCH_NO', 'FAULT_TYPE_3', figsize=(20, 6), fontsize=1)
#get_dummies
# arr_dummies = pd.get_dummies(data['ARRIVE_BATCH_NO'],prefix='ARRIVE_BATCH_NO')
arr_dummies = coo_matrix(pd.get_dummies(data['ARRIVE_BATCH_NO'],prefix='ARRIVE_BATCH_NO'))
print(data['ARRIVE_BATCH_NO'].value_counts())
# TYPE_CODE
# TYPE_dummies = pd.get_dummies(data['TYPE_CODE'],prefix='TYPE_CODE')
TYPE_dummies = coo_matrix(pd.get_dummies(data['TYPE_CODE'],prefix='TYPE_CODE'))
# EXCHG_TYPE_CODE
# EXCHG_dummies = pd.get_dummies(data['EXCHG_TYPE_CODE'],prefix='EXCHG_TYPE_CODE')
EXCHG_dummies = coo_matrix(pd.get_dummies(data['EXCHG_TYPE_CODE'],prefix='EXCHG_TYPE_CODE'))
# LC_FLAG
# LC_dummies = pd.get_dummies(data['LC_FLAG'],prefix='LC_FLAG')
LC_dummies = coo_matrix(pd.get_dummies(data['LC_FLAG'],prefix='LC_FLAG'))
# TL_SHARE_FLAG
# TL_dummies = pd.get_dummies(data['TL_SHARE_FLAG'],prefix='TL_SHARE_FLAG')
TL_dummies = coo_matrix(pd.get_dummies(data['TL_SHARE_FLAG'],prefix='TL_SHARE_FLAG'))
# MP_CAP
# MP_dummies = pd.get_dummies(data['MP_CAP'],prefix='MP_CAP')
MP_dummies = coo_matrix(pd.get_dummies(data['MP_CAP'],prefix='MP_CAP'))
# TRADE_CODE
# TRADE_dummies = pd.get_dummies(data['TRADE_CODE'],prefix='TRADE_CODE')
TRADE_dummies = coo_matrix(pd.get_dummies(data['TRADE_CODE'],prefix='TRADE_CODE'))
# ELEC_TYPE_CODE
# ELEC_dummies = pd.get_dummies(data['ELEC_TYPE_CODE'],prefix='ELEC_TYPE_CODE')
ELEC_dummies = coo_matrix(pd.get_dummies(data['ELEC_TYPE_CODE'],prefix='ELEC_TYPE_CODE'))
# RUN_CAP
# RUN_dummies = pd.get_dummies(data['RUN_CAP'],prefix='RUN_CAP')
RUN_dummies = coo_matrix(pd.get_dummies(data['RUN_CAP'],prefix='RUN_CAP'))
# CUST_STATUS_CODE
# CUST_dummies = pd.get_dummies(data['CUST_STATUS_CODE'],prefix='CUST_STATUS_CODE')
CUST_dummies = coo_matrix(pd.get_dummies(data['CUST_STATUS_CODE'],prefix='CUST_STATUS_CODE'))
# TRANSFER_CODE
# TRANSFER_dummies = pd.get_dummies(data['TRANSFER_CODE'],prefix='TRANSFER_CODE')
TRANSFER_dummies = coo_matrix(pd.get_dummies(data['TRANSFER_CODE'],prefix='TRANSFER_CODE'))
# TMP_FLAG
# TMP_dummies = pd.get_dummies(data['TMP_FLAG'],prefix='TMP_FLAG')
TMP_dummies = coo_matrix(pd.get_dummies(data['TMP_FLAG'],prefix='TMP_FLAG'))
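# Note: a single-row bmat([[A, B, ...]]) horizontally stacks sparse blocks with
# the same row count, concatenating their columns. Toy sketch of the semantics:
# a = coo_matrix([[1, 0]]); b = coo_matrix([[0, 1, 1]])
# bmat([[a, b]]).toarray()  # -> array([[1, 0, 0, 1, 1]])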
# combine the dummy blocks with the frame
data = data.join(pd.DataFrame(bmat([[SYNC_ORG_dummies, SPEC_dummies, COMM_dummies, work_dummies,
                                     save_dummies, INSMON_dummies,
                                     FAUMON_dummies, arr_dummies, MAN_dummies, TYPE_dummies,
                                     EXCHG_dummies, LC_dummies, TL_dummies, MP_dummies, TRADE_dummies,
                                     ELEC_dummies, RUN_dummies, CUST_dummies, TRANSFER_dummies, TMP_dummies]]).toarray()))
data.drop(['PS_MONTH', 'ID', 'SYNC_ORG_NO', 'SPEC_CODE', 'COMM_MODE', 'work_months', 'save_months',
           'INST_MONTH', 'FAULT_MONTH', 'ARRIVE_BATCH_NO', 'MANUFACTURER',
           'TYPE_CODE', 'EXCHG_TYPE_CODE', 'LC_FLAG',
           'TL_SHARE_FLAG', 'MP_CAP', 'TRADE_CODE', 'ELEC_TYPE_CODE', 'RUN_CAP',
           'CUST_STATUS_CODE', 'TRANSFER_CODE', 'TMP_FLAG'], axis=1, inplace=True)
del SYNC_ORG_dummies, SPEC_dummies, COMM_dummies, work_dummies, save_dummies, \
    INSMON_dummies, FAUMON_dummies, arr_dummies, MAN_dummies, TYPE_dummies, \
    EXCHG_dummies, LC_dummies, TL_dummies, MP_dummies, TRADE_dummies, ELEC_dummies, \
    RUN_dummies, CUST_dummies, TRANSFER_dummies, TMP_dummies
print(data)
# fault prediction with machine-learning models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV
import pickle
from scipy.sparse import csc_matrix
data_X = data.drop(['FAULT_TYPE_3'], axis=1)
data_y = data['FAULT_TYPE_3']
data_X = csc_matrix(data_X)
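# CSC is a compressed sparse column format; scikit-learn's splitter and xgboost
# both accept scipy sparse input, so the one-hot matrix stays sparse end to end.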
#encode label
le = preprocessing.LabelEncoder()
data_y = le.fit_transform(data_y)
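# LabelEncoder maps fault labels to integers 0..n_classes-1; predictions are
# decoded with le.inverse_transform further below. Round-trip sketch:
# assert (le.inverse_transform(data_y[:5]) == data['FAULT_TYPE_3'].values[:5]).all()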
'''
data_X1 = coo_matrix(data_X.iloc[:200000])
data_X2 = coo_matrix(data_X.iloc[200000:400000])
data_X3 = coo_matrix(data_X.iloc[400000:])
data_X4 = bmat([[data_X1], [data_X2], [data_X3]], format='coo')
'''
#del data_X1, data_X2, data_X3
train, test, train_y, test_y = train_test_split(data_X, data_y, test_size=0.33, random_state=27)
#XGBoost
import xgboost as xgb
TRAIN = True  # whether to train (otherwise load a pickled model)
CV = False
# build DMatrix from the training split only, keeping the test split held out
dtrain = xgb.DMatrix(train, train_y)
dtest = xgb.DMatrix(test)
clf = xgb.XGBClassifier(
    learning_rate=0.2,
    n_estimators=720,
    max_depth=9,
    colsample_bytree=0.8,
    subsample=0.9,
    objective='multi:softprob',
    min_child_weight=1,
    gamma=2,
    seed=27
)
param = clf.get_xgb_params()
param['num_class'] = 11
if CV:
    cvresult = xgb.cv(param, dtrain, num_boost_round=2000, nfold=3, stratified=True,
                      metrics='merror', early_stopping_rounds=10, verbose_eval=True)
    clf.set_params(n_estimators=cvresult.shape[0])  # set n_estimators to the best cv round count
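# Sketch: cvresult is a DataFrame with one row per completed boosting round, and
# early stopping truncates it at the best round, e.g.:
# print(cvresult.tail(1)[['test-merror-mean', 'test-merror-std']])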
if TRAIN:
    clf.fit(train, train_y, eval_metric='merror')
else:
    clf = pickle.load(open("zhejiang_4_all.pkl", "rb"))
ypred_xgb = clf.predict(test)
ypred_xgb = le.inverse_transform(ypred_xgb)
test_y_xgb = le.inverse_transform(test_y)
# print model report:
print(classification_report(test_y_xgb, ypred_xgb))
print(confusion_matrix(test_y_xgb, ypred_xgb))
xgb.plot_importance(clf.get_booster())
plt.show()
pickle.dump(clf, open("zhejiang_4_all_jiaoliu.pkl", "wb"))
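# The pickled model can be restored later, mirroring the load branch above:
# clf = pickle.load(open("zhejiang_4_all_jiaoliu.pkl", "rb"))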
# #knn
# from sklearn import neighbors
#
# USE_GridSearch = False
# n_neighbors = 25
#
# clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
# if USE_GridSearch:
#     param_test1 = {'n_neighbors': range(20, 60, 10), 'weights': ['uniform', 'distance']}
#     gsearch1 = GridSearchCV(estimator=clf, param_grid=param_test1, scoring='accuracy', n_jobs=-1, cv=2, verbose=True)
#     gsearch1.fit(train, train_y)
#     print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)
#     clf = gsearch1
# else:
#     clf.fit(train, train_y)
# ypred_knn = clf.predict(test)
# ypred_knn = le.inverse_transform(ypred_knn)
# test_y_knn = le.inverse_transform(test_y)
# # print model report:
# print(classification_report(test_y_knn, ypred_knn))
# print(confusion_matrix(test_y_knn, ypred_knn))
# pickle.dump(clf, open("zhejiang_4_KNN.pkl", "wb"))