# 2022 钉钉杯数据挖掘A题 Python 源码
# 注:仅供学习使用,请勿用于商业用途、论文发表、竞赛支撑材料盗取等其他任何用途!!!谢谢
# 关于数据集可联系竞赛组委会或与本人联系:
# [email protected]
# 获取完整源码请与本人联系。
import collections
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools
import keras
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.model_selection import RandomizedSearchCV,cross_val_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler,RobustScaler
from scipy.stats import norm
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers.core import Dense
from keras.optimizers import adam_v2
from keras.metrics import categorical_crossentropy
# Load the transaction dataset and report missing values and class balance.
df = pd.read_csv('card_transdata.csv')
df.head()
print('共计有:', df.isnull().sum().max(), '个数据缺失')
df.describe()
# BUG FIX: number_all was referenced below but never defined (NameError).
number_all = len(df)
number_fraud = len(df[df['fraud'] == 1])
number_notfraud = number_all - number_fraud
print('总交易笔数:', number_all, '笔')
print('非诈骗交易笔数:', number_notfraud, '笔')
print('诈骗交易笔数:', number_fraud, '笔')
print('非诈骗交易比例:{:.4f}'.format(number_notfraud/number_all*100,), '%')
print('诈骗交易比例:{:.4f}'.format(number_fraud/number_all*100,), '%')
# Class-balance bar chart. FIX: seaborn >= 0.12 removed positional
# data-column arguments; the column must be passed as x=.
sns.countplot(x='fraud', data=df)
plt.title('type \n(0: not fraud||1: fraud)', fontsize=14)
# Scale the three heavy-tailed numeric features with RobustScaler
# (median/IQR based, so far less sensitive to extreme outliers than
# StandardScaler). FIX: removed the StandardScaler instance that was
# created but never used anywhere in the script.
rob_scaler = RobustScaler()
for _col in ('distance_from_home',
             'distance_from_last_transaction',
             'ratio_to_median_purchase_price'):
    df['scaled_' + _col] = rob_scaler.fit_transform(df[_col].values.reshape(-1, 1))
# Drop the raw, unscaled columns now that scaled versions exist.
df.drop(['distance_from_home', 'distance_from_last_transaction',
         'ratio_to_median_purchase_price'], axis=1, inplace=True)
print('Successfully using RobustScaler!')
# Move the scaled feature columns to the front of the frame (ratio first,
# then last-transaction distance, then home distance), then persist the
# transformed frame to CSV.
leading_cols = ['scaled_ratio_to_median_purchase_price',
                'scaled_distance_from_last_transaction',
                'scaled_distance_from_home']
other_cols = [c for c in df.columns if c not in leading_cols]
df = df[leading_cols + other_cols]
np.savetxt('df_after_robust.csv', df, delimiter=',')
# Print the share of each class, then separate features from the target.
fraud_counts = df['fraud'].value_counts()
n_rows = len(df)
print('合法交易占比', round(fraud_counts[0] / n_rows * 100, 2), '%')
print('欺诈交易占比', round(fraud_counts[1] / n_rows * 100, 2), '%')
X = df.drop('fraud', axis=1)
y = df['fraud']
# Stratified 5-fold split of the ORIGINAL (imbalanced) data.
# FIX: the loop body had lost its indentation, making the script a syntax
# error — structure restored. NOTE: only the arrays from the LAST fold are
# kept for the later evaluation, matching the original intent.
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    train_index = np.array(train_index)
    test_index = np.array(test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]
# Convert the retained fold to plain numpy arrays.
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values
# Verify that stratification preserved the class distribution in both splits.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)
print('标签分布比例————\n')
print('训练集:', train_counts_label / len(original_ytrain))
print('测试集:', test_counts_label / len(original_ytest))
# Random undersampling: shuffle, then take exactly as many legitimate
# transactions as there are fraudulent ones to build a 50/50 sub-sample.
df = df.sample(frac=1)
fraud_df = df.loc[df['fraud'] == 1]
# GENERALIZED: match the fraud count instead of the hard-coded 87403, so the
# block works for any dataset size.
not_fraud_df = df.loc[df['fraud'] == 0][:len(fraud_df)]
normal_distributed_df = pd.concat([fraud_df, not_fraud_df])
# Re-shuffle so the two classes are interleaved.
new_df = normal_distributed_df.sample(frac=1, random_state=42)
new_df.head()
np.savetxt('df_4231.csv', new_df, delimiter=',')
print('子样本数据集中的类别分布')
print(new_df['fraud'].value_counts() / len(new_df))
# FIX: seaborn >= 0.12 requires the keyword form x='fraud'.
sns.countplot(x='fraud', data=new_df)
plt.title('balance the data size in diff matrix', fontsize=14)
plt.show()
# Correlation heatmaps: the full imbalanced data (top, for contrast only)
# versus the balanced sub-sample (bottom, the one to actually read).
fig, (ax_full, ax_sub) = plt.subplots(2, 1, sharex=True, figsize=(16, 10))
sns.heatmap(df.corr(), cmap='coolwarm_r', annot_kws={'size': 20}, ax=ax_full)
ax_full.set_title('不平衡数据关联矩阵\n(不作为参考)', fontsize=14)
sns.heatmap(new_df.corr(), cmap='coolwarm_r', annot_kws={'size': 20}, ax=ax_sub)
ax_sub.set_title('数据子集关联矩阵\n(参考)', fontsize=14)
plt.show()
# Boxplots of each scaled feature split by the fraud label.
f, axes = plt.subplots(ncols=2, figsize=(16, 4))
sns.boxplot(x='fraud', y='scaled_distance_from_last_transaction', data=new_df, ax=axes[0])
# FIX: the title previously said 'used_pin_number' but the plotted feature
# is distance_from_last_transaction (copy-paste mislabel).
axes[0].set_title('distance_from_last_transaction与交易类型的负相关联性')
f, axes = plt.subplots(ncols=2, figsize=(16, 4))
sns.boxplot(x='fraud', y='scaled_ratio_to_median_purchase_price', data=new_df, ax=axes[0])
axes[0].set_title('ratio_to_median_purchase_price与交易类型的负相关联性')
sns.boxplot(x='fraud', y='scaled_distance_from_home', data=new_df, ax=axes[1])
axes[1].set_title('distance_from_home与交易类型的负相关联性')
# Distribution of each scaled feature over fraudulent transactions only,
# with a fitted normal curve for visual comparison.
f, dist_axes = plt.subplots(1, 3, figsize=(20, 6))
dist_specs = [
    ('scaled_distance_from_home', [-2, 12.5], '#FB8861',
     'distance_from_home 数据分布\n(欺诈交易)'),
    ('scaled_distance_from_last_transaction', [-2, 5], '#56F9BB',
     'distance_from_last_transaction 数据分布\n(欺诈交易)'),
    ('scaled_ratio_to_median_purchase_price', [-2, 8], '#C5B3F9',
     'ratio_to_median_purchase_price 数据分布\n(欺诈交易)'),
]
for ax, (col, xlim, color, title) in zip(dist_axes, dist_specs):
    fraud_values = new_df[col].loc[new_df['fraud'] == 1].values
    ax.set_xlim(xlim)
    sns.distplot(fraud_values, ax=ax, fit=norm, color=color)
    ax.set_title(title, fontsize=14)
plt.show()
# IQR-based outlier removal for scaled_distance_from_home: the fences are
# computed from the fraud class only, then rows of the whole sub-sample
# falling outside the fences are dropped.
distance_from_home_fraud = new_df['scaled_distance_from_home'].loc[new_df['fraud'] == 1].values
q25, q75 = np.percentile(distance_from_home_fraud, [25, 75])
print('25% 四分位数:{} | 75% 四分位数:{}'.format(q25, q75))
home_iqr = q75 - q25
print('IQR:{}'.format(home_iqr))
home_cut_off = home_iqr * 1.5
home_lower = q25 - home_cut_off
home_upper = q75 + home_cut_off
print('范围:{}'.format(home_cut_off))
print('distance_from_home下阈值:{}'.format(home_lower))
print('distance_from_home上阈值:{}'.format(home_upper))
outliers = [v for v in distance_from_home_fraud if v < home_lower or v > home_upper]
print('欺诈交易中 distance_from_home 的离群值数量:{}'.format(len(outliers)))
out_of_bounds = ((new_df['scaled_distance_from_home'] > home_upper)
                 | (new_df['scaled_distance_from_home'] < home_lower))
new_df = new_df.drop(new_df[out_of_bounds].index)
print('离群值移除后的样本数:{}'.format(len(new_df)))
print('----' * 44)
# Same IQR fence procedure for scaled_distance_from_last_transaction.
lt_col = 'scaled_distance_from_last_transaction'
lt_fraud_values = new_df[lt_col].loc[new_df['fraud'] == 1].values
q25, q75 = np.percentile(lt_fraud_values, [25, 75])
lt_cut_off = (q75 - q25) * 1.5
lt_lower = q25 - lt_cut_off
lt_upper = q75 + lt_cut_off
print('distance_from_last_transaction 下阈值:{}'.format(lt_lower))
print('distance_from_last_transaction 上阈值:{}'.format(lt_upper))
outliers = [v for v in lt_fraud_values if v < lt_lower or v > lt_upper]
print('欺诈交易中 distance_from_last_transaction 的离群值数量:{}'.format(len(outliers)))
new_df = new_df.drop(new_df[(new_df[lt_col] > lt_upper) | (new_df[lt_col] < lt_lower)].index)
print('离群值移除后的样本数:{}'.format(len(new_df)))
print('----' * 44)
# Same IQR fence procedure for scaled_ratio_to_median_purchase_price.
ratio_col = 'scaled_ratio_to_median_purchase_price'
ratio_fraud_values = new_df[ratio_col].loc[new_df['fraud'] == 1].values
q25, q75 = np.percentile(ratio_fraud_values, [25, 75])
ratio_cut_off = (q75 - q25) * 1.5
ratio_lower = q25 - ratio_cut_off
ratio_upper = q75 + ratio_cut_off
print('ratio_to_median_purchase_price 下阈值:{}'.format(ratio_lower))
print('ratio_to_median_purchase_price 上阈值:{}'.format(ratio_upper))
outliers = [v for v in ratio_fraud_values if v < ratio_lower or v > ratio_upper]
print('欺诈交易中 ratio_to_median_purchase_price 的离群值数量:{}'.format(len(outliers)))
new_df = new_df.drop(new_df[(new_df[ratio_col] > ratio_upper) | (new_df[ratio_col] < ratio_lower)].index)
print('离群值移除后的样本数:{}'.format(len(new_df)))
print('----' * 44)
# Boxplots after outlier removal, with an annotation on each axis pointing
# at the reduced extreme-outlier tail.
f, box_axes = plt.subplots(1, 3, figsize=(20, 6))
box_specs = [
    ('scaled_distance_from_home',
     'distance_from_home 特征\n离群值移除', (0.98, -17.5)),
    ('scaled_distance_from_last_transaction',
     'distance_from_last_transaction 特征\n离群值移除', (0.98, -17.3)),
    ('scaled_ratio_to_median_purchase_price',
     'ratio_to_median_purchase_price 特征\n离群值移除', (0.98, -14.3)),
]
for ax, (col, title, arrow_xy) in zip(box_axes, box_specs):
    sns.boxplot(x='fraud', y=col, data=new_df, ax=ax)
    ax.set_title(title, fontsize=14)
    ax.annotate('降低极端\n离群值数量', xy=arrow_xy, xytext=(0, -12),
                arrowprops=dict(facecolor='black'), fontsize=14)
plt.show()
new_df.head()
print(new_df.shape)
# 90/10 train/test split on the balanced, outlier-cleaned sub-sample;
# the classifiers below consume plain numpy arrays.
X = new_df.drop('fraud', axis=1)
y = new_df['fraud']
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = (part.values for part in split_parts)
# Baseline comparison of four classifiers via 5-fold cross-validation.
# FIX: the for-loop body had lost its indentation (syntax error) — restored.
classifiers = {
    '逻辑回归': LogisticRegression(),
    'K 近邻': KNeighborsClassifier(),
    '支持向量机': SVC(),
    '决策树': DecisionTreeClassifier(),
}
for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    # FIX: round AFTER converting to percent; round(mean, 2) * 100 discarded
    # the decimals (e.g. printed 95.0 instead of 94.87). Matches the later
    # cross-validation report format.
    print('分类器', classifier.__class__.__name__, '准确率',
          round(training_score.mean() * 100, 2), '%')
# Grid-search logistic regression over penalty type and inverse
# regularization strength C.
# FIX: the default lbfgs solver does not support penalty='l1'; liblinear
# handles both l1 and l2, so every grid cell is actually fit instead of
# erroring out.
log_reg_params = {"penalty": ["l1", "l2"], "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_log_reg = GridSearchCV(LogisticRegression(solver='liblinear'), log_reg_params)
grid_log_reg.fit(X_train, y_train)
log_reg = grid_log_reg.best_estimator_
# Grid-search k-nearest-neighbours over k (2..4) and the neighbour-search
# algorithm; keep the best refitted estimator.
knears_params = {
    "n_neighbors": [2, 3, 4],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
}
grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
grid_knears.fit(X_train, y_train)
knears_neighbors = grid_knears.best_estimator_
# Grid-search the SVM over the regularization constant C and kernel type;
# keep the best refitted estimator.
svc_params = {
    "C": [0.5, 0.7, 0.9, 1],
    "kernel": ["rbf", "poly", "sigmoid", "linear"],
}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train, y_train)
svc = grid_svc.best_estimator_
# Grid-search the decision tree over split criterion, depth (2..3) and
# minimum leaf size (5..6); keep the best refitted estimator.
tree_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [2, 3],
    "min_samples_leaf": [5, 6],
}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train, y_train)
tree_clf = grid_tree.best_estimator_
# Fit an unconstrained decision tree on the training set and render it.
plt.rcParams["font.sans-serif"] = ["SimHei"]  # font that can draw the Chinese title
plt.figure(figsize=(40, 20))
clf = DecisionTreeClassifier().fit(X_train, y_train)
plot_tree(clf, filled=True)
plt.title("使用训练集训练决策树", fontsize=30)
plt.show()
# Report the 5-fold cross-validation accuracy of each tuned estimator.
tuned_models = [
    ('逻辑回归', log_reg),
    ('k 近邻', knears_neighbors),
    ('支持向量机', svc),
    ('决策树', tree_clf),
]
for label, model in tuned_models:
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(label + ' 交叉验证得分:', round(cv_scores.mean() * 100, 2).astype(str) + '%')