Kmeans 银行数据聚类分析

K-Means 聚类分析:银行数据分析记录

调用的包

import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor

# Load the raw bank data.
# The input is an Excel workbook (.xlsx), which read_csv cannot parse,
# so it is read with read_excel instead.
df = pd.read_excel('data.xlsx')
  1. 数据预处理

①数据缺失

# Rows whose age is strictly positive are treated as valid; their mean is
# used as the replacement value for invalid (zero or negative) ages.
a = df[df['年龄'] > 0]
b = a['年龄']
# Replace every non-positive age in one vectorised step. The element-wise
# chained assignment df['年龄'][i] = ... is not guaranteed to write back to
# df and relied on the index being 0..n-1; mask() has neither problem.
# NaN ages are left untouched here because NaN <= 0 is False.
df['年龄'] = df['年龄'].mask(df['年龄'] <= 0, b.mean())

②异常值处理

# Missing ages take the mean of b (the positive ages selected earlier).
mean_age = b.mean()
df['年龄'] = df['年龄'].fillna(mean_age)
# Occupation: missing entries and the placeholder values 9999 / '@'
# are all mapped to 0.
df['职业'] = df['职业'].fillna(0).replace([9999, '@'], 0)
  2. 特征相关性分析
# Columns included in the correlation study.
list2 = ['age', 'money_type', 'career', 'custo_type', 'balance', 'loan', 'pos', 'alipay', 'qqpay', 'jingdong', 'cloud', 'jin']
# NOTE(review): df1 is not defined in this snippet; presumably it is the
# preprocessed frame with English column names - confirm against the caller.
df2 = df1[list2]
# Pairwise correlation matrix of the selected columns.
data_corr = df2.corr()
# Draw the correlation matrix itself. The previous version passed an
# unrelated `data` object here, so data_corr was computed but never shown.
sns.heatmap(data_corr, square=True, linewidth=0.5, annot=True)
plt.show()

Kmeans 银行数据聚类分析_第1张图片

  3. 画每一个特征的箱型图,展示其最大值、最小值、中位数及上下四分位数。
# Box plot of the single 'age' feature.
sns.boxplot(df1['age'])
# plt.show must be called; a bare `plt.show` only references the function
# and never displays the figure.
plt.show()

对多个特征同时绘制箱型图:本数据共有 12 个特征,因此按 3×4 的子图布局逐一绘制,过程如下:

# Draw one box plot per feature on a 3x4 grid (12 subplots in total).
# Subplot positions are 1-based, so position k shows column k - 1 of df1.
# The earlier nested-loop version used k as a counter without ever
# initialising it, which raised NameError on the first iteration.
for k in range(1, 13):
    plt.subplot(3, 4, k)
    sns.boxplot(df1.iloc[:, k - 1], orient='vertical')
# Display the grid, as the other plotting snippets in this file do.
plt.show()

Kmeans 银行数据聚类分析_第2张图片
4. Kmeans 聚类分析
改变K值进行聚类分析

def K_means(k, data):
    """Cluster ``data`` into ``k`` groups with KMeans.

    Args:
        k: Number of clusters to fit.
        data: DataFrame of features to cluster. Its index and column
            names are reused for the labelled result.

    Returns:
        A ``(centers, r)`` tuple. ``centers`` is the fitted model's
        ``cluster_centers_`` array; ``r`` is ``data`` with one extra
        column, '聚类类别', holding the cluster label of each row.
    """
    # The n_jobs argument was deprecated in scikit-learn 0.23 and removed
    # in 1.0, where passing it raises TypeError. It is therefore omitted.
    kmodel = KMeans(n_clusters=k)
    kmodel.fit(data)
    # Align the labels with data's index so concat pairs rows correctly.
    labels = pd.Series(kmodel.labels_, index=data.index)
    r = pd.concat([data, labels], axis=1)
    r.columns = list(data.columns) + ['聚类类别']
    centers = kmodel.cluster_centers_
    return centers, r
# One colour per cluster index; ten entries cover the largest k tried below.
colors = ['r', 'c', 'b', 'g', 'm', 'gold', 'steelblue', 'crimson', 'navy', 'forestgreen']
plt.figure()
for k in range(2, 8):
    centers, r = K_means(k, data)
    # NOTE(review): list1 is defined outside this snippet; presumably it is
    # the list of feature column names - confirm.
    features = r[list1]
    labels = r['聚类类别']
    # Fit a random forest that predicts the cluster label from the features,
    # then report its out-of-bag score and per-feature importances.
    forest = RandomForestRegressor(oob_score=True)
    forest.fit(features, labels)
    print(k)     # k=5 looks relatively suitable
    print(forest.oob_score_)
    for importance in forest.feature_importances_:
        print(importance)

    # k runs over 2..7, so k - 1 fills the six cells of the 2x3 grid.
    plt.subplot(2, 3, k - 1)
    for j, color in enumerate(colors[:k]):
        # Positional row numbers of the members of cluster j.
        rows = np.where(labels == j)[0]
        cluster = data.iloc[rows]
        # Scatter column 0 against column 4, then mark the cluster centre.
        plt.scatter(cluster.iloc[:, 0], cluster.iloc[:, 4], c=color, marker='.')
        plt.plot(centers[j][0], centers[j][4], 'o',
                 markerfacecolor=color, markeredgecolor='k', markersize=8)
plt.show()

不同K值的聚类效果
Kmeans 银行数据聚类分析_第3张图片
5. 聚类后的特征重要性分析,使用随机森林

# Palette indexed by cluster id.
colors = ['r', 'c', 'b', 'g', 'm', 'gold', 'steelblue', 'crimson', 'navy', 'forestgreen']
plt.figure()
# Subplot positions 1..6 pair with k = 2..7.
for position, k in enumerate(range(2, 8), start=1):
    centers, r = K_means(k, data)
    cluster_ids = r['聚类类别']

    # Random forest trained to predict the cluster label; its out-of-bag
    # score and feature importances are printed for each k.
    # NOTE(review): list1 comes from outside this snippet - presumably the
    # feature column names; confirm.
    rf = RandomForestRegressor(oob_score=True)
    rf.fit(r[list1], cluster_ids)
    print(k)
    print(rf.oob_score_)
    for weight in rf.feature_importances_:
        print(weight)

    plt.subplot(2, 3, position)
    for j in range(k):
        shade = colors[j]
        center = centers[j]
        # Select the rows assigned to cluster j by position.
        members = data.iloc[np.flatnonzero((cluster_ids == j).values)]
        xs = members.iloc[:, 0]
        ys = members.iloc[:, 4]
        plt.scatter(xs, ys, c=shade, marker='.')
        # Overlay the cluster centre using the same two columns (0 and 4).
        plt.plot(center[0], center[4], 'o',
                 markerfacecolor=shade, markeredgecolor='k', markersize=8)
plt.show()

对比不同 k 值下随机森林的 OOB 得分和聚类散点图,即可判断最合适的 k 值(本例中 k=5 相对合适)。

你可能感兴趣的:(Python,数据分析)