k-means 选择K的demo

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/10/12 下午5:29
# @Author  : liuchengwei
# @Site    : 
# @File    : demo.py
# @Software: PyCharm

import sys

# Python 2 only: switch the interpreter's default encoding to UTF-8 so that
# implicit str <-> unicode conversions of the Chinese text handled by this
# script do not fail with ASCII codec errors. Nothing happens on Python 3.
# The major version is taken from sys.version_info instead of being parsed
# out of the human-readable sys.version string.
if sys.version_info[0] == 2:
    # reload(sys) is the usual Python 2 trick to make setdefaultencoding
    # callable again after interpreter start-up.
    reload(sys)  # noqa: F821 -- `reload` is a builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

from gensim.models import word2vec
from sklearn.cluster import MiniBatchKMeans,KMeans
from sklearn import metrics
import pandas as pd
import numpy as np

import codecs, jieba


# Build the word2vec training corpus: read the raw novel, split it into
# sentences on the Chinese full stop, segment every sentence with jieba and
# write one "<index>\t<space-separated tokens>" record per line.
idx = 0  # running index of the sentences written so far
filepath = "xiaoaojianghu_jinyong.txt"
outpath = "train_data/"+filepath+".w2vinp"
with codecs.open(filename="file/"+filepath, mode="r", encoding="utf8") as r,\
    codecs.open(filename=outpath, mode="w", encoding="utf8") as w:
    for line in r:
        # str.replace matches its argument literally, so the two curly quote
        # characters are removed one at a time (the former pattern '“|”'
        # only matched that exact three-character sequence).
        line = line.replace('“', "").replace('”', "").replace("\n", "。")
        for line2 in line.split("。"):
            # Skip fragments shorter than three characters.
            if len(line2) < 3:
                continue
            record = "\t".join([str(idx), " ".join(jieba.cut(line2))])
            # Terminate every record: without the newline the last token of
            # one sentence ran straight into the index of the next one.
            w.write(record + "\n")
            idx += 1


sentences = word2vec.Text8Corpus(outpath)  # load the segmented corpus

# Train 128-dimensional word vectors; every other hyper-parameter is left at
# the library default. The embedding-size keyword is `vector_size` in
# gensim >= 4.0 and `size` in older releases, so try the new name first.
# NOTE(review): the original comment called this a skip-gram model, but `sg`
# is not passed and gensim documents CBOW as the default -- confirm intent.
try:
    model = word2vec.Word2Vec(sentences, vector_size=128)
except TypeError:
    # Older gensim rejects the unknown keyword before any training starts.
    model = word2vec.Word2Vec(sentences, size=128)

# Persist the trained vectors so they can be reused later.
outp1 = "train_data/w2v.model"  # written via model.wv.save
outp2 = 'train_data/w2v.zh.text.vector'  # word2vec format, binary=False

model.wv.save(outp1)
model.wv.save_word2vec_format(outp2, binary=False)

outp2 = 'train_data/w2v.zh.text.vector'

# Collect one list of floats per entry of the space-separated vector file.
X = []
with codecs.open(filename=outp2, mode="r",encoding="utf8") as r:
    for raw in r:
        fields = raw.strip().split(" ")
        # Lines with fewer than three fields are ignored
        # (presumably the header line -- verify against the file format).
        if len(fields) >= 3:
            # fields[0] is the word itself; the remaining fields are parsed
            # as its vector components.
            X.append([float(component) for component in fields[1:]])


X = np.array(X)
res_dict = {}  # str(k) -> [Calinski-Harabasz score, silhouette score]

# scikit-learn 0.20 renamed the misspelled `calinski_harabaz_score` to
# `calinski_harabasz_score` and 0.23 removed the old name; use whichever
# one the installed version provides.
_calinski_fn = getattr(metrics, "calinski_harabasz_score", None)
if _calinski_fn is None:
    _calinski_fn = metrics.calinski_harabaz_score

# Try every k from 2 up to 199. Both metrics require fewer clusters than
# samples, so k is additionally kept below len(X).
for k in range(2, min(200, len(X))):
    y_pred = KMeans(n_clusters=k).fit_predict(X)
    calinski_harabaz_score = _calinski_fn(X, y_pred)  # higher is better
    silhouette_score = metrics.silhouette_score(X, y_pred, metric='euclidean')  # in [-1, 1], higher is better
    res_dict[str(k)] = [calinski_harabaz_score, silhouette_score]

# sorted(res_dict.items(),lambda x:x[1])
# res_dict = {'2': [2966.380199316429, 0.6042266793168797], '3': [2619.6067449764823, 0.588576649249035], '4': [2213.683159880981, 0.5352884051405044], '5': [1994.1547524909254, 0.4480853412205973], '6': [1811.0972549137832, 0.4526014815999876], '7': [1696.6817463544555, 0.4121975479343535], '8': [1583.783855831214, 0.4018322927314881], '9': [1478.5664730783246, 0.3886410167755743], '10': [1388.9308942483235, 0.37559763166656546], '11': [1313.3060331283746, 0.3190552792351522], '12': [1254.2412668294585, 0.38007110319322923], '13': [1203.3055375307688, 0.3643819158404515], '14': [1162.199851276613, 0.3033368659824741], '15': [1096.3600544283636, 0.2696483694474599], '16': [1061.5667137605162, 0.29030902304212264], '17': [1042.5680189848822, 0.2735821742291549], '18': [1008.1465829942655, 0.29207646714611646], '19': [971.2642340637611, 0.2925509472738477], '20': [936.0171179482027, 0.23394088243958144], '21': [930.1568748682379, 0.23235113371028024], '22': [901.3136337436365, 0.25094526751306], '23': [881.9364782491148, 0.19401622622990544], '24': [870.670835889951, 0.24700172445760588], '25': [860.6383129403824, 0.19770773157122434], '26': [850.3370163931321, 0.20386016872844068], '27': [831.944076467707, 0.22333896244577386], '28': [820.5342262559137, 0.2311296534670078], '29': [804.0113288379034, 0.18262897549892665], '30': [797.8597807842392, 0.21123303875704993], '31': [785.3822277682488, 0.22693783046142355], '32': [773.0208881712138, 0.17420473390928062], '33': [763.3940162500764, 0.20055341195501508], '34': [757.6997580750447, 0.17765346775023583], '35': [749.1378248035995, 0.20103424776594278], '36': [735.977156705786, 0.19728783089254662], '37': [731.7978667431086, 0.196973204116796], '38': [713.1932847401716, 0.19066132611503536], '39': [711.9085256394585, 0.1623016886916113], '40': [703.6287614387686, 0.1332441547191878], '41': [701.9191031872563, 0.17608702008998076], '42': [684.0837722779012, 0.18041554694939893], '43': [679.5066085697628, 
# 0.1704518847932796], '44': [681.4633935814237, 0.15642713940900624], '45': [676.5610442577347, 0.1793155600021589], '46': [678.4206412455399, 0.1810217820726726], '47': [669.6772399479161, 0.17905855108536675], '48': [660.2450952184029, 0.18957058972833285], '49': [658.9824721756778, 0.1386746715743496]}
# Rank every candidate k by each metric, largest score first, then print
# the two rankings as lists of (k, [calinski, silhouette]) pairs.
ranked_by_calinski = sorted(res_dict.items(), key=lambda item: item[1][0], reverse=True)
ranked_by_silhouette = sorted(res_dict.items(), key=lambda item: item[1][1], reverse=True)
print("calinski_harabaz_score:", ranked_by_calinski)
print("silhouette_score:", ranked_by_silhouette)



# Nothing extra happens when the file is run as a script: the statements
# above already execute at import time.
if __name__ == '__main__':
    pass

你可能感兴趣的:(k-means 选择K的demo)