An example of clustering the handwritten digits dataset

 

 

from time import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.datasets import load_digits



# Load the digits dataset and standardize each feature (zero mean, unit variance)
X_digits, y_digits = load_digits(return_X_y=True)

data = scale(X_digits)
n_samples, n_features = data.shape
n_digits = len(np.unique(y_digits))  # 10 distinct digit classes
labels = y_digits
print(labels)
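# Quick sanity check (illustrative, not part of the original example):
# scale() centered every feature, and every non-constant feature now has
# unit variance; a few always-blank border pixels in the digits images are
# constant and keep variance 0.
print(np.allclose(data.mean(axis=0), 0))         # True
print(np.unique(np.round(data.std(axis=0), 6)))  # roughly [0. 1.]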

sample_size = 300  # subsample size used for the silhouette score

print(82 * '-')
print('init\ttime\tinertia\thomo\tcompl\tv_meas\tARI\tAMI\tsilhouette')

def bench_k_means(estimator, name, data):
    """Fit the estimator and print timing, inertia and clustering metrics."""
    t0 = time()
    estimator.fit(data)
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
    


bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name='k-means++', data=data)

bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name='random', data=data)

# Here the seeding of the centers is deterministic: we use the first
# n_digits principal components as the initial centroids, so we run
# k-means only once (n_init=1).
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name='pca_based', data=data)
print(82*'-')
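# Side note: on scikit-learn 0.20 every adjusted_mutual_info_score call above
# emits a FutureWarning, because AMI's default average_method changes to
# 'arithmetic' in version 0.22. Passing the argument explicitly (available
# since scikit-learn 0.20) opts in early and silences the warning; the AMI
# value then differs slightly from the figures recorded below.
est = KMeans(n_clusters=n_digits, n_init=10).fit(data)
print(metrics.adjusted_mutual_info_score(labels, est.labels_,
                                         average_method='arithmetic'))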


# Visualize the results on PCA-reduced data. Note that we re-fit K-Means on
# the 2-D projection here, so these clusters are not identical to the
# full-dimensional runs benchmarked above.
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh; decrease it to raise the quality of the plot
h = 0.02

# Plot the decision boundary: evaluate the model on every point of a mesh
# covering the data, with a small margin around it.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))


# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
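# The np.c_[xx.ravel(), yy.ravel()] idiom above flattens the two coordinate
# grids and stacks them column-wise into an (n_points, 2) array, one row per
# mesh point, which is the input shape predict() expects. A toy illustration:
gx, gy = np.meshgrid(np.arange(3), np.arange(2))  # gx.shape == gy.shape == (2, 3)
pts = np.c_[gx.ravel(), gy.ravel()]               # pts.shape == (6, 2)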

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
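# One caveat: KMeans initialization is randomized, so the metrics and the
# figure vary slightly between runs. For a reproducible plot, fix the seed
# via the standard random_state parameter (42 here is an arbitrary choice):
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10,
                random_state=42)
kmeans.fit(reduced_data)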

 

 

----------------------------------------------------------------------------------
init	time	inertia	homo	compl	v_meas	ARI	AMI	silhouette
k-means++	0.19s	69477	0.606	0.654	0.629	0.475	0.602	0.137
random   	0.20s	69528	0.606	0.656	0.630	0.478	0.602	0.129
pca_based	0.03s	69673	0.681	0.720	0.700	0.570	0.677	0.136
----------------------------------------------------------------------------------

[Figure: K-means clustering on the digits dataset (PCA-reduced data); centroids are marked with white crosses]
