Agglomerative Clustering

import pandas as pd
import numpy as np
import random
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
import matplotlib.pyplot as plt

# Synthetic data: one column per page name, each holding 10000 random 0/1 draws.
pages = ['page_{n}'.format(n=idx) for idx in range(20)]

page_data = pd.DataFrame(
    {
        name: [random.randrange(0, 2, 1) for _ in range(10000)]
        for name in pages
    },
    index=list(range(10000)),
)

# Ward-linkage agglomerative model asked for two clusters.
ag = AgglomerativeClustering(linkage='ward', n_clusters=2)

# Fit on the first 200 rows only.
page_data_n = page_data.iloc[:200]
ag.fit(page_data_n)

# Reference dendrogram built directly by SciPy from the same 200 rows.
plt.figure(figsize=(15, 10))
ward_links = shc.linkage(page_data_n, method='ward')
dend = shc.dendrogram(ward_links)

# Build a SciPy-style linkage matrix from a fitted sklearn hierarchical model
# and draw its dendrogram.
def _merged_sample_counts(children):
    """Return, for every merge row in *children*, how many original samples it contains.

    Child ids below ``n_samples`` (= number of merges + 1) are leaves and
    count as one sample; larger ids refer to the cluster created by merge
    ``id - n_samples``, whose count has already been computed.
    """
    n_samples = children.shape[0] + 1
    counts = np.zeros(children.shape[0])
    for merge_idx, pair in enumerate(children):
        total = 0
        for child in pair:
            if child < n_samples:
                total += 1
            else:
                total += counts[child - n_samples]
        counts[merge_idx] = total
    return counts


def plot_dendrogram(self, y_top, y_bot, **kwargs):
    """Plot the dendrogram of a fitted hierarchical clustering model.

    Parameters
    ----------
    self : fitted estimator exposing ``children_`` (and optionally
        ``distances_``), e.g. the AgglomerativeClustering model used in
        this script.
    y_top, y_bot : upper and lower limits applied to the y axis.
    **kwargs : forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.
    """
    children = self.children_

    # Use the real merge distances when the model carries them; otherwise
    # fall back to the merge order (0, 1, 2, ...) as a uniform height.
    distance = getattr(self, 'distances_', None)
    if distance is None:
        distance = np.arange(children.shape[0])

    # The fourth linkage column must be the true number of samples under each
    # merged node; truncated dendrograms print it as the leaf label.
    counts = _merged_sample_counts(children)

    linkage_matrix = np.column_stack([children, distance, counts]).astype(float)

    # The new axes become the current axes, so dendrogram() draws onto them.
    _, ax = plt.subplots(figsize=(15, 10))
    shc.dendrogram(linkage_matrix, **kwargs)
    ax.set_ylim(y_bot, y_top)
    plt.show()

# Draw the fitted model's dendrogram, truncated to the last 100 merged clusters.
plot_dendrogram(
    ag,
    y_top=210,
    y_bot=100,
    truncate_mode='lastp',
    p=100,
    color_threshold=180,
)

你可能感兴趣的:(model)