%matplotlib inline

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Synthetic data set: 500 two-feature samples generated around 4 blob centres.
# `x` receives the sample coordinates, `y` the generating blob index per sample.
x, y = make_blobs(
    n_samples=500,
    n_features=2,
    centers=4,
    random_state=1,
)


# One colour per generating blob; this palette is indexed again by later cells.
color = ["red", "green", "blue", "black"]
fig, ax1 = plt.subplots(1)
# Scatter each blob separately so that it gets its own colour.
for blob_id, blob_color in enumerate(color):
    blob_points = x[y == blob_id]
    ax1.scatter(blob_points[:, 0], blob_points[:, 1], marker="o", s=8, color=blob_color)
plt.show()


from sklearn.cluster import KMeans

# Number of clusters requested for the first model.
n_clusters = 3

# Fit KMeans on the full data set; the fixed random_state pins the initialisation.
cluster = KMeans(
    n_clusters=n_clusters,
    random_state=0,
).fit(x)


# Cluster index assigned to every training sample by the fitted model.
y_pred = cluster.labels_
# Show only the first 20 labels instead of the whole array.
y_pred[:20]

array([2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 2, 2, 0, 1, 0, 2, 0, 2, 1, 0],
      dtype=int32)


# fit_predict fits `cluster` on x again and returns the resulting labels in one call.
pre = cluster.fit_predict(x)
# Show only the first 20 labels for comparison with y_pred.
pre[:20]

array([2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 2, 2, 0, 1, 0, 2, 0, 2, 1, 0],
      dtype=int32)


# Element-wise comparison of the two label arrays; only the first 20 results are displayed.
(pre == y_pred)[:20]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])


# Second model with the same settings, fitted on the first 200 samples only.
cluster_smallsub = KMeans(
    n_clusters=n_clusters,
    random_state=0,
).fit(x[:200])


# Assign every sample in x to the nearest centroid of the model fitted on 200 samples.
y_pred_ = cluster_smallsub.predict(x)
# Compare against the labels of the model fitted on all samples (first 20 only).
# NOTE(review): cluster ids from two separately fitted models need not be numbered
# the same way, so equality here also depends on both fits ordering clusters alike.
(y_pred == y_pred_)[:20]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])


# Coordinates of the fitted cluster centres, one row per cluster.
centroid = cluster.cluster_centers_


# Display the centre coordinates.
centroid

array([[-8.09286791, -3.50997357],
       [-1.54234022,  4.43517599],
       [-7.0877462 , -8.08923534]])


# Shape of the centre array: (number of clusters, number of features).
centroid.shape

(3, 2)


# Inertia reported by the fitted 3-cluster model.
inertia = cluster.inertia_


# Display the value.
inertia

1903.5342237665059


# Scatter the samples coloured by predicted cluster, then overlay the centroids.
fig, ax1 = plt.subplots(1)
for i in range(n_clusters):
    in_cluster = y_pred == i  # boolean mask of the samples assigned to cluster i
    ax1.scatter(x[in_cluster, 0], x[in_cluster, 1], marker='o', s=8, c=color[i])
# Centroids are drawn last so the black crosses sit on top of the points.
ax1.scatter(centroid[:, 0], centroid[:, 1], marker='x', s=15, c="black")
plt.show()


# Refit with 4 clusters and display the resulting inertia.
n_clusters = 4
cluster_ = KMeans(n_clusters=n_clusters, random_state=0)
cluster_.fit(x)
inertia_ = cluster_.inertia_
inertia_

908.3855684760603


# Refit with 5 clusters and display the resulting inertia.
n_clusters = 5
cluster_ = KMeans(n_clusters=n_clusters, random_state=0)
cluster_.fit(x)
inertia_ = cluster_.inertia_
inertia_

811.0952123653016


# Refit with 6 clusters and display the resulting inertia.
# `cluster_` keeps this 6-cluster model for the cells that follow.
n_clusters = 6
cluster_ = KMeans(n_clusters=n_clusters, random_state=0)
cluster_.fit(x)
inertia_ = cluster_.inertia_
inertia_

728.2827697678249


from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score


# Mean silhouette coefficient for the labels of the 3-cluster model (`y_pred`).
silhouette_score(x, y_pred)

0.5882004012129721


# Mean silhouette coefficient for `cluster_`, which at this point is the model
# most recently fitted above with n_clusters = 6.
silhouette_score(x, cluster_.labels_)

0.4532882033128698


# Per-sample silhouette coefficients for the 3-cluster labels; first 20 shown.
silhouette_samples(x, y_pred)[:20]

array([0.62982017, 0.5034877 , 0.56148795, 0.84881844, 0.56034142,
       0.78740319, 0.39254042, 0.4424015 , 0.48582704, 0.41586457,
       0.62497924, 0.75540751, 0.50080674, 0.8452256 , 0.54730432,
       0.60232423, 0.54574988, 0.68789747, 0.86605921, 0.25389678])


from time import time
from sklearn.metrics import calinski_harabasz_score

# Wall-clock timing of one Calinski-Harabasz evaluation.
t0 = time()
# The score itself is discarded; only the elapsed time is of interest here.
calinski_harabasz_score(x, y_pred)
# Elapsed seconds, displayed as the cell result.
time() - t0

0.0010917186737060547


# Wall-clock timing of one silhouette_score evaluation on the same labels.
t0 = time()
# The score itself is discarded; only the elapsed time is of interest here.
silhouette_score(x, y_pred)
# Elapsed seconds, displayed as the cell result.
time() - t0

0.012775182723999023


from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd


# Walk through the silhouette analysis once for 4 clusters.
n_clusters = 4
# Left axes: silhouette plot. Right axes: scatter of the clustered data.
fig, (ax1, ax2) = plt.subplots(1, 2)


# Canvas size in inches (width, height).
fig.set_size_inches(18, 7)


# Horizontal range drawn for the silhouette coefficients.
ax1.set_xlim(-0.1, 1)
# Vertical room for every sample plus a 10-row gap around each cluster band.
ax1.set_ylim(0, x.shape[0] + (n_clusters + 1) * 10)

(0.0, 550.0)


# Fit the model that the silhouette walkthrough analyses.
clusterer = KMeans(
    n_clusters=n_clusters,
    random_state=10,
).fit(x)


# Cluster index of every sample under this model.
cluster_labels = clusterer.labels_


# Mean silhouette coefficient over all samples.
silhouette_avg = silhouette_score(x, cluster_labels)


print(
    "For n_clusters =", n_clusters,
    "The average silhouette_score is :", silhouette_avg,
)

For n_clusters = 4 The average silhouette_score is : 0.6505186632729437


# Silhouette coefficient of every individual sample under `cluster_labels`.
sample_silhouette_values = silhouette_samples(x, cluster_labels)
# Show only the first 20 values.
sample_silhouette_values[:20]

array([0.62903385, 0.43289576, 0.55834047, 0.82660742, 0.35213124,
       0.74123252, 0.68902347, 0.58705868, 0.04062548, 0.73241492,
       0.59363669, 0.75135825, 0.66326503, 0.81480193, 0.45066007,
       0.59477448, 0.10348453, 0.66633309, 0.84176332, 0.6089521 ])


# Draw one filled silhouette band per cluster on ax1, stacked from the bottom
# up with a 10-row gap between neighbouring bands.
y_lower = 10
for i in range(n_clusters):
    # Boolean-mask indexing returns a copy, so the in-place sort below does not
    # reorder sample_silhouette_values itself.
    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
    ith_cluster_silhouette_values.sort()
    size_cluster_i = ith_cluster_silhouette_values.shape[0]  # number of samples in this cluster
    y_upper = y_lower + size_cluster_i
    # Use a dedicated name for the band colour: rebinding the global `color`
    # here would overwrite the palette list that earlier plotting cells index.
    band_color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper)
                      ,ith_cluster_silhouette_values
                      ,facecolor=band_color
                      ,alpha=0.7
                     )
    # Write the cluster index next to the middle of its band.
    ax1.text(-0.05
             ,y_lower + 0.5 * size_cluster_i
             ,str(i)
            )
    # The next band starts 10 rows above the top of this one.
    y_lower = y_upper + 10


# Title and axis labels of the silhouette panel, set in a single call.
ax1.set(
    title="The silhouette plot for the various clusters.",
    xlabel="The silhouette coefficient values",
    ylabel="Cluster label",
)
# Dashed red line marking the mean silhouette coefficient.
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
# The y axis only stacks the bands, so its ticks are removed.
ax1.set_yticks([])
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

[<matplotlib.axis.XTick at 0x7f28541efe80>,
 <matplotlib.axis.XTick at 0x7f28541efa58>,
 <matplotlib.axis.XTick at 0x7f28541c2908>,
 <matplotlib.axis.XTick at 0x7f28541d0b00>,
 <matplotlib.axis.XTick at 0x7f28541d0f98>,
 <matplotlib.axis.XTick at 0x7f28541dc470>,
 <matplotlib.axis.XTick at 0x7f28541ef6a0>]


# Map every sample's cluster index onto the nipy_spectral colormap.
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)

# Right panel: all samples, coloured by their assigned cluster.
ax2.scatter(x[:, 0], x[:, 1], marker='o', s=8, c=colors)
# Overlay the fitted cluster centres as large crosses.
centers = clusterer.cluster_centers_
ax2.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, alpha=1)

<matplotlib.collections.PathCollection at 0x7f2854131048>


# Label the scatter panel.
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")
# Put the overall title on `fig` itself. plt.suptitle() targets pyplot's
# *current* figure, which is no longer `fig` once the figure was created in an
# earlier cell, so the title ended up on a new, empty figure instead.
fig.suptitle(("Silhouette analysis for KMeans clustering on sample data "
              "with n_clusters = %d" % n_clusters),
             fontsize=14, fontweight='bold')
plt.show()

<Figure size 432x288 with 0 Axes>

# Display the two-panel figure assembled over the preceding cells.
fig


# Repeat the silhouette analysis for several candidate cluster counts. Each
# iteration fits a fresh KMeans model, prints its mean silhouette score and
# draws a two-panel figure: silhouette bands (left) and clustered data (right).
for j in [2, 3, 4, 5, 6, 7]:
    n_clusters = j
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # Room for every sample plus a 10-row gap around each cluster band.
    ax1.set_ylim([0, x.shape[0] + (n_clusters + 1) * 10])
    clusterer = KMeans(n_clusters=n_clusters, random_state=10).fit(x)
    cluster_labels = clusterer.labels_
    silhouette_avg = silhouette_score(x, cluster_labels)
    print("for n_clustsers={}, The average silhouette_score is {}".format(n_clusters, silhouette_avg))
    # Per-sample silhouettes for THIS clustering. The name must match the one
    # read in the inner loop; a misspelled assignment here made every figure
    # reuse stale values computed for an earlier model.
    sample_silhouette_values = silhouette_samples(x, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Boolean-mask indexing returns a copy, so sorting it in place leaves
        # sample_silhouette_values untouched.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # Dedicated name so the global `color` palette list is not overwritten.
        band_color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), ith_cluster_silhouette_values,
                          facecolor=band_color, alpha=0.7)
        # Cluster index written to the left of the band, at the same offset as
        # in the single-k walkthrough this loop generalises.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Normalise the cluster index into [0, 1) before the colormap lookup.
    # Passing n_clusters as a second argument would be interpreted as alpha
    # and leave the raw indices unscaled, collapsing most clusters onto one colour.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(x[:, 0], x[:, 1], marker='o', s=8, c=colors)
    centers = clusterer.cluster_centers_
    # Mark the cluster centres with large red crosses.
    ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c="red", alpha=1, s=200)

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    # Title the figure object directly rather than pyplot's current figure.
    fig.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

    plt.show()

for n_clustsers=2, The average silhouette_score is 0.7049787496083262

for n_clustsers=3, The average silhouette_score is 0.5882004012129721

for n_clustsers=4, The average silhouette_score is 0.6505186632729437

for n_clustsers=5, The average silhouette_score is 0.5745566973301872

for n_clustsers=6, The average silhouette_score is 0.4387644975296138

for n_clustsers=7, The average silhouette_score is 0.3728615111052895


# Default initialisation with 10 clusters. The seed is pinned (same value as
# the random-initialisation run it is compared with) so that the iteration
# count shown below is reproducible across kernel restarts.
plus = KMeans(n_clusters=10, random_state=420).fit(x)
# Number of iterations the fit ran.
plus.n_iter_

13


# Same cluster count, but with purely random initial centroids.
random = KMeans(
    n_clusters=10,
    init="random",
    random_state=420,
).fit(x)
# Number of iterations the fit ran.
random.n_iter_

11


# Cap the fit at 10 iterations and score the resulting labels.
# NOTE(review): unlike the neighbouring cells this call does not pass
# init="random", although the result is stored in `random` — confirm whether
# the default initialisation is intended here.
random = KMeans(n_clusters=10, max_iter=10, random_state=420).fit(x)
y_pred_max10 = random.labels_
silhouette_score(x, y_pred_max10)

0.3354172005203983


# Cap the fit at 20 iterations with random initialisation and score the labels.
# NOTE(review): this run uses n_clusters=50 while the max_iter=10 run above
# uses n_clusters=10, so the two scores differ in more than max_iter — confirm
# whether 50 is intended.
random = KMeans(n_clusters=50, init="random", max_iter=20, random_state=420).fit(x)
y_pred_max20 = random.labels_
silhouette_score(x, y_pred_max20)

0.33138458218481653

轮廓系数¶

标签未知的另一个指标: 卡林斯基-哈拉巴斯¶

用轮廓系数找 n_clusters¶

绘制轮廓系数分布图和聚类后的数据分布图来选择我们的最佳n_clusters¶

以 k = 4 为例走一遍流程¶

将 n_clusters = 4 的流程写成循环¶

init 参数¶