%matplotlib inline
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# Synthetic 2-feature dataset: 500 samples generated around 4 centers with a fixed seed.
# `y` holds the generating center index of each sample and is only used for coloring below.
x, y = make_blobs(n_samples=500, n_features=2, centers=4, random_state=1)
# Palette indexed by label / cluster number in the scatter plots that follow.
color = ["red", "green", "blue", "black"]
# Scatter the generated samples, one palette color per generating label.
fig, ax1 = plt.subplots(1)
for i in range(4):
    members = x[y == i]
    ax1.scatter(members[:, 0], members[:, 1], marker="o", s=8, color=color[i])
plt.show()
from sklearn.cluster import KMeans
# Fit K-Means with 3 clusters (the data was generated with 4 centers).
n_clusters = 3
cluster = KMeans(n_clusters=n_clusters, random_state=0).fit(x)
# labels_ is the cluster index assigned to each sample seen during fit.
y_pred = cluster.labels_
y_pred[:20]
array([2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 2, 2, 0, 1, 0, 2, 0, 2, 1, 0], dtype=int32)
# fit_predict fits the same estimator on x again and returns the labels in one call.
pre = cluster.fit_predict(x)
pre[:20]
array([2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 2, 2, 0, 1, 0, 2, 0, 2, 1, 0], dtype=int32)
# Element-wise agreement between the two label arrays; only the first 20 entries are shown.
(pre == y_pred)[:20]
array([ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])
# Fit on the first 200 samples only, then use predict to label every sample in x.
cluster_smallsub = KMeans(n_clusters=n_clusters, random_state=0).fit(x[:200])
y_pred_ = cluster_smallsub.predict(x)
# Compare with the labels from the full-data fit; only the first 20 comparisons are displayed.
(y_pred == y_pred_)[:20]
array([ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])
# Coordinates of the fitted cluster centers, one row per cluster.
centroid = cluster.cluster_centers_
centroid
array([[-8.09286791, -3.50997357], [-1.54234022, 4.43517599], [-7.0877462 , -8.08923534]])
# (number of clusters, number of features)
centroid.shape
(3, 2)
# inertia_: sum of squared distances from each sample to its closest cluster center.
inertia = cluster.inertia_
inertia
1903.5342237665059
# Plot every predicted cluster in its palette color, then overlay the fitted centroids.
fig, ax1 = plt.subplots(1)
for i in range(n_clusters):
    in_cluster = y_pred == i
    ax1.scatter(x[in_cluster, 0], x[in_cluster, 1], marker='o', s=8, c=color[i])
# Centroids are drawn once, after all clusters, as black crosses.
ax1.scatter(centroid[:, 0], centroid[:, 1], marker='x', s=15, c="black")
plt.show()
# Refit with 4 clusters and display the resulting inertia.
# Rebinds the notebook-level n_clusters and cluster_ names.
n_clusters = 4
cluster_ = KMeans(n_clusters=n_clusters, random_state=0).fit(x)
inertia_ = cluster_.inertia_
inertia_
908.3855684760603
# Refit with 5 clusters and display the resulting inertia.
# Rebinds the notebook-level n_clusters and cluster_ names.
n_clusters = 5
cluster_ = KMeans(n_clusters=n_clusters, random_state=0).fit(x)
inertia_ = cluster_.inertia_
inertia_
811.0952123653016
# Refit with 6 clusters and display the resulting inertia.
# Rebinds the notebook-level n_clusters and cluster_ names; cluster_ stays bound to this
# 6-cluster model until reassigned.
n_clusters = 6
cluster_ = KMeans(n_clusters=n_clusters, random_state=0).fit(x)
inertia_ = cluster_.inertia_
inertia_
728.2827697678249
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
# Mean silhouette coefficient for the 3-cluster labels stored in y_pred.
silhouette_score(x, y_pred)
0.5882004012129721
# Mean silhouette coefficient for whichever model cluster_ currently refers to;
# when the notebook is run top to bottom that is the n_clusters = 6 fit.
silhouette_score(x, cluster_.labels_)
0.4532882033128698
# Per-sample silhouette coefficients for the y_pred labels; first 20 shown.
silhouette_samples(x, y_pred)[:20]
array([0.62982017, 0.5034877 , 0.56148795, 0.84881844, 0.56034142, 0.78740319, 0.39254042, 0.4424015 , 0.48582704, 0.41586457, 0.62497924, 0.75540751, 0.50080674, 0.8452256 , 0.54730432, 0.60232423, 0.54574988, 0.68789747, 0.86605921, 0.25389678])
from time import time
from sklearn.metrics import calinski_harabasz_score
# Wall-clock timing of one calinski_harabasz_score call.
# The score itself is discarded; the cell displays only the elapsed seconds.
t0 = time()
calinski_harabasz_score(x, y_pred)
time() - t0
0.0010917186737060547
# Wall-clock timing of one silhouette_score call on the same inputs, for comparison.
# The score itself is discarded; the cell displays only the elapsed seconds.
t0 = time()
silhouette_score(x, y_pred)
time() - t0
0.012775182723999023
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
n_clusters = 4
# Two side-by-side axes: ax1 receives the silhouette plot, ax2 the clustered scatter.
fig, (ax1, ax2) = plt.subplots(1, 2)
# Canvas size
fig.set_size_inches(18, 7)
# Horizontal range used for the silhouette coefficient axis.
ax1.set_xlim([-0.1, 1])
# Vertical room: one unit per sample plus 10 units of padding around each cluster band.
ax1.set_ylim([0, x.shape[0] + (n_clusters + 1) * 10])
(0.0, 550.0)
# Fit the model whose labels drive both panels of the figure, and report its mean silhouette.
clusterer = KMeans(n_clusters=n_clusters, random_state=10).fit(x)
cluster_labels = clusterer.labels_
silhouette_avg = silhouette_score(x, cluster_labels)
print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)
For n_clusters = 4 The average silhouette_score is : 0.6505186632729437
# One silhouette coefficient per sample, aligned with cluster_labels; first 20 shown.
sample_silhouette_values = silhouette_samples(x, cluster_labels)
sample_silhouette_values[:20]
array([0.62903385, 0.43289576, 0.55834047, 0.82660742, 0.35213124, 0.74123252, 0.68902347, 0.58705868, 0.04062548, 0.73241492, 0.59363669, 0.75135825, 0.66326503, 0.81480193, 0.45066007, 0.59477448, 0.10348453, 0.66633309, 0.84176332, 0.6089521 ])
# Draw one horizontal band per cluster on ax1: the cluster's per-sample silhouette
# values sorted ascending, stacked vertically with a 10-unit gap between bands.
y_lower = 10
for i in range(n_clusters):
    # Boolean-mask indexing yields a new array, so the in-place sort below does not
    # reorder sample_silhouette_values itself.
    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
    ith_cluster_silhouette_values.sort()
    size_cluster_i = ith_cluster_silhouette_values.shape[0]  # number of samples in this cluster
    y_upper = y_lower + size_cluster_i
    # Dedicated name for the band color: reusing `color` here would overwrite the
    # notebook-level palette list that the earlier scatter cells index with color[i].
    band_color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        ith_cluster_silhouette_values,
        facecolor=band_color,
        alpha=0.7,
    )
    # Cluster number, vertically centered on its band, just left of x = 0.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    # Next band starts 10 units above the top of this one.
    y_lower = y_upper + 10
# Titles and axis labels for the silhouette panel.
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# Dashed vertical line at the mean silhouette score over all samples.
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
# The y positions are only stacking offsets, so no y ticks are shown.
ax1.set_yticks([])
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
[<matplotlib.axis.XTick at 0x7f28541efe80>, <matplotlib.axis.XTick at 0x7f28541efa58>, <matplotlib.axis.XTick at 0x7f28541c2908>, <matplotlib.axis.XTick at 0x7f28541d0b00>, <matplotlib.axis.XTick at 0x7f28541d0f98>, <matplotlib.axis.XTick at 0x7f28541dc470>, <matplotlib.axis.XTick at 0x7f28541ef6a0>]
# Map each sample's cluster label to the colormap position label / n_clusters,
# the same mapping used for the silhouette bands.
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
# Right panel: all samples, colored by assigned cluster.
ax2.scatter(x[:, 0], x[:, 1]
,marker='o'
,s=8
,c=colors
)
# Overlay the fitted cluster centers as large crosses on the right panel.
centers = clusterer.cluster_centers_
# Explicit red so the markers do not fall back to the default cycle color and
# match the per-k figures drawn in the loop further down the notebook.
ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c="red", alpha=1, s=200)
<matplotlib.collections.PathCollection at 0x7f2854131048>
# Title and axis labels for the scatter panel.
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")
# Put the overall title on `fig` itself. plt.suptitle acts on pyplot's *current*
# figure, which in the recorded run was a fresh empty figure with 0 Axes rather
# than `fig`, so the two-panel figure never received its title.
fig.suptitle(("Silhouette analysis for KMeans clustering on sample data "
              "with n_clusters = %d" % n_clusters),
             fontsize=14, fontweight='bold')
plt.show()
<Figure size 432x288 with 0 Axes>
# Display the assembled two-panel figure as the cell's output.
fig
# For each candidate k, fit K-Means and draw a two-panel figure:
# left = per-cluster silhouette bands, right = samples colored by cluster with centers.
for j in [2, 3, 4, 5, 6, 7]:
    n_clusters = j
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # One unit per sample plus 10 units of padding around each cluster band.
    ax1.set_ylim([0, x.shape[0] + (n_clusters + 1) * 10])

    clusterer = KMeans(n_clusters=n_clusters, random_state=10).fit(x)
    cluster_labels = clusterer.labels_
    silhouette_avg = silhouette_score(x, cluster_labels)
    print("for n_clustsers={}, The average silhouette_score is {}".format(n_clusters, silhouette_avg))
    # Must be recomputed for this k and stored under the name the inner loop reads;
    # otherwise the bands are built from values left over from an earlier cell.
    sample_silhouette_values = silhouette_samples(x, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Mask indexing returns a new array, so sorting it leaves the full array untouched.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # Separate name so the notebook-level `color` palette list is not overwritten.
        band_color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            ith_cluster_silhouette_values,
            facecolor=band_color,
            alpha=0.7,
        )
        # Cluster number just left of x = 0, centered on its band.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Dashed vertical line at the mean silhouette score for this k.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Scale labels into [0, 1) before the colormap lookup. Passing n_clusters as a
    # second positional argument would be taken as alpha and leave raw labels >= 1
    # all mapped to the same out-of-range color.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(x[:, 0], x[:, 1], marker='o', s=8, c=colors)

    centers = clusterer.cluster_centers_
    # Mark the cluster centers with large red crosses.
    ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c="red", alpha=1, s=200)

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    fig.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    plt.show()
for n_clustsers=2, The average silhouette_score is 0.7049787496083262
for n_clustsers=3, The average silhouette_score is 0.5882004012129721
for n_clustsers=4, The average silhouette_score is 0.6505186632729437
for n_clustsers=5, The average silhouette_score is 0.5745566973301872
for n_clustsers=6, The average silhouette_score is 0.4387644975296138
for n_clustsers=7, The average silhouette_score is 0.3728615111052895
# Default initialisation (no `init` argument), seeded so the iteration count is
# reproducible and comparable with the init="random" run, which uses the same seed.
plus = KMeans(n_clusters=10, random_state=420).fit(x)
# Number of iterations the fit actually ran.
plus.n_iter_
13
# Same number of clusters, but with init="random" and a fixed seed.
# NOTE(review): the name `random` would shadow the stdlib module if it were imported in this notebook.
random = KMeans(n_clusters=10, init="random", random_state=420).fit(x)
# Number of iterations the fit actually ran.
random.n_iter_
11
# Same configuration as the init="random" baseline (10 clusters, seed 420) with the
# iteration budget capped at 10, so that max_iter is the only setting that differs.
random = KMeans(n_clusters=10, init="random", max_iter=10, random_state=420).fit(x)
y_pred_max10 = random.labels_
# Mean silhouette coefficient of the labels obtained under the 10-iteration cap.
silhouette_score(x, y_pred_max10)
0.3354172005203983
# Same configuration as the init="random" baseline (10 clusters, seed 420) with the
# iteration budget raised to 20, so that max_iter is the only setting that differs.
random = KMeans(n_clusters=10, init="random", max_iter=20, random_state=420).fit(x)
y_pred_max20 = random.labels_
# Mean silhouette coefficient of the labels obtained under the 20-iteration cap.
silhouette_score(x, y_pred_max20)
0.33138458218481653