%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin
from sklearn.datasets import load_sample_image
from sklearn.utils import shuffle


# Load scikit-learn's sample image "china.jpg"; the result is used below as
# a NumPy array of pixel values.
china = load_sample_image("china.jpg")


# Inspect the element type of the raw pixel data.
china.dtype

dtype('uint8')


# Inspect the dimensions of the image array.
china.shape

(427, 640, 3)


# Peek at the first pixel (index 0 on the first two axes): its channel values.
china[0, 0]

array([174, 201, 231], dtype=uint8)


# Flatten the image into a 2-D matrix with one row per pixel and one column
# per channel. Passing -1 lets NumPy infer the pixel count, so the cell works
# for an image of any size instead of only the 427 x 640 sample.
newimage = china.reshape(-1, china.shape[-1])
newimage.shape

(273280, 3)


import pandas as pd


# Number of distinct colours: put the pixels in a table, drop repeated rows,
# and look at how many rows remain.
pixel_table = pd.DataFrame(newimage)
pixel_table.drop_duplicates().shape

(96615, 3)


# Visualize the image.
plt.figure(figsize=(15, 15))
plt.imshow(china) # pass the 3-D image array here, not the flattened pixel matrix

<matplotlib.image.AxesImage at 0x7fe7ea57c748>


# Number of clusters (colours) used for the quantization below.
n_clusters = 64

# Image preprocessing: the data is 3-D at this point and is flattened to a
# 2-D matrix further down.

# Normalization: convert to float64 and divide by the largest value in the
# image, so that every entry ends up in [0, 1].
peak_value = china.max()
china = china.astype(np.float64) / peak_value

# Keep the dimensions around so the flattened data can be reshaped back later.
# NOTE(review): w, h, d are simply the three axes of china.shape in order;
# axis 0 is presumably the image height, so `w`/`h` look swapped -- harmless
# as long as they are used consistently.
original_shape = tuple(china.shape)
w, h, d = original_shape
china[:3]

array([[[0.68235294, 0.78823529, 0.90588235],
        [0.68235294, 0.78823529, 0.90588235],
        [0.68235294, 0.78823529, 0.90588235],
        ...,
        [0.98039216, 0.98431373, 1.        ],
        [0.98039216, 0.98431373, 1.        ],
        [0.98039216, 0.98431373, 1.        ]],

       [[0.6745098 , 0.78039216, 0.89803922],
        [0.67843137, 0.78431373, 0.90196078],
        [0.67843137, 0.78431373, 0.90196078],
        ...,
        [0.98431373, 0.98823529, 1.        ],
        [0.98431373, 0.98823529, 1.        ],
        [0.98431373, 0.98823529, 1.        ]],

       [[0.68235294, 0.78823529, 0.90588235],
        [0.68235294, 0.78823529, 0.90588235],
        [0.68235294, 0.78823529, 0.90588235],
        ...,
        [0.98823529, 0.99215686, 1.        ],
        [0.98823529, 0.99215686, 1.        ],
        [0.98823529, 0.99215686, 1.        ]]])


# Check the container type of the image after normalization.
type(china)

numpy.ndarray


# Convert to a matrix: w * h rows (one per pixel) by d columns (one per channel).
image_array = china.reshape(w * h, d)
image_array.shape

(273280, 3)


# Use a random sample of 1000 pixels to find the centroids.
# n_samples asks shuffle for just those rows, so a shuffled copy of the whole
# image_array is no longer built only to be sliced down to its first 1000 rows.
image_array_sample = shuffle(image_array, random_state=0, n_samples=1000)


# Fit K-Means on the pixel sample.
# n_init is set explicitly: the library default for it changed in newer
# scikit-learn releases, so pinning it keeps the fitted centroids reproducible
# across versions and avoids the default-change warning.
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(image_array_sample)


# Show the first 20 fitted centroids (one row of channel values each).
kmeans.cluster_centers_[:20]

array([[0.11798806, 0.11884058, 0.07007673],
       [0.80874811, 0.82262443, 0.85671192],
       [0.47614379, 0.46895425, 0.27124183],
       [0.92831097, 0.95803234, 0.99566563],
       [0.52      , 0.5254902 , 0.39529412],
       [0.61895425, 0.67712418, 0.70816993],
       [0.31198257, 0.34030501, 0.18954248],
       [0.82923351, 0.90641711, 0.98743316],
       [0.80392157, 0.53006536, 0.3751634 ],
       [0.25202614, 0.23764706, 0.20104575],
       [0.03328773, 0.02836297, 0.01732786],
       [0.3454902 , 0.1854902 , 0.12470588],
       [0.52156863, 0.49150327, 0.52592593],
       [0.74457516, 0.83934641, 0.95045752],
       [0.72941176, 0.35764706, 0.23137255],
       [0.41470588, 0.44656863, 0.40980392],
       [0.96176471, 0.77058824, 0.63039216],
       [0.69019608, 0.74705882, 0.7605042 ],
       [0.57019608, 0.41098039, 0.34588235],
       [0.93630422, 0.93594771, 0.94913844]])


# Assign every pixel of the full image to one of the fitted clusters.
labels = kmeans.predict(image_array)


# One label per row of image_array.
labels.shape

(273280,)


# Replace every pixel with the centroid of the cluster it was assigned to.
# Indexing the centroid table with the label vector performs the lookup for
# all pixels in a single NumPy operation, instead of a Python-level loop over
# w * h rows writing into a copy of image_array.
image_kmeans = kmeans.cluster_centers_[labels]


# Count the distinct colours left after quantization (at most n_clusters).
pd.DataFrame(image_kmeans).drop_duplicates().shape

(64, 3)


# Fold the flat pixel matrix back into the original three-axis layout.
image_kmeans = np.reshape(image_kmeans, (w, h, d))
image_kmeans.shape

(427, 640, 3)


# Display the K-Means-quantized image on a new 15 x 15 figure.
plt.figure(figsize=(15, 15))
plt.imshow(image_kmeans)

<matplotlib.image.AxesImage at 0x7fe7ea573ac8>


# Random vector quantization: draw the centroids at random from the pixels.
# n_samples asks shuffle for just the n_clusters rows that are needed, rather
# than shuffling and copying the whole image_array and slicing afterwards.
centroid_random = shuffle(image_array, random_state=0, n_samples=n_clusters)
# With axis=0, pairwise_distances_argmin(X, Y) returns, for each row of Y,
# the index of the closest row of X -- here, the nearest random centroid for
# every pixel.
labels_random = pairwise_distances_argmin(centroid_random, image_array, axis=0)


# One label per row of image_array.
labels_random.shape

(273280,)


# How many of the random centroids are actually used by at least one pixel.
np.unique(labels_random).size

64


# Replace every pixel with its nearest random centroid.
# Indexing the centroid array with the label vector does the lookup for all
# pixels at once, instead of a Python-level loop over w * h rows writing into
# a copy of image_array.
image_random = centroid_random[labels_random]


# Fold the flat pixel matrix back into the original three-axis layout.
image_random = image_random.reshape(w, h, d)


# Display the randomly quantized image on a new 15 x 15 figure.
plt.figure(figsize=(15, 15))
plt.imshow(image_random)

<matplotlib.image.AxesImage at 0x7fe7ea4e9eb8>