%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin
from sklearn.datasets import load_sample_image
from sklearn.utils import shuffle
# Load scikit-learn's bundled "china.jpg" sample photo and inspect the
# element dtype of the returned array.
china = load_sample_image("china.jpg")
china.dtype
dtype('uint8')
# Dimensions of the image array: (rows, columns, color channels).
china.shape
(427, 640, 3)
# Color triple of the pixel in the first row and first column.
china[0, 0]
array([174, 201, 231], dtype=uint8)
# Flatten the image into a 2-D table with one row per pixel and one column
# per color channel.  The sizes are derived from the array itself (-1 lets
# NumPy infer the pixel count) instead of hard-coding 427 * 640 and 3, so
# the cell keeps working for an image of any size.
newimage = china.reshape(-1, china.shape[-1])
newimage.shape
(273280, 3)
import pandas as pd
# Number of distinct colors: count the unique rows of the pixel table.
pd.DataFrame(newimage).drop_duplicates().shape
(96615, 3)
# Visualize the original image.
plt.figure(figsize=(15, 15))
plt.imshow(china) # pass the image-shaped (rows, cols, 3) array, not the flattened pixel table
<matplotlib.image.AxesImage at 0x7fe7ea57c748>
图像现在有 9 万多种颜色，尝试用 KMeans 将其压缩到 64 种颜色（用 64 个簇的质心代替这 9 万多种颜色）。
并画出随机压缩到 64 种颜色的矢量量化图像。
n_clusters = 64  # number of colors in the compressed palette

# Image preprocessing: go from the 3-D image to a 2-D pixel table later on;
# first normalize the 8-bit channel values to floats in [0, 1].
# Divide by the fixed uint8 maximum (255) rather than china.max(): a
# data-dependent divisor would brighten any image whose largest channel
# value happens to be below 255.
china = np.array(china, dtype=np.float64) / 255
# Remember the 3-D shape so the flattened pixels can be turned back into an
# image.  NOTE: the shape is (rows, columns, channels), so `w` holds the row
# count and `h` the column count; the names are kept because later cells
# use them.
w, h, d = original_shape = tuple(china.shape)
china[:3]
array([[[0.68235294, 0.78823529, 0.90588235], [0.68235294, 0.78823529, 0.90588235], [0.68235294, 0.78823529, 0.90588235], ..., [0.98039216, 0.98431373, 1. ], [0.98039216, 0.98431373, 1. ], [0.98039216, 0.98431373, 1. ]], [[0.6745098 , 0.78039216, 0.89803922], [0.67843137, 0.78431373, 0.90196078], [0.67843137, 0.78431373, 0.90196078], ..., [0.98431373, 0.98823529, 1. ], [0.98431373, 0.98823529, 1. ], [0.98431373, 0.98823529, 1. ]], [[0.68235294, 0.78823529, 0.90588235], [0.68235294, 0.78823529, 0.90588235], [0.68235294, 0.78823529, 0.90588235], ..., [0.98823529, 0.99215686, 1. ], [0.98823529, 0.99215686, 1. ], [0.98823529, 0.99215686, 1. ]]])
# Check the container type of the normalized image.
type(china)
numpy.ndarray
# Convert the image to a 2-D matrix: one row per pixel, one column per
# color channel.
image_array = china.reshape(w * h, d)
image_array.shape
(273280, 3)
# Find the centroids from a random sample of 1000 pixels instead of the
# whole image.  Passing n_samples makes shuffle return only the sampled rows
# (the same rows a full shuffle followed by [:1000] would give) without
# first building a shuffled copy of every pixel.
image_array_sample = shuffle(image_array, random_state=0, n_samples=1000)
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(image_array_sample)
# Peek at the first 20 fitted palette colors.
kmeans.cluster_centers_[:20]
array([[0.11798806, 0.11884058, 0.07007673], [0.80874811, 0.82262443, 0.85671192], [0.47614379, 0.46895425, 0.27124183], [0.92831097, 0.95803234, 0.99566563], [0.52 , 0.5254902 , 0.39529412], [0.61895425, 0.67712418, 0.70816993], [0.31198257, 0.34030501, 0.18954248], [0.82923351, 0.90641711, 0.98743316], [0.80392157, 0.53006536, 0.3751634 ], [0.25202614, 0.23764706, 0.20104575], [0.03328773, 0.02836297, 0.01732786], [0.3454902 , 0.1854902 , 0.12470588], [0.52156863, 0.49150327, 0.52592593], [0.74457516, 0.83934641, 0.95045752], [0.72941176, 0.35764706, 0.23137255], [0.41470588, 0.44656863, 0.40980392], [0.96176471, 0.77058824, 0.63039216], [0.69019608, 0.74705882, 0.7605042 ], [0.57019608, 0.41098039, 0.34588235], [0.93630422, 0.93594771, 0.94913844]])
# Assign every pixel of the full image to one of the fitted clusters.
labels = kmeans.predict(image_array)
labels.shape
(273280,)
# Replace every pixel with the centroid of its cluster.  Indexing the
# centroid table with the whole label vector builds the (w * h, d) result in
# one vectorized step, instead of a Python-level loop over each pixel.
image_kmeans = kmeans.cluster_centers_[labels]
# Sanity check: at most n_clusters distinct colors should remain.
pd.DataFrame(image_kmeans).drop_duplicates().shape
(64, 3)
# Restore the flat pixel table to the 3-D layout of the original image.
image_kmeans = image_kmeans.reshape(w, h, d)
image_kmeans.shape
(427, 640, 3)
# Show the image rebuilt from the K-Means palette.
plt.figure(figsize=(15, 15))
plt.imshow(image_kmeans)
<matplotlib.image.AxesImage at 0x7fe7ea573ac8>
# Random vector quantization: draw n_clusters pixels at random and use them
# as the palette instead of fitted centroids.  n_samples returns just those
# rows rather than a shuffled copy of the whole pixel table.
centroid_random = shuffle(image_array, random_state=0, n_samples=n_clusters)
# With axis=0, pairwise_distances_argmin(X, Y) returns, for each row of Y,
# the index of its nearest row in X -- here, the closest random palette
# color for every pixel.  (The default axis=1 works in the other direction.)
labels_random = pairwise_distances_argmin(centroid_random, image_array, axis=0)
labels_random.shape
(273280,)
# Count how many distinct palette indices were assigned to at least one pixel.
len(set(labels_random))
64
# Replace every pixel with its assigned random palette color in a single
# vectorized lookup (no per-pixel Python loop), then restore the 3-D image
# layout so it can be displayed.
image_random = centroid_random[labels_random].reshape(w, h, d)
plt.figure(figsize=(15, 15))
plt.imshow(image_random)
<matplotlib.image.AxesImage at 0x7fe7ea4e9eb8>