介绍

K-center（又称 K-medoids、K-中心点）聚类与K-means聚类类似，都是通过迭代更新簇中心点直至收敛；不同之处在于，K-center的中心点必须是数据集中的一个真实样本点，而K-means的中心点是簇内样本的均值，并不需要是真实的样本点。

函数介绍

class Kmedoid:
    """K-medoids (K-center) clustering using the Manhattan distance.

    Unlike K-means, every cluster center (medoid) is an actual row of the
    data set.

    Args:
        data: 2-D array-like of shape ``(n_samples, n_features)``.
        k: Number of clusters. ``run`` requires ``1 <= k <= n_samples``.
    """

    def __init__(self, data, k):
        # np.asarray returns ndarrays unchanged and lets nested lists work too.
        self.data = np.asarray(data)
        self.k = k

    def randCent(self):
        """Pick one sample uniformly at random.

        Returns:
            A ``(row_index, row)`` tuple for the chosen sample.
        """
        random_index = random.randint(0, self.data.shape[0] - 1)
        return random_index, self.data[random_index, :]

    def distance(self, vecA, vecB):
        """Return the Manhattan (L1) distance between ``vecA`` and ``vecB``.

        The absolute differences are summed over the last axis, so besides
        two 1-D vectors this also accepts a 2-D matrix and a vector (NumPy
        broadcasting) and then returns one distance per matrix row.
        """
        return np.abs(np.asarray(vecA) - np.asarray(vecB)).sum(axis=-1)

    def run(self):
        """Cluster the data and return the labels and the medoids.

        Starting from ``k`` distinct random samples, the method alternates
        between assigning every sample to its nearest medoid and re-electing,
        per cluster, the member with the smallest total distance to the other
        members. It stops when the medoid indices no longer change.

        Returns:
            A ``(cluster_category, init_centers)`` tuple. ``cluster_category``
            is a list with the cluster index (``0 .. k-1``) of every sample;
            ``init_centers`` is a list with the ``k`` medoid rows.

        Raises:
            ValueError: If ``k`` is not between 1 and the number of samples.
        """
        data = self.data
        n_samples = data.shape[0]
        # Without this guard the rejection sampling below would never finish
        # for k > n_samples, because only n_samples distinct indices exist.
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                "k must be between 1 and the number of samples ({}), "
                "got {}".format(n_samples, self.k)
            )

        # Draw random samples until k distinct medoid indices are collected.
        medoid_indexs = []
        while len(medoid_indexs) < self.k:
            index, _ = self.randCent()
            if index not in medoid_indexs:
                medoid_indexs.append(index)

        cluster_ids = np.arange(len(medoid_indexs))
        while True:
            # Assignment step: row c holds the distance of every sample to
            # medoid c; argmin keeps the first minimum, i.e. the lowest
            # cluster index wins a tie.
            dists = np.array(
                [self.distance(data, data[m, :]) for m in medoid_indexs]
            )
            labels = dists.argmin(axis=0)
            # A medoid always belongs to its own cluster. This only matters
            # when two medoids have identical coordinates: otherwise the
            # tie-break above would leave the later cluster empty.
            labels[medoid_indexs] = cluster_ids

            # Update step: the new medoid of a cluster is the member with the
            # smallest summed distance to all members (first one on ties).
            new_indexs = []
            for cluster in range(len(medoid_indexs)):
                members = np.flatnonzero(labels == cluster)
                costs = [
                    self.distance(data[members], data[i, :]).sum()
                    for i in members
                ]
                new_indexs.append(int(members[int(np.argmin(costs))]))

            # Converged once the medoids are the same as in the last round.
            if new_indexs == medoid_indexs:
                return labels.tolist(), [data[i, :] for i in new_indexs]
            medoid_indexs = new_indexs

实例

import random

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn import datasets
from sklearn.datasets import make_blobs

# Load the iris data set and keep the first four feature columns.
ir = datasets.load_iris()
y = ir.data[:, :4]

# Cluster the samples into three groups.
model = Kmedoid(data=y, k=3)
cluster_category, init_centers = model.run()

# Append each sample's cluster label as an extra column next to its features.
dat = np.concatenate([y, np.array(cluster_category).reshape(-1, 1)], axis=1)
# Build a new list: appending to ir.feature_names directly would mutate the
# loaded data set's own feature-name list.
columns = list(ir.feature_names) + ['result']

X = pd.DataFrame(dat, columns=columns)


# Plot the clusters on the first two features (sepal length vs. sepal width).
x0 = X[X['result'] == 0]
x1 = X[X['result'] == 1]
x2 = X[X['result'] == 2]
plt.scatter(x0['sepal length (cm)'], x0['sepal width (cm)'], c="red", marker='o', label='label0')
plt.scatter(x1['sepal length (cm)'], x1['sepal width (cm)'], c="green", marker='*', label='label1')
plt.scatter(x2['sepal length (cm)'], x2['sepal width (cm)'], c="blue", marker='+', label='label2')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend(loc=2)
plt.show()