# Code: k-means clustering implemented with NumPy.
import numpy as np
import random
from math import sqrt
def dist(arr1, arr2):
    """Return the Euclidean distance between two sample vectors.

    :param arr1: first vector (1-D array, or a 1 x n row such as a matrix row)
    :param arr2: second vector, broadcast-compatible with ``arr1``
    :return: the distance as a Python float
    """
    # Work on plain float ndarrays: for np.matrix operands ``**`` (and ``*``)
    # mean matrix power/product rather than element-wise operations, and
    # float arithmetic avoids integer overflow on integer datasets.
    diff = np.asarray(arr1, dtype=float) - np.asarray(arr2, dtype=float)
    # Square each component *before* summing; squaring the sum instead would
    # let positive and negative differences cancel each other out.
    return sqrt(np.sum(diff * diff))
def random_center(dataset, k):
    """Generate ``k`` random initial cluster centers.

    Every coordinate of a center is drawn uniformly between the minimum and
    the maximum of the corresponding column of ``dataset``.

    :param dataset: 2-D data, one sample per row
    :param k: number of cluster centers to generate
    :return: ``np.matrix`` of shape (k, number of columns), one center per row
    """
    number_cols = dataset.shape[1]
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed; the
    # returned object is still an np.matrix.
    centers = np.asmatrix(np.zeros((k, number_cols)))
    # Fill column by column so the order of random draws is unchanged.
    for col in range(number_cols):
        column = dataset[:, col]
        min_value = np.min(column)
        span = np.max(column) - min_value
        centers[:, col] = min_value + span * np.random.random((k, 1))
    return centers
def kmeans(dataset, k):
    """Cluster the rows of ``dataset`` into ``k`` groups with k-means.

    Starting from random centers, each sample is assigned to its nearest
    center and every center is then moved to the mean of its samples; this
    repeats until no sample changes cluster.

    :param dataset: 2-D data, one sample per row
    :param k: number of clusters
    :return: ``(centers, clusterAssment)``. ``centers`` is a matrix with one
        center per row. ``clusterAssment`` is a (number of samples) x 2
        matrix: column 0 holds the index of the cluster the sample belongs
        to, column 1 the squared distance to that cluster's center.
    """
    centers = random_center(dataset, k)  # initial cluster centers
    num_data = dataset.shape[0]  # number of samples
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    clusterAssment = np.asmatrix(np.zeros((num_data, 2)))
    # Start from -1 ("unassigned"): 0 is a valid cluster index, so a zero
    # initial value would hide first-pass assignments to cluster 0 from the
    # change detection below and could end the iteration prematurely.
    clusterAssment[:, 0] = -1
    cluster_changed = True  # becomes False once a full pass changes nothing
    while cluster_changed:
        cluster_changed = False
        # Assignment step: attach every sample to its nearest center.
        for i in range(num_data):
            min_dist = np.inf
            min_index = -1
            for j in range(k):
                dist_j = dist(dataset[i, :], centers[j, :])
                if dist_j < min_dist:
                    min_dist = dist_j
                    min_index = j
            if clusterAssment[i, 0] != min_index:
                cluster_changed = True
            clusterAssment[i, :] = min_index, min_dist ** 2
        # Update step: move each center to the mean of its members.
        labels = np.asarray(clusterAssment[:, 0]).ravel()
        for cent in range(k):
            members = dataset[np.nonzero(labels == cent)[0]]
            if members.shape[0] == 0:
                # The mean of an empty cluster is NaN, which would make this
                # center unreachable forever; keep the previous center.
                continue
            centers[cent, :] = np.mean(members, axis=0)
    return centers, clusterAssment
# Demo run: 20 random samples with 5 integer features each (values drawn
# from 1 up to, but not including, 20), grouped into 3 clusters.
dataset = np.random.randint(low=1, high=20, size=(20, 5))
centers, clusterAssment = kmeans(dataset, k=3)