本文将对k-means聚类算法原理和实现过程进行简述
算法原理
k-means算法原理较简单,基本步骤如下:
1、假定我们要对N个样本观测做聚类,要求聚为K类,首先选择K个点作为初始中心点;
2、接下来,按照距离初始中心点最小的原则,把所有观测分到各中心点所在的类中;
3、每类中有若干个观测,分别计算K个类中各自样本点的均值,作为下一次迭代的K个中心点;
4、然后根据这个中心重复第2、3步,直到收敛(中心点不再改变或达到指定的迭代次数),聚类过程结束。
聚类过程示意图:
算法实践
下面对一个具体场景做聚类分析:500x500px的地图上,随机生成60个城市,要求生成10个聚类中心。
Sklearn实现
下面是调取sklearn相关的函数进行实现:
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.cluster import KMeans
Num_dots = 60  # total number of cities
Num_gas = 10  # number of cluster centers
Size_map = 500  # the map is Size_map x Size_map
# Palette indexed by cluster label. Every entry must be a '#RRGGBB' string;
# the 16th entry was previously missing its leading '#'.
colors = ['#FF3838', '#FF9D97', '#FF701F', '#FFB21D', '#CFD231', '#48F90A', '#92CC17', '#3DDB86', '#1A9334', '#00D4BB',
          '#2C99A8', '#00C2FF', '#344593', '#6473FF', '#0018EC', '#8438FF', '#520085', '#CB38FF', '#FF95C8', '#FF37C7']
# Suppress every warning for the rest of the process.
warnings.filterwarnings("ignore")
def generate(num_dots=None, size_map=None):
    """Return random 2-D city coordinates drawn uniformly from the map.

    Args:
        num_dots: Number of points to draw. Defaults to the module-level
            ``Num_dots`` when omitted.
        size_map: Side length of the square map. Defaults to the
            module-level ``Size_map`` when omitted.

    Returns:
        A list of ``num_dots`` NumPy arrays of shape (2,), with every
        coordinate in the half-open interval [0, size_map).
    """
    if num_dots is None:
        num_dots = Num_dots
    if size_map is None:
        size_map = Size_map
    # Give low/high explicitly: uniform([s, s]) would treat the list as `low`
    # and keep `high` at its default of 1.0, so the lower part of the map
    # (coordinates below 1) could never be sampled.
    return [np.random.uniform(0, size_map, size=2) for _ in range(num_dots)]
def cal_dist(x, y):
    """Return the Euclidean distance between two 2-D points.

    Only the first two coordinates of ``x`` and ``y`` are used.
    """
    dx = x[0] - y[0]
    dy = x[1] - y[1]
    squared = dx ** 2 + dy ** 2
    return squared ** 0.5
def num_same(dots_labels):
    """Count how many entries carry each distinct label.

    Args:
        dots_labels: NumPy array of labels (boolean-mask indexing is used).

    Returns:
        A list of counts, one per distinct label, in the order in which
        ``np.unique`` returns the labels.
    """
    return [
        dots_labels[dots_labels == label].size
        for label in np.unique(dots_labels)
    ]
def cal_center_dist(center, dots):
    """Return the sum of the distances from ``center`` to every point in ``dots``.

    Each distance is computed with ``cal_dist``; an empty ``dots`` gives 0.
    """
    return sum(cal_dist(center, dot) for dot in dots)
def k_means(dots):
    """Cluster ``dots`` into ``Num_gas`` groups with scikit-learn's KMeans.

    Returns:
        A ``(labels, centers)`` pair: the label predicted for each point and
        the fitted estimator's ``cluster_centers_``.
    """
    model = KMeans(n_clusters=Num_gas)
    labels = model.fit_predict(dots)
    return labels, model.cluster_centers_
def plot_dots(dots, dots_labels, centers):
    """Plot every point coloured by its label, mark the centers, and show the figure.

    Args:
        dots: Indexable collection of 2-D points.
        dots_labels: Label of each point; used as an index into ``colors``.
        centers: Iterable of 2-D cluster centers, drawn as black crosses.
    """
    # Points, one scatter call each, coloured by cluster label.
    for idx, label in enumerate(dots_labels):
        point = dots[idx]
        plt.scatter(point[0], point[1], color=colors[label])
    # Cluster centers.
    for center in centers:
        plt.scatter(center[0], center[1], marker='x', color="#000000", s=50)
    plt.show()
if __name__ == '__main__':
    # Seed NumPy's global RNG before generating the cities.
    np.random.seed(250)
    dots = generate()
    dots_labels, centers = k_means(dots)
    num_labels = num_same(dots_labels)  # per-cluster sizes (not used below)
    # Report each center, its member cities, and the accumulated distance.
    distance = 0
    for i, center in enumerate(centers):
        print("聚类中心", i+1, "坐标为", np.round(center, 2))
        # flatnonzero gives a flat index array. argwhere returns (n, 1) rows,
        # and calling int() on a 1-element array is deprecated since NumPy 1.25.
        members = np.flatnonzero(dots_labels == i)
        print("属于该聚类中心的城市标号为", [int(m) + 1 for m in members])
        distance += cal_center_dist(center, [dots[m] for m in members])
    print("所有聚类中心和所辖城市的距离之和为", np.round(distance,2))
    # Draw the result.
    plot_dots(dots, dots_labels, centers)
输出总距离:所有聚类中心和所辖城市的距离之和为 2860.48.
手动实现
下面根据算法的理解,进行手动实现:
import numpy as np
from matplotlib import pyplot as plt
Num_dots = 60  # total number of cities
Num_gas = 10  # number of cluster centers
Size_map = 500  # the map is Size_map x Size_map
# Palette indexed by cluster number. Every entry must be a '#RRGGBB' string;
# the 16th entry was previously missing its leading '#'.
colors = ['#FF3838', '#FF9D97', '#FF701F', '#FFB21D', '#CFD231', '#48F90A', '#92CC17', '#3DDB86', '#1A9334', '#00D4BB',
          '#2C99A8', '#00C2FF', '#344593', '#6473FF', '#0018EC', '#8438FF', '#520085', '#CB38FF', '#FF95C8', '#FF37C7']
def generate(num_dots=None, size_map=None):
    """Return random 2-D city coordinates drawn uniformly from the map.

    Args:
        num_dots: Number of points to draw. Defaults to the module-level
            ``Num_dots`` when omitted.
        size_map: Side length of the square map. Defaults to the
            module-level ``Size_map`` when omitted.

    Returns:
        A list of ``num_dots`` NumPy arrays of shape (2,), with every
        coordinate in the half-open interval [0, size_map).
    """
    if num_dots is None:
        num_dots = Num_dots
    if size_map is None:
        size_map = Size_map
    # Give low/high explicitly: uniform([s, s]) would treat the list as `low`
    # and keep `high` at its default of 1.0, so the lower part of the map
    # (coordinates below 1) could never be sampled.
    return [np.random.uniform(0, size_map, size=2) for _ in range(num_dots)]
def cal_dist(x, y):
    """Return the Euclidean distance between two 2-D points.

    Only the first two coordinates of ``x`` and ``y`` are used.
    """
    delta_x = x[0] - y[0]
    delta_y = x[1] - y[1]
    return (delta_x ** 2 + delta_y ** 2) ** 0.5
def cal_center_dist(center, dots):
    """Return the total distance from ``center`` to all the points it is responsible for.

    Each distance is computed with ``cal_dist``; an empty ``dots`` gives 0.
    """
    return sum(cal_dist(center, dot) for dot in dots)
def search_city(value, dots):
    """Return the 0-based index of the first point in ``dots`` equal to ``value``.

    Args:
        value: Coordinates of the city to look up.
        dots: Sequence of city coordinates to search.

    Returns:
        The index of the first exact match, or ``None`` when no point matches.
    """
    for i, item in enumerate(dots):
        # Every coordinate must match. Testing with .any() would accept a city
        # that merely shares one coordinate with `value`.
        if np.array_equal(item, value):
            return i
    return None
class K_Means:
    """Plain k-means clustering with Euclidean distance.

    The first ``k`` samples are used as the initial centers.

    Attributes set by ``fit``:
        centers_: dict mapping cluster index -> center coordinates (ndarray).
        clf_: dict mapping cluster index -> list of the points assigned to it.
        distance: sum of the distances between every point and the center of
            the cluster it was assigned to.
    """

    def __init__(self, k=2, tolerance=0.0001, max_iter=300):
        """Store the settings.

        Args:
            k: Number of clusters.
            tolerance: Convergence threshold, in percent: iteration stops once
                no center moves by more than this relative amount.
            max_iter: Maximum number of assignment/update rounds.
        """
        self.k_ = k
        self.tolerance_ = tolerance
        self.max_iter_ = max_iter
        self.distance = 0

    @staticmethod
    def _shift_percent(old, new):
        """Return the summed absolute per-coordinate change from old to new, in percent of old."""
        delta = np.abs(new - old)
        with np.errstate(divide="ignore", invalid="ignore"):
            percent = delta / np.abs(old) * 100.0
        # A coordinate that did not move contributes 0 even when old is 0
        # (0/0 would otherwise be NaN); a move away from 0 stays infinite.
        percent = np.where(delta == 0, 0.0, percent)
        return float(np.sum(percent))

    def fit(self, data):
        """Cluster ``data`` and populate ``centers_``, ``clf_`` and ``distance``.

        Args:
            data: Sequence of equally sized coordinate vectors (n_samples x n_dims).

        Raises:
            ValueError: If ``data`` is not two-dimensional or has fewer than
                ``k`` samples.
        """
        points = np.asarray(data, dtype=float)
        if points.ndim != 2 or len(points) < self.k_:
            raise ValueError("data must be a 2-D collection with at least k samples")

        self.centers_ = {i: points[i].copy() for i in range(self.k_)}
        self.clf_ = {i: [] for i in range(self.k_)}

        for _ in range(self.max_iter_):
            self.clf_ = {i: [] for i in range(self.k_)}
            center_matrix = np.array([self.centers_[i] for i in range(self.k_)])
            # Assign every point to its nearest center (ties go to the lowest index).
            for point in points:
                nearest = int(np.argmin(np.linalg.norm(center_matrix - point, axis=1)))
                self.clf_[nearest].append(point)

            # Remember where the centers were before moving them.
            prev_centers = dict(self.centers_)
            # Move each center to the mean of its members. An empty cluster
            # keeps its previous center instead of turning into NaN.
            for c, members in self.clf_.items():
                if members:
                    self.centers_[c] = np.mean(members, axis=0)

            # Stop once every center moved by no more than the tolerance.
            if all(
                self._shift_percent(prev_centers[c], self.centers_[c]) <= self.tolerance_
                for c in self.centers_
            ):
                break

        # Total distance between each point and the final center of the
        # cluster it was assigned to in the last round.
        self.distance = float(sum(
            np.linalg.norm(np.asarray(members) - self.centers_[c], axis=1).sum()
            for c, members in self.clf_.items()
            if members
        ))
if __name__ == '__main__':
    # Seed NumPy's global RNG before generating the cities.
    np.random.seed(250)
    dots = generate()
    k_means = K_Means(Num_gas)
    k_means.fit(dots)
    # Report each center and the cities assigned to it.
    for i in range(Num_gas):
        print("聚类中心", i + 1, "坐标为", np.round(k_means.centers_[i], 2))
        # City numbers are printed 1-based, like the center numbers above.
        city_index = [search_city(x, dots) + 1 for x in k_means.clf_[i]]
        print("属于该聚类中心的城市标号为", city_index)
    print("所有聚类中心和所辖城市的距离之和为", np.round(k_means.distance, 2))
    # Draw the centers.
    for center in k_means.centers_.values():
        plt.scatter(center[0], center[1], marker='x', color="#000000", s=50)
    # Draw the cities, coloured by cluster.
    for cat, members in k_means.clf_.items():
        for point in members:
            plt.scatter(point[0], point[1], c=colors[cat])
    plt.show()
输出总距离:所有聚类中心和所辖城市的距离之和为 2816.76
结论
聚类的常规标准是让聚类中心和所辖城市的距离之和最小。在本实验中,按输出的总距离比较,手动实现的k-means算法的结果要优于sklearn的结果。
这主要是由于k-means算法本身并不是非常稳定,容易受到初始点、离群点的影响,因此,所求解不一定是最优解。
附录:sklearn K-means参数/属性/接口
下面是sklearn中K-means算法的常用接口参数,数据来自菜菜的机器学习sklearn