K-Means is one of the most classic unsupervised learning algorithms, used to partition data into K clusters. This article walks through its principles, a from-scratch implementation, and practical tips.
Algorithm Principles
The objective of K-Means is to minimize the within-cluster sum of squares (WCSS):
$$J = \sum_{k=1}^{K} \sum_{x_i \in C_k} \|x_i - \mu_k\|^2$$
where $\mu_k$ is the centroid of the $k$-th cluster $C_k$.
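As a quick sanity check of the objective, the sketch below evaluates $J$ directly for a tiny hand-made assignment. The names `X_toy`, `labels_toy`, and `centroids_toy` are illustrative only and are not used elsewhere in this article.

```python
import numpy as np

# A minimal sketch: evaluate the WCSS objective J for a toy clustering.
X_toy = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [6.0, 5.0]])
labels_toy = np.array([0, 0, 1, 1])
centroids_toy = np.array([[0.0, 0.5], [5.5, 5.0]])

# J = sum over clusters k, sum over points in C_k of ||x_i - mu_k||^2
J = sum(
    np.sum((X_toy[labels_toy == k] - centroids_toy[k]) ** 2)
    for k in range(len(centroids_toy))
)
print(J)  # 0.25 + 0.25 + 0.25 + 0.25 = 1.0
```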
Algorithm Steps
- Initialization: choose K initial centroids (here, K samples picked at random)
- Assignment: assign each sample to its nearest centroid
- Update: recompute each cluster's centroid as the mean of its assigned samples
- Repeat the assignment and update steps until the centroids stop moving or a maximum number of iterations is reached
```python
import numpy as np
import matplotlib.pyplot as plt


class KMeans:
    def __init__(self, n_clusters=3, max_iter=300):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.centroids = None

    def fit(self, X):
        # Randomly initialize centroids by picking n_clusters distinct samples
        np.random.seed(42)
        indices = np.random.choice(len(X), self.n_clusters, replace=False)
        self.centroids = X[indices].astype(float)

        for _ in range(self.max_iter):
            # Assign each sample to its nearest centroid
            labels = self._assign_clusters(X)

            # Keep the old centroids to test for convergence
            old_centroids = self.centroids.copy()

            # Update each centroid to the mean of its assigned samples
            for k in range(self.n_clusters):
                cluster_points = X[labels == k]
                if len(cluster_points) > 0:
                    self.centroids[k] = cluster_points.mean(axis=0)

            # Stop early once the centroids no longer move
            if np.allclose(old_centroids, self.centroids):
                break

        return labels

    def _assign_clusters(self, X):
        # Euclidean distance from every sample to every centroid
        distances = np.zeros((len(X), self.n_clusters))
        for k in range(self.n_clusters):
            distances[:, k] = np.linalg.norm(X - self.centroids[k], axis=1)
        return np.argmin(distances, axis=1)

    def predict(self, X):
        return self._assign_clusters(X)
```
Example
```python
# Generate three Gaussian blobs as synthetic data
np.random.seed(42)
X1 = np.random.randn(100, 2) + np.array([2, 2])
X2 = np.random.randn(100, 2) + np.array([-2, 2])
X3 = np.random.randn(100, 2) + np.array([0, -2])
X = np.vstack([X1, X2, X3])

kmeans = KMeans(n_clusters=3, max_iter=300)
labels = kmeans.fit(X)
print(f"Cluster centers:\n{kmeans.centroids}")
```
Choosing the Optimal K
The Elbow Method
```python
def elbow_method(X, k_range):
    wcss = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k)
        labels = kmeans.fit(X)
        # Accumulate the within-cluster sum of squares for this K
        wcss_k = 0
        for i in range(k):
            cluster_points = X[labels == i]
            wcss_k += np.sum((cluster_points - kmeans.centroids[i]) ** 2)
        wcss.append(wcss_k)
    return wcss
```
Apply the elbow method:

```python
k_range = range(1, 11)
wcss = elbow_method(X, k_range)
print("WCSS:", wcss)
```
Pros and Cons
Advantages
- Simple and efficient, with time complexity $O(n \cdot k \cdot t)$ ($n$ samples, $k$ clusters, $t$ iterations)
- Easy to understand and implement
Disadvantages
- The number of clusters K must be specified in advance
- Sensitive to the choice of initial centroids
- Can only discover roughly spherical (convex) clusters
- Prone to getting stuck in local optima