注释都在代码里了,就不细说。
聚类的数据集和提取的matlab代码会放在最后
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as random
from scipy.io import loadmat
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
class readdata:
    """Load the clustering dataset from a .mat file and provide access helpers."""

    def __init__(self, data_path='./data.mat'):
        """
        Read the dataset.

        :param data_path: path of the .mat file containing matrix 'X'
        """
        self.data = loadmat(data_path)

    def get_data(self, samples_per_class=200):
        """
        Return the samples and their generated class labels.

        The .mat file stores samples grouped by class in blocks of
        ``samples_per_class`` consecutive rows (the MATLAB generator emits
        200 points per Gaussian), so the label of row i is i // samples_per_class.

        :param samples_per_class: rows per class block (default 200 matches
            the generator that produced data.mat)
        :return: (X, y) — sample matrix and integer label array
        """
        X = self.data['X']
        # Vectorized replacement for the original append-and-increment loop.
        y = np.arange(len(X)) // samples_per_class
        return X, y

    def plotData(self):
        """Scatter-plot the raw samples; returns 1 (kept for compatibility)."""
        X, y = self.get_data()
        plt.scatter(X[:, 0], X[:, 1])
        plt.show()
        return 1
class Kmeans:
    """K-means clustering: a hand-written implementation plus a sklearn baseline."""

    def __init__(self, data, k):
        """
        Initialize.

        :param data: input samples, shape (n_samples, n_features)
        :param k: number of clusters
        """
        self.data = data
        self.k = k
        self.pred_label = None      # labels produced by our own k_means
        self.skl_pre_label = None   # labels produced by sklearn's KMeans
        self.center = None          # cluster centers from our own k_means

    def sklearn_kmeans(self):
        """
        Cluster with sklearn's KMeans (fixed random_state for reproducibility).

        :return: sk_label — array of cluster labels
        """
        skl_kmean = KMeans(self.k, random_state=1)
        skl_label = skl_kmean.fit(self.data).labels_
        self.skl_pre_label = skl_label
        return skl_label

    def k_means(self):
        """
        Our own k-means implementation.

        :return: center_ — per-cluster centers (relabelled), label — cluster
            index of each sample
        """
        # Work in float so center updates are never truncated if the caller
        # passes integer data.
        data = np.asarray(self.data, dtype=float)
        n = len(data)
        # Pick k DISTINCT sample indices as the initial centers.  The original
        # floor(rand * n) scheme could draw duplicate indices, which starts
        # two centers at the same point and collapses a cluster.
        pick_dot = np.random.choice(n, size=self.k, replace=False)
        print('数据集中随机索引', pick_dot)
        center = data[pick_dot].copy()
        print('初始聚类中心:', center)
        # Cluster id of every sample.
        cls = np.zeros(n, int)
        run = True
        iteration = 0
        while run:
            iteration += 1
            # Assignment step: label each sample with its nearest center
            # (squared Euclidean distance; argmin is unaffected by the square).
            for i in range(n):
                dist2 = np.sum(np.square(data[i] - center), axis=1)
                cls[i] = np.argmin(dist2)
            # Assume converged; the update step below re-enables the loop
            # whenever some center still moves noticeably.
            run = False
            # Update step: move each center to the mean of its members.
            for i in range(self.k):
                club = data[cls == i]
                if len(club) == 0:
                    # Empty cluster: keep the old center instead of letting
                    # np.mean produce NaN and poison every later distance.
                    continue
                newcenter = np.mean(club, axis=0)
                # Treat a tiny shift as "unchanged"; otherwise apply it and
                # iterate again.
                if np.sum(np.abs(center[i] - newcenter)) > 1e-4:
                    center[i] = newcenter
                    run = True
        print('程序结束,迭代次数:', iteration)
        # Relabel clusters by order of first appearance in the data and
        # reorder the centers to match, so labels are comparable with the
        # ground-truth labels that get_data() generates.
        _, index = np.unique(cls, return_index=True)
        old_label = cls[np.sort(index)]
        label = np.zeros(cls.shape, dtype=int)
        center_ = np.zeros(center.shape)
        for i in range(len(old_label)):
            label[cls == old_label[i]] = i
            center_[i] = center[old_label[i]]
        self.pred_label = label
        self.center = center_
        return center_, label

    def plot_data(self, y):
        """
        Plot raw data, true labels, and both clusterings in a 2x2 grid.

        :param y: ground-truth labels used to color the "origin label" panel
        """
        # Lazily run either clustering if it has not been done yet.
        if self.pred_label is None:
            self.k_means()
        if self.skl_pre_label is None:
            self.sklearn_kmeans()
        plt.figure(figsize=(9, 9))
        panels = [
            (221, "origin data", None),
            (222, "origin label", y),
            (223, "sklearn k_means", self.skl_pre_label),
            (224, "my k_means", self.pred_label),
        ]
        for pos, title, colors in panels:
            plt.subplot(pos)
            plt.title(title)
            if colors is None:
                plt.scatter(self.data[:, 0], self.data[:, 1])
            else:
                plt.scatter(self.data[:, 0], self.data[:, 1], c=colors)
            plt.xlabel('x1')
            plt.ylabel('x2')
        plt.savefig("k_means.png", dpi=1200)
        plt.show()

    def calculate_acc(self, y):
        """
        Accuracy of self.pred_label against ground truth y.

        Valid because k_means relabels clusters by first appearance, which
        matches the block-ordered labels produced by readdata.get_data.

        :param y: ground-truth label array
        :return: accuracy as a percentage
        """
        assert self.pred_label is not None, "请先完成聚类!"
        count = np.count_nonzero(y - self.pred_label)
        print('错分个数为:', count)
        acc = (1 - count / len(y)) * 100
        return acc
if __name__ == '__main__':
    np.random.seed(99)
    # Bug fix: the class is named `readdata`; the original `readData()` call
    # raised NameError before anything ran.
    r = readdata()
    # True cluster means used by the MATLAB generator (mvnrnd mu values).
    real_center = np.array([[1, -1],
                            [5.5, -4.5],
                            [1, 4],
                            [6, 4.5],
                            [9, 0.0]])
    X, y = r.get_data()
    K = 5
    model = Kmeans(X, K)
    center, label = model.k_means()
    # model.plot_data(y)  # uncomment to render the 2x2 comparison figure
    # Euclidean distance between each found center and the true center.
    dist = np.sqrt(np.sum(np.square(center - real_center), axis=1))
    print('划分正确率为:%f' % model.calculate_acc(y))
    print('k_means得到的预测中心:', center)
    print('聚类中心与真实分布的欧拉距离为:', dist)
    print('聚类中心与真实分布的欧拉距离和为:', np.sum(dist))
这个是matlab划分数据的代码
% Generate five 2-D Gaussian clusters (200 points each, unit covariance)
% and plot them in distinct colors.
sigma = [1,0;0,1];
mu1 = [1,-1];     x1 = mvnrnd(mu1,sigma,200);
mu2 = [5.5, -4.5]; x2 = mvnrnd(mu2,sigma,200);
mu3 = [1,4];      x3 = mvnrnd(mu3,sigma,200);
mu4 = [6,4.5];    x4 = mvnrnd(mu4,sigma,200);
mu5 = [9,0.0];    x5 = mvnrnd(mu5,sigma,200);
% Stack the clusters into the dataset matrix saved as data.mat.
X = [x1;x2;x3;x4;x5];
plot(x1(:,1),x1(:,2),'r.');
hold on;
plot(x2(:,1),x2(:,2),'b.');
plot(x3(:,1),x3(:,2),'k.');
plot(x4(:,1),x4(:,2),'g.');
plot(x5(:,1),x5(:,2),'m.');
matlab绘制的图像如下:
这个是用到的数据集,我贴在这里,设置了免费下载
数据集:https://download.csdn.net/download/CynicalRat/85040638 。下面是最后的聚类效果,以及与sklearn中方法的对比。



