大一时了解到K-Means算法,刚好在学习C++,想着动手写一个这算法,以达到更好地学习效果,因为只支持三维数据,因此仅供学习。
算法流程参考下面文章:
(152条消息) K-means算法的基本原理_纯粹.的博客-CSDN博客_k-means聚类算法的原理:
1) K-means算法首先需要选择K个初始化聚类中心
(2) 计算每个数据对象到K个初始化聚类中心的距离(该距离可以为多种度量方式,如曼哈顿距离、欧氏距离、马氏距离,甚至你自定义的距离度量),将数据对象分到距离聚类中心最近的那个数据集中,当所有数据对象都划分以后,就形成了K个数据集(即K个簇)
(3)接下来重新计算每个簇的数据对象的均值,将均值作为新的聚类中心
(4)最后计算每个数据对象到新的K个初始化聚类中心的距离,重新划分
(5)每次划分以后,都需要重新计算初始化聚类中心,一直重复这个过程,直到所有的数据对象
看个动态图(该图来自Matplotlib输出动画实现K-means聚类过程可视化 - 知乎 (zhihu.com)):
首先看头文件
#pragma once #ifndef K_MEANS_H #define K_MEANS_H #includeusing namespace std; struct Tuple { float x; float y; float z; };//这里只将聚类对象的纬度定义为三维,事实上可以为任意大于零的纬度,可以用动态数组来表示 class K_means { private: vector basic_data;//初始数据 int mean_number;//k的值 vector > result_data;//{ mean_number };//结果数据,也用来储存中间聚类数据 vector means;//中间计算储存的聚类中心 vector mea_square_error;//每个簇的均方差 public: K_means(vector basic_data, int mean_number) : result_data(mean_number), mea_square_error(mean_number) { this->basic_data = basic_data; this->mean_number = mean_number; } //计算最后的聚类结果 void get_fina_result_data(); private: //计算两个点间的距离 float get_distxyz(Tuple tuple1, Tuple tuple2); //初始化k个聚类中心 void prime_k_meansbase(); //初始化得到k个簇 void get_k_means(); //得到当前每个簇类中心 void get_basemean(); //得到新的簇 void get_new_k_means(); //得到每一个簇的方差 void get_k_mean_erro(); };
实现代码如下:
#include#include #include #include"k_means.h" using namespace std; float K_means::get_distxyz(Tuple tuple1, Tuple tuple2)//这里采用欧氏距离度量 { int distance = pow(pow((tuple1.x - tuple2.x), 2) + pow((tuple1.y - tuple2.y), 2) + pow((tuple1.z - tuple2.z), 2), 0.5); return distance; } //初始化k个聚类中心 void K_means::prime_k_meansbase() { for (int i = 0;i != mean_number; i++) { means.push_back(basic_data[i]); } } //初始化得到k个簇 void K_means::get_k_means() { for (int i = 0;i != basic_data.size();++i) { float min_distance = 0x3f3f3f3f; int static temp; for (int j = 0;j != mean_number;++j) { if (min_distance > get_distxyz(basic_data[i], means[j])) { min_distance = get_distxyz(basic_data[i], means[j]); temp = j; cout << temp << endl;; } } result_data[temp].push_back(basic_data[i]); } } //得到当前每个簇类中心 void K_means::get_basemean() { vector A_mean; for (int j = 0;j < mean_number;j++) { float sumx = 0; float sumy = 0; float sumz = 0; int num = result_data[j].size(); A_mean = result_data[j]; cout << num << endl; for (int i = 0;i < num;i++) { sumx += A_mean[i].x; sumy += A_mean[i].y; sumz += A_mean[i].z; } Tuple basemean = { sumx / num,sumy / num,sumz / num }; means[j] = basemean; } } //得到新的簇 void K_means::get_new_k_means() { result_data.clear();//清除迁移过程结果 result_data = vector >(mean_number);//分配结果储存容器 for (int i = 0;i != basic_data.size();++i)//每个点与每个簇类中心的距离进行比较 { float min_distance = 0x3f3f3f3f;//表示无穷大 int static temp2; for (int j = 0;j != mean_number;++j) { if (min_distance > get_distxyz(basic_data[i], means[j])) { //选距离最近的簇中心作为中心 min_distance = get_distxyz(basic_data[i], means[j]); temp2 = j; } } result_data[temp2].push_back(basic_data[i]); } } //得到每一个簇的方差 void K_means::get_k_mean_erro() { vector square_error; for (int j = 0;j < mean_number;j++) { float sum = 0;//方差 int num = result_data[j].size();//方差的被除数,得到均方差 vector A_mean = result_data[j]; Tuple basemean = means[j]; for (int i = 0;i < num;i++) { sum += pow(get_distxyz(A_mean[i], basemean), 2); } float result = sum / num; mea_square_error[j] = result; } } //计算最后的聚类结果 void K_means::get_fina_result_data() { //初始化 prime_k_meansbase(); get_k_means(); get_k_mean_erro();//得到每一个簇的方差 int i = 1; while (i) { i = 0; vector last_k_mean_erro = mea_square_error; get_basemean();//得到当前每个簇的中心 get_new_k_means();//得到新的聚类 get_k_mean_erro();//得到每一个簇的方差 for (int j = 0;j != mean_number;++j) { cout << mea_square_error[j] << 't' << last_k_mean_erro[j] << endl; if (mea_square_error[j]!=last_k_mean_erro[j]) { i = 1; } } } //打印分类结果 for (int j = 0;j != mean_number;++j) { cout << "****************************************************************" << endl; for (vector ::iterator m = result_data[j].begin();m != result_data[j].end();++m) { cout << (*m).x << 't' << (*m).y << 't' << (*m).z << endl; } } }
测试代码:
#include#include #include"k_means.h" using namespace std; void test() { vector basic_data_test; Tuple temp_data; int x = 0, y = 0, z = 0; int k;//分类数量 int n;//样本数量 cout << "请输入类别数量:" << endl; cin >> k; cout << "请输入数据量" << endl; cin >> n; for (int i = 0;i != n;++i) { cout << "请输入x,y,z:" << endl; cin >> x >> y >> z; temp_data.x = x; temp_data.y = y; temp_data.z = z; basic_data_test.push_back(temp_data); } K_means a(basic_data_test, k); a.get_fina_result_data(); } int main() { test(); }



