数据分析-聚类-案例

1、导入必要的包

2、读入数据

3、数据探索

4、数据预处理

5、建模

5.1 Kmeans

5.2 MeanShift

5.3 AgglomerativeClustering

5.4 DBSCAN

5.5 SpectralClustering

1、导入必要的包

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn import metrics 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

2、读入数据

# Load the auto-mpg CSV file into a DataFrame.
csv_path = "d:/datasets/auto-mpg.csv"
data = pd.read_csv(csv_path)

3、数据探索

# Quick look at the raw data: first rows, column dtypes / non-null counts,
# and summary statistics.
# head() and describe() only *return* their result, so they are printed
# explicitly; as bare expressions their output is lost when these lines run
# as a script (or share a notebook cell with later statements).
print(data.head())
data.info()  # info() writes its report to stdout by itself
print(data.describe())

4、数据预处理

# Drop the "car name" column; only the remaining columns are used as features.
data_auto = data.drop("car name", axis=1)

# Missing horsepower values appear in the data as the literal string "?".
missing_hp = data_auto["horsepower"] == "?"
print(data_auto[missing_hp])
# Frequency of every distinct horsepower value (the "?" entry, if present,
# is the number of missing samples).
horse = data_auto["horsepower"].value_counts()

# Remove the incomplete samples.
data_auto.drop(data_auto[missing_hp].index, inplace=True)
# The column is compared against "?" above, so it may hold strings; convert it
# to a numeric dtype explicitly instead of relying on the scaler to coerce it.
# This is a no-op if the column is already numeric.
data_auto["horsepower"] = pd.to_numeric(data_auto["horsepower"])

# Standardize every feature. The fitted scaler is kept in model_sc so that
# results can be mapped back to the original units with inverse_transform.
model_sc = StandardScaler()
model_sc.fit(data_auto)
data_auto_sc = model_sc.transform(data_auto)

5、建模

5.1 Kmeans

# Fit K-means with 3 clusters on the standardized features; random_state is
# fixed so repeated runs give the same result.
model_km = KMeans(n_clusters=3, random_state=10)
model_km.fit(data_auto_sc)
auto_label = model_km.labels_              # cluster index of every sample
auto_cluster = model_km.cluster_centers_   # centroids in standardized space

# Number of samples per cluster. Printed explicitly: as a bare expression in
# the middle of the block the counts were computed and then discarded.
print(pd.Series(auto_label).value_counts())
print(auto_cluster)
# Centroids mapped back to the original feature scale.
print(model_sc.inverse_transform(auto_cluster))

探寻最优的K值

# Compare candidate cluster counts by their silhouette coefficient.
# Loop-local names are used so that this exploration does not overwrite the
# already fitted model_km / auto_label / auto_cluster with the last candidate.
for k in [2, 3, 4, 6, 300]:
    candidate = KMeans(n_clusters=k, random_state=10).fit(data_auto_sc)
    score = metrics.silhouette_score(data_auto_sc, candidate.labels_)
    print(k, "   ", round(score, 4))

5.2 MeanShift

# Mean-shift clustering of the standardized data with a bandwidth of 2.
model_mn = MeanShift(bandwidth=2)
model_mn.fit(data_auto_sc)
auto_cluster = model_mn.cluster_centers_
auto_label = model_mn.labels_

# Sweep the mean-shift bandwidth and record, for each value, how many clusters
# were found and the silhouette coefficient of the resulting labelling.
bandwidth_grid = np.arange(1, 2.5, 0.2)
cluster_number = []
slt_score = []
n_samples = len(data_auto_sc)
for i in bandwidth_grid:
    model = MeanShift(bandwidth=i).fit(data_auto_sc)
    n_found = len(np.unique(model.labels_))
    cluster_number.append(n_found)
    # silhouette_score is only defined for 2 .. n_samples-1 distinct labels and
    # raises ValueError otherwise (e.g. when a bandwidth merges everything into
    # one cluster); record NaN for such a bandwidth instead of aborting the sweep.
    if 1 < n_found < n_samples:
        slt_score.append(metrics.silhouette_score(data_auto_sc, model.labels_))
    else:
        slt_score.append(np.nan)

from prettytable import PrettyTable

# Print one table row per bandwidth: the bandwidth, the number of clusters
# found and the silhouette coefficient.
x = PrettyTable(["窗宽","蔟的个数","轮廓系数"])
for row in zip(bandwidth_grid, cluster_number, slt_score):
    x.add_row(list(row))
print(x)

5.3 AgglomerativeClustering

# Agglomerative (hierarchical) clustering into 3 clusters using average linkage.
model = AgglomerativeClustering(n_clusters=3, linkage="average")
model.fit(data_auto_sc)
auto_label = model.labels_

# Bar chart of the number of samples in each cluster.
lbs = pd.Series(auto_label).value_counts()
lbs.plot.bar(rot=0)

# Draw the dendrogram.
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
# Use a sans-serif font that can render the Chinese title below.
plt.rcParams['font.sans-serif'] = ['SimHei']
# The dendrogram is built with scipy's pdist, linkage and dendrogram:
# pdist returns the pairwise distances, linkage returns an ndarray describing
# the order in which clusters are merged, and dendrogram draws the tree.
row_clusters = linkage(pdist(data_auto_sc, metric='euclidean'), method='ward')
fig = plt.figure(figsize=(16, 8))
# p and truncate_mode truncate the dendrogram: the subtrees of some nodes are
# pruned and the x axis shows how many samples each remaining node contains.
row_dendr = dendrogram(row_clusters, p=50, truncate_mode='lastp', color_threshold=5)
plt.title('谱系图', fontsize=15)
# tight_layout runs after the title is set so the title is taken into account
# when the margins are computed (previously it was added after the layout).
plt.tight_layout()

5.4 DBSCAN

# Train the model.
# NOTE(review): the original fitted on `data_auto_sc_0`, a name that is never
# defined in this file (NameError). The standardized matrix `data_auto_sc`
# used by every other model is assumed to be the intended input - confirm.
model = DBSCAN(eps=1, min_samples=2).fit(data_auto_sc)
# Cluster label of every sample.
auto_label = model.labels_
# Indices of the core samples (printed; a bare expression shows nothing).
print(model.core_sample_indices_)
# Feature vectors of the core samples.
print(model.components_)

# Count how many clusters DBSCAN finds for several min_samples values at eps=1.
# NOTE(review): the original fitted on `data_auto_sc_0`, which is never defined
# in this file (NameError); `data_auto_sc` is assumed to be the intended
# input - confirm.
clu_num = []
for min_ in [1, 3, 5, 7, 9]:
    model = DBSCAN(eps=1, min_samples=min_).fit(data_auto_sc)
    # Labels produced by this fit.
    labels = model.labels_
    # The label -1 is excluded from the cluster count (DBSCAN uses it for noise).
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clu_num.append(n_clusters_)

5.5 SpectralClustering

from sklearn.cluster import SpectralClustering

# Spectral clustering of the standardized data into 3 clusters.
model = SpectralClustering(n_clusters=3).fit(data_auto_sc)
auto_label = model.labels_

数据分析-聚类-案例

2、读入数据 data=pd.read_csv("d:/datasets/auto-mpg.csv")

3、数据探索 data.head() data.info() data.describe()

5、建模

Python相关栏目本月热门文章

2、读入数据
data=pd.read_csv("d:/datasets/auto-mpg.csv")

3、数据探索
data.head() data.info() data.describe()