import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Load the Iris data set from the local CSV export and preview its first five rows.
iris = pd.read_csv(filepath_or_buffer='E:/练习/Iris.csv')
iris.head(n=5)
|   | id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
# The 'id' column is only a row counter, so remove it in place, then
# print a summary of the remaining columns and their dtypes.
iris.drop(columns='id', inplace=True)
iris.info()
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SepalLengthCm 150 non-null float64
1 SepalWidthCm 150 non-null float64
2 PetalLengthCm 150 non-null float64
3 PetalWidthCm 150 non-null float64
4 Species 150 non-null int64
dtypes: float64(4), int64(1)
memory usage: 6.0 KB
# Preview the first five rows again now that 'id' has been dropped.
iris.head(n=5)
|   | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
# Scatter plot of sepal length against sepal width, one colour per species code.
# The first call creates the axes (ax=None); later calls draw onto the same axes.
ax = None
for species_code, colour, legend_name in (
    (0, 'orange', 'Setosa'),
    (1, 'blue', 'versicolor'),
    (2, 'green', 'virginica'),
):
    ax = iris[iris.Species == species_code].plot(
        kind='scatter',
        x='SepalLengthCm',
        y='SepalWidthCm',
        color=colour,
        label=legend_name,
        ax=ax,
    )
ax.set_xlabel("Sepal Length")
ax.set_ylabel("Sepal Width")
ax.set_title("Sepal Length VS Width")

# Resize the current figure before showing it.
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.show()
# Scatter plot of petal length against petal width, one colour per species code.
# The first call creates the axes (ax=None); later calls draw onto the same axes.
ax = None
for species_code, colour, legend_name in (
    (0, 'orange', 'Setosa'),
    (1, 'blue', 'versicolor'),
    (2, 'green', 'virginica'),
):
    ax = iris[iris.Species == species_code].plot.scatter(
        x='PetalLengthCm',
        y='PetalWidthCm',
        color=colour,
        label=legend_name,
        ax=ax,
    )
ax.set_xlabel("Petal Length")
ax.set_ylabel("Petal Width")
ax.set_title(" Petal Length VS Width")

# Resize the current figure before showing it.
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.show()
# Compute the pairwise correlation coefficients of all columns and
# display them as an annotated heat map.
plt.figure(figsize=(7, 4))
correlations = iris.corr()
sns.heatmap(correlations, annot=True, cmap='cubehelix_r')
plt.show()
# Hold out 30% of the rows as a test set (no random_state is given, so the
# split differs from run to run), then report the shape of both parts.
train, test = train_test_split(iris, test_size=0.3)
print(train.shape, test.shape, sep='\n')
(105, 5)
(45, 5)
# Separate the four measurement columns (inputs) from the species code (target)
# for both the training and the test part.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
train_X, train_y = train[feature_columns], train['Species']
test_X, test_y = test[feature_columns], test['Species']
train_X.head(n=2)
|   | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm |
|---|---|---|---|---|
| 98 | 5.1 | 2.5 | 3.0 | 1.1 |
| 89 | 5.5 | 2.5 | 4.0 | 1.3 |
# Preview the first two rows of the test features.
test_X.head(n=2)
|   | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm |
|---|---|---|---|---|
| 110 | 6.5 | 3.2 | 5.1 | 2.0 |
| 124 | 6.7 | 3.3 | 5.7 | 2.1 |
# Preview the first five training targets (species codes).
train_y.head(n=5)
98 1
89 1
55 1
111 2
117 2
Name: Species, dtype: int64
# Cluster the four measurement columns of the full data set into three groups.
x = iris.iloc[:, list(range(4))].to_numpy()
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
y_kmeans = kmeans.fit_predict(x)

# Visualise the clusters using the first two columns (sepal length and sepal width).
for cluster_id, colour in enumerate(('purple', 'orange', 'green')):
    in_cluster = y_kmeans == cluster_id
    plt.scatter(x[in_cluster, 0], x[in_cluster, 1], s=100, c=colour, label=str(cluster_id))

# Overlay the centroids of the clusters.
centres = kmeans.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s=100, c='red', label='Centroids')
plt.legend()
# Fit k-means on the training features only. KMeans is unsupervised: the `y`
# argument of `fit` is ignored, so the species labels are not passed to it.
model = KMeans(n_clusters=3)
model.fit(train_X)

# Cluster ids are arbitrary (cluster 0 need not correspond to species 0), so
# comparing them directly with the species codes yields a meaningless score.
# Map every cluster to the species that is most frequent among its training
# members, then score the translated predictions against the true labels.
cluster_to_species = pd.crosstab(model.labels_, train_y.to_numpy()).idxmax(axis=1)
prediction = pd.Series(model.predict(test_X)).map(cluster_to_species).to_numpy()
print('The accuracy of the KMeans is', metrics.accuracy_score(test_y, prediction))
The accuracy of the KMeans is 0.9111111111111111
# Summarise the fitted model: one row per cluster with its centre coordinates
# followed by the number of samples assigned to it.
r1 = pd.Series(model.labels_).value_counts()  # samples per cluster id
r2 = pd.DataFrame(model.cluster_centers_)  # cluster centres, one row per cluster
# Join side by side (axis=0 would stack vertically); rows are aligned on the cluster id.
r = pd.concat([r2, r1], axis=1)
print(r)
0 1 2 3 0
0 5.063636 3.472727 1.500000 0.251515 33
1 5.817500 2.722500 4.340000 1.447500 40
2 6.881250 3.053125 5.746875 2.015625 32
# Build a petal-only and a sepal-only table (each keeping the species code),
# then split both 70/30 with the same seed.
petal = iris[['PetalLengthCm', 'PetalWidthCm', 'Species']]
sepal = iris[['SepalLengthCm', 'SepalWidthCm', 'Species']]

# Petals
train_p, test_p = train_test_split(petal, test_size=0.3, random_state=0)
petal_features = ['PetalWidthCm', 'PetalLengthCm']
train_x_p, train_y_p = train_p[petal_features], train_p['Species']
test_x_p, test_y_p = test_p[petal_features], test_p['Species']

# Sepals
train_s, test_s = train_test_split(sepal, test_size=0.3, random_state=0)
sepal_features = ['SepalWidthCm', 'SepalLengthCm']
train_x_s, train_y_s = train_s[sepal_features], train_s['Species']
test_x_s, test_y_s = test_s[sepal_features], test_s['Species']
def _predict_species(fitted_model, train_labels, features):
    """Translate k-means cluster ids for `features` into species codes.

    Cluster ids are arbitrary, so each cluster is mapped to the species that
    is most frequent among the training samples assigned to it.

    Args:
        fitted_model: A KMeans instance already fitted on the training features.
        train_labels: Species codes of the training samples (pandas Series),
            in the same row order as the data the model was fitted on.
        features: Feature rows to predict species codes for.

    Returns:
        A NumPy array with one predicted species code per row of `features`.
    """
    cluster_to_species = pd.crosstab(
        fitted_model.labels_, train_labels.to_numpy()
    ).idxmax(axis=1)
    cluster_ids = pd.Series(fitted_model.predict(features))
    return cluster_ids.map(cluster_to_species).to_numpy()


# KMeans is unsupervised: `fit` ignores any `y`, so only the features are passed.
model = KMeans(n_clusters=3)

# Petal measurements only.
model.fit(train_x_p)
prediction = _predict_species(model, train_y_p, test_x_p)
print('The accuracy of the KMeans using Petals is:', metrics.accuracy_score(test_y_p, prediction))

# Sepal measurements only (refitting the same estimator discards the petal fit).
model.fit(train_x_s)
prediction = _predict_species(model, train_y_s, test_x_s)
print('The accuracy of the KMeans using Sepals is:', metrics.accuracy_score(test_y_s, prediction))
The accuracy of the KMeans using Petals is: 0.4
The accuracy of the KMeans using Sepals is: 0.13333333333333333
# Summarise the most recently fitted model: one row per cluster with its
# centre coordinates followed by the number of samples assigned to it.
r1 = pd.Series(model.labels_).value_counts()  # samples per cluster id
r2 = pd.DataFrame(model.cluster_centers_)  # cluster centres, one row per cluster
# Join side by side (axis=0 would stack vertically); rows are aligned on the cluster id.
r = pd.concat([r2, r1], axis=1)
print(r)
0 1 0
0 3.108571 6.885714 35
1 3.330556 4.988889 36
2 2.676471 5.829412 34