机器学习分类算法

1. 引入所需库

import numpy as np
import matplotlib as mpl
from matplotlib import colors
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn import model_selection

2. 加载数据、切分数据集

# Convert a species label into its integer class id.
def iris_type(s):
    """Map an Iris species label to an integer class id.

    Meant to be used as a ``converters`` callback for ``np.loadtxt``.
    Depending on the NumPy version and the ``encoding`` argument, the
    converter is handed either ``bytes`` or ``str``, so both are accepted.

    Args:
        s: Species label, e.g. ``b'Iris-setosa'`` or ``'Iris-setosa'``.
            Leading/trailing whitespace is ignored.

    Returns:
        0 for Iris-setosa, 1 for Iris-versicolor, 2 for Iris-virginica.

    Raises:
        KeyError: If the label is not one of the three known species.
    """
    # Normalise to str so a single lookup table serves both input types.
    label = s.decode('utf-8') if isinstance(s, bytes) else s
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[label.strip()]

# Load the dataset.
# np.loadtxt arguments used here: the path of the data file; dtype, the element
# type of the resulting array; delimiter, the field separator; converters, a
# mapping from 0-based column index to a conversion function. Column 4 (the
# fifth column) is passed through iris_type to turn the species string into a
# number.
data = np.loadtxt(
    'data/data97795/Iris.data',
    delimiter=',',
    dtype=float,
    converters={4: iris_type},
)

# Separate features from labels, then split into training and test sets.
x = data[:, :2]  # only the first two feature columns are used
y = data[:, 4:]  # label column, kept two-dimensional
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=1
)

3. 构建 SVM 分类器，定义训练函数

# Build the SVM classifier.
def classifer():
    """Create the (unfitted) SVM classifier used by this script.

    Settings passed to ``svm.SVC``:

    * ``C=0.8`` -- penalty coefficient for the error term. A larger C
      penalises training errors more heavily, which raises training accuracy
      but makes overfitting more likely; a smaller C penalises them less and
      favours generalisation (usually preferred for noisy samples).
    * ``kernel='linear'`` -- the kernel function. Other choices include the
      polynomial and Gaussian (RBF) kernels.
    * ``decision_function_shape='ovr'`` -- request a one-vs-rest shaped
      decision function (one column per class). ``'ovo'`` would instead give
      the one-vs-one shape with k(k-1)/2 columns for k classes.

    Returns:
        A new ``svm.SVC`` instance configured as above.
    """
    return svm.SVC(kernel='linear', C=0.8, decision_function_shape='ovr')

# Training helper: fit the classifier on the training features and labels.
def train(clf, x_train, y_train):
    """Fit ``clf`` in place on the given training data.

    Args:
        clf: Classifier object exposing ``fit(features, labels)``.
        x_train: Training features, passed to ``fit`` unchanged.
        y_train: Training labels; flattened with ``ravel()`` before fitting.
    """
    flat_labels = y_train.ravel()
    clf.fit(x_train, flat_labels)

# Create the SVM model and fit it on the training split.
clf = classifer()
train(clf, x_train, y_train)

4. 初始化分类器实例，训练模型

def show_accuracy(a, b, tip):
    """Print the fraction of positions at which ``a`` and ``b`` agree.

    Args:
        a: Array of predicted values; flattened before comparison.
        b: Array of reference values; flattened before comparison.
        tip: Label placed in front of the printed accuracy.
    """
    predicted, expected = a.ravel(), b.ravel()
    hits = predicted == expected
    # The mean of the boolean match array is the accuracy.
    rate = np.mean(hits)
    print('%s accuracy:%.3f' % (tip, rate))

5. 展示训练结果及验证结果

def print_accuracy(clf, x_train, y_train, x_test, y_test):
    """Print accuracy on the training and test sets, plus sample decision values.

    While developing a model it is useful to keep an eye on its results on
    both the training set and the test set, so both are reported here.

    Args:
        clf: Fitted classifier exposing ``score``, ``predict`` and
            ``decision_function``.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
    """
    # 1. Accuracy as computed by the classifier's own score method.
    print('training prediction:%.3f' % (clf.score(x_train, y_train)))
    print('test prediction:%.3f' % (clf.score(x_test, y_test)))

    # 2. The same measurement computed by the show_accuracy helper.
    show_accuracy(clf.predict(x_train), y_train, 'training data')
    show_accuracy(clf.predict(x_test), y_test, 'testing data')

    # [:2] is a NumPy slice selecting the first two rows (indices 0 and 1).
    # The label ends with a real newline so the array starts on its own line
    # (the escape had been lost, leaving a literal "n").
    print('decision_function:\n', clf.decision_function(x_train)[:2])
# Report training/test accuracy for the classifier fitted above.
print_accuracy(clf, x_train, y_train, x_test, y_test)

6. 可视化

# Drawing the region assigned to each class requires dense sampling of the
# feature plane. The dataset itself has only 150 rows, which would give a
# coarse picture, so a large grid of synthetic sample points is generated and
# the class regions are drawn from the classifier's predictions on that grid.

def draw(clf, x):
    """Plot the classifier's decision regions over the first two features.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``decision_function``.
        x: 2-D feature array; columns 0 and 1 span the plotted plane.

    Note:
        The scatter colours are taken from the module-level ``y`` and the
        highlighted points from the module-level ``x_test``; both must be
        defined before this function is called.
    """
    iris_feature = 'sepal length', ' sepal width', 'petal length', 'petal width'
    # Only the first two features are used for classification and plotting.
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of feature 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of feature 1

    # Build the sampling grid. An imaginary "step" (200j) asks np.mgrid for 200
    # evenly spaced points including both end points; a real step would be
    # treated as a stride and collapse the axis to a single sample, so BOTH
    # axes must use the ``j`` suffix.
    x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]

    # Flatten the two grids and pair them up: each row of grid_test is one
    # sample whose two entries are its first and second feature.
    grid_test = np.stack((x1.ravel(), x2.ravel()), axis=1)
    print('grid_test:\n', grid_test[:2])

    z = clf.decision_function(grid_test)
    print("the distance to decision plane:\n", z[:2])

    # With enough sample points the class boundaries show up in fine detail.
    # Predicted class for every grid point.
    grid_hat = clf.predict(grid_test)
    print('grid_hat:\n', grid_hat)
    # Give grid_hat the same shape as x1 so it can be drawn as a mesh.
    grid_hat = grid_hat.reshape(x1.shape)

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'b', 'r'])

    # Background: predicted class regions.
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    # Foreground: the samples in x, coloured by the module-level labels y.
    plt.scatter(x[:, 0], x[:, 1], c=np.squeeze(y), edgecolors='k', s=50, cmap=cm_dark)
    # Larger markers drawn on top for the module-level test samples.
    plt.scatter(x_test[:, 0], x_test[:, 1], s=120, facecolor='none', zorder=10)
    plt.xlabel(iris_feature[0], fontsize=20)
    plt.ylabel(iris_feature[1], fontsize=20)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title('Iris data classification via SVM', fontsize=30)
    plt.grid()
    plt.show()

# Plot the decision regions of the fitted classifier over the two features in x.
draw(clf, x)

机器学习分类算法

Python相关栏目本月热门文章