一种简单但广泛使用的分类器,它通过训练数据构建决策树,对未知的数据进行分类。决策树的每个内部节点表示在一个属性上的测试,每个分支代表该测试的一个输出,而每个叶节点存放着一个类标号。
ID3算法:该模型的输入为完整的数据集,目标值(类标号)位于特征列之后的最后一列。
信息熵:
def calEnt(dataSet):
    """Shannon entropy of the label column (the last column of dataSet).

    Parameters: dataSet -- pandas DataFrame whose last column holds class labels.
    Returns: the information entropy as a float.
    """
    total = dataSet.shape[0]
    # Frequency of each class label, converted to probabilities.
    probs = dataSet.iloc[:, -1].value_counts() / total
    # H = -sum(p * log2(p)) over all classes.
    return (-probs * np.log2(probs)).sum()
信息增益:
"""
根据信息增益选择出最佳数据集切分的列
"""
def bestSplit(dataSet):
baseEnt = calEnt(dataSet)
bestGain = 0
axis = -1
for i in range(dataSet.shape[1]-1):
levels = dataSet.iloc[:,i].value_counts().index
ents = 0
for j in levels:
childSet = dataSet[dataSet.iloc[:,i]==j]
ent = calEnt(childSet)
ents += (childSet.shape[0]/dataSet.shape[0])*ent
infoGain = baseEnt - ents
#print(f'第{i}列的信息熵为{infoGain}')
if(infoGain > bestGain):
bestGain = infoGain
axis = i
return axis
完整实现ID3算法:
用到的函数如下:
calEnt(dataSet)#计算信息熵 bestSplit(dataSet)#根据信息增益最大的选择出最佳数据集切分的列 mySplit(dataSet,axis,value)#按照给定的列划分数据集 createTree(dataSet)#基于最大信息增益切分数据集,递归构建决策树 classify(inputTree, labels, testVec)#对一个测试实例进行分类 acc_classify(train,test)#对测试集进行预测,并返回预测后的结果
"""
计算信息熵
"""
def calEnt(dataSet):
n = dataSet.shape[0]
iset = dataSet.iloc[:,-1].value_counts()
p = iset / n
ent = (-p * np.log2(p)).sum()
return ent
"""
根据信息增益选择出最佳数据集切分的列
"""
def bestSplit(dataSet):
baseEnt = calEnt(dataSet)
bestGain = 0
axis = -1
for i in range(dataSet.shape[1]-1):
levels = dataSet.iloc[:,i].value_counts().index
ents = 0
for j in levels:
childSet = dataSet[dataSet.iloc[:,i]==j]
ent = calEnt(childSet)
ents += (childSet.shape[0]/dataSet.shape[0])*ent
infoGain = baseEnt - ents
#print(f'第{i}列的信息熵为{infoGain}')
if(infoGain > bestGain):
bestGain = infoGain
axis = i
return axis
"""
按照给定的列划分数据集
"""
def mySplit(dataSet,axis,value):
col = dataSet.columns[axis]
redataSet = dataSet.loc[dataSet[col]==value,:].drop(col,axis=1)
return redataSet
"""
函数功能:基于最大信息增益切分数据集,递归构建决策树
"""
def createTree(dataSet):
featlist = list(dataSet.columns)
classlist = dataSet.iloc[:,-1].value_counts()
if classlist[0]==dataSet.shape[0] or dataSet.shape[1]==1:
return classlist.index[0]
axis = bestSplit(dataSet)
bestfeat = featlist[axis]
myTree = {bestfeat:{}}
#del featlist[axis]
valuelist = set(dataSet.iloc[:,axis])
for value in valuelist:
myTree[bestfeat][value] = createTree(mySplit(dataSet,axis,value))
return myTree
"""
函数功能:对一个测试实例进行分类
"""
def classify(inputTree, labels, testVec):
firstStr = next(iter(inputTree))
secondDict = inputTree[firstStr]
featIndex = labels.index(firstStr)
for key in secondDict.keys():
if testVec[featIndex] == key:
if type(secondDict[key]) == dict:
classLabel = classify(secondDict[key],labels,testVec)
else:
classLabel = secondDict[key]
return classLabel
"""
函数功能:对测试集进行预测,并返回预测后的结果
"""
def acc_classify(train,test):
inputTree = createTree(train)
labels = list(train.columns)
result = []
for i in range(test.shape[0]):
testVec = test.iloc[i,:-1]
classLabel = classify(inputTree,labels,testVec)
result.append(classLabel)
test['perdict'] =result
acc = (test.iloc[:,-1]== test.iloc[:,-2]).mean()
print(f'模型预测准确率为{acc}')
return test
绘制决策树ID3(graphviz):
sklearn 不能直接处理字符串类型的特征,因此需要先将字符串数据转换为数值型数据:
处理函数如下:
# Encode feature columns: replace each string with its index among the
# column's unique values (sklearn cannot consume string features).
Xtrain1 = train1.iloc[:, :-1]
for col in Xtrain1.columns:
    labels = Xtrain1[col].unique().tolist()
    Xtrain1[col] = Xtrain1[col].apply(labels.index)
# Encode the label column the same way.
Ytrain1 = train1.iloc[:, -1]
labels = Ytrain1.unique().tolist()
Ytrain1 = Ytrain1.apply(labels.index)
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz
# Features: already numeric in this dataset, so no encoding is needed.
Xtrain = dataSet.iloc[:, :-1]
# Labels: map each class value to an integer index for sklearn.
Ytrain = dataSet.iloc[:, -1]
labels = Ytrain.unique().tolist()
Ytrain = Ytrain.apply(labels.index)
# Fit a sklearn decision tree and export it in graphviz DOT format.
clf = DecisionTreeClassifier()
clf = clf.fit(Xtrain, Ytrain)
tree.export_graphviz(clf)
# Re-export with feature/class names and colors for a readable graph.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=['no surfacing', 'flippers'],
    class_names=['fish', 'not fish'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graphviz.Source(dot_data)
C4.5算法(本文未实现):
信息增益率:
基尼值:
def Gini(dataSet):
    """Gini impurity of the label (last) column: 1 - sum(p_k^2).

    Parameters: dataSet -- DataFrame whose last column holds class labels.
    Returns: the Gini value as a float (0 for a pure node).
    """
    # Empirical class probabilities.
    probs = dataSet.iloc[:, -1].value_counts() / dataSet.shape[0]
    return 1 - (probs ** 2).sum()
基尼指数:



