一种简单但广泛使用的分类器,它通过训练数据构建决策树,对未知的数据进行分类。决策树的每个内部节点表示在一个属性上的测试,每个分支代表该测试的一个输出,而每个叶节点存放着一个类标号。
ID3算法:该模型的输入为完整的数据集,目标值(类标号)位于特征列之后的最后一列。
信息熵:
def calEnt(dataSet):
    """Shannon entropy of the label column (the last column of dataSet).

    Parameters: dataSet -- pandas DataFrame whose last column holds class labels.
    Returns: the information entropy as a float.
    """
    total = dataSet.shape[0]
    # Frequency of each class label, converted to probabilities.
    probs = dataSet.iloc[:, -1].value_counts() / total
    # H = -sum(p * log2(p)) over all classes.
    return (-probs * np.log2(probs)).sum()
信息增益:
"""
根据信息增益选择出最佳数据集切分的列
"""
def bestSplit(dataSet):
baseEnt = calEnt(dataSet)
bestGain = 0
axis = -1
for i in range(dataSet.shape[1]-1):
levels = dataSet.iloc[:,i].value_counts().index
ents = 0
for j in levels:
childSet = dataSet[dataSet.iloc[:,i]==j]
ent = calEnt(childSet)
ents += (childSet.shape[0]/dataSet.shape[0])*ent
infoGain = baseEnt - ents
#print(f'第{i}列的信息熵为{infoGain}')
if(infoGain > bestGain):
bestGain = infoGain
axis = i
return axis
完整实现ID3算法:
用到的函数如下:
calEnt(dataSet)#计算信息熵 bestSplit(dataSet)#根据信息增益最大的选择出最佳数据集切分的列 mySplit(dataSet,axis,value)#按照给定的列划分数据集 createTree(dataSet)#基于最大信息增益切分数据集,递归构建决策树 classify(inputTree, labels, testVec)#对一个测试实例进行分类 acc_classify(train,test)#对测试集进行预测,并返回预测后的结果
"""
计算信息熵
"""
def calEnt(dataSet):
n = dataSet.shape[0]
iset = dataSet.iloc[:,-1].value_counts()
p = iset / n
ent = (-p * np.log2(p)).sum()
return ent
"""
根据信息增益选择出最佳数据集切分的列
"""
def bestSplit(dataSet):
baseEnt = calEnt(dataSet)
bestGain = 0
axis = -1
for i in range(dataSet.shape[1]-1):
levels = dataSet.iloc[:,i].value_counts().index
ents = 0
for j in levels:
childSet = dataSet[dataSet.iloc[:,i]==j]
ent = calEnt(childSet)
ents += (childSet.shape[0]/dataSet.shape[0])*ent
infoGain = baseEnt - ents
#print(f'第{i}列的信息熵为{infoGain}')
if(infoGain > bestGain):
bestGain = infoGain
axis = i
return axis
"""
按照给定的列划分数据集
"""
def mySplit(dataSet,axis,value):
col = dataSet.columns[axis]
redataSet = dataSet.loc[dataSet[col]==value,:].drop(col,axis=1)
return redataSet
"""
函数功能:基于最大信息增益切分数据集,递归构建决策树
"""
def createTree(dataSet):
featlist = list(dataSet.columns)
classlist = dataSet.iloc[:,-1].value_counts()
if classlist[0]==dataSet.shape[0] or dataSet.shape[1]==1:
return classlist.index[0]
axis = bestSplit(dataSet)
bestfeat = featlist[axis]
myTree = {bestfeat:{}}
#del featlist[axis]
valuelist = set(dataSet.iloc[:,axis])
for value in valuelist:
myTree[bestfeat][value] = createTree(mySplit(dataSet,axis,value))
return myTree
"""
函数功能:对一个测试实例进行分类
"""
def classify(inputTree, labels, testVec):
firstStr = next(iter(inputTree))
secondDict = inputTree[firstStr]
featIndex = labels.index(firstStr)
for key in secondDict.keys():
if testVec[featIndex] == key:
if type(secondDict[key]) == dict:
classLabel = classify(secondDict[key],labels,testVec)
else:
classLabel = secondDict[key]
return classLabel
"""
函数功能:对测试集进行预测,并返回预测后的结果
"""
def acc_classify(train,test):
inputTree = createTree(train)
labels = list(train.columns)
result = []
for i in range(test.shape[0]):
testVec = test.iloc[i,:-1]
classLabel = classify(inputTree,labels,testVec)
result.append(classLabel)
test['perdict'] =result
acc = (test.iloc[:,-1]== test.iloc[:,-2]).mean()
print(f'模型预测准确率为{acc}')
return test
绘制决策树ID3(graphviz):
sklearn 不能直接处理字符串类型的特征,因此需要先将字符串数据转换为数值型数据:
处理函数如下:
# Encode feature columns: replace each string with its index among the
# column's unique values (sklearn cannot consume string features).
Xtrain1 = train1.iloc[:, :-1]
for col in Xtrain1.columns:
    labels = Xtrain1[col].unique().tolist()
    Xtrain1[col] = Xtrain1[col].apply(labels.index)
# Encode the label column the same way.
Ytrain1 = train1.iloc[:, -1]
labels = Ytrain1.unique().tolist()
Ytrain1 = Ytrain1.apply(labels.index)
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz
# Features: already numeric in this dataset, so no encoding is needed.
Xtrain = dataSet.iloc[:, :-1]
# Labels: map each class value to an integer index for sklearn.
Ytrain = dataSet.iloc[:, -1]
labels = Ytrain.unique().tolist()
Ytrain = Ytrain.apply(labels.index)
# Fit a sklearn decision tree and export it in graphviz DOT format.
clf = DecisionTreeClassifier()
clf = clf.fit(Xtrain, Ytrain)
tree.export_graphviz(clf)
# Re-export with feature/class names and colors for a readable graph.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=['no surfacing', 'flippers'],
    class_names=['fish', 'not fish'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graphviz.Source(dot_data)
C4.5算法(本文未实现):
信息增益率:
基尼值:
def Gini(dataSet):
    """Gini impurity of the label (last) column: 1 - sum(p_k^2).

    Parameters: dataSet -- DataFrame whose last column holds class labels.
    Returns: the Gini value as a float (0 for a pure node).
    """
    # Empirical class probabilities.
    probs = dataSet.iloc[:, -1].value_counts() / dataSet.shape[0]
    return 1 - (probs ** 2).sum()
基尼指数:



