参考文章:https://blog.csdn.net/qq_30091945/article/details/81508055
作为机器学习的初学者,我最近简单实现了高斯判别分析(GDA)。代码还有很多不足之处,欢迎大家指出。
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Author:
# Time:2021-11-21
# 参考文章:https://blog.csdn.net/qq_30091945/article/details/81508055
import numpy as np
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# 高斯判别分析的类
class GDA:
    """Binary Gaussian Discriminant Analysis with a shared covariance matrix.

    The model assumes ``y ~ Bernoulli(phi)`` and ``x | y=k ~ N(mu_k, Sigma)``.
    All parameters are the closed-form maximum-likelihood estimates computed
    by :meth:`cal_param`.

    Attributes:
        train_data: Training samples, one row per sample.
        train_labels: Training labels; any truthy value counts as class 1.
        bonuli: Estimated Bernoulli parameter ``phi = P(y = 1)``.
        miu0: Mean vector of class 0 (1-D ``ndarray`` after fitting).
        miu1: Mean vector of class 1 (1-D ``ndarray`` after fitting).
        cov: Pooled covariance matrix (2-D ``ndarray`` after fitting).
    """

    # Ridge term added to the covariance diagonal before it is inverted.
    # A covariance matrix is positive semi-definite, so adding RIDGE * I
    # shifts every eigenvalue up by RIDGE and makes the matrix strictly
    # positive definite (hence invertible, with a non-zero determinant).
    RIDGE = 0.001

    def __init__(self, train_data, train_labels):
        """Store the training set; parameters stay at 0 until fitted."""
        self.train_data = train_data
        self.train_labels = train_labels
        self.bonuli = 0  # Bernoulli parameter of y
        self.miu0 = 0  # mean of the Gaussian for y = 0
        self.miu1 = 0  # mean of the Gaussian for y = 1
        self.cov = 0  # shared covariance matrix of both Gaussians

    def cal_param(self):
        """Estimate ``bonuli``, ``miu0``, ``miu1`` and ``cov`` from the training set.

        Raises:
            ValueError: If the data is not 2-D, if the number of labels does
                not match the number of samples, or if either class has no
                training sample (its mean would be undefined).
        """
        features = np.asarray(self.train_data, dtype=float)
        is_positive = np.asarray(self.train_labels).astype(bool).ravel()
        if features.ndim != 2:
            raise ValueError("train_data must be 2-D (n_samples, n_features)")
        if features.shape[0] != is_positive.shape[0]:
            raise ValueError("train_data and train_labels differ in length")

        positive_num = int(is_positive.sum())
        negative_num = features.shape[0] - positive_num
        if positive_num == 0 or negative_num == 0:
            raise ValueError("both classes need at least one training sample")

        miu1 = features[is_positive].mean(axis=0)
        miu0 = features[~is_positive].mean(axis=0)

        # Centre every sample on the mean of ITS OWN class, then pool the
        # scatter of both classes and divide by the total sample count.
        centered = np.where(is_positive[:, None], features - miu1, features - miu0)
        cov = centered.T @ centered / features.shape[0]

        self.bonuli = positive_num / (positive_num + negative_num)
        self.miu0 = miu0
        self.miu1 = miu1
        self.cov = cov

    def _log_gauss(self, x, miu, cov):
        """Return the log-density of ``N(miu, cov + RIDGE * I)`` at ``x``."""
        cov = np.atleast_2d(np.asarray(cov, dtype=float))
        dim = cov.shape[0]
        regularized = cov + np.eye(dim) * self.RIDGE
        diff = np.asarray(x, dtype=float).ravel() - np.asarray(miu, dtype=float).ravel()
        # slogdet returns log|det|, which avoids overflow/underflow of det.
        _, log_abs_det = np.linalg.slogdet(regularized)
        # Solve a linear system instead of forming the explicit inverse.
        mahalanobis = diff @ np.linalg.solve(regularized, diff)
        return -0.5 * (dim * np.log(2 * np.pi) + log_abs_det + mahalanobis)

    def gauss_model(self, x, miu, cov):
        """Evaluate the multivariate Gaussian density at a single sample.

        Args:
            x: Sample vector of length ``d`` (1-D array or row vector).
            miu: Mean vector of length ``d`` (1-D array or row vector).
            cov: ``d x d`` covariance matrix; ``RIDGE * I`` is added before
                the determinant and inverse are taken.

        Returns:
            The density value as a Python float.
        """
        return float(np.exp(self._log_gauss(x, miu, cov)))

    def prediction(self, test_data, test_labels=None):
        """Fit on the stored training set and classify ``test_data``.

        Args:
            test_data: Samples to classify, one row per sample.
            test_labels: Only its length is used, to bound how many rows of
                ``test_data`` are classified. May be omitted, in which case
                every row is classified.

        Returns:
            A list of ``0``/``1`` predictions, one per classified sample.
        """
        self.cal_param()
        samples = np.asarray(test_data, dtype=float)
        count = len(samples) if test_labels is None else np.shape(test_labels)[0]

        # Compare log p(x|y) + log p(y) rather than the raw products: far
        # from both means the densities underflow to 0.0 and would tie.
        log_prior1 = np.log(self.bonuli)
        log_prior0 = np.log(1.0 - self.bonuli)
        predict_labels = []
        for sample in samples[:count]:
            positive_score = self._log_gauss(sample, self.miu1, self.cov) + log_prior1
            negative_score = self._log_gauss(sample, self.miu0, self.cov) + log_prior0
            predict_labels.append(1 if positive_score >= negative_score else 0)
        return predict_labels
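# Loss function
# GDA needs no iteratively minimised loss: its parameters are the closed-form maximum-likelihood estimates of the joint likelihood p(x, y).
# Plotting
# Plotting is not implemented here. With a shared covariance matrix the decision boundary is linear in x, so for 2-D data it could be drawn as a straight line.
# Data loading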
def load_data(path=r'E:/data/textset.txt'):
    """Load a whitespace-separated data set of two features plus a label.

    Each non-blank line must hold at least three numeric fields: the first
    two are the features, the third is the class label (converted to int).
    Blank lines are skipped.

    Args:
        path: Location of the text file, read as UTF-8. Defaults to the
            path the script was originally written for.

    Returns:
        A tuple ``(datax, labels)`` where ``datax`` is a float ``ndarray`` of
        shape ``(n_samples, 2)`` and ``labels`` is a list of ints.
    """
    datax = []
    labels = []
    # The context manager closes the file even if a line fails to parse.
    with open(path, encoding='UTF-8') as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                continue
            values = list(map(float, fields))
            datax.append([values[0], values[1]])
            labels.append(int(values[2]))
    # reshape keeps the (n, 2) layout even when the file has no samples.
    return np.array(datax, dtype=float).reshape(-1, 2), labels
# 主函数
def main():
    """Compare GDA with logistic regression on one random 90/10 split.

    Loads the data set, holds out 10% of it as a test set, then prints the
    test accuracy of the hand-written GDA classifier followed by that of
    scikit-learn's LogisticRegression.
    """
    features, targets = load_data()
    # The split returns the features as arrays and the labels as lists.
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.1, random_state=None
    )

    # Gaussian discriminant analysis
    gda_model = GDA(x_train, y_train)
    gda_guesses = gda_model.prediction(x_test, y_test)
    print("高斯判别分析的准确率为:", accuracy_score(y_test, gda_guesses))

    # Logistic regression baseline
    lr_model = LogisticRegression()
    lr_model.fit(x_train, y_train)
    lr_guesses = lr_model.predict(x_test)
    print("逻辑回归的准确率为:", accuracy_score(y_test, lr_guesses))
# Script entry point: run the comparison only when executed directly.
if __name__ == "__main__":
    main()
结果显示:
小结:
由于各参数的计算公式已经由数学推导直接给出(闭式解),而且这份代码的通用性较弱(只处理二维特征的二分类问题),所以整体实现并不难。
对于GDA和LR:GDA假设p(x|y)服从高斯分布、p(y)服从伯努利分布,对数据的要求更高;但当这些假设近似成立时,在数据量较少、维度较低的情况下效果往往更好。
LR所需的假设较弱,不要求数据服从高斯分布,在数据量较大时效果通常会更好一些。



