Build a simple neural network that separates points on a 2D plane. The input dimension is 2×1 (two features); the network structure is shown in the figure below:
The output of the first layer, Layer_1, is:
$$L_1 = f(w_1x_1 + w_3x_2 + b_1) \\ L_2 = f(w_2x_1 + w_4x_2 + b_2)$$
where $f(\cdot)$ denotes the sigmoid activation function and $b$ is a bias term.
The final output of the network is:
$$pred = f(w_5L_1 + w_6L_2 + b_3)$$
We optimize the parameters using backpropagation and gradient descent.
First, define the loss function as the mean squared error:
$$Loss = \frac{1}{N}\sum_{i=1}^{N}(y_i - pred_i)^2$$
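As a quick illustration (toy numbers of my own, not from the dataset), the MSE is just the average squared residual:

import numpy as np

y = np.array([1.0, 0.0, 1.0])       # true labels
pred = np.array([0.8, 0.2, 0.6])    # network outputs
mse = np.mean((y - pred) ** 2)      # (0.04 + 0.04 + 0.16) / 3 = 0.08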
To keep things simple, we optimize on one sample at a time, i.e. $N = 1$, so:
$$Loss = (y - pred)^2$$
The point of backpropagation is to compute the partial derivative of the loss with respect to each $w$, which tells us whether to increase or decrease $w$ in order to reduce the loss.
Taking $w_1$ as an example, differentiate the loss with respect to $w_1$ using the chain rule:
$$\frac{\partial Loss}{\partial w_1} = \frac{\partial Loss}{\partial pred}\times\frac{\partial pred}{\partial w_1} = \frac{\partial Loss}{\partial pred}\times\frac{\partial pred}{\partial L_1}\times\frac{\partial L_1}{\partial w_1}$$
Clearly,
$$\frac{\partial Loss}{\partial pred} = -2(y - pred)$$
The remaining two factors involve the derivative of the sigmoid function, so let us first work out the sigmoid function and its derivative:
$$f(x) = \frac{1}{1+e^{-x}} \\ f'(x) = \frac{e^{-x}}{(1+e^{-x})^2} = f(x)\times(1 - f(x))$$
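A quick numerical sanity check of this identity (a small sketch I am adding, not part of the original derivation) compares $f'(x)$ against a central finite difference:

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

x, eps = 0.7, 1e-6
analytic = sigmoid(x) * (1 - sigmoid(x))
numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
print(analytic, numeric)  # the two values agree to many decimal places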
We can then derive:
$$\frac{\partial pred}{\partial L_1} = w_5\times f(w_5L_1 + w_6L_2 + b_3)\times(1 - f(w_5L_1 + w_6L_2 + b_3))$$
$$\frac{\partial L_1}{\partial w_1} = x_1\times f(w_1x_1 + w_3x_2 + b_1)\times(1 - f(w_1x_1 + w_3x_2 + b_1))$$
Multiplying these three expressions together gives the gradient formula for $w_1$, written out in full below; the other $w$'s and $b$'s can be derived in the same way.
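Using the shorthand $out_1 = w_1x_1 + w_3x_2 + b_1$ and $out = w_5L_1 + w_6L_2 + b_3$ (the same names used in the code below), the full gradient of $w_1$ is:

$$\frac{\partial Loss}{\partial w_1} = -2(y - pred)\times w_5\,f(out)(1 - f(out))\times x_1\,f(out_1)(1 - f(out_1))$$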
Finally, update each original $w$ and $b$ by subtracting the computed gradient multiplied by the learning rate; this is the gradient descent algorithm.
2. A simple code implementation
Dataset: the iris dataset, loaded through scikit-learn. We take only two of its features and two of its classes; plotted, the data looks like the figure below:
The code can be written directly from the formulas derived above.
Pay close attention to the signs when writing the code. I mistyped them a few times, which caused a bug where the loss would not decrease; if you write this yourself, carefully check that the signs in your formulas are correct.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_part(x):
    # derivative of the sigmoid: f'(x) = f(x) * (1 - f(x))
    fx = sigmoid(x)
    return fx * (1 - fx)

class my_NN:
    def __init__(self):
        # random initialization
        self.w1 = np.random.randn()
        self.w2 = np.random.randn()
        self.w3 = np.random.randn()
        self.w4 = np.random.randn()
        self.w5 = np.random.randn()
        self.w6 = np.random.randn()
        self.b1 = 0
        self.b2 = 0
        self.b3 = 0

    def forward(self, x):
        # use this function to predict on new data
        L1 = sigmoid(self.w1 * x[0] + self.w3 * x[1] + self.b1)
        L2 = sigmoid(self.w2 * x[0] + self.w4 * x[1] + self.b2)
        out = sigmoid(self.w5 * L1 + self.w6 * L2 + self.b3)
        return out

    def train(self, train_data, EPOCHS, lr):
        Loss = []
        for epoch in range(EPOCHS):
            temp_loss = 0
            for i, data in enumerate(train_data):
                x = data[:2]
                y = data[-1]
                # forward pass
                out_1 = self.w1 * x[0] + self.w3 * x[1] + self.b1
                out_2 = self.w2 * x[0] + self.w4 * x[1] + self.b2
                L1 = sigmoid(out_1)
                L2 = sigmoid(out_2)
                out = self.w5 * L1 + self.w6 * L2 + self.b3
                pred = sigmoid(out)
                # compute the loss
                loss = (y - pred) ** 2
                temp_loss += loss
                # backpropagation: compute the gradients
                dw_1 = -2 * (y - pred) * self.w5 * sigmoid_part(out) * x[0] * sigmoid_part(out_1)
                dw_2 = -2 * (y - pred) * self.w6 * sigmoid_part(out) * x[0] * sigmoid_part(out_2)
                db_1 = -2 * (y - pred) * self.w5 * sigmoid_part(out) * sigmoid_part(out_1)
                dw_3 = -2 * (y - pred) * self.w5 * sigmoid_part(out) * x[1] * sigmoid_part(out_1)
                dw_4 = -2 * (y - pred) * self.w6 * sigmoid_part(out) * x[1] * sigmoid_part(out_2)
                db_2 = -2 * (y - pred) * self.w6 * sigmoid_part(out) * sigmoid_part(out_2)
                dw_5 = -2 * (y - pred) * sigmoid_part(out) * L1
                dw_6 = -2 * (y - pred) * sigmoid_part(out) * L2
                db_3 = -2 * (y - pred) * sigmoid_part(out)
                # gradient descent update
                self.w1 -= lr * dw_1
                self.w2 -= lr * dw_2
                self.w3 -= lr * dw_3
                self.w4 -= lr * dw_4
                self.w5 -= lr * dw_5
                self.w6 -= lr * dw_6
                self.b1 -= lr * db_1
                self.b2 -= lr * db_2
                self.b3 -= lr * db_3
            temp_loss /= len(train_data)
            Loss.append(temp_loss)
            print('EPOCH:{}, loss:{:.4f}'.format(epoch, temp_loss))
        return Loss

def get_iris_data():
    iris = datasets.load_iris()
    iris_x = iris.data[:, :2]  # take only the first two features
    iris_y = iris.target
    df = pd.DataFrame(iris_x)
    df.columns = iris.feature_names[:2]
    df['label'] = iris_y
    df = df.loc[df['label'] != 2]  # drop the third class, keeping two classes
    df = df.sample(frac=1)  # shuffle the data
    return df.values

if __name__ == "__main__":
    nn = my_NN()
    lr = 0.01
    EPOCHS = 500
    train_data = get_iris_data()
    Loss = nn.train(train_data, EPOCHS, lr)
    plt.plot(np.arange(1, EPOCHS + 1, 1), Loss)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
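One way to catch the sign mistakes mentioned above (a sketch of my own, not part of the original post; it assumes the definitions from the script above) is to compare an analytic gradient against a finite-difference estimate before training:

# compare the analytic gradient of w5 with a central-difference estimate
nn = my_NN()
x, y = np.array([5.1, 3.5]), 1.0   # one hypothetical sample
eps = 1e-5

out_1 = nn.w1 * x[0] + nn.w3 * x[1] + nn.b1
out_2 = nn.w2 * x[0] + nn.w4 * x[1] + nn.b2
out = nn.w5 * sigmoid(out_1) + nn.w6 * sigmoid(out_2) + nn.b3
dw_5 = -2 * (y - sigmoid(out)) * sigmoid_part(out) * sigmoid(out_1)

nn.w5 += eps
loss_plus = (y - nn.forward(x)) ** 2
nn.w5 -= 2 * eps
loss_minus = (y - nn.forward(x)) ** 2
nn.w5 += eps  # restore the original weight
numeric = (loss_plus - loss_minus) / (2 * eps)
print(dw_5, numeric)  # the two values should match closely if the formula's signs are right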
The resulting loss curve:
One could go further and split the data into training and test sets, then use the trained parameters $w$ and $b$ to make predictions and measure the network's accuracy.
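For example (a hypothetical sketch; the 80/20 split ratio and the 0.5 threshold are my choices, not from the original post):

data = get_iris_data()
split = int(0.8 * len(data))            # 80/20 train/test split
train_part, test_part = data[:split], data[split:]

nn = my_NN()
nn.train(train_part, EPOCHS=500, lr=0.01)

correct = 0
for row in test_part:
    pred = nn.forward(row[:2])          # sigmoid output in (0, 1)
    label = 1 if pred >= 0.5 else 0     # threshold at 0.5
    correct += (label == row[-1])
print('accuracy: {:.2%}'.format(correct / len(test_part)))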
3. Optimizing with matrix form
We notice that in the derivation above, the computations for $w_1, w_2, w_3, w_4$ are nearly identical, as are those for $w_5, w_6$, while $b_1, b_2$ are exactly the same; the code is highly redundant. We can therefore use matrix multiplication to update several parameters at once. The derivation follows.
Previously, the output of the first layer was:
$$L_1 = f(w_1x_1 + w_3x_2 + b_1) \\ L_2 = f(w_2x_1 + w_4x_2 + b_2)$$
To express this in matrix form, first define the first-layer parameters:
$$W_1 = \begin{bmatrix} w_1 & w_3 \\ w_2 & w_4 \end{bmatrix},\quad X = \begin{bmatrix} x_1 \\ x_2 \end{bmatrix},\quad b_1 = \begin{bmatrix} b_1 \\ b_2 \end{bmatrix}$$
Then the first layer's output can be written as:
$$L_1 = f(W_1X + b_1)$$
Similarly, define the output-layer parameters:
$$W_2 = \begin{bmatrix} w_5 & w_6 \end{bmatrix},\quad b_2 = 0$$
The final output of the network is then:
$$pred = f(W_2L_1 + b_2)$$
Compute the loss:
$$Loss = (y - pred)^2$$
Taking $W_1$ as an example, differentiate the loss (you may need to look up the rules of matrix differentiation here):
$$\frac{\partial Loss}{\partial W_1} = \frac{\partial Loss}{\partial pred}\times\frac{\partial pred}{\partial W_1} = \frac{\partial Loss}{\partial pred}\times\frac{\partial pred}{\partial L_1}\times\frac{\partial L_1}{\partial W_1}$$
Clearly,
$$\frac{\partial Loss}{\partial pred} = -2(y - pred)$$
$$\frac{\partial pred}{\partial L_1} = [f(W_2L_1 + b_2)\times(1 - f(W_2L_1 + b_2))]W_2^T$$
$$\frac{\partial L_1}{\partial W_1} = [f(W_1X + b_1)\times(1 - f(W_1X + b_1))]X^T$$
Multiplying the three expressions above gives the gradient of $W_1$; scaling it by the learning rate and subtracting it from $W_1$ is the gradient descent update.
Note that $W_1$ is a $2\times 2$ matrix, so we are updating the original four parameters at once. The computation itself is unchanged, but the amount of code is greatly reduced.
When working in matrix form, check the matrix dimensions at every step to make sure each result has the shape you expect, and consult the rules of matrix differentiation when necessary to decide whether a transpose is needed.
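For example, a quick shape check (an illustrative sketch with random values, my addition) confirms that the gradient of $W_1$ comes out as $2\times 2$, the same shape as $W_1$ itself:

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_part(x):
    fx = sigmoid(x)
    return fx * (1 - fx)

W1, b1 = np.random.randn(2, 2), np.zeros((2, 1))
W2, b2 = np.random.randn(1, 2), 0.0
x, y = np.random.randn(2, 1), 1.0

out_1 = W1.dot(x) + b1               # (2, 1)
out = W2.dot(sigmoid(out_1)) + b2    # (1, 1)
pred = sigmoid(out)                  # (1, 1)

dW1 = (-2 * (y - pred)) * (W2.T * sigmoid_part(out)) * sigmoid_part(out_1).dot(x.T)
print(dW1.shape)  # (2, 2), matching W1 as required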
4. Matrix-form code implementation
We again use the iris dataset for binary classification training:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_part(x):
    # derivative of the sigmoid: f'(x) = f(x) * (1 - f(x))
    fx = sigmoid(x)
    return fx * (1 - fx)

class my_NN:
    def __init__(self):
        # random initialization
        self.w1 = np.random.randn(2, 2)  # first layer: input dim 2, output dim 2
        self.w2 = np.random.randn(1, 2)  # output layer: input dim 2, output dim 1; stored as (1, 2) for convenient dot products
        self.b1 = np.zeros((2, 1))
        self.b2 = 0

    def forward(self, x):
        # use this function to predict on new data; x is expected as a (1, 2) row vector so that x.T is a column
        L1 = sigmoid(self.w1.dot(x.T) + self.b1)  # output of shape 2*1
        out = sigmoid(self.w2.dot(L1) + self.b2)  # output of shape 1*1
        return out

    def train(self, train_data, EPOCHS, lr):
        Loss = []
        for epoch in range(EPOCHS):
            temp_loss = 0
            for i, data in enumerate(train_data):
                x = np.reshape(data[:2], (2, -1))
                y = data[-1]
                # forward pass
                out_1 = self.w1.dot(x) + self.b1
                L1 = sigmoid(out_1)
                out = self.w2.dot(L1) + self.b2
                pred = sigmoid(out)
                # compute the loss
                loss = (y - pred) ** 2  # loss is a 1*1 matrix here
                loss = np.squeeze(loss)  # convert loss to a scalar
                temp_loss += loss
                # backpropagation: compute the gradients
                dw_1 = (-2 * (y - pred)) * (self.w2.T * sigmoid_part(out)) * (sigmoid_part(out_1).dot(x.T))
                db_1 = (-2 * (y - pred)) * (self.w2.T * sigmoid_part(out)) * sigmoid_part(out_1)
                dw_2 = -2 * (y - pred) * sigmoid_part(out) * L1.T
                db_2 = -2 * (y - pred) * sigmoid_part(out)
                # gradient descent update
                self.w1 -= lr * dw_1
                self.w2 -= lr * dw_2
                self.b1 -= lr * db_1
                self.b2 -= lr * db_2
            temp_loss /= len(train_data)
            Loss.append(temp_loss)
            print('EPOCH:{}, loss:{:.4f}'.format(epoch, temp_loss))
        return Loss

def get_iris_data():
    iris = datasets.load_iris()
    iris_x = iris.data[:, :2]  # take only the first two features
    iris_y = iris.target
    df = pd.DataFrame(iris_x)
    df.columns = iris.feature_names[:2]
    df['label'] = iris_y
    df = df.loc[df['label'] != 2]  # drop the third class, keeping two classes
    df = df.sample(frac=1)  # shuffle the data
    return df.values

if __name__ == "__main__":
    nn = my_NN()
    lr = 0.01
    EPOCHS = 500
    train_data = get_iris_data()
    Loss = nn.train(train_data, EPOCHS, lr)
    plt.plot(np.arange(1, EPOCHS + 1, 1), Loss)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
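As a possible further step (my own sketch, not part of the original post; it assumes the variables from the script above), the same matrix form also lets us evaluate the whole batch in a single forward pass, with X holding all samples as columns:

X = train_data[:, :2].T                  # shape (2, N): one column per sample
L1 = sigmoid(nn.w1.dot(X) + nn.b1)       # (2, N)
preds = sigmoid(nn.w2.dot(L1) + nn.b2)   # (1, N): predictions for all N samples at once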
The loss curve:
As we can see, under the same computational conditions, the matrix form greatly reduces the amount of code and is also easier to understand.