这次练习我们将会看到如何使用课上的方法改进机器学习算法,包括过拟合、欠拟合的状态判断以及学习曲线的绘制。
import numpy as np import scipy.io as sio import scipy.optimize as opt import pandas as pd import matplotlib.pyplot as plt import seaborn as sns
def load_data():
d = sio.loadmat('ex5data1.mat')
return map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']]) # 降维
X, y, Xval, yval, Xtest, ytest = load_data()
df = pd.DataFrame({'water_level':X, 'flow':y})
sns.lmplot('water_level', 'flow', data = df, fit_reg = False, height = 7)
plt.show()
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis = 1) for x in (X, Xval, Xtest)]代价函数
J ( θ ) = 1 2 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) 2 J(theta)=frac {1}{2m}sum_{i=1}^{m}(h_theta(x^{(i)})-y^{(i)})^2 J(θ)=2m1i=1∑m(hθ(x(i))−y(i))2
def cost(theta, X, y):
# INPUT:参数值theta,数据X,标签y
# OUTPUT:当前参数值下代价函数
# TODO:根据参数和输入的数据计算代价函数
# STEP1:获取样本个数
m = X.shape[0]
# STEP2:计算代价函数
inner = X @ theta - y
square_sum = inner.T @ inner
cost = square_sum / (2 * m)
return cost
theta = np.ones(X.shape[1]) cost(theta, X, y)
303.9515255535976梯度
θ j : = θ j − α ∂ ∂ θ j J ( θ ) theta_j := theta_j - alpha frac {partial}{partial theta_j}J(theta) θj:=θj−α∂θj∂J(θ)
def gradient(theta, X, y):
# INPUT:参数值theta,数据X,标签y
# OUTPUT:当前参数值下梯度
# TODO:根据参数和输入的数据计算梯度
# STEP1:获取样本个数
m = X.shape[0]
# STEP2:计算代价函数
grad = (X.T @ (X @ theta - y)) / m
return grad
gradient(theta, X, y)
array([-15.30301567, 598.16741084])正则化梯度与代价函数
∂
J
(
θ
)
∂
θ
0
=
1
m
∑
i
=
1
m
(
h
θ
(
x
(
i
)
)
−
y
(
i
)
)
x
j
(
i
)
frac {partial J(theta)}{partial theta_0}=frac {1}{m} sum_{i=1}^{m}(h_theta(x^{(i)})-y^{(i)})x_j^{(i)}
∂θ0∂J(θ)=m1∑i=1m(hθ(x(i))−y(i))xj(i) for
j
=
0
j=0
j=0
∂
J
(
θ
)
∂
θ
j
=
(
1
m
∑
i
=
1
m
(
h
θ
(
x
(
i
)
)
−
y
(
i
)
)
x
j
(
i
)
)
+
λ
m
θ
j
frac {partial J(theta)}{partial theta_j}=(frac {1}{m}sum_{i=1}^{m}(h_theta(x^{(i)})-y^{(i)})x_j^{(i)}) + frac {lambda}{m}theta_j
∂θj∂J(θ)=(m1∑i=1m(hθ(x(i))−y(i))xj(i))+mλθj for
j
≥
1
jgeq 1
j≥1
def regularized_gradient(theta, X, y, l = 1):
# INPUT:参数值theta,数据X,标签y
# OUTPUT:当前参数值下梯度
# TODO:根据参数和输入的数据计算梯度
# STEP1:获取样本个数
m = X.shape[0]
# STEP2:计算正则化梯度
regularized_term = theta.copy()
regularized_term[0] = 0
regularized_term = (l / m) * regularized_term
return gradient(theta, X, y) + regularized_term
regularized_gradient(theta, X, y)
array([-15.30301567, 598.25074417])
def regularized_cost(theta, X, y, l = 1):
m = X.shape[0]
regularized_term = (l / (2 * m)) * np.power(theta[1:], 2).sum()
return cost(theta, X, y) + regularized_term
拟合数据
正则化项 λ = 0 lambda = 0 λ=0
def linear_regression_np(X, y, l = 1):
# INPUT:数据X,标签y,正则化参数l
# OUTPUT:当前参数值下梯度
# TODO:根据参数和输入的数据计算梯度
# STEP1:初始化参数
theta = np.ones(X.shape[1])
# STEP2:调用优化算法拟合参数
res = opt.minimize(fun = regularized_cost,
x0 = theta,
args = (X, y, l),
method = 'TNC',
jac = regularized_gradient,
options = {'disp':True})
return res
theta = np.ones(X.shape[1])
final_theta = linear_regression_np(X, y, l = 0).get('x')
b = final_theta[0] # intercept m = final_theta[1] # slope plt.scatter(X[:, 1], y, label = 'Training data') plt.plot(X[:, 1], X[:, 1] * m + b, label = 'Prediction') plt.legend(loc = 2) plt.show()
training_cost, cv_cost = [], []
- 使用训练集的子集来拟合模型
- 在计算训练代价和交叉验证代价时,没有用正则化
- 记住使用相同的训练集子集来计算训练代价
Tip:向数组里添加新元素可使用append函数
# TODO:计算训练代价和交叉验证集代价
# STEP1:获取样本个数,遍历每个样本
m = X.shape[0]
for i in range(1, m + 1):
# STEP2:计算当前样本的代价
res = linear_regression_np(X[:i, :], y[:i], l = 0)
to = regularized_cost(res.x, X[:i, :], y[:i], l = 0)
cv = regularized_cost(res.x, Xval, yval,l = 0)
training_cost.append(to)
cv_cost.append(cv)
plt.plot(np.arange(1, m + 1), training_cost, label = 'training cost') plt.plot(np.arange(1, m + 1), cv_cost, label = 'cv cost') plt.legend(loc = 1) plt.show()
这个模型拟合不太好,欠拟合了
创建多项式特征def prepare_poly_data(*args, power):
def prepare(x):
# 特征映射
df = poly_features(x, power = power)
# 归一化处理
ndarr = normalize_feature(df).values
# 添加偏置项
return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis = 1)
return [prepare(x) for x in args]
def poly_features(x, power, as_ndarray = False): # 特征映射
data = {'f{}'.format(i):np.power(x, i) for i in range(1, power + 1)}
df = pd.DataFrame(data)
return df.values if as_ndarray else df
X, y, Xval, yval, Xtest, ytest = load_data()
poly_features(X, power = 3)
| f1 | f2 | f3 | |
|---|---|---|---|
| 0 | -15.936758 | 253.980260 | -4047.621971 |
| 1 | -29.152979 | 849.896197 | -24777.006175 |
| 2 | 36.189549 | 1309.683430 | 47396.852168 |
| 3 | 37.492187 | 1405.664111 | 52701.422173 |
| 4 | -48.058829 | 2309.651088 | -110999.127750 |
| 5 | -8.941458 | 79.949670 | -714.866612 |
| 6 | 15.307793 | 234.328523 | 3587.052500 |
| 7 | -34.706266 | 1204.524887 | -41804.560890 |
| 8 | 1.389154 | 1.929750 | 2.680720 |
| 9 | -44.383760 | 1969.918139 | -87432.373590 |
| 10 | 7.013502 | 49.189211 | 344.988637 |
| 11 | 22.762749 | 518.142738 | 11794.353058 |
- 扩展特征到8阶,或者你需要的阶数
- 使用 归一化 来合并 x n x^n xn
- 不要忘记添加偏置项
def normalize_feature(df):
return df.apply(lambda column: (column - column.mean()) / column.std())
X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power = 8) X_poly[:3, :]
array([[ 1.00000000e+00, -3.62140776e-01, -7.55086688e-01,
1.82225876e-01, -7.06189908e-01, 3.06617917e-01,
-5.90877673e-01, 3.44515797e-01, -5.08481165e-01],
[ 1.00000000e+00, -8.03204845e-01, 1.25825266e-03,
-2.47936991e-01, -3.27023420e-01, 9.33963187e-02,
-4.35817606e-01, 2.55416116e-01, -4.48912493e-01],
[ 1.00000000e+00, 1.37746700e+00, 5.84826715e-01,
1.24976856e+00, 2.45311974e-01, 9.78359696e-01,
-1.21556976e-02, 7.56568484e-01, -1.70352114e-01]])
画出学习曲线
首先。我们没有使用正则化,所以 λ = 0 lambda = 0 λ=0
def plot_learning_curve(X, y, Xval, yval, l = 0):
# INPUT:训练数据集X,y,交叉验证集Xval,yval,正则化参数l
# OUTPUT:当前参数值下梯度
# TODO:根据参数和输入的数据计算梯度
# STEP1:初始化参数,获取样本个数,开始遍历
training_cost, cv_cost = [], []
m = X.shape[0]
for i in range(1, m + 1):
# STEP2:调用之前写好的拟合数据函数进行数据拟合
res = linear_regression_np(X[:i, :], y[:i], l = l)
# STEP3:计算样本代价
tc = cost(res.x, X[:i, :], y[:i])
cv = cost(res.x, Xval, yval)
# STEP4:把计算结果存储至预先定义的数据training_cost, cv_cost中
training_cost.append(tc)
cv_cost.append(cv)
plt.plot(np.arange(1, m + 1), training_cost, label = 'training cost')
plt.plot(np.arange(1, m + 1), cv_cost, label = 'cv cost')
plt.legend(loc = 1)
plot_learning_curve(X_poly, y, Xval_poly, yval, l = 0) plt.show()
那可以看到训练的代价太低了,不真实,这是过拟合了
try λ = 1 lambda = 1 λ=1plot_learning_curve(X_poly, y, Xval_poly, yval, l = 1) plt.show()
训练代价增加了些,不再是0了,也就是说我们减轻过拟合
try λ = 100 lambda = 100 λ=100plot_learning_curve(X_poly, y, Xval_poly, yval, l = 100) plt.show()
太多正则化了。
变成 欠拟合 状态
l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10] training_cost, cv_cost = [], []
for l in l_candidate:
res = linear_regression_np(X_poly, y, l)
tc = cost(res.x, X_poly, y)
cv = cost(res.x, Xval_poly, yval)
training_cost.append(tc)
cv_cost.append(cv)
plt.plot(l_candidate, training_cost, label = 'training cost')
plt.plot(l_candidate, cv_cost, label = 'cv cost')
plt.legend(loc = 2)
plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()
l_candidate[np.argmin(cv_cost)]
1
for l in l_candidate:
theta = linear_regression_np(X_poly, y, l).x
print('test cost(l={}) = {}'.format(l, cost(theta, Xtest_poly, ytest)))
test cost(l=0) = 10.055426362410126 test cost(l=0.001) = 11.001927632262907 test cost(l=0.003) = 11.26474655167747 test cost(l=0.01) = 10.880780731411715 test cost(l=0.03) = 10.022100517865269 test cost(l=0.1) = 8.63190793331871 test cost(l=0.3) = 7.3366077892272585 test cost(l=1) = 7.466283751156784 test cost(l=3) = 11.643941860536106 test cost(l=10) = 27.715080254176254
调参后,
λ
=
0.3
lambda = 0.3
λ=0.3 是最优选择,这个时候预测代价最小
y, y, l).x
print(‘test cost(l={}) = {}’.format(l, cost(theta, Xtest_poly, ytest)))
test cost(l=0) = 10.055426362410126
test cost(l=0.001) = 11.001927632262907
test cost(l=0.003) = 11.26474655167747
test cost(l=0.01) = 10.880780731411715
test cost(l=0.03) = 10.022100517865269
test cost(l=0.1) = 8.63190793331871
test cost(l=0.3) = 7.3366077892272585
test cost(l=1) = 7.466283751156784
test cost(l=3) = 11.643941860536106
test cost(l=10) = 27.715080254176254
调参后,$lambda = 0.3$ 是最优选择,这个时候预测代价最小



