Python吴恩达机器学习作业 5 - 偏差和方差

编程作业5 - 偏差和方差

这次练习我们将会看到如何使用课上的方法改进机器学习算法，包括过拟合、欠拟合的状态判断以及学习曲线的绘制。

import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def load_data():
    d = sio.loadmat('ex5data1.mat')
    return map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']]) # 降维

X, y, Xval, yval, Xtest, ytest = load_data()

df = pd.DataFrame({'water_level':X, 'flow':y})

sns.lmplot('water_level', 'flow', data = df, fit_reg = False, height = 7)
plt.show()

X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis = 1) for x in (X, Xval, Xtest)]

代价函数

J ( θ ) = 1 2 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) 2 J(theta)=frac {1}{2m}sum_{i=1}^{m}(h_theta(x^{(i)})-y^{(i)})^2 J(θ)=2m1i=1∑m(hθ(x(i))−y(i))2

def cost(theta, X, y):
    # INPUT：参数值theta，数据X，标签y
    # OUTPUT：当前参数值下代价函数
    # TODO：根据参数和输入的数据计算代价函数
    
    # STEP1：获取样本个数
    m = X.shape[0]
    
    # STEP2：计算代价函数
    inner = X @ theta - y
    square_sum = inner.T @ inner
    cost = square_sum / (2 * m)
    
    return cost

theta = np.ones(X.shape[1])
cost(theta, X, y)

303.9515255535976

梯度

θ j : = θ j − α ∂ ∂ θ j J ( θ ) theta_j := theta_j - alpha frac {partial}{partial theta_j}J(theta) θj:=θj−α∂θj∂J(θ)

def gradient(theta, X, y):
    # INPUT：参数值theta，数据X，标签y
    # OUTPUT：当前参数值下梯度
    # TODO：根据参数和输入的数据计算梯度
    
    # STEP1：获取样本个数
    m = X.shape[0]
    
    # STEP2：计算代价函数
    grad = (X.T @ (X @ theta - y)) / m
    return grad

gradient(theta, X, y)

array([-15.30301567, 598.16741084])

正则化梯度与代价函数

∂ J ( θ ) ∂ θ 0 = 1 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) x j ( i ) frac {partial J(theta)}{partial theta_0}=frac {1}{m} sum_{i=1}^{m}(h_theta(x^{(i)})-y^{(i)})x_j^{(i)} ∂θ0∂J(θ)=m1∑i=1m(hθ(x(i))−y(i))xj(i) for j = 0 j=0 j=0

∂ J ( θ ) ∂ θ j = ( 1 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) x j ( i ) ) + λ m θ j frac {partial J(theta)}{partial theta_j}=(frac {1}{m}sum_{i=1}^{m}(h_theta(x^{(i)})-y^{(i)})x_j^{(i)}) + frac {lambda}{m}theta_j ∂θj∂J(θ)=(m1∑i=1m(hθ(x(i))−y(i))xj(i))+mλθj for j ≥ 1 jgeq 1 j≥1

def regularized_gradient(theta, X, y, l = 1):
    # INPUT：参数值theta，数据X，标签y
    # OUTPUT：当前参数值下梯度
    # TODO：根据参数和输入的数据计算梯度
    
    # STEP1：获取样本个数
    m = X.shape[0]
    
    # STEP2：计算正则化梯度
    regularized_term = theta.copy()
    regularized_term[0] = 0
    
    regularized_term = (l / m) * regularized_term
    
    return gradient(theta, X, y) + regularized_term

regularized_gradient(theta, X, y)

array([-15.30301567, 598.25074417])

def regularized_cost(theta, X, y, l = 1):
    m = X.shape[0]
    
    regularized_term = (l / (2 * m)) * np.power(theta[1:], 2).sum()
    
    return cost(theta, X, y) + regularized_term

拟合数据

正则化项 λ = 0 lambda = 0 λ=0

def linear_regression_np(X, y, l = 1):
    # INPUT：数据X，标签y，正则化参数l
    # OUTPUT：当前参数值下梯度
    # TODO：根据参数和输入的数据计算梯度
    
    # STEP1：初始化参数
    theta = np.ones(X.shape[1])
    
    # STEP2：调用优化算法拟合参数
    res = opt.minimize(fun = regularized_cost,
                      x0 = theta,
                      args = (X, y, l),
                      method = 'TNC',
                      jac = regularized_gradient,
                      options = {'disp':True})
    return res

theta = np.ones(X.shape[1])

final_theta = linear_regression_np(X, y, l = 0).get('x')

b = final_theta[0] # intercept
m = final_theta[1] # slope

plt.scatter(X[:, 1], y, label = 'Training data')
plt.plot(X[:, 1], X[:, 1] * m + b, label = 'Prediction')
plt.legend(loc = 2)
plt.show()

training_cost, cv_cost = [], []

使用训练集的子集来拟合模型
在计算训练代价和交叉验证代价时，没有用正则化
记住使用相同的训练集子集来计算训练代价

Tip：向数组里添加新元素可使用append函数

# TODO：计算训练代价和交叉验证集代价
# STEP1：获取样本个数，遍历每个样本
m = X.shape[0]
for i in range(1, m + 1):
    # STEP2：计算当前样本的代价
    res = linear_regression_np(X[:i, :], y[:i], l = 0)
    to = regularized_cost(res.x, X[:i, :], y[:i], l = 0)
    cv = regularized_cost(res.x, Xval, yval,l = 0)
    
    training_cost.append(to)
    cv_cost.append(cv)

plt.plot(np.arange(1, m + 1), training_cost, label = 'training cost')
plt.plot(np.arange(1, m + 1), cv_cost, label = 'cv cost')
plt.legend(loc = 1)
plt.show()

这个模型拟合不太好，欠拟合了

创建多项式特征

def prepare_poly_data(*args, power):
    def prepare(x):
        # 特征映射
        df = poly_features(x, power = power)
        
        # 归一化处理
        ndarr = normalize_feature(df).values
        
        # 添加偏置项
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis = 1)
    return [prepare(x) for x in args]

def poly_features(x, power, as_ndarray = False): # 特征映射
    data = {'f{}'.format(i):np.power(x, i) for i in range(1, power + 1)}
    df = pd.DataFrame(data)
    
    return df.values if as_ndarray else df

X, y, Xval, yval, Xtest, ytest = load_data()

poly_features(X, power = 3)

	f1	f2	f3
0	-15.936758	253.980260	-4047.621971
1	-29.152979	849.896197	-24777.006175
2	36.189549	1309.683430	47396.852168
3	37.492187	1405.664111	52701.422173
4	-48.058829	2309.651088	-110999.127750
5	-8.941458	79.949670	-714.866612
6	15.307793	234.328523	3587.052500
7	-34.706266	1204.524887	-41804.560890
8	1.389154	1.929750	2.680720
9	-44.383760	1969.918139	-87432.373590
10	7.013502	49.189211	344.988637
11	22.762749	518.142738	11794.353058

准备多项式回归数据

扩展特征到8阶，或者你需要的阶数
使用归一化来合并 x n x^n xn
不要忘记添加偏置项

def normalize_feature(df):
    return df.apply(lambda column: (column - column.mean()) / column.std())

X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power = 8)
X_poly[:3, :]

array([[ 1.00000000e+00, -3.62140776e-01, -7.55086688e-01,
         1.82225876e-01, -7.06189908e-01,  3.06617917e-01,
        -5.90877673e-01,  3.44515797e-01, -5.08481165e-01],
       [ 1.00000000e+00, -8.03204845e-01,  1.25825266e-03,
        -2.47936991e-01, -3.27023420e-01,  9.33963187e-02,
        -4.35817606e-01,  2.55416116e-01, -4.48912493e-01],
       [ 1.00000000e+00,  1.37746700e+00,  5.84826715e-01,
         1.24976856e+00,  2.45311974e-01,  9.78359696e-01,
        -1.21556976e-02,  7.56568484e-01, -1.70352114e-01]])

画出学习曲线

首先。我们没有使用正则化，所以 λ = 0 lambda = 0 λ=0

def plot_learning_curve(X, y, Xval, yval, l = 0):
    # INPUT：训练数据集X，y，交叉验证集Xval，yval，正则化参数l
    # OUTPUT：当前参数值下梯度
    # TODO：根据参数和输入的数据计算梯度
    
    # STEP1：初始化参数，获取样本个数，开始遍历
    training_cost, cv_cost = [], []
    m = X.shape[0]
    for i in range(1, m + 1):
        # STEP2：调用之前写好的拟合数据函数进行数据拟合
        res = linear_regression_np(X[:i, :], y[:i], l = l)
        # STEP3：计算样本代价
        tc = cost(res.x, X[:i, :], y[:i])
        cv = cost(res.x, Xval, yval)
        # STEP4：把计算结果存储至预先定义的数据training_cost, cv_cost中
        training_cost.append(tc)
        cv_cost.append(cv)
        
    plt.plot(np.arange(1, m + 1), training_cost, label = 'training cost')
    plt.plot(np.arange(1, m + 1), cv_cost, label = 'cv cost')
    plt.legend(loc = 1)

plot_learning_curve(X_poly, y, Xval_poly, yval, l = 0)
plt.show()

那可以看到训练的代价太低了，不真实，这是过拟合了

try λ = 1 lambda = 1 λ=1

plot_learning_curve(X_poly, y, Xval_poly, yval, l = 1)
plt.show()

训练代价增加了些，不再是0了，也就是说我们减轻过拟合

try λ = 100 lambda = 100 λ=100

plot_learning_curve(X_poly, y, Xval_poly, yval, l = 100)
plt.show()

太多正则化了。

变成欠拟合状态

找到最佳的 λ lambda λ

l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cv_cost = [], []

for l in l_candidate:
    res = linear_regression_np(X_poly, y, l)
    
    tc = cost(res.x, X_poly, y)
    cv = cost(res.x, Xval_poly, yval)
    
    training_cost.append(tc)
    cv_cost.append(cv)

plt.plot(l_candidate, training_cost, label = 'training cost')
plt.plot(l_candidate, cv_cost, label = 'cv cost')
plt.legend(loc = 2)

plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()

l_candidate[np.argmin(cv_cost)]

for l in l_candidate:
    theta = linear_regression_np(X_poly, y, l).x
    print('test cost(l={}) = {}'.format(l, cost(theta, Xtest_poly, ytest)))

test cost(l=0) = 10.055426362410126
test cost(l=0.001) = 11.001927632262907
test cost(l=0.003) = 11.26474655167747
test cost(l=0.01) = 10.880780731411715
test cost(l=0.03) = 10.022100517865269
test cost(l=0.1) = 8.63190793331871
test cost(l=0.3) = 7.3366077892272585
test cost(l=1) = 7.466283751156784
test cost(l=3) = 11.643941860536106
test cost(l=10) = 27.715080254176254

调参后， λ = 0.3 lambda = 0.3 λ=0.3 是最优选择，这个时候预测代价最小
y, y, l).x
print(‘test cost(l={}) = {}’.format(l, cost(theta, Xtest_poly, ytest)))

    test cost(l=0) = 10.055426362410126
    test cost(l=0.001) = 11.001927632262907
    test cost(l=0.003) = 11.26474655167747
    test cost(l=0.01) = 10.880780731411715
    test cost(l=0.03) = 10.022100517865269
    test cost(l=0.1) = 8.63190793331871
    test cost(l=0.3) = 7.3366077892272585
    test cost(l=1) = 7.466283751156784
    test cost(l=3) = 11.643941860536106
    test cost(l=10) = 27.715080254176254
    

调参后，$lambda = 0.3$ 是最优选择，这个时候预测代价最小

Python吴恩达机器学习作业 5 - 偏差和方差

Python相关栏目本月热门文章