这次作业主要是体会如何根据训练集验证集选择模型参数。
给出的数据为一个水库的流出水量以及水库水位,数据被分为了训练集、验证集、测试集。
首先读取数据,代码如下。
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from scipy.io import loadmat
from scipy.optimize import minimize

# Load the reservoir dataset (already split into train / validation / test).
path = 'C:/Users/ASUS/Desktop/ex5data1.mat'
data = loadmat(path)
data.keys()
# dict_keys(['__header__', '__version__', '__globals__', 'X', 'y', 'Xtest', 'ytest', 'Xval', 'yval'])
# Shape inspection:
# data['X'].shape, data['y'].shape          # train:      X (12, 1),    y (12, 1)
# data['Xtest'].shape, data['ytest'].shape  # test:       Xtest (21,1), ytest (21, 1)
data['Xval'].shape, data['yval'].shape      # validation: Xval (21,1),  yval (21, 1)
之后用线性回归建模看看效果,代码如下。
# Unpack the three splits and prepend a bias column of ones to each X.
x_train, y_train = data['X'], data['y']
x_val, y_val = data['Xval'], data['yval']
x_test, y_test = data['Xtest'], data['ytest']
x_train, x_val, x_test = (
    np.insert(m, 0, 1, axis=1) for m in (x_train, x_val, x_test)
)
def cost_reg(theta, x, y, lamda):
    """Regularized linear-regression cost J(theta).

    Parameters
    ----------
    theta : 1-D array of shape (n,) — weights; theta[0] is the bias.
    x : array of shape (m, n) — design matrix (first column is the bias column).
    y : array of shape (m, 1) — targets.
    lamda : regularization strength; the bias theta[0] is NOT regularized.

    Returns the scalar cost.
    """
    # Fix: np.mat/np.matrix is deprecated in NumPy — use a plain ndarray
    # column vector instead; the arithmetic is otherwise unchanged.
    theta = np.asarray(theta).ravel()
    m = x.shape[0]
    err = x @ theta.reshape(-1, 1) - y          # (m, 1) residuals
    cost = np.sum(np.power(err, 2)) / (2 * m)
    cost = cost + lamda * np.sum(theta[1:] ** 2) / (2 * m)
    return cost
def gradient_reg(theta, x, y, lamda):
    """Gradient of the regularized cost with respect to theta.

    Parameters
    ----------
    theta : 1-D array of shape (n,).
    x : array of shape (m, n); y : array of shape (m, 1).
    lamda : regularization strength; the bias theta[0] is NOT regularized.

    Returns a 1-D array of shape (n,) — suitable as `jac` for scipy minimize.
    """
    # Fix: np.mat/np.matrix is deprecated in NumPy — use a plain ndarray
    # column vector instead; the arithmetic is otherwise unchanged.
    theta = np.asarray(theta, dtype=float).ravel()
    m = x.shape[0]
    err = x @ theta.reshape(-1, 1) - y          # (m, 1) residuals
    grad = np.asarray(x.T @ err).ravel() / m
    reg = lamda * theta / m                     # fresh array — safe to edit in place
    reg[0] = 0                                  # bias term is not regularized
    return grad + reg
def training(X, y, lamda):
    """Fit regularized linear regression by minimizing cost_reg.

    X : (m, n) design matrix (bias column included); y : (m, 1) targets;
    lamda : regularization strength. Returns the optimized theta, shape (n,).
    """
    initial_theta = np.ones(X.shape[1])
    result = minimize(
        fun=cost_reg,
        x0=initial_theta,
        args=(X, y, lamda),
        method='TNC',
        jac=gradient_reg,
    )
    return result.x
# Fit unregularized linear regression and plot the line over the raw data.
theta_train = training(x_train, y_train, 0)
predictions = x_train @ theta_train
fig, ax = plt.subplots()
ax.scatter(x_train[:, 1], y_train)                      # raw data points
ax.plot(x_train[:, 1], predictions, color='r')          # fitted straight line
plt.show()
从图像上可以看出拟合效果不好,模型欠拟合。
接着看看训练样本数目对模型效果的影响,代码如下。
# Learning curves: train on 1..m examples (no regularization) and compare
# the training error against the validation error.
def val_train(x_train, y_train, x_val, y_val, theta_train):
    """Plot training/validation loss versus the number of training examples.

    NOTE(review): the `theta_train` argument is overwritten on the first loop
    iteration and never read — it exists only for signature compatibility.
    """
    train_loss, val_loss = [], []
    m = x_train.shape[0]
    for size in range(1, m + 1):
        theta_train = training(x_train[:size, :], y_train[:size], 0)
        # Report unregularized error on both sets (lamda=0 for evaluation).
        train_loss.append(cost_reg(theta_train, x_train[:size, :], y_train[:size], 0))
        val_loss.append(cost_reg(theta_train, x_val, y_val, 0))
    fig, ax = plt.subplots()
    ax.plot(train_loss, label='train_loss')
    ax.plot(val_loss, label='val_loss')
    plt.xlabel('yangben')
    plt.ylabel('loss')
    ax.legend()
    plt.show()


val_train(x_train, y_train, x_val, y_val, theta_train)
# Both curves plateau at a high loss, so the model underfits.
#由于训练集和验证集损失均较大因此模型欠拟合
从图像上可以看出当训练样本较少时,训练集误差较小,验证集误差很大,随着训练样本数目的增加,训练集上误差逐渐变大,而验证集上误差逐渐变小。
然后尝试给模型增加特征项来优化模型,代码如下。
#模型欠拟合,添加多项式特征来优化模型 n表示最高添加到x^n次项
def poly_feature(x, n):
    """Append polynomial columns x^2 .. x^n to the single-column matrix x.

    The expanded matrix is z-score normalized before being returned, because
    the higher powers have much larger magnitudes.
    """
    expanded = x
    for power in range(2, n + 1):
        # Insert the column x^power right after the previous power.
        expanded = np.insert(expanded, power - 1, np.power(x[:, 0], power), axis=1)
    return normalize(expanded)
#添加的特征有的值太大,需要进行归一化处理
def normalize(x):
    """Z-score each column of x: (x - mean) / std, population std (ddof=0)."""
    return (x - x.mean(axis=0)) / x.std(axis=0)
# Expand every split to degree-6 polynomial features and prepend the bias
# column, then redraw the learning curves with the richer model.
# NOTE(review): each split is normalized with its own mean/std inside
# poly_feature; standard practice is to reuse the training-set statistics —
# confirm this is intentional.
x = data['X']
x_train_temp = poly_feature(x, 6)
x_train_new = np.insert(x_train_temp, 0, 1, axis=1)
x_val_temp = poly_feature(data['Xval'], 6)
x_val_new = np.insert(x_val_temp, 0, 1, axis=1)
x_test_temp = poly_feature(data['Xtest'], 6)
x_test_new = np.insert(x_test_temp, 0, 1, axis=1)
# Train without regularization and plot training vs. validation error.
theta_train_new = training(x_train_new, y_train, 0)
val_train(x_train_new, y_train, x_val_new, y_val, theta_train_new)
#添加特征后训练集损失很小而验证集损失较大,模型过拟合
增加特征项的训练集误差和验证集误差随样本数据变化如上图,可以看出模型过拟合。
最后尝试找到合适的lamda,来优化模型,代码如下。
#下面找到合适的lamda加入正则化
# Sweep regularization strengths and compare train/validation loss to pick
# a suitable lamda.
lamda_list = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
train_reg_loss, val_reg_loss = [], []
for lam in lamda_list:
    theta_train_temp = training(x_train_new, y_train, lam)
    # Evaluate with lamda=0 so the curves show the unregularized error.
    train_reg_loss.append(cost_reg(theta_train_temp, x_train_new, y_train, 0))
    val_reg_loss.append(cost_reg(theta_train_temp, x_val_new, y_val, 0))
fig, ax = plt.subplots()
# Fix: the x-axis is labeled 'lamda', so plot against the actual lamda
# values rather than the list index 0..9.
ax.plot(lamda_list, train_reg_loss, label='train_loss')
ax.plot(lamda_list, val_reg_loss, label='val_loss')
plt.xlabel('lamda')
plt.ylabel('loss')
ax.legend()
plt.show()
可以看出当lamda大约是1的时候训练集和验证集损失均较小,此时模型较好。



