import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
import matplotlib.pyplot as plt
# Load the training and test tables from disk.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
print(train_data.shape)
print(test_data.shape)
# Peek at the first four rows: first four and last three columns.
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

# Stack train and test features so both receive identical preprocessing.
# The first column of each table is dropped, and so is the last column of
# the training table (presumably the Id column and the SalePrice label --
# verify against the CSV headers).
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
print(all_features.shape)

# Standardize every non-object column to zero mean and unit variance.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std()
)
# After standardization each column has mean 0, so filling missing values
# with 0 is equivalent to imputing the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the remaining (categorical) columns; dummy_na=True gives
# missing values their own indicator column.  dtype=float is required:
# recent pandas versions emit bool dummies by default, which turns
# ``.values`` into an object array that torch.tensor() cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
print(all_features.shape)

# Convert to tensors: the first n_train rows are the training set, the
# remainder the test set.
n_train = train_data.shape[0]
train_features = torch.tensor(all_features.iloc[:n_train].values,
                              dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].values,
                             dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1),
                            dtype=torch.float32)

# Shared training loss and model input width, used by the functions below.
loss = nn.MSELoss()
in_features = train_features.shape[1]
def get_net():
    """Build a fresh, untrained linear regression model.

    Returns:
        An ``nn.Sequential`` wrapping a single ``nn.Linear`` layer that maps
        ``in_features`` inputs (a module-level value) to one output.
    """
    linear_layer = nn.Linear(in_features, 1)
    return nn.Sequential(linear_layer)
# Evaluation metric
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable model; ``net(features)`` produces the predictions.
        features: Input tensor passed to ``net``.
        labels: Target tensor; its logarithm is taken, so values must be
            positive for the result to be finite.

    Returns:
        The metric as a Python float.
    """
    # Pure evaluation: the result is reduced to a float, so there is no
    # reason to record an autograd graph for this forward pass.
    with torch.no_grad():
        # Clamp predictions to at least 1 so the logarithm stays
        # well-defined (and non-negative) even for tiny or negative outputs.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimize in place.
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Held-out inputs; only used when ``test_labels`` is
            not ``None``.
        test_labels: Held-out targets, or ``None`` to skip evaluation.
        num_epochs: Number of passes over the training iterator.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        batch_size: Batch size handed to ``d2l.load_array``.

    Returns:
        ``(train_history, test_history)``: per-epoch log-RMSE lists.  The
        second list is empty when ``test_labels`` is ``None``.
    """
    batches = d2l.load_array((train_features, train_labels), batch_size)
    adam = torch.optim.Adam(
        net.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
    )
    evaluate_held_out = test_labels is not None

    train_history = []
    test_history = []
    for _ in range(num_epochs):
        for X_batch, y_batch in batches:
            adam.zero_grad()
            batch_loss = loss(net(X_batch), y_batch)
            batch_loss.backward()
            adam.step()
        # Metrics are computed once per epoch on the full tensors.
        train_history.append(log_rmse(net, train_features, train_labels))
        if evaluate_held_out:
            test_history.append(log_rmse(net, test_features, test_labels))
    return train_history, test_history
def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into training and validation parts for fold ``i``.

    The first ``k * (X.shape[0] // k)`` rows are cut into ``k`` contiguous,
    equally sized folds.  Fold ``i`` becomes the validation set; the other
    folds are concatenated in order to form the training set.  Any rows
    beyond ``k * fold_size`` are not used.

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the validation fold; must lie in ``range(k)``, otherwise
            no validation fold is ever assigned.
        X: 2-D feature tensor, indexed as ``X[rows, :]``.
        y: Label tensor with the same first dimension as ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        AssertionError: If ``k`` is not greater than 1.
    """
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(fold_size * j, fold_size * (j + 1))
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        # Identity test: ``== None`` on a tensor goes through tensor
        # comparison instead of checking whether training data exists yet.
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Run k-fold cross-validation and return the mean final losses.

    For each fold a new model from ``get_net()`` is trained on the data
    returned by ``get_k_fold_data``; the last-epoch training and validation
    log-RMSE are printed and accumulated.  The learning curves of the first
    fold are plotted.

    Args:
        k: Number of folds.
        X_train: Full training feature tensor.
        y_train: Full training label tensor.
        num_epochs: Epochs per fold.
        learning_rate: Optimizer learning rate.
        weight_decay: Optimizer weight decay.
        batch_size: Mini-batch size.

    Returns:
        ``(mean_train_loss, mean_valid_loss)`` averaged over the ``k`` folds.
    """
    final_train_losses = []
    final_valid_losses = []
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        model = get_net()
        train_ls, valid_ls = train(model, *fold_data, num_epochs,
                                   learning_rate, weight_decay, batch_size)
        final_train_losses.append(train_ls[-1])
        final_valid_losses.append(valid_ls[-1])
        if fold == 0:
            epochs = list(range(1, num_epochs + 1))
            d2l.plot(epochs, [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'折{fold + 1},训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
    # NOTE(review): the source's indentation was lost; this call is assumed
    # to run once, after all folds have finished -- confirm.
    plt.show()
    return sum(final_train_losses) / k, sum(final_valid_losses) / k
# Hyperparameters shared by cross-validation and the final fit.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0.01
batch_size = 64

# Cross-validate on the training set and report the averaged final losses.
train_l, valid_l = k_fold(
    k, train_features, train_labels,
    num_epochs, lr, weight_decay, batch_size,
)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')
def train_and_pred(train_features, test_feature, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set, predict the test set, write a CSV.

    Args:
        train_features: Training feature tensor.
        test_feature: Test feature tensor the trained model is applied to.
        train_labels: Training label tensor.
        test_data: DataFrame with an ``Id`` column; a ``SalePrice`` column
            holding the predictions is added to it in place.
        num_epochs: Number of training epochs.
        lr: Optimizer learning rate.
        weight_decay: Optimizer weight decay.
        batch_size: Mini-batch size.

    Side effects:
        Plots the training curve, prints the final training log-RMSE,
        mutates ``test_data`` and writes ``submission.csv`` to the current
        working directory.
    """
    net = get_net()
    # No held-out data here: pass None so train() skips validation metrics.
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse:{float(train_ls[-1]):f}')
    # Apply the network to the test set supplied by the caller (the
    # parameter, not a module-level tensor of a similar name).
    preds = net(test_feature).detach().numpy()
    # Flatten the (n, 1) predictions and reformat them for export to Kaggle.
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
# Final fit on all training data, then write the test-set predictions.
train_and_pred(
    train_features,
    test_features,
    train_labels,
    test_data,
    num_epochs,
    lr,
    weight_decay,
    batch_size,
)