2. Implementing an RNN from Scratch
import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

# Load the dataset
batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

# One-hot encoding: [0, 2] are token indices; the second argument is the
# encoding width (the vocabulary size)
F.one_hot(torch.tensor([0, 2]), len(vocab))
# tensor([[1, 0, 0, ..., 0, 0, 0],
#         [0, 0, 1, ..., 0, 0, 0]])

# A mini-batch has shape (batch_size, num_steps)
X = torch.arange(10).reshape((2, 5))
# X.T transposes X so that the time dimension comes first, since this is a
# time series; one-hot encoding then gives (num_steps, batch_size, vocab_size)
print(F.one_hot(X.T, 28).shape)
# torch.Size([5, 2, 28])

# Initialize the parameters of the RNN
def get_params(vocab_size, num_hiddens, device):
    # The one-hot vectors from the previous step fix the input and output
    # widths at vocab_size
    num_inputs = num_outputs = vocab_size

    # Small random initialization; the 0.01 scale keeps the initial
    # activations and gradients from blowing up
    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    # Biases all start at zero
    b_h = torch.zeros(num_hiddens, device=device)
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

# Initialize the hidden state: at time step 0 no state has been produced yet
def init_rnn_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device),)

def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        # The current input X and the previous hidden state H jointly
        # produce the new hidden state
        H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + b_h)
        # The output at each step is computed from the new hidden state
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)
    # Concatenate along dim 0 so the outputs form one time series, and
    # also return the final hidden state
    return torch.cat(outputs, dim=0), (H,)

# Wrap these functions in a class
class RNNModelScratch:
    """An RNN model implemented from scratch."""
    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)

# Define a prediction function that generates new characters after a prefix
def predict_ch8(prefix, num_preds, net, vocab, device):
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor([outputs[-1]],
                                     device=device).reshape((1, 1))
    # Warm-up period: the ground-truth prefix characters go into outputs,
    # not the model's guesses; only the predictions made after the prefix
    # need to be saved
    for y in prefix[1:]:
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    # Predict num_preds steps and save the results
    for _ in range(num_preds):
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

num_hiddens = 512
net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(),
                      get_params, init_rnn_state, rnn)
predict_ch8('time traveller ', 10, net, vocab, d2l.try_gpu())
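Before moving on, it helps to verify the shapes the scratch model produces. The following is a minimal sanity check, assuming the definitions above (num_hiddens = 512 and the 28-character vocabulary): the output stacks num_steps * batch_size rows of vocab_size logits, while the returned state holds one num_hiddens-vector per sequence in the batch.

# Sanity check on shapes (assumes the definitions above)
X = torch.arange(10).reshape((2, 5))
state = net.begin_state(X.shape[0], d2l.try_gpu())
Y, new_state = net(X.to(d2l.try_gpu()), state)
print(Y.shape, len(new_state), new_state[0].shape)
# torch.Size([10, 28]) 1 torch.Size([2, 512])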
3. Concise Implementation
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

# Build the model around the built-in RNN layer
num_hiddens = 256
rnn_layer = nn.RNN(len(vocab), num_hiddens)

# Initialize the state: (num_layers * num_directions, batch_size, num_hiddens)
state = torch.zeros((1, batch_size, num_hiddens))
X = torch.rand(size=(num_steps, batch_size, len(vocab)))
Y, state_new = rnn_layer(X, state)

class RNNModel(nn.Module):
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.num_hiddens = self.rnn.hidden_size
        # A bidirectional RNN doubles the feature width of its outputs
        if not self.rnn.bidirectional:
            self.num_directions = 1
            self.linear = nn.Linear(self.num_hiddens, self.vocab_size)
        else:
            self.num_directions = 2
            self.linear = nn.Linear(self.num_hiddens * 2, self.vocab_size)

    def forward(self, inputs, state):
        X = F.one_hot(inputs.T.long(), self.vocab_size)
        X = X.to(torch.float32)
        Y, state = self.rnn(X, state)
        # Flatten the time and batch dimensions before the output layer
        output = self.linear(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, device, batch_size=1):
        if not isinstance(self.rnn, nn.LSTM):
            # nn.RNN and nn.GRU use a single tensor as the hidden state
            return torch.zeros((self.num_directions * self.rnn.num_layers,
                                batch_size, self.num_hiddens), device=device)
        else:
            # nn.LSTM uses a tuple (hidden state, memory cell)
            return (torch.zeros((self.num_directions * self.rnn.num_layers,
                                 batch_size, self.num_hiddens), device=device),
                    torch.zeros((self.num_directions * self.rnn.num_layers,
                                 batch_size, self.num_hiddens), device=device))
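To exercise the wrapper end to end, here is a minimal usage sketch, assuming the class above and that d2l's saved predict_ch8 from the scratch section is available as d2l.predict_ch8: wrap rnn_layer in RNNModel, move it to a device, and sample a continuation from the still-untrained model.

device = d2l.try_gpu()
net = RNNModel(rnn_layer, vocab_size=len(vocab))
net = net.to(device)
# The untrained model produces gibberish; only after training
# (e.g. with d2l.train_ch8) do the continuations become sensible
print(d2l.predict_ch8('time traveller', 10, net, vocab, device))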



