PyTorch中使用Tensor记录数据。
Pytorch安装根据pytorch官网的命令行 对pytorch 在anaconda中安装。 默认python >= 3.6
数据的类型有8种: three float types (16-bit, 32-bit, 64-bit) and five integer types (8-bit signed, 8-bit unsigned, 16-bit, 32-bit and 64-bit)
torch.FloatTensor(32-bit float), torch.ByteTensor(8-bit unsigned), torch.LongTensor (64-bit signed)
import torch
import numpy as np
PyTorch中创建tensor 的三种方法:1. a = torch.FloatTensor(3,2) # 得到 3X2的空间,但是数据没有初始化,是凌乱的数据
a.zero_() # 实现 tensor 的原地清零初始化 (the in-place method is zero_(), not _zero())
2. torch.FloatTensor( [ [1,2,3], [3,2,1] ] )
3. n = np.zeros( shape=(3 , 2), dtype = np.float32 )
torch.tensor(n, dtype = torch.float32)
PyTorch中运算:a = torch.tensor([1,2,3])
s = a.sum() # s = tensor(6)
s.item() # value: 6
See the official PyTorch documentation for more tensor operations.
v1 = torch.tensor( [1.0 , 1.0], requires_grad = True ) # 要求该tensor 需要求梯度
v1.is_leaf: True 说明 是由user构建的, False 说明是 function transformation得到的
v1.grad: 用于存放 v1 tensor的梯度tensor, 但使用之前 需要先用 v1.backward() 计算得到梯度
构建 Neural network: import torch.nn as nn
l = nn.Linear(2,5) # 构建了 input为2, output为5 的线性NN
l(v) 将得到 含有五个component 的 tensor
http://pytorch.org/docs
s = nn.Sequential(
nn.Linear(2,5) ,
nn.ReLU() ,
nn.Linear(5,20) ,
nn.ReLU() ,
nn.Linear(20,10) ,
nn.Dropout(p=0.3) ,
nn.Softmax(dim=1)
)
构建 loss function: 两个输入, 一个是 网络的输出结果,一个是期望的结果(target label)
nn.MSELoss() 均方误差 (mean squared error)
nn.BCELoss() nn.BCEWithLogitsLoss() Binary cross-entropy loss (the WithLogits variant applies the sigmoid internally)
Cartpole game to explain the whole process of DQN
import torch
import torch.nn as nn             # use nn.Module to build the fully-connected networks
import torch.nn.functional as F   # F.relu() is another way to calculate the relu
import numpy as np
import gym
# ----- Hyper parameters -----
BATCH_SIZE = 32            # number of transitions sampled from the replay
                           # memory (self.memory, a
                           # (MEMORY_CAPACITY, N_STATES*2+2) array) per
                           # learning step
LR = 0.01                  # learning rate for the Adam optimizer
EPSILON = 0.9              # epsilon-greedy policy: probability of acting greedily
GAMMA = 0.9                # reward discount factor
TARGET_REPLACE_ITER = 100  # target-network update frequency: the target net is
                           # synchronized with the eval net once every
                           # TARGET_REPLACE_ITER learning steps, not every
                           # episode. (Fixes the original TARGET_REPLACe_ITER
                           # typo: DQN.learn() reads TARGET_REPLACE_ITER, so
                           # the misspelled name raised a NameError.)
MEMORY_CAPACITY = 2000     # maximum number of transitions kept in the
                           # experience (replay) pool

env = gym.make('CartPole-v0')  # create the cart-pole environment
env = env.unwrapped            # strip the wrapper's step/score limits

N_ACTIONS = env.action_space.n             # number of discrete actions
N_STATES = env.observation_space.shape[0]  # number of variables in a state
# The state space is a continuous Box with lower/upper bounds; shape[0]
# gives the number of components packed in that box.

# Format of a single action sample: 0 when the sample is a plain int
# (Discrete space — an int has no .shape attribute), otherwise the shape
# of the sampled array.
ENV_A_SHAPE = 0 if isinstance(env.action_space.sample(), int) else env.action_space.sample().shape
class Net(nn.Module):
    """Q-network: maps a state vector to one Q-value per action.

    The layer sizes default to the module-level N_STATES / N_ACTIONS so the
    existing ``Net()`` calls keep working, but they can now be overridden
    explicitly. (The dead commented-out Sequential variant was removed: it
    used the nonexistent ``nn.RELU`` and a mismatched layer size.)
    """

    def __init__(self, n_states=None, n_actions=None):
        super(Net, self).__init__()
        if n_states is None:
            n_states = N_STATES
        if n_actions is None:
            n_actions = N_ACTIONS
        # Single hidden layer: n_states -> 50 -> n_actions.
        self.fc1 = nn.Linear(n_states, 50)
        self.fc1.weight.data.normal_(0, 0.1)   # small random initial weights
        self.out = nn.Linear(50, n_actions)
        self.out.weight.data.normal_(0, 0.1)   # small random initial weights

    def forward(self, x):
        """Return the Q-values for a batch of states x, shape (batch, n_actions)."""
        hidden = F.relu(self.fc1(x))
        return self.out(hidden)
class DQN(object):
    """DQN agent: an eval (behavior) Q-network, a target Q-network, an
    experience-replay memory, and the Q-learning update step."""

    def __init__(self):
        # Two networks with identical architecture: eval_net is trained at
        # every learning step; target_net is a periodically synchronized
        # copy used to compute stable TD targets.
        self.eval_net, self.target_net = Net(), Net()
        # Counts learning steps; every TARGET_REPLACE_ITER steps the target
        # net is refreshed from the eval net.
        self.learn_step_counter = 0
        # Counts stored transitions; also selects which memory row to
        # overwrite once the buffer wraps around.
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))
        # NOTE: each memory row holds (state, action, reward, next_state),
        # i.e. N_STATES + 1 + 1 + N_STATES columns.
        self.memory_counter = 0
        # Adam optimizer over the eval net's parameters, with learning rate LR.
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        # Mean-squared-error loss between predicted and target Q-values.
        self.loss_func = nn.MSELoss()

    def choose_action(self, x):
        """Pick an action for state x using an epsilon-greedy policy."""
        # Add a batch dimension: [n] -> [1, n]. PyTorch modules operate on
        # mini-batches; unsqueeze(x, 0) expands dim 0 (dim=1 would expand
        # columns instead). Only one sample is passed here.
        x = torch.unsqueeze(torch.FloatTensor(x), 0)
        if np.random.uniform() < EPSILON:  # greedy branch
            actions_value = self.eval_net.forward(x)
            # torch.max(actions_value, 1) takes the max over dim=1 (actions);
            # [1] is the argmax index per row, [0] would be the max value.
            # .data.numpy() converts the tensor to a numpy array sharing the
            # same memory.
            action = torch.max(actions_value, 1)[1].data.numpy()
            # Discrete spaces expect a plain int index; otherwise reshape to
            # the action space's sample shape.
            action = action[0] if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE) # return the argmax index
        else:  # exploration branch: uniform random action
            action = np.random.randint(0, N_ACTIONS)
            action = action if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE)
        return action

    def store_transition(self, s, a, r, s_):
        """Store one (s, a, r, s_) transition in the replay memory,
        overwriting the oldest row once the buffer is full."""
        # np.hstack concatenates horizontally (np.vstack would stack rows);
        # every component must be array-like, hence [a, r].
        transition = np.hstack((s, [a, r], s_))
        # Circular buffer: replace the old memory with new memory.
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one Q-learning update on a random minibatch from memory."""
        # Periodically copy the eval net's weights into the target net.
        # state_dict() extracts the weight parameters; load_state_dict()
        # copies their values (content, not references).
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        # Sample BATCH_SIZE row indices (with replacement) and slice the
        # stored transitions back into their components.
        sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]  # (BATCH_SIZE, N_STATES*2+2)
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])               # current states
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(int))  # actions
        b_r = torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2])   # rewards
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])             # next states
        # q_eval w.r.t the action in experience: eval_net(b_s) yields the
        # value of every action; gather(1, b_a) picks, per row (dim=1), the
        # value of the action actually taken.
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (batch, 1)
        # Target-net values for the next states; detach() cuts them from the
        # graph so no gradients are backpropagated into the target net.
        q_next = self.target_net(b_s_).detach()  # detach from graph, don't backpropagate
        # TD target: r + gamma * max_a' Q_target(s', a'); .view lifts the
        # 1-D max result back to shape (batch, 1).
        q_target = b_r + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1)  # shape (batch, 1)
        # Loss between the predicted and the target Q-values.
        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()  # clear gradients left from the previous step
        loss.backward()             # backpropagate the TD error
        self.optimizer.step()       # apply the update to the eval net
"""
main program
"""
dqn = DQN()

# Restores the '\n' that had degraded to a literal 'n' in the original.
print('\nCollecting experience...')
for i_episode in range(400):
    s = env.reset()  # reset the environment, obtain the initial state
    ep_r = 0         # cumulative (shaped) reward of this episode
    while True:
        env.render()                      # visualize the cart pole
        a = dqn.choose_action(s)          # epsilon-greedy action (int) for state s
        s_, r, done, info = env.step(a)   # take the action

        # Reward shaping: replace the environment's constant reward with one
        # that grows as the cart stays centered (r1) and the pole stays
        # upright (r2), which speeds up learning on CartPole.
        x, x_dot, theta, theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2

        dqn.store_transition(s, a, r, s_)  # push the transition into the experience pool
        ep_r += r

        # Start learning only once the experience pool has been filled.
        if dqn.memory_counter > MEMORY_CAPACITY:
            dqn.learn()

        # Episode finished: report and move on (merges the two duplicate
        # `if done:` checks of the original — same behavior).
        if done:
            print('Ep: ', i_episode,
                  '| Ep_r: ', round(ep_r, 2))
            break
        s = s_
reward change + 100 hidden cells + one layer
In the end, the car did not perform well: it stayed stuck at the bottom. I think this is because it obtains some small positive rewards there; even so, the total reward sum remains very negative.
reward change + 50 hidden cells + one layer
reward change (only speed reward) + 50 hidden cells + one layer
reward change (only speed reward) + 50 hidden cells + one layer + successful experience pool



