-
超参数设置
# Hyperparameter settings for the DQN agent.
params = {
    'gamma': 0.8,        # discount factor in the Bellman equation
    'epsi_high': 0.9,    # with epsi_low and decay, sets the epsilon-greedy exploration probability
    'epsi_low': 0.05,
    'decay': 200,
    'lr': 0.001,         # learning rate
    'capacity': 10000,   # replay-memory capacity
    'batch_size': 64,    # mini-batch size
    'state_space_dim': env.observation_space.shape[0],
    'action_space_dim': env.action_space.n,
}
迭代过程,若游戏未结束agent得到正奖励,若done则reward=-1
# Training loop: the agent gets a positive reward while the game is running;
# when the episode terminates (done) the reward is replaced by -1.
for episode in range(1000):
    s0 = env.reset()
    total_reward = 1
    while True:
        env.render()                      # render the environment
        a0 = agent.select_action(s0)      # choose the action to execute
        s1, r1, done, _ = env.step(a0)    # execute the action
        if done:
            r1 = -1                       # penalize the agent on termination
        memory.append((s0, a0, r1, s1))   # store the transition for experience replay
        if done:
            break
        # NOTE(review): the loop breaks before this line when done, so the
        # terminal -1 reward is never accumulated into total_reward — confirm
        # this is intended.
        total_reward += r1
        s0 = s1
        if len(memory) > 64:
            agent.learn(memory)           # learn a better policy from the stored transitions
    if episode % 100 == 0:
        print(total_reward / 100)
        total_reward = 1
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """Small fully connected Q-network mapping a 4-dim state to 2 action values."""

    def __init__(self):
        super(Net, self).__init__()
        # Affine layers: y = Wx + b
        self.fc1 = nn.Linear(4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        """Return the Q-value for each action, given a batch of states."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # raw Q-values: no activation on the output layer
        return x
DQN.py
-
初始化
# Default hyperparameters of the DQN agent. These appear to be class-level
# attributes of an agent class whose header is not shown in this excerpt.
egreedy = 0.1
gamma = 0.9
net = Net()
optimizer = optim.Adam(net.parameters(), lr=0.001)
batch_size = 64
action_space = None
observation_space = None

def __init__(self, **kwargs):
    """Override any of the class-level defaults via keyword arguments."""
    for key, value in kwargs.items():
        setattr(self, key, value)
根据观测情况得到动作:
# NOTE(review): this epsilon-greedy action-selection snippet is truncated in
# the source — the condition after `if rand` (and the rest of the method) is
# missing. Presumably rand is compared against an epsilon threshold to choose
# between a random action and the argmax of the Q-network — TODO: recover the
# original code.
rand=random.random() if rand
-
def learn(self,memory): samples = random.sample(memory, self.batch_size)#随机取batch_size大小的样本 s0, a0, r1, s1= zip(*samples)#解压缩,分别得到当前状态,动作,奖励,下一个状态的数组,规模均为batch_size #转为pytorch张量 s0 = torch.tensor(s0, dtype=torch.float) a0 = torch.tensor(a0, dtype=torch.long).view(self.batch_size, -1) r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1) s1 = torch.tensor(s1, dtype=torch.float) y_true = r1 + self.gamma * torch.max(self.net(s1).detach(), dim=1)[0].view(self.batch_size, -1)#bellman方程 y_pred = self.net(s0).gather(1, a0) #获得实际采取动作的对应Q值 #个人体会:不管损失函数最后是对怎样的数据形状求损失都无所谓,有反向传播呢,所以这里预测和target都只是一个数而并非数组也可以 #net可以直接接收一个batch的输入并输出。 loss_fn = nn.MSELoss()#均方损失函数 loss = loss_fn(y_pred, y_true) #梯度下降 self.optimizer.zero_grad() loss.backward() self.optimizer.step()
-
经过实现,对DQN的了解更加深刻了。
-
更加熟练的使用pytorch。



