
Commit 3b712e8: update codes
johnjim0816 committed Dec 21, 2021 (1 parent: 64c319c)
Showing 71 changed files with 1,096 additions and 1,339 deletions.
34 changes: 31 additions & 3 deletions codes/A2C/agent.py
@@ -10,12 +10,40 @@
Environment:
'''
import torch.optim as optim
from A2C.model import ActorCritic
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

class ActorCritic(nn.Module):
''' A2C network model, containing an Actor and a Critic
'''
def __init__(self, input_dim, output_dim, hidden_dim):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)

self.actor = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, output_dim),
nn.Softmax(dim=1),
)

def forward(self, x):
value = self.critic(x)
probs = self.actor(x)
dist = Categorical(probs)
return dist, value
class A2C:
def __init__(self,n_states,n_actions,cfg) -> None:
''' A2C algorithm
'''
def __init__(self,state_dim,action_dim,cfg) -> None:
self.gamma = cfg.gamma
self.device = cfg.device
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
self.optimizer = optim.Adam(self.model.parameters())

def compute_returns(self,next_value, rewards, masks):
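The body of compute_returns is collapsed in this diff. For orientation, here is a minimal sketch of the standard bootstrapped discounted-return computation that fits this signature (a textbook formulation, not necessarily the exact code in the repository):

def compute_returns(self, next_value, rewards, masks):
    ''' Compute discounted returns for a rollout, bootstrapping from the value of the last state.
    masks[t] is 0 where an episode terminated, which stops the bootstrap at that step.
    '''
    R = next_value
    returns = []
    for t in reversed(range(len(rewards))):
        R = rewards[t] + self.gamma * R * masks[t]
        returns.insert(0, R)
    return returns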
36 changes: 0 additions & 36 deletions codes/A2C/model.py

This file was deleted.

Binary file not shown.
Binary file not shown.
File renamed without changes.
30 changes: 18 additions & 12 deletions codes/A2C/task0_train.py → codes/A2C/task0.py
@@ -1,23 +1,27 @@
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path

import gym
import numpy as np
import torch
import torch.optim as optim
import datetime
from common.multiprocessing_env import SubprocVecEnv
from A2C.model import ActorCritic
from A2C.agent import ActorCritic
from common.utils import save_results, make_dir
from common.plot import plot_rewards
from common.utils import plot_rewards

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = 'A2C' # algorithm name
env_name = 'CartPole-v0' # environment name

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
class A2CConfig:
def __init__(self) -> None:
self.algo='A2C' # algorithm name
self.env_name= 'CartPole-v0' # environment name
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.n_envs = 8 # number of parallel (asynchronous) environments
self.gamma = 0.99 # discount factor
self.hidden_dim = 256
@@ -27,10 +31,9 @@ def __init__(self) -> None:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU

self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
@@ -67,6 +70,8 @@ def compute_returns(next_value, rewards, masks, gamma=0.99):


def train(cfg,envs):
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
env = gym.make(cfg.env_name) # a single env
env.seed(10)
state_dim = envs.observation_space.shape[0]
@@ -119,6 +124,7 @@ def train(cfg,envs):
optimizer.zero_grad()
loss.backward()
optimizer.step()
print('Finish training!')
return test_rewards, test_ma_rewards
if __name__ == "__main__":
cfg = A2CConfig()
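The rollout and loss computation inside train() are collapsed above; only the optimizer calls are visible. For reference, here is a minimal sketch of how the A2C loss is usually assembled from the dist and value outputs of ActorCritic and the returns from compute_returns (the 0.5 and 0.001 coefficients are assumed, not taken from this commit):

# after a rollout: log_probs, values, rewards, masks collected as lists of tensors,
# entropy accumulated from dist.entropy(), next_value = critic value of the last state
returns = compute_returns(next_value, rewards, masks, gamma=cfg.gamma)
log_probs = torch.cat(log_probs)
returns = torch.cat(returns).detach()
values = torch.cat(values)
advantage = returns - values
actor_loss = -(log_probs * advantage.detach()).mean()   # policy-gradient term
critic_loss = advantage.pow(2).mean()                    # value-function regression
loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy  # entropy bonus encourages exploration
# followed by optimizer.zero_grad(), loss.backward(), optimizer.step() as shown above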
73 changes: 63 additions & 10 deletions codes/DDPG/agent.py
@@ -9,22 +9,75 @@
@Description:
@Environment: python 3.7.7
'''
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from common.model import Actor, Critic
from common.memory import ReplayBuffer


import torch.nn.functional as F
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # capacity of the replay buffer
self.buffer = [] # buffer
self.position = 0

def push(self, state, action, reward, next_state, done):
''' The buffer acts as a queue: once capacity is exceeded, the earliest stored transitions are overwritten
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity

def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
return state, action, reward, next_state, done

def __len__(self):
''' Return the current number of stored transitions
'''
return len(self.buffer)
class Actor(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, n_actions)

self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)

def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = torch.tanh(self.linear3(x))
return x
class Critic(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(Critic, self).__init__()

self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
# randomly initialize to small values
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)

def forward(self, state, action):
# concatenate along dimension 1
x = torch.cat([state, action], 1)
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class DDPG:
def __init__(self, state_dim, action_dim, cfg):
def __init__(self, n_states, n_actions, cfg):
self.device = cfg.device
self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)

# copy parameters to the target networks
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
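The update step of the DDPG class is collapsed in this diff. The sketch below shows a standard DDPG update consistent with the pieces visible above (ReplayBuffer, Actor/Critic, target networks) and with cfg.gamma, cfg.batch_size and cfg.soft_tau from the config; it assumes __init__ also stores self.memory, self.batch_size, self.gamma, self.soft_tau and the two optimizers, which the collapsed portion presumably sets up. Treat it as an illustrative sketch, not the repository's exact code:

def update(self):
    if len(self.memory) < self.batch_size:  # wait until enough transitions are stored
        return
    state, action, reward, next_state, done = self.memory.sample(self.batch_size)
    state = torch.FloatTensor(np.array(state)).to(self.device)
    next_state = torch.FloatTensor(np.array(next_state)).to(self.device)
    action = torch.FloatTensor(np.array(action)).to(self.device)
    reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
    done = torch.FloatTensor(done).unsqueeze(1).to(self.device)
    # critic loss: TD target built from the target networks
    next_action = self.target_actor(next_state)
    target_q = reward + self.gamma * (1 - done) * self.target_critic(next_state, next_action).detach()
    critic_loss = nn.MSELoss()(self.critic(state, action), target_q)
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()
    # actor loss: maximize Q(s, actor(s))
    actor_loss = -self.critic(state, self.actor(state)).mean()
    self.actor_optimizer.zero_grad()
    actor_loss.backward()
    self.actor_optimizer.step()
    # soft (Polyak) update of the target networks
    for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
        target_param.data.copy_(self.soft_tau * param.data + (1 - self.soft_tau) * target_param.data)
    for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
        target_param.data.copy_(self.soft_tau * param.data + (1 - self.soft_tau) * target_param.data)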
2 changes: 0 additions & 2 deletions codes/DDPG/env.py
@@ -16,12 +16,10 @@ class NormalizedActions(gym.ActionWrapper):
''' Rescale actions from [-1, 1] to the environment's [low, high] action range
'''
def action(self, action):

low_bound = self.action_space.low
upper_bound = self.action_space.high
action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
action = np.clip(action, low_bound, upper_bound)

return action

def reverse_action(self, action):
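The reverse_action method is collapsed here; it is normally the inverse of action() above, mapping an action from the environment's [low, high] range back to [-1, 1]. A sketch under that assumption:

def reverse_action(self, action):
    low_bound = self.action_space.low
    upper_bound = self.action_space.high
    action = 2.0 * (action - low_bound) / (upper_bound - low_bound) - 1.0
    action = np.clip(action, -1.0, 1.0)
    return action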
81 changes: 81 additions & 0 deletions codes/DDPG/task0.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
@LastEditTime: 2021-09-16 01:31:33
@Description:
@Environment: python 3.7.7
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path

import datetime
import gym
import torch

from DDPG.env import NormalizedActions
from DDPG.agent import DDPG
from DDPG.train import train,test
from common.utils import save_results,make_dir
from common.utils import plot_rewards

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = 'DDPG' # algorithm name
env_name = 'Pendulum-v1' # environment name; in newer gym versions (roughly 0.21.0 and later) Pendulum-v0 was renamed to Pendulum-v1

class DDPGConfig:
def __init__(self):
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check for GPU
self.train_eps = 300 # number of training episodes
self.eval_eps = 50 # number of evaluation episodes
self.gamma = 0.99 # discount factor
self.critic_lr = 1e-3 # learning rate of the critic network
self.actor_lr = 1e-4 # learning rate of the actor network
self.memory_capacity = 8000 # capacity of the replay buffer
self.batch_size = 128 # batch size for mini-batch SGD
self.target_update = 2 # update frequency of the target networks
self.hidden_dim = 256 # dimension of the hidden layers
self.soft_tau = 1e-2 # soft update coefficient

class PlotConfig:
def __init__(self) -> None:
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # path to save results
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # path to save models
self.save = True # whether to save figures
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check for GPU

def env_agent_config(cfg,seed=1):
env = NormalizedActions(gym.make(cfg.env_name)) # wrap the env to rescale actions
env.seed(seed) # random seed
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states,n_actions,cfg)
return env,agent

cfg = DDPGConfig()
plot_cfg = PlotConfig()
# training
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # plot the results
# testing
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = test(plot_cfg,env,agent)
save_results(rewards,ma_rewards,tag = 'test',path = plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # plot the results
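The train and test functions imported from DDPG.train are not part of this diff. For orientation, here is a minimal sketch of one training episode using the interfaces defined above (agent.choose_action and agent.memory are assumptions about the collapsed DDPG code, not shown in this commit):

state = env.reset()
done = False
ep_reward = 0
while not done:
    action = agent.choose_action(state)  # actor output, typically plus exploration noise
    next_state, reward, done, _ = env.step(action)
    agent.memory.push(state, action, reward, next_state, done)  # store the transition
    agent.update()  # one gradient step once the buffer holds enough samples
    state = next_state
    ep_reward += reward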

(The remaining changed files are not loaded in this view.)
