
Commit 3b712e8: update codes
johnjim0816 committed Dec 21, 2021 (1 parent: 64c319c)
Showing 71 changed files with 1,096 additions and 1,339 deletions.
34 changes: 31 additions & 3 deletions codes/A2C/agent.py
@@ -10,12 +10,40 @@
Environment:
'''
import torch.optim as optim
from A2C.model import ActorCritic
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

class ActorCritic(nn.Module):
''' A2C network model, containing an Actor and a Critic
'''
def __init__(self, input_dim, output_dim, hidden_dim):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)

self.actor = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, output_dim),
nn.Softmax(dim=1),
)

def forward(self, x):
value = self.critic(x)
probs = self.actor(x)
dist = Categorical(probs)
return dist, value
class A2C:
def __init__(self,n_states,n_actions,cfg) -> None:
''' A2C algorithm
'''
def __init__(self,state_dim,action_dim,cfg) -> None:
self.gamma = cfg.gamma
self.device = cfg.device
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
self.optimizer = optim.Adam(self.model.parameters())

def compute_returns(self,next_value, rewards, masks):
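The body of compute_returns is collapsed in this diff. For orientation, here is a minimal sketch of the standard bootstrapped discounted-return computation that fits this signature (a textbook formulation, not necessarily the exact code in the repository):

def compute_returns(self, next_value, rewards, masks):
    ''' Compute discounted returns for a rollout, bootstrapping from the value of the last state.
    masks[t] is 0 where an episode terminated, which stops the bootstrap at that step.
    '''
    R = next_value
    returns = []
    for t in reversed(range(len(rewards))):
        R = rewards[t] + self.gamma * R * masks[t]
        returns.insert(0, R)
    return returns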
36 changes: 0 additions & 36 deletions codes/A2C/model.py

This file was deleted.

Binary file not shown.
Binary file not shown.
File renamed without changes.
30 changes: 18 additions & 12 deletions codes/A2C/task0_train.py → codes/A2C/task0.py
@@ -1,23 +1,27 @@
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path

import gym
import numpy as np
import torch
import torch.optim as optim
import datetime
from common.multiprocessing_env import SubprocVecEnv
from A2C.model import ActorCritic
from A2C.agent import ActorCritic
from common.utils import save_results, make_dir
from common.plot import plot_rewards
from common.utils import plot_rewards

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = 'A2C' # algorithm name
env_name = 'CartPole-v0' # environment name

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
class A2CConfig:
def __init__(self) -> None:
self.algo='A2C' # algorithm name
self.env_name= 'CartPole-v0' # environment name
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.n_envs = 8 # number of parallel (asynchronous) environments
self.gamma = 0.99 # discount factor
self.hidden_dim = 256
@@ -27,10 +31,9 @@ def __init__(self) -> None:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU

self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
@@ -67,6 +70,8 @@ def compute_returns(next_value, rewards, masks, gamma=0.99):


def train(cfg,envs):
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
env = gym.make(cfg.env_name) # a single env
env.seed(10)
state_dim = envs.observation_space.shape[0]
@@ -119,6 +124,7 @@ def train(cfg,envs):
optimizer.zero_grad()
loss.backward()
optimizer.step()
print('Finish training!')
return test_rewards, test_ma_rewards
if __name__ == "__main__":
cfg = A2CConfig()
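The rollout and loss computation inside train() are collapsed above; only the optimizer calls are visible. For reference, here is a minimal sketch of how the A2C loss is usually assembled from the dist and value outputs of ActorCritic and the returns from compute_returns (the 0.5 and 0.001 coefficients are assumed, not taken from this commit):

# after a rollout: log_probs, values, rewards, masks collected as lists of tensors,
# entropy accumulated from dist.entropy(), next_value = critic value of the last state
returns = compute_returns(next_value, rewards, masks, gamma=cfg.gamma)
log_probs = torch.cat(log_probs)
returns = torch.cat(returns).detach()
values = torch.cat(values)
advantage = returns - values
actor_loss = -(log_probs * advantage.detach()).mean()   # policy-gradient term
critic_loss = advantage.pow(2).mean()                    # value-function regression
loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy  # entropy bonus encourages exploration
# followed by optimizer.zero_grad(), loss.backward(), optimizer.step() as shown above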
73 changes: 63 additions & 10 deletions codes/DDPG/agent.py
@@ -9,22 +9,75 @@
@Description:
@Environment: python 3.7.7
'''
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from common.model import Actor, Critic
from common.memory import ReplayBuffer


import torch.nn.functional as F
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # capacity of the replay buffer
self.buffer = [] # buffer
self.position = 0

def push(self, state, action, reward, next_state, done):
''' The buffer acts as a queue: once capacity is exceeded, the earliest stored transitions are overwritten
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity

def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
return state, action, reward, next_state, done

def __len__(self):
''' Return the current number of stored transitions
'''
return len(self.buffer)
class Actor(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, n_actions)

self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)

def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = torch.tanh(self.linear3(x))
return x
class Critic(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(Critic, self).__init__()

self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
# randomly initialize to small values
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)

def forward(self, state, action):
# concatenate along dimension 1
x = torch.cat([state, action], 1)
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class DDPG:
def __init__(self, state_dim, action_dim, cfg):
def __init__(self, n_states, n_actions, cfg):
self.device = cfg.device
self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)

# copy parameters to the target networks
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
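The update step of the DDPG class is collapsed in this diff. The sketch below shows a standard DDPG update consistent with the pieces visible above (ReplayBuffer, Actor/Critic, target networks) and with cfg.gamma, cfg.batch_size and cfg.soft_tau from the config; it assumes __init__ also stores self.memory, self.batch_size, self.gamma, self.soft_tau and the two optimizers, which the collapsed portion presumably sets up. Treat it as an illustrative sketch, not the repository's exact code:

def update(self):
    if len(self.memory) < self.batch_size:  # wait until enough transitions are stored
        return
    state, action, reward, next_state, done = self.memory.sample(self.batch_size)
    state = torch.FloatTensor(np.array(state)).to(self.device)
    next_state = torch.FloatTensor(np.array(next_state)).to(self.device)
    action = torch.FloatTensor(np.array(action)).to(self.device)
    reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
    done = torch.FloatTensor(done).unsqueeze(1).to(self.device)
    # critic loss: TD target built from the target networks
    next_action = self.target_actor(next_state)
    target_q = reward + self.gamma * (1 - done) * self.target_critic(next_state, next_action).detach()
    critic_loss = nn.MSELoss()(self.critic(state, action), target_q)
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()
    # actor loss: maximize Q(s, actor(s))
    actor_loss = -self.critic(state, self.actor(state)).mean()
    self.actor_optimizer.zero_grad()
    actor_loss.backward()
    self.actor_optimizer.step()
    # soft (Polyak) update of the target networks
    for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
        target_param.data.copy_(self.soft_tau * param.data + (1 - self.soft_tau) * target_param.data)
    for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
        target_param.data.copy_(self.soft_tau * param.data + (1 - self.soft_tau) * target_param.data)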
2 changes: 0 additions & 2 deletions codes/DDPG/env.py
@@ -16,12 +16,10 @@ class NormalizedActions(gym.ActionWrapper):
''' Rescale actions from [-1, 1] to the environment's [low, high] action range
'''
def action(self, action):

low_bound = self.action_space.low
upper_bound = self.action_space.high
action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
action = np.clip(action, low_bound, upper_bound)

return action

def reverse_action(self, action):
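The reverse_action method is collapsed here; it is normally the inverse of action() above, mapping an action from the environment's [low, high] range back to [-1, 1]. A sketch under that assumption:

def reverse_action(self, action):
    low_bound = self.action_space.low
    upper_bound = self.action_space.high
    action = 2.0 * (action - low_bound) / (upper_bound - low_bound) - 1.0
    action = np.clip(action, -1.0, 1.0)
    return action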
81 changes: 81 additions & 0 deletions codes/DDPG/task0.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
@LastEditTime: 2021-09-16 01:31:33
@Description:
@Environment: python 3.7.7
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path

import datetime
import gym
import torch

from DDPG.env import NormalizedActions
from DDPG.agent import DDPG
from DDPG.train import train,test
from common.utils import save_results,make_dir
from common.utils import plot_rewards

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = 'DDPG' # algorithm name
env_name = 'Pendulum-v1' # environment name; in newer gym versions (roughly 0.21.0 and later) Pendulum-v0 was renamed to Pendulum-v1

class DDPGConfig:
def __init__(self):
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check for GPU
self.train_eps = 300 # number of training episodes
self.eval_eps = 50 # number of evaluation episodes
self.gamma = 0.99 # discount factor
self.critic_lr = 1e-3 # learning rate of the critic network
self.actor_lr = 1e-4 # learning rate of the actor network
self.memory_capacity = 8000 # capacity of the replay buffer
self.batch_size = 128 # batch size for mini-batch SGD
self.target_update = 2 # update frequency of the target networks
self.hidden_dim = 256 # dimension of the hidden layers
self.soft_tau = 1e-2 # soft update coefficient

class PlotConfig:
def __init__(self) -> None:
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # path to save results
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # path to save models
self.save = True # whether to save figures
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check for GPU

def env_agent_config(cfg,seed=1):
env = NormalizedActions(gym.make(cfg.env_name)) # wrap the env to rescale actions
env.seed(seed) # random seed
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states,n_actions,cfg)
return env,agent

cfg = DDPGConfig()
plot_cfg = PlotConfig()
# training
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # plot the results
# testing
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = test(plot_cfg,env,agent)
save_results(rewards,ma_rewards,tag = 'test',path = plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # plot the results
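The train and test functions imported from DDPG.train are not part of this diff. For orientation, here is a minimal sketch of one training episode using the interfaces defined above (agent.choose_action and agent.memory are assumptions about the collapsed DDPG code, not shown in this commit):

state = env.reset()
done = False
ep_reward = 0
while not done:
    action = agent.choose_action(state)  # actor output, typically plus exploration noise
    next_state, reward, done, _ = env.step(action)
    agent.memory.push(state, action, reward, next_state, done)  # store the transition
    agent.update()  # one gradient step once the buffer holds enough samples
    state = next_state
    ep_reward += reward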

(The remaining changed files are not loaded in this view.)
