
Commit

daily update 2019-11-22
zachary committed Nov 22, 2019
1 parent d796e83 commit 91446fe
Showing 3 changed files with 28 additions and 62 deletions.
8 changes: 1 addition & 7 deletions arguments.py
@@ -12,20 +12,16 @@
def parse_args():
parser = argparse.ArgumentParser("reinforcement learning experiments for multiagent environments")
# environment
parser.add_argument("--scenario_name", type=str, default="simple_tag", help="name of the scenario script")
parser.add_argument("--scenario_name", type=str, default="simple_adversary", help="name of the scenario script")
parser.add_argument("--start_time", type=str, default=time_now, help="the time when start the game")
parser.add_argument("--per_episode_max_len", type=int, default=45, help="maximum episode length")
parser.add_argument("--max_episode", type=int, default=150000, help="maximum episode length")
parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries")
parser.add_argument("--good-policy", type=str, default="maddpg", help="policy for good agents")
parser.add_argument("--adv-policy", type=str, default="maddpg", help="policy of adversaries")
# core training parameters
parser.add_argument("--device", default=device, help="torch device ")
parser.add_argument("--learning_start_step", type=int, default=50000, help="learning start steps")
parser.add_argument("--max_grad_norm", type=float, default=0.5, help="max gradient norm for clip")
parser.add_argument("--learning_fre", type=int, default=100, help="learning frequency")
parser.add_argument("--var", type=int, default=1.0, help="var of the noise")
parser.add_argument("--var_discount", type=float, default=0.999998, help="the discount for var")
parser.add_argument("--tao", type=int, default=0.01, help="how depth we exchange the par of the nn")
parser.add_argument("--lr_a", type=float, default=1e-2, help="learning rate for adam optimizer")
parser.add_argument("--lr_c", type=float, default=1e-2, help="learning rate for adam optimizer")
@@ -36,12 +32,10 @@ def parse_args():
parser.add_argument("--num_units_2", type=int, default=64, help="number of units in the mlp")
parser.add_argument("--num_units_openai", type=int, default=64, help="number of units in the mlp")
# checkpointing
parser.add_argument("--exp-name", type=str, default="maddpg", help="name of the experiment")
parser.add_argument("--fre4save_model", type=int, default=400, help="the number of the episode for saving the model")
parser.add_argument("--start_save_model", type=int, default=400, help="the number of the episode for saving the model")
parser.add_argument("--save_dir", type=str, default="models", help="directory in which training state and model should be saved")
parser.add_argument("--old_model_name", type=str, default="models/1911_122134_20000/", help="directory in which training state and model are loaded")
parser.add_argument("--restore_idxs", type=list, default=[0], help="the idx of agents need to restore from the model trained")
# evaluation
parser.add_argument("--restore", action="store_true", default=False)
parser.add_argument("--display", action="store_true", default=False)
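The soft-update rate (--tao) and the exploration-noise settings (--var, --var_discount) above are the knobs a MADDPG-style loop normally consumes when syncing target networks and annealing exploration. The sketch below shows that conventional usage only; it is not this repository's training code, and the helper names soft_update and select_action are illustrative.

import torch

def soft_update(target_net, source_net, tao):
    # Polyak averaging: target <- tao * source + (1 - tao) * target
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.data.copy_(tao * s_param.data + (1.0 - tao) * t_param.data)

def select_action(actor, obs, var):
    # Perturb the deterministic policy output with zero-mean Gaussian noise of variance var.
    with torch.no_grad():
        action = actor(obs)
    noise = torch.randn_like(action) * var ** 0.5
    return torch.clamp(action + noise, -1.0, 1.0)

# Inside the training loop (sketch):
#   var = var * arglist.var_discount                 # anneal the exploration noise
#   soft_update(actor_tar, actor_cur, arglist.tao)   # after each learning step
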
32 changes: 10 additions & 22 deletions main_openai.py
@@ -8,13 +8,13 @@
import pickle
import argparse
import numpy as np
import torch.optim as optim
import torch.nn as nn
import torch.optim as optim

from model import openai_actor, openai_critic
from replay_buffer import ReplayBuffer
from arguments import parse_args
from replay_buffer import ReplayBuffer
import multiagent.scenarios as scenarios
from model import openai_actor, openai_critic
from multiagent.environment import MultiAgentEnv

def make_env(scenario_name, arglist, benchmark=False):
@@ -110,26 +110,17 @@ def agents_train(arglist, game_step, update_cnt, memory, obs_size, action_size,

# use the data to update the ACTOR
# There is no need to calculate the other agents' actions
# policy_all = [actors_cur[ac_idx](obs_n_o[:, obs_size[ac_idx][0]:obs_size[ac_idx][1]], batch_flag=True) \
# for ac_idx in range(actors_cur.__len__())]
# policy_actor = torch.cat(policy_all, dim=1)
#agent_own_policy = policy_all[agent_idx].detach().cpu().numpy()
#loss_pse = 1e-3*np.mean(np.power(agent_own_policy, 2))
policy_c_new = actor_c(obs_n_o[:, obs_size[agent_idx][0]:obs_size[agent_idx][1]])
model_out, policy_c_new = actor_c( \
obs_n_o[:, obs_size[agent_idx][0]:obs_size[agent_idx][1]], model_original_out=True)
action_cur_o[:, action_size[agent_idx][0]:action_size[agent_idx][1]] = policy_c_new
loss_pse = torch.mean(torch.pow(policy_c_new, 2))
loss_pse = torch.mean(torch.pow(model_out, 2))
loss_a = torch.mul(-1, torch.mean(critic_c(obs_n_o, action_cur_o)))

opt_a.zero_grad()
(1e-3*loss_pse+loss_a).backward()
nn.utils.clip_grad_norm_(actor_c.parameters(), arglist.max_grad_norm)
opt_a.step()

# record the data
# file_text = open('logs/loss_record_{}'.format(agent_idx), 'a')
# file_text.writelines('actor loss:{} critic loss:{} action_mse:{} \n'.format(loss_a.detach().cpu().numpy(), \
# loss_c.detach().cpu().numpy(), policy_all[agent_idx][0], dim=0).detach().cpu().numpy()))
# file_text.close()

# save the model to the path_dir ---cnt by update number
if update_cnt > arglist.start_save_model and update_cnt % arglist.fre4save_model == 0:
time_now = time.strftime('%y%m_%d%H%M')
@@ -175,20 +166,16 @@ def train(arglist):
print('=============================')

"""step3: init the pars """
obs_size = []
action_size = []
game_step = 0
collision_cnt = 0
episode_cnt = 0
obs_n = env.reset()
update_cnt = 0
t_start = time.time()
rew_n_old = [0.0 for _ in range(env.n)] # set the init reward
final_ep_rewards = [] # sum of rewards for training curve
final_ep_ag_rewards = [] # agent rewards for training curve
agent_info = [[[]]] # placeholder for benchmarking info
episode_rewards = [0.0] # sum of rewards for all agents
agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward
obs_size = []
action_size = []
head_o, head_a, end_o, end_a = 0, 0, 0, 0
for obs_shape, action_shape in zip(obs_shape_n, action_shape_n):
end_o = end_o + obs_shape
@@ -202,6 +189,7 @@

print('=3 starting iterations ...')
print('=============================')
obs_n = env.reset()

for episode_gone in range(arglist.max_episode):
# calculate the reward and print the debug data
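The obs_size / action_size bookkeeping rearranged in the hunk above builds, for each agent, a (start, end) slice into the concatenated joint observation and joint action vectors; those slices are what agents_train indexes with obs_size[agent_idx][0]:obs_size[agent_idx][1]. The loop body past the visible lines is reconstructed here from that usage, with illustrative shapes, so treat it as a sketch rather than the file's exact code.

obs_shape_n = [8, 10, 10]     # per-agent observation dims (illustrative values)
action_shape_n = [5, 5, 5]    # per-agent action dims (illustrative values)

obs_size, action_size = [], []
head_o, head_a, end_o, end_a = 0, 0, 0, 0
for obs_shape, action_shape in zip(obs_shape_n, action_shape_n):
    end_o = end_o + obs_shape
    end_a = end_a + action_shape
    obs_size.append((head_o, end_o))      # agent's slice of the joint observation
    action_size.append((head_a, end_a))   # agent's slice of the joint action
    head_o = end_o
    head_a = end_a

# obs_size    -> [(0, 8), (8, 18), (18, 28)]
# action_size -> [(0, 5), (5, 10), (10, 15)]
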
50 changes: 17 additions & 33 deletions model.py
@@ -39,9 +39,7 @@ def forward(self, input):
The forward func defines how the data flows through the graph(layers)
"""
x = self.LReLU(self.linear_a1(input))
#x = self.tanh(x)
x = self.LReLU(self.linear_a2(x))
#x = self.tanh(x)
policy = self.tanh(self.linear_a(x))
return policy

@@ -80,34 +78,24 @@ def forward(self, obs_input, action_input):
class openai_critic(abstract_agent):
def __init__(self, obs_shape_n, action_shape_n, args):
super(openai_critic, self).__init__()
self.linear_o_c1 = nn.Linear(obs_shape_n, args.num_units_openai)
self.linear_a_c1 = nn.Linear(action_shape_n, args.num_units_openai)
self.LReLU = nn.LeakyReLU(0.01)
self.linear_c1 = nn.Linear(action_shape_n+obs_shape_n, args.num_units_openai)
self.linear_c1.weight.data.normal_(0, 0.1)
self.linear_c2 = nn.Linear(args.num_units_openai, args.num_units_openai)
self.linear_c2.weight.data.normal_(0, 0.1)
self.linear_c = nn.Linear(args.num_units_openai, 1)
self.linear_c.weight.data.normal_(0, 0.1)
#self.reset_parameters()

self.LReLU = nn.LeakyReLU(0.01)
self.tanh= nn.Tanh()
self.reset_parameters()
self.train()

def reset_parameters(self):
gain = nn.init.calculate_gain('leaky_relu')
self.linear_o_c1.weight.data.mul_(gain)
self.linear_a_c1.weight.data.mul_(gain)
self.linear_c2.weight.data.mul_(gain)
self.linear_c.weight.data.mul_(gain)
nn.init.xavier_uniform_(self.linear_c1.weight, gain=nn.init.calculate_gain('leaky_relu'))
nn.init.xavier_uniform_(self.linear_c2.weight, gain=nn.init.calculate_gain('leaky_relu'))
nn.init.xavier_uniform_(self.linear_c.weight, gain=nn.init.calculate_gain('leaky_relu'))

def forward(self, obs_input, action_input):
"""
input_g: input_global, input features of all agents
"""
# x_o = self.LReLU(self.linear_o_c1(obs_input))
# x_a = self.LReLU(self.linear_a_c1(action_input))
#x_cat = torch.cat([x_o, x_a], dim=1)
x_cat = self.LReLU(self.linear_c1(torch.cat([obs_input, action_input], dim=1)))
x = self.LReLU(self.linear_c2(x_cat))
value = self.linear_c(x)
@@ -116,35 +104,31 @@ def forward(self, obs_input, action_input):
class openai_actor(abstract_agent):
def __init__(self, num_inputs, action_size, args):
super(openai_actor, self).__init__()
self.tanh= nn.Tanh()
self.LReLU = nn.LeakyReLU(0.01)
self.linear_a1 = nn.Linear(num_inputs, args.num_units_openai)
self.linear_a1.weight.data.normal_(0, 0.1)
self.linear_a2 = nn.Linear(args.num_units_openai, args.num_units_openai)
self.linear_a2.weight.data.normal_(0, 0.1)
self.linear_a = nn.Linear(args.num_units_openai, action_size)
self.linear_a.weight.data.normal_(0, 0.1)
#self.reset_parameters()
# Activation func init
self.LReLU = nn.LeakyReLU(0.01)
self.tanh= nn.Tanh()

self.reset_parameters()
self.train()

def reset_parameters(self):
gain = nn.init.calculate_gain('leaky_relu')
gain_tanh = nn.init.calculate_gain('tanh')
self.linear_a1.weight.data.mul_(gain)
self.linear_a2.weight.data.mul_(gain)
self.linear_a.weight.data.mul_(gain)
nn.init.xavier_uniform_(self.linear_a1.weight, gain=nn.init.calculate_gain('leaky_relu'))
nn.init.xavier_uniform_(self.linear_a2.weight, gain=nn.init.calculate_gain('leaky_relu'))
nn.init.xavier_uniform_(self.linear_a.weight, gain=nn.init.calculate_gain('leaky_relu'))

def forward(self, input):
def forward(self, input, model_original_out=False):
"""
The forward func defines how the data flows through the graph(layers)
model_original_out: if True, also return the raw logits before the Gumbel-softmax sampling
"""
x = self.LReLU(self.linear_a1(input))
#x = self.tanh(x)
x = self.LReLU(self.linear_a2(x))
#x = self.tanh(x)
policy = self.linear_a(x)
u = torch.rand_like(policy)
policy = F.softmax(policy - torch.log(-torch.log(u)), dim=-1)
model_out = self.linear_a(x)
u = torch.rand_like(model_out)
policy = F.softmax(model_out - torch.log(-torch.log(u)), dim=-1)
if model_original_out == True: return model_out, policy # for model_out criterion
return policy
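
The sampling step added to openai_actor.forward above is the Gumbel-softmax trick: with u ~ Uniform(0, 1), -log(-log(u)) is Gumbel(0, 1) noise, so taking the argmax of the perturbed logits draws a category with probability softmax(logits); the forward pass keeps the soft relaxation (a softmax over the perturbed logits) so the sample stays differentiable. The snippet below is a self-contained check of the underlying hard-max identity, not code from this repository.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.tensor([2.0, 0.5, -1.0])

counts = torch.zeros(3)
for _ in range(20000):
    u = torch.rand_like(logits)
    gumbel = -torch.log(-torch.log(u))        # Gumbel(0, 1) noise
    counts[torch.argmax(logits + gumbel)] += 1

print(counts / counts.sum())        # empirical sampling frequencies
print(F.softmax(logits, dim=-1))    # should closely match the line above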
