
Commit

daily update 2019-11-22
zachary committed Nov 22, 2019
1 parent d796e83 commit 91446fe
Showing 3 changed files with 28 additions and 62 deletions.
8 changes: 1 addition & 7 deletions arguments.py
@@ -12,20 +12,16 @@
def parse_args():
parser = argparse.ArgumentParser("reinforcement learning experiments for multiagent environments")
# environment
parser.add_argument("--scenario_name", type=str, default="simple_tag", help="name of the scenario script")
parser.add_argument("--scenario_name", type=str, default="simple_adversary", help="name of the scenario script")
parser.add_argument("--start_time", type=str, default=time_now, help="the time when start the game")
parser.add_argument("--per_episode_max_len", type=int, default=45, help="maximum episode length")
parser.add_argument("--max_episode", type=int, default=150000, help="maximum episode length")
parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries")
parser.add_argument("--good-policy", type=str, default="maddpg", help="policy for good agents")
parser.add_argument("--adv-policy", type=str, default="maddpg", help="policy of adversaries")
# core training parameters
parser.add_argument("--device", default=device, help="torch device ")
parser.add_argument("--learning_start_step", type=int, default=50000, help="learning start steps")
parser.add_argument("--max_grad_norm", type=float, default=0.5, help="max gradient norm for clip")
parser.add_argument("--learning_fre", type=int, default=100, help="learning frequency")
parser.add_argument("--var", type=int, default=1.0, help="var of the noise")
parser.add_argument("--var_discount", type=float, default=0.999998, help="the discount for var")
parser.add_argument("--tao", type=int, default=0.01, help="how depth we exchange the par of the nn")
parser.add_argument("--lr_a", type=float, default=1e-2, help="learning rate for adam optimizer")
parser.add_argument("--lr_c", type=float, default=1e-2, help="learning rate for adam optimizer")
@@ -36,12 +32,10 @@ def parse_args():
parser.add_argument("--num_units_2", type=int, default=64, help="number of units in the mlp")
parser.add_argument("--num_units_openai", type=int, default=64, help="number of units in the mlp")
# checkpointing
parser.add_argument("--exp-name", type=str, default="maddpg", help="name of the experiment")
parser.add_argument("--fre4save_model", type=int, default=400, help="the number of the episode for saving the model")
parser.add_argument("--start_save_model", type=int, default=400, help="the number of the episode for saving the model")
parser.add_argument("--save_dir", type=str, default="models", help="directory in which training state and model should be saved")
parser.add_argument("--old_model_name", type=str, default="models/1911_122134_20000/", help="directory in which training state and model are loaded")
parser.add_argument("--restore_idxs", type=list, default=[0], help="the idx of agents need to restore from the model trained")
# evaluation
parser.add_argument("--restore", action="store_true", default=False)
parser.add_argument("--display", action="store_true", default=False)
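The soft-update rate (--tao) and the exploration-noise settings (--var, --var_discount) above are the knobs a MADDPG-style loop normally consumes when syncing target networks and annealing exploration. The sketch below shows that conventional usage only; it is not this repository's training code, and the helper names soft_update and select_action are illustrative.

import torch

def soft_update(target_net, source_net, tao):
    # Polyak averaging: target <- tao * source + (1 - tao) * target
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.data.copy_(tao * s_param.data + (1.0 - tao) * t_param.data)

def select_action(actor, obs, var):
    # Perturb the deterministic policy output with zero-mean Gaussian noise of variance var.
    with torch.no_grad():
        action = actor(obs)
    noise = torch.randn_like(action) * var ** 0.5
    return torch.clamp(action + noise, -1.0, 1.0)

# Inside the training loop (sketch):
#   var = var * arglist.var_discount                 # anneal the exploration noise
#   soft_update(actor_tar, actor_cur, arglist.tao)   # after each learning step
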
32 changes: 10 additions & 22 deletions main_openai.py
@@ -8,13 +8,13 @@
import pickle
import argparse
import numpy as np
import torch.optim as optim
import torch.nn as nn
import torch.optim as optim

from model import openai_actor, openai_critic
from replay_buffer import ReplayBuffer
from arguments import parse_args
from replay_buffer import ReplayBuffer
import multiagent.scenarios as scenarios
from model import openai_actor, openai_critic
from multiagent.environment import MultiAgentEnv

def make_env(scenario_name, arglist, benchmark=False):
@@ -110,26 +110,17 @@ def agents_train(arglist, game_step, update_cnt, memory, obs_size, action_size,

# use the data to update the ACTOR
# There is no need to calculate the other agents' actions
# policy_all = [actors_cur[ac_idx](obs_n_o[:, obs_size[ac_idx][0]:obs_size[ac_idx][1]], batch_flag=True) \
# for ac_idx in range(actors_cur.__len__())]
# policy_actor = torch.cat(policy_all, dim=1)
#agent_own_policy = policy_all[agent_idx].detach().cpu().numpy()
#loss_pse = 1e-3*np.mean(np.power(agent_own_policy, 2))
policy_c_new = actor_c(obs_n_o[:, obs_size[agent_idx][0]:obs_size[agent_idx][1]])
model_out, policy_c_new = actor_c( \
obs_n_o[:, obs_size[agent_idx][0]:obs_size[agent_idx][1]], model_original_out=True)
action_cur_o[:, action_size[agent_idx][0]:action_size[agent_idx][1]] = policy_c_new
loss_pse = torch.mean(torch.pow(policy_c_new, 2))
loss_pse = torch.mean(torch.pow(model_out, 2))
loss_a = torch.mul(-1, torch.mean(critic_c(obs_n_o, action_cur_o)))

opt_a.zero_grad()
(1e-3*loss_pse+loss_a).backward()
nn.utils.clip_grad_norm_(actor_c.parameters(), arglist.max_grad_norm)
opt_a.step()

# record the data
# file_text = open('logs/loss_record_{}'.format(agent_idx), 'a')
# file_text.writelines('actor loss:{} critic loss:{} action_mse:{} \n'.format(loss_a.detach().cpu().numpy(), \
# loss_c.detach().cpu().numpy(), policy_all[agent_idx][0], dim=0).detach().cpu().numpy()))
# file_text.close()

# save the model to the path_dir ---cnt by update number
if update_cnt > arglist.start_save_model and update_cnt % arglist.fre4save_model == 0:
time_now = time.strftime('%y%m_%d%H%M')
@@ -175,20 +166,16 @@ def train(arglist):
print('=============================')

"""step3: init the pars """
obs_size = []
action_size = []
game_step = 0
collision_cnt = 0
episode_cnt = 0
obs_n = env.reset()
update_cnt = 0
t_start = time.time()
rew_n_old = [0.0 for _ in range(env.n)] # set the init reward
final_ep_rewards = [] # sum of rewards for training curve
final_ep_ag_rewards = [] # agent rewards for training curve
agent_info = [[[]]] # placeholder for benchmarking info
episode_rewards = [0.0] # sum of rewards for all agents
agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward
obs_size = []
action_size = []
head_o, head_a, end_o, end_a = 0, 0, 0, 0
for obs_shape, action_shape in zip(obs_shape_n, action_shape_n):
end_o = end_o + obs_shape
@@ -202,6 +189,7 @@

print('=3 starting iterations ...')
print('=============================')
obs_n = env.reset()

for episode_gone in range(arglist.max_episode):
# calculate the reward and print the debug data
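The obs_size / action_size bookkeeping rearranged in the hunk above builds, for each agent, a (start, end) slice into the concatenated joint observation and joint action vectors; those slices are what agents_train indexes with obs_size[agent_idx][0]:obs_size[agent_idx][1]. The loop body past the visible lines is reconstructed here from that usage, with illustrative shapes, so treat it as a sketch rather than the file's exact code.

obs_shape_n = [8, 10, 10]     # per-agent observation dims (illustrative values)
action_shape_n = [5, 5, 5]    # per-agent action dims (illustrative values)

obs_size, action_size = [], []
head_o, head_a, end_o, end_a = 0, 0, 0, 0
for obs_shape, action_shape in zip(obs_shape_n, action_shape_n):
    end_o = end_o + obs_shape
    end_a = end_a + action_shape
    obs_size.append((head_o, end_o))      # agent's slice of the joint observation
    action_size.append((head_a, end_a))   # agent's slice of the joint action
    head_o = end_o
    head_a = end_a

# obs_size    -> [(0, 8), (8, 18), (18, 28)]
# action_size -> [(0, 5), (5, 10), (10, 15)]
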
50 changes: 17 additions & 33 deletions model.py
@@ -39,9 +39,7 @@ def forward(self, input):
The forward func defines how the data flows through the graph(layers)
"""
x = self.LReLU(self.linear_a1(input))
#x = self.tanh(x)
x = self.LReLU(self.linear_a2(x))
#x = self.tanh(x)
policy = self.tanh(self.linear_a(x))
return policy

@@ -80,34 +78,24 @@ def forward(self, obs_input, action_input):
class openai_critic(abstract_agent):
def __init__(self, obs_shape_n, action_shape_n, args):
super(openai_critic, self).__init__()
self.linear_o_c1 = nn.Linear(obs_shape_n, args.num_units_openai)
self.linear_a_c1 = nn.Linear(action_shape_n, args.num_units_openai)
self.LReLU = nn.LeakyReLU(0.01)
self.linear_c1 = nn.Linear(action_shape_n+obs_shape_n, args.num_units_openai)
self.linear_c1.weight.data.normal_(0, 0.1)
self.linear_c2 = nn.Linear(args.num_units_openai, args.num_units_openai)
self.linear_c2.weight.data.normal_(0, 0.1)
self.linear_c = nn.Linear(args.num_units_openai, 1)
self.linear_c.weight.data.normal_(0, 0.1)
#self.reset_parameters()

self.LReLU = nn.LeakyReLU(0.01)
self.tanh= nn.Tanh()
self.reset_parameters()
self.train()

def reset_parameters(self):
gain = nn.init.calculate_gain('leaky_relu')
self.linear_o_c1.weight.data.mul_(gain)
self.linear_a_c1.weight.data.mul_(gain)
self.linear_c2.weight.data.mul_(gain)
self.linear_c.weight.data.mul_(gain)
nn.init.xavier_uniform_(self.linear_c1.weight, gain=nn.init.calculate_gain('leaky_relu'))
nn.init.xavier_uniform_(self.linear_c2.weight, gain=nn.init.calculate_gain('leaky_relu'))
nn.init.xavier_uniform_(self.linear_c.weight, gain=nn.init.calculate_gain('leaky_relu'))

def forward(self, obs_input, action_input):
"""
input_g: input_global, input features of all agents
"""
# x_o = self.LReLU(self.linear_o_c1(obs_input))
# x_a = self.LReLU(self.linear_a_c1(action_input))
#x_cat = torch.cat([x_o, x_a], dim=1)
x_cat = self.LReLU(self.linear_c1(torch.cat([obs_input, action_input], dim=1)))
x = self.LReLU(self.linear_c2(x_cat))
value = self.linear_c(x)
@@ -116,35 +104,31 @@ def forward(self, obs_input, action_input):
class openai_actor(abstract_agent):
def __init__(self, num_inputs, action_size, args):
super(openai_actor, self).__init__()
self.tanh= nn.Tanh()
self.LReLU = nn.LeakyReLU(0.01)
self.linear_a1 = nn.Linear(num_inputs, args.num_units_openai)
self.linear_a1.weight.data.normal_(0, 0.1)
self.linear_a2 = nn.Linear(args.num_units_openai, args.num_units_openai)
self.linear_a2.weight.data.normal_(0, 0.1)
self.linear_a = nn.Linear(args.num_units_openai, action_size)
self.linear_a.weight.data.normal_(0, 0.1)
#self.reset_parameters()
# Activation func init
self.LReLU = nn.LeakyReLU(0.01)
self.tanh= nn.Tanh()

self.reset_parameters()
self.train()

def reset_parameters(self):
gain = nn.init.calculate_gain('leaky_relu')
gain_tanh = nn.init.calculate_gain('tanh')
self.linear_a1.weight.data.mul_(gain)
self.linear_a2.weight.data.mul_(gain)
self.linear_a.weight.data.mul_(gain)
nn.init.xavier_uniform_(self.linear_a1.weight, gain=nn.init.calculate_gain('leaky_relu'))
nn.init.xavier_uniform_(self.linear_a2.weight, gain=nn.init.calculate_gain('leaky_relu'))
nn.init.xavier_uniform_(self.linear_a.weight, gain=nn.init.calculate_gain('leaky_relu'))

def forward(self, input):
def forward(self, input, model_original_out=False):
"""
The forward func defines how the data flows through the graph(layers)
model_original_out: if True, also return the raw logits before the Gumbel-softmax sampling
"""
x = self.LReLU(self.linear_a1(input))
#x = self.tanh(x)
x = self.LReLU(self.linear_a2(x))
#x = self.tanh(x)
policy = self.linear_a(x)
u = torch.rand_like(policy)
policy = F.softmax(policy - torch.log(-torch.log(u)), dim=-1)
model_out = self.linear_a(x)
u = torch.rand_like(model_out)
policy = F.softmax(model_out - torch.log(-torch.log(u)), dim=-1)
if model_original_out == True: return model_out, policy # for model_out criterion
return policy
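
The sampling step added to openai_actor.forward above is the Gumbel-softmax trick: with u ~ Uniform(0, 1), -log(-log(u)) is Gumbel(0, 1) noise, so taking the argmax of the perturbed logits draws a category with probability softmax(logits); the forward pass keeps the soft relaxation (a softmax over the perturbed logits) so the sample stays differentiable. The snippet below is a self-contained check of the underlying hard-max identity, not code from this repository.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.tensor([2.0, 0.5, -1.0])

counts = torch.zeros(3)
for _ in range(20000):
    u = torch.rand_like(logits)
    gumbel = -torch.log(-torch.log(u))        # Gumbel(0, 1) noise
    counts[torch.argmax(logits + gumbel)] += 1

print(counts / counts.sum())        # empirical sampling frequencies
print(F.softmax(logits, dim=-1))    # should closely match the line above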
