Unify num players and actions
Former-commit-id: a78fdad
daochenzha committed May 14, 2021
1 parent 9153b4a commit cde2734
Showing 59 changed files with 230 additions and 398 deletions.
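In short, this commit unifies the naming of the environment and agent counters: env.action_num becomes env.num_actions, env.player_num becomes env.num_players, and the example scripts' --episode_num / --evaluate_num flags become --num_episodes / --num_games. A minimal sketch of the new-style usage, pieced together from the diffs below (the 'leduc-holdem' environment id is assumed for illustration; any RLCard environment at or after this commit should behave the same):

import rlcard
from rlcard.agents import RandomAgent

# Environment id assumed for illustration; the config mirrors examples/run_random.py below.
env = rlcard.make('leduc-holdem', config={'seed': 42})

print(env.num_actions)    # formerly env.action_num
print(env.num_players)    # formerly env.player_num

# Formerly RandomAgent(action_num=...) and range(env.player_num)
agent = RandomAgent(num_actions=env.num_actions)
env.set_agents([agent for _ in range(env.num_players)])
trajectories, player_wins = env.run(is_training=False)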
6 changes: 3 additions & 3 deletions examples/evaluate.py
@@ -18,7 +18,7 @@ def load_model(model_path, env=None, position=None, device=None):
agent.load()
elif model_path == 'random': # Random model
from rlcard.agents import RandomAgent
agent = RandomAgent(action_num=env.action_num)
agent = RandomAgent(num_actions=env.num_actions)
else: # A model in the model zoo
from rlcard import models
agent = models.load(model_path).agents[position]
@@ -43,7 +43,7 @@ def evaluate(args):
env.set_agents(agents)

# Evaluate
rewards = tournament(env, args.evaluate_num)
rewards = tournament(env, args.num_games)
for position, reward in enumerate(rewards):
print(position, args.models[position], reward)

@@ -53,7 +53,7 @@ def evaluate(args):
parser.add_argument('--models', nargs='*', default=['experiments/leduc_holdem_dqn_result/model.pth', 'random'])
parser.add_argument('--cuda', type=str, default='')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--evaluate_num', type=int, default=10000)
parser.add_argument('--num_games', type=int, default=10000)

args = parser.parse_args()

10 changes: 5 additions & 5 deletions examples/run_cfr.py
@@ -20,26 +20,26 @@ def train(args):
agent.load() # If we have saved model, we first load the model

# Evaluate CFR against random
eval_env.set_agents([agent, RandomAgent(action_num=env.action_num)])
eval_env.set_agents([agent, RandomAgent(num_actions=env.num_actions)])

# Start training
with Logger(args.log_dir) as logger:
for episode in range(args.episode_num):
for episode in range(args.num_episodes):
agent.train()
print('\rIteration {}'.format(episode), end='')
# Evaluate the performance. Play with Random agents.
if episode % args.evaluate_every == 0:
agent.save() # Save model
logger.log_performance(env.timestep, tournament(eval_env, args.evaluate_num)[0])
logger.log_performance(env.timestep, tournament(eval_env, args.num_games)[0])

# Plot the learning curve
logger.plot('CFR')

if __name__ == '__main__':
parser = argparse.ArgumentParser("DQN example in RLCard")
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--episode_num', type=int, default=5000)
parser.add_argument('--evaluate_num', type=int, default=2000)
parser.add_argument('--num_episodes', type=int, default=5000)
parser.add_argument('--num_games', type=int, default=2000)
parser.add_argument('--evaluate_every', type=int, default=100)
parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_cfr_result/')

8 changes: 4 additions & 4 deletions examples/run_random.py
@@ -9,16 +9,16 @@
def run(args):
# Make environment
env = rlcard.make(args.env, config={'seed': 42})
episode_num = 1
num_episodes = 1

# Seed numpy, torch, random
set_seed(42)

# Set agents
agent = RandomAgent(action_num=env.action_num)
env.set_agents([agent for _ in range(env.player_num)])
agent = RandomAgent(num_actions=env.num_actions)
env.set_agents([agent for _ in range(env.num_players)])

for episode in range(episode_num):
for episode in range(num_episodes):

# Generate data from the environment
trajectories, player_wins = env.run(is_training=False)
16 changes: 8 additions & 8 deletions examples/run_rl.py
@@ -23,25 +23,25 @@ def train(args):
# Initialize the agent and use random agents as opponents
if args.algorithm == 'dqn':
from rlcard.agents import DQNAgent
agent = DQNAgent(action_num=env.action_num,
agent = DQNAgent(num_actions=env.num_actions,
state_shape=env.state_shape[0],
mlp_layers=[64,64],
device=device)
elif args.algorithm == 'nfsp':
from rlcard.agents import NFSPAgent
agent = NFSPAgent(action_num=env.action_num,
agent = NFSPAgent(num_actions=env.num_actions,
state_shape=env.state_shape[0],
hidden_layers_sizes=[64,64],
q_mlp_layers=[64,64],
device=device)
agents = [agent]
for _ in range(env.player_num):
agents.append(RandomAgent(action_num=env.action_num))
for _ in range(env.num_players):
agents.append(RandomAgent(num_actions=env.num_actions))
env.set_agents(agents)

# Start training
with Logger(args.log_dir) as logger:
for episode in range(args.episode_num):
for episode in range(args.num_episodes):

if args.algorithm == 'nfsp':
agents[0].sample_episode_policy()
@@ -60,7 +60,7 @@ def train(args):

# Evaluate the performance. Play with random agents.
if episode % args.evaluate_every == 0:
logger.log_performance(env.timestep, tournament(env, args.evaluate_num)[0])
logger.log_performance(env.timestep, tournament(env, args.num_games)[0])

# Plot the learning curve
logger.plot(args.algorithm)
@@ -76,8 +76,8 @@ def train(args):
parser.add_argument('--algorithm', type=str, default='dqn', choices=['dqn', 'nfsp'])
parser.add_argument('--cuda', type=str, default='')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--episode_num', type=int, default=5000)
parser.add_argument('--evaluate_num', type=int, default=2000)
parser.add_argument('--num_episodes', type=int, default=5000)
parser.add_argument('--num_games', type=int, default=2000)
parser.add_argument('--evaluate_every', type=int, default=100)
parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_dqn_result/')

20 changes: 10 additions & 10 deletions rlcard/agents/cfr_agent.py
@@ -35,9 +35,9 @@ def train(self):
self.iteration += 1
# Firstly, traverse tree to compute counterfactual regret for each player
# The regrets are recorded in traversal
for player_id in range(self.env.player_num):
for player_id in range(self.env.num_players):
self.env.reset()
probs = np.ones(self.env.player_num)
probs = np.ones(self.env.num_players)
self.traverse_tree(probs, player_id)

# Update policy
@@ -59,7 +59,7 @@ def traverse_tree(self, probs, player_id):
current_player = self.env.get_player_id()

action_utilities = {}
state_utility = np.zeros(self.env.player_num)
state_utility = np.zeros(self.env.num_players)
obs, legal_actions = self.get_state(current_player)
action_probs = self.action_probs(obs, legal_actions, self.policy)

@@ -86,9 +86,9 @@ def traverse_tree(self, probs, player_id):
player_state_utility = state_utility[current_player]

if obs not in self.regrets:
self.regrets[obs] = np.zeros(self.env.action_num)
self.regrets[obs] = np.zeros(self.env.num_actions)
if obs not in self.average_policy:
self.average_policy[obs] = np.zeros(self.env.action_num)
self.average_policy[obs] = np.zeros(self.env.num_actions)
for action in legal_actions:
action_prob = action_probs[action]
regret = counterfactual_prob * (action_utilities[action][current_player]
@@ -112,13 +112,13 @@ def regret_matching(self, obs):
regret = self.regrets[obs]
positive_regret_sum = sum([r for r in regret if r > 0])

action_probs = np.zeros(self.env.action_num)
action_probs = np.zeros(self.env.num_actions)
if positive_regret_sum > 0:
for action in range(self.env.action_num):
for action in range(self.env.num_actions):
action_probs[action] = max(0.0, regret[action] / positive_regret_sum)
else:
for action in range(self.env.action_num):
action_probs[action] = 1.0 / self.env.action_num
for action in range(self.env.num_actions):
action_probs[action] = 1.0 / self.env.num_actions
return action_probs

def action_probs(self, obs, legal_actions, policy):
@@ -136,7 +136,7 @@ def action_probs(self, obs, legal_actions, policy):
legal_actions (list): Indices of legal actions
'''
if obs not in policy.keys():
action_probs = np.array([1.0/self.env.action_num for _ in range(self.env.action_num)])
action_probs = np.array([1.0/self.env.num_actions for _ in range(self.env.num_actions)])
self.policy[obs] = action_probs
else:
action_probs = policy[obs]
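As context for the regret_matching changes above: the method turns a vector of cumulative regrets of length env.num_actions into a strategy by normalizing the positive regrets, falling back to a uniform strategy when none are positive. A small standalone sketch of that computation (NumPy only; the function name and sample regrets are chosen for illustration):

import numpy as np

def regret_matching(regrets):
    # regrets: 1-d array of cumulative regrets, one entry per action
    # (length env.num_actions in the agent above).
    positive = np.maximum(regrets, 0.0)
    total = positive.sum()
    if total > 0:
        return positive / total                            # play in proportion to positive regret
    return np.full(len(regrets), 1.0 / len(regrets))       # no positive regret: play uniformly

print(regret_matching(np.array([2.0, -1.0, 1.0, 0.0])))    # ~ [0.667, 0.0, 0.333, 0.0]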
26 changes: 13 additions & 13 deletions rlcard/agents/dqn_agent.py
@@ -51,7 +51,7 @@ def __init__(self,
epsilon_end=0.1,
epsilon_decay_steps=20000,
batch_size=32,
action_num=2,
num_actions=2,
state_shape=None,
train_every=1,
mlp_layers=None,
@@ -75,7 +75,7 @@ def __init__(self,
epsilon_decay_steps (int): Number of steps to decay epsilon over
batch_size (int): Size of batches to sample from the replay memory
evaluate_every (int): Evaluate every N steps
action_num (int): The number of the actions
num_actions (int): The number of the actions
state_space (list): The space of the state vector
train_every (int): Train the network every X steps.
mlp_layers (list): The layer number and the dimension of each layer in MLP
@@ -88,7 +88,7 @@ def __init__(self,
self.discount_factor = discount_factor
self.epsilon_decay_steps = epsilon_decay_steps
self.batch_size = batch_size
self.action_num = action_num
self.num_actions = num_actions
self.train_every = train_every

# Torch device
@@ -107,9 +107,9 @@ def __init__(self,
self.epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

# Create estimators
self.q_estimator = Estimator(action_num=action_num, learning_rate=learning_rate, state_shape=state_shape, \
self.q_estimator = Estimator(num_actions=num_actions, learning_rate=learning_rate, state_shape=state_shape, \
mlp_layers=mlp_layers, device=self.device)
self.target_estimator = Estimator(action_num=action_num, learning_rate=learning_rate, state_shape=state_shape, \
self.target_estimator = Estimator(num_actions=num_actions, learning_rate=learning_rate, state_shape=state_shape, \
mlp_layers=mlp_layers, device=self.device)

# Create replay memory
@@ -170,7 +170,7 @@ def predict(self, state):
q_values (numpy.array): a 1-d array where each entry represents a Q value
'''
epsilon = self.epsilons[min(self.total_t, self.epsilon_decay_steps-1)]
A = np.ones(self.action_num, dtype=float) * epsilon / self.action_num
A = np.ones(self.num_actions, dtype=float) * epsilon / self.num_actions
q_values = self.q_estimator.predict_nograd(np.expand_dims(state, 0))[0]
best_action = np.argmax(q_values)
A[best_action] += (1.0 - epsilon)
@@ -232,7 +232,7 @@ class Estimator(object):
This network is used for both the Q-Network and the Target Network.
'''

def __init__(self, action_num=2, learning_rate=0.001, state_shape=None, mlp_layers=None, device=None):
def __init__(self, num_actions=2, learning_rate=0.001, state_shape=None, mlp_layers=None, device=None):
''' Initialize an Estimator object.
Args:
@@ -241,14 +241,14 @@ def __init__(self, action_num=2, learning_rate=0.001, state_shape=None, mlp_laye
mlp_layers (list): size of outputs of mlp layers
device (torch.device): whether to use cpu or gpu
'''
self.action_num = action_num
self.num_actions = num_actions
self.learning_rate=learning_rate
self.state_shape = state_shape
self.mlp_layers = mlp_layers
self.device = device

# set up Q model and place it in eval mode
qnet = EstimatorNetwork(action_num, state_shape, mlp_layers)
qnet = EstimatorNetwork(num_actions, state_shape, mlp_layers)
qnet = qnet.to(self.device)
self.qnet = qnet
self.qnet.eval()
@@ -325,17 +325,17 @@ class EstimatorNetwork(nn.Module):
It is just a series of tanh layers. All in/out are torch.tensor
'''

def __init__(self, action_num=2, state_shape=None, mlp_layers=None):
def __init__(self, num_actions=2, state_shape=None, mlp_layers=None):
''' Initialize the Q network
Args:
action_num (int): number of legal actions
num_actions (int): number of legal actions
state_shape (list): shape of state tensor
mlp_layers (list): output size of each fc layer
'''
super(EstimatorNetwork, self).__init__()

self.action_num = action_num
self.num_actions = num_actions
self.state_shape = state_shape
self.mlp_layers = mlp_layers

@@ -346,7 +346,7 @@ def __init__(self, action_num=2, state_shape=None, mlp_layers=None):
for i in range(len(layer_dims)-1):
fc.append(nn.Linear(layer_dims[i], layer_dims[i+1], bias=True))
fc.append(nn.Tanh())
fc.append(nn.Linear(layer_dims[-1], self.action_num, bias=True))
fc.append(nn.Linear(layer_dims[-1], self.num_actions, bias=True))
self.fc_layers = nn.Sequential(*fc)

def forward(self, s):
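For readers tracing the predict() change above: the agent builds an epsilon-greedy distribution with one entry per action (now num_actions), spreading epsilon uniformly and placing the remaining mass on the greedy action. A self-contained sketch of that step (NumPy only; the q_values below are made up for illustration):

import numpy as np

def epsilon_greedy_probs(q_values, epsilon):
    # One probability per action: epsilon spread uniformly,
    # the remaining 1 - epsilon placed on the greedy action.
    num_actions = len(q_values)
    probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
    probs[np.argmax(q_values)] += 1.0 - epsilon
    return probs

print(epsilon_greedy_probs(np.array([0.1, 0.5, -0.2]), epsilon=0.1))   # ~ [0.033, 0.933, 0.033]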
12 changes: 6 additions & 6 deletions rlcard/agents/human_agents/blackjack_human_agent.py
@@ -5,14 +5,14 @@ class HumanAgent(object):
''' A human agent for Blackjack. It can be used to play alone to understand how the blackjack code runs
'''

def __init__(self, action_num):
def __init__(self, num_actions):
''' Initialize the human agent
Args:
action_num (int): the size of the output action space
num_actions (int): the size of the output action space
'''
self.use_raw = True
self.action_num = action_num
self.num_actions = num_actions

@staticmethod
def step(state):
@@ -59,12 +59,12 @@ def _print_state(state, raw_legal_actions, action_record):
print('\n============= Dealer Hand ===============')
print_card(state['dealer hand'])

num_player = len(state) - 3
num_players = len(state) - 3

for i in range(num_player):
for i in range(num_players):
print('=============== Player {} Hand ==============='.format(i))
print_card(state['player' + str(i) + ' hand'])

print('\n=========== Actions You Can Choose ===========')
print(', '.join([str(index) + ': ' + action for index, action in enumerate(raw_legal_actions)]))
print('')
print('')
@@ -15,14 +15,14 @@ class HumanAgent(object):
''' A human agent for Gin Rummy. It can be used to play against trained models.
'''

def __init__(self, action_num):
def __init__(self, num_actions):
''' Initialize the human agent
Args:
action_num (int): the size of the output action space
num_actions (int): the size of the output action space
'''
self.use_raw = True
self.action_num = action_num
self.num_actions = num_actions
self.is_choosing_action_id = False
self.chosen_action_id = None # type: int or None
self.state = None
6 changes: 3 additions & 3 deletions rlcard/agents/human_agents/leduc_holdem_human_agent.py
@@ -5,14 +5,14 @@ class HumanAgent(object):
''' A human agent for Leduc Holdem. It can be used to play against trained models
'''

def __init__(self, action_num):
def __init__(self, num_actions):
''' Initialize the human agent
Args:
action_num (int): the size of the output action space
num_actions (int): the size of the output action space
'''
self.use_raw = True
self.action_num = action_num
self.num_actions = num_actions

@staticmethod
def step(state):
6 changes: 3 additions & 3 deletions rlcard/agents/human_agents/limit_holdem_human_agent.py
@@ -5,14 +5,14 @@ class HumanAgent(object):
''' A human agent for Limit Holdem. It can be used to play against trained models
'''

def __init__(self, action_num):
def __init__(self, num_actions):
''' Initialize the human agent
Args:
action_num (int): the size of the output action space
num_actions (int): the size of the output action space
'''
self.use_raw = True
self.action_num = action_num
self.num_actions = num_actions

@staticmethod
def step(state):