Unify num players and actions
Former-commit-id: a78fdad
daochenzha committed May 14, 2021
1 parent 9153b4a commit cde2734
Showing 59 changed files with 230 additions and 398 deletions.
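In short, this commit unifies the naming of the environment and agent counters: env.action_num becomes env.num_actions, env.player_num becomes env.num_players, and the example scripts' --episode_num / --evaluate_num flags become --num_episodes / --num_games. A minimal sketch of the new-style usage, pieced together from the diffs below (the 'leduc-holdem' environment id is assumed for illustration; any RLCard environment at or after this commit should behave the same):

import rlcard
from rlcard.agents import RandomAgent

# Environment id assumed for illustration; the config mirrors examples/run_random.py below.
env = rlcard.make('leduc-holdem', config={'seed': 42})

print(env.num_actions)    # formerly env.action_num
print(env.num_players)    # formerly env.player_num

# Formerly RandomAgent(action_num=...) and range(env.player_num)
agent = RandomAgent(num_actions=env.num_actions)
env.set_agents([agent for _ in range(env.num_players)])
trajectories, player_wins = env.run(is_training=False)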
6 changes: 3 additions & 3 deletions examples/evaluate.py
@@ -18,7 +18,7 @@ def load_model(model_path, env=None, position=None, device=None):
agent.load()
elif model_path == 'random': # Random model
from rlcard.agents import RandomAgent
agent = RandomAgent(action_num=env.action_num)
agent = RandomAgent(num_actions=env.num_actions)
else: # A model in the model zoo
from rlcard import models
agent = models.load(model_path).agents[position]
@@ -43,7 +43,7 @@ def evaluate(args):
env.set_agents(agents)

# Evaluate
rewards = tournament(env, args.evaluate_num)
rewards = tournament(env, args.num_games)
for position, reward in enumerate(rewards):
print(position, args.models[position], reward)

@@ -53,7 +53,7 @@ def evaluate(args):
parser.add_argument('--models', nargs='*', default=['experiments/leduc_holdem_dqn_result/model.pth', 'random'])
parser.add_argument('--cuda', type=str, default='')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--evaluate_num', type=int, default=10000)
parser.add_argument('--num_games', type=int, default=10000)

args = parser.parse_args()

10 changes: 5 additions & 5 deletions examples/run_cfr.py
@@ -20,26 +20,26 @@ def train(args):
agent.load() # If we have saved model, we first load the model

# Evaluate CFR against random
eval_env.set_agents([agent, RandomAgent(action_num=env.action_num)])
eval_env.set_agents([agent, RandomAgent(num_actions=env.num_actions)])

# Start training
with Logger(args.log_dir) as logger:
for episode in range(args.episode_num):
for episode in range(args.num_episodes):
agent.train()
print('\rIteration {}'.format(episode), end='')
# Evaluate the performance. Play with Random agents.
if episode % args.evaluate_every == 0:
agent.save() # Save model
logger.log_performance(env.timestep, tournament(eval_env, args.evaluate_num)[0])
logger.log_performance(env.timestep, tournament(eval_env, args.num_games)[0])

# Plot the learning curve
logger.plot('CFR')

if __name__ == '__main__':
parser = argparse.ArgumentParser("DQN example in RLCard")
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--episode_num', type=int, default=5000)
parser.add_argument('--evaluate_num', type=int, default=2000)
parser.add_argument('--num_episodes', type=int, default=5000)
parser.add_argument('--num_games', type=int, default=2000)
parser.add_argument('--evaluate_every', type=int, default=100)
parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_cfr_result/')

8 changes: 4 additions & 4 deletions examples/run_random.py
@@ -9,16 +9,16 @@
def run(args):
# Make environment
env = rlcard.make(args.env, config={'seed': 42})
episode_num = 1
num_episodes = 1

# Seed numpy, torch, random
set_seed(42)

# Set agents
agent = RandomAgent(action_num=env.action_num)
env.set_agents([agent for _ in range(env.player_num)])
agent = RandomAgent(num_actions=env.num_actions)
env.set_agents([agent for _ in range(env.num_players)])

for episode in range(episode_num):
for episode in range(num_episodes):

# Generate data from the environment
trajectories, player_wins = env.run(is_training=False)
16 changes: 8 additions & 8 deletions examples/run_rl.py
@@ -23,25 +23,25 @@ def train(args):
# Initialize the agent and use random agents as opponents
if args.algorithm == 'dqn':
from rlcard.agents import DQNAgent
agent = DQNAgent(action_num=env.action_num,
agent = DQNAgent(num_actions=env.num_actions,
state_shape=env.state_shape[0],
mlp_layers=[64,64],
device=device)
elif args.algorithm == 'nfsp':
from rlcard.agents import NFSPAgent
agent = NFSPAgent(action_num=env.action_num,
agent = NFSPAgent(num_actions=env.num_actions,
state_shape=env.state_shape[0],
hidden_layers_sizes=[64,64],
q_mlp_layers=[64,64],
device=device)
agents = [agent]
for _ in range(env.player_num):
agents.append(RandomAgent(action_num=env.action_num))
for _ in range(env.num_players):
agents.append(RandomAgent(num_actions=env.num_actions))
env.set_agents(agents)

# Start training
with Logger(args.log_dir) as logger:
for episode in range(args.episode_num):
for episode in range(args.num_episodes):

if args.algorithm == 'nfsp':
agents[0].sample_episode_policy()
@@ -60,7 +60,7 @@ def train(args):

# Evaluate the performance. Play with random agents.
if episode % args.evaluate_every == 0:
logger.log_performance(env.timestep, tournament(env, args.evaluate_num)[0])
logger.log_performance(env.timestep, tournament(env, args.num_games)[0])

# Plot the learning curve
logger.plot(args.algorithm)
@@ -76,8 +76,8 @@ def train(args):
parser.add_argument('--algorithm', type=str, default='dqn', choices=['dqn', 'nfsp'])
parser.add_argument('--cuda', type=str, default='')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--episode_num', type=int, default=5000)
parser.add_argument('--evaluate_num', type=int, default=2000)
parser.add_argument('--num_episodes', type=int, default=5000)
parser.add_argument('--num_games', type=int, default=2000)
parser.add_argument('--evaluate_every', type=int, default=100)
parser.add_argument('--log_dir', type=str, default='experiments/leduc_holdem_dqn_result/')

20 changes: 10 additions & 10 deletions rlcard/agents/cfr_agent.py
@@ -35,9 +35,9 @@ def train(self):
self.iteration += 1
# Firstly, traverse tree to compute counterfactual regret for each player
# The regrets are recorded in traversal
for player_id in range(self.env.player_num):
for player_id in range(self.env.num_players):
self.env.reset()
probs = np.ones(self.env.player_num)
probs = np.ones(self.env.num_players)
self.traverse_tree(probs, player_id)

# Update policy
@@ -59,7 +59,7 @@ def traverse_tree(self, probs, player_id):
current_player = self.env.get_player_id()

action_utilities = {}
state_utility = np.zeros(self.env.player_num)
state_utility = np.zeros(self.env.num_players)
obs, legal_actions = self.get_state(current_player)
action_probs = self.action_probs(obs, legal_actions, self.policy)

@@ -86,9 +86,9 @@ def traverse_tree(self, probs, player_id):
player_state_utility = state_utility[current_player]

if obs not in self.regrets:
self.regrets[obs] = np.zeros(self.env.action_num)
self.regrets[obs] = np.zeros(self.env.num_actions)
if obs not in self.average_policy:
self.average_policy[obs] = np.zeros(self.env.action_num)
self.average_policy[obs] = np.zeros(self.env.num_actions)
for action in legal_actions:
action_prob = action_probs[action]
regret = counterfactual_prob * (action_utilities[action][current_player]
@@ -112,13 +112,13 @@ def regret_matching(self, obs):
regret = self.regrets[obs]
positive_regret_sum = sum([r for r in regret if r > 0])

action_probs = np.zeros(self.env.action_num)
action_probs = np.zeros(self.env.num_actions)
if positive_regret_sum > 0:
for action in range(self.env.action_num):
for action in range(self.env.num_actions):
action_probs[action] = max(0.0, regret[action] / positive_regret_sum)
else:
for action in range(self.env.action_num):
action_probs[action] = 1.0 / self.env.action_num
for action in range(self.env.num_actions):
action_probs[action] = 1.0 / self.env.num_actions
return action_probs

def action_probs(self, obs, legal_actions, policy):
@@ -136,7 +136,7 @@ def action_probs(self, obs, legal_actions, policy):
legal_actions (list): Indices of legal actions
'''
if obs not in policy.keys():
action_probs = np.array([1.0/self.env.action_num for _ in range(self.env.action_num)])
action_probs = np.array([1.0/self.env.num_actions for _ in range(self.env.num_actions)])
self.policy[obs] = action_probs
else:
action_probs = policy[obs]
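As context for the regret_matching changes above: the method turns a vector of cumulative regrets of length env.num_actions into a strategy by normalizing the positive regrets, falling back to a uniform strategy when none are positive. A small standalone sketch of that computation (NumPy only; the function name and sample regrets are chosen for illustration):

import numpy as np

def regret_matching(regrets):
    # regrets: 1-d array of cumulative regrets, one entry per action
    # (length env.num_actions in the agent above).
    positive = np.maximum(regrets, 0.0)
    total = positive.sum()
    if total > 0:
        return positive / total                            # play in proportion to positive regret
    return np.full(len(regrets), 1.0 / len(regrets))       # no positive regret: play uniformly

print(regret_matching(np.array([2.0, -1.0, 1.0, 0.0])))    # ~ [0.667, 0.0, 0.333, 0.0]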
26 changes: 13 additions & 13 deletions rlcard/agents/dqn_agent.py
@@ -51,7 +51,7 @@ def __init__(self,
epsilon_end=0.1,
epsilon_decay_steps=20000,
batch_size=32,
action_num=2,
num_actions=2,
state_shape=None,
train_every=1,
mlp_layers=None,
@@ -75,7 +75,7 @@ def __init__(self,
epsilon_decay_steps (int): Number of steps to decay epsilon over
batch_size (int): Size of batches to sample from the replay memory
evaluate_every (int): Evaluate every N steps
action_num (int): The number of the actions
num_actions (int): The number of the actions
state_space (list): The space of the state vector
train_every (int): Train the network every X steps.
mlp_layers (list): The layer number and the dimension of each layer in MLP
@@ -88,7 +88,7 @@ def __init__(self,
self.discount_factor = discount_factor
self.epsilon_decay_steps = epsilon_decay_steps
self.batch_size = batch_size
self.action_num = action_num
self.num_actions = num_actions
self.train_every = train_every

# Torch device
@@ -107,9 +107,9 @@ def __init__(self,
self.epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

# Create estimators
self.q_estimator = Estimator(action_num=action_num, learning_rate=learning_rate, state_shape=state_shape, \
self.q_estimator = Estimator(num_actions=num_actions, learning_rate=learning_rate, state_shape=state_shape, \
mlp_layers=mlp_layers, device=self.device)
self.target_estimator = Estimator(action_num=action_num, learning_rate=learning_rate, state_shape=state_shape, \
self.target_estimator = Estimator(num_actions=num_actions, learning_rate=learning_rate, state_shape=state_shape, \
mlp_layers=mlp_layers, device=self.device)

# Create replay memory
@@ -170,7 +170,7 @@ def predict(self, state):
q_values (numpy.array): a 1-d array where each entry represents a Q value
'''
epsilon = self.epsilons[min(self.total_t, self.epsilon_decay_steps-1)]
A = np.ones(self.action_num, dtype=float) * epsilon / self.action_num
A = np.ones(self.num_actions, dtype=float) * epsilon / self.num_actions
q_values = self.q_estimator.predict_nograd(np.expand_dims(state, 0))[0]
best_action = np.argmax(q_values)
A[best_action] += (1.0 - epsilon)
@@ -232,7 +232,7 @@ class Estimator(object):
This network is used for both the Q-Network and the Target Network.
'''

def __init__(self, action_num=2, learning_rate=0.001, state_shape=None, mlp_layers=None, device=None):
def __init__(self, num_actions=2, learning_rate=0.001, state_shape=None, mlp_layers=None, device=None):
''' Initialize an Estimator object.
Args:
@@ -241,14 +241,14 @@ def __init__(self, action_num=2, learning_rate=0.001, state_shape=None, mlp_laye
mlp_layers (list): size of outputs of mlp layers
device (torch.device): whether to use cpu or gpu
'''
self.action_num = action_num
self.num_actions = num_actions
self.learning_rate=learning_rate
self.state_shape = state_shape
self.mlp_layers = mlp_layers
self.device = device

# set up Q model and place it in eval mode
qnet = EstimatorNetwork(action_num, state_shape, mlp_layers)
qnet = EstimatorNetwork(num_actions, state_shape, mlp_layers)
qnet = qnet.to(self.device)
self.qnet = qnet
self.qnet.eval()
@@ -325,17 +325,17 @@ class EstimatorNetwork(nn.Module):
It is just a series of tanh layers. All in/out are torch.tensor
'''

def __init__(self, action_num=2, state_shape=None, mlp_layers=None):
def __init__(self, num_actions=2, state_shape=None, mlp_layers=None):
''' Initialize the Q network
Args:
action_num (int): number of legal actions
num_actions (int): number of legal actions
state_shape (list): shape of state tensor
mlp_layers (list): output size of each fc layer
'''
super(EstimatorNetwork, self).__init__()

self.action_num = action_num
self.num_actions = num_actions
self.state_shape = state_shape
self.mlp_layers = mlp_layers

@@ -346,7 +346,7 @@ def __init__(self, action_num=2, state_shape=None, mlp_layers=None):
for i in range(len(layer_dims)-1):
fc.append(nn.Linear(layer_dims[i], layer_dims[i+1], bias=True))
fc.append(nn.Tanh())
fc.append(nn.Linear(layer_dims[-1], self.action_num, bias=True))
fc.append(nn.Linear(layer_dims[-1], self.num_actions, bias=True))
self.fc_layers = nn.Sequential(*fc)

def forward(self, s):
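For readers tracing the predict() change above: the agent builds an epsilon-greedy distribution with one entry per action (now num_actions), spreading epsilon uniformly and placing the remaining mass on the greedy action. A self-contained sketch of that step (NumPy only; the q_values below are made up for illustration):

import numpy as np

def epsilon_greedy_probs(q_values, epsilon):
    # One probability per action: epsilon spread uniformly,
    # the remaining 1 - epsilon placed on the greedy action.
    num_actions = len(q_values)
    probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
    probs[np.argmax(q_values)] += 1.0 - epsilon
    return probs

print(epsilon_greedy_probs(np.array([0.1, 0.5, -0.2]), epsilon=0.1))   # ~ [0.033, 0.933, 0.033]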
12 changes: 6 additions & 6 deletions rlcard/agents/human_agents/blackjack_human_agent.py
@@ -5,14 +5,14 @@ class HumanAgent(object):
''' A human agent for Blackjack. It can be used to play alone to understand how the blackjack code runs
'''

def __init__(self, action_num):
def __init__(self, num_actions):
''' Initialize the human agent
Args:
action_num (int): the size of the output action space
num_actions (int): the size of the output action space
'''
self.use_raw = True
self.action_num = action_num
self.num_actions = num_actions

@staticmethod
def step(state):
@@ -59,12 +59,12 @@ def _print_state(state, raw_legal_actions, action_record):
print('\n============= Dealer Hand ===============')
print_card(state['dealer hand'])

num_player = len(state) - 3
num_players = len(state) - 3

for i in range(num_player):
for i in range(num_players):
print('=============== Player {} Hand ==============='.format(i))
print_card(state['player' + str(i) + ' hand'])

print('\n=========== Actions You Can Choose ===========')
print(', '.join([str(index) + ': ' + action for index, action in enumerate(raw_legal_actions)]))
print('')
print('')
@@ -15,14 +15,14 @@ class HumanAgent(object):
''' A human agent for Gin Rummy. It can be used to play against trained models.
'''

def __init__(self, action_num):
def __init__(self, num_actions):
''' Initialize the human agent
Args:
action_num (int): the size of the output action space
num_actions (int): the size of the output action space
'''
self.use_raw = True
self.action_num = action_num
self.num_actions = num_actions
self.is_choosing_action_id = False
self.chosen_action_id = None # type: int or None
self.state = None
6 changes: 3 additions & 3 deletions rlcard/agents/human_agents/leduc_holdem_human_agent.py
@@ -5,14 +5,14 @@ class HumanAgent(object):
''' A human agent for Leduc Holdem. It can be used to play against trained models
'''

def __init__(self, action_num):
def __init__(self, num_actions):
''' Initialize the human agent
Args:
action_num (int): the size of the output action space
num_actions (int): the size of the output action space
'''
self.use_raw = True
self.action_num = action_num
self.num_actions = num_actions

@staticmethod
def step(state):
6 changes: 3 additions & 3 deletions rlcard/agents/human_agents/limit_holdem_human_agent.py
@@ -5,14 +5,14 @@ class HumanAgent(object):
''' A human agent for Limit Holdem. It can be used to play against trained models
'''

def __init__(self, action_num):
def __init__(self, num_actions):
''' Initialize the human agent
Args:
action_num (int): the size of the output action space
num_actions (int): the size of the output action space
'''
self.use_raw = True
self.action_num = action_num
self.num_actions = num_actions

@staticmethod
def step(state):