uno_dqn.py
''' An example of learning a Deep-Q Agent on UNO
'''
import tensorflow as tf

import rlcard
from rlcard.agents.dqn_agent import DQNAgent
from rlcard.agents.random_agent import RandomAgent
from rlcard.utils.utils import set_global_seed
from rlcard.utils.logger import Logger
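
# NOTE: This script assumes the early TensorFlow-1.x-based rlcard API
# (tf.Session, a DQNAgent constructor that takes `sess` and `norm_step`,
# and a Logger with add_point/make_plot). Later rlcard releases moved to a
# PyTorch DQNAgent with a different interface, so an old rlcard version is
# assumed here.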

# Make environment
env = rlcard.make('uno')
eval_env = rlcard.make('uno')

# Set the number of training episodes and how frequently we evaluate and save the plot
evaluate_every = 100
save_plot_every = 1000
evaluate_num = 10000
episode_num = 1000000

# Set the number of steps for collecting normalization statistics
# and the initial memory size
memory_init_size = 1000
norm_step = 1000

# The paths for saving the logs and learning curves
root_path = './experiments/uno_dqn_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_size=20000,
                     replay_memory_init_size=memory_init_size,
                     norm_step=norm_step,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])
    random_agent = RandomAgent(action_num=eval_env.action_num)

    sess.run(tf.global_variables_initializer())

    env.set_agents([agent, random_agent, random_agent])
    eval_env.set_agents([agent, random_agent, random_agent])

    # Count the number of steps
    step_counter = 0

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep', ylabel='reward', legend='DQN on UNO', log_path=log_path, csv_path=csv_path)
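
    # Main training loop: each episode plays one full game of UNO, feeds the
    # transitions of the learning agent (player 0) into its replay memory,
    # and periodically evaluates it against the random agents.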
    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)
            step_counter += 1

            # Train the agent once enough transitions have been collected
            train_count = step_counter - (memory_init_size + norm_step)
            if train_count > 0:
                loss = agent.train()
                print('\rINFO - Step {}, loss: {}'.format(step_counter, loss), end='')

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            reward = 0
            for eval_episode in range(evaluate_num):
                _, payoffs = eval_env.run(is_training=False)
                reward += payoffs[0]

            logger.log('\n########## Evaluation ##########')
            logger.log('Timestep: {} Average reward is {}'.format(env.timestep, float(reward)/evaluate_num))

            # Add point to logger
            logger.add_point(x=env.timestep, y=float(reward)/evaluate_num)

        # Make plot
        if episode % save_plot_every == 0 and episode > 0:
            logger.make_plot(save_path=figure_path+str(episode)+'.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path+'final_'+str(episode)+'.png')
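
# NOTE: With episode_num = 1000000 training episodes and evaluate_num = 10000
# evaluation games every 100 episodes, a full run is very long; much smaller
# values are enough for a quick smoke test of the pipeline.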