-
Notifications
You must be signed in to change notification settings - Fork 1
/
dqn_agent.py
64 lines (49 loc) · 2.47 KB
/
dqn_agent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from wrappers import to_tensor, obs2tensor
import torch
from bmgame import BMGame
import random
import numpy as np
class DQNBMAgent():
    """Agent that turns a board observation into a policy over legal actions.

    The policy and the value both come from ``model.act_predict``. When
    ``args.noise`` is truthy, the policy of player ``-1`` is blended with
    Dirichlet noise before illegal actions are masked out, so that the act
    network can play against a perturbed copy of itself.
    """

    # Weight given to the noise when it is blended into the policy.
    NOISE_EPSILON = 0.2
    # Concentration parameter of the symmetric Dirichlet noise.
    DIRICHLET_ALPHA = 0.3

    def __init__(self, model, device, args=None):
        """Store the collaborators and create the game used for legality checks.

        Args:
            model: object exposing ``act_predict(state)`` that returns a
                ``(action_prob, value)`` pair.
            device: torch device the state tensor and the noise are moved to.
            args: optional options object; only its ``noise`` attribute is
                read. ``None`` (the default) disables noise.
        """
        self.model = model
        self.device = device
        self.args = args
        self.game = BMGame()

    def predict(self, obs, current_player):
        """Evaluate ``obs`` from the point of view of ``current_player``.

        Args:
            obs: observation passed to ``obs2tensor`` and ``BMGame.check``.
            current_player: player marker; noise is only applied for ``-1``.

        Returns:
            A 4-tuple ``(action_prob, value, combine_state, available_actions)``:
            the policy restricted to the available actions and renormalised
            to sum to one, the value as a Python number, the state tensor fed
            to the model, and the available actions reported by the game.
        """
        combine_state = obs2tensor(obs, current_player).to(self.device)
        available_actions = self.game.check(obs, current_player)

        # Opponent set-ups that were used previously and are currently disabled:
        #   * act network VS target network:
        #       player  1 -> self.model.act_predict(combine_state)
        #       player -1 -> self.model.tgt_predict(combine_state)
        #   * act network VS random:
        #       player  1 -> self.model.act_predict(combine_state)
        #       player -1 -> torch.randn((1, 5)), torch.tensor([0])
        # Active set-up: act network VS act network (with noise).
        action_prob, value = self.model.act_predict(combine_state)

        # getattr keeps the documented default ``args=None`` usable instead of
        # failing with AttributeError on ``None.noise``.
        if getattr(self.args, "noise", False) and current_player == -1:
            # Different from the paper: there the noise is added at the root
            # of the MCTS tree, here it is added to the network output.
            #
            # The noise must have one entry per action. ``action_prob`` is
            # indexed as ``action_prob[0][i]`` below, so its first axis is the
            # batch axis; sizing the sample by the last axis avoids drawing a
            # one-element Dirichlet sample, which is always exactly [1.0].
            num_actions = action_prob.shape[-1]
            noise_distri = np.random.dirichlet(
                self.DIRICHLET_ALPHA * np.ones(num_actions)
            )
            noise_distri = torch.from_numpy(noise_distri).float().to(self.device)
            action_prob = (
                (1 - self.NOISE_EPSILON) * action_prob
                + self.NOISE_EPSILON * noise_distri
            )

        # Keep only the entries of the available actions (built on the 'cpu'
        # device by to_tensor) and renormalise them into a distribution.
        action_prob_mask = to_tensor(
            [action_prob[0][i] for i in available_actions], 'cpu'
        )
        action_prob = action_prob_mask / torch.sum(action_prob_mask)
        return action_prob, value.item(), combine_state, available_actions
class DQNBMAgent_E:
    """Noise-free agent: policy and value come straight from the act network.

    NOTE(review): the ``_E`` suffix presumably stands for "evaluation" --
    confirm with the callers.
    """

    def __init__(self, model, device, args=None):
        """Keep the model, device and options, and build the game rules helper.

        Args:
            model: object exposing ``act_predict(state)`` that returns a
                ``(action_prob, value)`` pair.
            device: torch device the state tensor is moved to.
            args: optional options object; stored but not read by this class.
        """
        self.model, self.device, self.args = model, device, args
        self.game = BMGame()

    def predict(self, obs, current_player):
        """Return the renormalised policy over legal moves plus the value.

        Args:
            obs: observation passed to ``obs2tensor`` and ``BMGame.check``.
            current_player: player marker forwarded to both helpers.

        Returns:
            A 4-tuple: the policy over the legal actions (summing to one),
            the value as a Python number, the state tensor fed to the model,
            and the legal actions reported by the game.
        """
        state = obs2tensor(obs, current_player).to(self.device)
        legal_moves = self.game.check(obs, current_player)
        raw_policy, raw_value = self.model.act_predict(state)

        # Pick the network outputs of the legal moves only; to_tensor builds
        # the result on the 'cpu' device.
        legal_scores = to_tensor(
            [raw_policy[0][move] for move in legal_moves], 'cpu'
        )
        total = torch.sum(legal_scores)
        return legal_scores / total, raw_value.item(), state, legal_moves