Showing 12 changed files with 1,026 additions and 0 deletions.
128 changes: 128 additions & 0 deletions
use reinforce method to attack reinforce method/dqn_attack_enchanting.py
@@ -0,0 +1,128 @@
import gym
import time
import argparse
import numpy as np

import torch

from lib0 import wrappers
from lib0 import dqn_model

import collections
import torch.nn as nn
import torchvision.utils as vutils

from tensorboardX import SummaryWriter

DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
FPS = 25

# FPS limits how fast frames are rendered; the observation buffer is 4 x 84 x 84
def FGSM_ATTACK(state, net, epsilon=0.01, device=torch.device("cuda")):
    state = state.to(device)
    state.requires_grad = True
    output = net(state)
    # output = nn.Softmax(dim=1)(output)
    # adversarial target: push the Q-values toward a vector favoring action index 4
    target = torch.tensor([[0, 0, 0, 0, 2, 0]], dtype=torch.float).to(device)
    Iter_no = 0
    Iter_max_no = 5
    # while np.argmax(output.cpu().data.numpy()[0]) != 5 or Iter_no < Iter_max_no:
    while Iter_no < Iter_max_no:
        Iter_no = Iter_no + 1
        # print(torch.max(output))
        loss = nn.MSELoss()(output, target)
        net.zero_grad()
        loss.backward()
        with torch.no_grad():
            state_grad = state.grad.data
            sign_data_grad = state_grad.sign()
            Noise = epsilon * sign_data_grad
            state = state + Noise[0][0]
            # print(state.is_leaf)
            # note: torch.no_grad() keeps the updated state from being added
            # to the computation graph on every iteration
        state.requires_grad = True
        output = net(state)
        # output = nn.Softmax(dim=1)(output)
    return state


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # parser.add_argument("-m", "--model", required=True,
    #                     help="Model file to load")
    parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" +
                             DEFAULT_ENV_NAME)
    parser.add_argument("-r", "--record", help="Directory for video")
    parser.add_argument("--no-vis", default=True, dest='vis',
                        help="Disable visualization",
                        action='store_false')
    args = parser.parse_args()

    device = torch.device("cuda")

    writer = SummaryWriter()

    env = wrappers.make_env(args.env)
    # if args.record:
    #     env = gym.wrappers.Monitor(env, args.record)
    # the second argument is the directory the recording is written to; not used for now
    # env = gym.wrappers.Monitor(env, "recording")

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    state = torch.load("./PongNoFrameskip-v4-best_14.dat", map_location=lambda stg, _: stg)
    # the net lives on the GPU while the loaded parameters may not:
    # by default, torch saves the GPU version of the tensors if training ran on GPU,
    # so map_location is needed to remap the loaded tensors from GPU to CPU;
    # if you trained on a CPU, map_location is not needed

    net.load_state_dict(state)

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()
    # Counter tallies the chosen actions and returns them as a dict-like object

    ATTACK_FRAME_SKIP = 1
    # run the attack once every ATTACK_FRAME_SKIP frames
    step_count = 0

    while True:
        step_count = step_count + 1
        start_ts = time.time()
        # if args.vis:
        #     env.render()
        env.render()
        state_v = torch.tensor(np.array([state], copy=False)).to(device)
        # note: wrap in [state] to get a batch dimension, otherwise the shape is wrong

        if step_count % ATTACK_FRAME_SKIP == 0:
            writer.add_image("img", vutils.make_grid(
                state_v.data.cpu()[0][0], normalize=True), step_count)
            state_v = FGSM_ATTACK(state=state_v, net=net, device=device)
            writer.add_image("attack", vutils.make_grid(
                state_v.data.cpu()[0][0], normalize=True), step_count)

        q_vals = net(state_v).cpu().data.numpy()[0]
        # q_vals has shape (n_actions,), e.g. [v1, v2, ..., v6]
        action = np.argmax(q_vals)
        print(action)
        # argmax returns the index of the largest Q-value
        c[action] += 1
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
        # if args.vis:
        delta = 1 / FPS - (time.time() - start_ts)
        if delta > 0:
            time.sleep(delta)
    print("Total reward: %.2f" % total_reward)
    print("Action counts:", c)
    # if args.record:
    env.env.close()
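For context, the attack loop above repeatedly applies the targeted FGSM update x_adv = x + epsilon * sign(d L(f(x), target) / d x). The following is a minimal sketch of a single such step on a stand-in network; ToyQNet and fgsm_step are illustrative assumptions and not part of this repository's dqn_model.

import torch
import torch.nn as nn

class ToyQNet(nn.Module):
    # tiny stand-in for a DQN that maps a flat input to 6 Q-values
    def __init__(self, n_inputs=16, n_actions=6):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(n_inputs, 32), nn.ReLU(),
                                nn.Linear(32, n_actions))

    def forward(self, x):
        return self.fc(x)

def fgsm_step(x, net, target, epsilon=0.01):
    x = x.clone().detach().requires_grad_(True)   # leaf tensor that collects gradients
    loss = nn.MSELoss()(net(x), target)           # pull Q-values toward the adversarial target
    net.zero_grad()
    loss.backward()
    with torch.no_grad():
        return x + epsilon * x.grad.sign()        # one signed-gradient step

if __name__ == "__main__":
    net = ToyQNet()
    x = torch.rand(1, 16)
    target = torch.tensor([[0., 0., 0., 0., 2., 0.]])
    x_adv = fgsm_step(x, net, target)
    print((x_adv - x).abs().max())                # elementwise perturbation is about epsilon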
240 changes: 240 additions & 0 deletions
use reinforce method to attack reinforce method/dqn_pong.py
@@ -0,0 +1,240 @@
from lib0 import wrappers
from lib0 import dqn_model

import argparse
# command-line argument parsing
import time
# used later to measure training speed
import numpy as np
import collections
# collections provides container datatypes, e.g. namedtuple for naming tuple fields

import torch
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_REWARD_BOUND = 19

GAMMA = 0.99               # discount factor: how far-sighted the agent is
BATCH_SIZE = 32
REPLAY_SIZE = 10000
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 1000  # how often (in frames) the target network is synced
REPLAY_START_SIZE = 10000
# AGENT_PROCESS = 100      # k1: how many steps the agent collects data
# REGRESSION_EPOCH = 5     # k2: an epoch is a single pass through the full training set

EPSILON_DECAY_LAST_FRAME = 150000
EPSILON_START = 1.0   # epsilon decay: during the first 150,000 frames, epsilon
EPSILON_FINAL = 0.01  # is linearly decayed to 0.01


# core component 1: the replay buffer
Experience = collections.namedtuple(
    'Experience', field_names=['state', 'action', 'reward',
                               'done', 'new_state'])


class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)  # double-ended queue

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size,
                                   replace=False)
        # replace=False guarantees the sampled indices are all distinct
        states, actions, rewards, dones, next_states = \
            zip(*[self.buffer[idx] for idx in indices])
        # * unpacks the list so each experience tuple becomes a separate
        # argument to zip(iter1, iter2, ...)
        # zip yields tuples where the first items of each passed iterable are
        # paired together, then the second items, and so on
        return np.array(states), np.array(actions), \
               np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.uint8), \
               np.array(next_states)


class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = self.env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            # note: state_a must have shape (1, ...); do not write
            # np.array(self.state, copy=False)
            # copy=False reuses the original data; making a copy would be slower
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            # torch.max(..., dim=1) returns (values, indices);
            # q_vals_v has shape (1, n_actions), e.g. [[1, 2, 3, 4]];
            # act_v holds the index of the best action
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward,
                         is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward


def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(np.array(
        states, copy=False)).to(device)
    # the states are large, so copy=False avoids an extra copy before
    # converting to a tensor
    next_states_v = torch.tensor(np.array(
        next_states, copy=False)).to(device)
    actions_v = torch.tensor(actions).long().to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    # unsqueeze(-1) adds a dimension at the last position
    # squeeze() returns a tensor with all dimensions of size 1 removed
    # the index LongTensor must have the same number of dimensions as the
    # input tensor; its values are the indices picked along dim 1
    # [[1,2,3],[1,2,3],[1,2,3]] gathered with [[1],[0],[2]] => [[2],[1],[3]]
    # squeeze(-1) then removes that last dimension again
    # the result of gather() applied to a tensor is a differentiable operation

    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # tensor.max(dim=n) => (max_values, argmax indices)
        next_state_values[done_mask] = 0.0
        # without this, training will not converge: the value of terminal
        # states would drift because the network generalizes
        next_state_values = next_state_values.detach()
        # no_grad() is a temporary detach; .detach() detaches permanently

    expected_state_action_values = next_state_values * GAMMA + \
                                   rewards_v
    return nn.MSELoss()(state_action_values,
                        expected_state_action_values)
    # mean squared error


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False,
                        action="store_true", help="Enable cuda")
    parser.add_argument("--env", default=DEFAULT_ENV_NAME,
                        help="Name of the environment, default=" +
                             DEFAULT_ENV_NAME)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = wrappers.make_env(args.env)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape,
                            env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + args.env)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0

    ts_frame = 0
    ts = time.time()
    # track our speed

    best_m_reward = None
    # best mean reward

    while True:
        frame_idx += 1
        epsilon = max(EPSILON_FINAL, EPSILON_START -
                      frame_idx / EPSILON_DECAY_LAST_FRAME)

        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None:
            # an episode has finished
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            m_reward = np.mean(total_rewards[-100:])
            # if fewer than 100 episodes are available, average over what we have
            print("%d: done %d games, reward %.3f, "
                  "eps %.2f, speed %.2f f/s" % (
                      frame_idx, len(total_rewards), m_reward, epsilon,
                      speed
                  ))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", m_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)
            if best_m_reward is None or best_m_reward < m_reward:
                torch.save(net.state_dict(), args.env +
                           "-best_%.0f.dat" % m_reward)
                if best_m_reward is not None:
                    print("Best reward updated %.3f -> %.3f" % (
                        best_m_reward, m_reward))
                best_m_reward = m_reward
            if m_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        if len(buffer) < REPLAY_START_SIZE:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()

    writer.close()
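For context, the heart of calc_loss above is the one-step TD target y = r + GAMMA * max_a' Q_tgt(s', a'), with terminal transitions masked to zero before bootstrapping. Below is a tiny numeric sketch of that computation; all values are made up for illustration.

import torch

GAMMA = 0.99
rewards = torch.tensor([1.0, 0.0, -1.0])
dones = torch.tensor([False, False, True])
next_q = torch.tensor([[0.2, 0.5],     # Q_tgt(s', .) for three transitions
                       [1.0, 0.8],
                       [0.3, 0.4]])

next_state_values = next_q.max(1)[0]   # greedy value of each next state
next_state_values[dones] = 0.0         # terminal transitions bootstrap to zero
targets = rewards + GAMMA * next_state_values
print(targets)                         # tensor([ 1.4950,  0.9900, -1.0000])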