Showing 12 changed files with 1,026 additions and 0 deletions.
128 changes: 128 additions & 0 deletions
use reinforce method to attack reinforce method/dqn_attack_enchanting.py
@@ -0,0 +1,128 @@
import gym
import time
import argparse
import numpy as np

import torch

from lib0 import wrappers
from lib0 import dqn_model

import collections
import torch.nn as nn
import torchvision.utils as vutils

from tensorboardX import SummaryWriter

DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
FPS = 25

# FPS limits how fast frames are rendered; the observation buffer is 4 x 84 x 84
def FGSM_ATTACK(state, net, epsilon=0.01, device=torch.device("cuda")):
    state = state.to(device)
    state.requires_grad = True
    output = net(state)
    # output = nn.Softmax(dim=1)(output)
    # adversarial target: push the Q-values toward a vector favoring action index 4
    target = torch.tensor([[0, 0, 0, 0, 2, 0]], dtype=torch.float).to(device)
    Iter_no = 0
    Iter_max_no = 5
    # while np.argmax(output.cpu().data.numpy()[0]) != 5 or Iter_no < Iter_max_no:
    while Iter_no < Iter_max_no:
        Iter_no = Iter_no + 1
        # print(torch.max(output))
        loss = nn.MSELoss()(output, target)
        net.zero_grad()
        loss.backward()
        with torch.no_grad():
            state_grad = state.grad.data
            sign_data_grad = state_grad.sign()
            Noise = epsilon * sign_data_grad
            state = state + Noise[0][0]
            # print(state.is_leaf)
            # note: torch.no_grad() keeps the updated state from being added
            # to the computation graph on every iteration
        state.requires_grad = True
        output = net(state)
        # output = nn.Softmax(dim=1)(output)
    return state


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # parser.add_argument("-m", "--model", required=True,
    #                     help="Model file to load")
    parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" +
                             DEFAULT_ENV_NAME)
    parser.add_argument("-r", "--record", help="Directory for video")
    parser.add_argument("--no-vis", default=True, dest='vis',
                        help="Disable visualization",
                        action='store_false')
    args = parser.parse_args()

    device = torch.device("cuda")

    writer = SummaryWriter()

    env = wrappers.make_env(args.env)
    # if args.record:
    #     env = gym.wrappers.Monitor(env, args.record)
    # the second argument is the directory the recording is written to; not used for now
    # env = gym.wrappers.Monitor(env, "recording")

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    state = torch.load("./PongNoFrameskip-v4-best_14.dat", map_location=lambda stg, _: stg)
    # the net lives on the GPU while the loaded parameters may not:
    # by default, torch saves the GPU version of the tensors if training ran on GPU,
    # so map_location is needed to remap the loaded tensors from GPU to CPU;
    # if you trained on a CPU, map_location is not needed

    net.load_state_dict(state)

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()
    # Counter tallies the chosen actions and returns them as a dict-like object

    ATTACK_FRAME_SKIP = 1
    # run the attack once every ATTACK_FRAME_SKIP frames
    step_count = 0

    while True:
        step_count = step_count + 1
        start_ts = time.time()
        # if args.vis:
        #     env.render()
        env.render()
        state_v = torch.tensor(np.array([state], copy=False)).to(device)
        # note: wrap in [state] to get a batch dimension, otherwise the shape is wrong

        if step_count % ATTACK_FRAME_SKIP == 0:
            writer.add_image("img", vutils.make_grid(
                state_v.data.cpu()[0][0], normalize=True), step_count)
            state_v = FGSM_ATTACK(state=state_v, net=net, device=device)
            writer.add_image("attack", vutils.make_grid(
                state_v.data.cpu()[0][0], normalize=True), step_count)

        q_vals = net(state_v).cpu().data.numpy()[0]
        # q_vals has shape (n_actions,), e.g. [v1, v2, ..., v6]
        action = np.argmax(q_vals)
        print(action)
        # argmax returns the index of the largest Q-value
        c[action] += 1
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
        # if args.vis:
        delta = 1 / FPS - (time.time() - start_ts)
        if delta > 0:
            time.sleep(delta)
    print("Total reward: %.2f" % total_reward)
    print("Action counts:", c)
    # if args.record:
    env.env.close()
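For context, the attack loop above repeatedly applies the targeted FGSM update x_adv = x + epsilon * sign(d L(f(x), target) / d x). The following is a minimal sketch of a single such step on a stand-in network; ToyQNet and fgsm_step are illustrative assumptions and not part of this repository's dqn_model.

import torch
import torch.nn as nn

class ToyQNet(nn.Module):
    # tiny stand-in for a DQN that maps a flat input to 6 Q-values
    def __init__(self, n_inputs=16, n_actions=6):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(n_inputs, 32), nn.ReLU(),
                                nn.Linear(32, n_actions))

    def forward(self, x):
        return self.fc(x)

def fgsm_step(x, net, target, epsilon=0.01):
    x = x.clone().detach().requires_grad_(True)   # leaf tensor that collects gradients
    loss = nn.MSELoss()(net(x), target)           # pull Q-values toward the adversarial target
    net.zero_grad()
    loss.backward()
    with torch.no_grad():
        return x + epsilon * x.grad.sign()        # one signed-gradient step

if __name__ == "__main__":
    net = ToyQNet()
    x = torch.rand(1, 16)
    target = torch.tensor([[0., 0., 0., 0., 2., 0.]])
    x_adv = fgsm_step(x, net, target)
    print((x_adv - x).abs().max())                # elementwise perturbation is about epsilon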
240 changes: 240 additions & 0 deletions
use reinforce method to attack reinforce method/dqn_pong.py
@@ -0,0 +1,240 @@
from lib0 import wrappers
from lib0 import dqn_model

import argparse
# command-line argument parsing
import time
# used later to measure training speed
import numpy as np
import collections
# collections provides container datatypes, e.g. namedtuple for naming tuple fields

import torch
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_REWARD_BOUND = 19

GAMMA = 0.99               # discount factor: how far-sighted the agent is
BATCH_SIZE = 32
REPLAY_SIZE = 10000
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 1000  # how often (in frames) the target network is synced
REPLAY_START_SIZE = 10000
# AGENT_PROCESS = 100      # k1: how many steps the agent collects data
# REGRESSION_EPOCH = 5     # k2: an epoch is a single pass through the full training set

EPSILON_DECAY_LAST_FRAME = 150000
EPSILON_START = 1.0   # epsilon decay: during the first 150,000 frames, epsilon
EPSILON_FINAL = 0.01  # is linearly decayed to 0.01


# core component 1: the replay buffer
Experience = collections.namedtuple(
    'Experience', field_names=['state', 'action', 'reward',
                               'done', 'new_state'])


class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)  # double-ended queue

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size,
                                   replace=False)
        # replace=False guarantees the sampled indices are all distinct
        states, actions, rewards, dones, next_states = \
            zip(*[self.buffer[idx] for idx in indices])
        # * unpacks the list so each experience tuple becomes a separate
        # argument to zip(iter1, iter2, ...)
        # zip yields tuples where the first items of each passed iterable are
        # paired together, then the second items, and so on
        return np.array(states), np.array(actions), \
               np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.uint8), \
               np.array(next_states)


class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = self.env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            # note: state_a must have shape (1, ...); do not write
            # np.array(self.state, copy=False)
            # copy=False reuses the original data; making a copy would be slower
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            # torch.max(..., dim=1) returns (values, indices);
            # q_vals_v has shape (1, n_actions), e.g. [[1, 2, 3, 4]];
            # act_v holds the index of the best action
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward,
                         is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward


def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(np.array(
        states, copy=False)).to(device)
    # the states are large, so copy=False avoids an extra copy before
    # converting to a tensor
    next_states_v = torch.tensor(np.array(
        next_states, copy=False)).to(device)
    actions_v = torch.tensor(actions).long().to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    # unsqueeze(-1) adds a dimension at the last position
    # squeeze() returns a tensor with all dimensions of size 1 removed
    # the index LongTensor must have the same number of dimensions as the
    # input tensor; its values are the indices picked along dim 1
    # [[1,2,3],[1,2,3],[1,2,3]] gathered with [[1],[0],[2]] => [[2],[1],[3]]
    # squeeze(-1) then removes that last dimension again
    # the result of gather() applied to a tensor is a differentiable operation

    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # tensor.max(dim=n) => (max_values, argmax indices)
        next_state_values[done_mask] = 0.0
        # without this, training will not converge: the value of terminal
        # states would drift because the network generalizes
        next_state_values = next_state_values.detach()
        # no_grad() is a temporary detach; .detach() detaches permanently

    expected_state_action_values = next_state_values * GAMMA + \
                                   rewards_v
    return nn.MSELoss()(state_action_values,
                        expected_state_action_values)
    # mean squared error


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False,
                        action="store_true", help="Enable cuda")
    parser.add_argument("--env", default=DEFAULT_ENV_NAME,
                        help="Name of the environment, default=" +
                             DEFAULT_ENV_NAME)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = wrappers.make_env(args.env)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape,
                            env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + args.env)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0

    ts_frame = 0
    ts = time.time()
    # track our speed

    best_m_reward = None
    # best mean reward

    while True:
        frame_idx += 1
        epsilon = max(EPSILON_FINAL, EPSILON_START -
                      frame_idx / EPSILON_DECAY_LAST_FRAME)

        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None:
            # an episode has finished
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            m_reward = np.mean(total_rewards[-100:])
            # if fewer than 100 episodes are available, average over what we have
            print("%d: done %d games, reward %.3f, "
                  "eps %.2f, speed %.2f f/s" % (
                      frame_idx, len(total_rewards), m_reward, epsilon,
                      speed
                  ))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", m_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)
            if best_m_reward is None or best_m_reward < m_reward:
                torch.save(net.state_dict(), args.env +
                           "-best_%.0f.dat" % m_reward)
                if best_m_reward is not None:
                    print("Best reward updated %.3f -> %.3f" % (
                        best_m_reward, m_reward))
                best_m_reward = m_reward
            if m_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        if len(buffer) < REPLAY_START_SIZE:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()

    writer.close()
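For context, the heart of calc_loss above is the one-step TD target y = r + GAMMA * max_a' Q_tgt(s', a'), with terminal transitions masked to zero before bootstrapping. Below is a tiny numeric sketch of that computation; all values are made up for illustration.

import torch

GAMMA = 0.99
rewards = torch.tensor([1.0, 0.0, -1.0])
dones = torch.tensor([False, False, True])
next_q = torch.tensor([[0.2, 0.5],     # Q_tgt(s', .) for three transitions
                       [1.0, 0.8],
                       [0.3, 0.4]])

next_state_values = next_q.max(1)[0]   # greedy value of each next state
next_state_values[dones] = 0.0         # terminal transitions bootstrap to zero
targets = rewards + GAMMA * next_state_values
print(targets)                         # tensor([ 1.4950,  0.9900, -1.0000])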