#!/usr/bin/env python
from __future__ import print_function

import tensorflow as tf
import cv2
import sys
sys.path.append("game/")
import wrapped_flappy_bird as game
import random
import numpy as np
from collections import deque

GAME = 'bird' # the name of the game being played for log files
ACTIONS = 2 # number of valid actions
GAMMA = 0.99 # decay rate of past observations
OBSERVE = 100000. # timesteps to observe before training
EXPLORE = 2000000. # frames over which to anneal epsilon
FINAL_EPSILON = 0.0001 # final value of epsilon
INITIAL_EPSILON = 0. # starting value of epsilon
REPLAY_MEMORY = 50000 # number of previous transitions to remember
BATCH = 32 # size of minibatch
FRAME_PER_ACTION = 1
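# NOTE: with INITIAL_EPSILON at 0 the agent acts greedily from the first frame,
# which is the usual setting for running an already-trained network. When
# training from scratch, a larger starting value (e.g. 0.1) is commonly used so
# the epsilon-greedy branch below actually explores before annealing toward
# FINAL_EPSILON.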

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev = 0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.01, shape = shape)
    return tf.Variable(initial)

def conv2d(x, W, stride):
    return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME")

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME")

def createNetwork():
    # network weights
    W_conv1 = weight_variable([8, 8, 4, 32])
    b_conv1 = bias_variable([32])

    W_conv2 = weight_variable([4, 4, 32, 64])
    b_conv2 = bias_variable([64])

    W_conv3 = weight_variable([3, 3, 64, 64])
    b_conv3 = bias_variable([64])

    W_fc1 = weight_variable([1600, 512])
    b_fc1 = bias_variable([512])

    W_fc2 = weight_variable([512, ACTIONS])
    b_fc2 = bias_variable([ACTIONS])

    # input layer
    s = tf.placeholder("float", [None, 80, 80, 4])

    # hidden layers
    h_conv1 = tf.nn.relu(conv2d(s, W_conv1, 4) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)

    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2, 2) + b_conv2)
    #h_pool2 = max_pool_2x2(h_conv2)

    h_conv3 = tf.nn.relu(conv2d(h_conv2, W_conv3, 1) + b_conv3)
    #h_pool3 = max_pool_2x2(h_conv3)

    #h_pool3_flat = tf.reshape(h_pool3, [-1, 256])
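    # Shape check for the 80x80x4 input: conv1 (stride 4) -> 20x20x32,
    # 2x2 max pool -> 10x10x32, conv2 (stride 2) -> 5x5x64, conv3 (stride 1)
    # -> 5x5x64, so the flattened feature vector has 5*5*64 = 1600 elements,
    # matching W_fc1.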
    h_conv3_flat = tf.reshape(h_conv3, [-1, 1600])

    h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

    # readout layer
    readout = tf.matmul(h_fc1, W_fc2) + b_fc2

    return s, readout, h_fc1

def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
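    # readout_action is the Q-value of the action actually taken (a is one-hot),
    # and y holds the Q-learning target filled in by the training loop below:
    # r_t for terminal transitions, otherwise r_t + GAMMA * max_a' Q(s_t1, a').
    # The cost is the mean squared error between the two.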

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
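    # The network input is a stack of the 4 most recent 80x80 binary frames so
    # that motion is observable; the very first state simply repeats the initial
    # frame four times.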

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    while "flappy bird" != "angry bird":
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[0] = 1 # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        #s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)
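        # slide the frame stack: the newly observed frame becomes channel 0 and
        # the oldest of the previous four frames is dropped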

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()
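        # Experience replay: sampling minibatches from D instead of learning on
        # consecutive frames breaks the correlation between successive states
        # and lets each stored transition be reused for several updates.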

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if terminal, only equals reward
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch}
            )

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, \
            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
            "/ Q_MAX %e" % np.max(readout_t))
        # write info to files
        '''
        if t % 10000 <= 100:
            a_file.write(",".join([str(x) for x in readout_t]) + '\n')
            h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s:[s_t]})[0]]) + '\n')
            cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)
        '''

def playGame():
    sess = tf.InteractiveSession()
    s, readout, h_fc1 = createNetwork()
    trainNetwork(s, readout, h_fc1, sess)

def main():
    playGame()

if __name__ == "__main__":
    main()