# rl.py (forked from BinRoot/TensorFlow-Book)
from yahoo_finance import Share
from matplotlib import pyplot as plt
import numpy as np
import random
import tensorflow as tf


class DecisionPolicy:
    """Abstract interface for a trading decision policy."""
    def select_action(self, current_state, step):
        pass

    def update_q(self, state, action, reward, next_state):
        pass


class RandomDecisionPolicy(DecisionPolicy):
    """Baseline policy that picks an action uniformly at random."""
    def __init__(self, actions):
        self.actions = actions

    def select_action(self, current_state, step):
        action = self.actions[random.randint(0, len(self.actions) - 1)]
        return action


class QLearningDecisionPolicy(DecisionPolicy):
    """Policy that approximates Q-values with a small two-layer neural network."""
    def __init__(self, actions, input_dim):
        self.epsilon = 0.9      # cap on the probability of exploiting the best known action
        self.gamma = 0.001      # discount factor for future rewards
        self.actions = actions
        output_dim = len(actions)
        h1_dim = 200

        # Network: state -> hidden ReLU layer -> one Q-value per action
        self.x = tf.placeholder(tf.float32, [None, input_dim])
        self.y = tf.placeholder(tf.float32, [output_dim])
        W1 = tf.Variable(tf.random_normal([input_dim, h1_dim]))
        b1 = tf.Variable(tf.constant(0.1, shape=[h1_dim]))
        h1 = tf.nn.relu(tf.matmul(self.x, W1) + b1)
        W2 = tf.Variable(tf.random_normal([h1_dim, output_dim]))
        b2 = tf.Variable(tf.constant(0.1, shape=[output_dim]))
        self.q = tf.nn.relu(tf.matmul(h1, W2) + b2)

        # Train by regressing the predicted Q-values toward the target values in self.y
        loss = tf.square(self.y - self.q)
        self.train_op = tf.train.AdagradOptimizer(0.01).minimize(loss)
        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())

    def select_action(self, current_state, step):
        threshold = min(self.epsilon, step / 1000.)
        if random.random() < threshold:
            # Exploit best option with probability epsilon
            action_q_vals = self.sess.run(self.q, feed_dict={self.x: current_state})
            action_idx = np.argmax(action_q_vals)  # TODO: replace w/ tensorflow's argmax
            action = self.actions[action_idx]
        else:
            # Explore random option with probability 1 - epsilon
            action = self.actions[random.randint(0, len(self.actions) - 1)]
        return action

    def update_q(self, state, action, reward, next_state):
        action_q_vals = self.sess.run(self.q, feed_dict={self.x: state})
        next_action_q_vals = self.sess.run(self.q, feed_dict={self.x: next_state})
        next_action_idx = np.argmax(next_action_q_vals)
        # Q-learning target for the action that was taken:
        # Q(s, a) <- r + gamma * max_a' Q(s', a')
        action_idx = self.actions.index(action)
        action_q_vals[0, action_idx] = reward + self.gamma * next_action_q_vals[0, next_action_idx]
        action_q_vals = np.squeeze(np.asarray(action_q_vals))
        self.sess.run(self.train_op, feed_dict={self.x: state, self.y: action_q_vals})


def run_simulation(policy, initial_budget, initial_num_stocks, prices, hist, debug=False):
    budget = initial_budget
    num_stocks = initial_num_stocks
    share_value = 0
    transitions = list()
    for i in range(len(prices) - hist - 1):
        if i % 100 == 0:
            print('progress {:.2f}%'.format(float(100 * i) / (len(prices) - hist - 1)))
        # State: window of the last `hist` prices plus current budget and holdings
        current_state = np.asmatrix(np.hstack((prices[i:i+hist], budget, num_stocks)))
        current_portfolio = budget + num_stocks * share_value
        action = policy.select_action(current_state, i)
        share_value = float(prices[i + hist + 1])
        if action == 'Buy' and budget >= share_value:
            budget -= share_value
            num_stocks += 1
        elif action == 'Sell' and num_stocks > 0:
            budget += share_value
            num_stocks -= 1
        else:
            action = 'Hold'
        # Reward is the change in total portfolio value after taking the action
        new_portfolio = budget + num_stocks * share_value
        reward = new_portfolio - current_portfolio
        next_state = np.asmatrix(np.hstack((prices[i+1:i+hist+1], budget, num_stocks)))
        transitions.append((current_state, action, reward, next_state))
        policy.update_q(current_state, action, reward, next_state)

    portfolio = budget + num_stocks * share_value
    if debug:
        print('${}\t{} shares'.format(budget, num_stocks))
    return portfolio


def run_simulations(policy, budget, num_stocks, prices, hist):
    num_tries = 10
    final_portfolios = list()
    for i in range(num_tries):
        final_portfolio = run_simulation(policy, budget, num_stocks, prices, hist)
        final_portfolios.append(final_portfolio)
    avg, std = np.mean(final_portfolios), np.std(final_portfolios)
    return avg, std


def get_prices(share_symbol, start_date, end_date, cache_filename='stock_prices.npy'):
    try:
        stock_prices = np.load(cache_filename)
    except IOError:
        share = Share(share_symbol)
        stock_hist = share.get_historical(start_date, end_date)
        stock_prices = [stock_price['Open'] for stock_price in stock_hist]
        np.save(cache_filename, stock_prices)
    return stock_prices


def plot_prices(prices):
    plt.title('Opening stock prices')
    plt.xlabel('day')
    plt.ylabel('price ($)')
    plt.plot(prices)
    plt.savefig('prices.png')


if __name__ == '__main__':
    prices = get_prices('MSFT', '1992-07-22', '2016-07-22')
    plot_prices(prices)
    actions = ['Buy', 'Sell', 'Hold']
    hist = 200
    # policy = RandomDecisionPolicy(actions)
    policy = QLearningDecisionPolicy(actions, hist + 2)
    budget = 1000.0
    num_stocks = 0
    avg, std = run_simulations(policy, budget, num_stocks, prices, hist)
    print(avg, std)