-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGamblersProblem.py
66 lines (53 loc) · 2.38 KB
/
GamblersProblem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
FROM: http://www.incompleteideas.net/sutton/book/first/4/node5.html
A gambler has the opportunity to make bets on the outcomes of a sequence of coin flips.
If the coin comes up heads, then he wins as many dollars as he has staked on that flip,
but if it is tails then he loses his stake.
The game ends when the gambler wins by reaching his goal of 100 dollars,
or loses by running out of money. On each flip,
the gambler must decide what portion of his capital to stake, in integer numbers of dollars.
This problem can be formulated as an undiscounted, episodic, finite MDP.
state: s in {1,2,...,99}
action(state): a in {1,2,...,min(s,100-s)}
reward(next_state): 1 if next_state is the goal and 0 otherwise
parameters:
p: the probability of the coin landing heads
"""
from MDP import *
from ValueIteration import *
class GamblersProblem(MDP):
    """Gambler's problem formulated as an undiscounted, episodic MDP.

    A state is the gambler's current capital and an action is the stake
    placed on the next coin flip.  Heads (probability ``p``) adds the stake
    to the capital; tails (probability ``1 - p``) subtracts it.  The episode
    ends when the capital equals ``goal`` or drops below 1.
    """

    def __init__(self, p: float, goal: int = 100):
        """Set up the problem.

        Args:
            p: Probability that the coin lands heads (the gambler wins).
            goal: Capital at which the gambler wins the game.

        Raises:
            ValueError: If ``p`` is not a probability in ``[0, 1]``.
        """
        # Reject invalid probabilities early; otherwise succProbReward would
        # silently emit transition "probabilities" outside [0, 1].
        if not 0 <= p <= 1:
            raise ValueError(f"p must be a probability in [0, 1], got {p!r}")
        super().__init__(discount=1.0)
        self.p = p
        self.goal = goal

    def is_terminal(self, state: int) -> bool:
        """Return True if ``state`` is the goal or the gambler is out of money."""
        return state == self.goal or state < 1

    def actions(self, state: int) -> range:
        """Return the stakes that can be placed from ``state``.

        A stake is at least 1 and is capped by both the current capital and
        the amount still missing to reach the goal, so a win never overshoots
        the goal.  For the terminal states (0 and ``goal``) the range is empty.
        """
        return range(1, min(state, self.goal - state) + 1)

    def succProbReward(self, state: int, action: int) -> list:
        """Return the outcomes of staking ``action`` while holding ``state``.

        Returns:
            A list of ``(new_state, probability, reward)`` triples: the heads
            outcome (win the stake) first, then the tails outcome (lose it).
        """
        heads_state = state + action
        tails_state = state - action
        return [
            (heads_state, self.p, self.reward(next_state=heads_state)),
            (tails_state, 1 - self.p, self.reward(next_state=tails_state)),
        ]

    def reward(self, state=None, action=None, next_state=None):
        """Return the reward received when transitioning into ``next_state``.

        Only ``next_state`` determines the reward; ``state`` and ``action``
        are accepted but not used, so callers may pass just ``next_state``
        by keyword.

        Returns:
            1 if ``next_state`` is the goal, otherwise 0.

        Raises:
            TypeError: If ``next_state`` is not supplied.
        """
        # Without this guard a missing next_state used to fail with an opaque
        # "'<' not supported between NoneType and int" error.
        if next_state is None:
            raise TypeError("reward() requires next_state")
        # Reaching the goal is the only rewarded event; the goal is terminal
        # by definition, so no separate is_terminal check is needed.
        return 1 if next_state == self.goal else 0

    @property
    def states(self) -> range:
        """Iterable of all non-terminal states: capital ``1`` to ``goal - 1``."""
        return range(1, self.goal)
if __name__ == "__main__":
    # Demo run: solve the problem for a coin that lands heads with
    # probability 0.25, then print both results of value iteration.
    problem = GamblersProblem(p=0.25)
    solution = ValueIteration(problem)
    V, pi = solution  # unpacked as a (values, policy) pair
    print(f"Values: {V}")
    print(f"Policy: {pi}")