forked from boyu-ai/Hands-on-RL
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchapter3.py
46 lines (40 loc) · 1.44 KB
/
chapter3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import numpy as np
np.random.seed(0)  # seed NumPy's global RNG

# State-transition probability matrix for the six states (6 x 6).
# Every row sums to 1; the last row has all of its mass on the last state.
P = np.array(
    [
        [0.9, 0.1, 0.0, 0.0, 0.0, 0.0],
        [0.5, 0.0, 0.5, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.6, 0.0, 0.4],
        [0.0, 0.0, 0.0, 0.0, 0.3, 0.7],
        [0.0, 0.2, 0.3, 0.5, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ]
)

# Reward function: one reward value per state, in state order.
rewards = [-1, -2, -2, 10, 1, 0]

# Discount factor.
gamma = 0.5
# 给定一条序列,计算从某个索引(起始状态)开始到序列最后(终止状态)得到的回报
from copy import deepcopy
def compute_return(start_index, chain, gamma, reward_table=None):
    """Compute the discounted return of ``chain`` from ``start_index`` onward.

    The value is accumulated backwards with the recursion
    ``G = gamma * G + r``, starting from ``G = 0`` past the end of the
    chain, which yields ``r_k + gamma * r_{k+1} + gamma**2 * r_{k+2} + ...``
    in a single pass.

    Worked example with ``chain = [1, 2, 3, 6]``, ``gamma = 0.5`` and
    rewards ``[-1, -2, -2, 10, 1, 0]``::

         0.0 = 0.5 *  0   +  0    # state 6
        -2.0 = 0.5 *  0.0 + -2    # state 3
        -3.0 = 0.5 * -2.0 + -2    # state 2
        -2.5 = 0.5 * -3.0 + -1    # state 1 -> value returned

    Args:
        start_index: Index into ``chain`` of the first state to include.
        chain: Sequence of 1-based state numbers; state ``s`` earns
            ``reward_table[s - 1]``.
        gamma: Discount factor applied once per step.
        reward_table: Optional per-state rewards. When omitted, the
            module-level ``rewards`` list is used.

    Returns:
        The discounted return. ``0`` when ``start_index`` is at or beyond
        the end of ``chain`` (the loop body never runs).
    """
    if reward_table is None:
        # Backward-compatible default: use the module-level reward list.
        reward_table = rewards

    total = 0
    # Iterate from the last state back to start_index so each step only
    # needs the return accumulated so far.
    for i in reversed(range(start_index, len(chain))):
        total = gamma * total + reward_table[chain[i] - 1]
    return total
# Evaluate one sample state sequence: s1 -> s2 -> s3 -> s6,
# starting from its first element.
start_index = 0
chain = [1, 2, 3, 6]
G = compute_return(start_index=start_index, chain=chain, gamma=gamma)
print("根据本序列计算得到回报为:%s。" % (G,))
# Printed output: 根据本序列计算得到回报为:-2.5。