-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuffer.py
152 lines (124 loc) · 4.81 KB
/
buffer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
__author__ = "Georgi Tancev, PhD"
__copyright__ = "© Georgi Tancev"
import numpy as np
import scipy.signal
import torch as th
import scipy.signal
def discount_cumsum(x, discount):
    """
    Return the discounted suffix sums of a sequence.

    Entry ``i`` of the result is
    ``x[i] + discount * x[i+1] + discount^2 * x[i+2] + ...`` taken up to
    the last element, so the final entry equals ``x[-1]``.

    Input:  [x0, x1, ..., xn]
    Output: [x0 + discount * x1 + ... + discount^n * xn,
             x1 + discount * x2 + ... + discount^(n-1) * xn,
             ...,
             xn]
    """
    # The filter implements y[k] = x[k] + discount * y[k-1]. Feeding it the
    # reversed input accumulates from the end of the sequence; reversing the
    # output puts the sums back in the original order.
    numerator = [1]
    denominator = [1, float(-discount)]
    backwards = x[::-1]
    accumulated = scipy.signal.lfilter(numerator, denominator, backwards,
                                       axis=0)
    return accumulated[::-1]
def combined_shape(length, shape=None):
    """
    Build an array shape whose leading dimension is ``length``.

    With no ``shape`` the result is ``(length,)``. A scalar ``shape``
    becomes the second dimension, giving ``(length, shape)``. Any other
    ``shape`` is unpacked after ``length``.
    """
    if shape is None:
        return (length,)
    if np.isscalar(shape):
        return (length, shape)
    return (length, *shape)
class Buffer:
    """
    Fixed-size buffer to store trajectories.

    Steps are appended with `store`, each trajectory is closed with
    `end_traj` (which fills in the rewards-to-go and the accumulated TD
    residuals for that trajectory), and once the buffer is full `get`
    returns everything as torch tensors and rewinds the buffer.
    """

    def __init__(self, obs_dim, act_dim, size, gamma, lamda):
        """
        Allocate zero-filled float32 storage for `size` timesteps.

        Parameters
        ----------
        obs_dim: None, scalar, or iterable of ints
            Per-step observation shape, combined with `size` through
            `combined_shape`.
        act_dim: None, scalar, or iterable of ints
            Per-step action shape, combined with `size` through
            `combined_shape`.
        size: int
            Number of timesteps the buffer holds.
        gamma: float
            Discount factor applied to rewards and to the next value in
            the TD residual.
        lamda: float
            GAE parameter; `lamda * gamma` is the discount used when
            accumulating the TD residuals.
        """
        # state space
        self.obs_buf = np.zeros(combined_shape(size, obs_dim),
                                dtype=np.float32)
        # action space
        self.act_buf = np.zeros(combined_shape(size, act_dim),
                                dtype=np.float32)
        # accumulated (lamda * gamma)-discounted TD residuals
        self.tdres_buf = np.zeros(size, dtype=np.float32)
        # rewards
        self.rew_buf = np.zeros(size, dtype=np.float32)
        # trajectory's remaining (discounted) return
        self.ret_buf = np.zeros(size, dtype=np.float32)
        # values predicted
        self.val_buf = np.zeros(size, dtype=np.float32)
        # log probabilities of chosen actions under behavior policy
        self.logp_buf = np.zeros(size, dtype=np.float32)
        # hyperparameters for GAE
        self.gamma = gamma
        self.lamda = lamda
        # index of the next free slot in the buffer
        self.ptr = 0
        # index where the current trajectory starts
        self.path_start_idx = 0
        # maximum size of the buffer
        self.max_size = size

    def store(self, obs, act, rew, val, logp):
        """
        Append a single timestep to the buffer. This is called at
        each environment update to store the observed outcome in
            self.obs_buf,
            self.act_buf,
            self.rew_buf,
            self.val_buf,
            self.logp_buf.

        Parameters
        ----------
        obs: torch.Tensor of shape (obs_dim, )
            State observation.
        act: torch.Tensor of shape (act_dim, )
            Applied action.
        rew: torch.Tensor of shape (1, )
            Observed rewards.
        val: torch.Tensor of shape (1, )
            Predicted values.
        logp: torch.Tensor of shape (1, )
            log probability of act under behavior policy

        Raises
        ------
        AssertionError
            If the buffer is already full.
        """
        # buffer has to have room so you can store
        assert self.ptr < self.max_size, "buffer is full; call get() first"
        slot = self.ptr
        self.obs_buf[slot, :] = obs
        self.act_buf[slot, :] = act
        self.rew_buf[slot] = rew
        self.val_buf[slot] = val
        self.logp_buf[slot] = logp
        # Update pointer after data is stored.
        self.ptr = slot + 1

    def end_traj(self, last_val=0):
        """
        Calculate for the trajectory that started at `path_start_idx`
            1) discounted rewards-to-go, and
            2) (lamda * gamma)-discounted sums of the TD residuals.
        Store these into self.ret_buf and self.tdres_buf respectively.
        The function is called after a trajectory ends. If no step has
        been stored since the previous call, nothing is computed.

        Parameters
        ----------
        last_val: np.float32
            Value of the final state if the rollout is cut off at a
            certain state, or 0 if the trajectory ended uninterrupted.
            It bootstraps both the rewards-to-go and the last residual.
        """
        # An empty trajectory has nothing to fill in, and the start index
        # already equals the pointer, so there is no state to update.
        if self.ptr == self.path_start_idx:
            return

        # Slots of the buffer that belong to the current trajectory.
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # TD residual per step: r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.tdres_buf[path_slice] = discount_cumsum(
            delta, self.lamda * self.gamma)
        # The appended last_val only serves as bootstrap; drop its entry.
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]

        # The next trajectory starts where this one ended.
        self.path_start_idx = self.ptr

    def get(self):
        """
        Call after an epoch ends.
        Resets pointers and returns the buffer contents, with the
        accumulated TD residuals shifted to zero mean and scaled to unit
        standard deviation.

        Returns
        -------
        dict of str -> torch.Tensor (float32)
            Keys are 'obs', 'act', 'ret', 'val', 'tdres' and 'logp'.

        Raises
        ------
        AssertionError
            If the buffer is not completely filled.
        """
        # Buffer has to be full before you can get something from it.
        assert self.ptr == self.max_size, "buffer must be full before get()"
        self.ptr, self.path_start_idx = 0, 0

        tdres_mean = np.mean(self.tdres_buf)
        tdres_std = np.std(self.tdres_buf)
        # All-equal residuals have zero spread; dividing by it would turn
        # the whole batch into NaN, so fall back to centring only.
        if tdres_std == 0:
            tdres_std = 1.0
        self.tdres_buf = (self.tdres_buf - tdres_mean) / tdres_std

        data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf,
                    val=self.val_buf, tdres=self.tdres_buf,
                    logp=self.logp_buf)
        # NOTE(review): torch.as_tensor avoids a copy when the dtype already
        # matches, so these tensors presumably share memory with the
        # internal buffers - consume them before storing new steps; confirm.
        return {k: th.as_tensor(v, dtype=th.float32)
                for k, v in data.items()}