# Modified by Microsoft Corporation.
# Licensed under the MIT license.

import numpy as np

from convlab.agent import net
from convlab.agent.algorithm import policy_util
from convlab.agent.algorithm.base import Algorithm
from convlab.agent.net import net_util
from convlab.lib import logger, math_util, util
from convlab.lib.decorator import lab_api

logger = logger.get_logger(__name__)

class Reinforce(Algorithm):
    '''
    Implementation of REINFORCE (Williams, 1992) with baseline for discrete or continuous actions http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf
    Adapted from https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py
    Algorithm:
        0. Collect n episodes of data
        1. At each timestep in an episode
            - Calculate the advantage of that timestep
            - Multiply the advantage by the negative of the log probability of the action taken
        2. Sum all the values above.
        3. Calculate the gradient of this value with respect to all of the parameters of the network
        4. Update the network parameters using the gradient

    e.g. algorithm_spec:
    "algorithm": {
        "name": "Reinforce",
        "action_pdtype": "default",
        "action_policy": "default",
        "explore_var_spec": null,
        "gamma": 0.99,
        "entropy_coef_spec": {
            "name": "linear_decay",
            "start_val": 0.01,
            "end_val": 0.001,
            "start_step": 100,
            "end_step": 5000,
        },
        "training_frequency": 1,
    }
    '''
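
    # In loss form, the update above minimizes (per batch of episodes)
    #     L = -policy_loss_coef * mean_t[ log pi(a_t | s_t) * G_t ] - entropy_coef * H(pi)
    # where G_t is the discounted return from timestep t, used directly as the
    # advantage since plain REINFORCE learns no baseline. A runnable standalone
    # sketch of this loss is given after the class body below.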

    @lab_api
    def init_algorithm_params(self):
        '''Initialize other algorithm parameters'''
        # set defaults
        util.set_attr(self, dict(
            action_pdtype='default',
            action_policy='default',
            explore_var_spec=None,
            entropy_coef_spec=None,
            policy_loss_coef=1.0,
        ))
        util.set_attr(self, self.algorithm_spec, [
            'action_pdtype',
            'action_policy',
            # theoretically, REINFORCE needs no exploration variable (actions are sampled from the policy); this implementation keeps the option
            'explore_var_spec',
            'gamma',  # the discount factor
            'entropy_coef_spec',
            'policy_loss_coef',
            'training_frequency',
        ])
        self.to_train = 0
        self.action_policy = getattr(policy_util, self.action_policy)
        self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
        self.body.explore_var = self.explore_var_scheduler.start_val
        if self.entropy_coef_spec is not None:
            self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
            self.body.entropy_coef = self.entropy_coef_scheduler.start_val

    @lab_api
    def init_nets(self, global_nets=None):
        '''
        Initialize the neural network used to learn the policy function from the spec.
        Below we automatically select an appropriate net for a discrete or continuous action space if the setting is of the form 'MLPNet'. Otherwise the correct type of network is assumed to be specified in the spec.
        Networks for continuous action spaces have two heads and return two values: the first is a tensor containing the mean of the action policy, the second is a tensor containing the std deviation of the action policy. The distribution is assumed to be a Gaussian (Normal) distribution.
        Networks for discrete action spaces have a single head and return the logits for a categorical probability distribution over the discrete actions.
        '''
        in_dim = self.body.state_dim
        out_dim = net_util.get_out_dim(self.body)
        NetClass = getattr(net, self.net_spec['type'])
        self.net = NetClass(self.net_spec, in_dim, out_dim)
        self.net_names = ['net']
        # init net optimizer and its lr scheduler
        self.optim = net_util.get_optim(self.net, self.net.optim_spec)
        self.lr_scheduler = net_util.get_lr_scheduler(self.optim, self.net.lr_scheduler_spec)
        net_util.set_global_nets(self, global_nets)
        self.post_init_nets()
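
    # For reference, a minimal net_spec that init_nets above can consume might
    # look like the following (an illustrative sketch; the field values are
    # assumptions, not ConvLab defaults):
    #
    # "net": {
    #     "type": "MLPNet",
    #     "hid_layers": [64, 64],
    #     "hid_layers_activation": "relu",
    #     "optim_spec": {"name": "Adam", "lr": 0.002},
    #     "lr_scheduler_spec": null
    # }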

    @lab_api
    def calc_pdparam(self, x, net=None):
        '''
        The pdparam will be the logits for a discrete prob. dist., or the mean and std for a continuous prob. dist.
        '''
        net = self.net if net is None else net
        pdparam = net(x)
        return pdparam

    @lab_api
    def act(self, state):
        body = self.body
        action = self.action_policy(state, self, body)
        return action.cpu().squeeze().numpy()  # squeeze to handle scalar

    @lab_api
    def sample(self):
        '''Samples a batch from memory'''
        batch = self.body.memory.sample()
        batch = util.to_torch_batch(batch, self.net.device, self.body.memory.is_episodic)
        return batch

    def calc_pdparam_batch(self, batch):
        '''Efficiently forward by batch to get pdparam for loss computation'''
        states = batch['states']
        if self.body.env.is_venv:
            # vectorized envs yield (seq, num_envs, ...) batches; flatten the env dimension before the forward pass
            states = math_util.venv_unpack(states)
        pdparam = self.calc_pdparam(states)
        return pdparam

    def calc_ret_advs(self, batch):
        '''Calculate plain returns, which are generalized to advantages in ActorCritic'''
        rets = math_util.calc_returns(batch['rewards'], batch['dones'], self.gamma)
        advs = rets
        if self.body.env.is_venv:
            advs = math_util.venv_unpack(advs)
        logger.debug(f'advs: {advs}')
        return advs
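
    # math_util.calc_returns fills in the discounted return for every timestep.
    # A sketch of that computation under the usual convention (an assumption
    # about its behavior, not ConvLab's actual implementation):
    #
    #   ret = 0
    #   for t in reversed(range(T)):
    #       ret = rewards[t] + gamma * ret * (1 - dones[t])
    #       rets[t] = ret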

    def calc_policy_loss(self, batch, pdparams, advs):
        '''Calculate the actor's policy loss'''
        action_pd = policy_util.init_action_pd(self.body.ActionPD, pdparams)
        actions = batch['actions']
        if self.body.env.is_venv:
            actions = math_util.venv_unpack(actions)
        log_probs = action_pd.log_prob(actions)
        policy_loss = - self.policy_loss_coef * (log_probs * advs).mean()
        if self.entropy_coef_spec:
            entropy = action_pd.entropy().mean()
            self.body.mean_entropy = entropy  # update logging variable
            policy_loss += (-self.body.entropy_coef * entropy)
        logger.debug(f'Actor policy loss: {policy_loss:g}')
        return policy_loss
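
    # Note: the entropy term enters the loss with a negative sign, so minimizing
    # the loss also maximizes policy entropy; with the linear_decay schedule in
    # the example spec, entropy_coef anneals from 0.01 to 0.001, so this
    # exploration pressure fades over training.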

    @lab_api
    def train(self):
        if util.in_eval_lab_modes():
            return np.nan
        clock = self.body.env.clock
        if self.to_train == 1:
            batch = self.sample()
            clock.set_batch_size(len(batch))
            pdparams = self.calc_pdparam_batch(batch)
            advs = self.calc_ret_advs(batch)
            loss = self.calc_policy_loss(batch, pdparams, advs)
            self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
            # reset
            self.to_train = 0
            logger.info(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
            return loss.item()
        else:
            return np.nan

    @lab_api
    def update(self):
        self.body.explore_var = self.explore_var_scheduler.update(self, self.body.env.clock)
        if self.entropy_coef_spec is not None:
            self.body.entropy_coef = self.entropy_coef_scheduler.update(self, self.body.env.clock)
        return self.body.explore_var
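

# The function below is an illustrative, self-contained sketch of the loss
# that calc_ret_advs and calc_policy_loss compute together. It is not part of
# ConvLab's API and is never called by the lab; the name `_reinforce_loss_demo`
# and the toy episode inside it are hypothetical, for exposition only.
def _reinforce_loss_demo():
    import torch
    from torch.distributions import Categorical

    gamma = 0.99
    # a toy episode: 3 timesteps, 2 discrete actions
    logits = torch.randn(3, 2, requires_grad=True)  # stand-in for the policy net's pdparam output
    actions = torch.tensor([0, 1, 0])
    rewards = torch.tensor([1.0, 0.0, 1.0])
    # discounted returns G_t, computed backwards through the episode
    # (mirroring what math_util.calc_returns is assumed to do)
    rets, ret = torch.zeros(3), 0.0
    for t in reversed(range(3)):
        ret = rewards[t] + gamma * ret
        rets[t] = ret
    action_pd = Categorical(logits=logits)
    # REINFORCE loss: minimize -log pi(a_t | s_t) * G_t, averaged over timesteps
    loss = -(action_pd.log_prob(actions) * rets).mean()
    loss.backward()  # gradients w.r.t. the policy parameters (here, the raw logits)
    return loss.item()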


class WarmUpReinforce(Reinforce):
    '''
    REINFORCE with a warm-up period: identical to the Reinforce class above,
    except that it reads an extra `warmup_epi` key (the number of warm-up
    episodes) from the algorithm_spec.

    e.g. algorithm_spec:
    "algorithm": {
        "name": "WarmUpReinforce",
        "action_pdtype": "default",
        "action_policy": "default",
        "warmup_epi": 300,
        "explore_var_spec": null,
        "gamma": 0.99,
        "entropy_coef_spec": {
            "name": "linear_decay",
            "start_val": 0.01,
            "end_val": 0.001,
            "start_step": 100,
            "end_step": 5000,
        },
        "training_frequency": 1,
    }
    '''

    def __init__(self, agent, global_nets=None):
        super().__init__(agent, global_nets)
        util.set_attr(self, self.algorithm_spec, [
            'warmup_epi',
        ])