
[build_base][RLlib] APPO TF with RLModule and Learner API #33310

Merged: 51 commits from avnishn:appo_tf into ray-project:master on Mar 26, 2023
Changes from 1 commit (51 commits total)
517e0d6
Temp
avnishn Mar 13, 2023
293ac57
Temp
avnishn Mar 13, 2023
996d6af
Temp
avnishn Mar 14, 2023
d83f4de
Temp
avnishn Mar 14, 2023
3f4e27c
Temp
avnishn Mar 14, 2023
3495576
Merge branch 'master' of https://github.com/ray-project/ray into appo_tf
avnishn Mar 14, 2023
606f093
Make all TfModels tf.keras.Models
ArturNiederfahrenhorst Mar 15, 2023
134593e
Merge branch 'fixtensorflowacmodels' into appo_tf
avnishn Mar 15, 2023
8c5ae9b
Running appo
avnishn Mar 15, 2023
8a2a0f5
Lint, small updates
avnishn Mar 16, 2023
efc74fe
Move adding params to learner hps to validate in order to be compatib…
avnishn Mar 16, 2023
13548c1
Move adding params to learner hps to validate in order to be compatib…
avnishn Mar 16, 2023
cd1270c
Move learner_hp assignment from builder functions to validate
avnishn Mar 16, 2023
1f00e42
Merge branch 'move_learner_hp_assignment' into appo_tf
avnishn Mar 16, 2023
2ec8d08
Temp
avnishn Mar 17, 2023
62753d1
Temp
avnishn Mar 17, 2023
6c91172
Clip is ratio
avnishn Mar 17, 2023
61be19b
Merge branch 'master' of https://github.com/ray-project/ray into appo_tf
avnishn Mar 17, 2023
f0ea920
Wrote appo tf policy rlm which has working loss but isn't seemingly u…
avnishn Mar 19, 2023
47d4b5a
Merge branch 'master' of https://github.com/ray-project/ray into appo_tf
avnishn Mar 19, 2023
fb69db2
Add option for minibatching in impala/appo with the learner group
avnishn Mar 20, 2023
bb0daeb
Merge branch 'master' of https://github.com/ray-project/ray into appo_tf
avnishn Mar 20, 2023
68cd9df
Store most recent result for results reporting
avnishn Mar 21, 2023
5008690
dmc wrapper types
avnishn Mar 21, 2023
064adee
Merge branch 'appo_tf' of https://github.com/avnishn/ray into appo_tf
avnishn Mar 21, 2023
b015015
ADd back in UpdateTargetAndKL
avnishn Mar 21, 2023
7842087
Merge branch 'appo_tf' of https://github.com/avnishn/ray; branch 'mas…
avnishn Mar 21, 2023
2b9fec4
Fix broken tests
avnishn Mar 21, 2023
3fb3615
More tf related fixes
avnishn Mar 21, 2023
31adb97
More tf related fixes
avnishn Mar 21, 2023
25fdcac
Fix impala test
avnishn Mar 21, 2023
c89e151
Fixing remaining broken tests
avnishn Mar 21, 2023
2f0cea9
More tf related fixes
avnishn Mar 22, 2023
d61d198
More tf fixes with try catch
avnishn Mar 22, 2023
e83b0d4
Addressing comments
avnishn Mar 22, 2023
2a5acbe
Address comments
avnishn Mar 23, 2023
44bf47a
Ad rl module with target networks mixin interface
avnishn Mar 23, 2023
3f113e0
Temp
avnishn Mar 24, 2023
5746a8f
Address comments
avnishn Mar 24, 2023
6d28903
Address comments
avnishn Mar 24, 2023
2b3bfb4
Address comments
avnishn Mar 24, 2023
288620d
Merge branch 'master' of https://github.com/ray-project/ray into appo_tf
avnishn Mar 24, 2023
24a1d6e
Fix broken import
avnishn Mar 24, 2023
9594a6c
Lint
avnishn Mar 24, 2023
b8e4ec2
Merge branch 'master' of https://github.com/ray-project/ray into appo_tf
avnishn Mar 24, 2023
3837d36
Touching a file
avnishn Mar 24, 2023
2d6ac06
triggering the tests
kouroshHakha Mar 25, 2023
5fabfc6
Merge branch 'master' of https://github.com/ray-project/ray into appo_tf
avnishn Mar 26, 2023
117d9dd
Merge branch 'appo_tf' of https://github.com/avnishn/ray into appo_tf
avnishn Mar 26, 2023
68a19ab
Merge branch 'master' into appo_tf
kouroshHakha Mar 26, 2023
9e0d54b
Merge branch 'appo_tf' of github.com:avnishn/ray into appo_tf
kouroshHakha Mar 26, 2023
Temp
Signed-off-by: Avnish <[email protected]>
avnishn committed Mar 17, 2023
commit 2ec8d088bc15fb4a6bd524a87c391121fac9d442
61 changes: 34 additions & 27 deletions rllib/algorithms/appo/appo.py
@@ -262,33 +262,38 @@ def after_train_step(self, train_results: ResultDict) -> None:
last_update = self._counters[LAST_TARGET_UPDATE_TS]

if self.config._enable_learner_api:
if train_results:
# using steps trained here instead of sampled ... I'm not sure why the
# other implemenetation uses sampled.
# to be quite frank, im not sure if I understand how their target update
# freq would work. The difference in steps sampled/trained is pretty
# much always going to be larger than self.config.num_sgd_iter *
# self.config.minibatch_buffer_size unless the number of steps collected
# is really small. The thing is that the default rollout fragment length
# is 50, so the minibatch buffer size * num_sgd_iter is going to be
# have to be 50 to even meet the threshold of having delayed target
# updates.
# we should instead have the target / kl threshold update be based off
# of the train_batch_size * some target update frequency * num_sgd_iter.
cur_ts = self._counters[
NUM_ENV_STEPS_TRAINED
if self.config.count_steps_by == "env_steps"
else NUM_AGENT_STEPS_TRAINED
]
target_update_steps_freq = (
self.config.num_sgd_iter
* self.config.train_batch_size
* self.config.target_update_frequency
)
if cur_ts - last_update > target_update_steps_freq:
self._counters[NUM_TARGET_UPDATES] += 1
self._counters[LAST_TARGET_UPDATE_TS] = cur_ts
self.learner_group.additional_update()
# using steps trained here instead of sampled ... I'm not sure why the
# other implemenetation uses sampled.
# to be quite frank, im not sure if I understand how their target update
# freq would work. The difference in steps sampled/trained is pretty
# much always going to be larger than self.config.num_sgd_iter *
# self.config.minibatch_buffer_size unless the number of steps collected
# is really small. The thing is that the default rollout fragment length
# is 50, so the minibatch buffer size * num_sgd_iter is going to be
# have to be 50 to even meet the threshold of having delayed target
# updates.
# we should instead have the target / kl threshold update be based off
# of the train_batch_size * some target update frequency * num_sgd_iter.
# cur_ts = self._counters[
# NUM_ENV_STEPS_TRAINED
# if self.config.count_steps_by == "env_steps"
# else NUM_AGENT_STEPS_TRAINED
# ]
# target_update_steps_freq = (
# self.config.num_sgd_iter
# * self.config.train_batch_size
# * self.config.target_update_frequency
# )
cur_ts = self._counters[
NUM_AGENT_STEPS_SAMPLED
if self.config.count_steps_by == "agent_steps"
else NUM_ENV_STEPS_SAMPLED
]
target_update_steps_freq = 1
Contributor: hardcoded number?

Member Author: fixed

if cur_ts - last_update > target_update_steps_freq:
self._counters[NUM_TARGET_UPDATES] += 1
self._counters[LAST_TARGET_UPDATE_TS] = cur_ts
self.learner_group.additional_update()

else:
cur_ts = self._counters[
@@ -374,6 +379,8 @@ def get_default_policy_class(
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2

return EagerTFPolicyV2
# from ray.rllib.algorithms.appo.tf.appo_tf_policy_rlm import APPOTfPolicyWithRLModule
# return APPOTfPolicyWithRLModule
from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF2Policy

return APPOTF2Policy
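The diff's inline comments argue that the delayed target-network update should be keyed off `train_batch_size * num_sgd_iter * target_update_frequency` in trained steps, rather than sampled steps. That threshold check can be sketched as follows (a minimal standalone sketch for illustration, not RLlib's actual implementation; the function name and simplified parameters are assumptions):

```python
# Minimal sketch of the delayed target-network update check discussed in the
# diff comments above. NOT RLlib's actual code; names are simplified.

def should_update_target(
    cur_ts: int,
    last_update_ts: int,
    num_sgd_iter: int,
    train_batch_size: int,
    target_update_frequency: int,
) -> bool:
    """Return True once enough trained steps have elapsed since the last
    target-network update."""
    target_update_steps_freq = (
        num_sgd_iter * train_batch_size * target_update_frequency
    )
    return cur_ts - last_update_ts > target_update_steps_freq

# Example: with num_sgd_iter=6, train_batch_size=500, and frequency=1, the
# target networks update only after more than 3000 trained steps elapse.
print(should_update_target(3500, 0, 6, 500, 1))  # True
```

This is the scheme the comment block proposes; the committed code in this intermediate commit instead falls back to sampled-step counters with a hardcoded `target_update_steps_freq = 1`, which the review comment above flags.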
7 changes: 4 additions & 3 deletions rllib/algorithms/impala/impala.py
@@ -686,9 +686,9 @@ def training_step(self) -> ResultDict:
timeout_seconds=self.config.worker_health_probe_timeout_s,
mark_healthy=True,
)
if not train_results:
# adding this allows results to be properly logged by ray tune.
time.sleep(1e-1)
# if not train_results:
# # adding this allows results to be properly logged by ray tune.
# time.sleep(1e-1)
return train_results

@classmethod
@@ -865,6 +865,7 @@ def learn_on_processed_samples(self) -> ResultDict:
reduce_fn=_reduce_impala_results,
block=blocking,
num_iters=self.config.num_sgd_iter,
# minibatch_size=(2 * self.config.rollout_fragment_length)
)
else:
lg_results = None
18 changes: 14 additions & 4 deletions rllib/tuned_examples/appo/cartpole-appo.yaml
@@ -7,19 +7,29 @@ cartpole-appo:
config:
# Works for both torch and tf.
framework: tf2
num_workers: 4
num_workers:
grid_search:
- 3
num_gpus: 0
observation_filter: MeanStdFilter
num_sgd_iter: 6
vf_loss_coeff: 0.01
vtrace: True
grad_clip: 0

num_learner_workers: 1
model:
fcnet_hiddens: [32]
fcnet_activation: linear
vf_share_layers: true
enable_connectors: True
_enable_learner_api: True
_enable_rl_module_api: True
eager_tracing: False
# lr: 0.001
eager_tracing: True
lr: 0.001
seed:
grid_search:
- 1
- 2
- 3
- 4
- 5
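The `grid_search` entries added to the YAML above make Tune launch one trial per combination of values; with one `num_workers` value and five seeds, that is five trials. The expansion behaves like a Cartesian product (a rough illustration of the trial count only, not Ray Tune's internals; the variable names are assumptions):

```python
from itertools import product

# Hypothetical sketch: how grid_search values in the tuned-example YAML
# expand into trials (Cartesian product over all grid-searched keys).
num_workers_values = [3]           # grid_search over num_workers
seed_values = [1, 2, 3, 4, 5]      # grid_search over seed

trials = list(product(num_workers_values, seed_values))
print(len(trials))  # 5 trials: one per (num_workers, seed) combination
```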