[RLlib] APPO+new-stack (Atari benchmark) - Preparatory PR 01. (ray-pr…
sven1977 committed Apr 26, 2023
1 parent 3e04104 commit 5a3954e
Showing 42 changed files with 243 additions and 138 deletions.
25 changes: 12 additions & 13 deletions rllib/BUILD
@@ -171,6 +171,16 @@ py_test(
args = ["--dir=tuned_examples/appo"]
)

py_test(
name = "learning_tests_cartpole_appo_w_rl_modules_and_learner",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "tf_only", "no_tf_static_graph"],
size = "medium", # bazel may complain about it being too long sometimes - medium is on purpose as some frameworks take longer
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml"],
args = ["--dir=tuned_examples/appo"]
)
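For reference, a hedged sketch of running this regression test outside of Bazel by invoking the driver script named in the target above. The script path and the --dir flag come from the target definition; the working directory (the repo's rllib/ folder) and the "python" interpreter on PATH are assumptions.

import subprocess

# Run the same regression driver the bazel target points at (main/srcs above),
# with the same --dir argument; assumes an RLlib-capable Python environment.
subprocess.run(
    ["python", "tests/run_regression_tests.py", "--dir=tuned_examples/appo"],
    check=True,
)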

# py_test(
# name = "learning_tests_cartpole_appo_vtrace",
# main = "tests/run_regression_tests.py",
@@ -589,8 +599,9 @@ py_test(
args = ["--dir=tuned_examples/ppo"]
)

# TODO (Sven): Enable tf2 for this test.
py_test(
name = "learning_tests_pendulum_ppo_with_rl_module_torch",
name = "learning_tests_pendulum_ppo_with_rl_module",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "torch_only"],
size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer
@@ -599,18 +610,6 @@ py_test(
args = ["--dir=tuned_examples/ppo"]
)

# TODO (Kourosh): tf2 is way slower than torch, eager mode is no enabled, I wonder if
# it would get faster with eager mode once it is enabled.
# py_test(
# name = "learning_tests_pendulum_ppo_with_rl_module_tf2_eager",
# main = "tests/run_regression_tests.py",
# tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "tf2_only", "no_tf_static_graph"],
# size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer
# srcs = ["tests/run_regression_tests.py"],
# data = ["tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml"],
# args = ["--dir=tuned_examples/ppo"]
# )

py_test(
name = "learning_tests_multi_agent_pendulum_ppo",
main = "tests/run_regression_tests.py",
2 changes: 1 addition & 1 deletion rllib/algorithms/algorithm.py
@@ -2158,7 +2158,7 @@ def default_resource_request(

# Default logic for RLlib Algorithms:
# Create one bundle per individual worker (local or remote).
# Use `num_cpus_for_driver` and `num_gpus` for the local worker and
# Use `num_cpus_for_local_worker` and `num_gpus` for the local worker and
# `num_cpus_per_worker` and `num_gpus_per_worker` for the remote
# workers to determine their CPU/GPU resource needs.
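As a hedged illustration of the comment above (not the actual default_resource_request() code), the bundle layout it describes would look roughly like this; the config keys for the local/remote worker resources come from the comment, while the worker-count key and defaults are assumptions.

def sketch_worker_bundles(cfg: dict) -> list:
    # One bundle for the local worker, sized by `num_cpus_for_local_worker`
    # and `num_gpus` ...
    bundles = [{
        "CPU": cfg.get("num_cpus_for_local_worker", 1),
        "GPU": cfg.get("num_gpus", 0),
    }]
    # ... plus one bundle per remote worker, sized by `num_cpus_per_worker`
    # and `num_gpus_per_worker`.
    bundles += [
        {
            "CPU": cfg.get("num_cpus_per_worker", 1),
            "GPU": cfg.get("num_gpus_per_worker", 0),
        }
        for _ in range(cfg.get("num_rollout_workers", 0))  # worker-count key assumed
    ]
    return bundles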

3 changes: 1 addition & 2 deletions rllib/algorithms/appo/appo_tf_policy.py
@@ -48,8 +48,7 @@
logger = logging.getLogger(__name__)


# We need this builder function because we want to share the same
# custom logics between TF1 dynamic and TF2 eager policies.
# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
def get_appo_tf_policy(name: str, base: type) -> type:
"""Construct an APPOTFPolicy inheriting either dynamic or eager base policies.
3 changes: 1 addition & 2 deletions rllib/algorithms/appo/appo_torch_policy.py
@@ -54,8 +54,7 @@
logger = logging.getLogger(__name__)


# We need this builder function because we want to share the same
# custom logics between TF1 dynamic and TF2 eager policies.
# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
class APPOTorchPolicy(
VTraceOptimizer,
LearningRateSchedule,
1 change: 1 addition & 0 deletions rllib/algorithms/appo/utils.py
@@ -6,6 +6,7 @@
TARGET_POLICY_SCOPE = "target_func"


# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
def make_appo_models(policy) -> ModelV2:
"""Builds model and target model for APPO.
2 changes: 1 addition & 1 deletion rllib/algorithms/impala/impala_tf_policy.py
@@ -85,7 +85,7 @@ def __init__(
config: Algorithm config dict.
"""

# Compute vtrace on the CPU for better perf.
# Compute vtrace on the CPU for better performance.
with tf.device("/cpu:0"):
self.vtrace_returns = vtrace.multi_from_logits(
behaviour_action_log_probs=behaviour_action_logp,
2 changes: 1 addition & 1 deletion rllib/algorithms/impala/torch/vtrace_torch_v2.py
@@ -70,7 +70,7 @@ def vtrace_torch(
clip_rho_threshold: Union[float, "torch.Tensor"] = 1.0,
clip_pg_rho_threshold: Union[float, "torch.Tensor"] = 1.0,
):
r"""V-trace for softmax policies implemented with torch.
"""V-trace for softmax policies implemented with torch.
Calculates V-trace actor critic targets for softmax policies as described in
"IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner
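For reference (not part of this diff), the V-trace target this function computes, as defined in the IMPALA paper, can be written as:

v_s = V(x_s) + \sum_{t=s}^{s+n-1} \gamma^{t-s} \Big( \prod_{i=s}^{t-1} c_i \Big) \delta_t V,
\qquad \delta_t V = \rho_t \big( r_t + \gamma V(x_{t+1}) - V(x_t) \big),

with \rho_t = \min(\bar{\rho},\ \pi(a_t \mid x_t)/\mu(a_t \mid x_t)) and c_i = \min(\bar{c},\ \pi(a_i \mid x_i)/\mu(a_i \mid x_i)).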
13 changes: 0 additions & 13 deletions rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -13,19 +13,6 @@
torch, nn = try_import_torch()


def get_ppo_loss(fwd_in, fwd_out):
# TODO: we should replace these components later with real ppo components when
# RLOptimizer and RLModule are integrated together.
# this is not exactly a ppo loss, just something to show that the
# forward train works
adv = fwd_in[SampleBatch.REWARDS] - fwd_out[SampleBatch.VF_PREDS]
actor_loss = -(fwd_out[SampleBatch.ACTION_LOGP] * adv).mean()
critic_loss = (adv**2).mean()
loss = actor_loss + critic_loss

return loss


class PPOTorchRLModule(PPORLModuleBase, TorchRLModule):
framework: str = "torch"

9 changes: 0 additions & 9 deletions rllib/algorithms/slateq/slateq_tf_policy.py
@@ -207,15 +207,6 @@ def build_slateq_stats(policy: Policy, batch) -> Dict[str, TensorType]:
"q_loss": policy._q_loss,
"mean_actions": policy._mean_actions,
}
# if hasattr(policy, "_mean_grads_0"):
# stats.update({"mean_grads_0": policy._mean_grads_0})
# stats.update({"mean_grads_1": policy._mean_grads_1})
# stats.update({"mean_grads_2": policy._mean_grads_2})
# stats.update({"mean_grads_3": policy._mean_grads_3})
# stats.update({"mean_grads_4": policy._mean_grads_4})
# stats.update({"mean_grads_5": policy._mean_grads_5})
# stats.update({"mean_grads_6": policy._mean_grads_6})
# stats.update({"mean_grads_7": policy._mean_grads_7})
return stats


40 changes: 22 additions & 18 deletions rllib/core/learner/learner.py
@@ -522,14 +522,14 @@ def compile_results(

# We put the stats for all modules under the ALL_MODULES key. e.g. average of
# the gradients across all modules will go here.
mean_grads = [
np.mean(grad)
mean_abs_grads = [
np.mean(np.abs(grad))
for grad in convert_to_numpy(postprocessed_gradients.values())
if grad is not None
]

module_learner_stats[ALL_MODULES] = {
"mean_gradient": np.mean(mean_grads),
"mean_abs_postprocessed_gradients": np.mean(mean_abs_grads),
self.TOTAL_LOSS_KEY: loss_numpy[self.TOTAL_LOSS_KEY],
}
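A small hedged illustration (not from this commit) of why the mean of absolute gradient values is the more informative statistic: signed gradient values can cancel out and report roughly zero even when the individual gradients are large.

import numpy as np

grad = np.array([0.5, -0.5, 0.25, -0.25])
print(np.mean(grad))          # 0.0   -> signed mean hides the magnitude
print(np.mean(np.abs(grad)))  # 0.375 -> average magnitude actually present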

@@ -754,19 +754,21 @@ def additional_update_per_module(

@OverrideToImplementCustomLogic
def postprocess_gradients(
self, gradients_dict: Mapping[str, Any]
self,
gradients_dict: Mapping[str, Any],
) -> Mapping[str, Any]:
"""Applies potential postprocessings to the gradients.
"""Applies potential postprocessing operations on the gradients.
In some algorithms, we may want to perform some postprocessing on the
gradients before they are applied. This method is called after gradients
have been computed, and modifies them before they are applied.
This method is called after gradients have been computed, and modifies them
before they are applied to the respective module(s).
This includes grad clipping by value, norm, or global-norm, or other
algorithm specific gradient postprocessing steps.
Args:
gradients_dict: A dictionary of gradients.
Returns:
A dictionary of updated gradients.
A dictionary with the updated gradients.
"""
return gradients_dict
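A minimal hedged sketch of the kind of postprocessing the docstring mentions: global-norm gradient clipping over a gradients dict. This is NOT the RLlib implementation; it only illustrates the hook, assuming torch tensors and a caller-chosen max_norm.

from typing import Mapping, Optional

import torch


def clip_gradients_by_global_norm(
    gradients_dict: Mapping[str, Optional[torch.Tensor]],
    max_norm: float = 40.0,
) -> Mapping[str, Optional[torch.Tensor]]:
    grads = [g for g in gradients_dict.values() if g is not None]
    # Global L2 norm over all gradient tensors.
    global_norm = torch.sqrt(sum(torch.sum(g * g) for g in grads))
    # Scale factor capped at 1.0: gradients already within bounds stay untouched.
    clip_coef = torch.clamp(max_norm / (global_norm + 1e-6), max=1.0)
    return {
        k: (g * clip_coef if g is not None else None)
        for k, g in gradients_dict.items()
    }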

@@ -776,7 +778,9 @@ def update(
*,
minibatch_size: Optional[int] = None,
num_iters: int = 1,
reduce_fn: Callable[[ResultDict], ResultDict] = _reduce_mean_results,
reduce_fn: Callable[[List[Mapping[str, Any]]], ResultDict] = (
_reduce_mean_results
),
) -> Mapping[str, Any]:
"""Do `num_iters` minibatch updates given the original batch.
@@ -957,29 +961,29 @@ def _make_module(self) -> MultiAgentRLModule:
This method uses `self._module_specs` or `self._module_obj` to construct the
module. If the module_class is a single agent RL module it will be wrapped to a
multi-agent RL module. Override this method if there are other things than
needs to happen for instantiation of the module.
multi-agent RL module. Override this method if there are other things that
need to happen for instantiation of the module.
Returns:
The constructed module.
A constructed MultiAgentRLModule.
"""
if self._module_obj is not None:
module = self._module_obj
else:
module = self._module_spec.build()
# If not already, convert to MultiAgentRLModule.
module = module.as_multi_agent()
return module

def _check_result(self, result: Mapping[str, Any]) -> None:
"""Checks whether the result has the correct format.
All the keys should be referencing the module ids that got updated. There is a
special key `__all__` that hold any extra information that is not specific to a
module.
special key `ALL_MODULES` that hold any extra information that is not specific
to a module.
Args:
results: The result of the update.
result: The result of the update.
Raises:
ValueError: If the result are not in the correct format.
Expand All @@ -1000,7 +1004,7 @@ def _check_result(self, result: Mapping[str, Any]) -> None:
if key not in self.module.keys():
raise ValueError(
f"The key {key} in the result of the update is not a valid "
f"module id. Valid module ids are: {self.module.keys()}"
f"module id. Valid module ids are: {list(self.module.keys())}."
)

@OverrideToImplementCustomLogic_CallToSuperRecommended
2 changes: 1 addition & 1 deletion rllib/core/learner/learner_group_config.py
@@ -114,7 +114,7 @@ def module(
def resources(
self,
num_learner_workers: Optional[int] = NotProvided,
num_gpus_per_learner_worker: Optional[Union[float, int]] = NotProvided,
num_gpus_per_learner_worker: Optional[int] = NotProvided,
num_cpus_per_learner_worker: Optional[Union[float, int]] = NotProvided,
local_gpu_idx: Optional[int] = NotProvided,
) -> "LearnerGroupConfig":
4 changes: 3 additions & 1 deletion rllib/core/learner/scaling_config.py
@@ -13,7 +13,9 @@ class LearnerGroupScalingConfig:
training will run on a single CPU.
num_gpus_per_worker: The number of GPUs to allocate per worker. If
num_workers=0, any number greater than 0 will run the training on a single
GPU. A value of zero will run the training on a single CPU.
GPU. A value of zero will run the training on `num_cpus_per_worker` CPUs.
Fractional values (e.g. 0.5) are currently NOT supported as these might
cause CUDA async errors.
local_gpu_idx: if num_gpus_per_worker > 0, and num_workers<2, then this gpu
index will be used for training. This is an index into the available cuda
devices. For example if os.environ["CUDA_VISIBLE_DEVICES"] = "1" then a
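A hedged usage sketch of the whole-number-GPU rule documented above. The class and field names appear in this file's docstring, but the import path and exact constructor signature are assumptions.

from ray.rllib.core.learner.scaling_config import LearnerGroupScalingConfig  # path assumed

scaling_config = LearnerGroupScalingConfig(
    num_workers=2,           # two remote learner workers
    num_gpus_per_worker=1,   # whole numbers only; 0 => train on num_cpus_per_worker CPUs
    num_cpus_per_worker=1,
    local_gpu_idx=0,
)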
4 changes: 2 additions & 2 deletions rllib/core/learner/tests/test_learner_group.py
@@ -138,8 +138,8 @@ def test_update_multigpu(self):

self.assertLess(min_loss, 0.57)

# make sure the learner_group resources are freed up so that we don't
# autoscale
# Make sure the learner_group resources are freed up so that we don't
# autoscale.
learner_group.shutdown()
del learner_group

8 changes: 4 additions & 4 deletions rllib/core/models/tf/primitives.py
@@ -24,9 +24,9 @@ def __init__(
input_dim: int,
hidden_layer_dims: List[int],
hidden_layer_use_layernorm: bool = False,
hidden_layer_activation: Union[str, Callable] = "relu",
hidden_layer_activation: Optional[Union[str, Callable]] = "relu",
output_dim: Optional[int] = None,
output_activation: Union[str, Callable] = "linear",
output_activation: Optional[Union[str, Callable]] = "linear",
use_bias: bool = True,
):
"""Initialize a TfMLP object.
@@ -112,7 +112,7 @@ def __init__(
input_dims: Union[List[int], Tuple[int]],
cnn_filter_specifiers: List[List[Union[int, List]]],
cnn_use_layernorm: bool = False,
cnn_activation: str = "relu",
cnn_activation: Optional[str] = "relu",
use_bias: bool = True,
):
"""Initializes a TfCNN instance.
@@ -188,7 +188,7 @@ def __init__(
*,
input_dims: Union[List[int], Tuple[int]],
cnn_transpose_filter_specifiers: List[List[Union[int, List]]],
cnn_transpose_activation: str = "relu",
cnn_transpose_activation: Optional[str] = "relu",
cnn_transpose_use_layernorm: bool = False,
use_bias: bool = True,
):
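A hedged construction example for the now-Optional activation arguments. The constructor arguments come from the diff above, while the import path and the exact meaning of None are assumptions (None/"linear" is taken to mean identity outputs).

from ray.rllib.core.models.tf.primitives import TfMLP  # path assumed

mlp = TfMLP(
    input_dim=16,
    hidden_layer_dims=[64, 64],
    hidden_layer_activation="relu",
    output_dim=2,
    output_activation=None,  # assumed equivalent to "linear" (no activation)
)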
5 changes: 1 addition & 4 deletions rllib/policy/eager_tf_policy_v2.py
@@ -98,7 +98,7 @@ def __init__(
self._loss_initialized = False
# Backward compatibility workaround so Policy will call self.loss() directly.
# TODO(jungong): clean up after all policies are migrated to new sub-class
# implementation.
# implementation.
self._loss = None

self.batch_divisibility_req = self.get_batch_divisibility_req()
@@ -864,8 +864,6 @@ def _compute_actions_helper(
dist_inputs = None

elif is_overridden(self.action_sampler_fn):
dist_inputs = None
state_out = []
actions, logp, dist_inputs, state_out = self.action_sampler_fn(
self.model,
input_dict[SampleBatch.OBS],
@@ -875,7 +873,6 @@
)
else:
if is_overridden(self.action_distribution_fn):

# Try new action_distribution_fn signature, supporting
# state_batches and seq_lens.
(
31 changes: 23 additions & 8 deletions rllib/policy/torch_mixins.py
@@ -172,7 +172,7 @@ class TargetNetworkMixin:
- Adds the `update_target` method to the policy.
Calling `update_target` updates all target Q-networks' weights from their
respective "main" Q-metworks, based on tau (smooth, partial updating).
respective "main" Q-networks, based on tau (smooth, partial updating).
"""

def __init__(self):
@@ -184,17 +184,32 @@ def update_target(self, tau=None):
# Update_target_fn will be called periodically to copy Q network to
# target Q network, using (soft) tau-synching.
tau = tau or self.config.get("tau", 1.0)

model_state_dict = self.model.state_dict()

# Support partial (soft) synching.
# If tau == 1.0: Full sync from Q-model to target Q-model.
target_state_dict = next(iter(self.target_models.values())).state_dict()
model_state_dict = {
k: tau * model_state_dict[k] + (1 - tau) * v
for k, v in target_state_dict.items()
}

for target in self.target_models.values():
target.load_state_dict(model_state_dict)
if self.config.get("_enable_rl_module_api", False):
target_current_network_pairs = self.model.get_target_network_pairs()
for target_network, current_network in target_current_network_pairs:
current_state_dict = current_network.state_dict()
new_state_dict = {
k: tau * current_state_dict[k] + (1 - tau) * v
for k, v in target_network.state_dict().items()
}
target_network.load_state_dict(new_state_dict)
else:
# Support partial (soft) synching.
# If tau == 1.0: Full sync from Q-model to target Q-model.
target_state_dict = next(iter(self.target_models.values())).state_dict()
model_state_dict = {
k: tau * model_state_dict[k] + (1 - tau) * v
for k, v in target_state_dict.items()
}

for target in self.target_models.values():
target.load_state_dict(model_state_dict)

@override(TorchPolicy)
def set_weights(self, weights):
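A self-contained hedged toy example (plain torch, not RLlib code) of the tau-soft update both branches above perform: target <- tau * current + (1 - tau) * target.

import torch.nn as nn

current_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
tau = 0.005  # small tau -> slow, smooth tracking; tau=1.0 -> full hard sync

new_state = {
    k: tau * current_net.state_dict()[k] + (1 - tau) * v
    for k, v in target_net.state_dict().items()
}
target_net.load_state_dict(new_state)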
5 changes: 4 additions & 1 deletion rllib/tuned_examples/a2c/atari-a2c.yaml
@@ -11,8 +11,11 @@ atari-a2c:
config:
# Works for both torch and tf.
framework: torch
# Make analogous to old v4 + NoFrameskip.
env_config:
frameskip: 1 # no frameskip
frameskip: 1
full_action_space: false
repeat_action_probability: 0.0
train_batch_size: 500
rollout_fragment_length: auto
clip_rewards: True
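A hedged illustration of what the env_config above emulates on the ALE v5 API: the old "<Game>NoFrameskip-v4" behavior (no frameskip, no sticky actions, minimal action set). The environment id and kwargs are standard ALE/gymnasium options but are listed here as assumptions, since the yaml's env id is not shown in this hunk.

import gymnasium as gym

env = gym.make(
    "ALE/Breakout-v5",              # example game; requires ale-py installed
    frameskip=1,                    # NoFrameskip
    repeat_action_probability=0.0,  # v4-style determinism (no sticky actions)
    full_action_space=False,        # minimal action set, as in v4
)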
(The remaining changed files of this commit are not rendered here.)
