[RLlib] APPO+new-stack (Atari benchmark) - Preparatory PR 01. (ray-pr…
sven1977 committed Apr 26, 2023
1 parent 3e04104 commit 5a3954e
Showing 42 changed files with 243 additions and 138 deletions.
25 changes: 12 additions & 13 deletions rllib/BUILD
@@ -171,6 +171,16 @@ py_test(
args = ["--dir=tuned_examples/appo"]
)

py_test(
name = "learning_tests_cartpole_appo_w_rl_modules_and_learner",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "tf_only", "no_tf_static_graph"],
size = "medium", # bazel may complain about it being too long sometimes - medium is on purpose as some frameworks take longer
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml"],
args = ["--dir=tuned_examples/appo"]
)
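For reference, a hedged sketch of running this regression test outside of Bazel by invoking the driver script named in the target above. The script path and the --dir flag come from the target definition; the working directory (the repo's rllib/ folder) and the "python" interpreter on PATH are assumptions.

import subprocess

# Run the same regression driver the bazel target points at (main/srcs above),
# with the same --dir argument; assumes an RLlib-capable Python environment.
subprocess.run(
    ["python", "tests/run_regression_tests.py", "--dir=tuned_examples/appo"],
    check=True,
)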

# py_test(
# name = "learning_tests_cartpole_appo_vtrace",
# main = "tests/run_regression_tests.py",
@@ -589,8 +599,9 @@ py_test(
args = ["--dir=tuned_examples/ppo"]
)

# TODO (Sven): Enable tf2 for this test.
py_test(
name = "learning_tests_pendulum_ppo_with_rl_module_torch",
name = "learning_tests_pendulum_ppo_with_rl_module",
main = "tests/run_regression_tests.py",
tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "torch_only"],
size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer
@@ -599,18 +610,6 @@ py_test(
args = ["--dir=tuned_examples/ppo"]
)

# TODO (Kourosh): tf2 is way slower than torch, eager mode is no enabled, I wonder if
# it would get faster with eager mode once it is enabled.
# py_test(
# name = "learning_tests_pendulum_ppo_with_rl_module_tf2_eager",
# main = "tests/run_regression_tests.py",
# tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "tf2_only", "no_tf_static_graph"],
# size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer
# srcs = ["tests/run_regression_tests.py"],
# data = ["tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml"],
# args = ["--dir=tuned_examples/ppo"]
# )

py_test(
name = "learning_tests_multi_agent_pendulum_ppo",
main = "tests/run_regression_tests.py",
2 changes: 1 addition & 1 deletion rllib/algorithms/algorithm.py
@@ -2158,7 +2158,7 @@ def default_resource_request(

# Default logic for RLlib Algorithms:
# Create one bundle per individual worker (local or remote).
# Use `num_cpus_for_driver` and `num_gpus` for the local worker and
# Use `num_cpus_for_local_worker` and `num_gpus` for the local worker and
# `num_cpus_per_worker` and `num_gpus_per_worker` for the remote
# workers to determine their CPU/GPU resource needs.
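As a hedged illustration of the comment above (not the actual default_resource_request() code), the bundle layout it describes would look roughly like this; the config keys for the local/remote worker resources come from the comment, while the worker-count key and defaults are assumptions.

def sketch_worker_bundles(cfg: dict) -> list:
    # One bundle for the local worker, sized by `num_cpus_for_local_worker`
    # and `num_gpus` ...
    bundles = [{
        "CPU": cfg.get("num_cpus_for_local_worker", 1),
        "GPU": cfg.get("num_gpus", 0),
    }]
    # ... plus one bundle per remote worker, sized by `num_cpus_per_worker`
    # and `num_gpus_per_worker`.
    bundles += [
        {
            "CPU": cfg.get("num_cpus_per_worker", 1),
            "GPU": cfg.get("num_gpus_per_worker", 0),
        }
        for _ in range(cfg.get("num_rollout_workers", 0))  # worker-count key assumed
    ]
    return bundles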

3 changes: 1 addition & 2 deletions rllib/algorithms/appo/appo_tf_policy.py
@@ -48,8 +48,7 @@
logger = logging.getLogger(__name__)


# We need this builder function because we want to share the same
# custom logics between TF1 dynamic and TF2 eager policies.
# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
def get_appo_tf_policy(name: str, base: type) -> type:
"""Construct an APPOTFPolicy inheriting either dynamic or eager base policies.
3 changes: 1 addition & 2 deletions rllib/algorithms/appo/appo_torch_policy.py
@@ -54,8 +54,7 @@
logger = logging.getLogger(__name__)


# We need this builder function because we want to share the same
# custom logics between TF1 dynamic and TF2 eager policies.
# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
class APPOTorchPolicy(
VTraceOptimizer,
LearningRateSchedule,
1 change: 1 addition & 0 deletions rllib/algorithms/appo/utils.py
@@ -6,6 +6,7 @@
TARGET_POLICY_SCOPE = "target_func"


# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
def make_appo_models(policy) -> ModelV2:
"""Builds model and target model for APPO.
2 changes: 1 addition & 1 deletion rllib/algorithms/impala/impala_tf_policy.py
@@ -85,7 +85,7 @@ def __init__(
config: Algorithm config dict.
"""

# Compute vtrace on the CPU for better perf.
# Compute vtrace on the CPU for better performance.
with tf.device("/cpu:0"):
self.vtrace_returns = vtrace.multi_from_logits(
behaviour_action_log_probs=behaviour_action_logp,
2 changes: 1 addition & 1 deletion rllib/algorithms/impala/torch/vtrace_torch_v2.py
@@ -70,7 +70,7 @@ def vtrace_torch(
clip_rho_threshold: Union[float, "torch.Tensor"] = 1.0,
clip_pg_rho_threshold: Union[float, "torch.Tensor"] = 1.0,
):
r"""V-trace for softmax policies implemented with torch.
"""V-trace for softmax policies implemented with torch.
Calculates V-trace actor critic targets for softmax policies as described in
"IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner
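For reference (not part of this diff), the V-trace target this function computes, as defined in the IMPALA paper, can be written as:

v_s = V(x_s) + \sum_{t=s}^{s+n-1} \gamma^{t-s} \Big( \prod_{i=s}^{t-1} c_i \Big) \delta_t V,
\qquad \delta_t V = \rho_t \big( r_t + \gamma V(x_{t+1}) - V(x_t) \big),

with \rho_t = \min(\bar{\rho},\ \pi(a_t \mid x_t)/\mu(a_t \mid x_t)) and c_i = \min(\bar{c},\ \pi(a_i \mid x_i)/\mu(a_i \mid x_i)).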
13 changes: 0 additions & 13 deletions rllib/algorithms/ppo/torch/ppo_torch_rl_module.py
@@ -13,19 +13,6 @@
torch, nn = try_import_torch()


def get_ppo_loss(fwd_in, fwd_out):
# TODO: we should replace these components later with real ppo components when
# RLOptimizer and RLModule are integrated together.
# this is not exactly a ppo loss, just something to show that the
# forward train works
adv = fwd_in[SampleBatch.REWARDS] - fwd_out[SampleBatch.VF_PREDS]
actor_loss = -(fwd_out[SampleBatch.ACTION_LOGP] * adv).mean()
critic_loss = (adv**2).mean()
loss = actor_loss + critic_loss

return loss


class PPOTorchRLModule(PPORLModuleBase, TorchRLModule):
framework: str = "torch"

9 changes: 0 additions & 9 deletions rllib/algorithms/slateq/slateq_tf_policy.py
@@ -207,15 +207,6 @@ def build_slateq_stats(policy: Policy, batch) -> Dict[str, TensorType]:
"q_loss": policy._q_loss,
"mean_actions": policy._mean_actions,
}
# if hasattr(policy, "_mean_grads_0"):
# stats.update({"mean_grads_0": policy._mean_grads_0})
# stats.update({"mean_grads_1": policy._mean_grads_1})
# stats.update({"mean_grads_2": policy._mean_grads_2})
# stats.update({"mean_grads_3": policy._mean_grads_3})
# stats.update({"mean_grads_4": policy._mean_grads_4})
# stats.update({"mean_grads_5": policy._mean_grads_5})
# stats.update({"mean_grads_6": policy._mean_grads_6})
# stats.update({"mean_grads_7": policy._mean_grads_7})
return stats


40 changes: 22 additions & 18 deletions rllib/core/learner/learner.py
@@ -522,14 +522,14 @@ def compile_results(

# We put the stats for all modules under the ALL_MODULES key. e.g. average of
# the gradients across all modules will go here.
mean_grads = [
np.mean(grad)
mean_abs_grads = [
np.mean(np.abs(grad))
for grad in convert_to_numpy(postprocessed_gradients.values())
if grad is not None
]

module_learner_stats[ALL_MODULES] = {
"mean_gradient": np.mean(mean_grads),
"mean_abs_postprocessed_gradients": np.mean(mean_abs_grads),
self.TOTAL_LOSS_KEY: loss_numpy[self.TOTAL_LOSS_KEY],
}
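A small hedged illustration (not from this commit) of why the mean of absolute gradient values is the more informative statistic: signed gradient values can cancel out and report roughly zero even when the individual gradients are large.

import numpy as np

grad = np.array([0.5, -0.5, 0.25, -0.25])
print(np.mean(grad))          # 0.0   -> signed mean hides the magnitude
print(np.mean(np.abs(grad)))  # 0.375 -> average magnitude actually present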

@@ -754,19 +754,21 @@ def additional_update_per_module(

@OverrideToImplementCustomLogic
def postprocess_gradients(
self, gradients_dict: Mapping[str, Any]
self,
gradients_dict: Mapping[str, Any],
) -> Mapping[str, Any]:
"""Applies potential postprocessings to the gradients.
"""Applies potential postprocessing operations on the gradients.
In some algorithms, we may want to perform some postprocessing on the
gradients before they are applied. This method is called after gradients
have been computed, and modifies them before they are applied.
This method is called after gradients have been computed, and modifies them
before they are applied to the respective module(s).
This includes grad clipping by value, norm, or global-norm, or other
algorithm specific gradient postprocessing steps.
Args:
gradients_dict: A dictionary of gradients.
Returns:
A dictionary of updated gradients.
A dictionary with the updated gradients.
"""
return gradients_dict
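A minimal hedged sketch of the kind of postprocessing the docstring mentions: global-norm gradient clipping over a gradients dict. This is NOT the RLlib implementation; it only illustrates the hook, assuming torch tensors and a caller-chosen max_norm.

from typing import Mapping, Optional

import torch


def clip_gradients_by_global_norm(
    gradients_dict: Mapping[str, Optional[torch.Tensor]],
    max_norm: float = 40.0,
) -> Mapping[str, Optional[torch.Tensor]]:
    grads = [g for g in gradients_dict.values() if g is not None]
    # Global L2 norm over all gradient tensors.
    global_norm = torch.sqrt(sum(torch.sum(g * g) for g in grads))
    # Scale factor capped at 1.0: gradients already within bounds stay untouched.
    clip_coef = torch.clamp(max_norm / (global_norm + 1e-6), max=1.0)
    return {
        k: (g * clip_coef if g is not None else None)
        for k, g in gradients_dict.items()
    }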

@@ -776,7 +778,9 @@ def update(
*,
minibatch_size: Optional[int] = None,
num_iters: int = 1,
reduce_fn: Callable[[ResultDict], ResultDict] = _reduce_mean_results,
reduce_fn: Callable[[List[Mapping[str, Any]]], ResultDict] = (
_reduce_mean_results
),
) -> Mapping[str, Any]:
"""Do `num_iters` minibatch updates given the original batch.
@@ -957,29 +961,29 @@ def _make_module(self) -> MultiAgentRLModule:
This method uses `self._module_specs` or `self._module_obj` to construct the
module. If the module_class is a single agent RL module it will be wrapped to a
multi-agent RL module. Override this method if there are other things than
needs to happen for instantiation of the module.
multi-agent RL module. Override this method if there are other things that
need to happen for instantiation of the module.
Returns:
The constructed module.
A constructed MultiAgentRLModule.
"""
if self._module_obj is not None:
module = self._module_obj
else:
module = self._module_spec.build()
# If not already, convert to MultiAgentRLModule.
module = module.as_multi_agent()
return module

def _check_result(self, result: Mapping[str, Any]) -> None:
"""Checks whether the result has the correct format.
All the keys should be referencing the module ids that got updated. There is a
special key `__all__` that hold any extra information that is not specific to a
module.
special key `ALL_MODULES` that hold any extra information that is not specific
to a module.
Args:
results: The result of the update.
result: The result of the update.
Raises:
ValueError: If the result are not in the correct format.
Expand All @@ -1000,7 +1004,7 @@ def _check_result(self, result: Mapping[str, Any]) -> None:
if key not in self.module.keys():
raise ValueError(
f"The key {key} in the result of the update is not a valid "
f"module id. Valid module ids are: {self.module.keys()}"
f"module id. Valid module ids are: {list(self.module.keys())}."
)

@OverrideToImplementCustomLogic_CallToSuperRecommended
2 changes: 1 addition & 1 deletion rllib/core/learner/learner_group_config.py
@@ -114,7 +114,7 @@ def module(
def resources(
self,
num_learner_workers: Optional[int] = NotProvided,
num_gpus_per_learner_worker: Optional[Union[float, int]] = NotProvided,
num_gpus_per_learner_worker: Optional[int] = NotProvided,
num_cpus_per_learner_worker: Optional[Union[float, int]] = NotProvided,
local_gpu_idx: Optional[int] = NotProvided,
) -> "LearnerGroupConfig":
4 changes: 3 additions & 1 deletion rllib/core/learner/scaling_config.py
@@ -13,7 +13,9 @@ class LearnerGroupScalingConfig:
training will run on a single CPU.
num_gpus_per_worker: The number of GPUs to allocate per worker. If
num_workers=0, any number greater than 0 will run the training on a single
GPU. A value of zero will run the training on a single CPU.
GPU. A value of zero will run the training on `num_cpus_per_worker` CPUs.
Fractional values (e.g. 0.5) are currently NOT supported as these might
cause CUDA async errors.
local_gpu_idx: if num_gpus_per_worker > 0, and num_workers<2, then this gpu
index will be used for training. This is an index into the available cuda
devices. For example if os.environ["CUDA_VISIBLE_DEVICES"] = "1" then a
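A hedged usage sketch of the whole-number-GPU rule documented above. The class and field names appear in this file's docstring, but the import path and exact constructor signature are assumptions.

from ray.rllib.core.learner.scaling_config import LearnerGroupScalingConfig  # path assumed

scaling_config = LearnerGroupScalingConfig(
    num_workers=2,           # two remote learner workers
    num_gpus_per_worker=1,   # whole numbers only; 0 => train on num_cpus_per_worker CPUs
    num_cpus_per_worker=1,
    local_gpu_idx=0,
)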
4 changes: 2 additions & 2 deletions rllib/core/learner/tests/test_learner_group.py
@@ -138,8 +138,8 @@ def test_update_multigpu(self):

self.assertLess(min_loss, 0.57)

# make sure the learner_group resources are freed up so that we don't
# autoscale
# Make sure the learner_group resources are freed up so that we don't
# autoscale.
learner_group.shutdown()
del learner_group

8 changes: 4 additions & 4 deletions rllib/core/models/tf/primitives.py
@@ -24,9 +24,9 @@ def __init__(
input_dim: int,
hidden_layer_dims: List[int],
hidden_layer_use_layernorm: bool = False,
hidden_layer_activation: Union[str, Callable] = "relu",
hidden_layer_activation: Optional[Union[str, Callable]] = "relu",
output_dim: Optional[int] = None,
output_activation: Union[str, Callable] = "linear",
output_activation: Optional[Union[str, Callable]] = "linear",
use_bias: bool = True,
):
"""Initialize a TfMLP object.
@@ -112,7 +112,7 @@ def __init__(
input_dims: Union[List[int], Tuple[int]],
cnn_filter_specifiers: List[List[Union[int, List]]],
cnn_use_layernorm: bool = False,
cnn_activation: str = "relu",
cnn_activation: Optional[str] = "relu",
use_bias: bool = True,
):
"""Initializes a TfCNN instance.
@@ -188,7 +188,7 @@ def __init__(
*,
input_dims: Union[List[int], Tuple[int]],
cnn_transpose_filter_specifiers: List[List[Union[int, List]]],
cnn_transpose_activation: str = "relu",
cnn_transpose_activation: Optional[str] = "relu",
cnn_transpose_use_layernorm: bool = False,
use_bias: bool = True,
):
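A hedged construction example for the now-Optional activation arguments. The constructor arguments come from the diff above, while the import path and the exact meaning of None are assumptions (None/"linear" is taken to mean identity outputs).

from ray.rllib.core.models.tf.primitives import TfMLP  # path assumed

mlp = TfMLP(
    input_dim=16,
    hidden_layer_dims=[64, 64],
    hidden_layer_activation="relu",
    output_dim=2,
    output_activation=None,  # assumed equivalent to "linear" (no activation)
)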
5 changes: 1 addition & 4 deletions rllib/policy/eager_tf_policy_v2.py
@@ -98,7 +98,7 @@ def __init__(
self._loss_initialized = False
# Backward compatibility workaround so Policy will call self.loss() directly.
# TODO(jungong): clean up after all policies are migrated to new sub-class
# implementation.
# implementation.
self._loss = None

self.batch_divisibility_req = self.get_batch_divisibility_req()
@@ -864,8 +864,6 @@ def _compute_actions_helper(
dist_inputs = None

elif is_overridden(self.action_sampler_fn):
dist_inputs = None
state_out = []
actions, logp, dist_inputs, state_out = self.action_sampler_fn(
self.model,
input_dict[SampleBatch.OBS],
@@ -875,7 +873,6 @@
)
else:
if is_overridden(self.action_distribution_fn):

# Try new action_distribution_fn signature, supporting
# state_batches and seq_lens.
(
31 changes: 23 additions & 8 deletions rllib/policy/torch_mixins.py
@@ -172,7 +172,7 @@ class TargetNetworkMixin:
- Adds the `update_target` method to the policy.
Calling `update_target` updates all target Q-networks' weights from their
respective "main" Q-metworks, based on tau (smooth, partial updating).
respective "main" Q-networks, based on tau (smooth, partial updating).
"""

def __init__(self):
@@ -184,17 +184,32 @@ def update_target(self, tau=None):
# Update_target_fn will be called periodically to copy Q network to
# target Q network, using (soft) tau-synching.
tau = tau or self.config.get("tau", 1.0)

model_state_dict = self.model.state_dict()

# Support partial (soft) synching.
# If tau == 1.0: Full sync from Q-model to target Q-model.
target_state_dict = next(iter(self.target_models.values())).state_dict()
model_state_dict = {
k: tau * model_state_dict[k] + (1 - tau) * v
for k, v in target_state_dict.items()
}

for target in self.target_models.values():
target.load_state_dict(model_state_dict)
if self.config.get("_enable_rl_module_api", False):
target_current_network_pairs = self.model.get_target_network_pairs()
for target_network, current_network in target_current_network_pairs:
current_state_dict = current_network.state_dict()
new_state_dict = {
k: tau * current_state_dict[k] + (1 - tau) * v
for k, v in target_network.state_dict().items()
}
target_network.load_state_dict(new_state_dict)
else:
# Support partial (soft) synching.
# If tau == 1.0: Full sync from Q-model to target Q-model.
target_state_dict = next(iter(self.target_models.values())).state_dict()
model_state_dict = {
k: tau * model_state_dict[k] + (1 - tau) * v
for k, v in target_state_dict.items()
}

for target in self.target_models.values():
target.load_state_dict(model_state_dict)

@override(TorchPolicy)
def set_weights(self, weights):
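A self-contained hedged toy example (plain torch, not RLlib code) of the tau-soft update both branches above perform: target <- tau * current + (1 - tau) * target.

import torch.nn as nn

current_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
tau = 0.005  # small tau -> slow, smooth tracking; tau=1.0 -> full hard sync

new_state = {
    k: tau * current_net.state_dict()[k] + (1 - tau) * v
    for k, v in target_net.state_dict().items()
}
target_net.load_state_dict(new_state)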
5 changes: 4 additions & 1 deletion rllib/tuned_examples/a2c/atari-a2c.yaml
@@ -11,8 +11,11 @@ atari-a2c:
config:
# Works for both torch and tf.
framework: torch
# Make analogous to old v4 + NoFrameskip.
env_config:
frameskip: 1 # no frameskip
frameskip: 1
full_action_space: false
repeat_action_probability: 0.0
train_batch_size: 500
rollout_fragment_length: auto
clip_rewards: True
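A hedged illustration of what the env_config above emulates on the ALE v5 API: the old "<Game>NoFrameskip-v4" behavior (no frameskip, no sticky actions, minimal action set). The environment id and kwargs are standard ALE/gymnasium options but are listed here as assumptions, since the yaml's env id is not shown in this hunk.

import gymnasium as gym

env = gym.make(
    "ALE/Breakout-v5",              # example game; requires ale-py installed
    frameskip=1,                    # NoFrameskip
    repeat_action_probability=0.0,  # v4-style determinism (no sticky actions)
    full_action_space=False,        # minimal action set, as in v4
)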
(The remaining changed files of this commit are not rendered here.)
