diff --git a/rllib/BUILD b/rllib/BUILD index 221c2362b56cf..b66d0ec983e41 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1066,7 +1066,7 @@ py_test( srcs = ["algorithms/dqn/tests/test_repro_dqn.py"] ) -# Dreamer +# Dreamer (V1) py_test( name = "test_dreamer", tags = ["team:rllib", "algorithms_dir"], @@ -1074,6 +1074,16 @@ py_test( srcs = ["algorithms/dreamer/tests/test_dreamer.py"] ) +# DreamerV3 +# TODO (sven): Enable once the version conflict for gymnasium/supersuit/pettingzoo +# /shimmy/mujoco has been resolved. +#py_test( +# name = "test_dreamerv3", +# tags = ["team:rllib", "algorithms_dir"], +# size = "large", +# srcs = ["algorithms/dreamerv3/tests/test_dreamerv3.py"] +#) + # DT py_test( name = "test_segmentation_buffer", @@ -4345,6 +4355,7 @@ py_test_module_list( files = [ "tests/test_dnc.py", "tests/test_perf.py", + "algorithms/dreamerv3/tests/test_dreamerv3.py", "env/wrappers/tests/test_kaggle_wrapper.py", "examples/env/tests/test_cliff_walking_wall_env.py", "examples/env/tests/test_coin_game_non_vectorized_env.py", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 29de0b01a3be5..7e3c32a4efc51 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -706,7 +706,19 @@ def setup(self, config: AlgorithmConfig) -> None: # the two we need to loop through the policy modules and create a simple # MARLModule from the RLModule within each policy. local_worker = self.workers.local_worker() - module_spec = local_worker.marl_module_spec + policy_dict, _ = self.config.get_multi_agent_setup( + env=local_worker.env, + spaces=getattr(local_worker, "spaces", None), + ) + # TODO (Sven): Unify the inference of the MARLModuleSpec. Right now, + # we get this from the RolloutWorker's `marl_module_spec` property. + # However, this is hacky (information leak) and should not remain this + # way. For other EnvRunner classes (that don't have this property), + # Algorithm should infer this itself. + if hasattr(local_worker, "marl_module_spec"): + module_spec = local_worker.marl_module_spec + else: + module_spec = self.config.get_marl_module_spec(policy_dict=policy_dict) learner_group_config = self.config.get_learner_group_config(module_spec) self.learner_group = learner_group_config.build() @@ -871,7 +883,7 @@ def evaluate( # Sync weights to the evaluation WorkerSet. if self.evaluation_workers is not None: self.evaluation_workers.sync_weights( - from_worker_or_trainer=self.workers.local_worker() + from_worker_or_learner_group=self.workers.local_worker() ) self._sync_filters_if_needed( central_worker=self.workers.local_worker(), @@ -1409,7 +1421,7 @@ def training_step(self) -> ResultDict: if self.config._enable_learner_api: from_worker_or_trainer = self.learner_group self.workers.sync_weights( - from_worker_or_trainer=from_worker_or_trainer, + from_worker_or_learner_group=from_worker_or_trainer, policies=list(train_results.keys()), global_vars=global_vars, ) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index a037f7bb052b3..2510490d48426 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -303,10 +303,8 @@ def __init__(self, algo_class=None): self.normalize_actions = True self.clip_actions = False self.disable_env_checking = False - # Whether this env is an atari env (for atari-specific preprocessing). - # If not specified, we will try to auto-detect this. 
- self.is_atari = None self.auto_wrap_old_gym_envs = True + self._is_atari = None # `self.rollouts()` self.env_runner_cls = None @@ -718,31 +716,6 @@ def freeze(self) -> None: # of themselves? This way, users won't even be able to alter those values # directly anymore. - def _detect_atari_env(self) -> bool: - """Returns whether this configured env is an Atari env or not. - - Returns: - True, if specified env is an Atari env, False otherwise. - """ - # Atari envs are usually specified via a string like "PongNoFrameskip-v4" - # or "ALE/Breakout-v5". - # We do NOT attempt to auto-detect Atari env for other specified types like - # a callable, to avoid running heavy logics in validate(). - # For these cases, users can explicitly set `environment(atari=True)`. - if not type(self.env) == str: - return False - - try: - if self.env.startswith("ALE/"): - env = gym.make("GymV26Environment-v0", env_id=self.env) - else: - env = gym.make(self.env) - except gym.error.NameNotFound: - # Not an Atari env if this is not a gym env. - return False - - return is_atari(env) - @OverrideToImplementCustomLogic_CallToSuperRecommended def validate(self) -> None: """Validates all values in this config.""" @@ -988,10 +961,6 @@ def validate(self) -> None: f"config.framework({self.framework_str})!" ) - # Detect if specified env is an Atari env. - if self.is_atari is None: - self.is_atari = self._detect_atari_env() - if self.input_ == "sampler" and self.off_policy_estimation_methods: raise ValueError( "Off-policy estimation methods can only be used if the input is a " @@ -1368,7 +1337,7 @@ def environment( disable_env_checking: If True, disable the environment pre-checking module. is_atari: This config can be used to explicitly specify whether the env is an Atari env or not. If not specified, RLlib will try to auto-detect - this during config validation. + this. auto_wrap_old_gym_envs: Whether to auto-wrap old gym environments (using the pre 0.24 gym APIs, e.g. reset() returning single obs and no info dict). If True, RLlib will automatically wrap the given gym env class @@ -1405,7 +1374,7 @@ def environment( if disable_env_checking is not NotProvided: self.disable_env_checking = disable_env_checking if is_atari is not NotProvided: - self.is_atari = is_atari + self._is_atari = is_atari if auto_wrap_old_gym_envs is not NotProvided: self.auto_wrap_old_gym_envs = auto_wrap_old_gym_envs @@ -2319,6 +2288,8 @@ def reporting( In case there are more than this many episodes collected in a single training iteration, use all of these episodes for metrics computation, meaning don't ever cut any "excess" episodes. + Set this to 1 to disable smoothing and to always report only the most + recently collected episode's return. min_time_s_per_iteration: Minimum time to accumulate within a single `train()` call. This value does not affect learning, only the number of times `Algorithm.training_step()` is called by @@ -2645,6 +2616,34 @@ def learner_class(self) -> Type["Learner"]: """ return self._learner_class or self.get_default_learner_class() + @property + def is_atari(self) -> bool: + """True if if specified env is an Atari env.""" + + # Not yet determined, try to figure this out. + if self._is_atari is None: + # Atari envs are usually specified via a string like "PongNoFrameskip-v4" + # or "ALE/Breakout-v5". + # We do NOT attempt to auto-detect Atari env for other specified types like + # a callable, to avoid running heavy logics in validate(). + # For these cases, users can explicitly set `environment(atari=True)`. 
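            # A minimal usage sketch of this property (illustrative; the "ALE/..."
            # branch assumes ale-py/shimmy are installed):
            #   config = AlgorithmConfig().environment(env="ALE/Pong-v5")
            #   config.is_atari  # -> True (lazily auto-detected via gym.make())
            #   config.environment(env="CartPole-v1", is_atari=False)
            #   config.is_atari  # -> False (explicitly set; no detection attempted)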
+ if not type(self.env) == str: + return False + try: + if self.env.startswith("ALE/"): + env = gym.make("GymV26Environment-v0", env_id=self.env) + else: + env = gym.make(self.env) + # Any gymnasium error -> Cannot be an Atari env. + except gym.error.Error: + return False + + self._is_atari = is_atari(env) + # Clean up env's resources, if any. + env.close() + + return self._is_atari + # TODO: Make rollout_fragment_length as read-only property and replace the current # self.rollout_fragment_length a private variable. def get_rollout_fragment_length(self, worker_index: int = 0) -> int: diff --git a/rllib/algorithms/dreamerv3/README.md b/rllib/algorithms/dreamerv3/README.md new file mode 100644 index 0000000000000..8c64b960b7b73 --- /dev/null +++ b/rllib/algorithms/dreamerv3/README.md @@ -0,0 +1,27 @@ +# DreamerV3 +Implementation (TensorFlow/Keras) of the "DreamerV3" model-based reinforcement learning +(RL) algorithm by D. Hafner et al. 2023 + +DreamerV3 train a world model in supervised fashion using real environment +interactions. The world model utilizes a recurrent GRU-based architecture +("recurrent state space model" or RSSM) and uses it to predicts rewards, +episode continuation flags, as well as, observations. +With these predictions (dreams) made by the world model, both actor +and critic are trained in classic REINFORCE fashion. In other words, the +actual RL components of the model are never trained on actual environment data, +but on dreamed trajectories only. + +For more algorithm details, see: + +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +.. and the "DreamerV2" paper: + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf + +## Results +TODO diff --git a/rllib/algorithms/dreamerv3/__init__.py b/rllib/algorithms/dreamerv3/__init__.py new file mode 100644 index 0000000000000..d4b2adb0d57ed --- /dev/null +++ b/rllib/algorithms/dreamerv3/__init__.py @@ -0,0 +1,15 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3, DreamerV3Config + +__all__ = [ + "DreamerV3", + "DreamerV3Config", +] diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py new file mode 100644 index 0000000000000..515f6e3a22a29 --- /dev/null +++ b/rllib/algorithms/dreamerv3/dreamerv3.py @@ -0,0 +1,660 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +import dataclasses +import gc +import logging +import tree # pip install dm_tree +from typing import Any, Dict, List, Optional + +import gymnasium as gym +import numpy as np + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.algorithms.dreamerv3.dreamerv3_catalog import DreamerV3Catalog +from ray.rllib.algorithms.dreamerv3.dreamerv3_learner import ( + DreamerV3LearnerHyperparameters, +) +from ray.rllib.algorithms.dreamerv3.utils import do_symlog_obs +from ray.rllib.algorithms.dreamerv3.utils.env_runner import DreamerV3EnvRunner +from ray.rllib.algorithms.dreamerv3.utils.summaries import ( + report_predicted_vs_sampled_obs, + report_sampling_and_replay_buffer, +) +from ray.rllib.core.learner.learner import LearnerHyperparameters +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch +from ray.rllib.utils import deep_update +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.numpy import one_hot +from ray.rllib.utils.metrics import ( + ALL_MODULES, + GARBAGE_COLLECTION_TIMER, + LEARN_ON_BATCH_TIMER, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_TRAINED, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_TRAINED, + NUM_GRAD_UPDATES_LIFETIME, + NUM_SYNCH_WORKER_WEIGHTS, + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, + SAMPLE_TIMER, + SYNCH_WORKER_WEIGHTS_TIMER, +) +from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer +from ray.rllib.utils.typing import ResultDict + + +logger = logging.getLogger(__name__) + +_, tf, _ = try_import_tf() + + +class DreamerV3Config(AlgorithmConfig): + """Defines a configuration class from which a DreamerV3 can be built. + + Example: + >>> from ray.rllib.algorithms.dreamerv3 import DreamerV3Config + >>> config = DreamerV3Config() + >>> config = config.training( # doctest: +SKIP + ... batch_size_B=8, model_size="M" + ... ) + >>> config = config.resources(num_learner_workers=4) # doctest: +SKIP + >>> print(config.to_dict()) # doctest: +SKIP + >>> # Build a Algorithm object from the config and run 1 training iteration. + >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP + >>> algo.train() # doctest: +SKIP + + Example: + >>> from ray.rllib.algorithms.dreamerv3 import DreamerV3Config + >>> from ray import air + >>> from ray import tune + >>> config = DreamerV3Config() + >>> # Print out some default values. + >>> print(config.training_ratio) # doctest: +SKIP + >>> # Update the config object. + >>> config = config.training( # doctest: +SKIP + ... training_ratio=tune.grid_search([256, 512, 1024]) + ... ) + >>> # Set the config object's env. + >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP + >>> # Use to_dict() to get the old-style python config dict + >>> # when running with tune. + >>> tune.Tuner( # doctest: +SKIP + ... "DreamerV3", + ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), + ... param_space=config.to_dict(), + ... 
).fit() + """ + + def __init__(self, algo_class=None): + """Initializes a DreamerV3Config instance.""" + super().__init__(algo_class=algo_class or DreamerV3) + + # fmt: off + # __sphinx_doc_begin__ + + # DreamerV3 specific settings: + self.model_size = "XS" + self.training_ratio = 1024 + + self.replay_buffer_config = { + "type": "EpisodeReplayBuffer", + "capacity": int(1e6), + } + self.world_model_lr = 1e-4 + self.actor_lr = 3e-5 + self.critic_lr = 3e-5 + self.batch_size_B = 16 + self.batch_length_T = 64 + self.horizon_H = 15 + self.gae_lambda = 0.95 # [1] eq. 7. + self.entropy_scale = 3e-4 # [1] eq. 11. + self.return_normalization_decay = 0.99 # [1] eq. 11 and 12. + self.train_critic = True + self.train_actor = True + self.intrinsic_rewards_scale = 0.1 + self.world_model_grad_clip_by_global_norm = 1000.0 + self.critic_grad_clip_by_global_norm = 100.0 + self.actor_grad_clip_by_global_norm = 100.0 + + # Reporting. + # DreamerV3 is super sample efficient and only needs very few episodes + # (normally) to learn. Leaving this at its default value would gravely + # underestimate the learning performance over the course of an experiment. + self.metrics_num_episodes_for_smoothing = 1 + self.report_individual_batch_item_stats = False + self.report_dream_data = False + self.report_images_and_videos = False + self.gc_frequency_train_steps = 100 + + # Override some of AlgorithmConfig's default values with DreamerV3-specific + # values. + self.lr = None + self.framework_str = "tf2" + self.gamma = 0.997 # [1] eq. 7. + # Do not use! Set `batch_size_B` and `batch_length_T` instead. + self.train_batch_size = None + self.env_runner_cls = DreamerV3EnvRunner + self.num_rollout_workers = 0 + self.rollout_fragment_length = 1 + # Since we are using a gymnasium-based EnvRunner, we can utilitze its + # vectorization capabilities w/o suffering performance losses (as we would + # with RLlib's `RemoteVectorEnv`). + self.remote_worker_envs = True + # Dreamer only runs on the new API stack. + self._enable_learner_api = True + self._enable_rl_module_api = True + # __sphinx_doc_end__ + # fmt: on + + @override(AlgorithmConfig) + def training( + self, + *, + model_size: Optional[str] = NotProvided, + training_ratio: Optional[float] = NotProvided, + gc_frequency_train_steps: Optional[int] = NotProvided, + batch_size_B: Optional[int] = NotProvided, + batch_length_T: Optional[int] = NotProvided, + horizon_H: Optional[int] = NotProvided, + gae_lambda: Optional[float] = NotProvided, + entropy_scale: Optional[float] = NotProvided, + return_normalization_decay: Optional[float] = NotProvided, + train_critic: Optional[bool] = NotProvided, + train_actor: Optional[bool] = NotProvided, + intrinsic_rewards_scale: Optional[float] = NotProvided, + world_model_grad_clip_by_global_norm: Optional[float] = NotProvided, + critic_grad_clip_by_global_norm: Optional[float] = NotProvided, + actor_grad_clip_by_global_norm: Optional[float] = NotProvided, + replay_buffer_config: Optional[dict] = NotProvided, + **kwargs, + ) -> "DreamerV3Config": + """Sets the training related configuration. + + Args: + model_size: The main switch for adjusting the overall model size. See [1] + (table B) for more information on the effects of this setting on the + model architecture. + Supported values are "XS", "S", "M", "L", "XL" (as per the paper), as + well as, "nano", "micro", "mini", and "XXS" (for RLlib's + implementation). See ray.rllib.algorithms.dreamerv3.utils. 
+ __init__.py for the details on what exactly each size does to the layer + sizes, number of layers, etc.. + training_ratio: The ratio of total steps trained (sum of the sizes of all + batches ever sampled from the replay buffer) over the total env steps + taken (in the actual environment, not the dreamed one). For example, + if the training_ratio is 1024 and the batch size is 1024, we would take + 1 env step for every training update: 1024 / 1. If the training ratio + is 512 and the batch size is 1024, we would take 2 env steps and then + perform a single training update (on a 1024 batch): 1024 / 2. + gc_frequency_train_steps: The frequency (in training iterations) with which + we perform a `gc.collect()` calls at the end of a `training_step` + iteration. Doing this more often adds a (albeit very small) performance + overhead, but prevents memory leaks from becoming harmful. + TODO (sven): This might not be necessary anymore, but needs to be + confirmed experimentally. + batch_size_B: The batch size (B) interpreted as number of rows (each of + length `batch_length_T`) to sample from the replay buffer in each + iteration. + batch_length_T: The batch length (T) interpreted as the length of each row + sampled from the replay buffer in each iteration. Note that + `batch_size_B` rows will be sampled in each iteration. Rows normally + contain consecutive data (consecutive timesteps from the same episode), + but there might be episode boundaries in a row as well. + horizon_H: The horizon (in timesteps) used to create dreamed data from the + world model, which in turn is used to train/update both actor- and + critic networks. + gae_lambda: The lambda parameter used for computing the GAE-style + value targets for the actor- and critic losses. + entropy_scale: The factor with which to multiply the entropy loss term + inside the actor loss. + return_normalization_decay: The decay value to use when computing the + running EMA values for return normalization (used in the actor loss). + train_critic: Whether to train the critic network. If False, `train_actor` + must also be False (cannot train actor w/o training the critic). + train_actor: Whether to train the actor network. If True, `train_critic` + must also be True (cannot train actor w/o training the critic). + intrinsic_rewards_scale: The factor to multiply intrinsic rewards with + before adding them to the extrinsic (environment) rewards. + world_model_grad_clip_by_global_norm: World model grad clipping value + (by global norm). + critic_grad_clip_by_global_norm: Critic grad clipping value + (by global norm). + actor_grad_clip_by_global_norm: Actor grad clipping value (by global norm). + replay_buffer_config: Replay buffer config. + Only serves in DreamerV3 to set the capacity of the replay buffer. + Note though that in the paper ([1]) a size of 1M is used for all + benchmarks and there doesn't seem to be a good reason to change this + parameter. + Examples: + { + "type": "EpisodeReplayBuffer", + "capacity": 100000, + } + + Returns: + This updated AlgorithmConfig object. + """ + # Pass kwargs onto super's `training()` method. 
+ super().training(**kwargs) + + if model_size is not NotProvided: + self.model_size = model_size + if training_ratio is not NotProvided: + self.training_ratio = training_ratio + if gc_frequency_train_steps is not NotProvided: + self.gc_frequency_train_steps = gc_frequency_train_steps + if batch_size_B is not NotProvided: + self.batch_size_B = batch_size_B + if batch_length_T is not NotProvided: + self.batch_length_T = batch_length_T + if horizon_H is not NotProvided: + self.horizon_H = horizon_H + if gae_lambda is not NotProvided: + self.gae_lambda = gae_lambda + if entropy_scale is not NotProvided: + self.entropy_scale = entropy_scale + if return_normalization_decay is not NotProvided: + self.return_normalization_decay = return_normalization_decay + if train_critic is not NotProvided: + self.train_critic = train_critic + if train_actor is not NotProvided: + self.train_actor = train_actor + if intrinsic_rewards_scale is not NotProvided: + self.intrinsic_rewards_scale = intrinsic_rewards_scale + if world_model_grad_clip_by_global_norm is not NotProvided: + self.world_model_grad_clip_by_global_norm = ( + world_model_grad_clip_by_global_norm + ) + if critic_grad_clip_by_global_norm is not NotProvided: + self.critic_grad_clip_by_global_norm = critic_grad_clip_by_global_norm + if actor_grad_clip_by_global_norm is not NotProvided: + self.actor_grad_clip_by_global_norm = actor_grad_clip_by_global_norm + if replay_buffer_config is not NotProvided: + # Override entire `replay_buffer_config` if `type` key changes. + # Update, if `type` key remains the same or is not specified. + new_replay_buffer_config = deep_update( + {"replay_buffer_config": self.replay_buffer_config}, + {"replay_buffer_config": replay_buffer_config}, + False, + ["replay_buffer_config"], + ["replay_buffer_config"], + ) + self.replay_buffer_config = new_replay_buffer_config["replay_buffer_config"] + + return self + + @override(AlgorithmConfig) + def reporting( + self, + *, + report_individual_batch_item_stats: Optional[bool] = NotProvided, + report_dream_data: Optional[bool] = NotProvided, + report_images_and_videos: Optional[bool] = NotProvided, + **kwargs, + ): + """Sets the reporting related configuration. + + Args: + report_individual_batch_item_stats: Whether to include loss and other stats + per individual timestep inside the training batch in the result dict + returned by `training_step()`. If True, besides the `CRITIC_L_total`, + the individual critic loss values per batch row and time axis step + in the train batch (CRITIC_L_total_B_T) will also be part of the + results. + report_dream_data: Whether to include the dreamed trajectory data in the + result dict returned by `training_step()`. If True, however, will + slice each reported item in the dream data down to the shape. + (H, B, t=0, ...), where H is the horizon and B is the batch size. The + original time axis will only be represented by the first timestep + to not make this data too large to handle. + report_images_and_videos: Whether to include any image/video data in the + result dict returned by `training_step()`. + **kwargs: + + Returns: + This updated AlgorithmConfig object. 
+ """ + super().reporting(**kwargs) + + if report_individual_batch_item_stats is not NotProvided: + self.report_individual_batch_item_stats = report_individual_batch_item_stats + if report_dream_data is not NotProvided: + self.report_dream_data = report_dream_data + if report_images_and_videos is not NotProvided: + self.report_images_and_videos = report_images_and_videos + + return self + + @override(AlgorithmConfig) + def validate(self) -> None: + # Call the super class' validation method first. + super().validate() + + # Make sure, users are not using DreamerV3 yet for multi-agent: + if self.is_multi_agent(): + raise ValueError("DreamerV3 does NOT support multi-agent setups yet!") + + # Make sure, we are configure for the new API stack. + if not (self._enable_learner_api and self._enable_rl_module_api): + raise ValueError( + "DreamerV3 must be run with `config._enable_learner_api`=True AND " + "with `config._enable_rl_module_api`=True!" + ) + + # If run on several Learners, the provided batch_size_B must be a multiple + # of `num_learner_workers`. + if self.num_learner_workers > 1 and ( + self.batch_size_B % self.num_learner_workers != 0 + ): + raise ValueError( + f"Your `batch_size_B` ({self.batch_size_B}) must be a multiple of " + f"`num_learner_workers` ({self.num_learner_workers}) in order for " + "DreamerV3 to be able to split batches evenly across your Learner " + "processes." + ) + + # Cannot train actor w/o critic. + if self.train_actor and not self.train_critic: + raise ValueError( + "Cannot train actor network (`train_actor=True`) w/o training critic! " + "Make sure you either set `train_critic=True` or `train_actor=False`." + ) + # Use DreamerV3 specific batch size settings. + if self.train_batch_size is not None: + raise ValueError( + "`train_batch_size` should NOT be set! Use `batch_size_B` and " + "`batch_length_T` instead." + ) + # Must be run with `EpisodeReplayBuffer` type. + if self.replay_buffer_config.get("type") != "EpisodeReplayBuffer": + raise ValueError( + "DreamerV3 must be run with the `EpisodeReplayBuffer` type! None " + "other supported." 
+ ) + + @override(AlgorithmConfig) + def get_learner_hyperparameters(self) -> LearnerHyperparameters: + base_hps = super().get_learner_hyperparameters() + return DreamerV3LearnerHyperparameters( + model_size=self.model_size, + training_ratio=self.training_ratio, + batch_size_B=self.batch_size_B // (self.num_learner_workers or 1), + batch_length_T=self.batch_length_T, + horizon_H=self.horizon_H, + gamma=self.gamma, + gae_lambda=self.gae_lambda, + entropy_scale=self.entropy_scale, + return_normalization_decay=self.return_normalization_decay, + train_actor=self.train_actor, + train_critic=self.train_critic, + world_model_lr=self.world_model_lr, + intrinsic_rewards_scale=self.intrinsic_rewards_scale, + actor_lr=self.actor_lr, + critic_lr=self.critic_lr, + world_model_grad_clip_by_global_norm=( + self.world_model_grad_clip_by_global_norm + ), + actor_grad_clip_by_global_norm=self.actor_grad_clip_by_global_norm, + critic_grad_clip_by_global_norm=self.critic_grad_clip_by_global_norm, + report_individual_batch_item_stats=( + self.report_individual_batch_item_stats + ), + report_dream_data=self.report_dream_data, + report_images_and_videos=self.report_images_and_videos, + **dataclasses.asdict(base_hps), + ) + + @override(AlgorithmConfig) + def get_default_learner_class(self): + if self.framework_str == "tf2": + from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_learner import ( + DreamerV3TfLearner, + ) + + return DreamerV3TfLearner + else: + raise ValueError(f"The framework {self.framework_str} is not supported.") + + @override(AlgorithmConfig) + def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: + if self.framework_str == "tf2": + from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_rl_module import ( + DreamerV3TfRLModule, + ) + + return SingleAgentRLModuleSpec( + module_class=DreamerV3TfRLModule, catalog_class=DreamerV3Catalog + ) + else: + raise ValueError(f"The framework {self.framework_str} is not supported.") + + @property + def share_module_between_env_runner_and_learner(self) -> bool: + # If we only have one local Learner (num_learner_workers=0) and only + # one local EnvRunner (num_rollout_workers=0), share the RLModule + # between these two to avoid having to sync weights, ever. + return self.num_learner_workers == 0 and self.num_rollout_workers == 0 + + +class DreamerV3(Algorithm): + """Implementation of the model-based DreamerV3 RL algorithm described in [1].""" + + @classmethod + @override(Algorithm) + def get_default_config(cls) -> AlgorithmConfig: + return DreamerV3Config() + + @override(Algorithm) + def setup(self, config: AlgorithmConfig): + super().setup(config) + + # Share RLModule between EnvRunner and single (local) Learner instance. + # To avoid possibly expensive weight synching step. + if self.config.share_module_between_env_runner_and_learner: + assert self.workers.local_worker().module is None + self.workers.local_worker().module = self.learner_group._learner.module[ + DEFAULT_POLICY_ID + ] + + # Summarize (single-agent) RLModule (only once) here. + if self.config.framework_str == "tf2": + self.workers.local_worker().module.dreamer_model.summary(expand_nested=True) + + # Create a replay buffer for storing actual env samples. 
+ self.replay_buffer = EpisodeReplayBuffer( + capacity=self.config.replay_buffer_config["capacity"], + batch_size_B=self.config.batch_size_B, + batch_length_T=self.config.batch_length_T, + ) + + @override(Algorithm) + def training_step(self) -> ResultDict: + results = {} + + env_runner = self.workers.local_worker() + + # Push enough samples into buffer initially before we start training. + if self.training_iteration == 0: + logger.info( + "Filling replay buffer so it contains at least " + f"{self.config.batch_size_B * self.config.batch_length_T} timesteps " + "(required for a single train batch)." + ) + + # Have we sampled yet in this `training_step()` call? + have_sampled = False + with self._timers[SAMPLE_TIMER]: + # Continue sampling from the actual environment (and add collected samples + # to our replay buffer) as long as we: + while ( + # a) Don't have at least batch_size_B x batch_length_T timesteps stored + # in the buffer. This is the minimum needed to train. + self.replay_buffer.get_num_timesteps() + < (self.config.batch_size_B * self.config.batch_length_T) + # b) The computed `training_ratio` is >= the configured (desired) + # training ratio (meaning we should continue sampling). + or self.training_ratio >= self.config.training_ratio + # c) we have not sampled at all yet in this `training_step()` call. + or not have_sampled + ): + done_episodes, ongoing_episodes = env_runner.sample() + have_sampled = True + + # We took B x T env steps. + env_steps_last_sample = sum( + len(eps) for eps in done_episodes + ongoing_episodes + ) + self._counters[NUM_AGENT_STEPS_SAMPLED] += env_steps_last_sample + self._counters[NUM_ENV_STEPS_SAMPLED] += env_steps_last_sample + + # Add ongoing and finished episodes into buffer. The buffer will + # automatically take care of properly concatenating (by episode IDs) + # the different chunks of the same episodes, even if they come in via + # separate `add()` calls. + self.replay_buffer.add(episodes=done_episodes + ongoing_episodes) + + # Summarize environment interaction and buffer data. + results[ALL_MODULES] = report_sampling_and_replay_buffer( + replay_buffer=self.replay_buffer, + ) + + # Continue sampling batch_size_B x batch_length_T sized batches from the buffer + # and using these to update our models (`LearnerGroup.update()`) until the + # computed `training_ratio` is larger than the configured one, meaning we should + # go back and collect more samples again from the actual environment. + # However, when calculating the `training_ratio` here, we use only the + # trained steps in this very `training_step()` call over the most recent sample + # amount (`env_steps_last_sample`), not the global values. This is to avoid a + # heavy overtraining at the very beginning when we have just pre-filled the + # buffer with the minimum amount of samples. + replayed_steps_this_iter = sub_iter = 0 + while ( + replayed_steps_this_iter / env_steps_last_sample + ) < self.config.training_ratio: + + # Time individual batch updates. + with self._timers[LEARN_ON_BATCH_TIMER]: + logger.info(f"\tSub-iteration {self.training_iteration}/{sub_iter})") + + # Draw a new sample from the replay buffer. + sample = self.replay_buffer.sample( + batch_size_B=self.config.batch_size_B, + batch_length_T=self.config.batch_length_T, + ) + replayed_steps = self.config.batch_size_B * self.config.batch_length_T + replayed_steps_this_iter += replayed_steps + + # Convert some bool columns to float32 and one-hot actions. 
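                # Illustration of the one-hot step below (for a Discrete(n) action
                # space, int actions of shape (B, T) become float vectors of shape
                # (B, T, n)); e.g. with n=4:
                #   one_hot(np.array([[2, 0]]), depth=4)
                #   # -> [[[0., 0., 1., 0.], [1., 0., 0., 0.]]]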
+ sample["is_first"] = sample["is_first"].astype(np.float32) + sample["is_last"] = sample["is_last"].astype(np.float32) + sample["is_terminated"] = sample["is_terminated"].astype(np.float32) + if isinstance(env_runner.env.single_action_space, gym.spaces.Discrete): + sample["actions_ints"] = sample[SampleBatch.ACTIONS] + sample[SampleBatch.ACTIONS] = one_hot( + sample["actions_ints"], + depth=env_runner.env.single_action_space.n, + ) + + # Perform the actual update via our learner group. + train_results = self.learner_group.update( + SampleBatch(sample).as_multi_agent(), + reduce_fn=self._reduce_results, + ) + self._counters[NUM_AGENT_STEPS_TRAINED] += replayed_steps + self._counters[NUM_ENV_STEPS_TRAINED] += replayed_steps + + # Perform additional (non-gradient updates), such as the critic EMA-copy + # update. + with self._timers["critic_ema_update"]: + self.learner_group.additional_update( + timestep=self._counters[NUM_ENV_STEPS_TRAINED], + reduce_fn=self._reduce_results, + ) + + if self.config.report_images_and_videos: + report_predicted_vs_sampled_obs( + # TODO (sven): DreamerV3 is single-agent only. + results=train_results[DEFAULT_POLICY_ID], + sample=sample, + batch_size_B=self.config.batch_size_B, + batch_length_T=self.config.batch_length_T, + symlog_obs=do_symlog_obs( + env_runner.env.single_observation_space, + self.config.model.get("symlog_obs", "auto"), + ), + ) + + res = train_results[DEFAULT_POLICY_ID] + logger.info( + f"\t\tWORLD_MODEL_L_total={res['WORLD_MODEL_L_total']:.5f} (" + f"L_pred={res['WORLD_MODEL_L_prediction']:.5f} (" + f"decoder/obs={res['WORLD_MODEL_L_decoder']} " + f"L_rew={res['WORLD_MODEL_L_reward']} " + f"L_cont={res['WORLD_MODEL_L_continue']}); " + f"L_dyn/rep={res['WORLD_MODEL_L_dynamics']:.5f})" + ) + msg = "\t\t" + if self.config.train_actor: + msg += f"L_actor={res['ACTOR_L_total']:.5f} " + if self.config.train_critic: + msg += f"L_critic={res['CRITIC_L_total']:.5f} " + logger.info(msg) + + sub_iter += 1 + self._counters[NUM_GRAD_UPDATES_LIFETIME] += 1 + + # Update weights - after learning on the LearnerGroup - on all EnvRunner + # workers. + with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: + # Only necessary if RLModule is not shared between (local) EnvRunner and + # (local) Learner. + if not self.config.share_module_between_env_runner_and_learner: + self._counters[ + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS + ] = 0 + self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 + self.workers.sync_weights( + from_worker_or_learner_group=self.learner_group + ) + + # Try trick from https://medium.com/dive-into-ml-ai/dealing-with-memory-leak- + # issue-in-keras-model-training-e703907a6501 + if self.config.gc_frequency_train_steps and ( + self.training_iteration % self.config.gc_frequency_train_steps == 0 + ): + with self._timers[GARBAGE_COLLECTION_TIMER]: + gc.collect() + + # Add train results and the actual training ratio to stats. The latter should + # be close to the configured `training_ratio`. + results.update(train_results) + results[ALL_MODULES]["actual_training_ratio"] = self.training_ratio + + # Return all results. + return results + + @property + def training_ratio(self) -> float: + """Returns the actual training ratio of this Algorithm. + + The training ratio is copmuted by dividing the total number of steps + trained thus far (replayed from the buffer) over the total number of actual + env steps taken thus far. 
+ """ + return self._counters[NUM_ENV_STEPS_TRAINED] / ( + self._counters[NUM_ENV_STEPS_SAMPLED] + ) + + @staticmethod + def _reduce_results(results: List[Dict[str, Any]]): + return tree.map_structure(lambda *s: np.mean(s, axis=0), *results) diff --git a/rllib/algorithms/dreamerv3/dreamerv3_catalog.py b/rllib/algorithms/dreamerv3/dreamerv3_catalog.py new file mode 100644 index 0000000000000..50568fe1875ab --- /dev/null +++ b/rllib/algorithms/dreamerv3/dreamerv3_catalog.py @@ -0,0 +1,80 @@ +import gymnasium as gym + +from ray.rllib.core.models.catalog import Catalog +from ray.rllib.core.models.base import Encoder, Model +from ray.rllib.utils import override + + +class DreamerV3Catalog(Catalog): + """The Catalog class used to build all the models needed for DreamerV3 training.""" + + def __init__( + self, + observation_space: gym.Space, + action_space: gym.Space, + model_config_dict: dict, + ): + """Initializes a DreamerV3Catalog instance. + + Args: + observation_space: The observation space of the environment. + action_space: The action space of the environment. + model_config_dict: The model config to use. + """ + super().__init__( + observation_space=observation_space, + action_space=action_space, + model_config_dict=model_config_dict, + ) + + self.model_size = self.model_config_dict["model_size"] + self.is_img_space = len(self.observation_space.shape) in [2, 3] + self.is_gray_scale = ( + self.is_img_space and len(self.observation_space.shape) == 2 + ) + + # TODO (sven): We should work with sub-component configurations here, + # and even try replacing all current Dreamer model components with + # our default primitives. But for now, we'll construct the DreamerV3Model + # directly in our `build_...()` methods. + + @override(Catalog) + def build_encoder(self, framework: str) -> Encoder: + """Builds the World-Model's encoder network depending on the obs space.""" + if framework != "tf2": + raise NotImplementedError + + if self.is_img_space: + from ray.rllib.algorithms.dreamerv3.tf.models.components.cnn_atari import ( + CNNAtari, + ) + + return CNNAtari(model_size=self.model_size) + else: + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP + + return MLP(model_size=self.model_size, name="vector_encoder") + + def build_decoder(self, framework: str) -> Model: + """Builds the World-Model's decoder network depending on the obs space.""" + if framework != "tf2": + raise NotImplementedError + + if self.is_img_space: + from ray.rllib.algorithms.dreamerv3.tf.models.components import ( + conv_transpose_atari, + ) + + return conv_transpose_atari.ConvTransposeAtari( + model_size=self.model_size, + gray_scaled=self.is_gray_scale, + ) + else: + from ray.rllib.algorithms.dreamerv3.tf.models.components import ( + vector_decoder, + ) + + return vector_decoder.VectorDecoder( + model_size=self.model_size, + observation_space=self.observation_space, + ) diff --git a/rllib/algorithms/dreamerv3/dreamerv3_learner.py b/rllib/algorithms/dreamerv3/dreamerv3_learner.py index c35d1743c8b1a..32c08d0a671f4 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_learner.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_learner.py @@ -8,11 +8,13 @@ https://arxiv.org/pdf/2010.02193.pdf """ from dataclasses import dataclass -from typing import Any, Dict +from typing import Any, DefaultDict, Dict from ray.rllib.core.learner.learner import Learner, LearnerHyperparameters from ray.rllib.core.rl_module.rl_module import ModuleID +from ray.rllib.policy.sample_batch import MultiAgentBatch from 
ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import TensorType @dataclass @@ -25,7 +27,7 @@ class to configure your algorithm. more details on the individual properties. """ - model_dimension: str = None + model_size: str = None training_ratio: float = None batch_size_B: int = None batch_length_T: int = None @@ -44,6 +46,10 @@ class to configure your algorithm. world_model_grad_clip_by_global_norm: float = None actor_grad_clip_by_global_norm: float = None critic_grad_clip_by_global_norm: float = None + # Reporting settings. + report_individual_batch_item_stats: bool = None + report_dream_data: bool = None + report_images_and_videos: bool = None class DreamerV3Learner(Learner): @@ -53,6 +59,31 @@ class DreamerV3Learner(Learner): for updating the critic EMA-copy after each training step. """ + @override(Learner) + def compile_results( + self, + *, + batch: MultiAgentBatch, + fwd_out: Dict[str, Any], + loss_per_module: Dict[str, TensorType], + metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]], + ) -> Dict[str, Any]: + results = super().compile_results( + batch=batch, + fwd_out=fwd_out, + loss_per_module=loss_per_module, + metrics_per_module=metrics_per_module, + ) + + # Add the predicted obs distributions for possible (video) summarization. + if self.hps.report_images_and_videos: + for module_id, res in results.items(): + if module_id in fwd_out: + res["WORLD_MODEL_fwd_out_obs_distribution_means_BxT"] = fwd_out[ + module_id + ]["obs_distribution_means_BxT"] + return results + @override(Learner) def additional_update_for_module( self, diff --git a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py index 021fbb8646389..f1a112e7017d1 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py @@ -14,6 +14,7 @@ from ray.rllib.core.models.base import STATE_IN, STATE_OUT from ray.rllib.core.models.specs.specs_dict import SpecDict from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.policy.eager_tf_policy import _convert_to_tf from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import ExperimentalAPI, override from ray.rllib.utils.nested_dict import NestedDict @@ -33,7 +34,7 @@ def setup(self): self.config.observation_space, self.config.model_config_dict.get("symlog_obs", "auto"), ) - model_dimension = self.config.model_config_dict["model_dimension"] + model_size = self.config.model_config_dict["model_size"] # Build encoder and decoder from catalog. catalog = self.config.get_catalog() @@ -42,40 +43,34 @@ def setup(self): # Build the world model (containing encoder and decoder). self.world_model = WorldModel( - model_dimension=model_dimension, + model_size=model_size, action_space=self.config.action_space, batch_length_T=T, - # num_gru_units=self.model_config.num_gru_units, encoder=self.encoder, decoder=self.decoder, symlog_obs=symlog_obs, ) self.actor = ActorNetwork( action_space=self.config.action_space, - model_dimension=model_dimension, + model_size=model_size, ) self.critic = CriticNetwork( - model_dimension=model_dimension, + model_size=model_size, ) # Build the final dreamer model (containing the world model). 
self.dreamer_model = DreamerModel( - model_dimension=self.config.model_config_dict["model_dimension"], + model_size=self.config.model_config_dict["model_size"], action_space=self.config.action_space, world_model=self.world_model, actor=self.actor, critic=self.critic, - # use_curiosity=use_curiosity, - # intrinsic_rewards_scale=intrinsic_rewards_scale, - batch_size_B=self.config.model_config_dict["batch_size_B"], - batch_length_T=T, - horizon_H=horizon_H, ) self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) # Perform a test `call()` to force building the dreamer model's variables. test_obs = np.tile( np.expand_dims(self.config.observation_space.sample(), (0, 1)), - reps=(B, T, 1), + reps=(B, T) + (1,) * len(self.config.observation_space.shape), ) test_actions = np.tile( np.expand_dims( @@ -87,15 +82,13 @@ def setup(self): reps=(B, T, 1), ) self.dreamer_model( - inputs=test_obs, - actions=test_actions.astype(np.float32), - is_first=np.ones((B, T), np.float32), - start_is_terminated_BxT=np.zeros((B * T,), np.float32), + inputs=_convert_to_tf(test_obs), + actions=_convert_to_tf(test_actions.astype(np.float32)), + is_first=_convert_to_tf(np.ones((B, T), np.float32)), + start_is_terminated_BxT=_convert_to_tf(np.zeros((B * T,), np.float32)), horizon_H=horizon_H, gamma=gamma, ) - # This should work now. - self.dreamer_model.summary(expand_nested=True) # Initialize the critic EMA net: self.critic.init_ema() @@ -129,7 +122,7 @@ def input_specs_train(self) -> SpecDict: def output_specs_train(self) -> SpecDict: return [ "sampled_obs_symlog_BxT", - "obs_distribution_BxT", + "obs_distribution_means_BxT", "reward_logits_BxT", "rewards_BxT", "continue_distribution_BxT", diff --git a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py new file mode 100644 index 0000000000000..2e8ef82fd6dbe --- /dev/null +++ b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py @@ -0,0 +1,210 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf + +[3] +D. Hafner's (author) original code repo (for JAX): +https://github.com/danijar/dreamerv3 +""" +import unittest + +import gymnasium as gym +import numpy as np + +import ray +from ray.rllib.algorithms.dreamerv3 import dreamerv3 +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.test_utils import framework_iterator + + +class TestDreamerV3(unittest.TestCase): + @classmethod + def setUpClass(cls): + ray.init() + + @classmethod + def tearDownClass(cls): + ray.shutdown() + + def test_dreamerv3_compilation(self): + """Test whether DreamerV3 can be built with all frameworks.""" + + # Build a DreamerV3Config object. + config = ( + dreamerv3.DreamerV3Config() + .framework(eager_tracing=True) + .training( + # Keep things simple. Especially the long dream rollouts seem + # to take an enormous amount of time (initially). + batch_size_B=2 * 2, # shared w/ model AND learner AND env runner + batch_length_T=16, + horizon_H=5, + # TODO (sven): Fix having to provide this. + # Should be compiled automatically as `RLModuleConfig` by + # AlgorithmConfig (see comment below)? + model={ + "batch_length_T": 16, + "horizon_H": 5, + "model_size": "nano", # Use a tiny model for testing. 
+ "gamma": 0.997, + "symlog_obs": True, + }, + ) + .resources( + num_learner_workers=2, # Try with 2 Learners. + num_cpus_per_learner_worker=1, + num_gpus_per_learner_worker=0, + ) + .debugging(log_level="INFO") + ) + + # TODO (sven): Add a `get_model_config` utility to AlgorithmConfig + # that - for now - merges the user provided model_dict (which only + # contains settings that only affect the model, e.g. model_size) + # with the AlgorithmConfig-wide settings that are relevant for the model + # (e.g. `batch_size_B`). + # config.get_model_config() + + num_iterations = 2 + + for _ in framework_iterator(config, frameworks="tf2"): + for env in ["ALE/MsPacman-v5", "FrozenLake-v1", "CartPole-v1"]: + print("Env={}".format(env)) + config.environment(env) + algo = config.build() + + for i in range(num_iterations): + results = algo.train() + print(results) + + algo.stop() + + def test_dreamerv3_dreamer_model_sizes(self): + """Tests, whether the different model sizes match the ones reported in [1].""" + + # For Atari, these are the exact numbers from the repo ([3]). + # However, for CartPole + size "S" and "M", the author's original code will not + # match for the world model count. This is due to the fact that the author uses + # encoder/decoder nets with 5x1024 nodes (which corresponds to XL) regardless of + # the `model_size` settings (iff >="S"). + expected_num_params_world_model = { + "XS_cartpole": 2435076, + "S_cartpole": 7493380, + "M_cartpole": 16206084, + "L_cartpole": 37802244, + "XL_cartpole": 108353796, + "XS_atari": 7538979, + "S_atari": 15687811, + "M_atari": 32461635, + "L_atari": 68278275, + "XL_atari": 181558659, + } + + # All values confirmed against [3] (100% match). + expected_num_params_actor = { + # hidden=[1280, 256] + # hidden_norm=[256], [256] + # pi (2 actions)=[256, 2], [2] + "XS_cartpole": 328706, + "S_cartpole": 1051650, + "M_cartpole": 2135042, + "L_cartpole": 4136450, + "XL_cartpole": 9449474, + "XS_atari": 329734, + "S_atari": 1053702, + "M_atari": 2137606, + "L_atari": 4139526, + "XL_atari": 9453574, + } + + # All values confirmed against [3] (100% match). + expected_num_params_critic = { + # hidden=[1280, 256] + # hidden_norm=[256], [256] + # vf (buckets)=[256, 255], [255] + "XS_cartpole": 393727, + "S_cartpole": 1181439, + "M_cartpole": 2297215, + "L_cartpole": 4331007, + "XL_cartpole": 9708799, + "XS_atari": 393727, + "S_atari": 1181439, + "M_atari": 2297215, + "L_atari": 4331007, + "XL_atari": 9708799, + } + + config = ( + dreamerv3.DreamerV3Config() + .framework("tf2", eager_tracing=True) + .training( + model={ + "batch_length_T": 16, + "horizon_H": 5, + "gamma": 0.997, + "symlog_obs": True, + } + ) + ) + + # Check all model_sizes described in the paper ([1]) on matching the number + # of parameters to RLlib's implementation. + for model_size in ["XS", "S", "M", "L", "XL"]: + config.model_size = model_size + config.training(model={"model_size": model_size}) + + # Atari and CartPole spaces. + for obs_space, num_actions, env_name in [ + (gym.spaces.Box(-1.0, 0.0, (4,), np.float32), 2, "cartpole"), + (gym.spaces.Box(-1.0, 0.0, (64, 64, 3), np.float32), 6, "atari"), + ]: + print(f"Testing model_size={model_size} on env-type: {env_name} ..") + config.environment( + observation_space=obs_space, + action_space=gym.spaces.Discrete(num_actions), + ) + + # Create our RLModule to compute actions with. 
+ policy_dict, _ = config.get_multi_agent_setup() + module_spec = config.get_marl_module_spec(policy_dict=policy_dict) + rl_module = module_spec.build()[DEFAULT_POLICY_ID] + + # Count the generated RLModule's parameters and compare to the paper's + # reported numbers ([1] and [3]). + num_params_world_model = sum( + np.prod(v.shape.as_list()) + for v in rl_module.world_model.trainable_variables + ) + self.assertEqual( + num_params_world_model, + expected_num_params_world_model[f"{model_size}_{env_name}"], + ) + num_params_actor = sum( + np.prod(v.shape.as_list()) + for v in rl_module.actor.trainable_variables + ) + self.assertEqual( + num_params_actor, + expected_num_params_actor[f"{model_size}_{env_name}"], + ) + num_params_critic = sum( + np.prod(v.shape.as_list()) + for v in rl_module.critic.trainable_variables + ) + self.assertEqual( + num_params_critic, + expected_num_params_critic[f"{model_size}_{env_name}"], + ) + print("\tok") + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py index 6f970a9117d9e..366735f643d74 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py @@ -18,7 +18,7 @@ from ray.rllib.core.rl_module.marl_module import ModuleID from ray.rllib.core.learner.learner import ParamDict from ray.rllib.core.learner.tf.tf_learner import TfLearner -from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf, try_import_tfp from ray.rllib.utils.tf_utils import symlog, two_hot, clip_gradients @@ -34,16 +34,21 @@ class DreamerV3TfLearner(DreamerV3Learner, TfLearner): The critic EMA-copy update step can be found in the `DreamerV3Learner` base class, as it is framework independent. - We define 3 local tensorflow optimizers for the sub components "world_model", + We define 3 local TensorFlow optimizers for the sub components "world_model", "actor", and "critic". Each of these optimizers might use a different learning rate, epsilon parameter, and gradient clipping thresholds and procedures. """ @override(TfLearner) - def configure_optimizer_for_module( + def configure_optimizers_for_module( self, module_id: ModuleID, hps: DreamerV3LearnerHyperparameters ): - """Create the 3 optimizers for Dreamer learning: world_model, actor, critic.""" + """Create the 3 optimizers for Dreamer learning: world_model, actor, critic. + + The learning rates used are described in [1] and the epsilon values used here + - albeit probably not that important - are used by the author's own + implementation. + """ dreamerv3_module = self._module[module_id] @@ -95,7 +100,7 @@ def postprocess_gradients_for_module( """Performs gradient clipping on the 3 module components' computed grads. Note that different grad global-norm clip values are used for the 3 - module components (world model, actor, and critic). + module components: world model, actor, and critic. """ for optimizer_name, optimizer in self.get_optimizers_for_module( module_id=module_id @@ -134,6 +139,32 @@ def postprocess_gradients_for_module( return module_gradients_dict + @override(TfLearner) + def compute_gradients( + self, + loss_per_module, + gradient_tape, + **kwargs, + ): + # Override of the default gradient computation method. 
+ # For DreamerV3, we need to compute gradients over the individual loss terms + # as otherwise, the world model's parameters would have their gradients also + # be influenced by the actor- and critic loss terms/gradient computations. + grads = {} + for component in ["world_model", "actor", "critic"]: + grads.update( + gradient_tape.gradient( + # Take individual loss term from the registered metrics for + # the main module. + self._metrics[DEFAULT_POLICY_ID][component.upper() + "_L_total"], + self.filter_param_dict_for_optimizer( + self._params, self.get_optimizer(optimizer_name=component) + ), + ) + ) + del gradient_tape + return grads + @override(TfLearner) def compute_loss_for_module( self, @@ -170,7 +201,11 @@ def compute_loss_for_module( + 0.1 * L_rep_B_T ) - # Sum up timesteps, and average over batch (see eq. 4 in [1]). + # In the paper, it says to sum up timesteps, and average over + # batch (see eq. 4 in [1]). But Danijar's implementation only does + # averaging (over B and T), so we'll do this here as well. This is generally + # true for all other loss terms as well (we'll always just average, no summing + # over T axis!). L_world_model_total = tf.reduce_mean(L_world_model_total_B_T) # Register world model loss stats. @@ -182,28 +217,36 @@ def compute_loss_for_module( ), # Prediction losses. # Decoder (obs) loss. - "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], "WORLD_MODEL_L_decoder": prediction_losses["L_decoder"], # Reward loss. - "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], "WORLD_MODEL_L_reward": prediction_losses["L_reward"], # Continue loss. - "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], "WORLD_MODEL_L_continue": prediction_losses["L_continue"], # Total. - "WORLD_MODEL_L_prediction_B_T": prediction_losses["L_prediction_B_T"], "WORLD_MODEL_L_prediction": prediction_losses["L_prediction"], # Dynamics loss. - "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T, "WORLD_MODEL_L_dynamics": L_dyn, # Representation loss. - "WORLD_MODEL_L_representation_B_T": L_rep_B_T, "WORLD_MODEL_L_representation": L_rep, # Total loss. - "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T, "WORLD_MODEL_L_total": L_world_model_total, }, ) + if hps.report_individual_batch_item_stats: + self.register_metrics( + module_id=module_id, + metrics_dict={ + "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], + "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], + "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], + "WORLD_MODEL_L_prediction_B_T": ( + prediction_losses["L_prediction_B_T"] + ), + "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T, + "WORLD_MODEL_L_representation_B_T": L_rep_B_T, + "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T, + }, + ) # Dream trajectories starting in all internal states (h + z_posterior) that were # computed during world model training. @@ -219,17 +262,31 @@ def compute_loss_for_module( timesteps_H=hps.horizon_H, gamma=hps.gamma, ) - self.register_metrics(module_id, {"dream_data": dream_data}) + if hps.report_dream_data: + # To reduce this massive mount of data a little, slice out a T=1 piece + # from each stats that has the shape (H, BxT), meaning convert e.g. + # `rewards_dreamed_t0_to_H_BxT` into `rewards_dreamed_t0_to_H_Bx1`. + # This will reduce the amount of data to be transferred and reported + # by the factor of `batch_length_T`. + self.register_metrics( + module_id, + { + # Replace 'T' with '1'. 
+ "DREAM_DATA_" + key[:-1] + "1": value[:, hps.batch_size_B] + for key, value in dream_data.items() + if key.endswith("H_BxT") + }, + ) value_targets_t0_to_Hm1_BxT = self._compute_value_targets( hps=hps, # Learn critic in symlog'd space. - rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_B"], + rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_BxT"], intrinsic_rewards_t1_to_H_BxT=( dream_data["rewards_intrinsic_t1_to_H_B"] if hps.use_curiosity else None ), - continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_B"], - value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_B"], + continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_BxT"], + value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_BxT"], ) self.register_metric( module_id, "VALUE_TARGETS_H_BxT", value_targets_t0_to_Hm1_BxT @@ -237,6 +294,7 @@ def compute_loss_for_module( CRITIC_L_total = self._compute_critic_loss( module_id=module_id, + hps=hps, dream_data=dream_data, value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT, ) @@ -250,16 +308,6 @@ def compute_loss_for_module( else: ACTOR_L_total = 0.0 - # if hps.use_curiosity: - # L_disagree = self._compute_disagree_loss(dream_data=dream_data) - # results["DISAGREE_L_total"] = L_disagree - # results["DISAGREE_intrinsic_rewards_H_B"] = ( - # dream_data["rewards_intrinsic_t1_to_H_B"] - # ) - # results["DISAGREE_intrinsic_rewards"] = tf.reduce_mean( - # dream_data["rewards_intrinsic_t1_to_H_B"] - # ) - # Return the total loss as a sum of all individual losses. return L_world_model_total + CRITIC_L_total + ACTOR_L_total @@ -289,16 +337,27 @@ def _compute_world_model_prediction_losses( # If symlog is disabled (e.g. for uint8 image inputs), `obs_symlog_BxT` is the # same as `obs_BxT`. obs_BxT = fwd_out["sampled_obs_symlog_BxT"] - obs_distr = fwd_out["obs_distribution_BxT"] + obs_distr_means = fwd_out["obs_distribution_means_BxT"] + # In case we wanted to construct a distribution object from the fwd out data, + # we would have to do it like this: + # obs_distr = tfp.distributions.MultivariateNormalDiag( + # loc=obs_distr_means, + # # Scale == 1.0. + # # [2]: "Distributions The image predictor outputs the mean of a diagonal + # # Gaussian likelihood with **unit variance** ..." + # scale_diag=tf.ones_like(obs_distr_means), + # ) + # Leave time dim folded (BxT) and flatten all other (e.g. image) dims. obs_BxT = tf.reshape(obs_BxT, shape=[-1, tf.reduce_prod(obs_BxT.shape[1:])]) - # Neg logp loss. - # decoder_loss = - obs_distr.log_prob(observations) - # decoder_loss /= observations.shape.as_list()[1] # Squared diff loss w/ sum(!) over all (already folded) obs dims. + # decoder_loss_BxT = SUM[ (obs_distr.loc - observations)^2 ] + # Note: This is described strangely in the paper (stating a neglogp loss here), + # but the author's own implementation actually uses simple MSE with the loc + # of the Gaussian. decoder_loss_BxT = tf.reduce_sum( - tf.math.square(obs_distr.loc - obs_BxT), axis=-1 + tf.math.square(obs_distr_means - obs_BxT), axis=-1 ) # Unfold time rank back in. @@ -456,30 +515,36 @@ def _compute_actor_loss( """ actor = self.module[module_id].actor - # Note: `value_targets` are NOT stop_gradient'd yet. + # Note: `scaled_value_targets_t0_to_Hm1_B` are NOT stop_gradient'd yet. 
scaled_value_targets_t0_to_Hm1_B = self._compute_scaled_value_targets( module_id=module_id, hps=hps, value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT, - value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_B"][:-1], + value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_BxT"][ + :-1 + ], ) # Actions actually taken in the dream. - actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_B"])[:-1] - dist_actions_t0_to_Hm1_B = dream_data[ - "actions_dreamed_distributions_t0_to_H_B" + actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_BxT"])[ + :-1 + ] + actions_dreamed_dist_params_t0_to_Hm1_B = dream_data[ + "actions_dreamed_dist_params_t0_to_H_BxT" ][:-1] + dist_t0_to_Hm1_B = actor.get_action_dist_object( + actions_dreamed_dist_params_t0_to_Hm1_B + ) + # Compute log(p)s of all possible actions in the dream. if isinstance(self.module[module_id].actor.action_space, gym.spaces.Discrete): # Note that when we create the Categorical action distributions, we compute # unimix probs, then math.log these and provide these log(p) as "logits" to # the Categorical. So here, we'll continue to work with log(p)s (not # really "logits")! - logp_actions_t0_to_Hm1_B = tf.stack( - [dist.logits for dist in dist_actions_t0_to_Hm1_B], - axis=0, - ) + logp_actions_t0_to_Hm1_B = actions_dreamed_dist_params_t0_to_Hm1_B + # Log probs of actions actually taken in the dream. logp_actions_dreamed_t0_to_Hm1_B = tf.reduce_sum( actions_dreamed * logp_actions_t0_to_Hm1_B, @@ -489,29 +554,18 @@ def _compute_actor_loss( logp_loss_H_B = logp_actions_dreamed_t0_to_Hm1_B * tf.stop_gradient( scaled_value_targets_t0_to_Hm1_B ) - elif isinstance(actor.action_space, gym.spaces.Box): - # TODO (Rohan138, Sven): Figure out how to vectorize this instead! - logp_actions_dreamed_t0_to_Hm1_B = tf.stack( - [ - dist.log_prob(actions_dreamed[i]) - for i, dist in enumerate(dist_actions_t0_to_Hm1_B) - ] + # Box space. + else: + logp_actions_dreamed_t0_to_Hm1_B = dist_t0_to_Hm1_B.log_prob( + actions_dreamed ) # First term of loss function. [1] eq. 11. logp_loss_H_B = scaled_value_targets_t0_to_Hm1_B - else: - raise ValueError(f"Invalid action space: {actor.action_space}") assert len(logp_loss_H_B.shape) == 2 # Add entropy loss term (second term [1] eq. 11). - entropy_H_B = tf.stack( - [ - dist.entropy() - for dist in dream_data["actions_dreamed_distributions_t0_to_H_B"][:-1] - ], - axis=0, - ) + entropy_H_B = dist_t0_to_Hm1_B.entropy() assert len(entropy_H_B.shape) == 2 entropy = tf.reduce_mean(entropy_H_B) @@ -520,31 +574,44 @@ def _compute_actor_loss( L_actor_H_B = L_actor_reinforce_term_H_B + L_actor_action_entropy_term_H_B # Mask out everything that goes beyond a predicted continue=False boundary. - L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_B"])[:-1] + L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[ + :-1 + ] L_actor = tf.reduce_mean(L_actor_H_B) self.register_metrics( module_id, metrics_dict={ - "ACTOR_L_total_H_B": L_actor_H_B, "ACTOR_L_total": L_actor, - "ACTOR_logp_actions_dreamed_H_B": logp_actions_dreamed_t0_to_Hm1_B, - "ACTOR_scaled_value_targets_H_B": scaled_value_targets_t0_to_Hm1_B, "ACTOR_value_targets_pct95_ema": actor.ema_value_target_pct95, "ACTOR_value_targets_pct5_ema": actor.ema_value_target_pct5, - "ACTOR_action_entropy_H_B": entropy_H_B, "ACTOR_action_entropy": entropy, # Individual loss terms. 
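+ # (The corresponding per-batch-item "_H_BxT" tensors for these loss terms
+ # are only registered further below, gated by
+ # `hps.report_individual_batch_item_stats`.)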
- "ACTOR_L_neglogp_reinforce_term_H_B": L_actor_reinforce_term_H_B, "ACTOR_L_neglogp_reinforce_term": tf.reduce_mean( L_actor_reinforce_term_H_B ), - "ACTOR_L_neg_entropy_term_H_B": L_actor_action_entropy_term_H_B, "ACTOR_L_neg_entropy_term": tf.reduce_mean( L_actor_action_entropy_term_H_B ), }, ) + if hps.report_individual_batch_item_stats: + self.register_metrics( + module_id, + metrics_dict={ + "ACTOR_L_total_H_BxT": L_actor_H_B, + "ACTOR_logp_actions_dreamed_H_BxT": ( + logp_actions_dreamed_t0_to_Hm1_B + ), + "ACTOR_scaled_value_targets_H_BxT": ( + scaled_value_targets_t0_to_Hm1_B + ), + "ACTOR_action_entropy_H_BxT": entropy_H_B, + # Individual loss terms. + "ACTOR_L_neglogp_reinforce_term_H_BxT": L_actor_reinforce_term_H_B, + "ACTOR_L_neg_entropy_term_H_BxT": L_actor_action_entropy_term_H_B, + }, + ) return L_actor @@ -552,6 +619,7 @@ def _compute_critic_loss( self, *, module_id: ModuleID, + hps: DreamerV3LearnerHyperparameters, dream_data: Dict[str, TensorType], value_targets_t0_to_Hm1_BxT: TensorType, ) -> TensorType: @@ -559,6 +627,7 @@ def _compute_critic_loss( Args: module_id: The ModuleID for which to compute the critic loss. + hps: The DreamerV3LearnerHyperparameters to use. dream_data: The data generated by dreaming for H steps (horizon) starting from any BxT state (sampled from the buffer for the train batch). value_targets_t0_to_Hm1_BxT: The computed value function targets of the @@ -567,7 +636,8 @@ def _compute_critic_loss( Returns: The total critic loss tensor. """ - H, B = dream_data["rewards_dreamed_t0_to_H_B"].shape[:2] + # B=BxT + H, B = dream_data["rewards_dreamed_t0_to_H_BxT"].shape[:2] Hm1 = H - 1 # Note that value targets are NOT symlog'd and go from t0 to H-1, not H, like @@ -586,7 +656,7 @@ def _compute_critic_loss( ) # Get (B x T x probs) tensor from return distributions. - value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxB"] + value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxBxT"] # Unfold time rank and cut last time index to match value targets. value_symlog_logits_t0_to_Hm1_B = tf.reshape( value_symlog_logits_HxB, @@ -608,7 +678,7 @@ def _compute_critic_loss( # Expected values (dreamed) from the EMA (slow critic) net. # Note: Slow critic (EMA) outputs are already stop_gradient'd. value_symlog_ema_t0_to_Hm1_B = tf.stop_gradient( - dream_data["v_symlog_dreamed_ema_t0_to_H_B"] + dream_data["v_symlog_dreamed_ema_t0_to_H_BxT"] )[:-1] # Fold time rank (for two_hot'ing). value_symlog_ema_HxB = tf.reshape(value_symlog_ema_t0_to_Hm1_B, (-1,)) @@ -634,7 +704,7 @@ def _compute_critic_loss( L_critic_H_B = value_loss_two_hot_H_B + ema_regularization_loss_H_B # Mask out everything that goes beyond a predicted continue=False boundary. - L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_B"])[ + L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[ :-1 ] @@ -644,21 +714,29 @@ def _compute_critic_loss( self.register_metrics( module_id=module_id, metrics_dict={ - # Symlog'd value targets. Critic learns to predict symlog'd values. - "VALUE_TARGETS_symlog_H_B": value_symlog_targets_t0_to_Hm1_B, - # Critic loss terms. 
"CRITIC_L_total": L_critic, - "CRITIC_L_total_H_B": L_critic_H_B, - "CRITIC_L_neg_logp_of_value_targets_H_B": value_loss_two_hot_H_B, "CRITIC_L_neg_logp_of_value_targets": tf.reduce_mean( value_loss_two_hot_H_B ), - "CRITIC_L_slow_critic_regularization_H_B": ema_regularization_loss_H_B, "CRITIC_L_slow_critic_regularization": tf.reduce_mean( ema_regularization_loss_H_B ), }, ) + if hps.report_individual_batch_item_stats: + self.register_metrics( + module_id=module_id, + metrics_dict={ + # Symlog'd value targets. Critic learns to predict symlog'd values. + "VALUE_TARGETS_symlog_H_BxT": value_symlog_targets_t0_to_Hm1_B, + # Critic loss terms. + "CRITIC_L_total_H_BxT": L_critic_H_B, + "CRITIC_L_neg_logp_of_value_targets_H_BxT": value_loss_two_hot_H_B, + "CRITIC_L_slow_critic_regularization_H_BxT": ( + ema_regularization_loss_H_B + ), + }, + ) return L_critic @@ -724,7 +802,7 @@ def _compute_value_targets( # intermediates.shape=[2-16, BxT] # Loop through reversed timesteps (axis=1) from T+1 to t=2. - for t in reversed(range(len(discount))): + for t in reversed(range(discount.shape[0])): Rs.append(intermediates[t] + discount[t] * hps.gae_lambda * Rs[-1]) # Reverse along time axis and cut the last entry (value estimate at very end @@ -767,21 +845,32 @@ def _compute_scaled_value_targets( Per_R_5 = tfp.stats.percentile(value_targets_H_B, 5) Per_R_95 = tfp.stats.percentile(value_targets_H_B, 95) - # Update EMAs stored in actor network. - # Initial values: Just set. - if tf.math.is_nan(actor.ema_value_target_pct5): - actor.ema_value_target_pct5.assign(Per_R_5) - actor.ema_value_target_pct95.assign(Per_R_95) - # Later update (something already stored in EMA variable): Update EMA. - else: - actor.ema_value_target_pct5.assign( + # Update EMA values for 5 and 95 percentile, stored as tf variables under actor + # network. + # 5 percentile + new_val_pct5 = tf.where( + tf.math.is_nan(actor.ema_value_target_pct5), + # is NaN: Initial values: Just set. + Per_R_5, + # Later update (something already stored in EMA variable): Update EMA. + ( hps.return_normalization_decay * actor.ema_value_target_pct5 + (1.0 - hps.return_normalization_decay) * Per_R_5 - ) - actor.ema_value_target_pct95.assign( + ), + ) + actor.ema_value_target_pct5.assign(new_val_pct5) + # 95 percentile + new_val_pct95 = tf.where( + tf.math.is_nan(actor.ema_value_target_pct95), + # is NaN: Initial values: Just set. + Per_R_95, + # Later update (something already stored in EMA variable): Update EMA. + ( hps.return_normalization_decay * actor.ema_value_target_pct95 + (1.0 - hps.return_normalization_decay) * Per_R_95 - ) + ), + ) + actor.ema_value_target_pct95.assign(new_val_pct95) # [1] eq. 11 (first term). # Danijar's code: TODO: describe ... diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py index 0cb088e60fd95..77c4c285b21ba 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py @@ -1,3 +1,12 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" from typing import Mapping, Any from ray.rllib.algorithms.dreamerv3.dreamerv3_rl_module import DreamerV3RLModule diff --git a/rllib/algorithms/dreamerv3/tf/models/actor_network.py b/rllib/algorithms/dreamerv3/tf/models/actor_network.py index f22617960b0a8..d865f85606a3a 100644 --- a/rllib/algorithms/dreamerv3/tf/models/actor_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/actor_network.py @@ -8,10 +8,12 @@ import gymnasium as gym from gymnasium.spaces import Box, Discrete import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.utils.framework import try_import_tf, try_import_tfp + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() class ActorNetwork(tf.keras.Model): @@ -28,19 +30,19 @@ class ActorNetwork(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", action_space: gym.Space, ): """Initializes an ActorNetwork instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. """ super().__init__(name="actor") - self.model_dimension = model_dimension + self.model_size = model_size self.action_space = action_space # The EMA decay variables used for the [Percentile(R, 95%) - Percentile(R, 5%)] @@ -55,20 +57,23 @@ def __init__( # For discrete actions, use a single MLP that computes logits. if isinstance(self.action_space, Discrete): self.mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=self.action_space.n, name="actor_mlp", ) # For cont. actions, use separate MLPs for Gaussian mean and stddev. + # TODO (sven): In the author's original code repo, this is NOT the case, + # inputs are pushed through a shared MLP, then only the two output linear + # layers are separate for std- and mean logits. elif isinstance(action_space, Box): output_layer_size = np.prod(action_space.shape) self.mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=output_layer_size, name="actor_mlp_mean", ) self.std_mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=output_layer_size, name="actor_mlp_std", ) @@ -76,15 +81,15 @@ def __init__( raise ValueError(f"Invalid action space: {action_space}") @tf.function - def call(self, h, z, return_distribution=False): + def call(self, h, z, return_distr_params=False): """Performs a forward pass through this policy network. Args: h: The deterministic hidden state of the sequence model. [B, dim(h)]. z: The stochastic discrete representations of the original observation input. [B, num_categoricals, num_classes]. - return_distribution: Whether to return (as a second tuple item) the action - distribution object created by the policy. + return_distr_params: Whether to return (as a second tuple item) the action + distribution parameter tensor created by the policy. """ # Flatten last two dims of z. assert len(z.shape) == 3 @@ -109,8 +114,10 @@ def call(self, h, z, return_distribution=False): # Danijar's code does: distr = [Distr class](logits=tf.log(probs)). # Not sure why we don't directly use the already available probs instead. 
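+ # Passing log(probs) as "logits" is valid here: tfp re-normalizes logits via
+ # a softmax, and softmax(log(p)) recovers p for any proper probability
+ # vector, so the unimix'd probs are preserved exactly.
+ # Note also the straight-through trick a few lines below: the forward pass
+ # uses the sampled one-hot action, while gradients flow through
+ # `action_probs` via `a = stop_gradient(sample) + probs - stop_gradient(probs)`.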
action_logits = tf.math.log(action_probs) - # Create the distribution object using the unimix'd logits. - distr = tfp.distributions.OneHotCategorical(logits=action_logits) + + # Distribution parameters are the log(probs) directly. + distr_params = action_logits + distr = self.get_action_dist_object(distr_params) action = tf.cast(tf.stop_gradient(distr.sample()), tf.float32) + ( action_probs - tf.stop_gradient(action_probs) @@ -122,15 +129,48 @@ def call(self, h, z, return_distribution=False): # minstd, maxstd taken from [1] from configs.yaml minstd = 0.1 maxstd = 1.0 + + # Distribution parameters are the squashed std_logits and the tanh'd + # mean logits. # squash std_logits from (-inf, inf) to (minstd, maxstd) std_logits = (maxstd - minstd) * tf.sigmoid(std_logits + 2.0) + minstd + mean_logits = tf.tanh(action_logits) + + distr_params = tf.concat([mean_logits, std_logits], axis=-1) + distr = self.get_action_dist_object(distr_params) + + action = distr.sample() + + if return_distr_params: + return action, distr_params + return action + + def get_action_dist_object(self, action_dist_params_T_B): + """Helper method to create an action distribution object from (T, B, ..) params. + + Args: + action_dist_params_T_B: The time-major action distribution parameters. + This could be simply the logits (discrete) or a to-be-split-in-2 + tensor for mean and stddev (continuous). + + Returns: + The tfp action distribution object, from which one can sample, compute + log probs, entropy, etc.. + """ + if isinstance(self.action_space, gym.spaces.Discrete): + # Create the distribution object using the unimix'd logits. + distr = tfp.distributions.OneHotCategorical(logits=action_dist_params_T_B) + + elif isinstance(self.action_space, gym.spaces.Box): # Compute Normal distribution from action_logits and std_logits - distr = tfp.distributions.Normal(tf.tanh(action_logits), std_logits) + loc, scale = tf.split(action_dist_params_T_B, 2, axis=-1) + distr = tfp.distributions.Normal(loc=loc, scale=scale) + # If action_space is a box with multiple dims, make individual dims # independent. distr = tfp.distributions.Independent(distr, len(self.action_space.shape)) - action = distr.sample() - if return_distribution: - return action, distr - return action + else: + raise ValueError(f"Action space {self.action_space} not supported!") + + return distr diff --git a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py index ba9ec38a0fa55..0700240f1bf8c 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py @@ -5,9 +5,10 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class CNNAtari(tf.keras.Model): @@ -16,13 +17,13 @@ class CNNAtari(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", cnn_multiplier: Optional[int] = None, ): """Initializes a CNNAtari instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN layer. 
Starting with @@ -32,7 +33,7 @@ def __init__( """ super().__init__(name="image_encoder") - cnn_multiplier = get_cnn_multiplier(model_dimension, override=cnn_multiplier) + cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier) # See appendix C in [1]: # "We use a similar network architecture but employ layer normalization and diff --git a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py index 41031c950e11b..a23ddca856c87 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py @@ -5,10 +5,11 @@ """ from typing import Optional -import tensorflow as tf -import tensorflow_probability as tfp - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.utils.framework import try_import_tf, try_import_tfp + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() class ContinuePredictor(tf.keras.Model): @@ -23,15 +24,15 @@ class ContinuePredictor(tf.keras.Model): terminal. """ - def __init__(self, *, model_dimension: Optional[str] = "XS"): + def __init__(self, *, model_size: Optional[str] = "XS"): """Initializes a ContinuePredictor instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. """ super().__init__(name="continue_predictor") - self.mlp = MLP(model_dimension=model_dimension, output_layer_size=1) + self.mlp = MLP(model_size=model_size, output_layer_size=1) def call(self, h, z, return_distribution=False): """Performs a forward pass through the continue predictor. diff --git a/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py index cffa73adb8029..ebc8649ccd79b 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py @@ -10,10 +10,11 @@ from typing import Optional import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class ConvTransposeAtari(tf.keras.Model): @@ -28,14 +29,14 @@ class ConvTransposeAtari(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", cnn_multiplier: Optional[int] = None, gray_scaled: bool, ): """Initializes a ConvTransposeAtari instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN transpose layer. Starting with @@ -47,7 +48,7 @@ def __init__( """ super().__init__(name="image_decoder") - cnn_multiplier = get_cnn_multiplier(model_dimension, override=cnn_multiplier) + cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier) # The shape going into the first Conv2DTranspose layer. # We start with a 4x4 channels=8 "image". @@ -146,15 +147,9 @@ def call(self, h, z): # From [2]: # "Distributions: The image predictor outputs the mean of a diagonal Gaussian # likelihood with unit variance, ..." 
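+ # With unit variance, the per-pixel negative log-likelihood reduces to
+ # 0.5 * (x - loc)^2 plus a constant, so it suffices to return only the mean
+ # (`loc`) and let the learner apply a squared-error loss.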
+ # Reshape `out` for the diagonal multi-variate Gaussian (each pixel is its own # independent (b/c diagonal co-variance matrix) variable). loc = tf.reshape(out, shape=(out_shape[0], -1)) - distribution = tfp.distributions.MultivariateNormalDiag( - loc=loc, - # Scale == 1.0. - # [2]: "Distributions The image predictor outputs the mean of a diagonal - # Gaussian likelihood with **unit variance** ..." - scale_diag=tf.ones_like(loc), - ) - pred_obs = distribution.sample() - return pred_obs, distribution + + return loc diff --git a/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py index fc69c8dd33f9c..559009a44531f 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py @@ -5,12 +5,13 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import ( RepresentationLayer, ) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class DynamicsPredictor(tf.keras.Model): @@ -26,17 +27,17 @@ class DynamicsPredictor(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_categoricals: Optional[int] = None, num_classes_per_categorical: Optional[int] = None, ): """Initializes a DynamicsPredictor instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different parameters. num_categoricals: Overrides the number of categoricals used in the z-states. - In [1], 32 is used for any model dimension. + In [1], 32 is used for any model size. num_classes_per_categorical: Overrides the number of classes within each categorical used for the z-states. In [1], 32 is used for any model dimension. @@ -47,12 +48,12 @@ def __init__( # TODO: In Danijar's code, the Dynamics Net only has a single layer, no # matter the model size. num_dense_layers=1, - model_dimension=model_dimension, + model_size=model_size, output_layer_size=None, ) # The (prior) z-state generating layer. self.representation_layer = RepresentationLayer( - model_dimension=model_dimension, + model_size=model_size, num_categoricals=num_categoricals, num_classes_per_categorical=num_classes_per_categorical, ) diff --git a/rllib/algorithms/dreamerv3/tf/models/components/mlp.py b/rllib/algorithms/dreamerv3/tf/models/components/mlp.py index 30d4a7713ee1a..435d9f8544ab3 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/mlp.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/mlp.py @@ -9,12 +9,13 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.utils import ( get_dense_hidden_units, get_num_dense_layers, ) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class MLP(tf.keras.Model): @@ -22,13 +23,13 @@ class MLP(tf.keras.Model): MLP=multi-layer perceptron. - See Appendix B in [1] for the MLP sizes depending on the given `model_dimension`. + See Appendix B in [1] for the MLP sizes depending on the given `model_size`. 
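+    (For example, per the size tables in `utils/__init__.py` below: "XS" -> 1
+    layer of 256 units, "S" -> 2 x 512, "M" -> 3 x 640, "L" -> 4 x 768,
+    "XL" -> 5 x 1024.)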
""" def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_dense_layers: Optional[int] = None, dense_hidden_units: Optional[int] = None, output_layer_size=None, @@ -38,12 +39,12 @@ def __init__( """Initializes an MLP instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. num_dense_layers: The number of hidden layers in the MLP. If None, - will use `model_dimension` and appendix B to figure out this value. + will use `model_size` and appendix B to figure out this value. dense_hidden_units: The number of nodes in each hidden layer. If None, - will use `model_dimension` and appendix B to figure out this value. + will use `model_size` and appendix B to figure out this value. output_layer_size: The size of an optional linear (no activation) output layer. If None, no output layer will be added on top of the MLP dense stack. @@ -52,11 +53,9 @@ def __init__( """ super().__init__(name=name or "mlp") - num_dense_layers = get_num_dense_layers( - model_dimension, override=num_dense_layers - ) + num_dense_layers = get_num_dense_layers(model_size, override=num_dense_layers) dense_hidden_units = get_dense_hidden_units( - model_dimension, override=dense_hidden_units + model_size, override=dense_hidden_units ) self.dense_layers = [] diff --git a/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py b/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py index 36e2ace631844..cf6b27b3c68ff 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py @@ -9,13 +9,14 @@ """ from typing import Optional -import tensorflow as tf -import tensorflow_probability as tfp - from ray.rllib.algorithms.dreamerv3.utils import ( get_num_z_categoricals, get_num_z_classes, ) +from ray.rllib.utils.framework import try_import_tf, try_import_tfp + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() class RepresentationLayer(tf.keras.layers.Layer): @@ -29,26 +30,26 @@ class RepresentationLayer(tf.keras.layers.Layer): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_categoricals: Optional[int] = None, num_classes_per_categorical: Optional[int] = None, ): """Initializes a RepresentationLayer instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different parameters. num_categoricals: Overrides the number of categoricals used in the z-states. - In [1], 32 is used for any model dimension. + In [1], 32 is used for any model size. num_classes_per_categorical: Overrides the number of classes within each categorical used for the z-states. In [1], 32 is used for any model dimension. 
""" self.num_categoricals = get_num_z_categoricals( - model_dimension, override=num_categoricals + model_size, override=num_categoricals ) self.num_classes_per_categorical = get_num_z_classes( - model_dimension, override=num_classes_per_categorical + model_size, override=num_classes_per_categorical ) super().__init__( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py index 7af29664c6024..c8ce0fc260fd6 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py @@ -5,12 +5,13 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, ) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class RewardPredictor(tf.keras.Model): @@ -22,7 +23,7 @@ class RewardPredictor(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, @@ -30,7 +31,7 @@ def __init__( """Initializes a RewardPredictor instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. num_buckets: The number of buckets to create. Note that the number of possible symlog'd outcomes from the used distribution is @@ -51,7 +52,7 @@ def __init__( super().__init__(name="reward_predictor") self.mlp = MLP( - model_dimension=model_dimension, + model_size=model_size, output_layer_size=None, ) self.reward_layer = RewardPredictorLayer( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py index f9c92e92e7279..185098b15b2bc 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py @@ -7,7 +7,9 @@ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba https://arxiv.org/pdf/2010.02193.pdf """ -import tensorflow as tf +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class RewardPredictorLayer(tf.keras.layers.Layer): @@ -15,7 +17,7 @@ class RewardPredictorLayer(tf.keras.layers.Layer): This layer is used in two models in DreamerV3: The reward predictor of the world model and the value function. K is 255 by default (see [1]) and doesn't change - with the model dimension. + with the model size. Possible predicted reward/values range from symexp(-20.0) to symexp(20.0), which should cover any possible environment. 
Outputs of this layer are generated by diff --git a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py index 5f1d02f539ed8..d8ee68499625a 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py @@ -6,10 +6,12 @@ from typing import Optional import gymnasium as gym -import tensorflow as tf from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.utils import get_gru_units +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class SequenceModel(tf.keras.Model): @@ -37,23 +39,23 @@ class SequenceModel(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", action_space: gym.Space, num_gru_units: Optional[int] = None, ): """Initializes a SequenceModel instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the number of GRU units used. action_space: The action space the our environment used. num_gru_units: Overrides the number of GRU units (dimension of the h-state). - If None, use the value given through `model_dimension` + If None, use the value given through `model_size` (see [1] Appendix B). """ super().__init__(name="sequence_model") - num_gru_units = get_gru_units(model_dimension, override=num_gru_units) + num_gru_units = get_gru_units(model_size, override=num_gru_units) self.action_space = action_space # In Danijar's code, there is an additional layer (units=[model_size]) @@ -61,7 +63,7 @@ def __init__( # the paper. self.pre_gru_layer = MLP( num_dense_layers=1, - model_dimension=model_dimension, + model_size=model_size, output_layer_size=None, ) self.gru_unit = tf.keras.layers.GRU( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py index 08dadaf6494d4..bcfdb164e6d0a 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py @@ -6,10 +6,11 @@ from typing import Optional import gymnasium as gym -import tensorflow as tf -import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class VectorDecoder(tf.keras.Model): @@ -22,13 +23,13 @@ class VectorDecoder(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", observation_space: gym.Space, ): """Initializes a VectorDecoder instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. observation_space: The observation space to decode back into. This must be a Box of shape (d,), where d >= 1. @@ -41,7 +42,7 @@ def __init__( ) self.mlp = MLP( - model_dimension=model_dimension, + model_size=model_size, output_layer_size=observation_space.shape[0], ) @@ -62,13 +63,5 @@ def call(self, h, z): # Send h-cat-z through MLP to get mean values of diag gaussian. loc = self.mlp(out) - # Create the Gaussian diag distribution. 
- distribution = tfp.distributions.MultivariateNormalDiag( - loc=loc, - # Scale == 1.0. - scale_diag=tf.ones_like(loc), - ) - pred_obs = distribution.sample() - - # Always return both predicted observations (sample0 and distribution. - return pred_obs, distribution + # Return only the predicted observations (mean, no sample). + return loc diff --git a/rllib/algorithms/dreamerv3/tf/models/critic_network.py b/rllib/algorithms/dreamerv3/tf/models/critic_network.py index 837ca68ccfdcf..d40441e585baf 100644 --- a/rllib/algorithms/dreamerv3/tf/models/critic_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/critic_network.py @@ -5,12 +5,13 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, ) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class CriticNetwork(tf.keras.Model): @@ -27,7 +28,7 @@ class CriticNetwork(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, @@ -36,7 +37,7 @@ def __init__( """Initializes a CriticNetwork instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. num_buckets: The number of buckets to create. Note that the number of possible symlog'd outcomes from the used distribution is @@ -63,7 +64,7 @@ def __init__( """ super().__init__(name="critic") - self.model_dimension = model_dimension + self.model_size = model_size self.ema_decay = ema_decay # "Fast" critic network(s) (mlp + reward-pred-layer). This is the network @@ -72,7 +73,7 @@ def __init__( # the critic loss term such that the weights of this fast critic stay close # to the EMA weights (see below). self.mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=None, ) self.return_layer = RewardPredictorLayer( @@ -85,7 +86,7 @@ def __init__( # target net, BUT not used to compute anything, just for the # weights regularizer term inside the critic loss). 
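+ # During training, these EMA weights track the fast critic's weights,
+ # roughly: w_ema <- ema_decay * w_ema + (1 - ema_decay) * w_fast.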
self.mlp_ema = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=None, trainable=False, ) diff --git a/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py b/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py index d186fdcd39eba..1a6f95245e302 100644 --- a/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py +++ b/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py @@ -4,12 +4,14 @@ https://arxiv.org/pdf/2301.04104v1.pdf """ -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import ( RepresentationLayer, ) +from ray.rllib.utils.framework import try_import_tf, try_import_tfp + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() class DisagreeNetworks(tf.keras.Model): @@ -21,10 +23,10 @@ class DisagreeNetworks(tf.keras.Model): TODO """ - def __init__(self, *, num_networks, model_dimension, intrinsic_rewards_scale): + def __init__(self, *, num_networks, model_size, intrinsic_rewards_scale): super().__init__(name="disagree_networks") - self.model_dimension = model_dimension + self.model_size = model_size self.num_networks = num_networks self.intrinsic_rewards_scale = intrinsic_rewards_scale @@ -34,15 +36,13 @@ def __init__(self, *, num_networks, model_dimension, intrinsic_rewards_scale): for _ in range(self.num_networks): self.mlps.append( MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=None, trainable=True, ) ) self.representation_layers.append( - RepresentationLayer( - model_dimension=self.model_dimension, name="disagree" - ) + RepresentationLayer(model_size=self.model_size, name="disagree") ) @tf.function diff --git a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py index 9621c95ce3c22..f735b9e031ea3 100644 --- a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py @@ -7,20 +7,25 @@ import gymnasium as gym import numpy as np -import tensorflow as tf from ray.rllib.algorithms.dreamerv3.tf.models.disagree_networks import DisagreeNetworks - +from ray.rllib.algorithms.dreamerv3.tf.models.actor_network import ActorNetwork +from ray.rllib.algorithms.dreamerv3.tf.models.critic_network import CriticNetwork +from ray.rllib.algorithms.dreamerv3.tf.models.world_model import WorldModel +from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.tf_utils import inverse_symlog +_, tf, _ = try_import_tf() + class DreamerModel(tf.keras.Model): """The main tf-keras model containing all necessary components for DreamerV3. Includes: - - The world model (with encoder, decoder, sequence-model (RSSM), dynamics - (prior z-state generating) model, and "posterior" model) for producing dreamed - trajectories. + - The world model with encoder, decoder, sequence-model (RSSM), dynamics + (generates prior z-state), and "posterior" model (generates posterior z-state). + Predicts env dynamics and produces dreamed trajectories for actor- and critic + learning. - The actor network (policy). - The critic network for value function prediction. 
""" @@ -28,32 +33,29 @@ class DreamerModel(tf.keras.Model): def __init__( self, *, - model_dimension: str = "XS", + model_size: str = "XS", action_space: gym.Space, - batch_size_B, - batch_length_T, - horizon_H, - world_model, - actor, - critic, + world_model: WorldModel, + actor: ActorNetwork, + critic: CriticNetwork, use_curiosity: bool = False, intrinsic_rewards_scale: float = 0.1, ): - """TODO + """Initializes a DreamerModel instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. + world_model: The WorldModel component. + actor: The ActorNetwork component. + critic: The CriticNetwork component. """ super().__init__(name="dreamer_model") - self.model_dimension = model_dimension + self.model_size = model_size self.action_space = action_space self.use_curiosity = use_curiosity - self.batch_size_B = batch_size_B - self.batch_length_T = batch_length_T - self.horizon_H = horizon_H self.world_model = world_model self.actor = actor @@ -63,7 +65,7 @@ def __init__( if self.use_curiosity: self.disagree_nets = DisagreeNetworks( num_networks=8, - model_dimension=self.model_dimension, + model_size=self.model_size, intrinsic_rewards_scale=intrinsic_rewards_scale, ) @@ -97,11 +99,11 @@ def call( actions = self.actor( h=results["h_states_BxT"], z=results["z_posterior_states_BxT"] ) - # Actor (with returning distribution). - _, distr = self.actor( + # Actor (with returning distribution parameters). + _, distr_params = self.actor( h=results["h_states_BxT"], z=results["z_posterior_states_BxT"], - return_distribution=True, + return_distr_params=True, ) # Critic. values = self.critic( @@ -155,8 +157,11 @@ def forward_inference(self, observations, previous_states, is_first, training=No is_first=is_first, ) # Compute action using our actor network and the current states. - _, distr = self.actor(h=states["h"], z=states["z"], return_distribution=True) + _, distr_params = self.actor( + h=states["h"], z=states["z"], return_distr_params=True + ) # Use the mode of the distribution (Discrete=argmax, Normal=mean). + distr = self.actor.get_action_dist_object(distr_params) actions = distr.mode() return actions, {"h": states["h"], "z": states["z"], "a": actions} @@ -267,9 +272,9 @@ def dream_trajectory( timesteps_H: The number of timesteps to dream for. gamma: The discount factor gamma. """ - # Dreamed actions (one-hot for discrete actions). + # Dreamed actions (one-hot encoded for discrete actions). a_dreamed_t0_to_H = [] - a_dreamed_distributions_t0_to_H = [] + a_dreamed_dist_params_t0_to_H = [] h = start_states["h"] z = start_states["z"] @@ -281,7 +286,7 @@ def dream_trajectory( # Compute `a` using actor network (already the first step uses a dreamed action, # not a sampled one). - a, a_dist = self.actor( + a, a_dist_params = self.actor( # We have to stop the gradients through the states. B/c we are using a # differentiable Discrete action distribution (straight through gradients # with `a = stop_gradient(sample(probs)) + probs - stop_gradient(probs)`, @@ -289,10 +294,10 @@ def dream_trajectory( # term on actions further back in the trajectory. 
h=tf.stop_gradient(h), z=tf.stop_gradient(z), - return_distribution=True, + return_distr_params=True, ) a_dreamed_t0_to_H.append(a) - a_dreamed_distributions_t0_to_H.append(a_dist) + a_dreamed_dist_params_t0_to_H.append(a_dist_params) for i in range(timesteps_H): # Move one step in the dream using the RSSM. @@ -304,13 +309,13 @@ def dream_trajectory( z_states_prior_t0_to_H.append(z) # Compute `a` using actor network. - a, a_dist = self.actor( + a, a_dist_params = self.actor( h=tf.stop_gradient(h), z=tf.stop_gradient(z), - return_distribution=True, + return_distr_params=True, ) a_dreamed_t0_to_H.append(a) - a_dreamed_distributions_t0_to_H.append(a_dist) + a_dreamed_dist_params_t0_to_H.append(a_dist_params) h_states_H_B = tf.stack(h_states_t0_to_H, axis=0) # (T, B, ...) h_states_HxB = tf.reshape(h_states_H_B, [-1] + h_states_H_B.shape.as_list()[2:]) @@ -321,6 +326,7 @@ def dream_trajectory( ) a_dreamed_H_B = tf.stack(a_dreamed_t0_to_H, axis=0) # (T, B, ...) + a_dreamed_dist_params_H_B = tf.stack(a_dreamed_dist_params_t0_to_H, axis=0) # Compute r using reward predictor. r_dreamed_H_B = tf.reshape( @@ -389,17 +395,20 @@ def dream_trajectory( ) ret = { - "h_states_t0_to_H_B": h_states_H_B, - "z_states_prior_t0_to_H_B": z_states_prior_H_B, - "rewards_dreamed_t0_to_H_B": r_dreamed_H_B, - "continues_dreamed_t0_to_H_B": c_dreamed_H_B, - "actions_dreamed_t0_to_H_B": a_dreamed_H_B, - "actions_dreamed_distributions_t0_to_H_B": a_dreamed_distributions_t0_to_H, - "values_dreamed_t0_to_H_B": v_dreamed_H_B, - "values_symlog_dreamed_logits_t0_to_HxB": v_symlog_dreamed_logits_HxB, - "v_symlog_dreamed_ema_t0_to_H_B": v_symlog_dreamed_ema_H_B, + "h_states_t0_to_H_BxT": h_states_H_B, + "z_states_prior_t0_to_H_BxT": z_states_prior_H_B, + "rewards_dreamed_t0_to_H_BxT": r_dreamed_H_B, + "continues_dreamed_t0_to_H_BxT": c_dreamed_H_B, + "actions_dreamed_t0_to_H_BxT": a_dreamed_H_B, + # "actions_dreamed_distributions_t0_to_H_BxT": ( + # a_dreamed_distributions_t0_to_H + # ), + "actions_dreamed_dist_params_t0_to_H_BxT": a_dreamed_dist_params_H_B, + "values_dreamed_t0_to_H_BxT": v_dreamed_H_B, + "values_symlog_dreamed_logits_t0_to_HxBxT": v_symlog_dreamed_logits_HxB, + "v_symlog_dreamed_ema_t0_to_H_BxT": v_symlog_dreamed_ema_H_B, # Loss weights for critic- and actor losses. - "dream_loss_weights_t0_to_H_B": dream_loss_weights_H_B, + "dream_loss_weights_t0_to_H_BxT": dream_loss_weights_H_B, } if self.use_curiosity: @@ -537,20 +546,20 @@ def dream_trajectory_with_burn_in( # an original time dimension from the real env, from all of which we then branch # out our dream trajectories). ret = { - "h_states_t0_to_H_B": h_states_t0_to_H_B, - "z_states_prior_t0_to_H_B": z_states_prior_t0_to_H_B, + "h_states_t0_to_H_BxT": h_states_t0_to_H_B, + "z_states_prior_t0_to_H_BxT": z_states_prior_t0_to_H_B, # Unfold time-ranks in predictions. - "rewards_dreamed_t0_to_H_B": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)), - "continues_dreamed_t0_to_H_B": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)), + "rewards_dreamed_t0_to_H_BxT": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)), + "continues_dreamed_t0_to_H_BxT": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)), } # Figure out action key (random, sampled from env, dreamed?). 
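+ # Naming convention: the "_BxT" suffix indicates that the batch- and time
+ # dims of the real-env sample batch have been folded into a single axis;
+ # dreamed tensors thus carry the dream timesteps (t0..H) as their leading
+ # axis, followed by this folded B*T axis.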
if use_sampled_actions_in_dream: - key = "actions_sampled_t0_to_H_B" + key = "actions_sampled_t0_to_H_BxT" elif use_random_actions_in_dream: - key = "actions_random_t0_to_H_B" + key = "actions_random_t0_to_H_BxT" else: - key = "actions_dreamed_t0_to_H_B" + key = "actions_dreamed_t0_to_H_BxT" ret[key] = a_t0_to_H_B # Also provide int-actions, if discrete action space. diff --git a/rllib/algorithms/dreamerv3/tf/models/world_model.py b/rllib/algorithms/dreamerv3/tf/models/world_model.py index 39fa3e587d6ef..73195fc8e1a0b 100644 --- a/rllib/algorithms/dreamerv3/tf/models/world_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/world_model.py @@ -6,7 +6,6 @@ from typing import Optional import gymnasium as gym -import tensorflow as tf import tree # pip install dm_tree from ray.rllib.algorithms.dreamerv3.tf.models.components.continue_predictor import ( @@ -26,9 +25,13 @@ SequenceModel, ) from ray.rllib.algorithms.dreamerv3.utils import get_gru_units +from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.tf_utils import symlog +_, tf, _ = try_import_tf() + + class WorldModel(tf.keras.Model): """WorldModel component of [1] w/ encoder, decoder, RSSM, reward/cont. predictors. @@ -56,7 +59,7 @@ class WorldModel(tf.keras.Model): def __init__( self, *, - model_dimension: str = "XS", + model_size: str = "XS", action_space: gym.Space, batch_length_T: int = 64, encoder: tf.keras.Model, @@ -67,7 +70,7 @@ def __init__( """Initializes a WorldModel instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. batch_length_T: The length (T) of the sequences used for training. The @@ -87,7 +90,7 @@ def __init__( the last decoder layer produces the exact, normalized pixel values (not a Gaussian as described in [1]!). num_gru_units: The number of GRU units to use. If None, use - `model_dimension` to figure out this parameter. + `model_size` to figure out this parameter. symlog_obs: Whether to predict decoded observations in symlog space. This should be False for image based observations. According to the paper [1] Appendix E: "NoObsSymlog: This ablation @@ -98,7 +101,7 @@ def __init__( """ super().__init__(name="world_model") - self.model_dimension = model_dimension + self.model_size = model_size self.batch_length_T = batch_length_T self.symlog_obs = symlog_obs self.action_space = action_space @@ -109,7 +112,7 @@ def __init__( # Posterior predictor consisting of an MLP and a RepresentationLayer: # [ht, lt] -> zt. self.posterior_mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=None, # In Danijar's code, the posterior predictor only has a single layer, # no matter the model size: @@ -118,17 +121,15 @@ def __init__( ) # The (posterior) z-state generating layer. self.posterior_representation_layer = RepresentationLayer( - model_dimension=self.model_dimension, + model_size=self.model_size, ) # Dynamics (prior z-state) predictor: ht -> z^t - self.dynamics_predictor = DynamicsPredictor( - model_dimension=self.model_dimension - ) + self.dynamics_predictor = DynamicsPredictor(model_size=self.model_size) # GRU for the RSSM: [at, ht, zt] -> ht+1 self.num_gru_units = get_gru_units( - model_dimension=self.model_dimension, + model_size=self.model_size, override=num_gru_units, ) # Initial h-state variable (learnt). 
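For orientation, the surrounding hunks assemble the RSSM from the components created in this constructor (posterior MLP + representation layer, dynamics predictor, pre-GRU MLP + GRU). Below is a minimal, illustrative sketch of a single recurrent state update; it omits the pre-GRU MLP, unimix, and the categorical sampling done by the RepresentationLayer, and all layer/variable names are hypothetical stand-ins rather than RLlib API:

import tensorflow as tf

B, num_cats, num_classes, act_dim, gru_units, embed_dim = 4, 32, 32, 6, 256, 1024

gru_cell = tf.keras.layers.GRUCell(gru_units)                  # stands in for the sequence model
dynamics_net = tf.keras.layers.Dense(num_cats * num_classes)   # prior z-logits
posterior_net = tf.keras.layers.Dense(num_cats * num_classes)  # posterior z-logits

def rssm_step(h, z, a, obs_embedding):
    # Sequence model: [a_t, z_t] (plus recurrent h_t) -> h_t+1.
    z_flat = tf.reshape(z, [tf.shape(z)[0], -1])
    h_next, _ = gru_cell(tf.concat([a, z_flat], axis=-1), states=[h])
    # Prior ("dynamics") z-logits: computed from h alone (used while dreaming).
    z_prior_logits = dynamics_net(h_next)
    # Posterior z-logits: additionally conditioned on the encoded observation
    # (used when replaying real env sequences).
    z_post_logits = posterior_net(tf.concat([h_next, obs_embedding], axis=-1))
    return h_next, z_prior_logits, z_post_logits

h = tf.zeros([B, gru_units])
z = tf.zeros([B, num_cats, num_classes])
a = tf.zeros([B, act_dim])
emb = tf.zeros([B, embed_dim])
h_next, prior_logits, post_logits = rssm_step(h, z, a, emb)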
@@ -142,17 +143,15 @@ def __init__( ) # The actual sequence model containing the GRU layer. self.sequence_model = SequenceModel( - model_dimension=self.model_dimension, + model_size=self.model_size, action_space=self.action_space, num_gru_units=self.num_gru_units, ) # Reward Predictor: [ht, zt] -> rt. - self.reward_predictor = RewardPredictor(model_dimension=self.model_dimension) + self.reward_predictor = RewardPredictor(model_size=self.model_size) # Continue Predictor: [ht, zt] -> ct. - self.continue_predictor = ContinuePredictor( - model_dimension=self.model_dimension - ) + self.continue_predictor = ContinuePredictor(model_size=self.model_size) # Decoder: [ht, zt] -> x^t. self.decoder = decoder @@ -276,7 +275,7 @@ def forward_train(self, observations, actions, is_first, training=None): # Make actions and `is_first` time-major. actions = tf.transpose( actions, - perm=[1, 0] + list(range(2, len(actions.shape))), # .as_list() TODO + perm=[1, 0] + list(range(2, tf.shape(actions).shape.as_list()[0])), ) is_first = tf.transpose(is_first, perm=[1, 0]) @@ -343,7 +342,7 @@ def forward_train(self, observations, actions, is_first, training=None): h_BxT = tf.reshape(h_t1_to_T, shape=[-1] + h_t1_to_T.shape.as_list()[2:]) z_BxT = tf.reshape(z_t1_to_T, shape=[-1] + z_t1_to_T.shape.as_list()[2:]) - _, obs_distribution = self.decoder(h=h_BxT, z=z_BxT) + obs_distribution_means = self.decoder(h=h_BxT, z=z_BxT) # Compute (predicted) reward distributions. rewards, reward_logits = self.reward_predictor( @@ -356,11 +355,11 @@ def forward_train(self, observations, actions, is_first, training=None): ) # Return outputs for loss computation. - # Note that all shapes are [B, ...] (no time axis). + # Note that all shapes are [BxT, ...] (time axis already folded). return { # Obs. "sampled_obs_symlog_BxT": observations, - "obs_distribution_BxT": obs_distribution, + "obs_distribution_means_BxT": obs_distribution_means, # Rewards. "reward_logits_BxT": reward_logits, "rewards_BxT": rewards, diff --git a/rllib/algorithms/dreamerv3/utils/__init__.py b/rllib/algorithms/dreamerv3/utils/__init__.py new file mode 100644 index 0000000000000..592bbf9b32e82 --- /dev/null +++ b/rllib/algorithms/dreamerv3/utils/__init__.py @@ -0,0 +1,168 @@ +""" +Utility functions for the DreamerV3 ([1]) algorithm. + +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf +""" + +_ALLOWED_MODEL_DIMS = [ + # RLlib debug sizes (not mentioned in [1]). + "nano", + "micro", + "mini", + "XXS", + # Regular sizes (listed in table B in [1]). 
+ "XS", + "S", + "M", + "L", + "XL", +] + + +def get_cnn_multiplier(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + cnn_multipliers = { + "nano": 2, + "micro": 4, + "mini": 8, + "XXS": 16, + "XS": 24, + "S": 32, + "M": 48, + "L": 64, + "XL": 96, + } + return cnn_multipliers[model_size] + + +def get_dense_hidden_units(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + dense_units = { + "nano": 16, + "micro": 32, + "mini": 64, + "XXS": 128, + "XS": 256, + "S": 512, + "M": 640, + "L": 768, + "XL": 1024, + } + return dense_units[model_size] + + +def get_gru_units(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 16, + "micro": 32, + "mini": 64, + "XXS": 128, + "XS": 256, + "S": 512, + "M": 1024, + "L": 2048, + "XL": 4096, + } + return gru_units[model_size] + + +def get_num_z_categoricals(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 4, + "micro": 8, + "mini": 16, + "XXS": 32, + "XS": 32, + "S": 32, + "M": 32, + "L": 32, + "XL": 32, + } + return gru_units[model_size] + + +def get_num_z_classes(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 4, + "micro": 8, + "mini": 16, + "XXS": 32, + "XS": 32, + "S": 32, + "M": 32, + "L": 32, + "XL": 32, + } + return gru_units[model_size] + + +def get_num_curiosity_nets(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + num_curiosity_nets = { + "nano": 8, + "micro": 8, + "mini": 16, + "XXS": 8, + "XS": 8, + "S": 8, + "M": 8, + "L": 8, + "XL": 8, + } + return num_curiosity_nets[model_size] + + +def get_num_dense_layers(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + num_dense_layers = { + "nano": 1, + "micro": 1, + "mini": 1, + "XXS": 1, + "XS": 1, + "S": 2, + "M": 3, + "L": 4, + "XL": 5, + } + return num_dense_layers[model_size] + + +def do_symlog_obs(observation_space, symlog_obs_user_setting): + # If our symlog_obs setting is NOT set specifically (it's set to "auto"), return + # True if we don't have an image observation space, otherwise return False. + + # TODO (sven): Support mixed observation spaces. 
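+ # For reference: symlog(x) = sign(x) * log(|x| + 1) and its inverse
+ # symexp(x) = sign(x) * (exp(|x|) - 1); RLlib's implementations are
+ # `symlog` / `inverse_symlog` in `ray.rllib.utils.tf_utils`.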
+ + is_image_space = len(observation_space.shape) in [2, 3] + return ( + not is_image_space + if symlog_obs_user_setting == "auto" + else symlog_obs_user_setting + ) diff --git a/rllib/algorithms/dreamerv3/utils/debugging.py b/rllib/algorithms/dreamerv3/utils/debugging.py new file mode 100644 index 0000000000000..1a4cf515d9f41 --- /dev/null +++ b/rllib/algorithms/dreamerv3/utils/debugging.py @@ -0,0 +1,185 @@ +import gymnasium as gym +import numpy as np +from PIL import Image, ImageDraw + +from gymnasium.envs.classic_control.cartpole import CartPoleEnv + +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class CartPoleDebug(CartPoleEnv): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + low = np.concatenate([np.array([0.0]), self.observation_space.low]) + high = np.concatenate([np.array([1000.0]), self.observation_space.high]) + + self.observation_space = gym.spaces.Box(low, high, shape=(5,), dtype=np.float32) + + self.timesteps_ = 0 + + def reset(self, *, seed=None, options=None): + ret = super().reset() + self.timesteps_ = 0 + obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) + return obs, ret[1] + + def step(self, action): + ret = super().step(action) + + self.timesteps_ += 1 + + obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) + reward = 0.1 * self.timesteps_ + return (obs, reward) + ret[2:] + + +gym.register("CartPoleDebug-v0", CartPoleDebug) +cartpole_env = gym.make("CartPoleDebug-v0", render_mode="rgb_array") +cartpole_env.reset() + +frozenlake_env = gym.make( + "FrozenLake-v1", render_mode="rgb_array", is_slippery=False, map_name="4x4" +) # desc=["SF", "HG"]) +frozenlake_env.reset() + + +def create_cartpole_dream_image( + dreamed_obs, # real space (not symlog'd) + dreamed_V, # real space (not symlog'd) + dreamed_a, + dreamed_r_tp1, # real space (not symlog'd) + dreamed_ri_tp1, # intrinsic reward + dreamed_c_tp1, # continue flag + value_target, # real space (not symlog'd) + initial_h, + as_tensor=False, +): + # CartPoleDebug + if dreamed_obs.shape == (5,): + # Set the state of our env to the given observation. + cartpole_env.unwrapped.state = np.array(dreamed_obs[1:], dtype=np.float32) + # Normal CartPole-v1 + else: + cartpole_env.unwrapped.state = np.array(dreamed_obs, dtype=np.float32) + + # Produce an RGB-image of the current state. + rgb_array = cartpole_env.render() + + # Add value-, action-, reward-, and continue-prediction information. + image = Image.fromarray(rgb_array) + draw_obj = ImageDraw.Draw(image) + + # fnt = ImageFont.load_default(size=40) + + draw_obj.text( + (5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0) + ) # , font=fnt.font, size=30) + draw_obj.text( + (5, 18), + f"at={'<--' if dreamed_a == 0 else '-->'} ({dreamed_a})", + fill=(0, 0, 0), + ) + draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) + if dreamed_ri_tp1 is not None: + draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) + draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) + draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) + + if dreamed_obs.shape == (5,): + draw_obj.text((20, 100), f"t={dreamed_obs[0]}", fill=(0, 0, 0)) + + # Return image. 
+ np_img = np.asarray(image) + if as_tensor: + return tf.convert_to_tensor(np_img, dtype=tf.uint8) + return np_img + + +def create_frozenlake_dream_image( + dreamed_obs, # real space (not symlog'd) + dreamed_V, # real space (not symlog'd) + dreamed_a, + dreamed_r_tp1, # real space (not symlog'd) + dreamed_ri_tp1, # intrinsic reward + dreamed_c_tp1, # continue flag + value_target, # real space (not symlog'd) + initial_h, + as_tensor=False, +): + frozenlake_env.unwrapped.s = np.argmax(dreamed_obs, axis=0) + + # Produce an RGB-image of the current state. + rgb_array = frozenlake_env.render() + + # Add value-, action-, reward-, and continue-prediction information. + image = Image.fromarray(rgb_array) + draw_obj = ImageDraw.Draw(image) + + draw_obj.text((5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0)) + action_arrow = ( + "<--" + if dreamed_a == 0 + else "v" + if dreamed_a == 1 + else "-->" + if dreamed_a == 2 + else "^" + ) + draw_obj.text((5, 18), f"at={action_arrow} ({dreamed_a})", fill=(0, 0, 0)) + draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) + if dreamed_ri_tp1 is not None: + draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) + draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) + draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) + + # Return image. + np_img = np.asarray(image) + if as_tensor: + return tf.convert_to_tensor(np_img, dtype=tf.uint8) + return np_img + + +if __name__ == "__main__": + # CartPole debug. + rgb_array = create_cartpole_dream_image( + dreamed_obs=np.array([100.0, 1.0, -0.01, 1.5, 0.02]), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.0, + value_target=8.0, + ) + # ImageFont.load("arial.pil") + image = Image.fromarray(rgb_array) + image.show() + + # Normal CartPole. + rgb_array = create_cartpole_dream_image( + dreamed_obs=np.array([1.0, -0.01, 1.5, 0.02]), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.1, + value_target=8.0, + ) + # ImageFont.load("arial.pil") + image = Image.fromarray(rgb_array) + image.show() + + # Frozenlake + rgb_array = create_frozenlake_dream_image( + dreamed_obs=np.array([1.0] + [0.0] * (frozenlake_env.observation_space.n - 1)), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.1, + value_target=8.0, + ) + image = Image.fromarray(rgb_array) + image.show() diff --git a/rllib/algorithms/dreamerv3/utils/env_runner.py b/rllib/algorithms/dreamerv3/utils/env_runner.py new file mode 100644 index 0000000000000..c8db4e8ebc073 --- /dev/null +++ b/rllib/algorithms/dreamerv3/utils/env_runner.py @@ -0,0 +1,548 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from collections import defaultdict +from functools import partial +from typing import List, Tuple + +import gymnasium as gym +import numpy as np +from supersuit.generic_wrappers import resize_v1 +import tree # pip install dm_tree + +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.core.models.base import STATE_IN, STATE_OUT +from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.env.wrappers.atari_wrappers import NoopResetEnv, MaxAndSkipEnv +from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv +from ray.rllib.evaluation.metrics import RolloutMetrics +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.replay_buffers.episode_replay_buffer import _Episode as Episode +from ray.rllib.utils.numpy import one_hot + +_, tf, _ = try_import_tf() + + +class DreamerV3EnvRunner(EnvRunner): + """An environment runner to collect data from vectorized gymnasium environments.""" + + def __init__( + self, + config: AlgorithmConfig, + **kwargs, + ): + """Initializes a DreamerV3EnvRunner instance. + + Args: + config: The config to use to setup this EnvRunner. + """ + super().__init__(config=config) + + # Create the gym.vector.Env object. + # Atari env. + if self.config.env.startswith("ALE/"): + # [2]: "We down-scale the 84 × 84 grayscale images to 64 × 64 pixels so that + # we can apply the convolutional architecture of DreamerV1." + # ... + # "We follow the evaluation protocol of Machado et al. (2018) with 200M + # environment steps, action repeat of 4, a time limit of 108,000 steps per + # episode that correspond to 30 minutes of game play, no access to life + # information, full action space, and sticky actions. Because the world + # model integrates information over time, DreamerV2 does not use frame + # stacking." + # However, in Danijar's repo, Atari100k experiments are configured as: + # noop=30, 64x64x3 (no grayscaling), sticky actions=False, + # full action space=False, + wrappers = [ + partial(gym.wrappers.TimeLimit, max_episode_steps=108000), + partial(resize_v1, x_size=64, y_size=64), # resize to 64x64 + NormalizedImageEnv, + NoopResetEnv, + MaxAndSkipEnv, + ] + + self.env = gym.vector.make( + "GymV26Environment-v0", + env_id=self.config.env, + wrappers=wrappers, + num_envs=self.config.num_envs_per_worker, + asynchronous=self.config.remote_worker_envs, + make_kwargs=dict( + self.config.env_config, **{"render_mode": "rgb_array"} + ), + ) + # DeepMind Control. + elif self.config.env.startswith("DMC/"): + parts = self.config.env.split("/") + assert len(parts) == 3, ( + "ERROR: DMC env must be formatted as 'DMC/[task]/[domain]', e.g. " + f"'DMC/cartpole/swingup'! You provided '{self.config.env}'." + ) + gym.register( + "dmc_env-v0", + lambda from_pixels=True: DMCEnv( + parts[1], parts[2], from_pixels=from_pixels, channels_first=False + ), + ) + self.env = gym.vector.make( + "dmc_env-v0", + wrappers=[ActionClip], + num_envs=self.config.num_envs_per_worker, + asynchronous=self.config.remote_worker_envs, + **dict(self.config.env_config), + ) + # All other (gym) envs. 
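+        # (E.g. classic-control tasks such as CartPole-v1 or Pendulum-v1; FrozenLake
+        # additionally gets its discrete observations one-hot'd via the `OneHot`
+        # wrapper defined at the bottom of this file.)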
+        else:
+            wrappers = [] if self.config.env != "FrozenLake-v1" else [OneHot]
+            self.env = gym.vector.make(
+                self.config.env,
+                wrappers=wrappers,
+                num_envs=self.config.num_envs_per_worker,
+                asynchronous=self.config.remote_worker_envs,
+                **dict(self.config.env_config, **{"render_mode": "rgb_array"}),
+            )
+        self.num_envs = self.env.num_envs
+        assert self.num_envs == self.config.num_envs_per_worker
+
+        # Create our RLModule to compute actions with.
+        if self.config.share_module_between_env_runner_and_learner:
+            # DreamerV3 Algorithm will set this to the local Learner's module.
+            self.module = None
+        # Create our own instance of a DreamerV3RLModule (which then needs to be
+        # weight-synched each iteration).
+        else:
+            policy_dict, _ = self.config.get_multi_agent_setup(env=self.env)
+            module_spec = self.config.get_marl_module_spec(policy_dict=policy_dict)
+            # TODO (sven): DreamerV3 is currently single-agent only.
+            self.module = module_spec.build()[DEFAULT_POLICY_ID]
+
+        self._needs_initial_reset = True
+        self._episodes = [None for _ in range(self.num_envs)]
+
+        # TODO (sven): Move metrics temp storage and collection out of EnvRunner
+        # and RolloutWorkers. These classes should not continue tracking some data
+        # that they have already returned (in a call to `sample()`). Instead, the
+        # episode data should be analyzed where it was sent to (the Algorithm itself
+        # via its replay buffer, etc..).
+        self._done_episodes_for_metrics = []
+        self._ongoing_episodes_for_metrics = defaultdict(list)
+        self._ts_since_last_metrics = 0
+
+    @override(EnvRunner)
+    def sample(
+        self,
+        *,
+        num_timesteps: int = None,
+        num_episodes: int = None,
+        explore: bool = True,
+        random_actions: bool = False,
+        with_render_data: bool = False,
+    ) -> Tuple[List[Episode], List[Episode]]:
+        """Runs and returns a sample (n timesteps or m episodes) on the environment(s).
+
+        Timesteps or episodes are counted in total (across all vectorized
+        sub-environments). For example, if self.num_envs=2 and num_timesteps=10, each
+        sub-environment will be sampled for 5 steps. If self.num_envs=3 and
+        num_episodes=30, each sub-environment will be sampled for 10 episodes.
+
+        Args:
+            num_timesteps: The number of timesteps to sample from the environment(s).
+                Exactly one of `num_timesteps` or `num_episodes` must be provided.
+            num_episodes: The number of full episodes to sample from the environment(s).
+                Exactly one of `num_timesteps` or `num_episodes` must be provided.
+            explore: Whether to use exploration when computing actions.
+            random_actions: Whether to only use random actions. If True, the value of
+                `explore` is ignored.
+            force_reset: Whether to reset the environment(s) before starting to sample.
+                If False, will still reset the environment(s) if they were left in
+                a terminated or truncated state during previous sample calls.
+                (Note: this is handled by the internal `_sample_timesteps()` helper;
+                `sample()` itself always passes `force_reset=False`.)
+            with_render_data: If True, will record rendering images per timestep
+                in the returned Episodes. This data can be used to create video
+                reports.
+                TODO (sven): Note that this is currently only supported when sampling
+                with `num_episodes`.
+
+        Returns:
+            A tuple consisting of a) list of Episode instances that are done and
+            b) list of Episode instances that are still ongoing.
+        """
+        # If no execution details are provided, use self.config.
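+        # (With the RLlib default `batch_mode="truncate_episodes"`, this samples
+        # `rollout_fragment_length * num_envs` timesteps per call; with
+        # "complete_episodes", it samples one full episode per sub-environment.)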
+ if num_timesteps is None and num_episodes is None: + if self.config.batch_mode == "truncate_episodes": + num_timesteps = self.config.rollout_fragment_length * self.num_envs + else: + num_episodes = self.num_envs + + # Sample n timesteps. + if num_timesteps is not None: + return self._sample_timesteps( + num_timesteps=num_timesteps, + explore=explore, + random_actions=random_actions, + force_reset=False, + ) + # Sample n episodes. + else: + # `_sample_episodes` returns only one list (with completed episodes) + # return empty list for incomplete ones. + return ( + self._sample_episodes( + num_episodes=num_episodes, + explore=explore, + random_actions=random_actions, + with_render_data=with_render_data, + ), + [], + ) + + def _sample_timesteps( + self, + num_timesteps: int, + explore: bool = True, + random_actions: bool = False, + force_reset: bool = False, + ) -> Tuple[List[Episode], List[Episode]]: + """Helper method to run n timesteps. + + See docstring of self.sample() for more details. + """ + done_episodes_to_return = [] + + # Get initial states for all `batch_size_B` rows in the forward batch. + initial_states = tree.map_structure( + lambda s: np.repeat(s, self.num_envs, axis=0), + self.module.get_initial_state(), + ) + + # Have to reset the env (on all vector sub-envs). + if force_reset or self._needs_initial_reset: + obs, _ = self.env.reset() + + self._episodes = [Episode() for _ in range(self.num_envs)] + states = initial_states + # Set is_first to True for all rows (all sub-envs just got reset). + is_first = np.ones((self.num_envs,), dtype=np.float32) + self._needs_initial_reset = False + + # Set initial obs and states in the episodes. + for i in range(self.num_envs): + self._episodes[i].add_initial_observation( + initial_observation=obs[i], + initial_state={k: s[i] for k, s in states.items()}, + ) + # Don't reset existing envs; continue in already started episodes. + else: + # Pick up stored observations and states from previous timesteps. + obs = np.stack([eps.observations[-1] for eps in self._episodes]) + # Compile the initial state for each batch row: If episode just started, use + # model's initial state, if not, use state stored last in Episode. + states = { + k: np.stack( + [ + initial_states[k][i] if eps.states is None else eps.states[k] + for i, eps in enumerate(self._episodes) + ] + ) + for k in initial_states.keys() + } + # If a batch row is at the beginning of an episode, set its `is_first` flag + # to 1.0, otherwise 0.0. + is_first = np.zeros((self.num_envs,), dtype=np.float32) + for i, eps in enumerate(self._episodes): + if eps.states is None: + is_first[i] = 1.0 + + # Loop through env for n timesteps. + ts = 0 + while ts < num_timesteps: + # Act randomly. + if random_actions: + actions = self.env.action_space.sample() + # Compute an action using our RLModule. + else: + batch = { + STATE_IN: tree.map_structure( + lambda s: tf.convert_to_tensor(s), states + ), + SampleBatch.OBS: tf.convert_to_tensor(obs), + "is_first": tf.convert_to_tensor(is_first), + } + # Explore or not. + if explore: + outs = self.module.forward_exploration(batch) + else: + outs = self.module.forward_inference(batch) + + # Model outputs one-hot actions (if discrete). Convert to int actions + # as well. 
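+                # E.g. a one-hot action [0.0, 1.0] (CartPole) becomes the int
+                # action 1 via the argmax below.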
+ actions = outs[SampleBatch.ACTIONS].numpy() + if isinstance(self.env.single_action_space, gym.spaces.Discrete): + actions = np.argmax(actions, axis=-1) + states = tree.map_structure(lambda s: s.numpy(), outs[STATE_OUT]) + + obs, rewards, terminateds, truncateds, infos = self.env.step(actions) + ts += self.num_envs + + for i in range(self.num_envs): + s = {k: s[i] for k, s in states.items()} + # The last entry in self.observations[i] is already the reset + # obs of the new episode. + if terminateds[i] or truncateds[i]: + # Finish the episode with the actual terminal observation stored in + # the info dict. + self._episodes[i].add_timestep( + infos["final_observation"][i], + actions[i], + rewards[i], + state=s, + is_terminated=terminateds[i], + is_truncated=truncateds[i], + ) + # Reset h-states to the model's initial ones b/c we are starting a + # new episode. + for k, v in self.module.get_initial_state().items(): + states[k][i] = v.numpy() + is_first[i] = True + done_episodes_to_return.append(self._episodes[i]) + # Create a new episode object. + self._episodes[i] = Episode(observations=[obs[i]], states=s) + else: + self._episodes[i].add_timestep( + obs[i], actions[i], rewards[i], state=s + ) + is_first[i] = False + + # Return done episodes ... + self._done_episodes_for_metrics.extend(done_episodes_to_return) + # ... and all ongoing episode chunks. Also, make sure, we return + # a copy and start new chunks so that callers of this function + # don't alter our ongoing and returned Episode objects. + ongoing_episodes = self._episodes + self._episodes = [eps.create_successor() for eps in self._episodes] + for eps in ongoing_episodes: + self._ongoing_episodes_for_metrics[eps.id_].append(eps) + + self._ts_since_last_metrics += ts + + return done_episodes_to_return, ongoing_episodes + + def _sample_episodes( + self, + num_episodes: int, + explore: bool = True, + random_actions: bool = False, + with_render_data: bool = False, + ) -> List[Episode]: + """Helper method to run n episodes. + + See docstring of `self.sample()` for more details. + """ + done_episodes_to_return = [] + + obs, _ = self.env.reset() + episodes = [Episode() for _ in range(self.num_envs)] + + # Multiply states n times according to our vector env batch size (num_envs). 
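+        # (Each initial-state leaf is repeated along the batch axis so that every
+        # vectorized sub-environment gets its own initial-state row.)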
+ states = tree.map_structure( + lambda s: np.repeat(s, self.num_envs, axis=0), + self.module.get_initial_state(), + ) + is_first = np.ones((self.num_envs,), dtype=np.float32) + + render_images = [None] * self.num_envs + if with_render_data: + render_images = [e.render() for e in self.env.envs] + + for i in range(self.num_envs): + episodes[i].add_initial_observation( + initial_observation=obs[i], + initial_state={k: s[i] for k, s in states.items()}, + initial_render_image=render_images[i], + ) + + eps = 0 + while eps < num_episodes: + if random_actions: + actions = self.env.action_space.sample() + else: + batch = { + STATE_IN: tree.map_structure( + lambda s: tf.convert_to_tensor(s), states + ), + SampleBatch.OBS: tf.convert_to_tensor(obs), + "is_first": tf.convert_to_tensor(is_first), + } + + if explore: + outs = self.module.forward_exploration(batch) + else: + outs = self.module.forward_inference(batch) + + actions = outs[SampleBatch.ACTIONS].numpy() + if isinstance(self.env.single_action_space, gym.spaces.Discrete): + actions = np.argmax(actions, axis=-1) + states = tree.map_structure(lambda s: s.numpy(), outs[STATE_OUT]) + + obs, rewards, terminateds, truncateds, infos = self.env.step(actions) + if with_render_data: + render_images = [e.render() for e in self.env.envs] + + for i in range(self.num_envs): + s = {k: s[i] for k, s in states.items()} + # The last entry in self.observations[i] is already the reset + # obs of the new episode. + if terminateds[i] or truncateds[i]: + eps += 1 + + episodes[i].add_timestep( + infos["final_observation"][i], + actions[i], + rewards[i], + state=s, + is_terminated=terminateds[i], + is_truncated=truncateds[i], + ) + done_episodes_to_return.append(episodes[i]) + + # Also early-out if we reach the number of episodes within this + # for-loop. + if eps == num_episodes: + break + + # Reset h-states to the model's initial ones b/c we are starting a + # new episode. + for k, v in self.module.get_initial_state().items(): + states[k][i] = v.numpy() + is_first[i] = True + + episodes[i] = Episode( + observations=[obs[i]], + states=s, + render_images=[render_images[i]], + ) + else: + episodes[i].add_timestep( + obs[i], + actions[i], + rewards[i], + state=s, + render_image=render_images[i], + ) + is_first[i] = False + + self._done_episodes_for_metrics.extend(done_episodes_to_return) + self._ts_since_last_metrics += sum(len(eps) for eps in done_episodes_to_return) + + # If user calls sample(num_timesteps=..) after this, we must reset again + # at the beginning. + self._needs_initial_reset = True + + return done_episodes_to_return + + # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this + # API. Instead Algorithm should compile episode metrics itself via its local + # buffer. + def get_metrics(self) -> List[RolloutMetrics]: + # Compute per-episode metrics (only on already completed episodes). + metrics = [] + for eps in self._done_episodes_for_metrics: + episode_length = len(eps) + episode_reward = eps.get_return() + # Don't forget about the already returned chunks of this episode. 
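+            # (An episode that was split across several `sample()` calls is summed
+            # up here: total length/return = this finished chunk plus all chunks of
+            # the same episode ID that were returned earlier.)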
+ if eps.id_ in self._ongoing_episodes_for_metrics: + for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: + episode_length += len(eps2) + episode_reward += eps2.get_return() + del self._ongoing_episodes_for_metrics[eps.id_] + + metrics.append( + RolloutMetrics( + episode_length=episode_length, + episode_reward=episode_reward, + ) + ) + + self._done_episodes_for_metrics.clear() + self._ts_since_last_metrics = 0 + + return metrics + + # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this + # API. Replace by proper state overriding via `EnvRunner.set_state()` + def set_weights(self, weights, global_vars=None): + """Writes the weights of our (single-agent) RLModule.""" + if self.module is None: + assert self.config.share_module_between_env_runner_and_learner + else: + self.module.set_state(weights[DEFAULT_POLICY_ID]) + + @override(EnvRunner) + def assert_healthy(self): + # Make sure, we have built our gym.vector.Env and RLModule properly. + assert self.env and self.module + + @override(EnvRunner) + def stop(self): + # Close our env object via gymnasium's API. + self.env.close() + + +class NormalizedImageEnv(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + -1.0, + 1.0, + shape=self.observation_space.shape, + dtype=np.float32, + ) + + # Divide by scale and center around 0.0, such that observations are in the range + # of -1.0 and 1.0. + def observation(self, observation): + return (observation.astype(np.float32) / 128.0) - 1.0 + + +class OneHot(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + 0.0, 1.0, shape=(self.observation_space.n,), dtype=np.float32 + ) + + def reset(self, **kwargs): + ret = self.env.reset(**kwargs) + return self._get_obs(ret[0]), ret[1] + + def step(self, action): + ret = self.env.step(action) + return self._get_obs(ret[0]), ret[1], ret[2], ret[3], ret[4] + + def _get_obs(self, obs): + return one_hot(obs, depth=self.observation_space.shape[0]) + + +class ActionClip(gym.ActionWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._low = -1.0 + self._high = 1.0 + self.action_space = gym.spaces.Box( + self._low, + self._high, + self.action_space.shape, + self.action_space.dtype, + ) + + def action(self, action): + return np.clip(action, self._low, self._high) diff --git a/rllib/algorithms/dreamerv3/utils/summaries.py b/rllib/algorithms/dreamerv3/utils/summaries.py new file mode 100644 index 0000000000000..d781a33e40d6b --- /dev/null +++ b/rllib/algorithms/dreamerv3/utils/summaries.py @@ -0,0 +1,329 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +import numpy as np + +from ray.rllib.algorithms.dreamerv3.utils.debugging import ( + create_cartpole_dream_image, + create_frozenlake_dream_image, +) +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.tf_utils import inverse_symlog + + +def _summarize(*, results, data_to_summarize, keys_to_log, include_histograms=False): + for k in keys_to_log: + if data_to_summarize[k].shape == (): + results.update({k: data_to_summarize[k]}) + elif include_histograms: + results.update({k: data_to_summarize[k]}) + + +def reconstruct_obs_from_h_and_z( + h_t0_to_H, + z_t0_to_H, + dreamer_model, + obs_dims_shape, +): + """Returns""" + shape = h_t0_to_H.shape + T = shape[0] # inputs are time-major + B = shape[1] + # Compute actual observations using h and z and the decoder net. + # Note that the last h-state (T+1) is NOT used here as it's already part of + # a new trajectory. + # Use mean() of the Gaussian, no sample! -> No need to construct dist object here. + reconstructed_obs_distr_means_TxB = dreamer_model.world_model.decoder( + # Fold time rank. + h=np.reshape(h_t0_to_H, (T * B, -1)), + z=np.reshape(z_t0_to_H, (T * B,) + z_t0_to_H.shape[2:]), + ) + # Unfold time rank again. + reconstructed_obs_T_B = np.reshape( + reconstructed_obs_distr_means_TxB, (T, B) + obs_dims_shape + ) + # Return inverse symlog'd (real env obs space) reconstructed observations. + return reconstructed_obs_T_B + + +def report_dreamed_trajectory( + *, + results, + env, + dreamer_model, + obs_dims_shape, + batch_indices=(0,), + desc=None, + include_images=True, +): + if not include_images: + return + + dream_data = results["dream_data"] + dreamed_obs_H_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], + z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], + dreamer_model=dreamer_model, + obs_dims_shape=obs_dims_shape, + ) + func = ( + create_cartpole_dream_image + if env.startswith("CartPole") + else create_frozenlake_dream_image + ) + # Take 0th dreamed trajectory and produce series of images. + for b in batch_indices: + images = [] + for t in range(len(dreamed_obs_H_B) - 1): + images.append( + func( + dreamed_obs=dreamed_obs_H_B[t][b], + dreamed_V=dream_data["values_dreamed_t0_to_H_BxT"][t][b], + dreamed_a=(dream_data["actions_ints_dreamed_t0_to_H_BxT"][t][b]), + dreamed_r_tp1=(dream_data["rewards_dreamed_t0_to_H_BxT"][t + 1][b]), + # `DISAGREE_intrinsic_rewards_H_B` are shifted by 1 already + # (from t1 to H, not t0 to H like all other data here). + dreamed_ri_tp1=( + results["DISAGREE_intrinsic_rewards_H_BxT"][t][b] + if "DISAGREE_intrinsic_rewards_H_BxT" in results + else None + ), + dreamed_c_tp1=( + dream_data["continues_dreamed_t0_to_H_BxT"][t + 1][b] + ), + value_target=results["VALUE_TARGETS_H_BxT"][t][b], + initial_h=dream_data["h_states_t0_to_H_BxT"][t][b], + as_tensor=True, + ).numpy() + ) + # Concat images along width-axis (so they show as a "film sequence" next to each + # other). + results.update( + { + f"dreamed_trajectories{('_'+desc) if desc else ''}_B{b}": ( + np.concatenate(images, axis=1) + ), + } + ) + + +def report_predicted_vs_sampled_obs( + *, + results, + sample, + batch_size_B, + batch_length_T, + symlog_obs: bool = True, +): + """Summarizes sampled data (from the replay buffer) vs world-model predictions. + + World model predictions are based on the posterior states (z computed from actual + observation encoder input + the current h-states). 
+ + Observations: Computes MSE (sampled vs predicted/recreated) over all features. + For image observations, also creates direct image comparisons (sampled images + vs predicted (posterior) ones). + Rewards: Compute MSE (sampled vs predicted). + Continues: Compute MSE (sampled vs predicted). + + Args: + results: The results dict that was returned by `LearnerGroup.update()`. + sample: The sampled data (dict) from the replay buffer. Already tf-tensor + converted. + batch_size_B: The batch size (B). This is the number of trajectories sampled + from the buffer. + batch_length_T: The batch length (T). This is the length of an individual + trajectory sampled from the buffer. + """ + predicted_observation_means_BxT = results[ + "WORLD_MODEL_fwd_out_obs_distribution_means_BxT" + ] + _report_obs( + results=results, + computed_float_obs_B_T_dims=np.reshape( + predicted_observation_means_BxT, + (batch_size_B, batch_length_T) + sample[SampleBatch.OBS].shape[2:], + ), + sampled_obs_B_T_dims=sample[SampleBatch.OBS], + descr_prefix="WORLD_MODEL", + descr_obs=f"predicted_posterior_T{batch_length_T}", + symlog_obs=symlog_obs, + ) + + +def report_dreamed_eval_trajectory_vs_samples( + *, + results, + dream_data, + sample, + burn_in_T, + dreamed_T, + dreamer_model, + symlog_obs: bool = True, +): + # Obs MSE. + dreamed_obs_T_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], + z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], + dreamer_model=dreamer_model, + obs_dims_shape=sample[SampleBatch.OBS].shape[2:], + ) + t0 = burn_in_T - 1 + tH = t0 + dreamed_T + # Observation MSE and - if applicable - images comparisons. + mse_sampled_vs_dreamed_obs = _report_obs( + results=results, + # Have to transpose b/c dreamed data is time-major. + computed_float_obs_B_T_dims=np.transpose( + dreamed_obs_T_B, + axes=[1, 0] + list(range(2, len(dreamed_obs_T_B.shape))), + ), + sampled_obs_B_T_dims=sample[SampleBatch.OBS][:, t0 : tH + 1], + descr_prefix="EVALUATION", + descr_obs=f"dreamed_prior_H{dreamed_T}", + symlog_obs=symlog_obs, + ) + + # Reward MSE. + _report_rewards( + results=results, + computed_rewards=dream_data["rewards_dreamed_t0_to_H_BxT"], + sampled_rewards=sample[SampleBatch.REWARDS][:, t0 : tH + 1], + descr_prefix="EVALUATION", + descr_reward=f"dreamed_prior_H{dreamed_T}", + ) + + # Continues MSE. + _report_continues( + results=results, + computed_continues=dream_data["continues_dreamed_t0_to_H_BxT"], + sampled_continues=(1.0 - sample["is_terminated"])[:, t0 : tH + 1], + descr_prefix="EVALUATION", + descr_cont=f"dreamed_prior_H{dreamed_T}", + ) + return mse_sampled_vs_dreamed_obs + + +def report_sampling_and_replay_buffer(*, replay_buffer): + episodes_in_buffer = replay_buffer.get_num_episodes() + ts_in_buffer = replay_buffer.get_num_timesteps() + replayed_steps = replay_buffer.get_sampled_timesteps() + added_steps = replay_buffer.get_added_timesteps() + + # Summarize buffer, sampling, and train ratio stats. + return { + "BUFFER_capacity": replay_buffer.capacity, + "BUFFER_size_num_episodes": episodes_in_buffer, + "BUFFER_size_timesteps": ts_in_buffer, + "BUFFER_replayed_steps": replayed_steps, + "BUFFER_added_steps": added_steps, + } + + +def _report_obs( + *, + results, + computed_float_obs_B_T_dims, + sampled_obs_B_T_dims, + descr_prefix=None, + descr_obs, + symlog_obs, +): + """Summarizes computed- vs sampled observations: MSE and (if applicable) images. + + Args: + computed_float_obs_B_T_dims: Computed float observations + (not clipped, not cast'd). Shape=(B, T, [dims ...]). 
+ sampled_obs_B_T_dims: Sampled observations (as-is from the environment, meaning + this could be uint8, 0-255 clipped images). Shape=(B, T, [dims ...]). + B: The batch size B (see shapes of `computed_float_obs_B_T_dims` and + `sampled_obs_B_T_dims` above). + T: The batch length T (see shapes of `computed_float_obs_B_T_dims` and + `sampled_obs_B_T_dims` above). + descr: A string used to describe the computed data to be used in the TB + summaries. + """ + # Videos: Create summary, comparing computed images with actual sampled ones. + # 4=[B, T, w, h] grayscale image; 5=[B, T, w, h, C] RGB image. + if len(sampled_obs_B_T_dims.shape) in [4, 5]: + descr_prefix = (descr_prefix + "_") if descr_prefix else "" + + if symlog_obs: + computed_float_obs_B_T_dims = inverse_symlog(computed_float_obs_B_T_dims) + + # Restore image pixels from normalized (non-symlog'd) data. + if not symlog_obs: + computed_float_obs_B_T_dims = (computed_float_obs_B_T_dims + 1.0) * 128 + sampled_obs_B_T_dims = (sampled_obs_B_T_dims + 1.0) * 128 + sampled_obs_B_T_dims = np.clip(sampled_obs_B_T_dims, 0.0, 255.0).astype( + np.uint8 + ) + computed_images = np.clip(computed_float_obs_B_T_dims, 0.0, 255.0).astype( + np.uint8 + ) + # Concat sampled and computed images along the height axis (3) such that + # real images show below respective predicted ones. + # (B, T, C, h, w) + sampled_vs_computed_images = np.concatenate( + [computed_images, sampled_obs_B_T_dims], + axis=3, + ) + # Add grayscale dim, if necessary. + if len(sampled_obs_B_T_dims.shape) == 2 + 2: + sampled_vs_computed_images = np.expand_dims(sampled_vs_computed_images, -1) + + results.update( + {f"{descr_prefix}sampled_vs_{descr_obs}_videos": sampled_vs_computed_images} + ) + + # return mse_sampled_vs_computed_obs + + +def _report_rewards( + *, + results, + computed_rewards, + sampled_rewards, + descr_prefix=None, + descr_reward, +): + descr_prefix = (descr_prefix + "_") if descr_prefix else "" + mse_sampled_vs_computed_rewards = np.mean( + np.square(computed_rewards - sampled_rewards) + ) + mse_sampled_vs_computed_rewards = np.mean(mse_sampled_vs_computed_rewards) + results.update( + { + f"{descr_prefix}sampled_vs_{descr_reward}_rewards_mse": ( + mse_sampled_vs_computed_rewards + ), + } + ) + + +def _report_continues( + *, + results, + computed_continues, + sampled_continues, + descr_prefix=None, + descr_cont, +): + descr_prefix = (descr_prefix + "_") if descr_prefix else "" + # Continue MSE. + mse_sampled_vs_computed_continues = np.mean( + np.square(computed_continues - sampled_continues.astype(np.float32)) + ) + results.update( + { + f"{descr_prefix}sampled_vs_{descr_cont}_continues_mse": ( + mse_sampled_vs_computed_continues + ), + } + ) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index d435e469b23ce..81cb8d0627bde 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -482,12 +482,12 @@ def training_step(self) -> ResultDict: # workers. 
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: if self.workers.num_remote_workers() > 0: - from_worker_or_trainer = None + from_worker_or_learner_group = None if self.config._enable_learner_api: # sync weights from learner_group to all rollout workers - from_worker_or_trainer = self.learner_group + from_worker_or_learner_group = self.learner_group self.workers.sync_weights( - from_worker_or_trainer=from_worker_or_trainer, + from_worker_or_learner_group=from_worker_or_learner_group, policies=policies_to_update, global_vars=global_vars, ) diff --git a/rllib/algorithms/registry.py b/rllib/algorithms/registry.py index 5387420cc5230..5352814f5e4e4 100644 --- a/rllib/algorithms/registry.py +++ b/rllib/algorithms/registry.py @@ -114,6 +114,12 @@ def _import_dreamer(): return dreamer.Dreamer, dreamer.Dreamer.get_default_config() +def _import_dreamerv3(): + import ray.rllib.algorithms.dreamerv3 as dreamerv3 + + return dreamerv3.DreamerV3, dreamerv3.DreamerV3.get_default_config() + + def _import_dt(): import ray.rllib.algorithms.dt as dt @@ -239,6 +245,7 @@ def _import_leela_chess_zero(): "DDPPO": _import_ddppo, "DQN": _import_dqn, "Dreamer": _import_dreamer, + "DreamerV3": _import_dreamerv3, "DT": _import_dt, "IMPALA": _import_impala, "APPO": _import_appo, @@ -278,6 +285,7 @@ def _import_leela_chess_zero(): "DDPPO": "DDPPO", "DQN": "DQN", "Dreamer": "Dreamer", + "DreamerV3": "DreamerV3", "DT": "DT", "Impala": "IMPALA", "APPO": "APPO", diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 9bbff1f7f0877..2bde70aa69ea4 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -147,15 +147,12 @@ def test_detect_atari_env(self): config = AlgorithmConfig().environment( env="ALE/Breakout-v5", env_config={"frameskip": 1} ) - config.validate() self.assertTrue(config.is_atari) config = AlgorithmConfig().environment(env="ALE/Pong-v5") - config.validate() self.assertTrue(config.is_atari) config = AlgorithmConfig().environment(env="CartPole-v1") - config.validate() # We do not auto-detect callable env makers for Atari envs. self.assertFalse(config.is_atari) @@ -166,12 +163,10 @@ def test_detect_atari_env(self): make_kwargs={"frameskip": 1}, ) ) - config.validate() # We do not auto-detect callable env makers for Atari envs. self.assertFalse(config.is_atari) config = AlgorithmConfig().environment(env="NotAtari") - config.validate() self.assertFalse(config.is_atari) def test_rl_module_api(self): diff --git a/rllib/core/learner/tf/tf_learner.py b/rllib/core/learner/tf/tf_learner.py index 2cb9cdeb049aa..2cc22a725cf1b 100644 --- a/rllib/core/learner/tf/tf_learner.py +++ b/rllib/core/learner/tf/tf_learner.py @@ -376,7 +376,7 @@ def _make_distributed_strategy_if_necessary(self) -> "tf.distribute.Strategy": devices = tf.config.list_logical_devices("GPU") assert self._local_gpu_idx < len(devices), ( f"local_gpu_idx {self._local_gpu_idx} is not a valid GPU id or is " - " not available." + "not available." ) local_gpu = [devices[self._local_gpu_idx].name] strategy = tf.distribute.MirroredStrategy(devices=local_gpu) @@ -431,10 +431,11 @@ def helper(_batch): # in-efficient. However, for tf>=2.12, it works also w/o this conversion # so remove this after we upgrade officially to tf==2.12. 
_batch = NestedDict(_batch) - with tf.GradientTape() as tape: + with tf.GradientTape(persistent=True) as tape: fwd_out = self._module.forward_train(_batch) loss_per_module = self.compute_loss(fwd_out=fwd_out, batch=_batch) gradients = self.compute_gradients(loss_per_module, gradient_tape=tape) + del tape postprocessed_gradients = self.postprocess_gradients(gradients) self.apply_gradients(postprocessed_gradients) diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index b6478d51d09d0..6aed0b9850521 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -285,7 +285,19 @@ class RLModule(abc.ABC): def __init__(self, config: RLModuleConfig): self.config = config + # Make sure, `setup()` is only called once, no matter what. In some cases + # of multiple inheritance (and with our __post_init__ functionality in place, + # this might get called twice. + if hasattr(self, "_is_setup") and self._is_setup: + raise RuntimeError( + "`RLModule.setup()` called twice within your RLModule implementation " + f"{self}! Make sure you are using the proper inheritance order " + "(TorchRLModule before [Algo]RLModule) or (TfRLModule before " + "[Algo]RLModule) and that you are using `super().__init__(...)` in " + "your custom constructor." + ) self.setup() + self._is_setup = True def __init_subclass__(cls, **kwargs): # Automatically add a __post_init__ method to all subclasses of RLModule. diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 100b815d2b621..21b2601b7e05f 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -356,7 +356,9 @@ def num_remote_worker_restarts(self) -> int: def sync_weights( self, policies: Optional[List[PolicyID]] = None, - from_worker_or_trainer: Optional[Union[RolloutWorker, LearnerGroup]] = None, + from_worker_or_learner_group: Optional[ + Union[RolloutWorker, LearnerGroup] + ] = None, to_worker_indices: Optional[List[int]] = None, global_vars: Optional[Dict[str, TensorType]] = None, timeout_seconds: Optional[int] = 0, @@ -369,7 +371,7 @@ def sync_weights( Args: policies: Optional list of PolicyIDs to sync weights for. If None (default), sync weights to/from all policies. - from_worker_or_trainer: Optional (local) RolloutWorker instance or + from_worker_or_learner_group: Optional (local) RolloutWorker instance or LearnerGroup instance to sync from. If None (default), sync from this WorkerSet's local worker. to_worker_indices: Optional list of worker indices to sync the @@ -381,16 +383,16 @@ def sync_weights( for any sync calls to finish). This significantly improves algorithm performance. """ - if self.local_worker() is None and from_worker_or_trainer is None: + if self.local_worker() is None and from_worker_or_learner_group is None: raise TypeError( - "No `local_worker` in WorkerSet, must provide `from_worker` " - "arg in `sync_weights()`!" + "No `local_worker` in WorkerSet, must provide " + "`from_worker_or_learner_group` arg in `sync_weights()`!" ) # Only sync if we have remote workers or `from_worker_or_trainer` is provided. 
weights = None - if self.num_remote_workers() or from_worker_or_trainer is not None: - weights_src = from_worker_or_trainer or self.local_worker() + if self.num_remote_workers() or from_worker_or_learner_group is not None: + weights_src = from_worker_or_learner_group or self.local_worker() if weights_src is None: raise ValueError( @@ -414,10 +416,10 @@ def set_weight(w): timeout_seconds=timeout_seconds, ) - # If `from_worker` is provided, also sync to this WorkerSet's + # If `from_worker_or_learner_group` is provided, also sync to this WorkerSet's # local worker. if self.local_worker() is not None: - if from_worker_or_trainer is not None: + if from_worker_or_learner_group is not None: self.local_worker().set_weights(weights, global_vars=global_vars) # If `global_vars` is provided and local worker exists -> Update its # global_vars. diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index 8a4093fb0e2d5..4df6b2724fb3d 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -870,7 +870,12 @@ def _compute_actions_helper_rl_module_explore( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - assert action_dist + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_exploration()` method must return a dict" + f"with either the {SampleBatch.ACTIONS} key or the " + f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) actions = action_dist.sample() # Anything but action_dist and state_out is an extra fetch @@ -926,7 +931,12 @@ def _compute_actions_helper_rl_module_inference( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - assert action_dist + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_inference()` method must return a dict" + f"with either the {SampleBatch.ACTIONS} key or the " + f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) actions = action_dist.sample() # Anything but action_dist and state_out is an extra fetch diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 4165da80a1f8d..bef3c070d81a4 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1147,7 +1147,12 @@ def _compute_action_helper( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - assert action_dist + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_exploration()` method must return" + f" a dict with either the {SampleBatch.ACTIONS} key or the " + f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) actions = action_dist.sample() # Compute action-logp and action-prob from distribution and add to @@ -1171,7 +1176,12 @@ def _compute_action_helper( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - assert action_dist + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_inference()` method must return" + f" a dict with either the {SampleBatch.ACTIONS} key or the " + f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) actions = action_dist.sample() # Anything but actions and state_out is an extra fetch. 
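(For context on the KeyError messages introduced above: they spell out the output
contract of an RLModule's forward passes. Below is a rough, hypothetical sketch of a
conforming module, assuming made-up layer sizes and the usual `_forward_exploration()`
override point; it is an illustration of the contract only, not code from this change.)

import torch

from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule
from ray.rllib.policy.sample_batch import SampleBatch


class MyDiscreteTorchModule(TorchRLModule):
    def setup(self):
        # Hypothetical sizes; a real module would derive them from
        # self.config.observation_space / self.config.action_space.
        self.encoder = torch.nn.Linear(4, 64)
        self.pi = torch.nn.Linear(64, 2)

    def _forward_exploration(self, batch, **kwargs):
        # Returning ACTION_DIST_INPUTS (action logits) satisfies the contract; the
        # policy then builds the action distribution and samples from it. Returning
        # SampleBatch.ACTIONS directly would satisfy it as well.
        logits = self.pi(torch.relu(self.encoder(batch[SampleBatch.OBS])))
        return {SampleBatch.ACTION_DIST_INPUTS: logits}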
diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index 0a9303a9e47d5..0f945dd7db82c 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -57,6 +57,12 @@ action="store_true", help="Run ray in local mode for easier debugging.", ) +parser.add_argument( + "--num-samples", + type=int, + default=1, + help="The number of seeds/samples to run with the given experiment config.", +) parser.add_argument( "--override-mean-reward", type=float, @@ -103,12 +109,14 @@ # Loop through all collected files. for file in files: + config_is_python = False # For python files, need to make sure, we only deliver the module name into the # `load_experiments_from_file` function (everything from "/ray/rllib" on). if file.endswith(".py"): if file.endswith("__init__.py"): # weird CI learning test (BAZEL) case continue experiments = load_experiments_from_file(file, SupportedFileType.python) + config_is_python = True else: experiments = load_experiments_from_file(file, SupportedFileType.yaml) @@ -118,13 +126,16 @@ exp = list(experiments.values())[0] + # Set the number of samples to run. + exp["num_samples"] = args.num_samples + # Override framework setting with the command line one, if provided. # Otherwise, will use framework setting in file (or default: torch). if args.framework is not None: exp["config"]["framework"] = args.framework # Override env setting if given on command line. if args.env is not None: - exp["config"]["env"] = args.env + exp["config"]["env"] = exp["env"] = args.env # Override the mean reward if specified. This is used by the ray ci # for overriding the episode reward mean for tf2 tests for off policy @@ -139,19 +150,23 @@ print(f"Skipping framework='{args.framework}' for QMIX.") continue - # Always run with eager-tracing when framework=tf2 if not in local-mode. - # Ignore this if the yaml explicitly tells us to disable eager tracing + # Always run with eager-tracing when framework=tf2, if not in local-mode + # and unless the yaml explicitly tells us to disable eager tracing. if ( - args.framework == "tf2" + (args.framework == "tf2" or exp["config"].get("framework") == "tf2") and not args.local_mode - and not exp["config"].get("eager_tracing") is False + # Note: This check will always fail for python configs, b/c normally, + # algorithm configs have `self.eager_tracing=False` by default. + # Thus, you'd have to set `eager_tracing` to True explicitly in your python + # config to make sure we are indeed using eager tracing. + and exp["config"].get("eager_tracing") is not False ): - exp["config"]["eager_tracing"] = True - # Print out the actual config. - print("== Test config ==") - print(yaml.dump(experiments)) + # Print out the actual config (not for py files as yaml.dump weirdly fails). + if not config_is_python: + print("== Test config ==") + print(yaml.dump(experiments)) # Try running each test 3 times and make sure it reaches the given # reward. diff --git a/rllib/tuned_examples/dreamerv3/__init__.py b/rllib/tuned_examples/dreamerv3/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/rllib/tuned_examples/dreamerv3/atari_100k.py b/rllib/tuned_examples/dreamerv3/atari_100k.py new file mode 100644 index 0000000000000..ef6731d6e2e2a --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/atari_100k.py @@ -0,0 +1,71 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" + +# Run with: +# python run_regression_tests.py --dir [this file] --env ALE/[gym ID e.g. Pong-v5] + +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + + +# Number of GPUs to run on. +num_gpus = 1 + +config = ( + DreamerV3Config() + # Switch on eager_tracing by default. + .framework("tf2", eager_tracing=True) + .resources( + num_learner_workers=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner_worker=1 if num_gpus else 0, + num_cpus_for_local_worker=1, + ) + # TODO (sven): concretize this: If you use >1 GPU and increase the batch size + # accordingly, you might also want to increase the number of envs per worker + .rollouts( + num_envs_per_worker=(num_gpus or 1), + remote_worker_envs=True, + ) + .environment( + # [2]: "We follow the evaluation protocol of Machado et al. (2018) with 200M + # environment steps, action repeat of 4, a time limit of 108,000 steps per + # episode that correspond to 30 minutes of game play, no access to life + # information, full action space, and sticky actions. Because the world model + # integrates information over time, DreamerV2 does not use frame stacking. + # The experiments use a single-task setup where a separate agent is trained + # for each game. Moreover, each agent uses only a single environment instance. + env_config={ + # "sticky actions" but not according to Danijar's 100k configs. + "repeat_action_probability": 0.0, + # "full action space" but not according to Danijar's 100k configs. + "full_action_space": False, + # Already done by MaxAndSkip wrapper: "action repeat" == 4. + "frameskip": 1, + } + ) + .reporting( + metrics_num_episodes_for_smoothing=(num_gpus or 1), + report_images_and_videos=False, + report_dream_data=False, + report_individual_batch_item_stats=False, + ) + # See Appendix A. + .training( + model_size="S", + training_ratio=1024, + batch_size_B=16 * (num_gpus or 1), + # TODO + model={ + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "S", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/cartpole.py b/rllib/tuned_examples/dreamerv3/cartpole.py new file mode 100644 index 0000000000000..b270d6c3b3137 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/cartpole.py @@ -0,0 +1,30 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment("CartPole-v1") + .training( + model_size="XS", + training_ratio=1024, + # TODO + model={ + "batch_size_B": 16, + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "XS", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py new file mode 100644 index 0000000000000..a8938ce142123 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py @@ -0,0 +1,39 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] --env DMC/[task]/[domain] +# e.g. --env=DMC/cartpole/swingup + +config = ( + DreamerV3Config() + # Use image observations. + .environment(env_config={"from_pixels": True}) + .resources( + num_learner_workers=1, + num_gpus_per_learner_worker=1, + num_cpus_for_local_worker=1, + ) + .rollouts(num_envs_per_worker=4, remote_worker_envs=True) + # See Appendix A. + .training( + model_size="S", + training_ratio=512, + # TODO + model={ + "batch_size_B": 16, + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "S", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py b/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py new file mode 100644 index 0000000000000..03e9b40def8a3 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py @@ -0,0 +1,39 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment( + "FrozenLake-v1", + env_config={ + "desc": [ + "SF", + "HG", + ], + "is_slippery": False, + }, + ) + .training( + model_size="XS", + training_ratio=1024, + # TODO + model={ + "batch_size_B": 16, + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "XS", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py b/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py new file mode 100644 index 0000000000000..9b7b260d595e9 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py @@ -0,0 +1,36 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment( + "FrozenLake-v1", + env_config={ + "map_name": "4x4", + "is_slippery": False, + }, + ) + .training( + model_size="nano", + training_ratio=1024, + # TODO + model={ + "batch_size_B": 16, + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "nano", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/pendulum.py b/rllib/tuned_examples/dreamerv3/pendulum.py new file mode 100644 index 0000000000000..4acc4b9aa85a9 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/pendulum.py @@ -0,0 +1,19 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment("Pendulum-v1") + .training(model_size="XS", training_ratio=1024) +) diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 6c9c9badd7a03..0bee53bbd5590 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -30,6 +30,7 @@ TRAINING_ITERATION_TIMER = "training_iteration" APPLY_GRADS_TIMER = "apply_grad" COMPUTE_GRADS_TIMER = "compute_grads" +GARBAGE_COLLECTION_TIMER = "garbage_collection" SYNCH_WORKER_WEIGHTS_TIMER = "synch_weights" GRAD_WAIT_TIMER = "grad_wait" SAMPLE_TIMER = "sample" diff --git a/rllib/utils/replay_buffers/episode_replay_buffer.py b/rllib/utils/replay_buffers/episode_replay_buffer.py index 787c25b1aae01..e95fc50432489 100644 --- a/rllib/utils/replay_buffers/episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/episode_replay_buffer.py @@ -1,4 +1,5 @@ from collections import deque +import copy from typing import Any, Dict, List, Optional, Union import uuid @@ -109,6 +110,15 @@ def add(self, episodes: Union[List["_Episode"], "_Episode"]): episodes = [episodes] for eps in episodes: + # Make sure we don't change what's coming in from the user. + # TODO (sven): It'd probably be better to make sure in the EnvRunner to not + # hold on to episodes (for metrics purposes only) that we are returning + # back to the user from `EnvRunner.sample()`. Then we wouldn't have to + # do any copying. Instead, either compile the metrics right away on the + # EnvRunner OR compile metrics entirely on the Algorithm side (this is + # actually preferred). + eps = copy.deepcopy(eps) + self._num_timesteps += len(eps) self._num_timesteps_added += len(eps) @@ -242,7 +252,7 @@ def sample( ) episode = self.episodes[episode_idx] - # Starting a new chunk, set continue to False. + # Starting a new chunk, set is_first to True. is_first[B][T] = True # Begin of new batch item (row). @@ -255,7 +265,7 @@ def sample( else: rewards[B].append(episode.rewards[episode_ts - 1]) # We are in the middle of a batch item (row). Concat next episode to this - # row from the episode's beginning. In other words, we never concat + # row from the next episode's beginning. In other words, we never concat # a middle of an episode to another truncated one. 
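            # (Concretely: a sampled row only ever continues with timestep 0 of the
            # next episode, never with a random mid-episode timestep of another one.)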
else: episode_ts = 0 @@ -321,6 +331,10 @@ def get_sampled_timesteps(self) -> int: """Returns number of timesteps that have been sampled in buffer's lifetime.""" return self.sampled_timesteps + def get_added_timesteps(self) -> int: + """Returns number of timesteps that have been added in buffer's lifetime.""" + return self._num_timesteps_added + @override(ReplayBufferInterface) def get_state(self) -> Dict[str, Any]: return { @@ -329,6 +343,7 @@ def get_state(self) -> Dict[str, Any]: "_num_episodes_evicted": self._num_episodes_evicted, "_indices": self._indices, "_num_timesteps": self._num_timesteps, + "_num_timesteps_added": self._num_timesteps_added, "sampled_timesteps": self.sampled_timesteps, } @@ -341,6 +356,7 @@ def set_state(self, state) -> None: self._num_episodes_evicted = state["_num_episodes_evicted"] self._indices = state["_indices"] self._num_timesteps = state["_num_timesteps"] + self._num_timesteps_added = state["_num_timesteps_added"] self.sampled_timesteps = state["sampled_timesteps"] @@ -356,8 +372,9 @@ def __init__( actions=None, rewards=None, states=None, - is_terminated=False, - is_truncated=False, + t: int = 0, + is_terminated: bool = False, + is_truncated: bool = False, render_images=None, ): self.id_ = id_ or uuid.uuid4().hex @@ -370,6 +387,9 @@ def __init__( # h-states: t0 (in case this episode is a continuation chunk, we need to know # about the initial h) to T. self.states = states + # The global last timestep of the episode and the timesteps when this chunk + # started. + self.t = self.t_started = t # obs[-1] is the final observation in the episode. self.is_terminated = is_terminated # obs[-1] is the last obs in a truncated-by-the-env episode (there will no more @@ -381,13 +401,18 @@ def __init__( self.render_images = [] if render_images is None else render_images def concat_episode(self, episode_chunk: "_Episode"): + """Adds the given `episode_chunk` to the right side of self.""" assert episode_chunk.id_ == self.id_ assert not self.is_done + # Make sure the timesteps match. + assert self.t == episode_chunk.t_started episode_chunk.validate() # Make sure, end matches other episode chunk's beginning. assert np.all(episode_chunk.observations[0] == self.observations[-1]) + # Make sure the timesteps match (our last t should be the same as their first). + assert self.t == episode_chunk.t_started # Pop out our end. self.observations.pop() @@ -396,6 +421,7 @@ def concat_episode(self, episode_chunk: "_Episode"): self.observations.extend(list(episode_chunk.observations)) self.actions.extend(list(episode_chunk.actions)) self.rewards.extend(list(episode_chunk.rewards)) + self.t = episode_chunk.t self.states = episode_chunk.states if episode_chunk.is_terminated: @@ -405,6 +431,21 @@ def concat_episode(self, episode_chunk: "_Episode"): # Validate. self.validate() + def add_initial_observation( + self, *, initial_observation, initial_state=None, initial_render_image=None + ): + assert not self.is_done + assert len(self.observations) == 0 + # Assume that this episode is completely empty and has not stepped yet. + # Leave self.t (and self.t_started) at 0. + assert self.t == self.t_started == 0 + + self.observations.append(initial_observation) + self.states = initial_state + if initial_render_image is not None: + self.render_images.append(initial_render_image) + self.validate() + def add_timestep( self, observation, @@ -416,34 +457,25 @@ def add_timestep( is_truncated=False, render_image=None, ): + # Cannot add data to an already done episode. 
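+        # (Each successful `add_timestep()` call advances `self.t` by one;
+        # `validate()` below then checks that `len(rewards) == t - t_started`.)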
         assert not self.is_done
 
         self.observations.append(observation)
         self.actions.append(action)
         self.rewards.append(reward)
         self.states = state
+        self.t += 1
         if render_image is not None:
             self.render_images.append(render_image)
         self.is_terminated = is_terminated
         self.is_truncated = is_truncated
         self.validate()
 
-    def add_initial_observation(
-        self, *, initial_observation, initial_state=None, initial_render_image=None
-    ):
-        assert not self.is_done
-        assert len(self.observations) == 0
-
-        self.observations.append(initial_observation)
-        self.states = initial_state
-        if initial_render_image is not None:
-            self.render_images.append(initial_render_image)
-        self.validate()
-
     def validate(self):
         # Make sure we always have one more obs stored than rewards (and actions)
         # due to the reset and last-obs logic of an MDP.
         assert len(self.observations) == len(self.rewards) + 1 == len(self.actions) + 1
+        assert len(self.rewards) == (self.t - self.t_started)
 
         # Convert all lists to numpy arrays, if we are terminated.
         if self.is_done:
@@ -454,8 +486,43 @@
 
     @property
     def is_done(self):
+        """Whether the episode is actually done (terminated or truncated).
+
+        A done episode can no longer be continued via `self.add_timestep()`,
+        concatenated on its right side with another episode chunk, or succeeded
+        via `self.create_successor()`.
+        """
         return self.is_terminated or self.is_truncated
 
+    def create_successor(self) -> "_Episode":
+        """Returns a successor episode chunk (of len=0) continuing with this one.
+
+        The successor will have the same ID and state as self and its only observation
+        will be the last observation in self. Its length will therefore be 0 (no
+        steps taken yet).
+
+        This method is useful if you would like to discontinue building an episode
+        chunk (b/c you have to return it from somewhere), but would like to have a new
+        episode (chunk) instance to continue building the actual env episode at a later
+        time.
+
+        Returns:
+            The successor Episode chunk of this one with the same ID and state and the
+            only observation being the last observation in self.
+        """
+        assert not self.is_done
+
+        return _Episode(
+            # Same ID.
+            id_=self.id_,
+            # First (and only) observation of successor is this episode's last obs.
+            observations=[self.observations[-1]],
+            # Same state.
+            states=self.states,
+            # Continue with self's current timestep.
+            t=self.t,
+        )
+
     def to_sample_batch(self):
         return SampleBatch(
             {
@@ -497,6 +564,8 @@ def get_state(self):
                 "actions": self.actions,
                 "rewards": self.rewards,
                 "states": self.states,
+                "t_started": self.t_started,
+                "t": self.t,
                 "is_terminated": self.is_terminated,
                 "is_truncated": self.is_truncated,
             }.items()
@@ -509,14 +578,16 @@ def from_state(state):
         eps.actions = state[2][1]
         eps.rewards = state[3][1]
         eps.states = state[4][1]
-        eps.is_terminated = state[5][1]
-        eps.is_truncated = state[6][1]
+        eps.t_started = state[5][1]
+        eps.t = state[6][1]
+        eps.is_terminated = state[7][1]
+        eps.is_truncated = state[8][1]
         return eps
 
     def __len__(self):
         assert len(self.observations) > 0, (
             "ERROR: Cannot determine length of episode that hasn't started yet! "
-            "Call `_Episode.add_initial_obs(initial_observation=...)` first "
+            "Call `_Episode.add_initial_observation(initial_observation=...)` first "
             "(after which `len(_Episode)` will be 0)."
         )
         return len(self.observations) - 1
diff --git a/rllib/utils/tf_utils.py b/rllib/utils/tf_utils.py
index 7b43953c5b67f..3acbbad004c0f 100644
--- a/rllib/utils/tf_utils.py
+++ b/rllib/utils/tf_utils.py
@@ -679,7 +679,7 @@ def two_hot(
     # First make sure, values are clipped.
     value = tf.clip_by_value(value, lower_bound, upper_bound)
     # Tensor of batch indices: [0, B=batch size).
-    batch_indices = tf.range(0, value.shape[0], dtype=tf.float32)
+    batch_indices = tf.range(0, tf.shape(value)[0], dtype=tf.float32)
     # Calculate the step deltas (how much space between each bucket's central value?).
     bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1)
     # Compute the float indices (might be non-int numbers: sitting between two buckets).
@@ -690,12 +690,12 @@ def two_hot(
     kp1 = tf.math.ceil(idx)
     # In case k == kp1 (idx is exactly on the bucket boundary), move kp1 up by 1.0.
     # Otherwise, this would result in a NaN in the returned two-hot tensor.
-    kp1 = tf.where(k == kp1, kp1 + 1.0, kp1)
+    kp1 = tf.where(tf.equal(k, kp1), kp1 + 1.0, kp1)
     # Iff `kp1` is one beyond our last index (because incoming value is larger than
     # `upper_bound`), move it to one before k (kp1's weight is going to be 0.0 anyways,
     # so it doesn't matter where it points to; we are just avoiding an index error
     # with this).
-    kp1 = tf.where(kp1 == num_buckets, kp1 - 2.0, kp1)
+    kp1 = tf.where(tf.equal(kp1, num_buckets), kp1 - 2.0, kp1)
     # The actual values found at k and k+1 inside the set of buckets.
     values_k = lower_bound + k * bucket_delta
     values_kp1 = lower_bound + kp1 * bucket_delta
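A note on the two-hot trick patched above: a continuous value is spread over its two neighboring bucket centers, weighted by its distance to each, and the kp1 corrections in the TF code handle the exact-boundary and upper-bound edge cases. The following plain-numpy sketch only illustrates the idea (it is not the RLlib implementation; the bucket count and bounds here are arbitrary choices):

import numpy as np

def two_hot_np(value, num_buckets=11, lower_bound=-5.0, upper_bound=5.0):
    # Clip, then locate the (possibly fractional) bucket position of `value`.
    value = float(np.clip(value, lower_bound, upper_bound))
    bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1)
    idx = (value - lower_bound) / bucket_delta
    k = int(np.floor(idx))
    kp1 = min(k + 1, num_buckets - 1)  # Clamp at the last bucket.
    out = np.zeros(num_buckets)
    out[kp1] = idx - k           # Weight on the upper neighbor.
    out[k] += 1.0 - (idx - k)    # Weight on the lower neighbor (+= covers k == kp1).
    return out

# 1.2 lies between the bucket centers 1.0 (index 6) and 2.0 (index 7):
print(two_hot_np(1.2))  # 0.8 at index 6, 0.2 at index 7, zeros elsewhere.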
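As a usage sketch for the extended `_Episode` API in episode_replay_buffer.py above (the new `t`/`t_started` bookkeeping, `add_initial_observation()`, `add_timestep()`, and `create_successor()`), the snippet below shows how an env-runner-style loop might build a chunk and hand it off. This is not part of the patch; `_Episode` is an internal helper, and the gymnasium env and step count are arbitrary choices:

import gymnasium as gym

from ray.rllib.utils.replay_buffers.episode_replay_buffer import _Episode

env = gym.make("CartPole-v1")
obs, _ = env.reset()

# Fresh episode chunk: one observation, zero steps, t == t_started == 0.
episode = _Episode()
episode.add_initial_observation(initial_observation=obs)
assert len(episode) == 0 and episode.t == episode.t_started == 0

for _ in range(10):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, _ = env.step(action)
    episode.add_timestep(
        obs, action, reward, is_terminated=terminated, is_truncated=truncated
    )
    if episode.is_done:
        break

if not episode.is_done:
    # Hand this chunk off (e.g. to the episode replay buffer's add()) and keep
    # building the same env episode in a fresh, zero-length successor chunk.
    successor = episode.create_successor()
    assert successor.id_ == episode.id_
    assert len(successor) == 0 and successor.t_started == episode.t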
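Finally, for the tuned Pendulum example at the top of this section: besides running it through run_regression_tests.py, the config can presumably also be built and trained by hand via the usual AlgorithmConfig workflow. A rough sketch (iteration count and printed metric are my own choices; the exact result keys depend on the stack in use):

from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config

config = (
    DreamerV3Config()
    .environment("Pendulum-v1")
    .training(model_size="XS", training_ratio=1024)
)

algo = config.build()
for _ in range(3):
    results = algo.train()
    # Metric key may differ under the new EnvRunner/Learner stack, hence .get().
    print(results.get("episode_reward_mean"))
algo.stop()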