diff --git a/rllib/BUILD b/rllib/BUILD index b66d0ec983e41..221c2362b56cf 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1066,7 +1066,7 @@ py_test( srcs = ["algorithms/dqn/tests/test_repro_dqn.py"] ) -# Dreamer (V1) +# Dreamer py_test( name = "test_dreamer", tags = ["team:rllib", "algorithms_dir"], @@ -1074,16 +1074,6 @@ py_test( srcs = ["algorithms/dreamer/tests/test_dreamer.py"] ) -# DreamerV3 -# TODO (sven): Enable once the version conflict for gymnasium/supersuit/pettingzoo -# /shimmy/mujoco has been resolved. -#py_test( -# name = "test_dreamerv3", -# tags = ["team:rllib", "algorithms_dir"], -# size = "large", -# srcs = ["algorithms/dreamerv3/tests/test_dreamerv3.py"] -#) - # DT py_test( name = "test_segmentation_buffer", @@ -4355,7 +4345,6 @@ py_test_module_list( files = [ "tests/test_dnc.py", "tests/test_perf.py", - "algorithms/dreamerv3/tests/test_dreamerv3.py", "env/wrappers/tests/test_kaggle_wrapper.py", "examples/env/tests/test_cliff_walking_wall_env.py", "examples/env/tests/test_coin_game_non_vectorized_env.py", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 7e3c32a4efc51..29de0b01a3be5 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -706,19 +706,7 @@ def setup(self, config: AlgorithmConfig) -> None: # the two we need to loop through the policy modules and create a simple # MARLModule from the RLModule within each policy. local_worker = self.workers.local_worker() - policy_dict, _ = self.config.get_multi_agent_setup( - env=local_worker.env, - spaces=getattr(local_worker, "spaces", None), - ) - # TODO (Sven): Unify the inference of the MARLModuleSpec. Right now, - # we get this from the RolloutWorker's `marl_module_spec` property. - # However, this is hacky (information leak) and should not remain this - # way. For other EnvRunner classes (that don't have this property), - # Algorithm should infer this itself. - if hasattr(local_worker, "marl_module_spec"): - module_spec = local_worker.marl_module_spec - else: - module_spec = self.config.get_marl_module_spec(policy_dict=policy_dict) + module_spec = local_worker.marl_module_spec learner_group_config = self.config.get_learner_group_config(module_spec) self.learner_group = learner_group_config.build() @@ -883,7 +871,7 @@ def evaluate( # Sync weights to the evaluation WorkerSet. if self.evaluation_workers is not None: self.evaluation_workers.sync_weights( - from_worker_or_learner_group=self.workers.local_worker() + from_worker_or_trainer=self.workers.local_worker() ) self._sync_filters_if_needed( central_worker=self.workers.local_worker(), @@ -1421,7 +1409,7 @@ def training_step(self) -> ResultDict: if self.config._enable_learner_api: from_worker_or_trainer = self.learner_group self.workers.sync_weights( - from_worker_or_learner_group=from_worker_or_trainer, + from_worker_or_trainer=from_worker_or_trainer, policies=list(train_results.keys()), global_vars=global_vars, ) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 2510490d48426..a037f7bb052b3 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -303,8 +303,10 @@ def __init__(self, algo_class=None): self.normalize_actions = True self.clip_actions = False self.disable_env_checking = False + # Whether this env is an atari env (for atari-specific preprocessing). + # If not specified, we will try to auto-detect this. 
+ self.is_atari = None self.auto_wrap_old_gym_envs = True - self._is_atari = None # `self.rollouts()` self.env_runner_cls = None @@ -716,6 +718,31 @@ def freeze(self) -> None: # of themselves? This way, users won't even be able to alter those values # directly anymore. + def _detect_atari_env(self) -> bool: + """Returns whether this configured env is an Atari env or not. + + Returns: + True, if specified env is an Atari env, False otherwise. + """ + # Atari envs are usually specified via a string like "PongNoFrameskip-v4" + # or "ALE/Breakout-v5". + # We do NOT attempt to auto-detect Atari env for other specified types like + # a callable, to avoid running heavy logics in validate(). + # For these cases, users can explicitly set `environment(atari=True)`. + if not type(self.env) == str: + return False + + try: + if self.env.startswith("ALE/"): + env = gym.make("GymV26Environment-v0", env_id=self.env) + else: + env = gym.make(self.env) + except gym.error.NameNotFound: + # Not an Atari env if this is not a gym env. + return False + + return is_atari(env) + @OverrideToImplementCustomLogic_CallToSuperRecommended def validate(self) -> None: """Validates all values in this config.""" @@ -961,6 +988,10 @@ def validate(self) -> None: f"config.framework({self.framework_str})!" ) + # Detect if specified env is an Atari env. + if self.is_atari is None: + self.is_atari = self._detect_atari_env() + if self.input_ == "sampler" and self.off_policy_estimation_methods: raise ValueError( "Off-policy estimation methods can only be used if the input is a " @@ -1337,7 +1368,7 @@ def environment( disable_env_checking: If True, disable the environment pre-checking module. is_atari: This config can be used to explicitly specify whether the env is an Atari env or not. If not specified, RLlib will try to auto-detect - this. + this during config validation. auto_wrap_old_gym_envs: Whether to auto-wrap old gym environments (using the pre 0.24 gym APIs, e.g. reset() returning single obs and no info dict). If True, RLlib will automatically wrap the given gym env class @@ -1374,7 +1405,7 @@ def environment( if disable_env_checking is not NotProvided: self.disable_env_checking = disable_env_checking if is_atari is not NotProvided: - self._is_atari = is_atari + self.is_atari = is_atari if auto_wrap_old_gym_envs is not NotProvided: self.auto_wrap_old_gym_envs = auto_wrap_old_gym_envs @@ -2288,8 +2319,6 @@ def reporting( In case there are more than this many episodes collected in a single training iteration, use all of these episodes for metrics computation, meaning don't ever cut any "excess" episodes. - Set this to 1 to disable smoothing and to always report only the most - recently collected episode's return. min_time_s_per_iteration: Minimum time to accumulate within a single `train()` call. This value does not affect learning, only the number of times `Algorithm.training_step()` is called by @@ -2616,34 +2645,6 @@ def learner_class(self) -> Type["Learner"]: """ return self._learner_class or self.get_default_learner_class() - @property - def is_atari(self) -> bool: - """True if if specified env is an Atari env.""" - - # Not yet determined, try to figure this out. - if self._is_atari is None: - # Atari envs are usually specified via a string like "PongNoFrameskip-v4" - # or "ALE/Breakout-v5". - # We do NOT attempt to auto-detect Atari env for other specified types like - # a callable, to avoid running heavy logics in validate(). - # For these cases, users can explicitly set `environment(atari=True)`. 
- if not type(self.env) == str: - return False - try: - if self.env.startswith("ALE/"): - env = gym.make("GymV26Environment-v0", env_id=self.env) - else: - env = gym.make(self.env) - # Any gymnasium error -> Cannot be an Atari env. - except gym.error.Error: - return False - - self._is_atari = is_atari(env) - # Clean up env's resources, if any. - env.close() - - return self._is_atari - # TODO: Make rollout_fragment_length as read-only property and replace the current # self.rollout_fragment_length a private variable. def get_rollout_fragment_length(self, worker_index: int = 0) -> int: diff --git a/rllib/algorithms/dreamerv3/README.md b/rllib/algorithms/dreamerv3/README.md deleted file mode 100644 index 8c64b960b7b73..0000000000000 --- a/rllib/algorithms/dreamerv3/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# DreamerV3 -Implementation (TensorFlow/Keras) of the "DreamerV3" model-based reinforcement learning -(RL) algorithm by D. Hafner et al. 2023 - -DreamerV3 train a world model in supervised fashion using real environment -interactions. The world model utilizes a recurrent GRU-based architecture -("recurrent state space model" or RSSM) and uses it to predicts rewards, -episode continuation flags, as well as, observations. -With these predictions (dreams) made by the world model, both actor -and critic are trained in classic REINFORCE fashion. In other words, the -actual RL components of the model are never trained on actual environment data, -but on dreamed trajectories only. - -For more algorithm details, see: - -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -.. and the "DreamerV2" paper: - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf - -## Results -TODO diff --git a/rllib/algorithms/dreamerv3/__init__.py b/rllib/algorithms/dreamerv3/__init__.py deleted file mode 100644 index d4b2adb0d57ed..0000000000000 --- a/rllib/algorithms/dreamerv3/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3, DreamerV3Config - -__all__ = [ - "DreamerV3", - "DreamerV3Config", -] diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py deleted file mode 100644 index 515f6e3a22a29..0000000000000 --- a/rllib/algorithms/dreamerv3/dreamerv3.py +++ /dev/null @@ -1,660 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -import dataclasses -import gc -import logging -import tree # pip install dm_tree -from typing import Any, Dict, List, Optional - -import gymnasium as gym -import numpy as np - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.dreamerv3.dreamerv3_catalog import DreamerV3Catalog -from ray.rllib.algorithms.dreamerv3.dreamerv3_learner import ( - DreamerV3LearnerHyperparameters, -) -from ray.rllib.algorithms.dreamerv3.utils import do_symlog_obs -from ray.rllib.algorithms.dreamerv3.utils.env_runner import DreamerV3EnvRunner -from ray.rllib.algorithms.dreamerv3.utils.summaries import ( - report_predicted_vs_sampled_obs, - report_sampling_and_replay_buffer, -) -from ray.rllib.core.learner.learner import LearnerHyperparameters -from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch -from ray.rllib.utils import deep_update -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.numpy import one_hot -from ray.rllib.utils.metrics import ( - ALL_MODULES, - GARBAGE_COLLECTION_TIMER, - LEARN_ON_BATCH_TIMER, - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, - NUM_GRAD_UPDATES_LIFETIME, - NUM_SYNCH_WORKER_WEIGHTS, - NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, - SAMPLE_TIMER, - SYNCH_WORKER_WEIGHTS_TIMER, -) -from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer -from ray.rllib.utils.typing import ResultDict - - -logger = logging.getLogger(__name__) - -_, tf, _ = try_import_tf() - - -class DreamerV3Config(AlgorithmConfig): - """Defines a configuration class from which a DreamerV3 can be built. - - Example: - >>> from ray.rllib.algorithms.dreamerv3 import DreamerV3Config - >>> config = DreamerV3Config() - >>> config = config.training( # doctest: +SKIP - ... batch_size_B=8, model_size="M" - ... ) - >>> config = config.resources(num_learner_workers=4) # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.dreamerv3 import DreamerV3Config - >>> from ray import air - >>> from ray import tune - >>> config = DreamerV3Config() - >>> # Print out some default values. - >>> print(config.training_ratio) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training( # doctest: +SKIP - ... training_ratio=tune.grid_search([256, 512, 1024]) - ... ) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "DreamerV3", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... 
).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a DreamerV3Config instance.""" - super().__init__(algo_class=algo_class or DreamerV3) - - # fmt: off - # __sphinx_doc_begin__ - - # DreamerV3 specific settings: - self.model_size = "XS" - self.training_ratio = 1024 - - self.replay_buffer_config = { - "type": "EpisodeReplayBuffer", - "capacity": int(1e6), - } - self.world_model_lr = 1e-4 - self.actor_lr = 3e-5 - self.critic_lr = 3e-5 - self.batch_size_B = 16 - self.batch_length_T = 64 - self.horizon_H = 15 - self.gae_lambda = 0.95 # [1] eq. 7. - self.entropy_scale = 3e-4 # [1] eq. 11. - self.return_normalization_decay = 0.99 # [1] eq. 11 and 12. - self.train_critic = True - self.train_actor = True - self.intrinsic_rewards_scale = 0.1 - self.world_model_grad_clip_by_global_norm = 1000.0 - self.critic_grad_clip_by_global_norm = 100.0 - self.actor_grad_clip_by_global_norm = 100.0 - - # Reporting. - # DreamerV3 is super sample efficient and only needs very few episodes - # (normally) to learn. Leaving this at its default value would gravely - # underestimate the learning performance over the course of an experiment. - self.metrics_num_episodes_for_smoothing = 1 - self.report_individual_batch_item_stats = False - self.report_dream_data = False - self.report_images_and_videos = False - self.gc_frequency_train_steps = 100 - - # Override some of AlgorithmConfig's default values with DreamerV3-specific - # values. - self.lr = None - self.framework_str = "tf2" - self.gamma = 0.997 # [1] eq. 7. - # Do not use! Set `batch_size_B` and `batch_length_T` instead. - self.train_batch_size = None - self.env_runner_cls = DreamerV3EnvRunner - self.num_rollout_workers = 0 - self.rollout_fragment_length = 1 - # Since we are using a gymnasium-based EnvRunner, we can utilitze its - # vectorization capabilities w/o suffering performance losses (as we would - # with RLlib's `RemoteVectorEnv`). - self.remote_worker_envs = True - # Dreamer only runs on the new API stack. - self._enable_learner_api = True - self._enable_rl_module_api = True - # __sphinx_doc_end__ - # fmt: on - - @override(AlgorithmConfig) - def training( - self, - *, - model_size: Optional[str] = NotProvided, - training_ratio: Optional[float] = NotProvided, - gc_frequency_train_steps: Optional[int] = NotProvided, - batch_size_B: Optional[int] = NotProvided, - batch_length_T: Optional[int] = NotProvided, - horizon_H: Optional[int] = NotProvided, - gae_lambda: Optional[float] = NotProvided, - entropy_scale: Optional[float] = NotProvided, - return_normalization_decay: Optional[float] = NotProvided, - train_critic: Optional[bool] = NotProvided, - train_actor: Optional[bool] = NotProvided, - intrinsic_rewards_scale: Optional[float] = NotProvided, - world_model_grad_clip_by_global_norm: Optional[float] = NotProvided, - critic_grad_clip_by_global_norm: Optional[float] = NotProvided, - actor_grad_clip_by_global_norm: Optional[float] = NotProvided, - replay_buffer_config: Optional[dict] = NotProvided, - **kwargs, - ) -> "DreamerV3Config": - """Sets the training related configuration. - - Args: - model_size: The main switch for adjusting the overall model size. See [1] - (table B) for more information on the effects of this setting on the - model architecture. - Supported values are "XS", "S", "M", "L", "XL" (as per the paper), as - well as, "nano", "micro", "mini", and "XXS" (for RLlib's - implementation). See ray.rllib.algorithms.dreamerv3.utils. 
- __init__.py for the details on what exactly each size does to the layer - sizes, number of layers, etc.. - training_ratio: The ratio of total steps trained (sum of the sizes of all - batches ever sampled from the replay buffer) over the total env steps - taken (in the actual environment, not the dreamed one). For example, - if the training_ratio is 1024 and the batch size is 1024, we would take - 1 env step for every training update: 1024 / 1. If the training ratio - is 512 and the batch size is 1024, we would take 2 env steps and then - perform a single training update (on a 1024 batch): 1024 / 2. - gc_frequency_train_steps: The frequency (in training iterations) with which - we perform a `gc.collect()` calls at the end of a `training_step` - iteration. Doing this more often adds a (albeit very small) performance - overhead, but prevents memory leaks from becoming harmful. - TODO (sven): This might not be necessary anymore, but needs to be - confirmed experimentally. - batch_size_B: The batch size (B) interpreted as number of rows (each of - length `batch_length_T`) to sample from the replay buffer in each - iteration. - batch_length_T: The batch length (T) interpreted as the length of each row - sampled from the replay buffer in each iteration. Note that - `batch_size_B` rows will be sampled in each iteration. Rows normally - contain consecutive data (consecutive timesteps from the same episode), - but there might be episode boundaries in a row as well. - horizon_H: The horizon (in timesteps) used to create dreamed data from the - world model, which in turn is used to train/update both actor- and - critic networks. - gae_lambda: The lambda parameter used for computing the GAE-style - value targets for the actor- and critic losses. - entropy_scale: The factor with which to multiply the entropy loss term - inside the actor loss. - return_normalization_decay: The decay value to use when computing the - running EMA values for return normalization (used in the actor loss). - train_critic: Whether to train the critic network. If False, `train_actor` - must also be False (cannot train actor w/o training the critic). - train_actor: Whether to train the actor network. If True, `train_critic` - must also be True (cannot train actor w/o training the critic). - intrinsic_rewards_scale: The factor to multiply intrinsic rewards with - before adding them to the extrinsic (environment) rewards. - world_model_grad_clip_by_global_norm: World model grad clipping value - (by global norm). - critic_grad_clip_by_global_norm: Critic grad clipping value - (by global norm). - actor_grad_clip_by_global_norm: Actor grad clipping value (by global norm). - replay_buffer_config: Replay buffer config. - Only serves in DreamerV3 to set the capacity of the replay buffer. - Note though that in the paper ([1]) a size of 1M is used for all - benchmarks and there doesn't seem to be a good reason to change this - parameter. - Examples: - { - "type": "EpisodeReplayBuffer", - "capacity": 100000, - } - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. 
- super().training(**kwargs) - - if model_size is not NotProvided: - self.model_size = model_size - if training_ratio is not NotProvided: - self.training_ratio = training_ratio - if gc_frequency_train_steps is not NotProvided: - self.gc_frequency_train_steps = gc_frequency_train_steps - if batch_size_B is not NotProvided: - self.batch_size_B = batch_size_B - if batch_length_T is not NotProvided: - self.batch_length_T = batch_length_T - if horizon_H is not NotProvided: - self.horizon_H = horizon_H - if gae_lambda is not NotProvided: - self.gae_lambda = gae_lambda - if entropy_scale is not NotProvided: - self.entropy_scale = entropy_scale - if return_normalization_decay is not NotProvided: - self.return_normalization_decay = return_normalization_decay - if train_critic is not NotProvided: - self.train_critic = train_critic - if train_actor is not NotProvided: - self.train_actor = train_actor - if intrinsic_rewards_scale is not NotProvided: - self.intrinsic_rewards_scale = intrinsic_rewards_scale - if world_model_grad_clip_by_global_norm is not NotProvided: - self.world_model_grad_clip_by_global_norm = ( - world_model_grad_clip_by_global_norm - ) - if critic_grad_clip_by_global_norm is not NotProvided: - self.critic_grad_clip_by_global_norm = critic_grad_clip_by_global_norm - if actor_grad_clip_by_global_norm is not NotProvided: - self.actor_grad_clip_by_global_norm = actor_grad_clip_by_global_norm - if replay_buffer_config is not NotProvided: - # Override entire `replay_buffer_config` if `type` key changes. - # Update, if `type` key remains the same or is not specified. - new_replay_buffer_config = deep_update( - {"replay_buffer_config": self.replay_buffer_config}, - {"replay_buffer_config": replay_buffer_config}, - False, - ["replay_buffer_config"], - ["replay_buffer_config"], - ) - self.replay_buffer_config = new_replay_buffer_config["replay_buffer_config"] - - return self - - @override(AlgorithmConfig) - def reporting( - self, - *, - report_individual_batch_item_stats: Optional[bool] = NotProvided, - report_dream_data: Optional[bool] = NotProvided, - report_images_and_videos: Optional[bool] = NotProvided, - **kwargs, - ): - """Sets the reporting related configuration. - - Args: - report_individual_batch_item_stats: Whether to include loss and other stats - per individual timestep inside the training batch in the result dict - returned by `training_step()`. If True, besides the `CRITIC_L_total`, - the individual critic loss values per batch row and time axis step - in the train batch (CRITIC_L_total_B_T) will also be part of the - results. - report_dream_data: Whether to include the dreamed trajectory data in the - result dict returned by `training_step()`. If True, however, will - slice each reported item in the dream data down to the shape. - (H, B, t=0, ...), where H is the horizon and B is the batch size. The - original time axis will only be represented by the first timestep - to not make this data too large to handle. - report_images_and_videos: Whether to include any image/video data in the - result dict returned by `training_step()`. - **kwargs: - - Returns: - This updated AlgorithmConfig object. 
- """ - super().reporting(**kwargs) - - if report_individual_batch_item_stats is not NotProvided: - self.report_individual_batch_item_stats = report_individual_batch_item_stats - if report_dream_data is not NotProvided: - self.report_dream_data = report_dream_data - if report_images_and_videos is not NotProvided: - self.report_images_and_videos = report_images_and_videos - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call the super class' validation method first. - super().validate() - - # Make sure, users are not using DreamerV3 yet for multi-agent: - if self.is_multi_agent(): - raise ValueError("DreamerV3 does NOT support multi-agent setups yet!") - - # Make sure, we are configure for the new API stack. - if not (self._enable_learner_api and self._enable_rl_module_api): - raise ValueError( - "DreamerV3 must be run with `config._enable_learner_api`=True AND " - "with `config._enable_rl_module_api`=True!" - ) - - # If run on several Learners, the provided batch_size_B must be a multiple - # of `num_learner_workers`. - if self.num_learner_workers > 1 and ( - self.batch_size_B % self.num_learner_workers != 0 - ): - raise ValueError( - f"Your `batch_size_B` ({self.batch_size_B}) must be a multiple of " - f"`num_learner_workers` ({self.num_learner_workers}) in order for " - "DreamerV3 to be able to split batches evenly across your Learner " - "processes." - ) - - # Cannot train actor w/o critic. - if self.train_actor and not self.train_critic: - raise ValueError( - "Cannot train actor network (`train_actor=True`) w/o training critic! " - "Make sure you either set `train_critic=True` or `train_actor=False`." - ) - # Use DreamerV3 specific batch size settings. - if self.train_batch_size is not None: - raise ValueError( - "`train_batch_size` should NOT be set! Use `batch_size_B` and " - "`batch_length_T` instead." - ) - # Must be run with `EpisodeReplayBuffer` type. - if self.replay_buffer_config.get("type") != "EpisodeReplayBuffer": - raise ValueError( - "DreamerV3 must be run with the `EpisodeReplayBuffer` type! None " - "other supported." 
- ) - - @override(AlgorithmConfig) - def get_learner_hyperparameters(self) -> LearnerHyperparameters: - base_hps = super().get_learner_hyperparameters() - return DreamerV3LearnerHyperparameters( - model_size=self.model_size, - training_ratio=self.training_ratio, - batch_size_B=self.batch_size_B // (self.num_learner_workers or 1), - batch_length_T=self.batch_length_T, - horizon_H=self.horizon_H, - gamma=self.gamma, - gae_lambda=self.gae_lambda, - entropy_scale=self.entropy_scale, - return_normalization_decay=self.return_normalization_decay, - train_actor=self.train_actor, - train_critic=self.train_critic, - world_model_lr=self.world_model_lr, - intrinsic_rewards_scale=self.intrinsic_rewards_scale, - actor_lr=self.actor_lr, - critic_lr=self.critic_lr, - world_model_grad_clip_by_global_norm=( - self.world_model_grad_clip_by_global_norm - ), - actor_grad_clip_by_global_norm=self.actor_grad_clip_by_global_norm, - critic_grad_clip_by_global_norm=self.critic_grad_clip_by_global_norm, - report_individual_batch_item_stats=( - self.report_individual_batch_item_stats - ), - report_dream_data=self.report_dream_data, - report_images_and_videos=self.report_images_and_videos, - **dataclasses.asdict(base_hps), - ) - - @override(AlgorithmConfig) - def get_default_learner_class(self): - if self.framework_str == "tf2": - from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_learner import ( - DreamerV3TfLearner, - ) - - return DreamerV3TfLearner - else: - raise ValueError(f"The framework {self.framework_str} is not supported.") - - @override(AlgorithmConfig) - def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: - if self.framework_str == "tf2": - from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_rl_module import ( - DreamerV3TfRLModule, - ) - - return SingleAgentRLModuleSpec( - module_class=DreamerV3TfRLModule, catalog_class=DreamerV3Catalog - ) - else: - raise ValueError(f"The framework {self.framework_str} is not supported.") - - @property - def share_module_between_env_runner_and_learner(self) -> bool: - # If we only have one local Learner (num_learner_workers=0) and only - # one local EnvRunner (num_rollout_workers=0), share the RLModule - # between these two to avoid having to sync weights, ever. - return self.num_learner_workers == 0 and self.num_rollout_workers == 0 - - -class DreamerV3(Algorithm): - """Implementation of the model-based DreamerV3 RL algorithm described in [1].""" - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return DreamerV3Config() - - @override(Algorithm) - def setup(self, config: AlgorithmConfig): - super().setup(config) - - # Share RLModule between EnvRunner and single (local) Learner instance. - # To avoid possibly expensive weight synching step. - if self.config.share_module_between_env_runner_and_learner: - assert self.workers.local_worker().module is None - self.workers.local_worker().module = self.learner_group._learner.module[ - DEFAULT_POLICY_ID - ] - - # Summarize (single-agent) RLModule (only once) here. - if self.config.framework_str == "tf2": - self.workers.local_worker().module.dreamer_model.summary(expand_nested=True) - - # Create a replay buffer for storing actual env samples. 
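As an aside before the buffer construction below: a minimal NumPy sketch of what such an episode replay buffer does, assuming episodes arrive as dicts of per-timestep arrays that are at least `batch_length_T` steps long. The class and method names are illustrative stand-ins, not the RLlib `EpisodeReplayBuffer` API.

import numpy as np

class TinyEpisodeBuffer:
    """Sketch only: stores whole episodes and samples B rows of length T,
    each row cut from a single stored episode. No capacity eviction and no
    re-concatenation of episode chunks."""

    def __init__(self):
        self.episodes = []

    def add(self, episode):
        # episode: e.g. {"obs": (L, ...), "actions": (L, ...), "rewards": (L,)}
        self.episodes.append(episode)

    def get_num_timesteps(self):
        return sum(len(ep["rewards"]) for ep in self.episodes)

    def sample(self, batch_size_B, batch_length_T):
        rows = []
        for _ in range(batch_size_B):
            ep = self.episodes[np.random.randint(len(self.episodes))]
            start = np.random.randint(len(ep["rewards"]) - batch_length_T + 1)
            rows.append({k: v[start:start + batch_length_T] for k, v in ep.items()})
        # Stack per key into (B, T, ...) arrays.
        return {k: np.stack([r[k] for r in rows]) for k in rows[0]}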
- self.replay_buffer = EpisodeReplayBuffer( - capacity=self.config.replay_buffer_config["capacity"], - batch_size_B=self.config.batch_size_B, - batch_length_T=self.config.batch_length_T, - ) - - @override(Algorithm) - def training_step(self) -> ResultDict: - results = {} - - env_runner = self.workers.local_worker() - - # Push enough samples into buffer initially before we start training. - if self.training_iteration == 0: - logger.info( - "Filling replay buffer so it contains at least " - f"{self.config.batch_size_B * self.config.batch_length_T} timesteps " - "(required for a single train batch)." - ) - - # Have we sampled yet in this `training_step()` call? - have_sampled = False - with self._timers[SAMPLE_TIMER]: - # Continue sampling from the actual environment (and add collected samples - # to our replay buffer) as long as we: - while ( - # a) Don't have at least batch_size_B x batch_length_T timesteps stored - # in the buffer. This is the minimum needed to train. - self.replay_buffer.get_num_timesteps() - < (self.config.batch_size_B * self.config.batch_length_T) - # b) The computed `training_ratio` is >= the configured (desired) - # training ratio (meaning we should continue sampling). - or self.training_ratio >= self.config.training_ratio - # c) we have not sampled at all yet in this `training_step()` call. - or not have_sampled - ): - done_episodes, ongoing_episodes = env_runner.sample() - have_sampled = True - - # We took B x T env steps. - env_steps_last_sample = sum( - len(eps) for eps in done_episodes + ongoing_episodes - ) - self._counters[NUM_AGENT_STEPS_SAMPLED] += env_steps_last_sample - self._counters[NUM_ENV_STEPS_SAMPLED] += env_steps_last_sample - - # Add ongoing and finished episodes into buffer. The buffer will - # automatically take care of properly concatenating (by episode IDs) - # the different chunks of the same episodes, even if they come in via - # separate `add()` calls. - self.replay_buffer.add(episodes=done_episodes + ongoing_episodes) - - # Summarize environment interaction and buffer data. - results[ALL_MODULES] = report_sampling_and_replay_buffer( - replay_buffer=self.replay_buffer, - ) - - # Continue sampling batch_size_B x batch_length_T sized batches from the buffer - # and using these to update our models (`LearnerGroup.update()`) until the - # computed `training_ratio` is larger than the configured one, meaning we should - # go back and collect more samples again from the actual environment. - # However, when calculating the `training_ratio` here, we use only the - # trained steps in this very `training_step()` call over the most recent sample - # amount (`env_steps_last_sample`), not the global values. This is to avoid a - # heavy overtraining at the very beginning when we have just pre-filled the - # buffer with the minimum amount of samples. - replayed_steps_this_iter = sub_iter = 0 - while ( - replayed_steps_this_iter / env_steps_last_sample - ) < self.config.training_ratio: - - # Time individual batch updates. - with self._timers[LEARN_ON_BATCH_TIMER]: - logger.info(f"\tSub-iteration {self.training_iteration}/{sub_iter})") - - # Draw a new sample from the replay buffer. - sample = self.replay_buffer.sample( - batch_size_B=self.config.batch_size_B, - batch_length_T=self.config.batch_length_T, - ) - replayed_steps = self.config.batch_size_B * self.config.batch_length_T - replayed_steps_this_iter += replayed_steps - - # Convert some bool columns to float32 and one-hot actions. 
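A quick NumPy sketch of the conversion announced in the comment above, assuming a `Discrete(n)` action space and an integer action array of shape (B, T); the helper name is illustrative only.

import numpy as np

def one_hot_actions(actions_int, num_actions):
    # Index an identity matrix with the integer actions; this appends one
    # trailing axis of size `num_actions` holding the one-hot vectors.
    return np.eye(num_actions, dtype=np.float32)[actions_int]

# E.g. actions of shape (B=2, T=3) in Discrete(4) -> one-hots of shape (2, 3, 4):
# one_hot_actions(np.array([[0, 1, 3], [2, 2, 0]]), 4)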
- sample["is_first"] = sample["is_first"].astype(np.float32) - sample["is_last"] = sample["is_last"].astype(np.float32) - sample["is_terminated"] = sample["is_terminated"].astype(np.float32) - if isinstance(env_runner.env.single_action_space, gym.spaces.Discrete): - sample["actions_ints"] = sample[SampleBatch.ACTIONS] - sample[SampleBatch.ACTIONS] = one_hot( - sample["actions_ints"], - depth=env_runner.env.single_action_space.n, - ) - - # Perform the actual update via our learner group. - train_results = self.learner_group.update( - SampleBatch(sample).as_multi_agent(), - reduce_fn=self._reduce_results, - ) - self._counters[NUM_AGENT_STEPS_TRAINED] += replayed_steps - self._counters[NUM_ENV_STEPS_TRAINED] += replayed_steps - - # Perform additional (non-gradient updates), such as the critic EMA-copy - # update. - with self._timers["critic_ema_update"]: - self.learner_group.additional_update( - timestep=self._counters[NUM_ENV_STEPS_TRAINED], - reduce_fn=self._reduce_results, - ) - - if self.config.report_images_and_videos: - report_predicted_vs_sampled_obs( - # TODO (sven): DreamerV3 is single-agent only. - results=train_results[DEFAULT_POLICY_ID], - sample=sample, - batch_size_B=self.config.batch_size_B, - batch_length_T=self.config.batch_length_T, - symlog_obs=do_symlog_obs( - env_runner.env.single_observation_space, - self.config.model.get("symlog_obs", "auto"), - ), - ) - - res = train_results[DEFAULT_POLICY_ID] - logger.info( - f"\t\tWORLD_MODEL_L_total={res['WORLD_MODEL_L_total']:.5f} (" - f"L_pred={res['WORLD_MODEL_L_prediction']:.5f} (" - f"decoder/obs={res['WORLD_MODEL_L_decoder']} " - f"L_rew={res['WORLD_MODEL_L_reward']} " - f"L_cont={res['WORLD_MODEL_L_continue']}); " - f"L_dyn/rep={res['WORLD_MODEL_L_dynamics']:.5f})" - ) - msg = "\t\t" - if self.config.train_actor: - msg += f"L_actor={res['ACTOR_L_total']:.5f} " - if self.config.train_critic: - msg += f"L_critic={res['CRITIC_L_total']:.5f} " - logger.info(msg) - - sub_iter += 1 - self._counters[NUM_GRAD_UPDATES_LIFETIME] += 1 - - # Update weights - after learning on the LearnerGroup - on all EnvRunner - # workers. - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - # Only necessary if RLModule is not shared between (local) EnvRunner and - # (local) Learner. - if not self.config.share_module_between_env_runner_and_learner: - self._counters[ - NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS - ] = 0 - self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 - self.workers.sync_weights( - from_worker_or_learner_group=self.learner_group - ) - - # Try trick from https://medium.com/dive-into-ml-ai/dealing-with-memory-leak- - # issue-in-keras-model-training-e703907a6501 - if self.config.gc_frequency_train_steps and ( - self.training_iteration % self.config.gc_frequency_train_steps == 0 - ): - with self._timers[GARBAGE_COLLECTION_TIMER]: - gc.collect() - - # Add train results and the actual training ratio to stats. The latter should - # be close to the configured `training_ratio`. - results.update(train_results) - results[ALL_MODULES]["actual_training_ratio"] = self.training_ratio - - # Return all results. - return results - - @property - def training_ratio(self) -> float: - """Returns the actual training ratio of this Algorithm. - - The training ratio is copmuted by dividing the total number of steps - trained thus far (replayed from the buffer) over the total number of actual - env steps taken thus far. 
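A minimal sketch of that bookkeeping, assuming two running counters for trained and sampled env steps (illustrative helper, not RLlib API):

def training_ratio(num_env_steps_trained, num_env_steps_sampled):
    # Timesteps replayed from the buffer so far, divided by env steps taken so far.
    return num_env_steps_trained / max(num_env_steps_sampled, 1)

# With the configured `training_ratio=1024` and a train batch of
# B=16 x T=64 = 1024 timesteps, this settles at roughly one batch update per
# env step: after 10 env steps and 10 replayed batches, the ratio is
# (10 * 1024) / 10 = 1024. DreamerV3's `training_step()` keeps sampling from
# the env while this value is at or above the configured target and keeps
# replaying batches from the buffer otherwise.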
- """ - return self._counters[NUM_ENV_STEPS_TRAINED] / ( - self._counters[NUM_ENV_STEPS_SAMPLED] - ) - - @staticmethod - def _reduce_results(results: List[Dict[str, Any]]): - return tree.map_structure(lambda *s: np.mean(s, axis=0), *results) diff --git a/rllib/algorithms/dreamerv3/dreamerv3_catalog.py b/rllib/algorithms/dreamerv3/dreamerv3_catalog.py deleted file mode 100644 index 50568fe1875ab..0000000000000 --- a/rllib/algorithms/dreamerv3/dreamerv3_catalog.py +++ /dev/null @@ -1,80 +0,0 @@ -import gymnasium as gym - -from ray.rllib.core.models.catalog import Catalog -from ray.rllib.core.models.base import Encoder, Model -from ray.rllib.utils import override - - -class DreamerV3Catalog(Catalog): - """The Catalog class used to build all the models needed for DreamerV3 training.""" - - def __init__( - self, - observation_space: gym.Space, - action_space: gym.Space, - model_config_dict: dict, - ): - """Initializes a DreamerV3Catalog instance. - - Args: - observation_space: The observation space of the environment. - action_space: The action space of the environment. - model_config_dict: The model config to use. - """ - super().__init__( - observation_space=observation_space, - action_space=action_space, - model_config_dict=model_config_dict, - ) - - self.model_size = self.model_config_dict["model_size"] - self.is_img_space = len(self.observation_space.shape) in [2, 3] - self.is_gray_scale = ( - self.is_img_space and len(self.observation_space.shape) == 2 - ) - - # TODO (sven): We should work with sub-component configurations here, - # and even try replacing all current Dreamer model components with - # our default primitives. But for now, we'll construct the DreamerV3Model - # directly in our `build_...()` methods. - - @override(Catalog) - def build_encoder(self, framework: str) -> Encoder: - """Builds the World-Model's encoder network depending on the obs space.""" - if framework != "tf2": - raise NotImplementedError - - if self.is_img_space: - from ray.rllib.algorithms.dreamerv3.tf.models.components.cnn_atari import ( - CNNAtari, - ) - - return CNNAtari(model_size=self.model_size) - else: - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP - - return MLP(model_size=self.model_size, name="vector_encoder") - - def build_decoder(self, framework: str) -> Model: - """Builds the World-Model's decoder network depending on the obs space.""" - if framework != "tf2": - raise NotImplementedError - - if self.is_img_space: - from ray.rllib.algorithms.dreamerv3.tf.models.components import ( - conv_transpose_atari, - ) - - return conv_transpose_atari.ConvTransposeAtari( - model_size=self.model_size, - gray_scaled=self.is_gray_scale, - ) - else: - from ray.rllib.algorithms.dreamerv3.tf.models.components import ( - vector_decoder, - ) - - return vector_decoder.VectorDecoder( - model_size=self.model_size, - observation_space=self.observation_space, - ) diff --git a/rllib/algorithms/dreamerv3/dreamerv3_learner.py b/rllib/algorithms/dreamerv3/dreamerv3_learner.py index 32c08d0a671f4..c35d1743c8b1a 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_learner.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_learner.py @@ -8,13 +8,11 @@ https://arxiv.org/pdf/2010.02193.pdf """ from dataclasses import dataclass -from typing import Any, DefaultDict, Dict +from typing import Any, Dict from ray.rllib.core.learner.learner import Learner, LearnerHyperparameters from ray.rllib.core.rl_module.rl_module import ModuleID -from ray.rllib.policy.sample_batch import MultiAgentBatch from 
ray.rllib.utils.annotations import override -from ray.rllib.utils.typing import TensorType @dataclass @@ -27,7 +25,7 @@ class to configure your algorithm. more details on the individual properties. """ - model_size: str = None + model_dimension: str = None training_ratio: float = None batch_size_B: int = None batch_length_T: int = None @@ -46,10 +44,6 @@ class to configure your algorithm. world_model_grad_clip_by_global_norm: float = None actor_grad_clip_by_global_norm: float = None critic_grad_clip_by_global_norm: float = None - # Reporting settings. - report_individual_batch_item_stats: bool = None - report_dream_data: bool = None - report_images_and_videos: bool = None class DreamerV3Learner(Learner): @@ -59,31 +53,6 @@ class DreamerV3Learner(Learner): for updating the critic EMA-copy after each training step. """ - @override(Learner) - def compile_results( - self, - *, - batch: MultiAgentBatch, - fwd_out: Dict[str, Any], - loss_per_module: Dict[str, TensorType], - metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]], - ) -> Dict[str, Any]: - results = super().compile_results( - batch=batch, - fwd_out=fwd_out, - loss_per_module=loss_per_module, - metrics_per_module=metrics_per_module, - ) - - # Add the predicted obs distributions for possible (video) summarization. - if self.hps.report_images_and_videos: - for module_id, res in results.items(): - if module_id in fwd_out: - res["WORLD_MODEL_fwd_out_obs_distribution_means_BxT"] = fwd_out[ - module_id - ]["obs_distribution_means_BxT"] - return results - @override(Learner) def additional_update_for_module( self, diff --git a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py index f1a112e7017d1..021fbb8646389 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py @@ -14,7 +14,6 @@ from ray.rllib.core.models.base import STATE_IN, STATE_OUT from ray.rllib.core.models.specs.specs_dict import SpecDict from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.policy.eager_tf_policy import _convert_to_tf from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import ExperimentalAPI, override from ray.rllib.utils.nested_dict import NestedDict @@ -34,7 +33,7 @@ def setup(self): self.config.observation_space, self.config.model_config_dict.get("symlog_obs", "auto"), ) - model_size = self.config.model_config_dict["model_size"] + model_dimension = self.config.model_config_dict["model_dimension"] # Build encoder and decoder from catalog. catalog = self.config.get_catalog() @@ -43,34 +42,40 @@ def setup(self): # Build the world model (containing encoder and decoder). self.world_model = WorldModel( - model_size=model_size, + model_dimension=model_dimension, action_space=self.config.action_space, batch_length_T=T, + # num_gru_units=self.model_config.num_gru_units, encoder=self.encoder, decoder=self.decoder, symlog_obs=symlog_obs, ) self.actor = ActorNetwork( action_space=self.config.action_space, - model_size=model_size, + model_dimension=model_dimension, ) self.critic = CriticNetwork( - model_size=model_size, + model_dimension=model_dimension, ) # Build the final dreamer model (containing the world model). 
self.dreamer_model = DreamerModel( - model_size=self.config.model_config_dict["model_size"], + model_dimension=self.config.model_config_dict["model_dimension"], action_space=self.config.action_space, world_model=self.world_model, actor=self.actor, critic=self.critic, + # use_curiosity=use_curiosity, + # intrinsic_rewards_scale=intrinsic_rewards_scale, + batch_size_B=self.config.model_config_dict["batch_size_B"], + batch_length_T=T, + horizon_H=horizon_H, ) self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) # Perform a test `call()` to force building the dreamer model's variables. test_obs = np.tile( np.expand_dims(self.config.observation_space.sample(), (0, 1)), - reps=(B, T) + (1,) * len(self.config.observation_space.shape), + reps=(B, T, 1), ) test_actions = np.tile( np.expand_dims( @@ -82,13 +87,15 @@ def setup(self): reps=(B, T, 1), ) self.dreamer_model( - inputs=_convert_to_tf(test_obs), - actions=_convert_to_tf(test_actions.astype(np.float32)), - is_first=_convert_to_tf(np.ones((B, T), np.float32)), - start_is_terminated_BxT=_convert_to_tf(np.zeros((B * T,), np.float32)), + inputs=test_obs, + actions=test_actions.astype(np.float32), + is_first=np.ones((B, T), np.float32), + start_is_terminated_BxT=np.zeros((B * T,), np.float32), horizon_H=horizon_H, gamma=gamma, ) + # This should work now. + self.dreamer_model.summary(expand_nested=True) # Initialize the critic EMA net: self.critic.init_ema() @@ -122,7 +129,7 @@ def input_specs_train(self) -> SpecDict: def output_specs_train(self) -> SpecDict: return [ "sampled_obs_symlog_BxT", - "obs_distribution_means_BxT", + "obs_distribution_BxT", "reward_logits_BxT", "rewards_BxT", "continue_distribution_BxT", diff --git a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py deleted file mode 100644 index 2e8ef82fd6dbe..0000000000000 --- a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf - -[3] -D. Hafner's (author) original code repo (for JAX): -https://github.com/danijar/dreamerv3 -""" -import unittest - -import gymnasium as gym -import numpy as np - -import ray -from ray.rllib.algorithms.dreamerv3 import dreamerv3 -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.test_utils import framework_iterator - - -class TestDreamerV3(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_dreamerv3_compilation(self): - """Test whether DreamerV3 can be built with all frameworks.""" - - # Build a DreamerV3Config object. - config = ( - dreamerv3.DreamerV3Config() - .framework(eager_tracing=True) - .training( - # Keep things simple. Especially the long dream rollouts seem - # to take an enormous amount of time (initially). - batch_size_B=2 * 2, # shared w/ model AND learner AND env runner - batch_length_T=16, - horizon_H=5, - # TODO (sven): Fix having to provide this. - # Should be compiled automatically as `RLModuleConfig` by - # AlgorithmConfig (see comment below)? - model={ - "batch_length_T": 16, - "horizon_H": 5, - "model_size": "nano", # Use a tiny model for testing. 
- "gamma": 0.997, - "symlog_obs": True, - }, - ) - .resources( - num_learner_workers=2, # Try with 2 Learners. - num_cpus_per_learner_worker=1, - num_gpus_per_learner_worker=0, - ) - .debugging(log_level="INFO") - ) - - # TODO (sven): Add a `get_model_config` utility to AlgorithmConfig - # that - for now - merges the user provided model_dict (which only - # contains settings that only affect the model, e.g. model_size) - # with the AlgorithmConfig-wide settings that are relevant for the model - # (e.g. `batch_size_B`). - # config.get_model_config() - - num_iterations = 2 - - for _ in framework_iterator(config, frameworks="tf2"): - for env in ["ALE/MsPacman-v5", "FrozenLake-v1", "CartPole-v1"]: - print("Env={}".format(env)) - config.environment(env) - algo = config.build() - - for i in range(num_iterations): - results = algo.train() - print(results) - - algo.stop() - - def test_dreamerv3_dreamer_model_sizes(self): - """Tests, whether the different model sizes match the ones reported in [1].""" - - # For Atari, these are the exact numbers from the repo ([3]). - # However, for CartPole + size "S" and "M", the author's original code will not - # match for the world model count. This is due to the fact that the author uses - # encoder/decoder nets with 5x1024 nodes (which corresponds to XL) regardless of - # the `model_size` settings (iff >="S"). - expected_num_params_world_model = { - "XS_cartpole": 2435076, - "S_cartpole": 7493380, - "M_cartpole": 16206084, - "L_cartpole": 37802244, - "XL_cartpole": 108353796, - "XS_atari": 7538979, - "S_atari": 15687811, - "M_atari": 32461635, - "L_atari": 68278275, - "XL_atari": 181558659, - } - - # All values confirmed against [3] (100% match). - expected_num_params_actor = { - # hidden=[1280, 256] - # hidden_norm=[256], [256] - # pi (2 actions)=[256, 2], [2] - "XS_cartpole": 328706, - "S_cartpole": 1051650, - "M_cartpole": 2135042, - "L_cartpole": 4136450, - "XL_cartpole": 9449474, - "XS_atari": 329734, - "S_atari": 1053702, - "M_atari": 2137606, - "L_atari": 4139526, - "XL_atari": 9453574, - } - - # All values confirmed against [3] (100% match). - expected_num_params_critic = { - # hidden=[1280, 256] - # hidden_norm=[256], [256] - # vf (buckets)=[256, 255], [255] - "XS_cartpole": 393727, - "S_cartpole": 1181439, - "M_cartpole": 2297215, - "L_cartpole": 4331007, - "XL_cartpole": 9708799, - "XS_atari": 393727, - "S_atari": 1181439, - "M_atari": 2297215, - "L_atari": 4331007, - "XL_atari": 9708799, - } - - config = ( - dreamerv3.DreamerV3Config() - .framework("tf2", eager_tracing=True) - .training( - model={ - "batch_length_T": 16, - "horizon_H": 5, - "gamma": 0.997, - "symlog_obs": True, - } - ) - ) - - # Check all model_sizes described in the paper ([1]) on matching the number - # of parameters to RLlib's implementation. - for model_size in ["XS", "S", "M", "L", "XL"]: - config.model_size = model_size - config.training(model={"model_size": model_size}) - - # Atari and CartPole spaces. - for obs_space, num_actions, env_name in [ - (gym.spaces.Box(-1.0, 0.0, (4,), np.float32), 2, "cartpole"), - (gym.spaces.Box(-1.0, 0.0, (64, 64, 3), np.float32), 6, "atari"), - ]: - print(f"Testing model_size={model_size} on env-type: {env_name} ..") - config.environment( - observation_space=obs_space, - action_space=gym.spaces.Discrete(num_actions), - ) - - # Create our RLModule to compute actions with. 
- policy_dict, _ = config.get_multi_agent_setup() - module_spec = config.get_marl_module_spec(policy_dict=policy_dict) - rl_module = module_spec.build()[DEFAULT_POLICY_ID] - - # Count the generated RLModule's parameters and compare to the paper's - # reported numbers ([1] and [3]). - num_params_world_model = sum( - np.prod(v.shape.as_list()) - for v in rl_module.world_model.trainable_variables - ) - self.assertEqual( - num_params_world_model, - expected_num_params_world_model[f"{model_size}_{env_name}"], - ) - num_params_actor = sum( - np.prod(v.shape.as_list()) - for v in rl_module.actor.trainable_variables - ) - self.assertEqual( - num_params_actor, - expected_num_params_actor[f"{model_size}_{env_name}"], - ) - num_params_critic = sum( - np.prod(v.shape.as_list()) - for v in rl_module.critic.trainable_variables - ) - self.assertEqual( - num_params_critic, - expected_num_params_critic[f"{model_size}_{env_name}"], - ) - print("\tok") - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py index 366735f643d74..6f970a9117d9e 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py @@ -18,7 +18,7 @@ from ray.rllib.core.rl_module.marl_module import ModuleID from ray.rllib.core.learner.learner import ParamDict from ray.rllib.core.learner.tf.tf_learner import TfLearner -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf, try_import_tfp from ray.rllib.utils.tf_utils import symlog, two_hot, clip_gradients @@ -34,21 +34,16 @@ class DreamerV3TfLearner(DreamerV3Learner, TfLearner): The critic EMA-copy update step can be found in the `DreamerV3Learner` base class, as it is framework independent. - We define 3 local TensorFlow optimizers for the sub components "world_model", + We define 3 local tensorflow optimizers for the sub components "world_model", "actor", and "critic". Each of these optimizers might use a different learning rate, epsilon parameter, and gradient clipping thresholds and procedures. """ @override(TfLearner) - def configure_optimizers_for_module( + def configure_optimizer_for_module( self, module_id: ModuleID, hps: DreamerV3LearnerHyperparameters ): - """Create the 3 optimizers for Dreamer learning: world_model, actor, critic. - - The learning rates used are described in [1] and the epsilon values used here - - albeit probably not that important - are used by the author's own - implementation. - """ + """Create the 3 optimizers for Dreamer learning: world_model, actor, critic.""" dreamerv3_module = self._module[module_id] @@ -100,7 +95,7 @@ def postprocess_gradients_for_module( """Performs gradient clipping on the 3 module components' computed grads. Note that different grad global-norm clip values are used for the 3 - module components: world model, actor, and critic. + module components (world model, actor, and critic). """ for optimizer_name, optimizer in self.get_optimizers_for_module( module_id=module_id @@ -139,32 +134,6 @@ def postprocess_gradients_for_module( return module_gradients_dict - @override(TfLearner) - def compute_gradients( - self, - loss_per_module, - gradient_tape, - **kwargs, - ): - # Override of the default gradient computation method. 
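A minimal TensorFlow sketch of the per-component gradient idea spelled out in the comments here: each loss term is differentiated only with respect to its own sub-component's variables, so the world model's weights never receive gradients from the actor or critic losses. The names are placeholders, not the RLlib Learner API, and the tape is assumed to be persistent since gradient() is called once per component.

import tensorflow as tf

def per_component_grads_and_vars(tape, losses, variables):
    # losses:    {"world_model": L_wm, "actor": L_actor, "critic": L_critic}
    # variables: same keys, each a list of that component's tf.Variables.
    out = {}
    for name, loss in losses.items():
        grads = tape.gradient(loss, variables[name])   # needs persistent=True
        out[name] = list(zip(grads, variables[name]))  # ready for apply_gradients()
    return out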
- # For DreamerV3, we need to compute gradients over the individual loss terms - # as otherwise, the world model's parameters would have their gradients also - # be influenced by the actor- and critic loss terms/gradient computations. - grads = {} - for component in ["world_model", "actor", "critic"]: - grads.update( - gradient_tape.gradient( - # Take individual loss term from the registered metrics for - # the main module. - self._metrics[DEFAULT_POLICY_ID][component.upper() + "_L_total"], - self.filter_param_dict_for_optimizer( - self._params, self.get_optimizer(optimizer_name=component) - ), - ) - ) - del gradient_tape - return grads - @override(TfLearner) def compute_loss_for_module( self, @@ -201,11 +170,7 @@ def compute_loss_for_module( + 0.1 * L_rep_B_T ) - # In the paper, it says to sum up timesteps, and average over - # batch (see eq. 4 in [1]). But Danijar's implementation only does - # averaging (over B and T), so we'll do this here as well. This is generally - # true for all other loss terms as well (we'll always just average, no summing - # over T axis!). + # Sum up timesteps, and average over batch (see eq. 4 in [1]). L_world_model_total = tf.reduce_mean(L_world_model_total_B_T) # Register world model loss stats. @@ -217,36 +182,28 @@ def compute_loss_for_module( ), # Prediction losses. # Decoder (obs) loss. + "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], "WORLD_MODEL_L_decoder": prediction_losses["L_decoder"], # Reward loss. + "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], "WORLD_MODEL_L_reward": prediction_losses["L_reward"], # Continue loss. + "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], "WORLD_MODEL_L_continue": prediction_losses["L_continue"], # Total. + "WORLD_MODEL_L_prediction_B_T": prediction_losses["L_prediction_B_T"], "WORLD_MODEL_L_prediction": prediction_losses["L_prediction"], # Dynamics loss. + "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T, "WORLD_MODEL_L_dynamics": L_dyn, # Representation loss. + "WORLD_MODEL_L_representation_B_T": L_rep_B_T, "WORLD_MODEL_L_representation": L_rep, # Total loss. + "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T, "WORLD_MODEL_L_total": L_world_model_total, }, ) - if hps.report_individual_batch_item_stats: - self.register_metrics( - module_id=module_id, - metrics_dict={ - "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], - "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], - "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], - "WORLD_MODEL_L_prediction_B_T": ( - prediction_losses["L_prediction_B_T"] - ), - "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T, - "WORLD_MODEL_L_representation_B_T": L_rep_B_T, - "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T, - }, - ) # Dream trajectories starting in all internal states (h + z_posterior) that were # computed during world model training. @@ -262,31 +219,17 @@ def compute_loss_for_module( timesteps_H=hps.horizon_H, gamma=hps.gamma, ) - if hps.report_dream_data: - # To reduce this massive mount of data a little, slice out a T=1 piece - # from each stats that has the shape (H, BxT), meaning convert e.g. - # `rewards_dreamed_t0_to_H_BxT` into `rewards_dreamed_t0_to_H_Bx1`. - # This will reduce the amount of data to be transferred and reported - # by the factor of `batch_length_T`. - self.register_metrics( - module_id, - { - # Replace 'T' with '1'. 
- "DREAM_DATA_" + key[:-1] + "1": value[:, hps.batch_size_B] - for key, value in dream_data.items() - if key.endswith("H_BxT") - }, - ) + self.register_metrics(module_id, {"dream_data": dream_data}) value_targets_t0_to_Hm1_BxT = self._compute_value_targets( hps=hps, # Learn critic in symlog'd space. - rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_BxT"], + rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_B"], intrinsic_rewards_t1_to_H_BxT=( dream_data["rewards_intrinsic_t1_to_H_B"] if hps.use_curiosity else None ), - continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_BxT"], - value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_BxT"], + continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_B"], + value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_B"], ) self.register_metric( module_id, "VALUE_TARGETS_H_BxT", value_targets_t0_to_Hm1_BxT @@ -294,7 +237,6 @@ def compute_loss_for_module( CRITIC_L_total = self._compute_critic_loss( module_id=module_id, - hps=hps, dream_data=dream_data, value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT, ) @@ -308,6 +250,16 @@ def compute_loss_for_module( else: ACTOR_L_total = 0.0 + # if hps.use_curiosity: + # L_disagree = self._compute_disagree_loss(dream_data=dream_data) + # results["DISAGREE_L_total"] = L_disagree + # results["DISAGREE_intrinsic_rewards_H_B"] = ( + # dream_data["rewards_intrinsic_t1_to_H_B"] + # ) + # results["DISAGREE_intrinsic_rewards"] = tf.reduce_mean( + # dream_data["rewards_intrinsic_t1_to_H_B"] + # ) + # Return the total loss as a sum of all individual losses. return L_world_model_total + CRITIC_L_total + ACTOR_L_total @@ -337,27 +289,16 @@ def _compute_world_model_prediction_losses( # If symlog is disabled (e.g. for uint8 image inputs), `obs_symlog_BxT` is the # same as `obs_BxT`. obs_BxT = fwd_out["sampled_obs_symlog_BxT"] - obs_distr_means = fwd_out["obs_distribution_means_BxT"] - # In case we wanted to construct a distribution object from the fwd out data, - # we would have to do it like this: - # obs_distr = tfp.distributions.MultivariateNormalDiag( - # loc=obs_distr_means, - # # Scale == 1.0. - # # [2]: "Distributions The image predictor outputs the mean of a diagonal - # # Gaussian likelihood with **unit variance** ..." - # scale_diag=tf.ones_like(obs_distr_means), - # ) - + obs_distr = fwd_out["obs_distribution_BxT"] # Leave time dim folded (BxT) and flatten all other (e.g. image) dims. obs_BxT = tf.reshape(obs_BxT, shape=[-1, tf.reduce_prod(obs_BxT.shape[1:])]) + # Neg logp loss. + # decoder_loss = - obs_distr.log_prob(observations) + # decoder_loss /= observations.shape.as_list()[1] # Squared diff loss w/ sum(!) over all (already folded) obs dims. - # decoder_loss_BxT = SUM[ (obs_distr.loc - observations)^2 ] - # Note: This is described strangely in the paper (stating a neglogp loss here), - # but the author's own implementation actually uses simple MSE with the loc - # of the Gaussian. decoder_loss_BxT = tf.reduce_sum( - tf.math.square(obs_distr_means - obs_BxT), axis=-1 + tf.math.square(obs_distr.loc - obs_BxT), axis=-1 ) # Unfold time rank back in. @@ -515,36 +456,30 @@ def _compute_actor_loss( """ actor = self.module[module_id].actor - # Note: `scaled_value_targets_t0_to_Hm1_B` are NOT stop_gradient'd yet. + # Note: `value_targets` are NOT stop_gradient'd yet. 
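For reference, a NumPy sketch of the return normalization applied in the next call, following [1] eq. 11 and 12: value targets are divided by an EMA of their 5th-to-95th percentile range, with the scale bounded below by 1 so small returns are never blown up. EMA initialization at the very first update is omitted and the names are illustrative.

import numpy as np

def scale_value_targets(value_targets, ema_pct5, ema_pct95, decay=0.99):
    pct5 = np.percentile(value_targets, 5)
    pct95 = np.percentile(value_targets, 95)
    # Update the running EMAs of both percentiles (cf. `return_normalization_decay`).
    ema_pct5 = decay * ema_pct5 + (1.0 - decay) * pct5
    ema_pct95 = decay * ema_pct95 + (1.0 - decay) * pct95
    # Divide by the EMA'd range, but never scale returns up (lower bound of 1).
    scale = max(ema_pct95 - ema_pct5, 1.0)
    return value_targets / scale, ema_pct5, ema_pct95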
scaled_value_targets_t0_to_Hm1_B = self._compute_scaled_value_targets( module_id=module_id, hps=hps, value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT, - value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_BxT"][ - :-1 - ], + value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_B"][:-1], ) # Actions actually taken in the dream. - actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_BxT"])[ - :-1 - ] - actions_dreamed_dist_params_t0_to_Hm1_B = dream_data[ - "actions_dreamed_dist_params_t0_to_H_BxT" + actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_B"])[:-1] + dist_actions_t0_to_Hm1_B = dream_data[ + "actions_dreamed_distributions_t0_to_H_B" ][:-1] - dist_t0_to_Hm1_B = actor.get_action_dist_object( - actions_dreamed_dist_params_t0_to_Hm1_B - ) - # Compute log(p)s of all possible actions in the dream. if isinstance(self.module[module_id].actor.action_space, gym.spaces.Discrete): # Note that when we create the Categorical action distributions, we compute # unimix probs, then math.log these and provide these log(p) as "logits" to # the Categorical. So here, we'll continue to work with log(p)s (not # really "logits")! - logp_actions_t0_to_Hm1_B = actions_dreamed_dist_params_t0_to_Hm1_B - + logp_actions_t0_to_Hm1_B = tf.stack( + [dist.logits for dist in dist_actions_t0_to_Hm1_B], + axis=0, + ) # Log probs of actions actually taken in the dream. logp_actions_dreamed_t0_to_Hm1_B = tf.reduce_sum( actions_dreamed * logp_actions_t0_to_Hm1_B, @@ -554,18 +489,29 @@ def _compute_actor_loss( logp_loss_H_B = logp_actions_dreamed_t0_to_Hm1_B * tf.stop_gradient( scaled_value_targets_t0_to_Hm1_B ) - # Box space. - else: - logp_actions_dreamed_t0_to_Hm1_B = dist_t0_to_Hm1_B.log_prob( - actions_dreamed + elif isinstance(actor.action_space, gym.spaces.Box): + # TODO (Rohan138, Sven): Figure out how to vectorize this instead! + logp_actions_dreamed_t0_to_Hm1_B = tf.stack( + [ + dist.log_prob(actions_dreamed[i]) + for i, dist in enumerate(dist_actions_t0_to_Hm1_B) + ] ) # First term of loss function. [1] eq. 11. logp_loss_H_B = scaled_value_targets_t0_to_Hm1_B + else: + raise ValueError(f"Invalid action space: {actor.action_space}") assert len(logp_loss_H_B.shape) == 2 # Add entropy loss term (second term [1] eq. 11). - entropy_H_B = dist_t0_to_Hm1_B.entropy() + entropy_H_B = tf.stack( + [ + dist.entropy() + for dist in dream_data["actions_dreamed_distributions_t0_to_H_B"][:-1] + ], + axis=0, + ) assert len(entropy_H_B.shape) == 2 entropy = tf.reduce_mean(entropy_H_B) @@ -574,44 +520,31 @@ def _compute_actor_loss( L_actor_H_B = L_actor_reinforce_term_H_B + L_actor_action_entropy_term_H_B # Mask out everything that goes beyond a predicted continue=False boundary. - L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[ - :-1 - ] + L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_B"])[:-1] L_actor = tf.reduce_mean(L_actor_H_B) self.register_metrics( module_id, metrics_dict={ + "ACTOR_L_total_H_B": L_actor_H_B, "ACTOR_L_total": L_actor, + "ACTOR_logp_actions_dreamed_H_B": logp_actions_dreamed_t0_to_Hm1_B, + "ACTOR_scaled_value_targets_H_B": scaled_value_targets_t0_to_Hm1_B, "ACTOR_value_targets_pct95_ema": actor.ema_value_target_pct95, "ACTOR_value_targets_pct5_ema": actor.ema_value_target_pct5, + "ACTOR_action_entropy_H_B": entropy_H_B, "ACTOR_action_entropy": entropy, # Individual loss terms. 
+ "ACTOR_L_neglogp_reinforce_term_H_B": L_actor_reinforce_term_H_B, "ACTOR_L_neglogp_reinforce_term": tf.reduce_mean( L_actor_reinforce_term_H_B ), + "ACTOR_L_neg_entropy_term_H_B": L_actor_action_entropy_term_H_B, "ACTOR_L_neg_entropy_term": tf.reduce_mean( L_actor_action_entropy_term_H_B ), }, ) - if hps.report_individual_batch_item_stats: - self.register_metrics( - module_id, - metrics_dict={ - "ACTOR_L_total_H_BxT": L_actor_H_B, - "ACTOR_logp_actions_dreamed_H_BxT": ( - logp_actions_dreamed_t0_to_Hm1_B - ), - "ACTOR_scaled_value_targets_H_BxT": ( - scaled_value_targets_t0_to_Hm1_B - ), - "ACTOR_action_entropy_H_BxT": entropy_H_B, - # Individual loss terms. - "ACTOR_L_neglogp_reinforce_term_H_BxT": L_actor_reinforce_term_H_B, - "ACTOR_L_neg_entropy_term_H_BxT": L_actor_action_entropy_term_H_B, - }, - ) return L_actor @@ -619,7 +552,6 @@ def _compute_critic_loss( self, *, module_id: ModuleID, - hps: DreamerV3LearnerHyperparameters, dream_data: Dict[str, TensorType], value_targets_t0_to_Hm1_BxT: TensorType, ) -> TensorType: @@ -627,7 +559,6 @@ def _compute_critic_loss( Args: module_id: The ModuleID for which to compute the critic loss. - hps: The DreamerV3LearnerHyperparameters to use. dream_data: The data generated by dreaming for H steps (horizon) starting from any BxT state (sampled from the buffer for the train batch). value_targets_t0_to_Hm1_BxT: The computed value function targets of the @@ -636,8 +567,7 @@ def _compute_critic_loss( Returns: The total critic loss tensor. """ - # B=BxT - H, B = dream_data["rewards_dreamed_t0_to_H_BxT"].shape[:2] + H, B = dream_data["rewards_dreamed_t0_to_H_B"].shape[:2] Hm1 = H - 1 # Note that value targets are NOT symlog'd and go from t0 to H-1, not H, like @@ -656,7 +586,7 @@ def _compute_critic_loss( ) # Get (B x T x probs) tensor from return distributions. - value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxBxT"] + value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxB"] # Unfold time rank and cut last time index to match value targets. value_symlog_logits_t0_to_Hm1_B = tf.reshape( value_symlog_logits_HxB, @@ -678,7 +608,7 @@ def _compute_critic_loss( # Expected values (dreamed) from the EMA (slow critic) net. # Note: Slow critic (EMA) outputs are already stop_gradient'd. value_symlog_ema_t0_to_Hm1_B = tf.stop_gradient( - dream_data["v_symlog_dreamed_ema_t0_to_H_BxT"] + dream_data["v_symlog_dreamed_ema_t0_to_H_B"] )[:-1] # Fold time rank (for two_hot'ing). value_symlog_ema_HxB = tf.reshape(value_symlog_ema_t0_to_Hm1_B, (-1,)) @@ -704,7 +634,7 @@ def _compute_critic_loss( L_critic_H_B = value_loss_two_hot_H_B + ema_regularization_loss_H_B # Mask out everything that goes beyond a predicted continue=False boundary. - L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[ + L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_B"])[ :-1 ] @@ -714,29 +644,21 @@ def _compute_critic_loss( self.register_metrics( module_id=module_id, metrics_dict={ + # Symlog'd value targets. Critic learns to predict symlog'd values. + "VALUE_TARGETS_symlog_H_B": value_symlog_targets_t0_to_Hm1_B, + # Critic loss terms. 
"CRITIC_L_total": L_critic, + "CRITIC_L_total_H_B": L_critic_H_B, + "CRITIC_L_neg_logp_of_value_targets_H_B": value_loss_two_hot_H_B, "CRITIC_L_neg_logp_of_value_targets": tf.reduce_mean( value_loss_two_hot_H_B ), + "CRITIC_L_slow_critic_regularization_H_B": ema_regularization_loss_H_B, "CRITIC_L_slow_critic_regularization": tf.reduce_mean( ema_regularization_loss_H_B ), }, ) - if hps.report_individual_batch_item_stats: - self.register_metrics( - module_id=module_id, - metrics_dict={ - # Symlog'd value targets. Critic learns to predict symlog'd values. - "VALUE_TARGETS_symlog_H_BxT": value_symlog_targets_t0_to_Hm1_B, - # Critic loss terms. - "CRITIC_L_total_H_BxT": L_critic_H_B, - "CRITIC_L_neg_logp_of_value_targets_H_BxT": value_loss_two_hot_H_B, - "CRITIC_L_slow_critic_regularization_H_BxT": ( - ema_regularization_loss_H_B - ), - }, - ) return L_critic @@ -802,7 +724,7 @@ def _compute_value_targets( # intermediates.shape=[2-16, BxT] # Loop through reversed timesteps (axis=1) from T+1 to t=2. - for t in reversed(range(discount.shape[0])): + for t in reversed(range(len(discount))): Rs.append(intermediates[t] + discount[t] * hps.gae_lambda * Rs[-1]) # Reverse along time axis and cut the last entry (value estimate at very end @@ -845,32 +767,21 @@ def _compute_scaled_value_targets( Per_R_5 = tfp.stats.percentile(value_targets_H_B, 5) Per_R_95 = tfp.stats.percentile(value_targets_H_B, 95) - # Update EMA values for 5 and 95 percentile, stored as tf variables under actor - # network. - # 5 percentile - new_val_pct5 = tf.where( - tf.math.is_nan(actor.ema_value_target_pct5), - # is NaN: Initial values: Just set. - Per_R_5, - # Later update (something already stored in EMA variable): Update EMA. - ( + # Update EMAs stored in actor network. + # Initial values: Just set. + if tf.math.is_nan(actor.ema_value_target_pct5): + actor.ema_value_target_pct5.assign(Per_R_5) + actor.ema_value_target_pct95.assign(Per_R_95) + # Later update (something already stored in EMA variable): Update EMA. + else: + actor.ema_value_target_pct5.assign( hps.return_normalization_decay * actor.ema_value_target_pct5 + (1.0 - hps.return_normalization_decay) * Per_R_5 - ), - ) - actor.ema_value_target_pct5.assign(new_val_pct5) - # 95 percentile - new_val_pct95 = tf.where( - tf.math.is_nan(actor.ema_value_target_pct95), - # is NaN: Initial values: Just set. - Per_R_95, - # Later update (something already stored in EMA variable): Update EMA. - ( + ) + actor.ema_value_target_pct95.assign( hps.return_normalization_decay * actor.ema_value_target_pct95 + (1.0 - hps.return_normalization_decay) * Per_R_95 - ), - ) - actor.ema_value_target_pct95.assign(new_val_pct95) + ) # [1] eq. 11 (first term). # Danijar's code: TODO: describe ... diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py index 77c4c285b21ba..0cb088e60fd95 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py @@ -1,12 +1,3 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" from typing import Mapping, Any from ray.rllib.algorithms.dreamerv3.dreamerv3_rl_module import DreamerV3RLModule diff --git a/rllib/algorithms/dreamerv3/tf/models/actor_network.py b/rllib/algorithms/dreamerv3/tf/models/actor_network.py index d865f85606a3a..f22617960b0a8 100644 --- a/rllib/algorithms/dreamerv3/tf/models/actor_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/actor_network.py @@ -8,12 +8,10 @@ import gymnasium as gym from gymnasium.spaces import Box, Discrete import numpy as np +import tensorflow as tf +import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP -from ray.rllib.utils.framework import try_import_tf, try_import_tfp - -_, tf, _ = try_import_tf() -tfp = try_import_tfp() class ActorNetwork(tf.keras.Model): @@ -30,19 +28,19 @@ class ActorNetwork(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", action_space: gym.Space, ): """Initializes an ActorNetwork instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. """ super().__init__(name="actor") - self.model_size = model_size + self.model_dimension = model_dimension self.action_space = action_space # The EMA decay variables used for the [Percentile(R, 95%) - Percentile(R, 5%)] @@ -57,23 +55,20 @@ def __init__( # For discrete actions, use a single MLP that computes logits. if isinstance(self.action_space, Discrete): self.mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=self.action_space.n, name="actor_mlp", ) # For cont. actions, use separate MLPs for Gaussian mean and stddev. - # TODO (sven): In the author's original code repo, this is NOT the case, - # inputs are pushed through a shared MLP, then only the two output linear - # layers are separate for std- and mean logits. elif isinstance(action_space, Box): output_layer_size = np.prod(action_space.shape) self.mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=output_layer_size, name="actor_mlp_mean", ) self.std_mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=output_layer_size, name="actor_mlp_std", ) @@ -81,15 +76,15 @@ def __init__( raise ValueError(f"Invalid action space: {action_space}") @tf.function - def call(self, h, z, return_distr_params=False): + def call(self, h, z, return_distribution=False): """Performs a forward pass through this policy network. Args: h: The deterministic hidden state of the sequence model. [B, dim(h)]. z: The stochastic discrete representations of the original observation input. [B, num_categoricals, num_classes]. - return_distr_params: Whether to return (as a second tuple item) the action - distribution parameter tensor created by the policy. + return_distribution: Whether to return (as a second tuple item) the action + distribution object created by the policy. """ # Flatten last two dims of z. assert len(z.shape) == 3 @@ -114,10 +109,8 @@ def call(self, h, z, return_distr_params=False): # Danijar's code does: distr = [Distr class](logits=tf.log(probs)). # Not sure why we don't directly use the already available probs instead. 
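# --- Illustrative aside on log(p)-as-logits and straight-through sampling ----
# Because the "logits" handed to the OneHotCategorical are literally log(probs)
# of the unimix'd action probabilities, softmax recovers those probabilities
# exactly, which is why the learner can treat the stored parameters as log(p)
# directly. The straight-through trick used a few lines below keeps the forward
# value a hard one-hot sample while gradients flow through the probabilities.
# The 1% uniform-mixture weight is the value reported in [1]; shapes here are
# assumptions for the example only.
import tensorflow as tf
import tensorflow_probability as tfp

net_probs = tf.constant([[0.9, 0.08, 0.02]])                   # actor output, (B, |A|)
action_probs = 0.99 * net_probs + 0.01 / net_probs.shape[-1]   # "unimix" probs
action_logits = tf.math.log(action_probs)                      # log(p), used as "logits"

distr = tfp.distributions.OneHotCategorical(logits=action_logits)
# softmax(log(p)) == p, so the distribution's probs equal `action_probs`.
sample = tf.cast(distr.sample(), tf.float32)
# Straight-through: forward value is the hard sample; the gradient w.r.t. the
# actor parameters is the gradient of `action_probs`.
action = tf.stop_gradient(sample) + (action_probs - tf.stop_gradient(action_probs))
# ----------------------------------------------------------------------------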
action_logits = tf.math.log(action_probs) - - # Distribution parameters are the log(probs) directly. - distr_params = action_logits - distr = self.get_action_dist_object(distr_params) + # Create the distribution object using the unimix'd logits. + distr = tfp.distributions.OneHotCategorical(logits=action_logits) action = tf.cast(tf.stop_gradient(distr.sample()), tf.float32) + ( action_probs - tf.stop_gradient(action_probs) @@ -129,48 +122,15 @@ def call(self, h, z, return_distr_params=False): # minstd, maxstd taken from [1] from configs.yaml minstd = 0.1 maxstd = 1.0 - - # Distribution parameters are the squashed std_logits and the tanh'd - # mean logits. # squash std_logits from (-inf, inf) to (minstd, maxstd) std_logits = (maxstd - minstd) * tf.sigmoid(std_logits + 2.0) + minstd - mean_logits = tf.tanh(action_logits) - - distr_params = tf.concat([mean_logits, std_logits], axis=-1) - distr = self.get_action_dist_object(distr_params) - - action = distr.sample() - - if return_distr_params: - return action, distr_params - return action - - def get_action_dist_object(self, action_dist_params_T_B): - """Helper method to create an action distribution object from (T, B, ..) params. - - Args: - action_dist_params_T_B: The time-major action distribution parameters. - This could be simply the logits (discrete) or a to-be-split-in-2 - tensor for mean and stddev (continuous). - - Returns: - The tfp action distribution object, from which one can sample, compute - log probs, entropy, etc.. - """ - if isinstance(self.action_space, gym.spaces.Discrete): - # Create the distribution object using the unimix'd logits. - distr = tfp.distributions.OneHotCategorical(logits=action_dist_params_T_B) - - elif isinstance(self.action_space, gym.spaces.Box): # Compute Normal distribution from action_logits and std_logits - loc, scale = tf.split(action_dist_params_T_B, 2, axis=-1) - distr = tfp.distributions.Normal(loc=loc, scale=scale) - + distr = tfp.distributions.Normal(tf.tanh(action_logits), std_logits) # If action_space is a box with multiple dims, make individual dims # independent. distr = tfp.distributions.Independent(distr, len(self.action_space.shape)) + action = distr.sample() - else: - raise ValueError(f"Action space {self.action_space} not supported!") - - return distr + if return_distribution: + return action, distr + return action diff --git a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py index 0700240f1bf8c..ba9ec38a0fa55 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py @@ -5,10 +5,9 @@ """ from typing import Optional -from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier -from ray.rllib.utils.framework import try_import_tf +import tensorflow as tf -_, tf, _ = try_import_tf() +from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier class CNNAtari(tf.keras.Model): @@ -17,13 +16,13 @@ class CNNAtari(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", cnn_multiplier: Optional[int] = None, ): """Initializes a CNNAtari instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN layer. 
Starting with @@ -33,7 +32,7 @@ def __init__( """ super().__init__(name="image_encoder") - cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier) + cnn_multiplier = get_cnn_multiplier(model_dimension, override=cnn_multiplier) # See appendix C in [1]: # "We use a similar network architecture but employ layer normalization and diff --git a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py index a23ddca856c87..41031c950e11b 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py @@ -5,11 +5,10 @@ """ from typing import Optional -from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP -from ray.rllib.utils.framework import try_import_tf, try_import_tfp +import tensorflow as tf +import tensorflow_probability as tfp -_, tf, _ = try_import_tf() -tfp = try_import_tfp() +from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP class ContinuePredictor(tf.keras.Model): @@ -24,15 +23,15 @@ class ContinuePredictor(tf.keras.Model): terminal. """ - def __init__(self, *, model_size: Optional[str] = "XS"): + def __init__(self, *, model_dimension: Optional[str] = "XS"): """Initializes a ContinuePredictor instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. """ super().__init__(name="continue_predictor") - self.mlp = MLP(model_size=model_size, output_layer_size=1) + self.mlp = MLP(model_dimension=model_dimension, output_layer_size=1) def call(self, h, z, return_distribution=False): """Performs a forward pass through the continue predictor. diff --git a/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py index ebc8649ccd79b..cffa73adb8029 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py @@ -10,11 +10,10 @@ from typing import Optional import numpy as np +import tensorflow as tf +import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class ConvTransposeAtari(tf.keras.Model): @@ -29,14 +28,14 @@ class ConvTransposeAtari(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", cnn_multiplier: Optional[int] = None, gray_scaled: bool, ): """Initializes a ConvTransposeAtari instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN transpose layer. Starting with @@ -48,7 +47,7 @@ def __init__( """ super().__init__(name="image_decoder") - cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier) + cnn_multiplier = get_cnn_multiplier(model_dimension, override=cnn_multiplier) # The shape going into the first Conv2DTranspose layer. # We start with a 4x4 channels=8 "image". 
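# --- Illustrative aside on the unit-variance Gaussian image decoder ----------
# With the scale fixed to 1.0 (see the hunk below and [2]), the negative
# log-likelihood of the observations equals half the summed squared error on
# the mean plus a constant, so a plain MSE on `loc` yields the same gradients.
# Shapes below are assumptions for the example only.
import tensorflow as tf
import tensorflow_probability as tfp

loc = tf.random.normal([4, 64 * 64 * 3])   # decoder means, (B, flattened obs dims)
obs = tf.random.normal([4, 64 * 64 * 3])   # observations, same shape

distr = tfp.distributions.MultivariateNormalDiag(loc=loc, scale_diag=tf.ones_like(loc))
neg_logp = -distr.log_prob(obs)                                      # (B,)
half_sse = 0.5 * tf.reduce_sum(tf.math.square(loc - obs), axis=-1)   # (B,)
# neg_logp == half_sse + 0.5 * D * ln(2*pi) with D = 64*64*3; the additive
# constant does not affect gradients w.r.t. `loc`.
# ----------------------------------------------------------------------------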
@@ -147,9 +146,15 @@ def call(self, h, z): # From [2]: # "Distributions: The image predictor outputs the mean of a diagonal Gaussian # likelihood with unit variance, ..." - # Reshape `out` for the diagonal multi-variate Gaussian (each pixel is its own # independent (b/c diagonal co-variance matrix) variable). loc = tf.reshape(out, shape=(out_shape[0], -1)) - - return loc + distribution = tfp.distributions.MultivariateNormalDiag( + loc=loc, + # Scale == 1.0. + # [2]: "Distributions The image predictor outputs the mean of a diagonal + # Gaussian likelihood with **unit variance** ..." + scale_diag=tf.ones_like(loc), + ) + pred_obs = distribution.sample() + return pred_obs, distribution diff --git a/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py index 559009a44531f..fc69c8dd33f9c 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py @@ -5,13 +5,12 @@ """ from typing import Optional +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import ( RepresentationLayer, ) -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class DynamicsPredictor(tf.keras.Model): @@ -27,17 +26,17 @@ class DynamicsPredictor(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_categoricals: Optional[int] = None, num_classes_per_categorical: Optional[int] = None, ): """Initializes a DynamicsPredictor instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different parameters. num_categoricals: Overrides the number of categoricals used in the z-states. - In [1], 32 is used for any model size. + In [1], 32 is used for any model dimension. num_classes_per_categorical: Overrides the number of classes within each categorical used for the z-states. In [1], 32 is used for any model dimension. @@ -48,12 +47,12 @@ def __init__( # TODO: In Danijar's code, the Dynamics Net only has a single layer, no # matter the model size. num_dense_layers=1, - model_size=model_size, + model_dimension=model_dimension, output_layer_size=None, ) # The (prior) z-state generating layer. self.representation_layer = RepresentationLayer( - model_size=model_size, + model_dimension=model_dimension, num_categoricals=num_categoricals, num_classes_per_categorical=num_classes_per_categorical, ) diff --git a/rllib/algorithms/dreamerv3/tf/models/components/mlp.py b/rllib/algorithms/dreamerv3/tf/models/components/mlp.py index 435d9f8544ab3..30d4a7713ee1a 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/mlp.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/mlp.py @@ -9,13 +9,12 @@ """ from typing import Optional +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.utils import ( get_dense_hidden_units, get_num_dense_layers, ) -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class MLP(tf.keras.Model): @@ -23,13 +22,13 @@ class MLP(tf.keras.Model): MLP=multi-layer perceptron. - See Appendix B in [1] for the MLP sizes depending on the given `model_size`. + See Appendix B in [1] for the MLP sizes depending on the given `model_dimension`. 
""" def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_dense_layers: Optional[int] = None, dense_hidden_units: Optional[int] = None, output_layer_size=None, @@ -39,12 +38,12 @@ def __init__( """Initializes an MLP instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. num_dense_layers: The number of hidden layers in the MLP. If None, - will use `model_size` and appendix B to figure out this value. + will use `model_dimension` and appendix B to figure out this value. dense_hidden_units: The number of nodes in each hidden layer. If None, - will use `model_size` and appendix B to figure out this value. + will use `model_dimension` and appendix B to figure out this value. output_layer_size: The size of an optional linear (no activation) output layer. If None, no output layer will be added on top of the MLP dense stack. @@ -53,9 +52,11 @@ def __init__( """ super().__init__(name=name or "mlp") - num_dense_layers = get_num_dense_layers(model_size, override=num_dense_layers) + num_dense_layers = get_num_dense_layers( + model_dimension, override=num_dense_layers + ) dense_hidden_units = get_dense_hidden_units( - model_size, override=dense_hidden_units + model_dimension, override=dense_hidden_units ) self.dense_layers = [] diff --git a/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py b/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py index cf6b27b3c68ff..36e2ace631844 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py @@ -9,14 +9,13 @@ """ from typing import Optional +import tensorflow as tf +import tensorflow_probability as tfp + from ray.rllib.algorithms.dreamerv3.utils import ( get_num_z_categoricals, get_num_z_classes, ) -from ray.rllib.utils.framework import try_import_tf, try_import_tfp - -_, tf, _ = try_import_tf() -tfp = try_import_tfp() class RepresentationLayer(tf.keras.layers.Layer): @@ -30,26 +29,26 @@ class RepresentationLayer(tf.keras.layers.Layer): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_categoricals: Optional[int] = None, num_classes_per_categorical: Optional[int] = None, ): """Initializes a RepresentationLayer instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different parameters. num_categoricals: Overrides the number of categoricals used in the z-states. - In [1], 32 is used for any model size. + In [1], 32 is used for any model dimension. num_classes_per_categorical: Overrides the number of classes within each categorical used for the z-states. In [1], 32 is used for any model dimension. 
""" self.num_categoricals = get_num_z_categoricals( - model_size, override=num_categoricals + model_dimension, override=num_categoricals ) self.num_classes_per_categorical = get_num_z_classes( - model_size, override=num_classes_per_categorical + model_dimension, override=num_classes_per_categorical ) super().__init__( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py index c8ce0fc260fd6..7af29664c6024 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py @@ -5,13 +5,12 @@ """ from typing import Optional +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, ) -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class RewardPredictor(tf.keras.Model): @@ -23,7 +22,7 @@ class RewardPredictor(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, @@ -31,7 +30,7 @@ def __init__( """Initializes a RewardPredictor instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. num_buckets: The number of buckets to create. Note that the number of possible symlog'd outcomes from the used distribution is @@ -52,7 +51,7 @@ def __init__( super().__init__(name="reward_predictor") self.mlp = MLP( - model_size=model_size, + model_dimension=model_dimension, output_layer_size=None, ) self.reward_layer = RewardPredictorLayer( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py index 185098b15b2bc..f9c92e92e7279 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py @@ -7,9 +7,7 @@ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba https://arxiv.org/pdf/2010.02193.pdf """ -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() +import tensorflow as tf class RewardPredictorLayer(tf.keras.layers.Layer): @@ -17,7 +15,7 @@ class RewardPredictorLayer(tf.keras.layers.Layer): This layer is used in two models in DreamerV3: The reward predictor of the world model and the value function. K is 255 by default (see [1]) and doesn't change - with the model size. + with the model dimension. Possible predicted reward/values range from symexp(-20.0) to symexp(20.0), which should cover any possible environment. 
Outputs of this layer are generated by diff --git a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py index d8ee68499625a..5f1d02f539ed8 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py @@ -6,12 +6,10 @@ from typing import Optional import gymnasium as gym +import tensorflow as tf from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.utils import get_gru_units -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class SequenceModel(tf.keras.Model): @@ -39,23 +37,23 @@ class SequenceModel(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", action_space: gym.Space, num_gru_units: Optional[int] = None, ): """Initializes a SequenceModel instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the number of GRU units used. action_space: The action space the our environment used. num_gru_units: Overrides the number of GRU units (dimension of the h-state). - If None, use the value given through `model_size` + If None, use the value given through `model_dimension` (see [1] Appendix B). """ super().__init__(name="sequence_model") - num_gru_units = get_gru_units(model_size, override=num_gru_units) + num_gru_units = get_gru_units(model_dimension, override=num_gru_units) self.action_space = action_space # In Danijar's code, there is an additional layer (units=[model_size]) @@ -63,7 +61,7 @@ def __init__( # the paper. self.pre_gru_layer = MLP( num_dense_layers=1, - model_size=model_size, + model_dimension=model_dimension, output_layer_size=None, ) self.gru_unit = tf.keras.layers.GRU( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py index bcfdb164e6d0a..08dadaf6494d4 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py @@ -6,11 +6,10 @@ from typing import Optional import gymnasium as gym +import tensorflow as tf +import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class VectorDecoder(tf.keras.Model): @@ -23,13 +22,13 @@ class VectorDecoder(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", observation_space: gym.Space, ): """Initializes a VectorDecoder instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. observation_space: The observation space to decode back into. This must be a Box of shape (d,), where d >= 1. @@ -42,7 +41,7 @@ def __init__( ) self.mlp = MLP( - model_size=model_size, + model_dimension=model_dimension, output_layer_size=observation_space.shape[0], ) @@ -63,5 +62,13 @@ def call(self, h, z): # Send h-cat-z through MLP to get mean values of diag gaussian. loc = self.mlp(out) - # Return only the predicted observations (mean, no sample). - return loc + # Create the Gaussian diag distribution. 
+ distribution = tfp.distributions.MultivariateNormalDiag( + loc=loc, + # Scale == 1.0. + scale_diag=tf.ones_like(loc), + ) + pred_obs = distribution.sample() + + # Always return both predicted observations (sample0 and distribution. + return pred_obs, distribution diff --git a/rllib/algorithms/dreamerv3/tf/models/critic_network.py b/rllib/algorithms/dreamerv3/tf/models/critic_network.py index d40441e585baf..837ca68ccfdcf 100644 --- a/rllib/algorithms/dreamerv3/tf/models/critic_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/critic_network.py @@ -5,13 +5,12 @@ """ from typing import Optional +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, ) -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class CriticNetwork(tf.keras.Model): @@ -28,7 +27,7 @@ class CriticNetwork(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, @@ -37,7 +36,7 @@ def __init__( """Initializes a CriticNetwork instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. num_buckets: The number of buckets to create. Note that the number of possible symlog'd outcomes from the used distribution is @@ -64,7 +63,7 @@ def __init__( """ super().__init__(name="critic") - self.model_size = model_size + self.model_dimension = model_dimension self.ema_decay = ema_decay # "Fast" critic network(s) (mlp + reward-pred-layer). This is the network @@ -73,7 +72,7 @@ def __init__( # the critic loss term such that the weights of this fast critic stay close # to the EMA weights (see below). self.mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=None, ) self.return_layer = RewardPredictorLayer( @@ -86,7 +85,7 @@ def __init__( # target net, BUT not used to compute anything, just for the # weights regularizer term inside the critic loss). 
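# --- Illustrative aside on maintaining the slow ("EMA") critic copy ----------
# A minimal sketch of how the EMA weights referenced in the comments above are
# typically tracked. The helper name and call site are assumptions for
# illustration; only `ema_decay` comes from this class.
import tensorflow as tf

def update_ema_weights(
    fast_net: tf.keras.Model, ema_net: tf.keras.Model, ema_decay: float
) -> None:
    # After each critic update: w_ema <- decay * w_ema + (1 - decay) * w_fast.
    for w_fast, w_ema in zip(fast_net.variables, ema_net.variables):
        w_ema.assign(ema_decay * w_ema + (1.0 - ema_decay) * w_fast)
# ----------------------------------------------------------------------------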
self.mlp_ema = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=None, trainable=False, ) diff --git a/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py b/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py index 1a6f95245e302..d186fdcd39eba 100644 --- a/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py +++ b/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py @@ -4,14 +4,12 @@ https://arxiv.org/pdf/2301.04104v1.pdf """ +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import ( RepresentationLayer, ) -from ray.rllib.utils.framework import try_import_tf, try_import_tfp - -_, tf, _ = try_import_tf() -tfp = try_import_tfp() class DisagreeNetworks(tf.keras.Model): @@ -23,10 +21,10 @@ class DisagreeNetworks(tf.keras.Model): TODO """ - def __init__(self, *, num_networks, model_size, intrinsic_rewards_scale): + def __init__(self, *, num_networks, model_dimension, intrinsic_rewards_scale): super().__init__(name="disagree_networks") - self.model_size = model_size + self.model_dimension = model_dimension self.num_networks = num_networks self.intrinsic_rewards_scale = intrinsic_rewards_scale @@ -36,13 +34,15 @@ def __init__(self, *, num_networks, model_size, intrinsic_rewards_scale): for _ in range(self.num_networks): self.mlps.append( MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=None, trainable=True, ) ) self.representation_layers.append( - RepresentationLayer(model_size=self.model_size, name="disagree") + RepresentationLayer( + model_dimension=self.model_dimension, name="disagree" + ) ) @tf.function diff --git a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py index f735b9e031ea3..9621c95ce3c22 100644 --- a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py @@ -7,25 +7,20 @@ import gymnasium as gym import numpy as np +import tensorflow as tf from ray.rllib.algorithms.dreamerv3.tf.models.disagree_networks import DisagreeNetworks -from ray.rllib.algorithms.dreamerv3.tf.models.actor_network import ActorNetwork -from ray.rllib.algorithms.dreamerv3.tf.models.critic_network import CriticNetwork -from ray.rllib.algorithms.dreamerv3.tf.models.world_model import WorldModel -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import inverse_symlog -_, tf, _ = try_import_tf() +from ray.rllib.utils.tf_utils import inverse_symlog class DreamerModel(tf.keras.Model): """The main tf-keras model containing all necessary components for DreamerV3. Includes: - - The world model with encoder, decoder, sequence-model (RSSM), dynamics - (generates prior z-state), and "posterior" model (generates posterior z-state). - Predicts env dynamics and produces dreamed trajectories for actor- and critic - learning. + - The world model (with encoder, decoder, sequence-model (RSSM), dynamics + (prior z-state generating) model, and "posterior" model) for producing dreamed + trajectories. - The actor network (policy). - The critic network for value function prediction. 
""" @@ -33,29 +28,32 @@ class DreamerModel(tf.keras.Model): def __init__( self, *, - model_size: str = "XS", + model_dimension: str = "XS", action_space: gym.Space, - world_model: WorldModel, - actor: ActorNetwork, - critic: CriticNetwork, + batch_size_B, + batch_length_T, + horizon_H, + world_model, + actor, + critic, use_curiosity: bool = False, intrinsic_rewards_scale: float = 0.1, ): - """Initializes a DreamerModel instance. + """TODO Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. - world_model: The WorldModel component. - actor: The ActorNetwork component. - critic: The CriticNetwork component. """ super().__init__(name="dreamer_model") - self.model_size = model_size + self.model_dimension = model_dimension self.action_space = action_space self.use_curiosity = use_curiosity + self.batch_size_B = batch_size_B + self.batch_length_T = batch_length_T + self.horizon_H = horizon_H self.world_model = world_model self.actor = actor @@ -65,7 +63,7 @@ def __init__( if self.use_curiosity: self.disagree_nets = DisagreeNetworks( num_networks=8, - model_size=self.model_size, + model_dimension=self.model_dimension, intrinsic_rewards_scale=intrinsic_rewards_scale, ) @@ -99,11 +97,11 @@ def call( actions = self.actor( h=results["h_states_BxT"], z=results["z_posterior_states_BxT"] ) - # Actor (with returning distribution parameters). - _, distr_params = self.actor( + # Actor (with returning distribution). + _, distr = self.actor( h=results["h_states_BxT"], z=results["z_posterior_states_BxT"], - return_distr_params=True, + return_distribution=True, ) # Critic. values = self.critic( @@ -157,11 +155,8 @@ def forward_inference(self, observations, previous_states, is_first, training=No is_first=is_first, ) # Compute action using our actor network and the current states. - _, distr_params = self.actor( - h=states["h"], z=states["z"], return_distr_params=True - ) + _, distr = self.actor(h=states["h"], z=states["z"], return_distribution=True) # Use the mode of the distribution (Discrete=argmax, Normal=mean). - distr = self.actor.get_action_dist_object(distr_params) actions = distr.mode() return actions, {"h": states["h"], "z": states["z"], "a": actions} @@ -272,9 +267,9 @@ def dream_trajectory( timesteps_H: The number of timesteps to dream for. gamma: The discount factor gamma. """ - # Dreamed actions (one-hot encoded for discrete actions). + # Dreamed actions (one-hot for discrete actions). a_dreamed_t0_to_H = [] - a_dreamed_dist_params_t0_to_H = [] + a_dreamed_distributions_t0_to_H = [] h = start_states["h"] z = start_states["z"] @@ -286,7 +281,7 @@ def dream_trajectory( # Compute `a` using actor network (already the first step uses a dreamed action, # not a sampled one). - a, a_dist_params = self.actor( + a, a_dist = self.actor( # We have to stop the gradients through the states. B/c we are using a # differentiable Discrete action distribution (straight through gradients # with `a = stop_gradient(sample(probs)) + probs - stop_gradient(probs)`, @@ -294,10 +289,10 @@ def dream_trajectory( # term on actions further back in the trajectory. 
h=tf.stop_gradient(h), z=tf.stop_gradient(z), - return_distr_params=True, + return_distribution=True, ) a_dreamed_t0_to_H.append(a) - a_dreamed_dist_params_t0_to_H.append(a_dist_params) + a_dreamed_distributions_t0_to_H.append(a_dist) for i in range(timesteps_H): # Move one step in the dream using the RSSM. @@ -309,13 +304,13 @@ def dream_trajectory( z_states_prior_t0_to_H.append(z) # Compute `a` using actor network. - a, a_dist_params = self.actor( + a, a_dist = self.actor( h=tf.stop_gradient(h), z=tf.stop_gradient(z), - return_distr_params=True, + return_distribution=True, ) a_dreamed_t0_to_H.append(a) - a_dreamed_dist_params_t0_to_H.append(a_dist_params) + a_dreamed_distributions_t0_to_H.append(a_dist) h_states_H_B = tf.stack(h_states_t0_to_H, axis=0) # (T, B, ...) h_states_HxB = tf.reshape(h_states_H_B, [-1] + h_states_H_B.shape.as_list()[2:]) @@ -326,7 +321,6 @@ def dream_trajectory( ) a_dreamed_H_B = tf.stack(a_dreamed_t0_to_H, axis=0) # (T, B, ...) - a_dreamed_dist_params_H_B = tf.stack(a_dreamed_dist_params_t0_to_H, axis=0) # Compute r using reward predictor. r_dreamed_H_B = tf.reshape( @@ -395,20 +389,17 @@ def dream_trajectory( ) ret = { - "h_states_t0_to_H_BxT": h_states_H_B, - "z_states_prior_t0_to_H_BxT": z_states_prior_H_B, - "rewards_dreamed_t0_to_H_BxT": r_dreamed_H_B, - "continues_dreamed_t0_to_H_BxT": c_dreamed_H_B, - "actions_dreamed_t0_to_H_BxT": a_dreamed_H_B, - # "actions_dreamed_distributions_t0_to_H_BxT": ( - # a_dreamed_distributions_t0_to_H - # ), - "actions_dreamed_dist_params_t0_to_H_BxT": a_dreamed_dist_params_H_B, - "values_dreamed_t0_to_H_BxT": v_dreamed_H_B, - "values_symlog_dreamed_logits_t0_to_HxBxT": v_symlog_dreamed_logits_HxB, - "v_symlog_dreamed_ema_t0_to_H_BxT": v_symlog_dreamed_ema_H_B, + "h_states_t0_to_H_B": h_states_H_B, + "z_states_prior_t0_to_H_B": z_states_prior_H_B, + "rewards_dreamed_t0_to_H_B": r_dreamed_H_B, + "continues_dreamed_t0_to_H_B": c_dreamed_H_B, + "actions_dreamed_t0_to_H_B": a_dreamed_H_B, + "actions_dreamed_distributions_t0_to_H_B": a_dreamed_distributions_t0_to_H, + "values_dreamed_t0_to_H_B": v_dreamed_H_B, + "values_symlog_dreamed_logits_t0_to_HxB": v_symlog_dreamed_logits_HxB, + "v_symlog_dreamed_ema_t0_to_H_B": v_symlog_dreamed_ema_H_B, # Loss weights for critic- and actor losses. - "dream_loss_weights_t0_to_H_BxT": dream_loss_weights_H_B, + "dream_loss_weights_t0_to_H_B": dream_loss_weights_H_B, } if self.use_curiosity: @@ -546,20 +537,20 @@ def dream_trajectory_with_burn_in( # an original time dimension from the real env, from all of which we then branch # out our dream trajectories). ret = { - "h_states_t0_to_H_BxT": h_states_t0_to_H_B, - "z_states_prior_t0_to_H_BxT": z_states_prior_t0_to_H_B, + "h_states_t0_to_H_B": h_states_t0_to_H_B, + "z_states_prior_t0_to_H_B": z_states_prior_t0_to_H_B, # Unfold time-ranks in predictions. - "rewards_dreamed_t0_to_H_BxT": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)), - "continues_dreamed_t0_to_H_BxT": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)), + "rewards_dreamed_t0_to_H_B": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)), + "continues_dreamed_t0_to_H_B": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)), } # Figure out action key (random, sampled from env, dreamed?). 
if use_sampled_actions_in_dream: - key = "actions_sampled_t0_to_H_BxT" + key = "actions_sampled_t0_to_H_B" elif use_random_actions_in_dream: - key = "actions_random_t0_to_H_BxT" + key = "actions_random_t0_to_H_B" else: - key = "actions_dreamed_t0_to_H_BxT" + key = "actions_dreamed_t0_to_H_B" ret[key] = a_t0_to_H_B # Also provide int-actions, if discrete action space. diff --git a/rllib/algorithms/dreamerv3/tf/models/world_model.py b/rllib/algorithms/dreamerv3/tf/models/world_model.py index 73195fc8e1a0b..39fa3e587d6ef 100644 --- a/rllib/algorithms/dreamerv3/tf/models/world_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/world_model.py @@ -6,6 +6,7 @@ from typing import Optional import gymnasium as gym +import tensorflow as tf import tree # pip install dm_tree from ray.rllib.algorithms.dreamerv3.tf.models.components.continue_predictor import ( @@ -25,13 +26,9 @@ SequenceModel, ) from ray.rllib.algorithms.dreamerv3.utils import get_gru_units -from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.tf_utils import symlog -_, tf, _ = try_import_tf() - - class WorldModel(tf.keras.Model): """WorldModel component of [1] w/ encoder, decoder, RSSM, reward/cont. predictors. @@ -59,7 +56,7 @@ class WorldModel(tf.keras.Model): def __init__( self, *, - model_size: str = "XS", + model_dimension: str = "XS", action_space: gym.Space, batch_length_T: int = 64, encoder: tf.keras.Model, @@ -70,7 +67,7 @@ def __init__( """Initializes a WorldModel instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. batch_length_T: The length (T) of the sequences used for training. The @@ -90,7 +87,7 @@ def __init__( the last decoder layer produces the exact, normalized pixel values (not a Gaussian as described in [1]!). num_gru_units: The number of GRU units to use. If None, use - `model_size` to figure out this parameter. + `model_dimension` to figure out this parameter. symlog_obs: Whether to predict decoded observations in symlog space. This should be False for image based observations. According to the paper [1] Appendix E: "NoObsSymlog: This ablation @@ -101,7 +98,7 @@ def __init__( """ super().__init__(name="world_model") - self.model_size = model_size + self.model_dimension = model_dimension self.batch_length_T = batch_length_T self.symlog_obs = symlog_obs self.action_space = action_space @@ -112,7 +109,7 @@ def __init__( # Posterior predictor consisting of an MLP and a RepresentationLayer: # [ht, lt] -> zt. self.posterior_mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=None, # In Danijar's code, the posterior predictor only has a single layer, # no matter the model size: @@ -121,15 +118,17 @@ def __init__( ) # The (posterior) z-state generating layer. self.posterior_representation_layer = RepresentationLayer( - model_size=self.model_size, + model_dimension=self.model_dimension, ) # Dynamics (prior z-state) predictor: ht -> z^t - self.dynamics_predictor = DynamicsPredictor(model_size=self.model_size) + self.dynamics_predictor = DynamicsPredictor( + model_dimension=self.model_dimension + ) # GRU for the RSSM: [at, ht, zt] -> ht+1 self.num_gru_units = get_gru_units( - model_size=self.model_size, + model_dimension=self.model_dimension, override=num_gru_units, ) # Initial h-state variable (learnt). 
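# --- Illustrative aside on the symlog transform imported above ---------------
# symlog squashes targets of very different magnitudes into a small, symmetric
# range so one fixed output head can handle rewards, values, and observations
# across environments ([1]). A sketch of the formula; the RLlib helpers
# (ray.rllib.utils.tf_utils.symlog / inverse_symlog) are assumed to implement
# the same definition.
import tensorflow as tf

def symlog(x: tf.Tensor) -> tf.Tensor:
    # sign(x) * ln(|x| + 1): near-identity around 0, logarithmic for large |x|.
    return tf.math.sign(x) * tf.math.log(tf.math.abs(x) + 1.0)

def inverse_symlog(y: tf.Tensor) -> tf.Tensor:
    # Exact inverse ("symexp"): sign(y) * (exp(|y|) - 1).
    return tf.math.sign(y) * (tf.math.exp(tf.math.abs(y)) - 1.0)

# E.g. symlog(1000.) ~= 6.9, symlog(-1000.) ~= -6.9, inverse_symlog(symlog(x)) == x.
# ----------------------------------------------------------------------------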
@@ -143,15 +142,17 @@ def __init__( ) # The actual sequence model containing the GRU layer. self.sequence_model = SequenceModel( - model_size=self.model_size, + model_dimension=self.model_dimension, action_space=self.action_space, num_gru_units=self.num_gru_units, ) # Reward Predictor: [ht, zt] -> rt. - self.reward_predictor = RewardPredictor(model_size=self.model_size) + self.reward_predictor = RewardPredictor(model_dimension=self.model_dimension) # Continue Predictor: [ht, zt] -> ct. - self.continue_predictor = ContinuePredictor(model_size=self.model_size) + self.continue_predictor = ContinuePredictor( + model_dimension=self.model_dimension + ) # Decoder: [ht, zt] -> x^t. self.decoder = decoder @@ -275,7 +276,7 @@ def forward_train(self, observations, actions, is_first, training=None): # Make actions and `is_first` time-major. actions = tf.transpose( actions, - perm=[1, 0] + list(range(2, tf.shape(actions).shape.as_list()[0])), + perm=[1, 0] + list(range(2, len(actions.shape))), # .as_list() TODO ) is_first = tf.transpose(is_first, perm=[1, 0]) @@ -342,7 +343,7 @@ def forward_train(self, observations, actions, is_first, training=None): h_BxT = tf.reshape(h_t1_to_T, shape=[-1] + h_t1_to_T.shape.as_list()[2:]) z_BxT = tf.reshape(z_t1_to_T, shape=[-1] + z_t1_to_T.shape.as_list()[2:]) - obs_distribution_means = self.decoder(h=h_BxT, z=z_BxT) + _, obs_distribution = self.decoder(h=h_BxT, z=z_BxT) # Compute (predicted) reward distributions. rewards, reward_logits = self.reward_predictor( @@ -355,11 +356,11 @@ def forward_train(self, observations, actions, is_first, training=None): ) # Return outputs for loss computation. - # Note that all shapes are [BxT, ...] (time axis already folded). + # Note that all shapes are [B, ...] (no time axis). return { # Obs. "sampled_obs_symlog_BxT": observations, - "obs_distribution_means_BxT": obs_distribution_means, + "obs_distribution_BxT": obs_distribution, # Rewards. "reward_logits_BxT": reward_logits, "rewards_BxT": rewards, diff --git a/rllib/algorithms/dreamerv3/utils/__init__.py b/rllib/algorithms/dreamerv3/utils/__init__.py deleted file mode 100644 index 592bbf9b32e82..0000000000000 --- a/rllib/algorithms/dreamerv3/utils/__init__.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Utility functions for the DreamerV3 ([1]) algorithm. - -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf -""" - -_ALLOWED_MODEL_DIMS = [ - # RLlib debug sizes (not mentioned in [1]). - "nano", - "micro", - "mini", - "XXS", - # Regular sizes (listed in table B in [1]). 
- "XS", - "S", - "M", - "L", - "XL", -] - - -def get_cnn_multiplier(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - cnn_multipliers = { - "nano": 2, - "micro": 4, - "mini": 8, - "XXS": 16, - "XS": 24, - "S": 32, - "M": 48, - "L": 64, - "XL": 96, - } - return cnn_multipliers[model_size] - - -def get_dense_hidden_units(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - dense_units = { - "nano": 16, - "micro": 32, - "mini": 64, - "XXS": 128, - "XS": 256, - "S": 512, - "M": 640, - "L": 768, - "XL": 1024, - } - return dense_units[model_size] - - -def get_gru_units(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - gru_units = { - "nano": 16, - "micro": 32, - "mini": 64, - "XXS": 128, - "XS": 256, - "S": 512, - "M": 1024, - "L": 2048, - "XL": 4096, - } - return gru_units[model_size] - - -def get_num_z_categoricals(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - gru_units = { - "nano": 4, - "micro": 8, - "mini": 16, - "XXS": 32, - "XS": 32, - "S": 32, - "M": 32, - "L": 32, - "XL": 32, - } - return gru_units[model_size] - - -def get_num_z_classes(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - gru_units = { - "nano": 4, - "micro": 8, - "mini": 16, - "XXS": 32, - "XS": 32, - "S": 32, - "M": 32, - "L": 32, - "XL": 32, - } - return gru_units[model_size] - - -def get_num_curiosity_nets(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - num_curiosity_nets = { - "nano": 8, - "micro": 8, - "mini": 16, - "XXS": 8, - "XS": 8, - "S": 8, - "M": 8, - "L": 8, - "XL": 8, - } - return num_curiosity_nets[model_size] - - -def get_num_dense_layers(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - num_dense_layers = { - "nano": 1, - "micro": 1, - "mini": 1, - "XXS": 1, - "XS": 1, - "S": 2, - "M": 3, - "L": 4, - "XL": 5, - } - return num_dense_layers[model_size] - - -def do_symlog_obs(observation_space, symlog_obs_user_setting): - # If our symlog_obs setting is NOT set specifically (it's set to "auto"), return - # True if we don't have an image observation space, otherwise return False. - - # TODO (sven): Support mixed observation spaces. 
- - is_image_space = len(observation_space.shape) in [2, 3] - return ( - not is_image_space - if symlog_obs_user_setting == "auto" - else symlog_obs_user_setting - ) diff --git a/rllib/algorithms/dreamerv3/utils/debugging.py b/rllib/algorithms/dreamerv3/utils/debugging.py deleted file mode 100644 index 1a4cf515d9f41..0000000000000 --- a/rllib/algorithms/dreamerv3/utils/debugging.py +++ /dev/null @@ -1,185 +0,0 @@ -import gymnasium as gym -import numpy as np -from PIL import Image, ImageDraw - -from gymnasium.envs.classic_control.cartpole import CartPoleEnv - -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() - - -class CartPoleDebug(CartPoleEnv): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - low = np.concatenate([np.array([0.0]), self.observation_space.low]) - high = np.concatenate([np.array([1000.0]), self.observation_space.high]) - - self.observation_space = gym.spaces.Box(low, high, shape=(5,), dtype=np.float32) - - self.timesteps_ = 0 - - def reset(self, *, seed=None, options=None): - ret = super().reset() - self.timesteps_ = 0 - obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) - return obs, ret[1] - - def step(self, action): - ret = super().step(action) - - self.timesteps_ += 1 - - obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) - reward = 0.1 * self.timesteps_ - return (obs, reward) + ret[2:] - - -gym.register("CartPoleDebug-v0", CartPoleDebug) -cartpole_env = gym.make("CartPoleDebug-v0", render_mode="rgb_array") -cartpole_env.reset() - -frozenlake_env = gym.make( - "FrozenLake-v1", render_mode="rgb_array", is_slippery=False, map_name="4x4" -) # desc=["SF", "HG"]) -frozenlake_env.reset() - - -def create_cartpole_dream_image( - dreamed_obs, # real space (not symlog'd) - dreamed_V, # real space (not symlog'd) - dreamed_a, - dreamed_r_tp1, # real space (not symlog'd) - dreamed_ri_tp1, # intrinsic reward - dreamed_c_tp1, # continue flag - value_target, # real space (not symlog'd) - initial_h, - as_tensor=False, -): - # CartPoleDebug - if dreamed_obs.shape == (5,): - # Set the state of our env to the given observation. - cartpole_env.unwrapped.state = np.array(dreamed_obs[1:], dtype=np.float32) - # Normal CartPole-v1 - else: - cartpole_env.unwrapped.state = np.array(dreamed_obs, dtype=np.float32) - - # Produce an RGB-image of the current state. - rgb_array = cartpole_env.render() - - # Add value-, action-, reward-, and continue-prediction information. - image = Image.fromarray(rgb_array) - draw_obj = ImageDraw.Draw(image) - - # fnt = ImageFont.load_default(size=40) - - draw_obj.text( - (5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0) - ) # , font=fnt.font, size=30) - draw_obj.text( - (5, 18), - f"at={'<--' if dreamed_a == 0 else '-->'} ({dreamed_a})", - fill=(0, 0, 0), - ) - draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) - if dreamed_ri_tp1 is not None: - draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) - draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) - draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) - - if dreamed_obs.shape == (5,): - draw_obj.text((20, 100), f"t={dreamed_obs[0]}", fill=(0, 0, 0)) - - # Return image. 
- np_img = np.asarray(image) - if as_tensor: - return tf.convert_to_tensor(np_img, dtype=tf.uint8) - return np_img - - -def create_frozenlake_dream_image( - dreamed_obs, # real space (not symlog'd) - dreamed_V, # real space (not symlog'd) - dreamed_a, - dreamed_r_tp1, # real space (not symlog'd) - dreamed_ri_tp1, # intrinsic reward - dreamed_c_tp1, # continue flag - value_target, # real space (not symlog'd) - initial_h, - as_tensor=False, -): - frozenlake_env.unwrapped.s = np.argmax(dreamed_obs, axis=0) - - # Produce an RGB-image of the current state. - rgb_array = frozenlake_env.render() - - # Add value-, action-, reward-, and continue-prediction information. - image = Image.fromarray(rgb_array) - draw_obj = ImageDraw.Draw(image) - - draw_obj.text((5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0)) - action_arrow = ( - "<--" - if dreamed_a == 0 - else "v" - if dreamed_a == 1 - else "-->" - if dreamed_a == 2 - else "^" - ) - draw_obj.text((5, 18), f"at={action_arrow} ({dreamed_a})", fill=(0, 0, 0)) - draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) - if dreamed_ri_tp1 is not None: - draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) - draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) - draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) - - # Return image. - np_img = np.asarray(image) - if as_tensor: - return tf.convert_to_tensor(np_img, dtype=tf.uint8) - return np_img - - -if __name__ == "__main__": - # CartPole debug. - rgb_array = create_cartpole_dream_image( - dreamed_obs=np.array([100.0, 1.0, -0.01, 1.5, 0.02]), - dreamed_V=4.3, - dreamed_a=1, - dreamed_r_tp1=1.0, - dreamed_c_tp1=True, - initial_h=0.0, - value_target=8.0, - ) - # ImageFont.load("arial.pil") - image = Image.fromarray(rgb_array) - image.show() - - # Normal CartPole. - rgb_array = create_cartpole_dream_image( - dreamed_obs=np.array([1.0, -0.01, 1.5, 0.02]), - dreamed_V=4.3, - dreamed_a=1, - dreamed_r_tp1=1.0, - dreamed_c_tp1=True, - initial_h=0.1, - value_target=8.0, - ) - # ImageFont.load("arial.pil") - image = Image.fromarray(rgb_array) - image.show() - - # Frozenlake - rgb_array = create_frozenlake_dream_image( - dreamed_obs=np.array([1.0] + [0.0] * (frozenlake_env.observation_space.n - 1)), - dreamed_V=4.3, - dreamed_a=1, - dreamed_r_tp1=1.0, - dreamed_c_tp1=True, - initial_h=0.1, - value_target=8.0, - ) - image = Image.fromarray(rgb_array) - image.show() diff --git a/rllib/algorithms/dreamerv3/utils/env_runner.py b/rllib/algorithms/dreamerv3/utils/env_runner.py deleted file mode 100644 index c8db4e8ebc073..0000000000000 --- a/rllib/algorithms/dreamerv3/utils/env_runner.py +++ /dev/null @@ -1,548 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from collections import defaultdict -from functools import partial -from typing import List, Tuple - -import gymnasium as gym -import numpy as np -from supersuit.generic_wrappers import resize_v1 -import tree # pip install dm_tree - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.core.models.base import STATE_IN, STATE_OUT -from ray.rllib.env.env_runner import EnvRunner -from ray.rllib.env.wrappers.atari_wrappers import NoopResetEnv, MaxAndSkipEnv -from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv -from ray.rllib.evaluation.metrics import RolloutMetrics -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.replay_buffers.episode_replay_buffer import _Episode as Episode -from ray.rllib.utils.numpy import one_hot - -_, tf, _ = try_import_tf() - - -class DreamerV3EnvRunner(EnvRunner): - """An environment runner to collect data from vectorized gymnasium environments.""" - - def __init__( - self, - config: AlgorithmConfig, - **kwargs, - ): - """Initializes a DreamerV3EnvRunner instance. - - Args: - config: The config to use to setup this EnvRunner. - """ - super().__init__(config=config) - - # Create the gym.vector.Env object. - # Atari env. - if self.config.env.startswith("ALE/"): - # [2]: "We down-scale the 84 × 84 grayscale images to 64 × 64 pixels so that - # we can apply the convolutional architecture of DreamerV1." - # ... - # "We follow the evaluation protocol of Machado et al. (2018) with 200M - # environment steps, action repeat of 4, a time limit of 108,000 steps per - # episode that correspond to 30 minutes of game play, no access to life - # information, full action space, and sticky actions. Because the world - # model integrates information over time, DreamerV2 does not use frame - # stacking." - # However, in Danijar's repo, Atari100k experiments are configured as: - # noop=30, 64x64x3 (no grayscaling), sticky actions=False, - # full action space=False, - wrappers = [ - partial(gym.wrappers.TimeLimit, max_episode_steps=108000), - partial(resize_v1, x_size=64, y_size=64), # resize to 64x64 - NormalizedImageEnv, - NoopResetEnv, - MaxAndSkipEnv, - ] - - self.env = gym.vector.make( - "GymV26Environment-v0", - env_id=self.config.env, - wrappers=wrappers, - num_envs=self.config.num_envs_per_worker, - asynchronous=self.config.remote_worker_envs, - make_kwargs=dict( - self.config.env_config, **{"render_mode": "rgb_array"} - ), - ) - # DeepMind Control. - elif self.config.env.startswith("DMC/"): - parts = self.config.env.split("/") - assert len(parts) == 3, ( - "ERROR: DMC env must be formatted as 'DMC/[task]/[domain]', e.g. " - f"'DMC/cartpole/swingup'! You provided '{self.config.env}'." - ) - gym.register( - "dmc_env-v0", - lambda from_pixels=True: DMCEnv( - parts[1], parts[2], from_pixels=from_pixels, channels_first=False - ), - ) - self.env = gym.vector.make( - "dmc_env-v0", - wrappers=[ActionClip], - num_envs=self.config.num_envs_per_worker, - asynchronous=self.config.remote_worker_envs, - **dict(self.config.env_config), - ) - # All other (gym) envs. 
- else: - wrappers = [] if self.config.env != "FrozenLake-v1" else [OneHot] - self.env = gym.vector.make( - self.config.env, - wrappers=wrappers, - num_envs=self.config.num_envs_per_worker, - asynchronous=self.config.remote_worker_envs, - **dict(self.config.env_config, **{"render_mode": "rgb_array"}), - ) - self.num_envs = self.env.num_envs - assert self.num_envs == self.config.num_envs_per_worker - - # Create our RLModule to compute actions with. - if self.config.share_module_between_env_runner_and_learner: - # DreamerV3 Algorithm will set this to the local Learner's module. - self.module = None - # Create our own instance of a DreamerV3RLModule (which then needs to be - # weight-synched each iteration). - else: - policy_dict, _ = self.config.get_multi_agent_setup(env=self.env) - module_spec = self.config.get_marl_module_spec(policy_dict=policy_dict) - # TODO (sven): DreamerV3 is currently single-agent only. - self.module = module_spec.build()[DEFAULT_POLICY_ID] - - self._needs_initial_reset = True - self._episodes = [None for _ in range(self.num_envs)] - - # TODO (sven): Move metrics temp storage and collection out of EnvRunner - # and RolloutWorkers. These classes should not continue tracking some data - # that they have already returned (in a call to `sample()`). Instead, the - # episode data should be analyzed where it was sent to (the Algorithm itself - # via its replay buffer, etc..). - self._done_episodes_for_metrics = [] - self._ongoing_episodes_for_metrics = defaultdict(list) - self._ts_since_last_metrics = 0 - - @override(EnvRunner) - def sample( - self, - *, - num_timesteps: int = None, - num_episodes: int = None, - explore: bool = True, - random_actions: bool = False, - with_render_data: bool = False, - ) -> Tuple[List[Episode], List[Episode]]: - """Runs and returns a sample (n timesteps or m episodes) on the environment(s). - - Timesteps or episodes are counted in total (across all vectorized - sub-environments). For example, if self.num_envs=2 and num_timesteps=10, each - sub-environment will be sampled for 5 steps. If self.num_envs=3 and - num_episodes=30, each sub-environment will be sampled for 10 episodes. - - Args: - num_timesteps: The number of timesteps to sample from the environment(s). - Note that only exactly one of `num_timesteps` or `num_episodes` must be - provided. - num_episodes: The number of full episodes to sample from the environment(s). - Note that only exactly one of `num_timesteps` or `num_episodes` must be - provided. - explore: Indicates whether to utilize exploration when picking actions. - random_actions: Whether to only use random actions. If True, the value of - `explore` is ignored. - force_reset: Whether to reset the environment(s) before starting to sample. - If False, will still reset the environment(s) if they were left in - a terminated or truncated state during previous sample calls. - with_render_data: If True, will record rendering images per timestep - in the returned Episodes. This data can be used to create video - reports. - TODO (sven): Note that this is only supported for runnign with - `num_episodes` yet. - - Returns: - A tuple consisting of a) list of Episode instances that are done and - b) list of Episode instances that are still ongoing. - """ - # If no execution details are provided, use self.config. 
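To make the sampling contract documented above concrete, a hedged usage sketch; `runner` is assumed to be an already-constructed instance of the DreamerV3EnvRunner being removed in this file:

# Timestep-based sampling returns both finished and still-ongoing episode chunks ...
done_eps, ongoing_eps = runner.sample(num_timesteps=64, random_actions=True)
# ... while episode-based sampling returns only finished episodes (the second element
# of the tuple is always an empty list).
done_eps, _ = runner.sample(num_episodes=4, with_render_data=True)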
- if num_timesteps is None and num_episodes is None: - if self.config.batch_mode == "truncate_episodes": - num_timesteps = self.config.rollout_fragment_length * self.num_envs - else: - num_episodes = self.num_envs - - # Sample n timesteps. - if num_timesteps is not None: - return self._sample_timesteps( - num_timesteps=num_timesteps, - explore=explore, - random_actions=random_actions, - force_reset=False, - ) - # Sample n episodes. - else: - # `_sample_episodes` returns only one list (with completed episodes) - # return empty list for incomplete ones. - return ( - self._sample_episodes( - num_episodes=num_episodes, - explore=explore, - random_actions=random_actions, - with_render_data=with_render_data, - ), - [], - ) - - def _sample_timesteps( - self, - num_timesteps: int, - explore: bool = True, - random_actions: bool = False, - force_reset: bool = False, - ) -> Tuple[List[Episode], List[Episode]]: - """Helper method to run n timesteps. - - See docstring of self.sample() for more details. - """ - done_episodes_to_return = [] - - # Get initial states for all `batch_size_B` rows in the forward batch. - initial_states = tree.map_structure( - lambda s: np.repeat(s, self.num_envs, axis=0), - self.module.get_initial_state(), - ) - - # Have to reset the env (on all vector sub-envs). - if force_reset or self._needs_initial_reset: - obs, _ = self.env.reset() - - self._episodes = [Episode() for _ in range(self.num_envs)] - states = initial_states - # Set is_first to True for all rows (all sub-envs just got reset). - is_first = np.ones((self.num_envs,), dtype=np.float32) - self._needs_initial_reset = False - - # Set initial obs and states in the episodes. - for i in range(self.num_envs): - self._episodes[i].add_initial_observation( - initial_observation=obs[i], - initial_state={k: s[i] for k, s in states.items()}, - ) - # Don't reset existing envs; continue in already started episodes. - else: - # Pick up stored observations and states from previous timesteps. - obs = np.stack([eps.observations[-1] for eps in self._episodes]) - # Compile the initial state for each batch row: If episode just started, use - # model's initial state, if not, use state stored last in Episode. - states = { - k: np.stack( - [ - initial_states[k][i] if eps.states is None else eps.states[k] - for i, eps in enumerate(self._episodes) - ] - ) - for k in initial_states.keys() - } - # If a batch row is at the beginning of an episode, set its `is_first` flag - # to 1.0, otherwise 0.0. - is_first = np.zeros((self.num_envs,), dtype=np.float32) - for i, eps in enumerate(self._episodes): - if eps.states is None: - is_first[i] = 1.0 - - # Loop through env for n timesteps. - ts = 0 - while ts < num_timesteps: - # Act randomly. - if random_actions: - actions = self.env.action_space.sample() - # Compute an action using our RLModule. - else: - batch = { - STATE_IN: tree.map_structure( - lambda s: tf.convert_to_tensor(s), states - ), - SampleBatch.OBS: tf.convert_to_tensor(obs), - "is_first": tf.convert_to_tensor(is_first), - } - # Explore or not. - if explore: - outs = self.module.forward_exploration(batch) - else: - outs = self.module.forward_inference(batch) - - # Model outputs one-hot actions (if discrete). Convert to int actions - # as well. 
- actions = outs[SampleBatch.ACTIONS].numpy() - if isinstance(self.env.single_action_space, gym.spaces.Discrete): - actions = np.argmax(actions, axis=-1) - states = tree.map_structure(lambda s: s.numpy(), outs[STATE_OUT]) - - obs, rewards, terminateds, truncateds, infos = self.env.step(actions) - ts += self.num_envs - - for i in range(self.num_envs): - s = {k: s[i] for k, s in states.items()} - # The last entry in self.observations[i] is already the reset - # obs of the new episode. - if terminateds[i] or truncateds[i]: - # Finish the episode with the actual terminal observation stored in - # the info dict. - self._episodes[i].add_timestep( - infos["final_observation"][i], - actions[i], - rewards[i], - state=s, - is_terminated=terminateds[i], - is_truncated=truncateds[i], - ) - # Reset h-states to the model's initial ones b/c we are starting a - # new episode. - for k, v in self.module.get_initial_state().items(): - states[k][i] = v.numpy() - is_first[i] = True - done_episodes_to_return.append(self._episodes[i]) - # Create a new episode object. - self._episodes[i] = Episode(observations=[obs[i]], states=s) - else: - self._episodes[i].add_timestep( - obs[i], actions[i], rewards[i], state=s - ) - is_first[i] = False - - # Return done episodes ... - self._done_episodes_for_metrics.extend(done_episodes_to_return) - # ... and all ongoing episode chunks. Also, make sure, we return - # a copy and start new chunks so that callers of this function - # don't alter our ongoing and returned Episode objects. - ongoing_episodes = self._episodes - self._episodes = [eps.create_successor() for eps in self._episodes] - for eps in ongoing_episodes: - self._ongoing_episodes_for_metrics[eps.id_].append(eps) - - self._ts_since_last_metrics += ts - - return done_episodes_to_return, ongoing_episodes - - def _sample_episodes( - self, - num_episodes: int, - explore: bool = True, - random_actions: bool = False, - with_render_data: bool = False, - ) -> List[Episode]: - """Helper method to run n episodes. - - See docstring of `self.sample()` for more details. - """ - done_episodes_to_return = [] - - obs, _ = self.env.reset() - episodes = [Episode() for _ in range(self.num_envs)] - - # Multiply states n times according to our vector env batch size (num_envs). 
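Two small NumPy idioms recur throughout this runner: converting the module's one-hot action outputs to int actions for Discrete spaces, and broadcasting a single initial state across all vector sub-envs. A minimal sketch with illustrative shapes:

import numpy as np
import tree  # pip install dm_tree

# One-hot action outputs -> int actions (the Discrete action-space case above).
one_hot_actions = np.array([[0.0, 1.0], [1.0, 0.0]], dtype=np.float32)
int_actions = np.argmax(one_hot_actions, axis=-1)  # -> array([1, 0])

# Repeat the module's (batch-size 1) initial state across num_envs=4 sub-envs.
initial_state = {"h": np.zeros((1, 8), dtype=np.float32)}
batched_state = tree.map_structure(lambda s: np.repeat(s, 4, axis=0), initial_state)
assert batched_state["h"].shape == (4, 8)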
- states = tree.map_structure( - lambda s: np.repeat(s, self.num_envs, axis=0), - self.module.get_initial_state(), - ) - is_first = np.ones((self.num_envs,), dtype=np.float32) - - render_images = [None] * self.num_envs - if with_render_data: - render_images = [e.render() for e in self.env.envs] - - for i in range(self.num_envs): - episodes[i].add_initial_observation( - initial_observation=obs[i], - initial_state={k: s[i] for k, s in states.items()}, - initial_render_image=render_images[i], - ) - - eps = 0 - while eps < num_episodes: - if random_actions: - actions = self.env.action_space.sample() - else: - batch = { - STATE_IN: tree.map_structure( - lambda s: tf.convert_to_tensor(s), states - ), - SampleBatch.OBS: tf.convert_to_tensor(obs), - "is_first": tf.convert_to_tensor(is_first), - } - - if explore: - outs = self.module.forward_exploration(batch) - else: - outs = self.module.forward_inference(batch) - - actions = outs[SampleBatch.ACTIONS].numpy() - if isinstance(self.env.single_action_space, gym.spaces.Discrete): - actions = np.argmax(actions, axis=-1) - states = tree.map_structure(lambda s: s.numpy(), outs[STATE_OUT]) - - obs, rewards, terminateds, truncateds, infos = self.env.step(actions) - if with_render_data: - render_images = [e.render() for e in self.env.envs] - - for i in range(self.num_envs): - s = {k: s[i] for k, s in states.items()} - # The last entry in self.observations[i] is already the reset - # obs of the new episode. - if terminateds[i] or truncateds[i]: - eps += 1 - - episodes[i].add_timestep( - infos["final_observation"][i], - actions[i], - rewards[i], - state=s, - is_terminated=terminateds[i], - is_truncated=truncateds[i], - ) - done_episodes_to_return.append(episodes[i]) - - # Also early-out if we reach the number of episodes within this - # for-loop. - if eps == num_episodes: - break - - # Reset h-states to the model's initial ones b/c we are starting a - # new episode. - for k, v in self.module.get_initial_state().items(): - states[k][i] = v.numpy() - is_first[i] = True - - episodes[i] = Episode( - observations=[obs[i]], - states=s, - render_images=[render_images[i]], - ) - else: - episodes[i].add_timestep( - obs[i], - actions[i], - rewards[i], - state=s, - render_image=render_images[i], - ) - is_first[i] = False - - self._done_episodes_for_metrics.extend(done_episodes_to_return) - self._ts_since_last_metrics += sum(len(eps) for eps in done_episodes_to_return) - - # If user calls sample(num_timesteps=..) after this, we must reset again - # at the beginning. - self._needs_initial_reset = True - - return done_episodes_to_return - - # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this - # API. Instead Algorithm should compile episode metrics itself via its local - # buffer. - def get_metrics(self) -> List[RolloutMetrics]: - # Compute per-episode metrics (only on already completed episodes). - metrics = [] - for eps in self._done_episodes_for_metrics: - episode_length = len(eps) - episode_reward = eps.get_return() - # Don't forget about the already returned chunks of this episode. 
- if eps.id_ in self._ongoing_episodes_for_metrics: - for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: - episode_length += len(eps2) - episode_reward += eps2.get_return() - del self._ongoing_episodes_for_metrics[eps.id_] - - metrics.append( - RolloutMetrics( - episode_length=episode_length, - episode_reward=episode_reward, - ) - ) - - self._done_episodes_for_metrics.clear() - self._ts_since_last_metrics = 0 - - return metrics - - # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this - # API. Replace by proper state overriding via `EnvRunner.set_state()` - def set_weights(self, weights, global_vars=None): - """Writes the weights of our (single-agent) RLModule.""" - if self.module is None: - assert self.config.share_module_between_env_runner_and_learner - else: - self.module.set_state(weights[DEFAULT_POLICY_ID]) - - @override(EnvRunner) - def assert_healthy(self): - # Make sure, we have built our gym.vector.Env and RLModule properly. - assert self.env and self.module - - @override(EnvRunner) - def stop(self): - # Close our env object via gymnasium's API. - self.env.close() - - -class NormalizedImageEnv(gym.ObservationWrapper): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.observation_space = gym.spaces.Box( - -1.0, - 1.0, - shape=self.observation_space.shape, - dtype=np.float32, - ) - - # Divide by scale and center around 0.0, such that observations are in the range - # of -1.0 and 1.0. - def observation(self, observation): - return (observation.astype(np.float32) / 128.0) - 1.0 - - -class OneHot(gym.ObservationWrapper): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.observation_space = gym.spaces.Box( - 0.0, 1.0, shape=(self.observation_space.n,), dtype=np.float32 - ) - - def reset(self, **kwargs): - ret = self.env.reset(**kwargs) - return self._get_obs(ret[0]), ret[1] - - def step(self, action): - ret = self.env.step(action) - return self._get_obs(ret[0]), ret[1], ret[2], ret[3], ret[4] - - def _get_obs(self, obs): - return one_hot(obs, depth=self.observation_space.shape[0]) - - -class ActionClip(gym.ActionWrapper): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._low = -1.0 - self._high = 1.0 - self.action_space = gym.spaces.Box( - self._low, - self._high, - self.action_space.shape, - self.action_space.dtype, - ) - - def action(self, action): - return np.clip(action, self._low, self._high) diff --git a/rllib/algorithms/dreamerv3/utils/summaries.py b/rllib/algorithms/dreamerv3/utils/summaries.py deleted file mode 100644 index d781a33e40d6b..0000000000000 --- a/rllib/algorithms/dreamerv3/utils/summaries.py +++ /dev/null @@ -1,329 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -import numpy as np - -from ray.rllib.algorithms.dreamerv3.utils.debugging import ( - create_cartpole_dream_image, - create_frozenlake_dream_image, -) -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.tf_utils import inverse_symlog - - -def _summarize(*, results, data_to_summarize, keys_to_log, include_histograms=False): - for k in keys_to_log: - if data_to_summarize[k].shape == (): - results.update({k: data_to_summarize[k]}) - elif include_histograms: - results.update({k: data_to_summarize[k]}) - - -def reconstruct_obs_from_h_and_z( - h_t0_to_H, - z_t0_to_H, - dreamer_model, - obs_dims_shape, -): - """Returns""" - shape = h_t0_to_H.shape - T = shape[0] # inputs are time-major - B = shape[1] - # Compute actual observations using h and z and the decoder net. - # Note that the last h-state (T+1) is NOT used here as it's already part of - # a new trajectory. - # Use mean() of the Gaussian, no sample! -> No need to construct dist object here. - reconstructed_obs_distr_means_TxB = dreamer_model.world_model.decoder( - # Fold time rank. - h=np.reshape(h_t0_to_H, (T * B, -1)), - z=np.reshape(z_t0_to_H, (T * B,) + z_t0_to_H.shape[2:]), - ) - # Unfold time rank again. - reconstructed_obs_T_B = np.reshape( - reconstructed_obs_distr_means_TxB, (T, B) + obs_dims_shape - ) - # Return inverse symlog'd (real env obs space) reconstructed observations. - return reconstructed_obs_T_B - - -def report_dreamed_trajectory( - *, - results, - env, - dreamer_model, - obs_dims_shape, - batch_indices=(0,), - desc=None, - include_images=True, -): - if not include_images: - return - - dream_data = results["dream_data"] - dreamed_obs_H_B = reconstruct_obs_from_h_and_z( - h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], - z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], - dreamer_model=dreamer_model, - obs_dims_shape=obs_dims_shape, - ) - func = ( - create_cartpole_dream_image - if env.startswith("CartPole") - else create_frozenlake_dream_image - ) - # Take 0th dreamed trajectory and produce series of images. - for b in batch_indices: - images = [] - for t in range(len(dreamed_obs_H_B) - 1): - images.append( - func( - dreamed_obs=dreamed_obs_H_B[t][b], - dreamed_V=dream_data["values_dreamed_t0_to_H_BxT"][t][b], - dreamed_a=(dream_data["actions_ints_dreamed_t0_to_H_BxT"][t][b]), - dreamed_r_tp1=(dream_data["rewards_dreamed_t0_to_H_BxT"][t + 1][b]), - # `DISAGREE_intrinsic_rewards_H_B` are shifted by 1 already - # (from t1 to H, not t0 to H like all other data here). - dreamed_ri_tp1=( - results["DISAGREE_intrinsic_rewards_H_BxT"][t][b] - if "DISAGREE_intrinsic_rewards_H_BxT" in results - else None - ), - dreamed_c_tp1=( - dream_data["continues_dreamed_t0_to_H_BxT"][t + 1][b] - ), - value_target=results["VALUE_TARGETS_H_BxT"][t][b], - initial_h=dream_data["h_states_t0_to_H_BxT"][t][b], - as_tensor=True, - ).numpy() - ) - # Concat images along width-axis (so they show as a "film sequence" next to each - # other). - results.update( - { - f"dreamed_trajectories{('_'+desc) if desc else ''}_B{b}": ( - np.concatenate(images, axis=1) - ), - } - ) - - -def report_predicted_vs_sampled_obs( - *, - results, - sample, - batch_size_B, - batch_length_T, - symlog_obs: bool = True, -): - """Summarizes sampled data (from the replay buffer) vs world-model predictions. - - World model predictions are based on the posterior states (z computed from actual - observation encoder input + the current h-states). 
- - Observations: Computes MSE (sampled vs predicted/recreated) over all features. - For image observations, also creates direct image comparisons (sampled images - vs predicted (posterior) ones). - Rewards: Compute MSE (sampled vs predicted). - Continues: Compute MSE (sampled vs predicted). - - Args: - results: The results dict that was returned by `LearnerGroup.update()`. - sample: The sampled data (dict) from the replay buffer. Already tf-tensor - converted. - batch_size_B: The batch size (B). This is the number of trajectories sampled - from the buffer. - batch_length_T: The batch length (T). This is the length of an individual - trajectory sampled from the buffer. - """ - predicted_observation_means_BxT = results[ - "WORLD_MODEL_fwd_out_obs_distribution_means_BxT" - ] - _report_obs( - results=results, - computed_float_obs_B_T_dims=np.reshape( - predicted_observation_means_BxT, - (batch_size_B, batch_length_T) + sample[SampleBatch.OBS].shape[2:], - ), - sampled_obs_B_T_dims=sample[SampleBatch.OBS], - descr_prefix="WORLD_MODEL", - descr_obs=f"predicted_posterior_T{batch_length_T}", - symlog_obs=symlog_obs, - ) - - -def report_dreamed_eval_trajectory_vs_samples( - *, - results, - dream_data, - sample, - burn_in_T, - dreamed_T, - dreamer_model, - symlog_obs: bool = True, -): - # Obs MSE. - dreamed_obs_T_B = reconstruct_obs_from_h_and_z( - h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], - z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], - dreamer_model=dreamer_model, - obs_dims_shape=sample[SampleBatch.OBS].shape[2:], - ) - t0 = burn_in_T - 1 - tH = t0 + dreamed_T - # Observation MSE and - if applicable - images comparisons. - mse_sampled_vs_dreamed_obs = _report_obs( - results=results, - # Have to transpose b/c dreamed data is time-major. - computed_float_obs_B_T_dims=np.transpose( - dreamed_obs_T_B, - axes=[1, 0] + list(range(2, len(dreamed_obs_T_B.shape))), - ), - sampled_obs_B_T_dims=sample[SampleBatch.OBS][:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_obs=f"dreamed_prior_H{dreamed_T}", - symlog_obs=symlog_obs, - ) - - # Reward MSE. - _report_rewards( - results=results, - computed_rewards=dream_data["rewards_dreamed_t0_to_H_BxT"], - sampled_rewards=sample[SampleBatch.REWARDS][:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_reward=f"dreamed_prior_H{dreamed_T}", - ) - - # Continues MSE. - _report_continues( - results=results, - computed_continues=dream_data["continues_dreamed_t0_to_H_BxT"], - sampled_continues=(1.0 - sample["is_terminated"])[:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_cont=f"dreamed_prior_H{dreamed_T}", - ) - return mse_sampled_vs_dreamed_obs - - -def report_sampling_and_replay_buffer(*, replay_buffer): - episodes_in_buffer = replay_buffer.get_num_episodes() - ts_in_buffer = replay_buffer.get_num_timesteps() - replayed_steps = replay_buffer.get_sampled_timesteps() - added_steps = replay_buffer.get_added_timesteps() - - # Summarize buffer, sampling, and train ratio stats. - return { - "BUFFER_capacity": replay_buffer.capacity, - "BUFFER_size_num_episodes": episodes_in_buffer, - "BUFFER_size_timesteps": ts_in_buffer, - "BUFFER_replayed_steps": replayed_steps, - "BUFFER_added_steps": added_steps, - } - - -def _report_obs( - *, - results, - computed_float_obs_B_T_dims, - sampled_obs_B_T_dims, - descr_prefix=None, - descr_obs, - symlog_obs, -): - """Summarizes computed- vs sampled observations: MSE and (if applicable) images. - - Args: - computed_float_obs_B_T_dims: Computed float observations - (not clipped, not cast'd). Shape=(B, T, [dims ...]). 
- sampled_obs_B_T_dims: Sampled observations (as-is from the environment, meaning - this could be uint8, 0-255 clipped images). Shape=(B, T, [dims ...]). - B: The batch size B (see shapes of `computed_float_obs_B_T_dims` and - `sampled_obs_B_T_dims` above). - T: The batch length T (see shapes of `computed_float_obs_B_T_dims` and - `sampled_obs_B_T_dims` above). - descr: A string used to describe the computed data to be used in the TB - summaries. - """ - # Videos: Create summary, comparing computed images with actual sampled ones. - # 4=[B, T, w, h] grayscale image; 5=[B, T, w, h, C] RGB image. - if len(sampled_obs_B_T_dims.shape) in [4, 5]: - descr_prefix = (descr_prefix + "_") if descr_prefix else "" - - if symlog_obs: - computed_float_obs_B_T_dims = inverse_symlog(computed_float_obs_B_T_dims) - - # Restore image pixels from normalized (non-symlog'd) data. - if not symlog_obs: - computed_float_obs_B_T_dims = (computed_float_obs_B_T_dims + 1.0) * 128 - sampled_obs_B_T_dims = (sampled_obs_B_T_dims + 1.0) * 128 - sampled_obs_B_T_dims = np.clip(sampled_obs_B_T_dims, 0.0, 255.0).astype( - np.uint8 - ) - computed_images = np.clip(computed_float_obs_B_T_dims, 0.0, 255.0).astype( - np.uint8 - ) - # Concat sampled and computed images along the height axis (3) such that - # real images show below respective predicted ones. - # (B, T, C, h, w) - sampled_vs_computed_images = np.concatenate( - [computed_images, sampled_obs_B_T_dims], - axis=3, - ) - # Add grayscale dim, if necessary. - if len(sampled_obs_B_T_dims.shape) == 2 + 2: - sampled_vs_computed_images = np.expand_dims(sampled_vs_computed_images, -1) - - results.update( - {f"{descr_prefix}sampled_vs_{descr_obs}_videos": sampled_vs_computed_images} - ) - - # return mse_sampled_vs_computed_obs - - -def _report_rewards( - *, - results, - computed_rewards, - sampled_rewards, - descr_prefix=None, - descr_reward, -): - descr_prefix = (descr_prefix + "_") if descr_prefix else "" - mse_sampled_vs_computed_rewards = np.mean( - np.square(computed_rewards - sampled_rewards) - ) - mse_sampled_vs_computed_rewards = np.mean(mse_sampled_vs_computed_rewards) - results.update( - { - f"{descr_prefix}sampled_vs_{descr_reward}_rewards_mse": ( - mse_sampled_vs_computed_rewards - ), - } - ) - - -def _report_continues( - *, - results, - computed_continues, - sampled_continues, - descr_prefix=None, - descr_cont, -): - descr_prefix = (descr_prefix + "_") if descr_prefix else "" - # Continue MSE. - mse_sampled_vs_computed_continues = np.mean( - np.square(computed_continues - sampled_continues.astype(np.float32)) - ) - results.update( - { - f"{descr_prefix}sampled_vs_{descr_cont}_continues_mse": ( - mse_sampled_vs_computed_continues - ), - } - ) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 81cb8d0627bde..d435e469b23ce 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -482,12 +482,12 @@ def training_step(self) -> ResultDict: # workers. 
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: if self.workers.num_remote_workers() > 0: - from_worker_or_learner_group = None + from_worker_or_trainer = None if self.config._enable_learner_api: # sync weights from learner_group to all rollout workers - from_worker_or_learner_group = self.learner_group + from_worker_or_trainer = self.learner_group self.workers.sync_weights( - from_worker_or_learner_group=from_worker_or_learner_group, + from_worker_or_trainer=from_worker_or_trainer, policies=policies_to_update, global_vars=global_vars, ) diff --git a/rllib/algorithms/registry.py b/rllib/algorithms/registry.py index 5352814f5e4e4..5387420cc5230 100644 --- a/rllib/algorithms/registry.py +++ b/rllib/algorithms/registry.py @@ -114,12 +114,6 @@ def _import_dreamer(): return dreamer.Dreamer, dreamer.Dreamer.get_default_config() -def _import_dreamerv3(): - import ray.rllib.algorithms.dreamerv3 as dreamerv3 - - return dreamerv3.DreamerV3, dreamerv3.DreamerV3.get_default_config() - - def _import_dt(): import ray.rllib.algorithms.dt as dt @@ -245,7 +239,6 @@ def _import_leela_chess_zero(): "DDPPO": _import_ddppo, "DQN": _import_dqn, "Dreamer": _import_dreamer, - "DreamerV3": _import_dreamerv3, "DT": _import_dt, "IMPALA": _import_impala, "APPO": _import_appo, @@ -285,7 +278,6 @@ def _import_leela_chess_zero(): "DDPPO": "DDPPO", "DQN": "DQN", "Dreamer": "Dreamer", - "DreamerV3": "DreamerV3", "DT": "DT", "Impala": "IMPALA", "APPO": "APPO", diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 2bde70aa69ea4..9bbff1f7f0877 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -147,12 +147,15 @@ def test_detect_atari_env(self): config = AlgorithmConfig().environment( env="ALE/Breakout-v5", env_config={"frameskip": 1} ) + config.validate() self.assertTrue(config.is_atari) config = AlgorithmConfig().environment(env="ALE/Pong-v5") + config.validate() self.assertTrue(config.is_atari) config = AlgorithmConfig().environment(env="CartPole-v1") + config.validate() # We do not auto-detect callable env makers for Atari envs. self.assertFalse(config.is_atari) @@ -163,10 +166,12 @@ def test_detect_atari_env(self): make_kwargs={"frameskip": 1}, ) ) + config.validate() # We do not auto-detect callable env makers for Atari envs. self.assertFalse(config.is_atari) config = AlgorithmConfig().environment(env="NotAtari") + config.validate() self.assertFalse(config.is_atari) def test_rl_module_api(self): diff --git a/rllib/core/learner/tf/tf_learner.py b/rllib/core/learner/tf/tf_learner.py index 2cc22a725cf1b..2cb9cdeb049aa 100644 --- a/rllib/core/learner/tf/tf_learner.py +++ b/rllib/core/learner/tf/tf_learner.py @@ -376,7 +376,7 @@ def _make_distributed_strategy_if_necessary(self) -> "tf.distribute.Strategy": devices = tf.config.list_logical_devices("GPU") assert self._local_gpu_idx < len(devices), ( f"local_gpu_idx {self._local_gpu_idx} is not a valid GPU id or is " - "not available." + " not available." ) local_gpu = [devices[self._local_gpu_idx].name] strategy = tf.distribute.MirroredStrategy(devices=local_gpu) @@ -431,11 +431,10 @@ def helper(_batch): # in-efficient. However, for tf>=2.12, it works also w/o this conversion # so remove this after we upgrade officially to tf==2.12. 
_batch = NestedDict(_batch) - with tf.GradientTape(persistent=True) as tape: + with tf.GradientTape() as tape: fwd_out = self._module.forward_train(_batch) loss_per_module = self.compute_loss(fwd_out=fwd_out, batch=_batch) gradients = self.compute_gradients(loss_per_module, gradient_tape=tape) - del tape postprocessed_gradients = self.postprocess_gradients(gradients) self.apply_gradients(postprocessed_gradients) diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index 6aed0b9850521..b6478d51d09d0 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -285,19 +285,7 @@ class RLModule(abc.ABC): def __init__(self, config: RLModuleConfig): self.config = config - # Make sure, `setup()` is only called once, no matter what. In some cases - # of multiple inheritance (and with our __post_init__ functionality in place, - # this might get called twice. - if hasattr(self, "_is_setup") and self._is_setup: - raise RuntimeError( - "`RLModule.setup()` called twice within your RLModule implementation " - f"{self}! Make sure you are using the proper inheritance order " - "(TorchRLModule before [Algo]RLModule) or (TfRLModule before " - "[Algo]RLModule) and that you are using `super().__init__(...)` in " - "your custom constructor." - ) self.setup() - self._is_setup = True def __init_subclass__(cls, **kwargs): # Automatically add a __post_init__ method to all subclasses of RLModule. diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 21b2601b7e05f..100b815d2b621 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -356,9 +356,7 @@ def num_remote_worker_restarts(self) -> int: def sync_weights( self, policies: Optional[List[PolicyID]] = None, - from_worker_or_learner_group: Optional[ - Union[RolloutWorker, LearnerGroup] - ] = None, + from_worker_or_trainer: Optional[Union[RolloutWorker, LearnerGroup]] = None, to_worker_indices: Optional[List[int]] = None, global_vars: Optional[Dict[str, TensorType]] = None, timeout_seconds: Optional[int] = 0, @@ -371,7 +369,7 @@ def sync_weights( Args: policies: Optional list of PolicyIDs to sync weights for. If None (default), sync weights to/from all policies. - from_worker_or_learner_group: Optional (local) RolloutWorker instance or + from_worker_or_trainer: Optional (local) RolloutWorker instance or LearnerGroup instance to sync from. If None (default), sync from this WorkerSet's local worker. to_worker_indices: Optional list of worker indices to sync the @@ -383,16 +381,16 @@ def sync_weights( for any sync calls to finish). This significantly improves algorithm performance. """ - if self.local_worker() is None and from_worker_or_learner_group is None: + if self.local_worker() is None and from_worker_or_trainer is None: raise TypeError( - "No `local_worker` in WorkerSet, must provide " - "`from_worker_or_learner_group` arg in `sync_weights()`!" + "No `local_worker` in WorkerSet, must provide `from_worker` " + "arg in `sync_weights()`!" ) # Only sync if we have remote workers or `from_worker_or_trainer` is provided. 
weights = None - if self.num_remote_workers() or from_worker_or_learner_group is not None: - weights_src = from_worker_or_learner_group or self.local_worker() + if self.num_remote_workers() or from_worker_or_trainer is not None: + weights_src = from_worker_or_trainer or self.local_worker() if weights_src is None: raise ValueError( @@ -416,10 +414,10 @@ def set_weight(w): timeout_seconds=timeout_seconds, ) - # If `from_worker_or_learner_group` is provided, also sync to this WorkerSet's + # If `from_worker` is provided, also sync to this WorkerSet's # local worker. if self.local_worker() is not None: - if from_worker_or_learner_group is not None: + if from_worker_or_trainer is not None: self.local_worker().set_weights(weights, global_vars=global_vars) # If `global_vars` is provided and local worker exists -> Update its # global_vars. diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index 4df6b2724fb3d..8a4093fb0e2d5 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -870,12 +870,7 @@ def _compute_actions_helper_rl_module_explore( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - if action_dist is None: - raise KeyError( - "Your RLModule's `forward_exploration()` method must return a dict" - f"with either the {SampleBatch.ACTIONS} key or the " - f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" - ) + assert action_dist actions = action_dist.sample() # Anything but action_dist and state_out is an extra fetch @@ -931,12 +926,7 @@ def _compute_actions_helper_rl_module_inference( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - if action_dist is None: - raise KeyError( - "Your RLModule's `forward_inference()` method must return a dict" - f"with either the {SampleBatch.ACTIONS} key or the " - f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" - ) + assert action_dist actions = action_dist.sample() # Anything but action_dist and state_out is an extra fetch diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index bef3c070d81a4..4165da80a1f8d 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1147,12 +1147,7 @@ def _compute_action_helper( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - if action_dist is None: - raise KeyError( - "Your RLModule's `forward_exploration()` method must return" - f" a dict with either the {SampleBatch.ACTIONS} key or the " - f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" - ) + assert action_dist actions = action_dist.sample() # Compute action-logp and action-prob from distribution and add to @@ -1176,12 +1171,7 @@ def _compute_action_helper( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - if action_dist is None: - raise KeyError( - "Your RLModule's `forward_inference()` method must return" - f" a dict with either the {SampleBatch.ACTIONS} key or the " - f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" - ) + assert action_dist actions = action_dist.sample() # Anything but actions and state_out is an extra fetch. 
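A condensed usage sketch of the renamed `sync_weights()` argument, mirroring the Algorithm/PPO hunks above; `config`, `workers` (a WorkerSet), `learner_group`, `policies_to_update`, and `global_vars` are assumed to already exist in a `training_step()` context:

from_worker_or_trainer = None
if config._enable_learner_api:
    # With the Learner API enabled, the up-to-date weights live on the LearnerGroup.
    from_worker_or_trainer = learner_group
workers.sync_weights(
    from_worker_or_trainer=from_worker_or_trainer,  # None -> sync from the local worker
    policies=policies_to_update,
    global_vars=global_vars,
)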
diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index 0f945dd7db82c..0a9303a9e47d5 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -57,12 +57,6 @@ action="store_true", help="Run ray in local mode for easier debugging.", ) -parser.add_argument( - "--num-samples", - type=int, - default=1, - help="The number of seeds/samples to run with the given experiment config.", -) parser.add_argument( "--override-mean-reward", type=float, @@ -109,14 +103,12 @@ # Loop through all collected files. for file in files: - config_is_python = False # For python files, need to make sure, we only deliver the module name into the # `load_experiments_from_file` function (everything from "/ray/rllib" on). if file.endswith(".py"): if file.endswith("__init__.py"): # weird CI learning test (BAZEL) case continue experiments = load_experiments_from_file(file, SupportedFileType.python) - config_is_python = True else: experiments = load_experiments_from_file(file, SupportedFileType.yaml) @@ -126,16 +118,13 @@ exp = list(experiments.values())[0] - # Set the number of samples to run. - exp["num_samples"] = args.num_samples - # Override framework setting with the command line one, if provided. # Otherwise, will use framework setting in file (or default: torch). if args.framework is not None: exp["config"]["framework"] = args.framework # Override env setting if given on command line. if args.env is not None: - exp["config"]["env"] = exp["env"] = args.env + exp["config"]["env"] = args.env # Override the mean reward if specified. This is used by the ray ci # for overriding the episode reward mean for tf2 tests for off policy @@ -150,23 +139,19 @@ print(f"Skipping framework='{args.framework}' for QMIX.") continue - # Always run with eager-tracing when framework=tf2, if not in local-mode - # and unless the yaml explicitly tells us to disable eager tracing. + # Always run with eager-tracing when framework=tf2 if not in local-mode. + # Ignore this if the yaml explicitly tells us to disable eager tracing if ( - (args.framework == "tf2" or exp["config"].get("framework") == "tf2") + args.framework == "tf2" and not args.local_mode - # Note: This check will always fail for python configs, b/c normally, - # algorithm configs have `self.eager_tracing=False` by default. - # Thus, you'd have to set `eager_tracing` to True explicitly in your python - # config to make sure we are indeed using eager tracing. - and exp["config"].get("eager_tracing") is not False + and not exp["config"].get("eager_tracing") is False ): + exp["config"]["eager_tracing"] = True - # Print out the actual config (not for py files as yaml.dump weirdly fails). - if not config_is_python: - print("== Test config ==") - print(yaml.dump(experiments)) + # Print out the actual config. + print("== Test config ==") + print(yaml.dump(experiments)) # Try running each test 3 times and make sure it reaches the given # reward. diff --git a/rllib/tuned_examples/dreamerv3/__init__.py b/rllib/tuned_examples/dreamerv3/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib/tuned_examples/dreamerv3/atari_100k.py b/rllib/tuned_examples/dreamerv3/atari_100k.py deleted file mode 100644 index ef6731d6e2e2a..0000000000000 --- a/rllib/tuned_examples/dreamerv3/atari_100k.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" - -# Run with: -# python run_regression_tests.py --dir [this file] --env ALE/[gym ID e.g. Pong-v5] - -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - - -# Number of GPUs to run on. -num_gpus = 1 - -config = ( - DreamerV3Config() - # Switch on eager_tracing by default. - .framework("tf2", eager_tracing=True) - .resources( - num_learner_workers=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner_worker=1 if num_gpus else 0, - num_cpus_for_local_worker=1, - ) - # TODO (sven): concretize this: If you use >1 GPU and increase the batch size - # accordingly, you might also want to increase the number of envs per worker - .rollouts( - num_envs_per_worker=(num_gpus or 1), - remote_worker_envs=True, - ) - .environment( - # [2]: "We follow the evaluation protocol of Machado et al. (2018) with 200M - # environment steps, action repeat of 4, a time limit of 108,000 steps per - # episode that correspond to 30 minutes of game play, no access to life - # information, full action space, and sticky actions. Because the world model - # integrates information over time, DreamerV2 does not use frame stacking. - # The experiments use a single-task setup where a separate agent is trained - # for each game. Moreover, each agent uses only a single environment instance. - env_config={ - # "sticky actions" but not according to Danijar's 100k configs. - "repeat_action_probability": 0.0, - # "full action space" but not according to Danijar's 100k configs. - "full_action_space": False, - # Already done by MaxAndSkip wrapper: "action repeat" == 4. - "frameskip": 1, - } - ) - .reporting( - metrics_num_episodes_for_smoothing=(num_gpus or 1), - report_images_and_videos=False, - report_dream_data=False, - report_individual_batch_item_stats=False, - ) - # See Appendix A. - .training( - model_size="S", - training_ratio=1024, - batch_size_B=16 * (num_gpus or 1), - # TODO - model={ - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "S", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/cartpole.py b/rllib/tuned_examples/dreamerv3/cartpole.py deleted file mode 100644 index b270d6c3b3137..0000000000000 --- a/rllib/tuned_examples/dreamerv3/cartpole.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] - -config = ( - DreamerV3Config() - .environment("CartPole-v1") - .training( - model_size="XS", - training_ratio=1024, - # TODO - model={ - "batch_size_B": 16, - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "XS", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py deleted file mode 100644 index a8938ce142123..0000000000000 --- a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] --env DMC/[task]/[domain] -# e.g. --env=DMC/cartpole/swingup - -config = ( - DreamerV3Config() - # Use image observations. - .environment(env_config={"from_pixels": True}) - .resources( - num_learner_workers=1, - num_gpus_per_learner_worker=1, - num_cpus_for_local_worker=1, - ) - .rollouts(num_envs_per_worker=4, remote_worker_envs=True) - # See Appendix A. - .training( - model_size="S", - training_ratio=512, - # TODO - model={ - "batch_size_B": 16, - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "S", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py b/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py deleted file mode 100644 index 03e9b40def8a3..0000000000000 --- a/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] - -config = ( - DreamerV3Config() - .environment( - "FrozenLake-v1", - env_config={ - "desc": [ - "SF", - "HG", - ], - "is_slippery": False, - }, - ) - .training( - model_size="XS", - training_ratio=1024, - # TODO - model={ - "batch_size_B": 16, - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "XS", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py b/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py deleted file mode 100644 index 9b7b260d595e9..0000000000000 --- a/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] - -config = ( - DreamerV3Config() - .environment( - "FrozenLake-v1", - env_config={ - "map_name": "4x4", - "is_slippery": False, - }, - ) - .training( - model_size="nano", - training_ratio=1024, - # TODO - model={ - "batch_size_B": 16, - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "nano", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/pendulum.py b/rllib/tuned_examples/dreamerv3/pendulum.py deleted file mode 100644 index 4acc4b9aa85a9..0000000000000 --- a/rllib/tuned_examples/dreamerv3/pendulum.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] - -config = ( - DreamerV3Config() - .environment("Pendulum-v1") - .training(model_size="XS", training_ratio=1024) -) diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 0bee53bbd5590..6c9c9badd7a03 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -30,7 +30,6 @@ TRAINING_ITERATION_TIMER = "training_iteration" APPLY_GRADS_TIMER = "apply_grad" COMPUTE_GRADS_TIMER = "compute_grads" -GARBAGE_COLLECTION_TIMER = "garbage_collection" SYNCH_WORKER_WEIGHTS_TIMER = "synch_weights" GRAD_WAIT_TIMER = "grad_wait" SAMPLE_TIMER = "sample" diff --git a/rllib/utils/replay_buffers/episode_replay_buffer.py b/rllib/utils/replay_buffers/episode_replay_buffer.py index e95fc50432489..787c25b1aae01 100644 --- a/rllib/utils/replay_buffers/episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/episode_replay_buffer.py @@ -1,5 +1,4 @@ from collections import deque -import copy from typing import Any, Dict, List, Optional, Union import uuid @@ -110,15 +109,6 @@ def add(self, episodes: Union[List["_Episode"], "_Episode"]): episodes = [episodes] for eps in episodes: - # Make sure we don't change what's coming in from the user. - # TODO (sven): It'd probably be better to make sure in the EnvRunner to not - # hold on to episodes (for metrics purposes only) that we are returning - # back to the user from `EnvRunner.sample()`. Then we wouldn't have to - # do any copying. Instead, either compile the metrics right away on the - # EnvRunner OR compile metrics entirely on the Algorithm side (this is - # actually preferred). - eps = copy.deepcopy(eps) - self._num_timesteps += len(eps) self._num_timesteps_added += len(eps) @@ -252,7 +242,7 @@ def sample( ) episode = self.episodes[episode_idx] - # Starting a new chunk, set is_first to True. + # Starting a new chunk, set continue to False. is_first[B][T] = True # Begin of new batch item (row). @@ -265,7 +255,7 @@ def sample( else: rewards[B].append(episode.rewards[episode_ts - 1]) # We are in the middle of a batch item (row). Concat next episode to this - # row from the next episode's beginning. In other words, we never concat + # row from the episode's beginning. In other words, we never concat # a middle of an episode to another truncated one. 
else: episode_ts = 0 @@ -331,10 +321,6 @@ def get_sampled_timesteps(self) -> int: """Returns number of timesteps that have been sampled in buffer's lifetime.""" return self.sampled_timesteps - def get_added_timesteps(self) -> int: - """Returns number of timesteps that have been added in buffer's lifetime.""" - return self._num_timesteps_added - @override(ReplayBufferInterface) def get_state(self) -> Dict[str, Any]: return { @@ -343,7 +329,6 @@ def get_state(self) -> Dict[str, Any]: "_num_episodes_evicted": self._num_episodes_evicted, "_indices": self._indices, "_num_timesteps": self._num_timesteps, - "_num_timesteps_added": self._num_timesteps_added, "sampled_timesteps": self.sampled_timesteps, } @@ -356,7 +341,6 @@ def set_state(self, state) -> None: self._num_episodes_evicted = state["_num_episodes_evicted"] self._indices = state["_indices"] self._num_timesteps = state["_num_timesteps"] - self._num_timesteps_added = state["_num_timesteps_added"] self.sampled_timesteps = state["sampled_timesteps"] @@ -372,9 +356,8 @@ def __init__( actions=None, rewards=None, states=None, - t: int = 0, - is_terminated: bool = False, - is_truncated: bool = False, + is_terminated=False, + is_truncated=False, render_images=None, ): self.id_ = id_ or uuid.uuid4().hex @@ -387,9 +370,6 @@ def __init__( # h-states: t0 (in case this episode is a continuation chunk, we need to know # about the initial h) to T. self.states = states - # The global last timestep of the episode and the timesteps when this chunk - # started. - self.t = self.t_started = t # obs[-1] is the final observation in the episode. self.is_terminated = is_terminated # obs[-1] is the last obs in a truncated-by-the-env episode (there will no more @@ -401,18 +381,13 @@ def __init__( self.render_images = [] if render_images is None else render_images def concat_episode(self, episode_chunk: "_Episode"): - """Adds the given `episode_chunk` to the right side of self.""" assert episode_chunk.id_ == self.id_ assert not self.is_done - # Make sure the timesteps match. - assert self.t == episode_chunk.t_started episode_chunk.validate() # Make sure, end matches other episode chunk's beginning. assert np.all(episode_chunk.observations[0] == self.observations[-1]) - # Make sure the timesteps match (our last t should be the same as their first). - assert self.t == episode_chunk.t_started # Pop out our end. self.observations.pop() @@ -421,7 +396,6 @@ def concat_episode(self, episode_chunk: "_Episode"): self.observations.extend(list(episode_chunk.observations)) self.actions.extend(list(episode_chunk.actions)) self.rewards.extend(list(episode_chunk.rewards)) - self.t = episode_chunk.t self.states = episode_chunk.states if episode_chunk.is_terminated: @@ -431,21 +405,6 @@ def concat_episode(self, episode_chunk: "_Episode"): # Validate. self.validate() - def add_initial_observation( - self, *, initial_observation, initial_state=None, initial_render_image=None - ): - assert not self.is_done - assert len(self.observations) == 0 - # Assume that this episode is completely empty and has not stepped yet. - # Leave self.t (and self.t_started) at 0. - assert self.t == self.t_started == 0 - - self.observations.append(initial_observation) - self.states = initial_state - if initial_render_image is not None: - self.render_images.append(initial_render_image) - self.validate() - def add_timestep( self, observation, @@ -457,25 +416,34 @@ def add_timestep( is_truncated=False, render_image=None, ): - # Cannot add data to an already done episode. 
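The `_Episode` hunks in this file restore a leaner chunk API (no `t`/`t_started` bookkeeping and no `create_successor()`). A minimal lifecycle sketch; the observation, action, and reward values are purely illustrative:

import numpy as np

eps = _Episode()
eps.add_initial_observation(initial_observation=np.array([0.0, 0.0], dtype=np.float32))
eps.add_timestep(np.array([0.1, 0.0], dtype=np.float32), 1, 1.0)
eps.add_timestep(np.array([0.2, 0.1], dtype=np.float32), 0, 1.0, is_terminated=True)
assert len(eps) == 2           # timesteps taken, i.e. len(observations) - 1
batch = eps.to_sample_batch()  # finished chunks convert to a SampleBatch for training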
         assert not self.is_done
         self.observations.append(observation)
         self.actions.append(action)
         self.rewards.append(reward)
         self.states = state
-        self.t += 1
         if render_image is not None:
             self.render_images.append(render_image)
         self.is_terminated = is_terminated
         self.is_truncated = is_truncated
         self.validate()

+    def add_initial_observation(
+        self, *, initial_observation, initial_state=None, initial_render_image=None
+    ):
+        assert not self.is_done
+        assert len(self.observations) == 0
+
+        self.observations.append(initial_observation)
+        self.states = initial_state
+        if initial_render_image is not None:
+            self.render_images.append(initial_render_image)
+        self.validate()
+
     def validate(self):
         # Make sure we always have one more obs stored than rewards (and actions)
         # due to the reset and last-obs logic of an MDP.
         assert len(self.observations) == len(self.rewards) + 1 == len(self.actions) + 1
-        assert len(self.rewards) == (self.t - self.t_started)

         # Convert all lists to numpy arrays, if we are terminated.
         if self.is_done:
@@ -486,43 +454,8 @@ def validate(self):

     @property
     def is_done(self):
-        """Whether the episode is actually done (terminated or truncated).
-
-        A done episode cannot be continued via `self.add_timestep()` or being
-        concatenated on its right-side with another episode chunk or being
-        succeeded via `self.create_successor()`.
-        """
         return self.is_terminated or self.is_truncated

-    def create_successor(self) -> "_Episode":
-        """Returns a successor episode chunk (of len=0) continuing with this one.
-
-        The successor will have the same ID and state as self and its only observation
-        will be the last observation in self. Its length will therefore be 0 (no
-        steps taken yet).
-
-        This method is useful if you would like to discontinue building an episode
-        chunk (b/c you have to return it from somewhere), but would like to have a new
-        episode (chunk) instance to continue building the actual env episode at a later
-        time.
-
-        Returns:
-            The successor Episode chunk of this one with the same ID and state and the
-            only observation being the last observation in self.
-        """
-        assert not self.is_done
-
-        return _Episode(
-            # Same ID.
-            id_=self.id_,
-            # First (and only) observation of successor is this episode's last obs.
-            observations=[self.observations[-1]],
-            # Same state.
-            states=self.states,
-            # Continue with self's current timestep.
-            t=self.t,
-        )
-
     def to_sample_batch(self):
         return SampleBatch(
             {
@@ -564,8 +497,6 @@ def get_state(self):
                 "actions": self.actions,
                 "rewards": self.rewards,
                 "states": self.states,
-                "t_started": self.t_started,
-                "t": self.t,
                 "is_terminated": self.is_terminated,
                 "is_truncated": self.is_truncated,
             }.items()
@@ -578,16 +509,14 @@ def from_state(state):
         eps.actions = state[2][1]
         eps.rewards = state[3][1]
         eps.states = state[4][1]
-        eps.t_started = state[5][1]
-        eps.t = state[6][1]
-        eps.is_terminated = state[7][1]
-        eps.is_truncated = state[8][1]
+        eps.is_terminated = state[5][1]
+        eps.is_truncated = state[6][1]
         return eps

     def __len__(self):
         assert len(self.observations) > 0, (
             "ERROR: Cannot determine length of episode that hasn't started yet! "
             "Call `_Episode.add_initial_observation(initial_observation=...)` first "
             "(after which `len(_Episode)` will be 0)."
) return len(self.observations) - 1 diff --git a/rllib/utils/tf_utils.py b/rllib/utils/tf_utils.py index 3acbbad004c0f..7b43953c5b67f 100644 --- a/rllib/utils/tf_utils.py +++ b/rllib/utils/tf_utils.py @@ -679,7 +679,7 @@ def two_hot( # First make sure, values are clipped. value = tf.clip_by_value(value, lower_bound, upper_bound) # Tensor of batch indices: [0, B=batch size). - batch_indices = tf.range(0, tf.shape(value)[0], dtype=tf.float32) + batch_indices = tf.range(0, value.shape[0], dtype=tf.float32) # Calculate the step deltas (how much space between each bucket's central value?). bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1) # Compute the float indices (might be non-int numbers: sitting between two buckets). @@ -690,12 +690,12 @@ def two_hot( kp1 = tf.math.ceil(idx) # In case k == kp1 (idx is exactly on the bucket boundary), move kp1 up by 1.0. # Otherwise, this would result in a NaN in the returned two-hot tensor. - kp1 = tf.where(tf.equal(k, kp1), kp1 + 1.0, kp1) + kp1 = tf.where(k == kp1, kp1 + 1.0, kp1) # Iff `kp1` is one beyond our last index (because incoming value is larger than # `upper_bound`), move it to one before k (kp1's weight is going to be 0.0 anyways, # so it doesn't matter where it points to; we are just avoiding an index error # with this). - kp1 = tf.where(tf.equal(kp1, num_buckets), kp1 - 2.0, kp1) + kp1 = tf.where(kp1 == num_buckets, kp1 - 2.0, kp1) # The actual values found at k and k+1 inside the set of buckets. values_k = lower_bound + k * bucket_delta values_kp1 = lower_bound + kp1 * bucket_delta
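For readers unfamiliar with the `two_hot()` utility touched in the last hunk: two-hot encoding represents a clipped scalar as weights over the two neighboring buckets of an evenly spaced grid, so that the weighted sum of the two bucket values recovers the scalar. The sketch below is a standalone NumPy re-implementation for illustration only; the name `two_hot_np`, the default bounds/bucket count, and the exact weight formula are assumptions, not RLlib's TF code.

import numpy as np

def two_hot_np(value, num_buckets=255, lower_bound=-20.0, upper_bound=20.0):
    # Clip incoming (1D) values into the supported range.
    value = np.clip(np.asarray(value, dtype=np.float64), lower_bound, upper_bound)
    bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1)
    # Fractional bucket index of each value.
    idx = (value - lower_bound) / bucket_delta
    k = np.floor(idx)
    kp1 = np.ceil(idx)
    # If idx sits exactly on a bucket, floor == ceil; nudge kp1 so k and kp1 differ.
    kp1 = np.where(k == kp1, kp1 + 1.0, kp1)
    # If kp1 ran past the last bucket, point it at a valid index (its weight is 0.0).
    kp1 = np.where(kp1 == num_buckets, kp1 - 2.0, kp1)
    values_k = lower_bound + k * bucket_delta
    values_kp1 = lower_bound + kp1 * bucket_delta
    # Bucket k's weight grows as the value moves away from bucket kp1 (and vice versa).
    weights_k = (value - values_kp1) / (values_k - values_kp1)
    weights_kp1 = 1.0 - weights_k
    out = np.zeros((value.shape[0], num_buckets))
    rows = np.arange(value.shape[0])
    out[rows, k.astype(int)] = weights_k
    out[rows, kp1.astype(int)] = weights_kp1
    return out

# Each row's weights sum to 1.0, and the weighted sum of bucket values
# reconstructs the original (clipped) scalar.
print(two_hot_np([0.0, 0.1]).sum(axis=1))  # -> [1. 1.]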