diff --git a/rllib/BUILD b/rllib/BUILD index 221c2362b56cf..b66d0ec983e41 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1066,7 +1066,7 @@ py_test( srcs = ["algorithms/dqn/tests/test_repro_dqn.py"] ) -# Dreamer +# Dreamer (V1) py_test( name = "test_dreamer", tags = ["team:rllib", "algorithms_dir"], @@ -1074,6 +1074,16 @@ py_test( srcs = ["algorithms/dreamer/tests/test_dreamer.py"] ) +# DreamerV3 +# TODO (sven): Enable once the version conflict for gymnasium/supersuit/pettingzoo +# /shimmy/mujoco has been resolved. +#py_test( +# name = "test_dreamerv3", +# tags = ["team:rllib", "algorithms_dir"], +# size = "large", +# srcs = ["algorithms/dreamerv3/tests/test_dreamerv3.py"] +#) + # DT py_test( name = "test_segmentation_buffer", @@ -4345,6 +4355,7 @@ py_test_module_list( files = [ "tests/test_dnc.py", "tests/test_perf.py", + "algorithms/dreamerv3/tests/test_dreamerv3.py", "env/wrappers/tests/test_kaggle_wrapper.py", "examples/env/tests/test_cliff_walking_wall_env.py", "examples/env/tests/test_coin_game_non_vectorized_env.py", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 29de0b01a3be5..7e3c32a4efc51 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -706,7 +706,19 @@ def setup(self, config: AlgorithmConfig) -> None: # the two we need to loop through the policy modules and create a simple # MARLModule from the RLModule within each policy. local_worker = self.workers.local_worker() - module_spec = local_worker.marl_module_spec + policy_dict, _ = self.config.get_multi_agent_setup( + env=local_worker.env, + spaces=getattr(local_worker, "spaces", None), + ) + # TODO (Sven): Unify the inference of the MARLModuleSpec. Right now, + # we get this from the RolloutWorker's `marl_module_spec` property. + # However, this is hacky (information leak) and should not remain this + # way. For other EnvRunner classes (that don't have this property), + # Algorithm should infer this itself. + if hasattr(local_worker, "marl_module_spec"): + module_spec = local_worker.marl_module_spec + else: + module_spec = self.config.get_marl_module_spec(policy_dict=policy_dict) learner_group_config = self.config.get_learner_group_config(module_spec) self.learner_group = learner_group_config.build() @@ -871,7 +883,7 @@ def evaluate( # Sync weights to the evaluation WorkerSet. if self.evaluation_workers is not None: self.evaluation_workers.sync_weights( - from_worker_or_trainer=self.workers.local_worker() + from_worker_or_learner_group=self.workers.local_worker() ) self._sync_filters_if_needed( central_worker=self.workers.local_worker(), @@ -1409,7 +1421,7 @@ def training_step(self) -> ResultDict: if self.config._enable_learner_api: from_worker_or_trainer = self.learner_group self.workers.sync_weights( - from_worker_or_trainer=from_worker_or_trainer, + from_worker_or_learner_group=from_worker_or_trainer, policies=list(train_results.keys()), global_vars=global_vars, ) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index a037f7bb052b3..2510490d48426 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -303,10 +303,8 @@ def __init__(self, algo_class=None): self.normalize_actions = True self.clip_actions = False self.disable_env_checking = False - # Whether this env is an atari env (for atari-specific preprocessing). - # If not specified, we will try to auto-detect this. 
- self.is_atari = None self.auto_wrap_old_gym_envs = True + self._is_atari = None # `self.rollouts()` self.env_runner_cls = None @@ -718,31 +716,6 @@ def freeze(self) -> None: # of themselves? This way, users won't even be able to alter those values # directly anymore. - def _detect_atari_env(self) -> bool: - """Returns whether this configured env is an Atari env or not. - - Returns: - True, if specified env is an Atari env, False otherwise. - """ - # Atari envs are usually specified via a string like "PongNoFrameskip-v4" - # or "ALE/Breakout-v5". - # We do NOT attempt to auto-detect Atari env for other specified types like - # a callable, to avoid running heavy logics in validate(). - # For these cases, users can explicitly set `environment(atari=True)`. - if not type(self.env) == str: - return False - - try: - if self.env.startswith("ALE/"): - env = gym.make("GymV26Environment-v0", env_id=self.env) - else: - env = gym.make(self.env) - except gym.error.NameNotFound: - # Not an Atari env if this is not a gym env. - return False - - return is_atari(env) - @OverrideToImplementCustomLogic_CallToSuperRecommended def validate(self) -> None: """Validates all values in this config.""" @@ -988,10 +961,6 @@ def validate(self) -> None: f"config.framework({self.framework_str})!" ) - # Detect if specified env is an Atari env. - if self.is_atari is None: - self.is_atari = self._detect_atari_env() - if self.input_ == "sampler" and self.off_policy_estimation_methods: raise ValueError( "Off-policy estimation methods can only be used if the input is a " @@ -1368,7 +1337,7 @@ def environment( disable_env_checking: If True, disable the environment pre-checking module. is_atari: This config can be used to explicitly specify whether the env is an Atari env or not. If not specified, RLlib will try to auto-detect - this during config validation. + this. auto_wrap_old_gym_envs: Whether to auto-wrap old gym environments (using the pre 0.24 gym APIs, e.g. reset() returning single obs and no info dict). If True, RLlib will automatically wrap the given gym env class @@ -1405,7 +1374,7 @@ def environment( if disable_env_checking is not NotProvided: self.disable_env_checking = disable_env_checking if is_atari is not NotProvided: - self.is_atari = is_atari + self._is_atari = is_atari if auto_wrap_old_gym_envs is not NotProvided: self.auto_wrap_old_gym_envs = auto_wrap_old_gym_envs @@ -2319,6 +2288,8 @@ def reporting( In case there are more than this many episodes collected in a single training iteration, use all of these episodes for metrics computation, meaning don't ever cut any "excess" episodes. + Set this to 1 to disable smoothing and to always report only the most + recently collected episode's return. min_time_s_per_iteration: Minimum time to accumulate within a single `train()` call. This value does not affect learning, only the number of times `Algorithm.training_step()` is called by @@ -2645,6 +2616,34 @@ def learner_class(self) -> Type["Learner"]: """ return self._learner_class or self.get_default_learner_class() + @property + def is_atari(self) -> bool: + """True if if specified env is an Atari env.""" + + # Not yet determined, try to figure this out. + if self._is_atari is None: + # Atari envs are usually specified via a string like "PongNoFrameskip-v4" + # or "ALE/Breakout-v5". + # We do NOT attempt to auto-detect Atari env for other specified types like + # a callable, to avoid running heavy logics in validate(). + # For these cases, users can explicitly set `environment(atari=True)`. 
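            # A minimal usage sketch of this property (illustrative; the "ALE/..."
            # branch assumes ale-py/shimmy are installed):
            #   config = AlgorithmConfig().environment(env="ALE/Pong-v5")
            #   config.is_atari  # -> True (lazily auto-detected via gym.make())
            #   config.environment(env="CartPole-v1", is_atari=False)
            #   config.is_atari  # -> False (explicitly set; no detection attempted)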
+ if not type(self.env) == str: + return False + try: + if self.env.startswith("ALE/"): + env = gym.make("GymV26Environment-v0", env_id=self.env) + else: + env = gym.make(self.env) + # Any gymnasium error -> Cannot be an Atari env. + except gym.error.Error: + return False + + self._is_atari = is_atari(env) + # Clean up env's resources, if any. + env.close() + + return self._is_atari + # TODO: Make rollout_fragment_length as read-only property and replace the current # self.rollout_fragment_length a private variable. def get_rollout_fragment_length(self, worker_index: int = 0) -> int: diff --git a/rllib/algorithms/dreamerv3/README.md b/rllib/algorithms/dreamerv3/README.md new file mode 100644 index 0000000000000..8c64b960b7b73 --- /dev/null +++ b/rllib/algorithms/dreamerv3/README.md @@ -0,0 +1,27 @@ +# DreamerV3 +Implementation (TensorFlow/Keras) of the "DreamerV3" model-based reinforcement learning +(RL) algorithm by D. Hafner et al. 2023 + +DreamerV3 train a world model in supervised fashion using real environment +interactions. The world model utilizes a recurrent GRU-based architecture +("recurrent state space model" or RSSM) and uses it to predicts rewards, +episode continuation flags, as well as, observations. +With these predictions (dreams) made by the world model, both actor +and critic are trained in classic REINFORCE fashion. In other words, the +actual RL components of the model are never trained on actual environment data, +but on dreamed trajectories only. + +For more algorithm details, see: + +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +.. and the "DreamerV2" paper: + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf + +## Results +TODO diff --git a/rllib/algorithms/dreamerv3/__init__.py b/rllib/algorithms/dreamerv3/__init__.py new file mode 100644 index 0000000000000..d4b2adb0d57ed --- /dev/null +++ b/rllib/algorithms/dreamerv3/__init__.py @@ -0,0 +1,15 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3, DreamerV3Config + +__all__ = [ + "DreamerV3", + "DreamerV3Config", +] diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py new file mode 100644 index 0000000000000..515f6e3a22a29 --- /dev/null +++ b/rllib/algorithms/dreamerv3/dreamerv3.py @@ -0,0 +1,660 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +import dataclasses +import gc +import logging +import tree # pip install dm_tree +from typing import Any, Dict, List, Optional + +import gymnasium as gym +import numpy as np + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.algorithms.dreamerv3.dreamerv3_catalog import DreamerV3Catalog +from ray.rllib.algorithms.dreamerv3.dreamerv3_learner import ( + DreamerV3LearnerHyperparameters, +) +from ray.rllib.algorithms.dreamerv3.utils import do_symlog_obs +from ray.rllib.algorithms.dreamerv3.utils.env_runner import DreamerV3EnvRunner +from ray.rllib.algorithms.dreamerv3.utils.summaries import ( + report_predicted_vs_sampled_obs, + report_sampling_and_replay_buffer, +) +from ray.rllib.core.learner.learner import LearnerHyperparameters +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch +from ray.rllib.utils import deep_update +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.numpy import one_hot +from ray.rllib.utils.metrics import ( + ALL_MODULES, + GARBAGE_COLLECTION_TIMER, + LEARN_ON_BATCH_TIMER, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_TRAINED, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_TRAINED, + NUM_GRAD_UPDATES_LIFETIME, + NUM_SYNCH_WORKER_WEIGHTS, + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, + SAMPLE_TIMER, + SYNCH_WORKER_WEIGHTS_TIMER, +) +from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer +from ray.rllib.utils.typing import ResultDict + + +logger = logging.getLogger(__name__) + +_, tf, _ = try_import_tf() + + +class DreamerV3Config(AlgorithmConfig): + """Defines a configuration class from which a DreamerV3 can be built. + + Example: + >>> from ray.rllib.algorithms.dreamerv3 import DreamerV3Config + >>> config = DreamerV3Config() + >>> config = config.training( # doctest: +SKIP + ... batch_size_B=8, model_size="M" + ... ) + >>> config = config.resources(num_learner_workers=4) # doctest: +SKIP + >>> print(config.to_dict()) # doctest: +SKIP + >>> # Build a Algorithm object from the config and run 1 training iteration. + >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP + >>> algo.train() # doctest: +SKIP + + Example: + >>> from ray.rllib.algorithms.dreamerv3 import DreamerV3Config + >>> from ray import air + >>> from ray import tune + >>> config = DreamerV3Config() + >>> # Print out some default values. + >>> print(config.training_ratio) # doctest: +SKIP + >>> # Update the config object. + >>> config = config.training( # doctest: +SKIP + ... training_ratio=tune.grid_search([256, 512, 1024]) + ... ) + >>> # Set the config object's env. + >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP + >>> # Use to_dict() to get the old-style python config dict + >>> # when running with tune. + >>> tune.Tuner( # doctest: +SKIP + ... "DreamerV3", + ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), + ... param_space=config.to_dict(), + ... 
).fit() + """ + + def __init__(self, algo_class=None): + """Initializes a DreamerV3Config instance.""" + super().__init__(algo_class=algo_class or DreamerV3) + + # fmt: off + # __sphinx_doc_begin__ + + # DreamerV3 specific settings: + self.model_size = "XS" + self.training_ratio = 1024 + + self.replay_buffer_config = { + "type": "EpisodeReplayBuffer", + "capacity": int(1e6), + } + self.world_model_lr = 1e-4 + self.actor_lr = 3e-5 + self.critic_lr = 3e-5 + self.batch_size_B = 16 + self.batch_length_T = 64 + self.horizon_H = 15 + self.gae_lambda = 0.95 # [1] eq. 7. + self.entropy_scale = 3e-4 # [1] eq. 11. + self.return_normalization_decay = 0.99 # [1] eq. 11 and 12. + self.train_critic = True + self.train_actor = True + self.intrinsic_rewards_scale = 0.1 + self.world_model_grad_clip_by_global_norm = 1000.0 + self.critic_grad_clip_by_global_norm = 100.0 + self.actor_grad_clip_by_global_norm = 100.0 + + # Reporting. + # DreamerV3 is super sample efficient and only needs very few episodes + # (normally) to learn. Leaving this at its default value would gravely + # underestimate the learning performance over the course of an experiment. + self.metrics_num_episodes_for_smoothing = 1 + self.report_individual_batch_item_stats = False + self.report_dream_data = False + self.report_images_and_videos = False + self.gc_frequency_train_steps = 100 + + # Override some of AlgorithmConfig's default values with DreamerV3-specific + # values. + self.lr = None + self.framework_str = "tf2" + self.gamma = 0.997 # [1] eq. 7. + # Do not use! Set `batch_size_B` and `batch_length_T` instead. + self.train_batch_size = None + self.env_runner_cls = DreamerV3EnvRunner + self.num_rollout_workers = 0 + self.rollout_fragment_length = 1 + # Since we are using a gymnasium-based EnvRunner, we can utilitze its + # vectorization capabilities w/o suffering performance losses (as we would + # with RLlib's `RemoteVectorEnv`). + self.remote_worker_envs = True + # Dreamer only runs on the new API stack. + self._enable_learner_api = True + self._enable_rl_module_api = True + # __sphinx_doc_end__ + # fmt: on + + @override(AlgorithmConfig) + def training( + self, + *, + model_size: Optional[str] = NotProvided, + training_ratio: Optional[float] = NotProvided, + gc_frequency_train_steps: Optional[int] = NotProvided, + batch_size_B: Optional[int] = NotProvided, + batch_length_T: Optional[int] = NotProvided, + horizon_H: Optional[int] = NotProvided, + gae_lambda: Optional[float] = NotProvided, + entropy_scale: Optional[float] = NotProvided, + return_normalization_decay: Optional[float] = NotProvided, + train_critic: Optional[bool] = NotProvided, + train_actor: Optional[bool] = NotProvided, + intrinsic_rewards_scale: Optional[float] = NotProvided, + world_model_grad_clip_by_global_norm: Optional[float] = NotProvided, + critic_grad_clip_by_global_norm: Optional[float] = NotProvided, + actor_grad_clip_by_global_norm: Optional[float] = NotProvided, + replay_buffer_config: Optional[dict] = NotProvided, + **kwargs, + ) -> "DreamerV3Config": + """Sets the training related configuration. + + Args: + model_size: The main switch for adjusting the overall model size. See [1] + (table B) for more information on the effects of this setting on the + model architecture. + Supported values are "XS", "S", "M", "L", "XL" (as per the paper), as + well as, "nano", "micro", "mini", and "XXS" (for RLlib's + implementation). See ray.rllib.algorithms.dreamerv3.utils. 
+ __init__.py for the details on what exactly each size does to the layer + sizes, number of layers, etc.. + training_ratio: The ratio of total steps trained (sum of the sizes of all + batches ever sampled from the replay buffer) over the total env steps + taken (in the actual environment, not the dreamed one). For example, + if the training_ratio is 1024 and the batch size is 1024, we would take + 1 env step for every training update: 1024 / 1. If the training ratio + is 512 and the batch size is 1024, we would take 2 env steps and then + perform a single training update (on a 1024 batch): 1024 / 2. + gc_frequency_train_steps: The frequency (in training iterations) with which + we perform a `gc.collect()` calls at the end of a `training_step` + iteration. Doing this more often adds a (albeit very small) performance + overhead, but prevents memory leaks from becoming harmful. + TODO (sven): This might not be necessary anymore, but needs to be + confirmed experimentally. + batch_size_B: The batch size (B) interpreted as number of rows (each of + length `batch_length_T`) to sample from the replay buffer in each + iteration. + batch_length_T: The batch length (T) interpreted as the length of each row + sampled from the replay buffer in each iteration. Note that + `batch_size_B` rows will be sampled in each iteration. Rows normally + contain consecutive data (consecutive timesteps from the same episode), + but there might be episode boundaries in a row as well. + horizon_H: The horizon (in timesteps) used to create dreamed data from the + world model, which in turn is used to train/update both actor- and + critic networks. + gae_lambda: The lambda parameter used for computing the GAE-style + value targets for the actor- and critic losses. + entropy_scale: The factor with which to multiply the entropy loss term + inside the actor loss. + return_normalization_decay: The decay value to use when computing the + running EMA values for return normalization (used in the actor loss). + train_critic: Whether to train the critic network. If False, `train_actor` + must also be False (cannot train actor w/o training the critic). + train_actor: Whether to train the actor network. If True, `train_critic` + must also be True (cannot train actor w/o training the critic). + intrinsic_rewards_scale: The factor to multiply intrinsic rewards with + before adding them to the extrinsic (environment) rewards. + world_model_grad_clip_by_global_norm: World model grad clipping value + (by global norm). + critic_grad_clip_by_global_norm: Critic grad clipping value + (by global norm). + actor_grad_clip_by_global_norm: Actor grad clipping value (by global norm). + replay_buffer_config: Replay buffer config. + Only serves in DreamerV3 to set the capacity of the replay buffer. + Note though that in the paper ([1]) a size of 1M is used for all + benchmarks and there doesn't seem to be a good reason to change this + parameter. + Examples: + { + "type": "EpisodeReplayBuffer", + "capacity": 100000, + } + + Returns: + This updated AlgorithmConfig object. + """ + # Pass kwargs onto super's `training()` method. 
+ super().training(**kwargs) + + if model_size is not NotProvided: + self.model_size = model_size + if training_ratio is not NotProvided: + self.training_ratio = training_ratio + if gc_frequency_train_steps is not NotProvided: + self.gc_frequency_train_steps = gc_frequency_train_steps + if batch_size_B is not NotProvided: + self.batch_size_B = batch_size_B + if batch_length_T is not NotProvided: + self.batch_length_T = batch_length_T + if horizon_H is not NotProvided: + self.horizon_H = horizon_H + if gae_lambda is not NotProvided: + self.gae_lambda = gae_lambda + if entropy_scale is not NotProvided: + self.entropy_scale = entropy_scale + if return_normalization_decay is not NotProvided: + self.return_normalization_decay = return_normalization_decay + if train_critic is not NotProvided: + self.train_critic = train_critic + if train_actor is not NotProvided: + self.train_actor = train_actor + if intrinsic_rewards_scale is not NotProvided: + self.intrinsic_rewards_scale = intrinsic_rewards_scale + if world_model_grad_clip_by_global_norm is not NotProvided: + self.world_model_grad_clip_by_global_norm = ( + world_model_grad_clip_by_global_norm + ) + if critic_grad_clip_by_global_norm is not NotProvided: + self.critic_grad_clip_by_global_norm = critic_grad_clip_by_global_norm + if actor_grad_clip_by_global_norm is not NotProvided: + self.actor_grad_clip_by_global_norm = actor_grad_clip_by_global_norm + if replay_buffer_config is not NotProvided: + # Override entire `replay_buffer_config` if `type` key changes. + # Update, if `type` key remains the same or is not specified. + new_replay_buffer_config = deep_update( + {"replay_buffer_config": self.replay_buffer_config}, + {"replay_buffer_config": replay_buffer_config}, + False, + ["replay_buffer_config"], + ["replay_buffer_config"], + ) + self.replay_buffer_config = new_replay_buffer_config["replay_buffer_config"] + + return self + + @override(AlgorithmConfig) + def reporting( + self, + *, + report_individual_batch_item_stats: Optional[bool] = NotProvided, + report_dream_data: Optional[bool] = NotProvided, + report_images_and_videos: Optional[bool] = NotProvided, + **kwargs, + ): + """Sets the reporting related configuration. + + Args: + report_individual_batch_item_stats: Whether to include loss and other stats + per individual timestep inside the training batch in the result dict + returned by `training_step()`. If True, besides the `CRITIC_L_total`, + the individual critic loss values per batch row and time axis step + in the train batch (CRITIC_L_total_B_T) will also be part of the + results. + report_dream_data: Whether to include the dreamed trajectory data in the + result dict returned by `training_step()`. If True, however, will + slice each reported item in the dream data down to the shape. + (H, B, t=0, ...), where H is the horizon and B is the batch size. The + original time axis will only be represented by the first timestep + to not make this data too large to handle. + report_images_and_videos: Whether to include any image/video data in the + result dict returned by `training_step()`. + **kwargs: + + Returns: + This updated AlgorithmConfig object. 
+ """ + super().reporting(**kwargs) + + if report_individual_batch_item_stats is not NotProvided: + self.report_individual_batch_item_stats = report_individual_batch_item_stats + if report_dream_data is not NotProvided: + self.report_dream_data = report_dream_data + if report_images_and_videos is not NotProvided: + self.report_images_and_videos = report_images_and_videos + + return self + + @override(AlgorithmConfig) + def validate(self) -> None: + # Call the super class' validation method first. + super().validate() + + # Make sure, users are not using DreamerV3 yet for multi-agent: + if self.is_multi_agent(): + raise ValueError("DreamerV3 does NOT support multi-agent setups yet!") + + # Make sure, we are configure for the new API stack. + if not (self._enable_learner_api and self._enable_rl_module_api): + raise ValueError( + "DreamerV3 must be run with `config._enable_learner_api`=True AND " + "with `config._enable_rl_module_api`=True!" + ) + + # If run on several Learners, the provided batch_size_B must be a multiple + # of `num_learner_workers`. + if self.num_learner_workers > 1 and ( + self.batch_size_B % self.num_learner_workers != 0 + ): + raise ValueError( + f"Your `batch_size_B` ({self.batch_size_B}) must be a multiple of " + f"`num_learner_workers` ({self.num_learner_workers}) in order for " + "DreamerV3 to be able to split batches evenly across your Learner " + "processes." + ) + + # Cannot train actor w/o critic. + if self.train_actor and not self.train_critic: + raise ValueError( + "Cannot train actor network (`train_actor=True`) w/o training critic! " + "Make sure you either set `train_critic=True` or `train_actor=False`." + ) + # Use DreamerV3 specific batch size settings. + if self.train_batch_size is not None: + raise ValueError( + "`train_batch_size` should NOT be set! Use `batch_size_B` and " + "`batch_length_T` instead." + ) + # Must be run with `EpisodeReplayBuffer` type. + if self.replay_buffer_config.get("type") != "EpisodeReplayBuffer": + raise ValueError( + "DreamerV3 must be run with the `EpisodeReplayBuffer` type! None " + "other supported." 
+ ) + + @override(AlgorithmConfig) + def get_learner_hyperparameters(self) -> LearnerHyperparameters: + base_hps = super().get_learner_hyperparameters() + return DreamerV3LearnerHyperparameters( + model_size=self.model_size, + training_ratio=self.training_ratio, + batch_size_B=self.batch_size_B // (self.num_learner_workers or 1), + batch_length_T=self.batch_length_T, + horizon_H=self.horizon_H, + gamma=self.gamma, + gae_lambda=self.gae_lambda, + entropy_scale=self.entropy_scale, + return_normalization_decay=self.return_normalization_decay, + train_actor=self.train_actor, + train_critic=self.train_critic, + world_model_lr=self.world_model_lr, + intrinsic_rewards_scale=self.intrinsic_rewards_scale, + actor_lr=self.actor_lr, + critic_lr=self.critic_lr, + world_model_grad_clip_by_global_norm=( + self.world_model_grad_clip_by_global_norm + ), + actor_grad_clip_by_global_norm=self.actor_grad_clip_by_global_norm, + critic_grad_clip_by_global_norm=self.critic_grad_clip_by_global_norm, + report_individual_batch_item_stats=( + self.report_individual_batch_item_stats + ), + report_dream_data=self.report_dream_data, + report_images_and_videos=self.report_images_and_videos, + **dataclasses.asdict(base_hps), + ) + + @override(AlgorithmConfig) + def get_default_learner_class(self): + if self.framework_str == "tf2": + from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_learner import ( + DreamerV3TfLearner, + ) + + return DreamerV3TfLearner + else: + raise ValueError(f"The framework {self.framework_str} is not supported.") + + @override(AlgorithmConfig) + def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: + if self.framework_str == "tf2": + from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_rl_module import ( + DreamerV3TfRLModule, + ) + + return SingleAgentRLModuleSpec( + module_class=DreamerV3TfRLModule, catalog_class=DreamerV3Catalog + ) + else: + raise ValueError(f"The framework {self.framework_str} is not supported.") + + @property + def share_module_between_env_runner_and_learner(self) -> bool: + # If we only have one local Learner (num_learner_workers=0) and only + # one local EnvRunner (num_rollout_workers=0), share the RLModule + # between these two to avoid having to sync weights, ever. + return self.num_learner_workers == 0 and self.num_rollout_workers == 0 + + +class DreamerV3(Algorithm): + """Implementation of the model-based DreamerV3 RL algorithm described in [1].""" + + @classmethod + @override(Algorithm) + def get_default_config(cls) -> AlgorithmConfig: + return DreamerV3Config() + + @override(Algorithm) + def setup(self, config: AlgorithmConfig): + super().setup(config) + + # Share RLModule between EnvRunner and single (local) Learner instance. + # To avoid possibly expensive weight synching step. + if self.config.share_module_between_env_runner_and_learner: + assert self.workers.local_worker().module is None + self.workers.local_worker().module = self.learner_group._learner.module[ + DEFAULT_POLICY_ID + ] + + # Summarize (single-agent) RLModule (only once) here. + if self.config.framework_str == "tf2": + self.workers.local_worker().module.dreamer_model.summary(expand_nested=True) + + # Create a replay buffer for storing actual env samples. 
+ self.replay_buffer = EpisodeReplayBuffer( + capacity=self.config.replay_buffer_config["capacity"], + batch_size_B=self.config.batch_size_B, + batch_length_T=self.config.batch_length_T, + ) + + @override(Algorithm) + def training_step(self) -> ResultDict: + results = {} + + env_runner = self.workers.local_worker() + + # Push enough samples into buffer initially before we start training. + if self.training_iteration == 0: + logger.info( + "Filling replay buffer so it contains at least " + f"{self.config.batch_size_B * self.config.batch_length_T} timesteps " + "(required for a single train batch)." + ) + + # Have we sampled yet in this `training_step()` call? + have_sampled = False + with self._timers[SAMPLE_TIMER]: + # Continue sampling from the actual environment (and add collected samples + # to our replay buffer) as long as we: + while ( + # a) Don't have at least batch_size_B x batch_length_T timesteps stored + # in the buffer. This is the minimum needed to train. + self.replay_buffer.get_num_timesteps() + < (self.config.batch_size_B * self.config.batch_length_T) + # b) The computed `training_ratio` is >= the configured (desired) + # training ratio (meaning we should continue sampling). + or self.training_ratio >= self.config.training_ratio + # c) we have not sampled at all yet in this `training_step()` call. + or not have_sampled + ): + done_episodes, ongoing_episodes = env_runner.sample() + have_sampled = True + + # We took B x T env steps. + env_steps_last_sample = sum( + len(eps) for eps in done_episodes + ongoing_episodes + ) + self._counters[NUM_AGENT_STEPS_SAMPLED] += env_steps_last_sample + self._counters[NUM_ENV_STEPS_SAMPLED] += env_steps_last_sample + + # Add ongoing and finished episodes into buffer. The buffer will + # automatically take care of properly concatenating (by episode IDs) + # the different chunks of the same episodes, even if they come in via + # separate `add()` calls. + self.replay_buffer.add(episodes=done_episodes + ongoing_episodes) + + # Summarize environment interaction and buffer data. + results[ALL_MODULES] = report_sampling_and_replay_buffer( + replay_buffer=self.replay_buffer, + ) + + # Continue sampling batch_size_B x batch_length_T sized batches from the buffer + # and using these to update our models (`LearnerGroup.update()`) until the + # computed `training_ratio` is larger than the configured one, meaning we should + # go back and collect more samples again from the actual environment. + # However, when calculating the `training_ratio` here, we use only the + # trained steps in this very `training_step()` call over the most recent sample + # amount (`env_steps_last_sample`), not the global values. This is to avoid a + # heavy overtraining at the very beginning when we have just pre-filled the + # buffer with the minimum amount of samples. + replayed_steps_this_iter = sub_iter = 0 + while ( + replayed_steps_this_iter / env_steps_last_sample + ) < self.config.training_ratio: + + # Time individual batch updates. + with self._timers[LEARN_ON_BATCH_TIMER]: + logger.info(f"\tSub-iteration {self.training_iteration}/{sub_iter})") + + # Draw a new sample from the replay buffer. + sample = self.replay_buffer.sample( + batch_size_B=self.config.batch_size_B, + batch_length_T=self.config.batch_length_T, + ) + replayed_steps = self.config.batch_size_B * self.config.batch_length_T + replayed_steps_this_iter += replayed_steps + + # Convert some bool columns to float32 and one-hot actions. 
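                # Illustration of the one-hot step below (for a Discrete(n) action
                # space, int actions of shape (B, T) become float vectors of shape
                # (B, T, n)); e.g. with n=4:
                #   one_hot(np.array([[2, 0]]), depth=4)
                #   # -> [[[0., 0., 1., 0.], [1., 0., 0., 0.]]]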
+ sample["is_first"] = sample["is_first"].astype(np.float32) + sample["is_last"] = sample["is_last"].astype(np.float32) + sample["is_terminated"] = sample["is_terminated"].astype(np.float32) + if isinstance(env_runner.env.single_action_space, gym.spaces.Discrete): + sample["actions_ints"] = sample[SampleBatch.ACTIONS] + sample[SampleBatch.ACTIONS] = one_hot( + sample["actions_ints"], + depth=env_runner.env.single_action_space.n, + ) + + # Perform the actual update via our learner group. + train_results = self.learner_group.update( + SampleBatch(sample).as_multi_agent(), + reduce_fn=self._reduce_results, + ) + self._counters[NUM_AGENT_STEPS_TRAINED] += replayed_steps + self._counters[NUM_ENV_STEPS_TRAINED] += replayed_steps + + # Perform additional (non-gradient updates), such as the critic EMA-copy + # update. + with self._timers["critic_ema_update"]: + self.learner_group.additional_update( + timestep=self._counters[NUM_ENV_STEPS_TRAINED], + reduce_fn=self._reduce_results, + ) + + if self.config.report_images_and_videos: + report_predicted_vs_sampled_obs( + # TODO (sven): DreamerV3 is single-agent only. + results=train_results[DEFAULT_POLICY_ID], + sample=sample, + batch_size_B=self.config.batch_size_B, + batch_length_T=self.config.batch_length_T, + symlog_obs=do_symlog_obs( + env_runner.env.single_observation_space, + self.config.model.get("symlog_obs", "auto"), + ), + ) + + res = train_results[DEFAULT_POLICY_ID] + logger.info( + f"\t\tWORLD_MODEL_L_total={res['WORLD_MODEL_L_total']:.5f} (" + f"L_pred={res['WORLD_MODEL_L_prediction']:.5f} (" + f"decoder/obs={res['WORLD_MODEL_L_decoder']} " + f"L_rew={res['WORLD_MODEL_L_reward']} " + f"L_cont={res['WORLD_MODEL_L_continue']}); " + f"L_dyn/rep={res['WORLD_MODEL_L_dynamics']:.5f})" + ) + msg = "\t\t" + if self.config.train_actor: + msg += f"L_actor={res['ACTOR_L_total']:.5f} " + if self.config.train_critic: + msg += f"L_critic={res['CRITIC_L_total']:.5f} " + logger.info(msg) + + sub_iter += 1 + self._counters[NUM_GRAD_UPDATES_LIFETIME] += 1 + + # Update weights - after learning on the LearnerGroup - on all EnvRunner + # workers. + with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: + # Only necessary if RLModule is not shared between (local) EnvRunner and + # (local) Learner. + if not self.config.share_module_between_env_runner_and_learner: + self._counters[ + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS + ] = 0 + self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 + self.workers.sync_weights( + from_worker_or_learner_group=self.learner_group + ) + + # Try trick from https://medium.com/dive-into-ml-ai/dealing-with-memory-leak- + # issue-in-keras-model-training-e703907a6501 + if self.config.gc_frequency_train_steps and ( + self.training_iteration % self.config.gc_frequency_train_steps == 0 + ): + with self._timers[GARBAGE_COLLECTION_TIMER]: + gc.collect() + + # Add train results and the actual training ratio to stats. The latter should + # be close to the configured `training_ratio`. + results.update(train_results) + results[ALL_MODULES]["actual_training_ratio"] = self.training_ratio + + # Return all results. + return results + + @property + def training_ratio(self) -> float: + """Returns the actual training ratio of this Algorithm. + + The training ratio is copmuted by dividing the total number of steps + trained thus far (replayed from the buffer) over the total number of actual + env steps taken thus far. 
+ """ + return self._counters[NUM_ENV_STEPS_TRAINED] / ( + self._counters[NUM_ENV_STEPS_SAMPLED] + ) + + @staticmethod + def _reduce_results(results: List[Dict[str, Any]]): + return tree.map_structure(lambda *s: np.mean(s, axis=0), *results) diff --git a/rllib/algorithms/dreamerv3/dreamerv3_catalog.py b/rllib/algorithms/dreamerv3/dreamerv3_catalog.py new file mode 100644 index 0000000000000..50568fe1875ab --- /dev/null +++ b/rllib/algorithms/dreamerv3/dreamerv3_catalog.py @@ -0,0 +1,80 @@ +import gymnasium as gym + +from ray.rllib.core.models.catalog import Catalog +from ray.rllib.core.models.base import Encoder, Model +from ray.rllib.utils import override + + +class DreamerV3Catalog(Catalog): + """The Catalog class used to build all the models needed for DreamerV3 training.""" + + def __init__( + self, + observation_space: gym.Space, + action_space: gym.Space, + model_config_dict: dict, + ): + """Initializes a DreamerV3Catalog instance. + + Args: + observation_space: The observation space of the environment. + action_space: The action space of the environment. + model_config_dict: The model config to use. + """ + super().__init__( + observation_space=observation_space, + action_space=action_space, + model_config_dict=model_config_dict, + ) + + self.model_size = self.model_config_dict["model_size"] + self.is_img_space = len(self.observation_space.shape) in [2, 3] + self.is_gray_scale = ( + self.is_img_space and len(self.observation_space.shape) == 2 + ) + + # TODO (sven): We should work with sub-component configurations here, + # and even try replacing all current Dreamer model components with + # our default primitives. But for now, we'll construct the DreamerV3Model + # directly in our `build_...()` methods. + + @override(Catalog) + def build_encoder(self, framework: str) -> Encoder: + """Builds the World-Model's encoder network depending on the obs space.""" + if framework != "tf2": + raise NotImplementedError + + if self.is_img_space: + from ray.rllib.algorithms.dreamerv3.tf.models.components.cnn_atari import ( + CNNAtari, + ) + + return CNNAtari(model_size=self.model_size) + else: + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP + + return MLP(model_size=self.model_size, name="vector_encoder") + + def build_decoder(self, framework: str) -> Model: + """Builds the World-Model's decoder network depending on the obs space.""" + if framework != "tf2": + raise NotImplementedError + + if self.is_img_space: + from ray.rllib.algorithms.dreamerv3.tf.models.components import ( + conv_transpose_atari, + ) + + return conv_transpose_atari.ConvTransposeAtari( + model_size=self.model_size, + gray_scaled=self.is_gray_scale, + ) + else: + from ray.rllib.algorithms.dreamerv3.tf.models.components import ( + vector_decoder, + ) + + return vector_decoder.VectorDecoder( + model_size=self.model_size, + observation_space=self.observation_space, + ) diff --git a/rllib/algorithms/dreamerv3/dreamerv3_learner.py b/rllib/algorithms/dreamerv3/dreamerv3_learner.py index c35d1743c8b1a..32c08d0a671f4 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_learner.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_learner.py @@ -8,11 +8,13 @@ https://arxiv.org/pdf/2010.02193.pdf """ from dataclasses import dataclass -from typing import Any, Dict +from typing import Any, DefaultDict, Dict from ray.rllib.core.learner.learner import Learner, LearnerHyperparameters from ray.rllib.core.rl_module.rl_module import ModuleID +from ray.rllib.policy.sample_batch import MultiAgentBatch from 
ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import TensorType @dataclass @@ -25,7 +27,7 @@ class to configure your algorithm. more details on the individual properties. """ - model_dimension: str = None + model_size: str = None training_ratio: float = None batch_size_B: int = None batch_length_T: int = None @@ -44,6 +46,10 @@ class to configure your algorithm. world_model_grad_clip_by_global_norm: float = None actor_grad_clip_by_global_norm: float = None critic_grad_clip_by_global_norm: float = None + # Reporting settings. + report_individual_batch_item_stats: bool = None + report_dream_data: bool = None + report_images_and_videos: bool = None class DreamerV3Learner(Learner): @@ -53,6 +59,31 @@ class DreamerV3Learner(Learner): for updating the critic EMA-copy after each training step. """ + @override(Learner) + def compile_results( + self, + *, + batch: MultiAgentBatch, + fwd_out: Dict[str, Any], + loss_per_module: Dict[str, TensorType], + metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]], + ) -> Dict[str, Any]: + results = super().compile_results( + batch=batch, + fwd_out=fwd_out, + loss_per_module=loss_per_module, + metrics_per_module=metrics_per_module, + ) + + # Add the predicted obs distributions for possible (video) summarization. + if self.hps.report_images_and_videos: + for module_id, res in results.items(): + if module_id in fwd_out: + res["WORLD_MODEL_fwd_out_obs_distribution_means_BxT"] = fwd_out[ + module_id + ]["obs_distribution_means_BxT"] + return results + @override(Learner) def additional_update_for_module( self, diff --git a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py index 021fbb8646389..f1a112e7017d1 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py @@ -14,6 +14,7 @@ from ray.rllib.core.models.base import STATE_IN, STATE_OUT from ray.rllib.core.models.specs.specs_dict import SpecDict from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.policy.eager_tf_policy import _convert_to_tf from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import ExperimentalAPI, override from ray.rllib.utils.nested_dict import NestedDict @@ -33,7 +34,7 @@ def setup(self): self.config.observation_space, self.config.model_config_dict.get("symlog_obs", "auto"), ) - model_dimension = self.config.model_config_dict["model_dimension"] + model_size = self.config.model_config_dict["model_size"] # Build encoder and decoder from catalog. catalog = self.config.get_catalog() @@ -42,40 +43,34 @@ def setup(self): # Build the world model (containing encoder and decoder). self.world_model = WorldModel( - model_dimension=model_dimension, + model_size=model_size, action_space=self.config.action_space, batch_length_T=T, - # num_gru_units=self.model_config.num_gru_units, encoder=self.encoder, decoder=self.decoder, symlog_obs=symlog_obs, ) self.actor = ActorNetwork( action_space=self.config.action_space, - model_dimension=model_dimension, + model_size=model_size, ) self.critic = CriticNetwork( - model_dimension=model_dimension, + model_size=model_size, ) # Build the final dreamer model (containing the world model). 
self.dreamer_model = DreamerModel( - model_dimension=self.config.model_config_dict["model_dimension"], + model_size=self.config.model_config_dict["model_size"], action_space=self.config.action_space, world_model=self.world_model, actor=self.actor, critic=self.critic, - # use_curiosity=use_curiosity, - # intrinsic_rewards_scale=intrinsic_rewards_scale, - batch_size_B=self.config.model_config_dict["batch_size_B"], - batch_length_T=T, - horizon_H=horizon_H, ) self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) # Perform a test `call()` to force building the dreamer model's variables. test_obs = np.tile( np.expand_dims(self.config.observation_space.sample(), (0, 1)), - reps=(B, T, 1), + reps=(B, T) + (1,) * len(self.config.observation_space.shape), ) test_actions = np.tile( np.expand_dims( @@ -87,15 +82,13 @@ def setup(self): reps=(B, T, 1), ) self.dreamer_model( - inputs=test_obs, - actions=test_actions.astype(np.float32), - is_first=np.ones((B, T), np.float32), - start_is_terminated_BxT=np.zeros((B * T,), np.float32), + inputs=_convert_to_tf(test_obs), + actions=_convert_to_tf(test_actions.astype(np.float32)), + is_first=_convert_to_tf(np.ones((B, T), np.float32)), + start_is_terminated_BxT=_convert_to_tf(np.zeros((B * T,), np.float32)), horizon_H=horizon_H, gamma=gamma, ) - # This should work now. - self.dreamer_model.summary(expand_nested=True) # Initialize the critic EMA net: self.critic.init_ema() @@ -129,7 +122,7 @@ def input_specs_train(self) -> SpecDict: def output_specs_train(self) -> SpecDict: return [ "sampled_obs_symlog_BxT", - "obs_distribution_BxT", + "obs_distribution_means_BxT", "reward_logits_BxT", "rewards_BxT", "continue_distribution_BxT", diff --git a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py new file mode 100644 index 0000000000000..2e8ef82fd6dbe --- /dev/null +++ b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py @@ -0,0 +1,210 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf + +[3] +D. Hafner's (author) original code repo (for JAX): +https://github.com/danijar/dreamerv3 +""" +import unittest + +import gymnasium as gym +import numpy as np + +import ray +from ray.rllib.algorithms.dreamerv3 import dreamerv3 +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.test_utils import framework_iterator + + +class TestDreamerV3(unittest.TestCase): + @classmethod + def setUpClass(cls): + ray.init() + + @classmethod + def tearDownClass(cls): + ray.shutdown() + + def test_dreamerv3_compilation(self): + """Test whether DreamerV3 can be built with all frameworks.""" + + # Build a DreamerV3Config object. + config = ( + dreamerv3.DreamerV3Config() + .framework(eager_tracing=True) + .training( + # Keep things simple. Especially the long dream rollouts seem + # to take an enormous amount of time (initially). + batch_size_B=2 * 2, # shared w/ model AND learner AND env runner + batch_length_T=16, + horizon_H=5, + # TODO (sven): Fix having to provide this. + # Should be compiled automatically as `RLModuleConfig` by + # AlgorithmConfig (see comment below)? + model={ + "batch_length_T": 16, + "horizon_H": 5, + "model_size": "nano", # Use a tiny model for testing. 
+ "gamma": 0.997, + "symlog_obs": True, + }, + ) + .resources( + num_learner_workers=2, # Try with 2 Learners. + num_cpus_per_learner_worker=1, + num_gpus_per_learner_worker=0, + ) + .debugging(log_level="INFO") + ) + + # TODO (sven): Add a `get_model_config` utility to AlgorithmConfig + # that - for now - merges the user provided model_dict (which only + # contains settings that only affect the model, e.g. model_size) + # with the AlgorithmConfig-wide settings that are relevant for the model + # (e.g. `batch_size_B`). + # config.get_model_config() + + num_iterations = 2 + + for _ in framework_iterator(config, frameworks="tf2"): + for env in ["ALE/MsPacman-v5", "FrozenLake-v1", "CartPole-v1"]: + print("Env={}".format(env)) + config.environment(env) + algo = config.build() + + for i in range(num_iterations): + results = algo.train() + print(results) + + algo.stop() + + def test_dreamerv3_dreamer_model_sizes(self): + """Tests, whether the different model sizes match the ones reported in [1].""" + + # For Atari, these are the exact numbers from the repo ([3]). + # However, for CartPole + size "S" and "M", the author's original code will not + # match for the world model count. This is due to the fact that the author uses + # encoder/decoder nets with 5x1024 nodes (which corresponds to XL) regardless of + # the `model_size` settings (iff >="S"). + expected_num_params_world_model = { + "XS_cartpole": 2435076, + "S_cartpole": 7493380, + "M_cartpole": 16206084, + "L_cartpole": 37802244, + "XL_cartpole": 108353796, + "XS_atari": 7538979, + "S_atari": 15687811, + "M_atari": 32461635, + "L_atari": 68278275, + "XL_atari": 181558659, + } + + # All values confirmed against [3] (100% match). + expected_num_params_actor = { + # hidden=[1280, 256] + # hidden_norm=[256], [256] + # pi (2 actions)=[256, 2], [2] + "XS_cartpole": 328706, + "S_cartpole": 1051650, + "M_cartpole": 2135042, + "L_cartpole": 4136450, + "XL_cartpole": 9449474, + "XS_atari": 329734, + "S_atari": 1053702, + "M_atari": 2137606, + "L_atari": 4139526, + "XL_atari": 9453574, + } + + # All values confirmed against [3] (100% match). + expected_num_params_critic = { + # hidden=[1280, 256] + # hidden_norm=[256], [256] + # vf (buckets)=[256, 255], [255] + "XS_cartpole": 393727, + "S_cartpole": 1181439, + "M_cartpole": 2297215, + "L_cartpole": 4331007, + "XL_cartpole": 9708799, + "XS_atari": 393727, + "S_atari": 1181439, + "M_atari": 2297215, + "L_atari": 4331007, + "XL_atari": 9708799, + } + + config = ( + dreamerv3.DreamerV3Config() + .framework("tf2", eager_tracing=True) + .training( + model={ + "batch_length_T": 16, + "horizon_H": 5, + "gamma": 0.997, + "symlog_obs": True, + } + ) + ) + + # Check all model_sizes described in the paper ([1]) on matching the number + # of parameters to RLlib's implementation. + for model_size in ["XS", "S", "M", "L", "XL"]: + config.model_size = model_size + config.training(model={"model_size": model_size}) + + # Atari and CartPole spaces. + for obs_space, num_actions, env_name in [ + (gym.spaces.Box(-1.0, 0.0, (4,), np.float32), 2, "cartpole"), + (gym.spaces.Box(-1.0, 0.0, (64, 64, 3), np.float32), 6, "atari"), + ]: + print(f"Testing model_size={model_size} on env-type: {env_name} ..") + config.environment( + observation_space=obs_space, + action_space=gym.spaces.Discrete(num_actions), + ) + + # Create our RLModule to compute actions with. 
+ policy_dict, _ = config.get_multi_agent_setup() + module_spec = config.get_marl_module_spec(policy_dict=policy_dict) + rl_module = module_spec.build()[DEFAULT_POLICY_ID] + + # Count the generated RLModule's parameters and compare to the paper's + # reported numbers ([1] and [3]). + num_params_world_model = sum( + np.prod(v.shape.as_list()) + for v in rl_module.world_model.trainable_variables + ) + self.assertEqual( + num_params_world_model, + expected_num_params_world_model[f"{model_size}_{env_name}"], + ) + num_params_actor = sum( + np.prod(v.shape.as_list()) + for v in rl_module.actor.trainable_variables + ) + self.assertEqual( + num_params_actor, + expected_num_params_actor[f"{model_size}_{env_name}"], + ) + num_params_critic = sum( + np.prod(v.shape.as_list()) + for v in rl_module.critic.trainable_variables + ) + self.assertEqual( + num_params_critic, + expected_num_params_critic[f"{model_size}_{env_name}"], + ) + print("\tok") + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py index 6f970a9117d9e..366735f643d74 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py @@ -18,7 +18,7 @@ from ray.rllib.core.rl_module.marl_module import ModuleID from ray.rllib.core.learner.learner import ParamDict from ray.rllib.core.learner.tf.tf_learner import TfLearner -from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf, try_import_tfp from ray.rllib.utils.tf_utils import symlog, two_hot, clip_gradients @@ -34,16 +34,21 @@ class DreamerV3TfLearner(DreamerV3Learner, TfLearner): The critic EMA-copy update step can be found in the `DreamerV3Learner` base class, as it is framework independent. - We define 3 local tensorflow optimizers for the sub components "world_model", + We define 3 local TensorFlow optimizers for the sub components "world_model", "actor", and "critic". Each of these optimizers might use a different learning rate, epsilon parameter, and gradient clipping thresholds and procedures. """ @override(TfLearner) - def configure_optimizer_for_module( + def configure_optimizers_for_module( self, module_id: ModuleID, hps: DreamerV3LearnerHyperparameters ): - """Create the 3 optimizers for Dreamer learning: world_model, actor, critic.""" + """Create the 3 optimizers for Dreamer learning: world_model, actor, critic. + + The learning rates used are described in [1] and the epsilon values used here + - albeit probably not that important - are used by the author's own + implementation. + """ dreamerv3_module = self._module[module_id] @@ -95,7 +100,7 @@ def postprocess_gradients_for_module( """Performs gradient clipping on the 3 module components' computed grads. Note that different grad global-norm clip values are used for the 3 - module components (world model, actor, and critic). + module components: world model, actor, and critic. """ for optimizer_name, optimizer in self.get_optimizers_for_module( module_id=module_id @@ -134,6 +139,32 @@ def postprocess_gradients_for_module( return module_gradients_dict + @override(TfLearner) + def compute_gradients( + self, + loss_per_module, + gradient_tape, + **kwargs, + ): + # Override of the default gradient computation method. 
+ # For DreamerV3, we need to compute gradients over the individual loss terms + # as otherwise, the world model's parameters would have their gradients also + # be influenced by the actor- and critic loss terms/gradient computations. + grads = {} + for component in ["world_model", "actor", "critic"]: + grads.update( + gradient_tape.gradient( + # Take individual loss term from the registered metrics for + # the main module. + self._metrics[DEFAULT_POLICY_ID][component.upper() + "_L_total"], + self.filter_param_dict_for_optimizer( + self._params, self.get_optimizer(optimizer_name=component) + ), + ) + ) + del gradient_tape + return grads + @override(TfLearner) def compute_loss_for_module( self, @@ -170,7 +201,11 @@ def compute_loss_for_module( + 0.1 * L_rep_B_T ) - # Sum up timesteps, and average over batch (see eq. 4 in [1]). + # In the paper, it says to sum up timesteps, and average over + # batch (see eq. 4 in [1]). But Danijar's implementation only does + # averaging (over B and T), so we'll do this here as well. This is generally + # true for all other loss terms as well (we'll always just average, no summing + # over T axis!). L_world_model_total = tf.reduce_mean(L_world_model_total_B_T) # Register world model loss stats. @@ -182,28 +217,36 @@ def compute_loss_for_module( ), # Prediction losses. # Decoder (obs) loss. - "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], "WORLD_MODEL_L_decoder": prediction_losses["L_decoder"], # Reward loss. - "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], "WORLD_MODEL_L_reward": prediction_losses["L_reward"], # Continue loss. - "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], "WORLD_MODEL_L_continue": prediction_losses["L_continue"], # Total. - "WORLD_MODEL_L_prediction_B_T": prediction_losses["L_prediction_B_T"], "WORLD_MODEL_L_prediction": prediction_losses["L_prediction"], # Dynamics loss. - "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T, "WORLD_MODEL_L_dynamics": L_dyn, # Representation loss. - "WORLD_MODEL_L_representation_B_T": L_rep_B_T, "WORLD_MODEL_L_representation": L_rep, # Total loss. - "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T, "WORLD_MODEL_L_total": L_world_model_total, }, ) + if hps.report_individual_batch_item_stats: + self.register_metrics( + module_id=module_id, + metrics_dict={ + "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], + "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], + "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], + "WORLD_MODEL_L_prediction_B_T": ( + prediction_losses["L_prediction_B_T"] + ), + "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T, + "WORLD_MODEL_L_representation_B_T": L_rep_B_T, + "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T, + }, + ) # Dream trajectories starting in all internal states (h + z_posterior) that were # computed during world model training. @@ -219,17 +262,31 @@ def compute_loss_for_module( timesteps_H=hps.horizon_H, gamma=hps.gamma, ) - self.register_metrics(module_id, {"dream_data": dream_data}) + if hps.report_dream_data: + # To reduce this massive mount of data a little, slice out a T=1 piece + # from each stats that has the shape (H, BxT), meaning convert e.g. + # `rewards_dreamed_t0_to_H_BxT` into `rewards_dreamed_t0_to_H_Bx1`. + # This will reduce the amount of data to be transferred and reported + # by the factor of `batch_length_T`. + self.register_metrics( + module_id, + { + # Replace 'T' with '1'. 
+ "DREAM_DATA_" + key[:-1] + "1": value[:, hps.batch_size_B] + for key, value in dream_data.items() + if key.endswith("H_BxT") + }, + ) value_targets_t0_to_Hm1_BxT = self._compute_value_targets( hps=hps, # Learn critic in symlog'd space. - rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_B"], + rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_BxT"], intrinsic_rewards_t1_to_H_BxT=( dream_data["rewards_intrinsic_t1_to_H_B"] if hps.use_curiosity else None ), - continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_B"], - value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_B"], + continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_BxT"], + value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_BxT"], ) self.register_metric( module_id, "VALUE_TARGETS_H_BxT", value_targets_t0_to_Hm1_BxT @@ -237,6 +294,7 @@ def compute_loss_for_module( CRITIC_L_total = self._compute_critic_loss( module_id=module_id, + hps=hps, dream_data=dream_data, value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT, ) @@ -250,16 +308,6 @@ def compute_loss_for_module( else: ACTOR_L_total = 0.0 - # if hps.use_curiosity: - # L_disagree = self._compute_disagree_loss(dream_data=dream_data) - # results["DISAGREE_L_total"] = L_disagree - # results["DISAGREE_intrinsic_rewards_H_B"] = ( - # dream_data["rewards_intrinsic_t1_to_H_B"] - # ) - # results["DISAGREE_intrinsic_rewards"] = tf.reduce_mean( - # dream_data["rewards_intrinsic_t1_to_H_B"] - # ) - # Return the total loss as a sum of all individual losses. return L_world_model_total + CRITIC_L_total + ACTOR_L_total @@ -289,16 +337,27 @@ def _compute_world_model_prediction_losses( # If symlog is disabled (e.g. for uint8 image inputs), `obs_symlog_BxT` is the # same as `obs_BxT`. obs_BxT = fwd_out["sampled_obs_symlog_BxT"] - obs_distr = fwd_out["obs_distribution_BxT"] + obs_distr_means = fwd_out["obs_distribution_means_BxT"] + # In case we wanted to construct a distribution object from the fwd out data, + # we would have to do it like this: + # obs_distr = tfp.distributions.MultivariateNormalDiag( + # loc=obs_distr_means, + # # Scale == 1.0. + # # [2]: "Distributions The image predictor outputs the mean of a diagonal + # # Gaussian likelihood with **unit variance** ..." + # scale_diag=tf.ones_like(obs_distr_means), + # ) + # Leave time dim folded (BxT) and flatten all other (e.g. image) dims. obs_BxT = tf.reshape(obs_BxT, shape=[-1, tf.reduce_prod(obs_BxT.shape[1:])]) - # Neg logp loss. - # decoder_loss = - obs_distr.log_prob(observations) - # decoder_loss /= observations.shape.as_list()[1] # Squared diff loss w/ sum(!) over all (already folded) obs dims. + # decoder_loss_BxT = SUM[ (obs_distr.loc - observations)^2 ] + # Note: This is described strangely in the paper (stating a neglogp loss here), + # but the author's own implementation actually uses simple MSE with the loc + # of the Gaussian. decoder_loss_BxT = tf.reduce_sum( - tf.math.square(obs_distr.loc - obs_BxT), axis=-1 + tf.math.square(obs_distr_means - obs_BxT), axis=-1 ) # Unfold time rank back in. @@ -456,30 +515,36 @@ def _compute_actor_loss( """ actor = self.module[module_id].actor - # Note: `value_targets` are NOT stop_gradient'd yet. + # Note: `scaled_value_targets_t0_to_Hm1_B` are NOT stop_gradient'd yet. 
scaled_value_targets_t0_to_Hm1_B = self._compute_scaled_value_targets( module_id=module_id, hps=hps, value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT, - value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_B"][:-1], + value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_BxT"][ + :-1 + ], ) # Actions actually taken in the dream. - actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_B"])[:-1] - dist_actions_t0_to_Hm1_B = dream_data[ - "actions_dreamed_distributions_t0_to_H_B" + actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_BxT"])[ + :-1 + ] + actions_dreamed_dist_params_t0_to_Hm1_B = dream_data[ + "actions_dreamed_dist_params_t0_to_H_BxT" ][:-1] + dist_t0_to_Hm1_B = actor.get_action_dist_object( + actions_dreamed_dist_params_t0_to_Hm1_B + ) + # Compute log(p)s of all possible actions in the dream. if isinstance(self.module[module_id].actor.action_space, gym.spaces.Discrete): # Note that when we create the Categorical action distributions, we compute # unimix probs, then math.log these and provide these log(p) as "logits" to # the Categorical. So here, we'll continue to work with log(p)s (not # really "logits")! - logp_actions_t0_to_Hm1_B = tf.stack( - [dist.logits for dist in dist_actions_t0_to_Hm1_B], - axis=0, - ) + logp_actions_t0_to_Hm1_B = actions_dreamed_dist_params_t0_to_Hm1_B + # Log probs of actions actually taken in the dream. logp_actions_dreamed_t0_to_Hm1_B = tf.reduce_sum( actions_dreamed * logp_actions_t0_to_Hm1_B, @@ -489,29 +554,18 @@ def _compute_actor_loss( logp_loss_H_B = logp_actions_dreamed_t0_to_Hm1_B * tf.stop_gradient( scaled_value_targets_t0_to_Hm1_B ) - elif isinstance(actor.action_space, gym.spaces.Box): - # TODO (Rohan138, Sven): Figure out how to vectorize this instead! - logp_actions_dreamed_t0_to_Hm1_B = tf.stack( - [ - dist.log_prob(actions_dreamed[i]) - for i, dist in enumerate(dist_actions_t0_to_Hm1_B) - ] + # Box space. + else: + logp_actions_dreamed_t0_to_Hm1_B = dist_t0_to_Hm1_B.log_prob( + actions_dreamed ) # First term of loss function. [1] eq. 11. logp_loss_H_B = scaled_value_targets_t0_to_Hm1_B - else: - raise ValueError(f"Invalid action space: {actor.action_space}") assert len(logp_loss_H_B.shape) == 2 # Add entropy loss term (second term [1] eq. 11). - entropy_H_B = tf.stack( - [ - dist.entropy() - for dist in dream_data["actions_dreamed_distributions_t0_to_H_B"][:-1] - ], - axis=0, - ) + entropy_H_B = dist_t0_to_Hm1_B.entropy() assert len(entropy_H_B.shape) == 2 entropy = tf.reduce_mean(entropy_H_B) @@ -520,31 +574,44 @@ def _compute_actor_loss( L_actor_H_B = L_actor_reinforce_term_H_B + L_actor_action_entropy_term_H_B # Mask out everything that goes beyond a predicted continue=False boundary. - L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_B"])[:-1] + L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[ + :-1 + ] L_actor = tf.reduce_mean(L_actor_H_B) self.register_metrics( module_id, metrics_dict={ - "ACTOR_L_total_H_B": L_actor_H_B, "ACTOR_L_total": L_actor, - "ACTOR_logp_actions_dreamed_H_B": logp_actions_dreamed_t0_to_Hm1_B, - "ACTOR_scaled_value_targets_H_B": scaled_value_targets_t0_to_Hm1_B, "ACTOR_value_targets_pct95_ema": actor.ema_value_target_pct95, "ACTOR_value_targets_pct5_ema": actor.ema_value_target_pct5, - "ACTOR_action_entropy_H_B": entropy_H_B, "ACTOR_action_entropy": entropy, # Individual loss terms. 
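+ # (The corresponding per-batch-item "_H_BxT" tensors for these loss terms
+ # are only registered further below, gated by
+ # `hps.report_individual_batch_item_stats`.)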
- "ACTOR_L_neglogp_reinforce_term_H_B": L_actor_reinforce_term_H_B, "ACTOR_L_neglogp_reinforce_term": tf.reduce_mean( L_actor_reinforce_term_H_B ), - "ACTOR_L_neg_entropy_term_H_B": L_actor_action_entropy_term_H_B, "ACTOR_L_neg_entropy_term": tf.reduce_mean( L_actor_action_entropy_term_H_B ), }, ) + if hps.report_individual_batch_item_stats: + self.register_metrics( + module_id, + metrics_dict={ + "ACTOR_L_total_H_BxT": L_actor_H_B, + "ACTOR_logp_actions_dreamed_H_BxT": ( + logp_actions_dreamed_t0_to_Hm1_B + ), + "ACTOR_scaled_value_targets_H_BxT": ( + scaled_value_targets_t0_to_Hm1_B + ), + "ACTOR_action_entropy_H_BxT": entropy_H_B, + # Individual loss terms. + "ACTOR_L_neglogp_reinforce_term_H_BxT": L_actor_reinforce_term_H_B, + "ACTOR_L_neg_entropy_term_H_BxT": L_actor_action_entropy_term_H_B, + }, + ) return L_actor @@ -552,6 +619,7 @@ def _compute_critic_loss( self, *, module_id: ModuleID, + hps: DreamerV3LearnerHyperparameters, dream_data: Dict[str, TensorType], value_targets_t0_to_Hm1_BxT: TensorType, ) -> TensorType: @@ -559,6 +627,7 @@ def _compute_critic_loss( Args: module_id: The ModuleID for which to compute the critic loss. + hps: The DreamerV3LearnerHyperparameters to use. dream_data: The data generated by dreaming for H steps (horizon) starting from any BxT state (sampled from the buffer for the train batch). value_targets_t0_to_Hm1_BxT: The computed value function targets of the @@ -567,7 +636,8 @@ def _compute_critic_loss( Returns: The total critic loss tensor. """ - H, B = dream_data["rewards_dreamed_t0_to_H_B"].shape[:2] + # B=BxT + H, B = dream_data["rewards_dreamed_t0_to_H_BxT"].shape[:2] Hm1 = H - 1 # Note that value targets are NOT symlog'd and go from t0 to H-1, not H, like @@ -586,7 +656,7 @@ def _compute_critic_loss( ) # Get (B x T x probs) tensor from return distributions. - value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxB"] + value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxBxT"] # Unfold time rank and cut last time index to match value targets. value_symlog_logits_t0_to_Hm1_B = tf.reshape( value_symlog_logits_HxB, @@ -608,7 +678,7 @@ def _compute_critic_loss( # Expected values (dreamed) from the EMA (slow critic) net. # Note: Slow critic (EMA) outputs are already stop_gradient'd. value_symlog_ema_t0_to_Hm1_B = tf.stop_gradient( - dream_data["v_symlog_dreamed_ema_t0_to_H_B"] + dream_data["v_symlog_dreamed_ema_t0_to_H_BxT"] )[:-1] # Fold time rank (for two_hot'ing). value_symlog_ema_HxB = tf.reshape(value_symlog_ema_t0_to_Hm1_B, (-1,)) @@ -634,7 +704,7 @@ def _compute_critic_loss( L_critic_H_B = value_loss_two_hot_H_B + ema_regularization_loss_H_B # Mask out everything that goes beyond a predicted continue=False boundary. - L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_B"])[ + L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[ :-1 ] @@ -644,21 +714,29 @@ def _compute_critic_loss( self.register_metrics( module_id=module_id, metrics_dict={ - # Symlog'd value targets. Critic learns to predict symlog'd values. - "VALUE_TARGETS_symlog_H_B": value_symlog_targets_t0_to_Hm1_B, - # Critic loss terms. 
"CRITIC_L_total": L_critic, - "CRITIC_L_total_H_B": L_critic_H_B, - "CRITIC_L_neg_logp_of_value_targets_H_B": value_loss_two_hot_H_B, "CRITIC_L_neg_logp_of_value_targets": tf.reduce_mean( value_loss_two_hot_H_B ), - "CRITIC_L_slow_critic_regularization_H_B": ema_regularization_loss_H_B, "CRITIC_L_slow_critic_regularization": tf.reduce_mean( ema_regularization_loss_H_B ), }, ) + if hps.report_individual_batch_item_stats: + self.register_metrics( + module_id=module_id, + metrics_dict={ + # Symlog'd value targets. Critic learns to predict symlog'd values. + "VALUE_TARGETS_symlog_H_BxT": value_symlog_targets_t0_to_Hm1_B, + # Critic loss terms. + "CRITIC_L_total_H_BxT": L_critic_H_B, + "CRITIC_L_neg_logp_of_value_targets_H_BxT": value_loss_two_hot_H_B, + "CRITIC_L_slow_critic_regularization_H_BxT": ( + ema_regularization_loss_H_B + ), + }, + ) return L_critic @@ -724,7 +802,7 @@ def _compute_value_targets( # intermediates.shape=[2-16, BxT] # Loop through reversed timesteps (axis=1) from T+1 to t=2. - for t in reversed(range(len(discount))): + for t in reversed(range(discount.shape[0])): Rs.append(intermediates[t] + discount[t] * hps.gae_lambda * Rs[-1]) # Reverse along time axis and cut the last entry (value estimate at very end @@ -767,21 +845,32 @@ def _compute_scaled_value_targets( Per_R_5 = tfp.stats.percentile(value_targets_H_B, 5) Per_R_95 = tfp.stats.percentile(value_targets_H_B, 95) - # Update EMAs stored in actor network. - # Initial values: Just set. - if tf.math.is_nan(actor.ema_value_target_pct5): - actor.ema_value_target_pct5.assign(Per_R_5) - actor.ema_value_target_pct95.assign(Per_R_95) - # Later update (something already stored in EMA variable): Update EMA. - else: - actor.ema_value_target_pct5.assign( + # Update EMA values for 5 and 95 percentile, stored as tf variables under actor + # network. + # 5 percentile + new_val_pct5 = tf.where( + tf.math.is_nan(actor.ema_value_target_pct5), + # is NaN: Initial values: Just set. + Per_R_5, + # Later update (something already stored in EMA variable): Update EMA. + ( hps.return_normalization_decay * actor.ema_value_target_pct5 + (1.0 - hps.return_normalization_decay) * Per_R_5 - ) - actor.ema_value_target_pct95.assign( + ), + ) + actor.ema_value_target_pct5.assign(new_val_pct5) + # 95 percentile + new_val_pct95 = tf.where( + tf.math.is_nan(actor.ema_value_target_pct95), + # is NaN: Initial values: Just set. + Per_R_95, + # Later update (something already stored in EMA variable): Update EMA. + ( hps.return_normalization_decay * actor.ema_value_target_pct95 + (1.0 - hps.return_normalization_decay) * Per_R_95 - ) + ), + ) + actor.ema_value_target_pct95.assign(new_val_pct95) # [1] eq. 11 (first term). # Danijar's code: TODO: describe ... diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py index 0cb088e60fd95..77c4c285b21ba 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py @@ -1,3 +1,12 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" from typing import Mapping, Any from ray.rllib.algorithms.dreamerv3.dreamerv3_rl_module import DreamerV3RLModule diff --git a/rllib/algorithms/dreamerv3/tf/models/actor_network.py b/rllib/algorithms/dreamerv3/tf/models/actor_network.py index f22617960b0a8..d865f85606a3a 100644 --- a/rllib/algorithms/dreamerv3/tf/models/actor_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/actor_network.py @@ -8,10 +8,12 @@ import gymnasium as gym from gymnasium.spaces import Box, Discrete import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.utils.framework import try_import_tf, try_import_tfp + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() class ActorNetwork(tf.keras.Model): @@ -28,19 +30,19 @@ class ActorNetwork(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", action_space: gym.Space, ): """Initializes an ActorNetwork instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. """ super().__init__(name="actor") - self.model_dimension = model_dimension + self.model_size = model_size self.action_space = action_space # The EMA decay variables used for the [Percentile(R, 95%) - Percentile(R, 5%)] @@ -55,20 +57,23 @@ def __init__( # For discrete actions, use a single MLP that computes logits. if isinstance(self.action_space, Discrete): self.mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=self.action_space.n, name="actor_mlp", ) # For cont. actions, use separate MLPs for Gaussian mean and stddev. + # TODO (sven): In the author's original code repo, this is NOT the case, + # inputs are pushed through a shared MLP, then only the two output linear + # layers are separate for std- and mean logits. elif isinstance(action_space, Box): output_layer_size = np.prod(action_space.shape) self.mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=output_layer_size, name="actor_mlp_mean", ) self.std_mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=output_layer_size, name="actor_mlp_std", ) @@ -76,15 +81,15 @@ def __init__( raise ValueError(f"Invalid action space: {action_space}") @tf.function - def call(self, h, z, return_distribution=False): + def call(self, h, z, return_distr_params=False): """Performs a forward pass through this policy network. Args: h: The deterministic hidden state of the sequence model. [B, dim(h)]. z: The stochastic discrete representations of the original observation input. [B, num_categoricals, num_classes]. - return_distribution: Whether to return (as a second tuple item) the action - distribution object created by the policy. + return_distr_params: Whether to return (as a second tuple item) the action + distribution parameter tensor created by the policy. """ # Flatten last two dims of z. assert len(z.shape) == 3 @@ -109,8 +114,10 @@ def call(self, h, z, return_distribution=False): # Danijar's code does: distr = [Distr class](logits=tf.log(probs)). # Not sure why we don't directly use the already available probs instead. 
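+ # Passing log(probs) as "logits" is valid here: tfp re-normalizes logits via
+ # a softmax, and softmax(log(p)) recovers p for any proper probability
+ # vector, so the unimix'd probs are preserved exactly.
+ # Note also the straight-through trick a few lines below: the forward pass
+ # uses the sampled one-hot action, while gradients flow through
+ # `action_probs` via `a = stop_gradient(sample) + probs - stop_gradient(probs)`.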
action_logits = tf.math.log(action_probs) - # Create the distribution object using the unimix'd logits. - distr = tfp.distributions.OneHotCategorical(logits=action_logits) + + # Distribution parameters are the log(probs) directly. + distr_params = action_logits + distr = self.get_action_dist_object(distr_params) action = tf.cast(tf.stop_gradient(distr.sample()), tf.float32) + ( action_probs - tf.stop_gradient(action_probs) @@ -122,15 +129,48 @@ def call(self, h, z, return_distribution=False): # minstd, maxstd taken from [1] from configs.yaml minstd = 0.1 maxstd = 1.0 + + # Distribution parameters are the squashed std_logits and the tanh'd + # mean logits. # squash std_logits from (-inf, inf) to (minstd, maxstd) std_logits = (maxstd - minstd) * tf.sigmoid(std_logits + 2.0) + minstd + mean_logits = tf.tanh(action_logits) + + distr_params = tf.concat([mean_logits, std_logits], axis=-1) + distr = self.get_action_dist_object(distr_params) + + action = distr.sample() + + if return_distr_params: + return action, distr_params + return action + + def get_action_dist_object(self, action_dist_params_T_B): + """Helper method to create an action distribution object from (T, B, ..) params. + + Args: + action_dist_params_T_B: The time-major action distribution parameters. + This could be simply the logits (discrete) or a to-be-split-in-2 + tensor for mean and stddev (continuous). + + Returns: + The tfp action distribution object, from which one can sample, compute + log probs, entropy, etc.. + """ + if isinstance(self.action_space, gym.spaces.Discrete): + # Create the distribution object using the unimix'd logits. + distr = tfp.distributions.OneHotCategorical(logits=action_dist_params_T_B) + + elif isinstance(self.action_space, gym.spaces.Box): # Compute Normal distribution from action_logits and std_logits - distr = tfp.distributions.Normal(tf.tanh(action_logits), std_logits) + loc, scale = tf.split(action_dist_params_T_B, 2, axis=-1) + distr = tfp.distributions.Normal(loc=loc, scale=scale) + # If action_space is a box with multiple dims, make individual dims # independent. distr = tfp.distributions.Independent(distr, len(self.action_space.shape)) - action = distr.sample() - if return_distribution: - return action, distr - return action + else: + raise ValueError(f"Action space {self.action_space} not supported!") + + return distr diff --git a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py index ba9ec38a0fa55..0700240f1bf8c 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py @@ -5,9 +5,10 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class CNNAtari(tf.keras.Model): @@ -16,13 +17,13 @@ class CNNAtari(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", cnn_multiplier: Optional[int] = None, ): """Initializes a CNNAtari instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN layer. 
Starting with @@ -32,7 +33,7 @@ def __init__( """ super().__init__(name="image_encoder") - cnn_multiplier = get_cnn_multiplier(model_dimension, override=cnn_multiplier) + cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier) # See appendix C in [1]: # "We use a similar network architecture but employ layer normalization and diff --git a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py index 41031c950e11b..a23ddca856c87 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py @@ -5,10 +5,11 @@ """ from typing import Optional -import tensorflow as tf -import tensorflow_probability as tfp - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.utils.framework import try_import_tf, try_import_tfp + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() class ContinuePredictor(tf.keras.Model): @@ -23,15 +24,15 @@ class ContinuePredictor(tf.keras.Model): terminal. """ - def __init__(self, *, model_dimension: Optional[str] = "XS"): + def __init__(self, *, model_size: Optional[str] = "XS"): """Initializes a ContinuePredictor instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. """ super().__init__(name="continue_predictor") - self.mlp = MLP(model_dimension=model_dimension, output_layer_size=1) + self.mlp = MLP(model_size=model_size, output_layer_size=1) def call(self, h, z, return_distribution=False): """Performs a forward pass through the continue predictor. diff --git a/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py index cffa73adb8029..ebc8649ccd79b 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py @@ -10,10 +10,11 @@ from typing import Optional import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class ConvTransposeAtari(tf.keras.Model): @@ -28,14 +29,14 @@ class ConvTransposeAtari(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", cnn_multiplier: Optional[int] = None, gray_scaled: bool, ): """Initializes a ConvTransposeAtari instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN transpose layer. Starting with @@ -47,7 +48,7 @@ def __init__( """ super().__init__(name="image_decoder") - cnn_multiplier = get_cnn_multiplier(model_dimension, override=cnn_multiplier) + cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier) # The shape going into the first Conv2DTranspose layer. # We start with a 4x4 channels=8 "image". @@ -146,15 +147,9 @@ def call(self, h, z): # From [2]: # "Distributions: The image predictor outputs the mean of a diagonal Gaussian # likelihood with unit variance, ..." 
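+ # With unit variance, the per-pixel negative log-likelihood reduces to
+ # 0.5 * (x - loc)^2 plus a constant, so it suffices to return only the mean
+ # (`loc`) and let the learner apply a squared-error loss.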
+ # Reshape `out` for the diagonal multi-variate Gaussian (each pixel is its own # independent (b/c diagonal co-variance matrix) variable). loc = tf.reshape(out, shape=(out_shape[0], -1)) - distribution = tfp.distributions.MultivariateNormalDiag( - loc=loc, - # Scale == 1.0. - # [2]: "Distributions The image predictor outputs the mean of a diagonal - # Gaussian likelihood with **unit variance** ..." - scale_diag=tf.ones_like(loc), - ) - pred_obs = distribution.sample() - return pred_obs, distribution + + return loc diff --git a/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py index fc69c8dd33f9c..559009a44531f 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py @@ -5,12 +5,13 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import ( RepresentationLayer, ) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class DynamicsPredictor(tf.keras.Model): @@ -26,17 +27,17 @@ class DynamicsPredictor(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_categoricals: Optional[int] = None, num_classes_per_categorical: Optional[int] = None, ): """Initializes a DynamicsPredictor instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different parameters. num_categoricals: Overrides the number of categoricals used in the z-states. - In [1], 32 is used for any model dimension. + In [1], 32 is used for any model size. num_classes_per_categorical: Overrides the number of classes within each categorical used for the z-states. In [1], 32 is used for any model dimension. @@ -47,12 +48,12 @@ def __init__( # TODO: In Danijar's code, the Dynamics Net only has a single layer, no # matter the model size. num_dense_layers=1, - model_dimension=model_dimension, + model_size=model_size, output_layer_size=None, ) # The (prior) z-state generating layer. self.representation_layer = RepresentationLayer( - model_dimension=model_dimension, + model_size=model_size, num_categoricals=num_categoricals, num_classes_per_categorical=num_classes_per_categorical, ) diff --git a/rllib/algorithms/dreamerv3/tf/models/components/mlp.py b/rllib/algorithms/dreamerv3/tf/models/components/mlp.py index 30d4a7713ee1a..435d9f8544ab3 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/mlp.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/mlp.py @@ -9,12 +9,13 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.utils import ( get_dense_hidden_units, get_num_dense_layers, ) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class MLP(tf.keras.Model): @@ -22,13 +23,13 @@ class MLP(tf.keras.Model): MLP=multi-layer perceptron. - See Appendix B in [1] for the MLP sizes depending on the given `model_dimension`. + See Appendix B in [1] for the MLP sizes depending on the given `model_size`. 
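+    (For example, per the size tables in `utils/__init__.py` below: "XS" -> 1
+    layer of 256 units, "S" -> 2 x 512, "M" -> 3 x 640, "L" -> 4 x 768,
+    "XL" -> 5 x 1024.)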
""" def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_dense_layers: Optional[int] = None, dense_hidden_units: Optional[int] = None, output_layer_size=None, @@ -38,12 +39,12 @@ def __init__( """Initializes an MLP instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. num_dense_layers: The number of hidden layers in the MLP. If None, - will use `model_dimension` and appendix B to figure out this value. + will use `model_size` and appendix B to figure out this value. dense_hidden_units: The number of nodes in each hidden layer. If None, - will use `model_dimension` and appendix B to figure out this value. + will use `model_size` and appendix B to figure out this value. output_layer_size: The size of an optional linear (no activation) output layer. If None, no output layer will be added on top of the MLP dense stack. @@ -52,11 +53,9 @@ def __init__( """ super().__init__(name=name or "mlp") - num_dense_layers = get_num_dense_layers( - model_dimension, override=num_dense_layers - ) + num_dense_layers = get_num_dense_layers(model_size, override=num_dense_layers) dense_hidden_units = get_dense_hidden_units( - model_dimension, override=dense_hidden_units + model_size, override=dense_hidden_units ) self.dense_layers = [] diff --git a/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py b/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py index 36e2ace631844..cf6b27b3c68ff 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py @@ -9,13 +9,14 @@ """ from typing import Optional -import tensorflow as tf -import tensorflow_probability as tfp - from ray.rllib.algorithms.dreamerv3.utils import ( get_num_z_categoricals, get_num_z_classes, ) +from ray.rllib.utils.framework import try_import_tf, try_import_tfp + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() class RepresentationLayer(tf.keras.layers.Layer): @@ -29,26 +30,26 @@ class RepresentationLayer(tf.keras.layers.Layer): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_categoricals: Optional[int] = None, num_classes_per_categorical: Optional[int] = None, ): """Initializes a RepresentationLayer instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different parameters. num_categoricals: Overrides the number of categoricals used in the z-states. - In [1], 32 is used for any model dimension. + In [1], 32 is used for any model size. num_classes_per_categorical: Overrides the number of classes within each categorical used for the z-states. In [1], 32 is used for any model dimension. 
""" self.num_categoricals = get_num_z_categoricals( - model_dimension, override=num_categoricals + model_size, override=num_categoricals ) self.num_classes_per_categorical = get_num_z_classes( - model_dimension, override=num_classes_per_categorical + model_size, override=num_classes_per_categorical ) super().__init__( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py index 7af29664c6024..c8ce0fc260fd6 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py @@ -5,12 +5,13 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, ) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class RewardPredictor(tf.keras.Model): @@ -22,7 +23,7 @@ class RewardPredictor(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, @@ -30,7 +31,7 @@ def __init__( """Initializes a RewardPredictor instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. num_buckets: The number of buckets to create. Note that the number of possible symlog'd outcomes from the used distribution is @@ -51,7 +52,7 @@ def __init__( super().__init__(name="reward_predictor") self.mlp = MLP( - model_dimension=model_dimension, + model_size=model_size, output_layer_size=None, ) self.reward_layer = RewardPredictorLayer( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py index f9c92e92e7279..185098b15b2bc 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py @@ -7,7 +7,9 @@ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba https://arxiv.org/pdf/2010.02193.pdf """ -import tensorflow as tf +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class RewardPredictorLayer(tf.keras.layers.Layer): @@ -15,7 +17,7 @@ class RewardPredictorLayer(tf.keras.layers.Layer): This layer is used in two models in DreamerV3: The reward predictor of the world model and the value function. K is 255 by default (see [1]) and doesn't change - with the model dimension. + with the model size. Possible predicted reward/values range from symexp(-20.0) to symexp(20.0), which should cover any possible environment. 
Outputs of this layer are generated by diff --git a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py index 5f1d02f539ed8..d8ee68499625a 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py @@ -6,10 +6,12 @@ from typing import Optional import gymnasium as gym -import tensorflow as tf from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.utils import get_gru_units +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class SequenceModel(tf.keras.Model): @@ -37,23 +39,23 @@ class SequenceModel(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", action_space: gym.Space, num_gru_units: Optional[int] = None, ): """Initializes a SequenceModel instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the number of GRU units used. action_space: The action space the our environment used. num_gru_units: Overrides the number of GRU units (dimension of the h-state). - If None, use the value given through `model_dimension` + If None, use the value given through `model_size` (see [1] Appendix B). """ super().__init__(name="sequence_model") - num_gru_units = get_gru_units(model_dimension, override=num_gru_units) + num_gru_units = get_gru_units(model_size, override=num_gru_units) self.action_space = action_space # In Danijar's code, there is an additional layer (units=[model_size]) @@ -61,7 +63,7 @@ def __init__( # the paper. self.pre_gru_layer = MLP( num_dense_layers=1, - model_dimension=model_dimension, + model_size=model_size, output_layer_size=None, ) self.gru_unit = tf.keras.layers.GRU( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py index 08dadaf6494d4..bcfdb164e6d0a 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py @@ -6,10 +6,11 @@ from typing import Optional import gymnasium as gym -import tensorflow as tf -import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class VectorDecoder(tf.keras.Model): @@ -22,13 +23,13 @@ class VectorDecoder(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", observation_space: gym.Space, ): """Initializes a VectorDecoder instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. observation_space: The observation space to decode back into. This must be a Box of shape (d,), where d >= 1. @@ -41,7 +42,7 @@ def __init__( ) self.mlp = MLP( - model_dimension=model_dimension, + model_size=model_size, output_layer_size=observation_space.shape[0], ) @@ -62,13 +63,5 @@ def call(self, h, z): # Send h-cat-z through MLP to get mean values of diag gaussian. loc = self.mlp(out) - # Create the Gaussian diag distribution. 
- distribution = tfp.distributions.MultivariateNormalDiag( - loc=loc, - # Scale == 1.0. - scale_diag=tf.ones_like(loc), - ) - pred_obs = distribution.sample() - - # Always return both predicted observations (sample0 and distribution. - return pred_obs, distribution + # Return only the predicted observations (mean, no sample). + return loc diff --git a/rllib/algorithms/dreamerv3/tf/models/critic_network.py b/rllib/algorithms/dreamerv3/tf/models/critic_network.py index 837ca68ccfdcf..d40441e585baf 100644 --- a/rllib/algorithms/dreamerv3/tf/models/critic_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/critic_network.py @@ -5,12 +5,13 @@ """ from typing import Optional -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, ) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() class CriticNetwork(tf.keras.Model): @@ -27,7 +28,7 @@ class CriticNetwork(tf.keras.Model): def __init__( self, *, - model_dimension: Optional[str] = "XS", + model_size: Optional[str] = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, @@ -36,7 +37,7 @@ def __init__( """Initializes a CriticNetwork instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. num_buckets: The number of buckets to create. Note that the number of possible symlog'd outcomes from the used distribution is @@ -63,7 +64,7 @@ def __init__( """ super().__init__(name="critic") - self.model_dimension = model_dimension + self.model_size = model_size self.ema_decay = ema_decay # "Fast" critic network(s) (mlp + reward-pred-layer). This is the network @@ -72,7 +73,7 @@ def __init__( # the critic loss term such that the weights of this fast critic stay close # to the EMA weights (see below). self.mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=None, ) self.return_layer = RewardPredictorLayer( @@ -85,7 +86,7 @@ def __init__( # target net, BUT not used to compute anything, just for the # weights regularizer term inside the critic loss). 
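+ # During training, these EMA weights track the fast critic's weights,
+ # roughly: w_ema <- ema_decay * w_ema + (1 - ema_decay) * w_fast.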
self.mlp_ema = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=None, trainable=False, ) diff --git a/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py b/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py index d186fdcd39eba..1a6f95245e302 100644 --- a/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py +++ b/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py @@ -4,12 +4,14 @@ https://arxiv.org/pdf/2301.04104v1.pdf """ -import tensorflow as tf - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import ( RepresentationLayer, ) +from ray.rllib.utils.framework import try_import_tf, try_import_tfp + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() class DisagreeNetworks(tf.keras.Model): @@ -21,10 +23,10 @@ class DisagreeNetworks(tf.keras.Model): TODO """ - def __init__(self, *, num_networks, model_dimension, intrinsic_rewards_scale): + def __init__(self, *, num_networks, model_size, intrinsic_rewards_scale): super().__init__(name="disagree_networks") - self.model_dimension = model_dimension + self.model_size = model_size self.num_networks = num_networks self.intrinsic_rewards_scale = intrinsic_rewards_scale @@ -34,15 +36,13 @@ def __init__(self, *, num_networks, model_dimension, intrinsic_rewards_scale): for _ in range(self.num_networks): self.mlps.append( MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=None, trainable=True, ) ) self.representation_layers.append( - RepresentationLayer( - model_dimension=self.model_dimension, name="disagree" - ) + RepresentationLayer(model_size=self.model_size, name="disagree") ) @tf.function diff --git a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py index 9621c95ce3c22..f735b9e031ea3 100644 --- a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py @@ -7,20 +7,25 @@ import gymnasium as gym import numpy as np -import tensorflow as tf from ray.rllib.algorithms.dreamerv3.tf.models.disagree_networks import DisagreeNetworks - +from ray.rllib.algorithms.dreamerv3.tf.models.actor_network import ActorNetwork +from ray.rllib.algorithms.dreamerv3.tf.models.critic_network import CriticNetwork +from ray.rllib.algorithms.dreamerv3.tf.models.world_model import WorldModel +from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.tf_utils import inverse_symlog +_, tf, _ = try_import_tf() + class DreamerModel(tf.keras.Model): """The main tf-keras model containing all necessary components for DreamerV3. Includes: - - The world model (with encoder, decoder, sequence-model (RSSM), dynamics - (prior z-state generating) model, and "posterior" model) for producing dreamed - trajectories. + - The world model with encoder, decoder, sequence-model (RSSM), dynamics + (generates prior z-state), and "posterior" model (generates posterior z-state). + Predicts env dynamics and produces dreamed trajectories for actor- and critic + learning. - The actor network (policy). - The critic network for value function prediction. 
""" @@ -28,32 +33,29 @@ class DreamerModel(tf.keras.Model): def __init__( self, *, - model_dimension: str = "XS", + model_size: str = "XS", action_space: gym.Space, - batch_size_B, - batch_length_T, - horizon_H, - world_model, - actor, - critic, + world_model: WorldModel, + actor: ActorNetwork, + critic: CriticNetwork, use_curiosity: bool = False, intrinsic_rewards_scale: float = 0.1, ): - """TODO + """Initializes a DreamerModel instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. + world_model: The WorldModel component. + actor: The ActorNetwork component. + critic: The CriticNetwork component. """ super().__init__(name="dreamer_model") - self.model_dimension = model_dimension + self.model_size = model_size self.action_space = action_space self.use_curiosity = use_curiosity - self.batch_size_B = batch_size_B - self.batch_length_T = batch_length_T - self.horizon_H = horizon_H self.world_model = world_model self.actor = actor @@ -63,7 +65,7 @@ def __init__( if self.use_curiosity: self.disagree_nets = DisagreeNetworks( num_networks=8, - model_dimension=self.model_dimension, + model_size=self.model_size, intrinsic_rewards_scale=intrinsic_rewards_scale, ) @@ -97,11 +99,11 @@ def call( actions = self.actor( h=results["h_states_BxT"], z=results["z_posterior_states_BxT"] ) - # Actor (with returning distribution). - _, distr = self.actor( + # Actor (with returning distribution parameters). + _, distr_params = self.actor( h=results["h_states_BxT"], z=results["z_posterior_states_BxT"], - return_distribution=True, + return_distr_params=True, ) # Critic. values = self.critic( @@ -155,8 +157,11 @@ def forward_inference(self, observations, previous_states, is_first, training=No is_first=is_first, ) # Compute action using our actor network and the current states. - _, distr = self.actor(h=states["h"], z=states["z"], return_distribution=True) + _, distr_params = self.actor( + h=states["h"], z=states["z"], return_distr_params=True + ) # Use the mode of the distribution (Discrete=argmax, Normal=mean). + distr = self.actor.get_action_dist_object(distr_params) actions = distr.mode() return actions, {"h": states["h"], "z": states["z"], "a": actions} @@ -267,9 +272,9 @@ def dream_trajectory( timesteps_H: The number of timesteps to dream for. gamma: The discount factor gamma. """ - # Dreamed actions (one-hot for discrete actions). + # Dreamed actions (one-hot encoded for discrete actions). a_dreamed_t0_to_H = [] - a_dreamed_distributions_t0_to_H = [] + a_dreamed_dist_params_t0_to_H = [] h = start_states["h"] z = start_states["z"] @@ -281,7 +286,7 @@ def dream_trajectory( # Compute `a` using actor network (already the first step uses a dreamed action, # not a sampled one). - a, a_dist = self.actor( + a, a_dist_params = self.actor( # We have to stop the gradients through the states. B/c we are using a # differentiable Discrete action distribution (straight through gradients # with `a = stop_gradient(sample(probs)) + probs - stop_gradient(probs)`, @@ -289,10 +294,10 @@ def dream_trajectory( # term on actions further back in the trajectory. 
h=tf.stop_gradient(h), z=tf.stop_gradient(z), - return_distribution=True, + return_distr_params=True, ) a_dreamed_t0_to_H.append(a) - a_dreamed_distributions_t0_to_H.append(a_dist) + a_dreamed_dist_params_t0_to_H.append(a_dist_params) for i in range(timesteps_H): # Move one step in the dream using the RSSM. @@ -304,13 +309,13 @@ def dream_trajectory( z_states_prior_t0_to_H.append(z) # Compute `a` using actor network. - a, a_dist = self.actor( + a, a_dist_params = self.actor( h=tf.stop_gradient(h), z=tf.stop_gradient(z), - return_distribution=True, + return_distr_params=True, ) a_dreamed_t0_to_H.append(a) - a_dreamed_distributions_t0_to_H.append(a_dist) + a_dreamed_dist_params_t0_to_H.append(a_dist_params) h_states_H_B = tf.stack(h_states_t0_to_H, axis=0) # (T, B, ...) h_states_HxB = tf.reshape(h_states_H_B, [-1] + h_states_H_B.shape.as_list()[2:]) @@ -321,6 +326,7 @@ def dream_trajectory( ) a_dreamed_H_B = tf.stack(a_dreamed_t0_to_H, axis=0) # (T, B, ...) + a_dreamed_dist_params_H_B = tf.stack(a_dreamed_dist_params_t0_to_H, axis=0) # Compute r using reward predictor. r_dreamed_H_B = tf.reshape( @@ -389,17 +395,20 @@ def dream_trajectory( ) ret = { - "h_states_t0_to_H_B": h_states_H_B, - "z_states_prior_t0_to_H_B": z_states_prior_H_B, - "rewards_dreamed_t0_to_H_B": r_dreamed_H_B, - "continues_dreamed_t0_to_H_B": c_dreamed_H_B, - "actions_dreamed_t0_to_H_B": a_dreamed_H_B, - "actions_dreamed_distributions_t0_to_H_B": a_dreamed_distributions_t0_to_H, - "values_dreamed_t0_to_H_B": v_dreamed_H_B, - "values_symlog_dreamed_logits_t0_to_HxB": v_symlog_dreamed_logits_HxB, - "v_symlog_dreamed_ema_t0_to_H_B": v_symlog_dreamed_ema_H_B, + "h_states_t0_to_H_BxT": h_states_H_B, + "z_states_prior_t0_to_H_BxT": z_states_prior_H_B, + "rewards_dreamed_t0_to_H_BxT": r_dreamed_H_B, + "continues_dreamed_t0_to_H_BxT": c_dreamed_H_B, + "actions_dreamed_t0_to_H_BxT": a_dreamed_H_B, + # "actions_dreamed_distributions_t0_to_H_BxT": ( + # a_dreamed_distributions_t0_to_H + # ), + "actions_dreamed_dist_params_t0_to_H_BxT": a_dreamed_dist_params_H_B, + "values_dreamed_t0_to_H_BxT": v_dreamed_H_B, + "values_symlog_dreamed_logits_t0_to_HxBxT": v_symlog_dreamed_logits_HxB, + "v_symlog_dreamed_ema_t0_to_H_BxT": v_symlog_dreamed_ema_H_B, # Loss weights for critic- and actor losses. - "dream_loss_weights_t0_to_H_B": dream_loss_weights_H_B, + "dream_loss_weights_t0_to_H_BxT": dream_loss_weights_H_B, } if self.use_curiosity: @@ -537,20 +546,20 @@ def dream_trajectory_with_burn_in( # an original time dimension from the real env, from all of which we then branch # out our dream trajectories). ret = { - "h_states_t0_to_H_B": h_states_t0_to_H_B, - "z_states_prior_t0_to_H_B": z_states_prior_t0_to_H_B, + "h_states_t0_to_H_BxT": h_states_t0_to_H_B, + "z_states_prior_t0_to_H_BxT": z_states_prior_t0_to_H_B, # Unfold time-ranks in predictions. - "rewards_dreamed_t0_to_H_B": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)), - "continues_dreamed_t0_to_H_B": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)), + "rewards_dreamed_t0_to_H_BxT": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)), + "continues_dreamed_t0_to_H_BxT": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)), } # Figure out action key (random, sampled from env, dreamed?). 
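+ # Naming convention: the "_BxT" suffix indicates that the batch- and time
+ # dims of the real-env sample batch have been folded into a single axis;
+ # dreamed tensors thus carry the dream timesteps (t0..H) as their leading
+ # axis, followed by this folded B*T axis.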
if use_sampled_actions_in_dream: - key = "actions_sampled_t0_to_H_B" + key = "actions_sampled_t0_to_H_BxT" elif use_random_actions_in_dream: - key = "actions_random_t0_to_H_B" + key = "actions_random_t0_to_H_BxT" else: - key = "actions_dreamed_t0_to_H_B" + key = "actions_dreamed_t0_to_H_BxT" ret[key] = a_t0_to_H_B # Also provide int-actions, if discrete action space. diff --git a/rllib/algorithms/dreamerv3/tf/models/world_model.py b/rllib/algorithms/dreamerv3/tf/models/world_model.py index 39fa3e587d6ef..73195fc8e1a0b 100644 --- a/rllib/algorithms/dreamerv3/tf/models/world_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/world_model.py @@ -6,7 +6,6 @@ from typing import Optional import gymnasium as gym -import tensorflow as tf import tree # pip install dm_tree from ray.rllib.algorithms.dreamerv3.tf.models.components.continue_predictor import ( @@ -26,9 +25,13 @@ SequenceModel, ) from ray.rllib.algorithms.dreamerv3.utils import get_gru_units +from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.tf_utils import symlog +_, tf, _ = try_import_tf() + + class WorldModel(tf.keras.Model): """WorldModel component of [1] w/ encoder, decoder, RSSM, reward/cont. predictors. @@ -56,7 +59,7 @@ class WorldModel(tf.keras.Model): def __init__( self, *, - model_dimension: str = "XS", + model_size: str = "XS", action_space: gym.Space, batch_length_T: int = 64, encoder: tf.keras.Model, @@ -67,7 +70,7 @@ def __init__( """Initializes a WorldModel instance. Args: - model_dimension: The "Model Size" used according to [1] Appendinx B. + model_size: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. batch_length_T: The length (T) of the sequences used for training. The @@ -87,7 +90,7 @@ def __init__( the last decoder layer produces the exact, normalized pixel values (not a Gaussian as described in [1]!). num_gru_units: The number of GRU units to use. If None, use - `model_dimension` to figure out this parameter. + `model_size` to figure out this parameter. symlog_obs: Whether to predict decoded observations in symlog space. This should be False for image based observations. According to the paper [1] Appendix E: "NoObsSymlog: This ablation @@ -98,7 +101,7 @@ def __init__( """ super().__init__(name="world_model") - self.model_dimension = model_dimension + self.model_size = model_size self.batch_length_T = batch_length_T self.symlog_obs = symlog_obs self.action_space = action_space @@ -109,7 +112,7 @@ def __init__( # Posterior predictor consisting of an MLP and a RepresentationLayer: # [ht, lt] -> zt. self.posterior_mlp = MLP( - model_dimension=self.model_dimension, + model_size=self.model_size, output_layer_size=None, # In Danijar's code, the posterior predictor only has a single layer, # no matter the model size: @@ -118,17 +121,15 @@ def __init__( ) # The (posterior) z-state generating layer. self.posterior_representation_layer = RepresentationLayer( - model_dimension=self.model_dimension, + model_size=self.model_size, ) # Dynamics (prior z-state) predictor: ht -> z^t - self.dynamics_predictor = DynamicsPredictor( - model_dimension=self.model_dimension - ) + self.dynamics_predictor = DynamicsPredictor(model_size=self.model_size) # GRU for the RSSM: [at, ht, zt] -> ht+1 self.num_gru_units = get_gru_units( - model_dimension=self.model_dimension, + model_size=self.model_size, override=num_gru_units, ) # Initial h-state variable (learnt). 
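For orientation, the surrounding hunks assemble the RSSM from the components created in this constructor (posterior MLP + representation layer, dynamics predictor, pre-GRU MLP + GRU). Below is a minimal, illustrative sketch of a single recurrent state update; it omits the pre-GRU MLP, unimix, and the categorical sampling done by the RepresentationLayer, and all layer/variable names are hypothetical stand-ins rather than RLlib API:

import tensorflow as tf

B, num_cats, num_classes, act_dim, gru_units, embed_dim = 4, 32, 32, 6, 256, 1024

gru_cell = tf.keras.layers.GRUCell(gru_units)                  # stands in for the sequence model
dynamics_net = tf.keras.layers.Dense(num_cats * num_classes)   # prior z-logits
posterior_net = tf.keras.layers.Dense(num_cats * num_classes)  # posterior z-logits

def rssm_step(h, z, a, obs_embedding):
    # Sequence model: [a_t, z_t] (plus recurrent h_t) -> h_t+1.
    z_flat = tf.reshape(z, [tf.shape(z)[0], -1])
    h_next, _ = gru_cell(tf.concat([a, z_flat], axis=-1), states=[h])
    # Prior ("dynamics") z-logits: computed from h alone (used while dreaming).
    z_prior_logits = dynamics_net(h_next)
    # Posterior z-logits: additionally conditioned on the encoded observation
    # (used when replaying real env sequences).
    z_post_logits = posterior_net(tf.concat([h_next, obs_embedding], axis=-1))
    return h_next, z_prior_logits, z_post_logits

h = tf.zeros([B, gru_units])
z = tf.zeros([B, num_cats, num_classes])
a = tf.zeros([B, act_dim])
emb = tf.zeros([B, embed_dim])
h_next, prior_logits, post_logits = rssm_step(h, z, a, emb)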
@@ -142,17 +143,15 @@ def __init__( ) # The actual sequence model containing the GRU layer. self.sequence_model = SequenceModel( - model_dimension=self.model_dimension, + model_size=self.model_size, action_space=self.action_space, num_gru_units=self.num_gru_units, ) # Reward Predictor: [ht, zt] -> rt. - self.reward_predictor = RewardPredictor(model_dimension=self.model_dimension) + self.reward_predictor = RewardPredictor(model_size=self.model_size) # Continue Predictor: [ht, zt] -> ct. - self.continue_predictor = ContinuePredictor( - model_dimension=self.model_dimension - ) + self.continue_predictor = ContinuePredictor(model_size=self.model_size) # Decoder: [ht, zt] -> x^t. self.decoder = decoder @@ -276,7 +275,7 @@ def forward_train(self, observations, actions, is_first, training=None): # Make actions and `is_first` time-major. actions = tf.transpose( actions, - perm=[1, 0] + list(range(2, len(actions.shape))), # .as_list() TODO + perm=[1, 0] + list(range(2, tf.shape(actions).shape.as_list()[0])), ) is_first = tf.transpose(is_first, perm=[1, 0]) @@ -343,7 +342,7 @@ def forward_train(self, observations, actions, is_first, training=None): h_BxT = tf.reshape(h_t1_to_T, shape=[-1] + h_t1_to_T.shape.as_list()[2:]) z_BxT = tf.reshape(z_t1_to_T, shape=[-1] + z_t1_to_T.shape.as_list()[2:]) - _, obs_distribution = self.decoder(h=h_BxT, z=z_BxT) + obs_distribution_means = self.decoder(h=h_BxT, z=z_BxT) # Compute (predicted) reward distributions. rewards, reward_logits = self.reward_predictor( @@ -356,11 +355,11 @@ def forward_train(self, observations, actions, is_first, training=None): ) # Return outputs for loss computation. - # Note that all shapes are [B, ...] (no time axis). + # Note that all shapes are [BxT, ...] (time axis already folded). return { # Obs. "sampled_obs_symlog_BxT": observations, - "obs_distribution_BxT": obs_distribution, + "obs_distribution_means_BxT": obs_distribution_means, # Rewards. "reward_logits_BxT": reward_logits, "rewards_BxT": rewards, diff --git a/rllib/algorithms/dreamerv3/utils/__init__.py b/rllib/algorithms/dreamerv3/utils/__init__.py new file mode 100644 index 0000000000000..592bbf9b32e82 --- /dev/null +++ b/rllib/algorithms/dreamerv3/utils/__init__.py @@ -0,0 +1,168 @@ +""" +Utility functions for the DreamerV3 ([1]) algorithm. + +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf +""" + +_ALLOWED_MODEL_DIMS = [ + # RLlib debug sizes (not mentioned in [1]). + "nano", + "micro", + "mini", + "XXS", + # Regular sizes (listed in table B in [1]). 
+ "XS", + "S", + "M", + "L", + "XL", +] + + +def get_cnn_multiplier(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + cnn_multipliers = { + "nano": 2, + "micro": 4, + "mini": 8, + "XXS": 16, + "XS": 24, + "S": 32, + "M": 48, + "L": 64, + "XL": 96, + } + return cnn_multipliers[model_size] + + +def get_dense_hidden_units(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + dense_units = { + "nano": 16, + "micro": 32, + "mini": 64, + "XXS": 128, + "XS": 256, + "S": 512, + "M": 640, + "L": 768, + "XL": 1024, + } + return dense_units[model_size] + + +def get_gru_units(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 16, + "micro": 32, + "mini": 64, + "XXS": 128, + "XS": 256, + "S": 512, + "M": 1024, + "L": 2048, + "XL": 4096, + } + return gru_units[model_size] + + +def get_num_z_categoricals(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 4, + "micro": 8, + "mini": 16, + "XXS": 32, + "XS": 32, + "S": 32, + "M": 32, + "L": 32, + "XL": 32, + } + return gru_units[model_size] + + +def get_num_z_classes(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 4, + "micro": 8, + "mini": 16, + "XXS": 32, + "XS": 32, + "S": 32, + "M": 32, + "L": 32, + "XL": 32, + } + return gru_units[model_size] + + +def get_num_curiosity_nets(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + num_curiosity_nets = { + "nano": 8, + "micro": 8, + "mini": 16, + "XXS": 8, + "XS": 8, + "S": 8, + "M": 8, + "L": 8, + "XL": 8, + } + return num_curiosity_nets[model_size] + + +def get_num_dense_layers(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + num_dense_layers = { + "nano": 1, + "micro": 1, + "mini": 1, + "XXS": 1, + "XS": 1, + "S": 2, + "M": 3, + "L": 4, + "XL": 5, + } + return num_dense_layers[model_size] + + +def do_symlog_obs(observation_space, symlog_obs_user_setting): + # If our symlog_obs setting is NOT set specifically (it's set to "auto"), return + # True if we don't have an image observation space, otherwise return False. + + # TODO (sven): Support mixed observation spaces. 
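+ # For reference: symlog(x) = sign(x) * log(|x| + 1) and its inverse
+ # symexp(x) = sign(x) * (exp(|x|) - 1); RLlib's implementations are
+ # `symlog` / `inverse_symlog` in `ray.rllib.utils.tf_utils`.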
+ + is_image_space = len(observation_space.shape) in [2, 3] + return ( + not is_image_space + if symlog_obs_user_setting == "auto" + else symlog_obs_user_setting + ) diff --git a/rllib/algorithms/dreamerv3/utils/debugging.py b/rllib/algorithms/dreamerv3/utils/debugging.py new file mode 100644 index 0000000000000..1a4cf515d9f41 --- /dev/null +++ b/rllib/algorithms/dreamerv3/utils/debugging.py @@ -0,0 +1,185 @@ +import gymnasium as gym +import numpy as np +from PIL import Image, ImageDraw + +from gymnasium.envs.classic_control.cartpole import CartPoleEnv + +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class CartPoleDebug(CartPoleEnv): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + low = np.concatenate([np.array([0.0]), self.observation_space.low]) + high = np.concatenate([np.array([1000.0]), self.observation_space.high]) + + self.observation_space = gym.spaces.Box(low, high, shape=(5,), dtype=np.float32) + + self.timesteps_ = 0 + + def reset(self, *, seed=None, options=None): + ret = super().reset() + self.timesteps_ = 0 + obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) + return obs, ret[1] + + def step(self, action): + ret = super().step(action) + + self.timesteps_ += 1 + + obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) + reward = 0.1 * self.timesteps_ + return (obs, reward) + ret[2:] + + +gym.register("CartPoleDebug-v0", CartPoleDebug) +cartpole_env = gym.make("CartPoleDebug-v0", render_mode="rgb_array") +cartpole_env.reset() + +frozenlake_env = gym.make( + "FrozenLake-v1", render_mode="rgb_array", is_slippery=False, map_name="4x4" +) # desc=["SF", "HG"]) +frozenlake_env.reset() + + +def create_cartpole_dream_image( + dreamed_obs, # real space (not symlog'd) + dreamed_V, # real space (not symlog'd) + dreamed_a, + dreamed_r_tp1, # real space (not symlog'd) + dreamed_ri_tp1, # intrinsic reward + dreamed_c_tp1, # continue flag + value_target, # real space (not symlog'd) + initial_h, + as_tensor=False, +): + # CartPoleDebug + if dreamed_obs.shape == (5,): + # Set the state of our env to the given observation. + cartpole_env.unwrapped.state = np.array(dreamed_obs[1:], dtype=np.float32) + # Normal CartPole-v1 + else: + cartpole_env.unwrapped.state = np.array(dreamed_obs, dtype=np.float32) + + # Produce an RGB-image of the current state. + rgb_array = cartpole_env.render() + + # Add value-, action-, reward-, and continue-prediction information. + image = Image.fromarray(rgb_array) + draw_obj = ImageDraw.Draw(image) + + # fnt = ImageFont.load_default(size=40) + + draw_obj.text( + (5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0) + ) # , font=fnt.font, size=30) + draw_obj.text( + (5, 18), + f"at={'<--' if dreamed_a == 0 else '-->'} ({dreamed_a})", + fill=(0, 0, 0), + ) + draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) + if dreamed_ri_tp1 is not None: + draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) + draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) + draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) + + if dreamed_obs.shape == (5,): + draw_obj.text((20, 100), f"t={dreamed_obs[0]}", fill=(0, 0, 0)) + + # Return image. 
+ np_img = np.asarray(image) + if as_tensor: + return tf.convert_to_tensor(np_img, dtype=tf.uint8) + return np_img + + +def create_frozenlake_dream_image( + dreamed_obs, # real space (not symlog'd) + dreamed_V, # real space (not symlog'd) + dreamed_a, + dreamed_r_tp1, # real space (not symlog'd) + dreamed_ri_tp1, # intrinsic reward + dreamed_c_tp1, # continue flag + value_target, # real space (not symlog'd) + initial_h, + as_tensor=False, +): + frozenlake_env.unwrapped.s = np.argmax(dreamed_obs, axis=0) + + # Produce an RGB-image of the current state. + rgb_array = frozenlake_env.render() + + # Add value-, action-, reward-, and continue-prediction information. + image = Image.fromarray(rgb_array) + draw_obj = ImageDraw.Draw(image) + + draw_obj.text((5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0)) + action_arrow = ( + "<--" + if dreamed_a == 0 + else "v" + if dreamed_a == 1 + else "-->" + if dreamed_a == 2 + else "^" + ) + draw_obj.text((5, 18), f"at={action_arrow} ({dreamed_a})", fill=(0, 0, 0)) + draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) + if dreamed_ri_tp1 is not None: + draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) + draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) + draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) + + # Return image. + np_img = np.asarray(image) + if as_tensor: + return tf.convert_to_tensor(np_img, dtype=tf.uint8) + return np_img + + +if __name__ == "__main__": + # CartPole debug. + rgb_array = create_cartpole_dream_image( + dreamed_obs=np.array([100.0, 1.0, -0.01, 1.5, 0.02]), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.0, + value_target=8.0, + ) + # ImageFont.load("arial.pil") + image = Image.fromarray(rgb_array) + image.show() + + # Normal CartPole. + rgb_array = create_cartpole_dream_image( + dreamed_obs=np.array([1.0, -0.01, 1.5, 0.02]), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.1, + value_target=8.0, + ) + # ImageFont.load("arial.pil") + image = Image.fromarray(rgb_array) + image.show() + + # Frozenlake + rgb_array = create_frozenlake_dream_image( + dreamed_obs=np.array([1.0] + [0.0] * (frozenlake_env.observation_space.n - 1)), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.1, + value_target=8.0, + ) + image = Image.fromarray(rgb_array) + image.show() diff --git a/rllib/algorithms/dreamerv3/utils/env_runner.py b/rllib/algorithms/dreamerv3/utils/env_runner.py new file mode 100644 index 0000000000000..c8db4e8ebc073 --- /dev/null +++ b/rllib/algorithms/dreamerv3/utils/env_runner.py @@ -0,0 +1,548 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from collections import defaultdict +from functools import partial +from typing import List, Tuple + +import gymnasium as gym +import numpy as np +from supersuit.generic_wrappers import resize_v1 +import tree # pip install dm_tree + +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.core.models.base import STATE_IN, STATE_OUT +from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.env.wrappers.atari_wrappers import NoopResetEnv, MaxAndSkipEnv +from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv +from ray.rllib.evaluation.metrics import RolloutMetrics +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.replay_buffers.episode_replay_buffer import _Episode as Episode +from ray.rllib.utils.numpy import one_hot + +_, tf, _ = try_import_tf() + + +class DreamerV3EnvRunner(EnvRunner): + """An environment runner to collect data from vectorized gymnasium environments.""" + + def __init__( + self, + config: AlgorithmConfig, + **kwargs, + ): + """Initializes a DreamerV3EnvRunner instance. + + Args: + config: The config to use to setup this EnvRunner. + """ + super().__init__(config=config) + + # Create the gym.vector.Env object. + # Atari env. + if self.config.env.startswith("ALE/"): + # [2]: "We down-scale the 84 × 84 grayscale images to 64 × 64 pixels so that + # we can apply the convolutional architecture of DreamerV1." + # ... + # "We follow the evaluation protocol of Machado et al. (2018) with 200M + # environment steps, action repeat of 4, a time limit of 108,000 steps per + # episode that correspond to 30 minutes of game play, no access to life + # information, full action space, and sticky actions. Because the world + # model integrates information over time, DreamerV2 does not use frame + # stacking." + # However, in Danijar's repo, Atari100k experiments are configured as: + # noop=30, 64x64x3 (no grayscaling), sticky actions=False, + # full action space=False, + wrappers = [ + partial(gym.wrappers.TimeLimit, max_episode_steps=108000), + partial(resize_v1, x_size=64, y_size=64), # resize to 64x64 + NormalizedImageEnv, + NoopResetEnv, + MaxAndSkipEnv, + ] + + self.env = gym.vector.make( + "GymV26Environment-v0", + env_id=self.config.env, + wrappers=wrappers, + num_envs=self.config.num_envs_per_worker, + asynchronous=self.config.remote_worker_envs, + make_kwargs=dict( + self.config.env_config, **{"render_mode": "rgb_array"} + ), + ) + # DeepMind Control. + elif self.config.env.startswith("DMC/"): + parts = self.config.env.split("/") + assert len(parts) == 3, ( + "ERROR: DMC env must be formatted as 'DMC/[task]/[domain]', e.g. " + f"'DMC/cartpole/swingup'! You provided '{self.config.env}'." + ) + gym.register( + "dmc_env-v0", + lambda from_pixels=True: DMCEnv( + parts[1], parts[2], from_pixels=from_pixels, channels_first=False + ), + ) + self.env = gym.vector.make( + "dmc_env-v0", + wrappers=[ActionClip], + num_envs=self.config.num_envs_per_worker, + asynchronous=self.config.remote_worker_envs, + **dict(self.config.env_config), + ) + # All other (gym) envs. 
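+        # (E.g. classic-control tasks such as CartPole-v1 or Pendulum-v1; FrozenLake
+        # additionally gets its discrete observations one-hot'd via the `OneHot`
+        # wrapper defined at the bottom of this file.)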
+        else:
+            wrappers = [] if self.config.env != "FrozenLake-v1" else [OneHot]
+            self.env = gym.vector.make(
+                self.config.env,
+                wrappers=wrappers,
+                num_envs=self.config.num_envs_per_worker,
+                asynchronous=self.config.remote_worker_envs,
+                **dict(self.config.env_config, **{"render_mode": "rgb_array"}),
+            )
+        self.num_envs = self.env.num_envs
+        assert self.num_envs == self.config.num_envs_per_worker
+
+        # Create our RLModule to compute actions with.
+        if self.config.share_module_between_env_runner_and_learner:
+            # DreamerV3 Algorithm will set this to the local Learner's module.
+            self.module = None
+        # Create our own instance of a DreamerV3RLModule (which then needs to be
+        # weight-synched each iteration).
+        else:
+            policy_dict, _ = self.config.get_multi_agent_setup(env=self.env)
+            module_spec = self.config.get_marl_module_spec(policy_dict=policy_dict)
+            # TODO (sven): DreamerV3 is currently single-agent only.
+            self.module = module_spec.build()[DEFAULT_POLICY_ID]
+
+        self._needs_initial_reset = True
+        self._episodes = [None for _ in range(self.num_envs)]
+
+        # TODO (sven): Move metrics temp storage and collection out of EnvRunner
+        # and RolloutWorkers. These classes should not continue tracking some data
+        # that they have already returned (in a call to `sample()`). Instead, the
+        # episode data should be analyzed where it was sent to (the Algorithm itself
+        # via its replay buffer, etc..).
+        self._done_episodes_for_metrics = []
+        self._ongoing_episodes_for_metrics = defaultdict(list)
+        self._ts_since_last_metrics = 0
+
+    @override(EnvRunner)
+    def sample(
+        self,
+        *,
+        num_timesteps: int = None,
+        num_episodes: int = None,
+        explore: bool = True,
+        random_actions: bool = False,
+        with_render_data: bool = False,
+    ) -> Tuple[List[Episode], List[Episode]]:
+        """Runs and returns a sample (n timesteps or m episodes) on the environment(s).
+
+        Timesteps or episodes are counted in total (across all vectorized
+        sub-environments). For example, if self.num_envs=2 and num_timesteps=10, each
+        sub-environment will be sampled for 5 steps. If self.num_envs=3 and
+        num_episodes=30, each sub-environment will be sampled for 10 episodes.
+
+        Args:
+            num_timesteps: The number of timesteps to sample from the environment(s).
+                Exactly one of `num_timesteps` or `num_episodes` must be provided.
+            num_episodes: The number of full episodes to sample from the environment(s).
+                Exactly one of `num_timesteps` or `num_episodes` must be provided.
+            explore: Whether to use exploration when computing actions.
+            random_actions: Whether to only use random actions. If True, the value of
+                `explore` is ignored.
+            force_reset: Whether to reset the environment(s) before starting to sample.
+                If False, will still reset the environment(s) if they were left in
+                a terminated or truncated state during previous sample calls.
+                (Note: this is handled by the internal `_sample_timesteps()` helper;
+                `sample()` itself always passes `force_reset=False`.)
+            with_render_data: If True, will record rendering images per timestep
+                in the returned Episodes. This data can be used to create video
+                reports.
+                TODO (sven): Note that this is currently only supported when sampling
+                with `num_episodes`.
+
+        Returns:
+            A tuple consisting of a) list of Episode instances that are done and
+            b) list of Episode instances that are still ongoing.
+        """
+        # If no execution details are provided, use self.config.
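+        # (With the RLlib default `batch_mode="truncate_episodes"`, this samples
+        # `rollout_fragment_length * num_envs` timesteps per call; with
+        # "complete_episodes", it samples one full episode per sub-environment.)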
+ if num_timesteps is None and num_episodes is None: + if self.config.batch_mode == "truncate_episodes": + num_timesteps = self.config.rollout_fragment_length * self.num_envs + else: + num_episodes = self.num_envs + + # Sample n timesteps. + if num_timesteps is not None: + return self._sample_timesteps( + num_timesteps=num_timesteps, + explore=explore, + random_actions=random_actions, + force_reset=False, + ) + # Sample n episodes. + else: + # `_sample_episodes` returns only one list (with completed episodes) + # return empty list for incomplete ones. + return ( + self._sample_episodes( + num_episodes=num_episodes, + explore=explore, + random_actions=random_actions, + with_render_data=with_render_data, + ), + [], + ) + + def _sample_timesteps( + self, + num_timesteps: int, + explore: bool = True, + random_actions: bool = False, + force_reset: bool = False, + ) -> Tuple[List[Episode], List[Episode]]: + """Helper method to run n timesteps. + + See docstring of self.sample() for more details. + """ + done_episodes_to_return = [] + + # Get initial states for all `batch_size_B` rows in the forward batch. + initial_states = tree.map_structure( + lambda s: np.repeat(s, self.num_envs, axis=0), + self.module.get_initial_state(), + ) + + # Have to reset the env (on all vector sub-envs). + if force_reset or self._needs_initial_reset: + obs, _ = self.env.reset() + + self._episodes = [Episode() for _ in range(self.num_envs)] + states = initial_states + # Set is_first to True for all rows (all sub-envs just got reset). + is_first = np.ones((self.num_envs,), dtype=np.float32) + self._needs_initial_reset = False + + # Set initial obs and states in the episodes. + for i in range(self.num_envs): + self._episodes[i].add_initial_observation( + initial_observation=obs[i], + initial_state={k: s[i] for k, s in states.items()}, + ) + # Don't reset existing envs; continue in already started episodes. + else: + # Pick up stored observations and states from previous timesteps. + obs = np.stack([eps.observations[-1] for eps in self._episodes]) + # Compile the initial state for each batch row: If episode just started, use + # model's initial state, if not, use state stored last in Episode. + states = { + k: np.stack( + [ + initial_states[k][i] if eps.states is None else eps.states[k] + for i, eps in enumerate(self._episodes) + ] + ) + for k in initial_states.keys() + } + # If a batch row is at the beginning of an episode, set its `is_first` flag + # to 1.0, otherwise 0.0. + is_first = np.zeros((self.num_envs,), dtype=np.float32) + for i, eps in enumerate(self._episodes): + if eps.states is None: + is_first[i] = 1.0 + + # Loop through env for n timesteps. + ts = 0 + while ts < num_timesteps: + # Act randomly. + if random_actions: + actions = self.env.action_space.sample() + # Compute an action using our RLModule. + else: + batch = { + STATE_IN: tree.map_structure( + lambda s: tf.convert_to_tensor(s), states + ), + SampleBatch.OBS: tf.convert_to_tensor(obs), + "is_first": tf.convert_to_tensor(is_first), + } + # Explore or not. + if explore: + outs = self.module.forward_exploration(batch) + else: + outs = self.module.forward_inference(batch) + + # Model outputs one-hot actions (if discrete). Convert to int actions + # as well. 
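+                # E.g. a one-hot action [0.0, 1.0] (CartPole) becomes the int
+                # action 1 via the argmax below.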
+ actions = outs[SampleBatch.ACTIONS].numpy() + if isinstance(self.env.single_action_space, gym.spaces.Discrete): + actions = np.argmax(actions, axis=-1) + states = tree.map_structure(lambda s: s.numpy(), outs[STATE_OUT]) + + obs, rewards, terminateds, truncateds, infos = self.env.step(actions) + ts += self.num_envs + + for i in range(self.num_envs): + s = {k: s[i] for k, s in states.items()} + # The last entry in self.observations[i] is already the reset + # obs of the new episode. + if terminateds[i] or truncateds[i]: + # Finish the episode with the actual terminal observation stored in + # the info dict. + self._episodes[i].add_timestep( + infos["final_observation"][i], + actions[i], + rewards[i], + state=s, + is_terminated=terminateds[i], + is_truncated=truncateds[i], + ) + # Reset h-states to the model's initial ones b/c we are starting a + # new episode. + for k, v in self.module.get_initial_state().items(): + states[k][i] = v.numpy() + is_first[i] = True + done_episodes_to_return.append(self._episodes[i]) + # Create a new episode object. + self._episodes[i] = Episode(observations=[obs[i]], states=s) + else: + self._episodes[i].add_timestep( + obs[i], actions[i], rewards[i], state=s + ) + is_first[i] = False + + # Return done episodes ... + self._done_episodes_for_metrics.extend(done_episodes_to_return) + # ... and all ongoing episode chunks. Also, make sure, we return + # a copy and start new chunks so that callers of this function + # don't alter our ongoing and returned Episode objects. + ongoing_episodes = self._episodes + self._episodes = [eps.create_successor() for eps in self._episodes] + for eps in ongoing_episodes: + self._ongoing_episodes_for_metrics[eps.id_].append(eps) + + self._ts_since_last_metrics += ts + + return done_episodes_to_return, ongoing_episodes + + def _sample_episodes( + self, + num_episodes: int, + explore: bool = True, + random_actions: bool = False, + with_render_data: bool = False, + ) -> List[Episode]: + """Helper method to run n episodes. + + See docstring of `self.sample()` for more details. + """ + done_episodes_to_return = [] + + obs, _ = self.env.reset() + episodes = [Episode() for _ in range(self.num_envs)] + + # Multiply states n times according to our vector env batch size (num_envs). 
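+        # (Each initial-state leaf is repeated along the batch axis so that every
+        # vectorized sub-environment gets its own initial-state row.)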
+ states = tree.map_structure( + lambda s: np.repeat(s, self.num_envs, axis=0), + self.module.get_initial_state(), + ) + is_first = np.ones((self.num_envs,), dtype=np.float32) + + render_images = [None] * self.num_envs + if with_render_data: + render_images = [e.render() for e in self.env.envs] + + for i in range(self.num_envs): + episodes[i].add_initial_observation( + initial_observation=obs[i], + initial_state={k: s[i] for k, s in states.items()}, + initial_render_image=render_images[i], + ) + + eps = 0 + while eps < num_episodes: + if random_actions: + actions = self.env.action_space.sample() + else: + batch = { + STATE_IN: tree.map_structure( + lambda s: tf.convert_to_tensor(s), states + ), + SampleBatch.OBS: tf.convert_to_tensor(obs), + "is_first": tf.convert_to_tensor(is_first), + } + + if explore: + outs = self.module.forward_exploration(batch) + else: + outs = self.module.forward_inference(batch) + + actions = outs[SampleBatch.ACTIONS].numpy() + if isinstance(self.env.single_action_space, gym.spaces.Discrete): + actions = np.argmax(actions, axis=-1) + states = tree.map_structure(lambda s: s.numpy(), outs[STATE_OUT]) + + obs, rewards, terminateds, truncateds, infos = self.env.step(actions) + if with_render_data: + render_images = [e.render() for e in self.env.envs] + + for i in range(self.num_envs): + s = {k: s[i] for k, s in states.items()} + # The last entry in self.observations[i] is already the reset + # obs of the new episode. + if terminateds[i] or truncateds[i]: + eps += 1 + + episodes[i].add_timestep( + infos["final_observation"][i], + actions[i], + rewards[i], + state=s, + is_terminated=terminateds[i], + is_truncated=truncateds[i], + ) + done_episodes_to_return.append(episodes[i]) + + # Also early-out if we reach the number of episodes within this + # for-loop. + if eps == num_episodes: + break + + # Reset h-states to the model's initial ones b/c we are starting a + # new episode. + for k, v in self.module.get_initial_state().items(): + states[k][i] = v.numpy() + is_first[i] = True + + episodes[i] = Episode( + observations=[obs[i]], + states=s, + render_images=[render_images[i]], + ) + else: + episodes[i].add_timestep( + obs[i], + actions[i], + rewards[i], + state=s, + render_image=render_images[i], + ) + is_first[i] = False + + self._done_episodes_for_metrics.extend(done_episodes_to_return) + self._ts_since_last_metrics += sum(len(eps) for eps in done_episodes_to_return) + + # If user calls sample(num_timesteps=..) after this, we must reset again + # at the beginning. + self._needs_initial_reset = True + + return done_episodes_to_return + + # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this + # API. Instead Algorithm should compile episode metrics itself via its local + # buffer. + def get_metrics(self) -> List[RolloutMetrics]: + # Compute per-episode metrics (only on already completed episodes). + metrics = [] + for eps in self._done_episodes_for_metrics: + episode_length = len(eps) + episode_reward = eps.get_return() + # Don't forget about the already returned chunks of this episode. 
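+            # (An episode that was split across several `sample()` calls is summed
+            # up here: total length/return = this finished chunk plus all chunks of
+            # the same episode ID that were returned earlier.)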
+ if eps.id_ in self._ongoing_episodes_for_metrics: + for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: + episode_length += len(eps2) + episode_reward += eps2.get_return() + del self._ongoing_episodes_for_metrics[eps.id_] + + metrics.append( + RolloutMetrics( + episode_length=episode_length, + episode_reward=episode_reward, + ) + ) + + self._done_episodes_for_metrics.clear() + self._ts_since_last_metrics = 0 + + return metrics + + # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this + # API. Replace by proper state overriding via `EnvRunner.set_state()` + def set_weights(self, weights, global_vars=None): + """Writes the weights of our (single-agent) RLModule.""" + if self.module is None: + assert self.config.share_module_between_env_runner_and_learner + else: + self.module.set_state(weights[DEFAULT_POLICY_ID]) + + @override(EnvRunner) + def assert_healthy(self): + # Make sure, we have built our gym.vector.Env and RLModule properly. + assert self.env and self.module + + @override(EnvRunner) + def stop(self): + # Close our env object via gymnasium's API. + self.env.close() + + +class NormalizedImageEnv(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + -1.0, + 1.0, + shape=self.observation_space.shape, + dtype=np.float32, + ) + + # Divide by scale and center around 0.0, such that observations are in the range + # of -1.0 and 1.0. + def observation(self, observation): + return (observation.astype(np.float32) / 128.0) - 1.0 + + +class OneHot(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + 0.0, 1.0, shape=(self.observation_space.n,), dtype=np.float32 + ) + + def reset(self, **kwargs): + ret = self.env.reset(**kwargs) + return self._get_obs(ret[0]), ret[1] + + def step(self, action): + ret = self.env.step(action) + return self._get_obs(ret[0]), ret[1], ret[2], ret[3], ret[4] + + def _get_obs(self, obs): + return one_hot(obs, depth=self.observation_space.shape[0]) + + +class ActionClip(gym.ActionWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._low = -1.0 + self._high = 1.0 + self.action_space = gym.spaces.Box( + self._low, + self._high, + self.action_space.shape, + self.action_space.dtype, + ) + + def action(self, action): + return np.clip(action, self._low, self._high) diff --git a/rllib/algorithms/dreamerv3/utils/summaries.py b/rllib/algorithms/dreamerv3/utils/summaries.py new file mode 100644 index 0000000000000..d781a33e40d6b --- /dev/null +++ b/rllib/algorithms/dreamerv3/utils/summaries.py @@ -0,0 +1,329 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +import numpy as np + +from ray.rllib.algorithms.dreamerv3.utils.debugging import ( + create_cartpole_dream_image, + create_frozenlake_dream_image, +) +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.tf_utils import inverse_symlog + + +def _summarize(*, results, data_to_summarize, keys_to_log, include_histograms=False): + for k in keys_to_log: + if data_to_summarize[k].shape == (): + results.update({k: data_to_summarize[k]}) + elif include_histograms: + results.update({k: data_to_summarize[k]}) + + +def reconstruct_obs_from_h_and_z( + h_t0_to_H, + z_t0_to_H, + dreamer_model, + obs_dims_shape, +): + """Returns""" + shape = h_t0_to_H.shape + T = shape[0] # inputs are time-major + B = shape[1] + # Compute actual observations using h and z and the decoder net. + # Note that the last h-state (T+1) is NOT used here as it's already part of + # a new trajectory. + # Use mean() of the Gaussian, no sample! -> No need to construct dist object here. + reconstructed_obs_distr_means_TxB = dreamer_model.world_model.decoder( + # Fold time rank. + h=np.reshape(h_t0_to_H, (T * B, -1)), + z=np.reshape(z_t0_to_H, (T * B,) + z_t0_to_H.shape[2:]), + ) + # Unfold time rank again. + reconstructed_obs_T_B = np.reshape( + reconstructed_obs_distr_means_TxB, (T, B) + obs_dims_shape + ) + # Return inverse symlog'd (real env obs space) reconstructed observations. + return reconstructed_obs_T_B + + +def report_dreamed_trajectory( + *, + results, + env, + dreamer_model, + obs_dims_shape, + batch_indices=(0,), + desc=None, + include_images=True, +): + if not include_images: + return + + dream_data = results["dream_data"] + dreamed_obs_H_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], + z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], + dreamer_model=dreamer_model, + obs_dims_shape=obs_dims_shape, + ) + func = ( + create_cartpole_dream_image + if env.startswith("CartPole") + else create_frozenlake_dream_image + ) + # Take 0th dreamed trajectory and produce series of images. + for b in batch_indices: + images = [] + for t in range(len(dreamed_obs_H_B) - 1): + images.append( + func( + dreamed_obs=dreamed_obs_H_B[t][b], + dreamed_V=dream_data["values_dreamed_t0_to_H_BxT"][t][b], + dreamed_a=(dream_data["actions_ints_dreamed_t0_to_H_BxT"][t][b]), + dreamed_r_tp1=(dream_data["rewards_dreamed_t0_to_H_BxT"][t + 1][b]), + # `DISAGREE_intrinsic_rewards_H_B` are shifted by 1 already + # (from t1 to H, not t0 to H like all other data here). + dreamed_ri_tp1=( + results["DISAGREE_intrinsic_rewards_H_BxT"][t][b] + if "DISAGREE_intrinsic_rewards_H_BxT" in results + else None + ), + dreamed_c_tp1=( + dream_data["continues_dreamed_t0_to_H_BxT"][t + 1][b] + ), + value_target=results["VALUE_TARGETS_H_BxT"][t][b], + initial_h=dream_data["h_states_t0_to_H_BxT"][t][b], + as_tensor=True, + ).numpy() + ) + # Concat images along width-axis (so they show as a "film sequence" next to each + # other). + results.update( + { + f"dreamed_trajectories{('_'+desc) if desc else ''}_B{b}": ( + np.concatenate(images, axis=1) + ), + } + ) + + +def report_predicted_vs_sampled_obs( + *, + results, + sample, + batch_size_B, + batch_length_T, + symlog_obs: bool = True, +): + """Summarizes sampled data (from the replay buffer) vs world-model predictions. + + World model predictions are based on the posterior states (z computed from actual + observation encoder input + the current h-states). 
+ + Observations: Computes MSE (sampled vs predicted/recreated) over all features. + For image observations, also creates direct image comparisons (sampled images + vs predicted (posterior) ones). + Rewards: Compute MSE (sampled vs predicted). + Continues: Compute MSE (sampled vs predicted). + + Args: + results: The results dict that was returned by `LearnerGroup.update()`. + sample: The sampled data (dict) from the replay buffer. Already tf-tensor + converted. + batch_size_B: The batch size (B). This is the number of trajectories sampled + from the buffer. + batch_length_T: The batch length (T). This is the length of an individual + trajectory sampled from the buffer. + """ + predicted_observation_means_BxT = results[ + "WORLD_MODEL_fwd_out_obs_distribution_means_BxT" + ] + _report_obs( + results=results, + computed_float_obs_B_T_dims=np.reshape( + predicted_observation_means_BxT, + (batch_size_B, batch_length_T) + sample[SampleBatch.OBS].shape[2:], + ), + sampled_obs_B_T_dims=sample[SampleBatch.OBS], + descr_prefix="WORLD_MODEL", + descr_obs=f"predicted_posterior_T{batch_length_T}", + symlog_obs=symlog_obs, + ) + + +def report_dreamed_eval_trajectory_vs_samples( + *, + results, + dream_data, + sample, + burn_in_T, + dreamed_T, + dreamer_model, + symlog_obs: bool = True, +): + # Obs MSE. + dreamed_obs_T_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], + z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], + dreamer_model=dreamer_model, + obs_dims_shape=sample[SampleBatch.OBS].shape[2:], + ) + t0 = burn_in_T - 1 + tH = t0 + dreamed_T + # Observation MSE and - if applicable - images comparisons. + mse_sampled_vs_dreamed_obs = _report_obs( + results=results, + # Have to transpose b/c dreamed data is time-major. + computed_float_obs_B_T_dims=np.transpose( + dreamed_obs_T_B, + axes=[1, 0] + list(range(2, len(dreamed_obs_T_B.shape))), + ), + sampled_obs_B_T_dims=sample[SampleBatch.OBS][:, t0 : tH + 1], + descr_prefix="EVALUATION", + descr_obs=f"dreamed_prior_H{dreamed_T}", + symlog_obs=symlog_obs, + ) + + # Reward MSE. + _report_rewards( + results=results, + computed_rewards=dream_data["rewards_dreamed_t0_to_H_BxT"], + sampled_rewards=sample[SampleBatch.REWARDS][:, t0 : tH + 1], + descr_prefix="EVALUATION", + descr_reward=f"dreamed_prior_H{dreamed_T}", + ) + + # Continues MSE. + _report_continues( + results=results, + computed_continues=dream_data["continues_dreamed_t0_to_H_BxT"], + sampled_continues=(1.0 - sample["is_terminated"])[:, t0 : tH + 1], + descr_prefix="EVALUATION", + descr_cont=f"dreamed_prior_H{dreamed_T}", + ) + return mse_sampled_vs_dreamed_obs + + +def report_sampling_and_replay_buffer(*, replay_buffer): + episodes_in_buffer = replay_buffer.get_num_episodes() + ts_in_buffer = replay_buffer.get_num_timesteps() + replayed_steps = replay_buffer.get_sampled_timesteps() + added_steps = replay_buffer.get_added_timesteps() + + # Summarize buffer, sampling, and train ratio stats. + return { + "BUFFER_capacity": replay_buffer.capacity, + "BUFFER_size_num_episodes": episodes_in_buffer, + "BUFFER_size_timesteps": ts_in_buffer, + "BUFFER_replayed_steps": replayed_steps, + "BUFFER_added_steps": added_steps, + } + + +def _report_obs( + *, + results, + computed_float_obs_B_T_dims, + sampled_obs_B_T_dims, + descr_prefix=None, + descr_obs, + symlog_obs, +): + """Summarizes computed- vs sampled observations: MSE and (if applicable) images. + + Args: + computed_float_obs_B_T_dims: Computed float observations + (not clipped, not cast'd). Shape=(B, T, [dims ...]). 
+ sampled_obs_B_T_dims: Sampled observations (as-is from the environment, meaning + this could be uint8, 0-255 clipped images). Shape=(B, T, [dims ...]). + B: The batch size B (see shapes of `computed_float_obs_B_T_dims` and + `sampled_obs_B_T_dims` above). + T: The batch length T (see shapes of `computed_float_obs_B_T_dims` and + `sampled_obs_B_T_dims` above). + descr: A string used to describe the computed data to be used in the TB + summaries. + """ + # Videos: Create summary, comparing computed images with actual sampled ones. + # 4=[B, T, w, h] grayscale image; 5=[B, T, w, h, C] RGB image. + if len(sampled_obs_B_T_dims.shape) in [4, 5]: + descr_prefix = (descr_prefix + "_") if descr_prefix else "" + + if symlog_obs: + computed_float_obs_B_T_dims = inverse_symlog(computed_float_obs_B_T_dims) + + # Restore image pixels from normalized (non-symlog'd) data. + if not symlog_obs: + computed_float_obs_B_T_dims = (computed_float_obs_B_T_dims + 1.0) * 128 + sampled_obs_B_T_dims = (sampled_obs_B_T_dims + 1.0) * 128 + sampled_obs_B_T_dims = np.clip(sampled_obs_B_T_dims, 0.0, 255.0).astype( + np.uint8 + ) + computed_images = np.clip(computed_float_obs_B_T_dims, 0.0, 255.0).astype( + np.uint8 + ) + # Concat sampled and computed images along the height axis (3) such that + # real images show below respective predicted ones. + # (B, T, C, h, w) + sampled_vs_computed_images = np.concatenate( + [computed_images, sampled_obs_B_T_dims], + axis=3, + ) + # Add grayscale dim, if necessary. + if len(sampled_obs_B_T_dims.shape) == 2 + 2: + sampled_vs_computed_images = np.expand_dims(sampled_vs_computed_images, -1) + + results.update( + {f"{descr_prefix}sampled_vs_{descr_obs}_videos": sampled_vs_computed_images} + ) + + # return mse_sampled_vs_computed_obs + + +def _report_rewards( + *, + results, + computed_rewards, + sampled_rewards, + descr_prefix=None, + descr_reward, +): + descr_prefix = (descr_prefix + "_") if descr_prefix else "" + mse_sampled_vs_computed_rewards = np.mean( + np.square(computed_rewards - sampled_rewards) + ) + mse_sampled_vs_computed_rewards = np.mean(mse_sampled_vs_computed_rewards) + results.update( + { + f"{descr_prefix}sampled_vs_{descr_reward}_rewards_mse": ( + mse_sampled_vs_computed_rewards + ), + } + ) + + +def _report_continues( + *, + results, + computed_continues, + sampled_continues, + descr_prefix=None, + descr_cont, +): + descr_prefix = (descr_prefix + "_") if descr_prefix else "" + # Continue MSE. + mse_sampled_vs_computed_continues = np.mean( + np.square(computed_continues - sampled_continues.astype(np.float32)) + ) + results.update( + { + f"{descr_prefix}sampled_vs_{descr_cont}_continues_mse": ( + mse_sampled_vs_computed_continues + ), + } + ) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index d435e469b23ce..81cb8d0627bde 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -482,12 +482,12 @@ def training_step(self) -> ResultDict: # workers. 
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: if self.workers.num_remote_workers() > 0: - from_worker_or_trainer = None + from_worker_or_learner_group = None if self.config._enable_learner_api: # sync weights from learner_group to all rollout workers - from_worker_or_trainer = self.learner_group + from_worker_or_learner_group = self.learner_group self.workers.sync_weights( - from_worker_or_trainer=from_worker_or_trainer, + from_worker_or_learner_group=from_worker_or_learner_group, policies=policies_to_update, global_vars=global_vars, ) diff --git a/rllib/algorithms/registry.py b/rllib/algorithms/registry.py index 5387420cc5230..5352814f5e4e4 100644 --- a/rllib/algorithms/registry.py +++ b/rllib/algorithms/registry.py @@ -114,6 +114,12 @@ def _import_dreamer(): return dreamer.Dreamer, dreamer.Dreamer.get_default_config() +def _import_dreamerv3(): + import ray.rllib.algorithms.dreamerv3 as dreamerv3 + + return dreamerv3.DreamerV3, dreamerv3.DreamerV3.get_default_config() + + def _import_dt(): import ray.rllib.algorithms.dt as dt @@ -239,6 +245,7 @@ def _import_leela_chess_zero(): "DDPPO": _import_ddppo, "DQN": _import_dqn, "Dreamer": _import_dreamer, + "DreamerV3": _import_dreamerv3, "DT": _import_dt, "IMPALA": _import_impala, "APPO": _import_appo, @@ -278,6 +285,7 @@ def _import_leela_chess_zero(): "DDPPO": "DDPPO", "DQN": "DQN", "Dreamer": "Dreamer", + "DreamerV3": "DreamerV3", "DT": "DT", "Impala": "IMPALA", "APPO": "APPO", diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 9bbff1f7f0877..2bde70aa69ea4 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -147,15 +147,12 @@ def test_detect_atari_env(self): config = AlgorithmConfig().environment( env="ALE/Breakout-v5", env_config={"frameskip": 1} ) - config.validate() self.assertTrue(config.is_atari) config = AlgorithmConfig().environment(env="ALE/Pong-v5") - config.validate() self.assertTrue(config.is_atari) config = AlgorithmConfig().environment(env="CartPole-v1") - config.validate() # We do not auto-detect callable env makers for Atari envs. self.assertFalse(config.is_atari) @@ -166,12 +163,10 @@ def test_detect_atari_env(self): make_kwargs={"frameskip": 1}, ) ) - config.validate() # We do not auto-detect callable env makers for Atari envs. self.assertFalse(config.is_atari) config = AlgorithmConfig().environment(env="NotAtari") - config.validate() self.assertFalse(config.is_atari) def test_rl_module_api(self): diff --git a/rllib/core/learner/tf/tf_learner.py b/rllib/core/learner/tf/tf_learner.py index 2cb9cdeb049aa..2cc22a725cf1b 100644 --- a/rllib/core/learner/tf/tf_learner.py +++ b/rllib/core/learner/tf/tf_learner.py @@ -376,7 +376,7 @@ def _make_distributed_strategy_if_necessary(self) -> "tf.distribute.Strategy": devices = tf.config.list_logical_devices("GPU") assert self._local_gpu_idx < len(devices), ( f"local_gpu_idx {self._local_gpu_idx} is not a valid GPU id or is " - " not available." + "not available." ) local_gpu = [devices[self._local_gpu_idx].name] strategy = tf.distribute.MirroredStrategy(devices=local_gpu) @@ -431,10 +431,11 @@ def helper(_batch): # in-efficient. However, for tf>=2.12, it works also w/o this conversion # so remove this after we upgrade officially to tf==2.12. 
_batch = NestedDict(_batch) - with tf.GradientTape() as tape: + with tf.GradientTape(persistent=True) as tape: fwd_out = self._module.forward_train(_batch) loss_per_module = self.compute_loss(fwd_out=fwd_out, batch=_batch) gradients = self.compute_gradients(loss_per_module, gradient_tape=tape) + del tape postprocessed_gradients = self.postprocess_gradients(gradients) self.apply_gradients(postprocessed_gradients) diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index b6478d51d09d0..6aed0b9850521 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -285,7 +285,19 @@ class RLModule(abc.ABC): def __init__(self, config: RLModuleConfig): self.config = config + # Make sure, `setup()` is only called once, no matter what. In some cases + # of multiple inheritance (and with our __post_init__ functionality in place, + # this might get called twice. + if hasattr(self, "_is_setup") and self._is_setup: + raise RuntimeError( + "`RLModule.setup()` called twice within your RLModule implementation " + f"{self}! Make sure you are using the proper inheritance order " + "(TorchRLModule before [Algo]RLModule) or (TfRLModule before " + "[Algo]RLModule) and that you are using `super().__init__(...)` in " + "your custom constructor." + ) self.setup() + self._is_setup = True def __init_subclass__(cls, **kwargs): # Automatically add a __post_init__ method to all subclasses of RLModule. diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 100b815d2b621..21b2601b7e05f 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -356,7 +356,9 @@ def num_remote_worker_restarts(self) -> int: def sync_weights( self, policies: Optional[List[PolicyID]] = None, - from_worker_or_trainer: Optional[Union[RolloutWorker, LearnerGroup]] = None, + from_worker_or_learner_group: Optional[ + Union[RolloutWorker, LearnerGroup] + ] = None, to_worker_indices: Optional[List[int]] = None, global_vars: Optional[Dict[str, TensorType]] = None, timeout_seconds: Optional[int] = 0, @@ -369,7 +371,7 @@ def sync_weights( Args: policies: Optional list of PolicyIDs to sync weights for. If None (default), sync weights to/from all policies. - from_worker_or_trainer: Optional (local) RolloutWorker instance or + from_worker_or_learner_group: Optional (local) RolloutWorker instance or LearnerGroup instance to sync from. If None (default), sync from this WorkerSet's local worker. to_worker_indices: Optional list of worker indices to sync the @@ -381,16 +383,16 @@ def sync_weights( for any sync calls to finish). This significantly improves algorithm performance. """ - if self.local_worker() is None and from_worker_or_trainer is None: + if self.local_worker() is None and from_worker_or_learner_group is None: raise TypeError( - "No `local_worker` in WorkerSet, must provide `from_worker` " - "arg in `sync_weights()`!" + "No `local_worker` in WorkerSet, must provide " + "`from_worker_or_learner_group` arg in `sync_weights()`!" ) # Only sync if we have remote workers or `from_worker_or_trainer` is provided. 
weights = None - if self.num_remote_workers() or from_worker_or_trainer is not None: - weights_src = from_worker_or_trainer or self.local_worker() + if self.num_remote_workers() or from_worker_or_learner_group is not None: + weights_src = from_worker_or_learner_group or self.local_worker() if weights_src is None: raise ValueError( @@ -414,10 +416,10 @@ def set_weight(w): timeout_seconds=timeout_seconds, ) - # If `from_worker` is provided, also sync to this WorkerSet's + # If `from_worker_or_learner_group` is provided, also sync to this WorkerSet's # local worker. if self.local_worker() is not None: - if from_worker_or_trainer is not None: + if from_worker_or_learner_group is not None: self.local_worker().set_weights(weights, global_vars=global_vars) # If `global_vars` is provided and local worker exists -> Update its # global_vars. diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index 8a4093fb0e2d5..4df6b2724fb3d 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -870,7 +870,12 @@ def _compute_actions_helper_rl_module_explore( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - assert action_dist + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_exploration()` method must return a dict" + f"with either the {SampleBatch.ACTIONS} key or the " + f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) actions = action_dist.sample() # Anything but action_dist and state_out is an extra fetch @@ -926,7 +931,12 @@ def _compute_actions_helper_rl_module_inference( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - assert action_dist + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_inference()` method must return a dict" + f"with either the {SampleBatch.ACTIONS} key or the " + f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) actions = action_dist.sample() # Anything but action_dist and state_out is an extra fetch diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 4165da80a1f8d..bef3c070d81a4 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1147,7 +1147,12 @@ def _compute_action_helper( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - assert action_dist + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_exploration()` method must return" + f" a dict with either the {SampleBatch.ACTIONS} key or the " + f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) actions = action_dist.sample() # Compute action-logp and action-prob from distribution and add to @@ -1171,7 +1176,12 @@ def _compute_action_helper( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - assert action_dist + if action_dist is None: + raise KeyError( + "Your RLModule's `forward_inference()` method must return" + f" a dict with either the {SampleBatch.ACTIONS} key or the " + f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" + ) actions = action_dist.sample() # Anything but actions and state_out is an extra fetch. 
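(For context on the KeyError messages introduced above: they spell out the output
contract of an RLModule's forward passes. Below is a rough, hypothetical sketch of a
conforming module, assuming made-up layer sizes and the usual `_forward_exploration()`
override point; it is an illustration of the contract only, not code from this change.)

import torch

from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule
from ray.rllib.policy.sample_batch import SampleBatch


class MyDiscreteTorchModule(TorchRLModule):
    def setup(self):
        # Hypothetical sizes; a real module would derive them from
        # self.config.observation_space / self.config.action_space.
        self.encoder = torch.nn.Linear(4, 64)
        self.pi = torch.nn.Linear(64, 2)

    def _forward_exploration(self, batch, **kwargs):
        # Returning ACTION_DIST_INPUTS (action logits) satisfies the contract; the
        # policy then builds the action distribution and samples from it. Returning
        # SampleBatch.ACTIONS directly would satisfy it as well.
        logits = self.pi(torch.relu(self.encoder(batch[SampleBatch.OBS])))
        return {SampleBatch.ACTION_DIST_INPUTS: logits}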
diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index 0a9303a9e47d5..0f945dd7db82c 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -57,6 +57,12 @@ action="store_true", help="Run ray in local mode for easier debugging.", ) +parser.add_argument( + "--num-samples", + type=int, + default=1, + help="The number of seeds/samples to run with the given experiment config.", +) parser.add_argument( "--override-mean-reward", type=float, @@ -103,12 +109,14 @@ # Loop through all collected files. for file in files: + config_is_python = False # For python files, need to make sure, we only deliver the module name into the # `load_experiments_from_file` function (everything from "/ray/rllib" on). if file.endswith(".py"): if file.endswith("__init__.py"): # weird CI learning test (BAZEL) case continue experiments = load_experiments_from_file(file, SupportedFileType.python) + config_is_python = True else: experiments = load_experiments_from_file(file, SupportedFileType.yaml) @@ -118,13 +126,16 @@ exp = list(experiments.values())[0] + # Set the number of samples to run. + exp["num_samples"] = args.num_samples + # Override framework setting with the command line one, if provided. # Otherwise, will use framework setting in file (or default: torch). if args.framework is not None: exp["config"]["framework"] = args.framework # Override env setting if given on command line. if args.env is not None: - exp["config"]["env"] = args.env + exp["config"]["env"] = exp["env"] = args.env # Override the mean reward if specified. This is used by the ray ci # for overriding the episode reward mean for tf2 tests for off policy @@ -139,19 +150,23 @@ print(f"Skipping framework='{args.framework}' for QMIX.") continue - # Always run with eager-tracing when framework=tf2 if not in local-mode. - # Ignore this if the yaml explicitly tells us to disable eager tracing + # Always run with eager-tracing when framework=tf2, if not in local-mode + # and unless the yaml explicitly tells us to disable eager tracing. if ( - args.framework == "tf2" + (args.framework == "tf2" or exp["config"].get("framework") == "tf2") and not args.local_mode - and not exp["config"].get("eager_tracing") is False + # Note: This check will always fail for python configs, b/c normally, + # algorithm configs have `self.eager_tracing=False` by default. + # Thus, you'd have to set `eager_tracing` to True explicitly in your python + # config to make sure we are indeed using eager tracing. + and exp["config"].get("eager_tracing") is not False ): - exp["config"]["eager_tracing"] = True - # Print out the actual config. - print("== Test config ==") - print(yaml.dump(experiments)) + # Print out the actual config (not for py files as yaml.dump weirdly fails). + if not config_is_python: + print("== Test config ==") + print(yaml.dump(experiments)) # Try running each test 3 times and make sure it reaches the given # reward. diff --git a/rllib/tuned_examples/dreamerv3/__init__.py b/rllib/tuned_examples/dreamerv3/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/rllib/tuned_examples/dreamerv3/atari_100k.py b/rllib/tuned_examples/dreamerv3/atari_100k.py new file mode 100644 index 0000000000000..ef6731d6e2e2a --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/atari_100k.py @@ -0,0 +1,71 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" + +# Run with: +# python run_regression_tests.py --dir [this file] --env ALE/[gym ID e.g. Pong-v5] + +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + + +# Number of GPUs to run on. +num_gpus = 1 + +config = ( + DreamerV3Config() + # Switch on eager_tracing by default. + .framework("tf2", eager_tracing=True) + .resources( + num_learner_workers=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner_worker=1 if num_gpus else 0, + num_cpus_for_local_worker=1, + ) + # TODO (sven): concretize this: If you use >1 GPU and increase the batch size + # accordingly, you might also want to increase the number of envs per worker + .rollouts( + num_envs_per_worker=(num_gpus or 1), + remote_worker_envs=True, + ) + .environment( + # [2]: "We follow the evaluation protocol of Machado et al. (2018) with 200M + # environment steps, action repeat of 4, a time limit of 108,000 steps per + # episode that correspond to 30 minutes of game play, no access to life + # information, full action space, and sticky actions. Because the world model + # integrates information over time, DreamerV2 does not use frame stacking. + # The experiments use a single-task setup where a separate agent is trained + # for each game. Moreover, each agent uses only a single environment instance. + env_config={ + # "sticky actions" but not according to Danijar's 100k configs. + "repeat_action_probability": 0.0, + # "full action space" but not according to Danijar's 100k configs. + "full_action_space": False, + # Already done by MaxAndSkip wrapper: "action repeat" == 4. + "frameskip": 1, + } + ) + .reporting( + metrics_num_episodes_for_smoothing=(num_gpus or 1), + report_images_and_videos=False, + report_dream_data=False, + report_individual_batch_item_stats=False, + ) + # See Appendix A. + .training( + model_size="S", + training_ratio=1024, + batch_size_B=16 * (num_gpus or 1), + # TODO + model={ + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "S", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/cartpole.py b/rllib/tuned_examples/dreamerv3/cartpole.py new file mode 100644 index 0000000000000..b270d6c3b3137 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/cartpole.py @@ -0,0 +1,30 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment("CartPole-v1") + .training( + model_size="XS", + training_ratio=1024, + # TODO + model={ + "batch_size_B": 16, + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "XS", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py new file mode 100644 index 0000000000000..a8938ce142123 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py @@ -0,0 +1,39 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] --env DMC/[task]/[domain] +# e.g. --env=DMC/cartpole/swingup + +config = ( + DreamerV3Config() + # Use image observations. + .environment(env_config={"from_pixels": True}) + .resources( + num_learner_workers=1, + num_gpus_per_learner_worker=1, + num_cpus_for_local_worker=1, + ) + .rollouts(num_envs_per_worker=4, remote_worker_envs=True) + # See Appendix A. + .training( + model_size="S", + training_ratio=512, + # TODO + model={ + "batch_size_B": 16, + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "S", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py b/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py new file mode 100644 index 0000000000000..03e9b40def8a3 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py @@ -0,0 +1,39 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment( + "FrozenLake-v1", + env_config={ + "desc": [ + "SF", + "HG", + ], + "is_slippery": False, + }, + ) + .training( + model_size="XS", + training_ratio=1024, + # TODO + model={ + "batch_size_B": 16, + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "XS", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py b/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py new file mode 100644 index 0000000000000..9b7b260d595e9 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py @@ -0,0 +1,36 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment( + "FrozenLake-v1", + env_config={ + "map_name": "4x4", + "is_slippery": False, + }, + ) + .training( + model_size="nano", + training_ratio=1024, + # TODO + model={ + "batch_size_B": 16, + "batch_length_T": 64, + "horizon_H": 15, + "gamma": 0.997, + "model_size": "nano", + }, + ) +) diff --git a/rllib/tuned_examples/dreamerv3/pendulum.py b/rllib/tuned_examples/dreamerv3/pendulum.py new file mode 100644 index 0000000000000..4acc4b9aa85a9 --- /dev/null +++ b/rllib/tuned_examples/dreamerv3/pendulum.py @@ -0,0 +1,19 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment("Pendulum-v1") + .training(model_size="XS", training_ratio=1024) +) diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 6c9c9badd7a03..0bee53bbd5590 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -30,6 +30,7 @@ TRAINING_ITERATION_TIMER = "training_iteration" APPLY_GRADS_TIMER = "apply_grad" COMPUTE_GRADS_TIMER = "compute_grads" +GARBAGE_COLLECTION_TIMER = "garbage_collection" SYNCH_WORKER_WEIGHTS_TIMER = "synch_weights" GRAD_WAIT_TIMER = "grad_wait" SAMPLE_TIMER = "sample" diff --git a/rllib/utils/replay_buffers/episode_replay_buffer.py b/rllib/utils/replay_buffers/episode_replay_buffer.py index 787c25b1aae01..e95fc50432489 100644 --- a/rllib/utils/replay_buffers/episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/episode_replay_buffer.py @@ -1,4 +1,5 @@ from collections import deque +import copy from typing import Any, Dict, List, Optional, Union import uuid @@ -109,6 +110,15 @@ def add(self, episodes: Union[List["_Episode"], "_Episode"]): episodes = [episodes] for eps in episodes: + # Make sure we don't change what's coming in from the user. + # TODO (sven): It'd probably be better to make sure in the EnvRunner to not + # hold on to episodes (for metrics purposes only) that we are returning + # back to the user from `EnvRunner.sample()`. Then we wouldn't have to + # do any copying. Instead, either compile the metrics right away on the + # EnvRunner OR compile metrics entirely on the Algorithm side (this is + # actually preferred). + eps = copy.deepcopy(eps) + self._num_timesteps += len(eps) self._num_timesteps_added += len(eps) @@ -242,7 +252,7 @@ def sample( ) episode = self.episodes[episode_idx] - # Starting a new chunk, set continue to False. + # Starting a new chunk, set is_first to True. is_first[B][T] = True # Begin of new batch item (row). @@ -255,7 +265,7 @@ def sample( else: rewards[B].append(episode.rewards[episode_ts - 1]) # We are in the middle of a batch item (row). Concat next episode to this - # row from the episode's beginning. In other words, we never concat + # row from the next episode's beginning. In other words, we never concat # a middle of an episode to another truncated one. 
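            # (Concretely: a sampled row only ever continues with timestep 0 of the
            # next episode, never with a random mid-episode timestep of another one.)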
else: episode_ts = 0 @@ -321,6 +331,10 @@ def get_sampled_timesteps(self) -> int: """Returns number of timesteps that have been sampled in buffer's lifetime.""" return self.sampled_timesteps + def get_added_timesteps(self) -> int: + """Returns number of timesteps that have been added in buffer's lifetime.""" + return self._num_timesteps_added + @override(ReplayBufferInterface) def get_state(self) -> Dict[str, Any]: return { @@ -329,6 +343,7 @@ def get_state(self) -> Dict[str, Any]: "_num_episodes_evicted": self._num_episodes_evicted, "_indices": self._indices, "_num_timesteps": self._num_timesteps, + "_num_timesteps_added": self._num_timesteps_added, "sampled_timesteps": self.sampled_timesteps, } @@ -341,6 +356,7 @@ def set_state(self, state) -> None: self._num_episodes_evicted = state["_num_episodes_evicted"] self._indices = state["_indices"] self._num_timesteps = state["_num_timesteps"] + self._num_timesteps_added = state["_num_timesteps_added"] self.sampled_timesteps = state["sampled_timesteps"] @@ -356,8 +372,9 @@ def __init__( actions=None, rewards=None, states=None, - is_terminated=False, - is_truncated=False, + t: int = 0, + is_terminated: bool = False, + is_truncated: bool = False, render_images=None, ): self.id_ = id_ or uuid.uuid4().hex @@ -370,6 +387,9 @@ def __init__( # h-states: t0 (in case this episode is a continuation chunk, we need to know # about the initial h) to T. self.states = states + # The global last timestep of the episode and the timesteps when this chunk + # started. + self.t = self.t_started = t # obs[-1] is the final observation in the episode. self.is_terminated = is_terminated # obs[-1] is the last obs in a truncated-by-the-env episode (there will no more @@ -381,13 +401,18 @@ def __init__( self.render_images = [] if render_images is None else render_images def concat_episode(self, episode_chunk: "_Episode"): + """Adds the given `episode_chunk` to the right side of self.""" assert episode_chunk.id_ == self.id_ assert not self.is_done + # Make sure the timesteps match. + assert self.t == episode_chunk.t_started episode_chunk.validate() # Make sure, end matches other episode chunk's beginning. assert np.all(episode_chunk.observations[0] == self.observations[-1]) + # Make sure the timesteps match (our last t should be the same as their first). + assert self.t == episode_chunk.t_started # Pop out our end. self.observations.pop() @@ -396,6 +421,7 @@ def concat_episode(self, episode_chunk: "_Episode"): self.observations.extend(list(episode_chunk.observations)) self.actions.extend(list(episode_chunk.actions)) self.rewards.extend(list(episode_chunk.rewards)) + self.t = episode_chunk.t self.states = episode_chunk.states if episode_chunk.is_terminated: @@ -405,6 +431,21 @@ def concat_episode(self, episode_chunk: "_Episode"): # Validate. self.validate() + def add_initial_observation( + self, *, initial_observation, initial_state=None, initial_render_image=None + ): + assert not self.is_done + assert len(self.observations) == 0 + # Assume that this episode is completely empty and has not stepped yet. + # Leave self.t (and self.t_started) at 0. + assert self.t == self.t_started == 0 + + self.observations.append(initial_observation) + self.states = initial_state + if initial_render_image is not None: + self.render_images.append(initial_render_image) + self.validate() + def add_timestep( self, observation, @@ -416,34 +457,25 @@ def add_timestep( is_truncated=False, render_image=None, ): + # Cannot add data to an already done episode. 
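+        # (Each successful `add_timestep()` call advances `self.t` by one;
+        # `validate()` below then checks that `len(rewards) == t - t_started`.)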
         assert not self.is_done
 
         self.observations.append(observation)
         self.actions.append(action)
         self.rewards.append(reward)
         self.states = state
+        self.t += 1
         if render_image is not None:
             self.render_images.append(render_image)
         self.is_terminated = is_terminated
         self.is_truncated = is_truncated
         self.validate()
 
-    def add_initial_observation(
-        self, *, initial_observation, initial_state=None, initial_render_image=None
-    ):
-        assert not self.is_done
-        assert len(self.observations) == 0
-
-        self.observations.append(initial_observation)
-        self.states = initial_state
-        if initial_render_image is not None:
-            self.render_images.append(initial_render_image)
-        self.validate()
-
     def validate(self):
         # Make sure we always have one more obs stored than rewards (and actions)
         # due to the reset and last-obs logic of an MDP.
         assert len(self.observations) == len(self.rewards) + 1 == len(self.actions) + 1
+        assert len(self.rewards) == (self.t - self.t_started)
 
         # Convert all lists to numpy arrays, if we are terminated.
         if self.is_done:
@@ -454,8 +486,43 @@
 
     @property
     def is_done(self):
+        """Whether the episode is actually done (terminated or truncated).
+
+        A done episode can no longer be continued via `self.add_timestep()`,
+        concatenated on its right side with another episode chunk, or succeeded
+        via `self.create_successor()`.
+        """
         return self.is_terminated or self.is_truncated
 
+    def create_successor(self) -> "_Episode":
+        """Returns a successor episode chunk (of len=0) continuing with this one.
+
+        The successor will have the same ID and state as self and its only observation
+        will be the last observation in self. Its length will therefore be 0 (no
+        steps taken yet).
+
+        This method is useful if you would like to discontinue building an episode
+        chunk (b/c you have to return it from somewhere), but would like to have a new
+        episode (chunk) instance to continue building the actual env episode at a later
+        time.
+
+        Returns:
+            The successor Episode chunk of this one with the same ID and state and the
+            only observation being the last observation in self.
+        """
+        assert not self.is_done
+
+        return _Episode(
+            # Same ID.
+            id_=self.id_,
+            # First (and only) observation of successor is this episode's last obs.
+            observations=[self.observations[-1]],
+            # Same state.
+            states=self.states,
+            # Continue with self's current timestep.
+            t=self.t,
+        )
+
     def to_sample_batch(self):
         return SampleBatch(
             {
@@ -497,6 +564,8 @@ def get_state(self):
                 "actions": self.actions,
                 "rewards": self.rewards,
                 "states": self.states,
+                "t_started": self.t_started,
+                "t": self.t,
                 "is_terminated": self.is_terminated,
                 "is_truncated": self.is_truncated,
             }.items()
@@ -509,14 +578,16 @@ def from_state(state):
         eps.actions = state[2][1]
         eps.rewards = state[3][1]
         eps.states = state[4][1]
-        eps.is_terminated = state[5][1]
-        eps.is_truncated = state[6][1]
+        eps.t_started = state[5][1]
+        eps.t = state[6][1]
+        eps.is_terminated = state[7][1]
+        eps.is_truncated = state[8][1]
         return eps
 
     def __len__(self):
         assert len(self.observations) > 0, (
             "ERROR: Cannot determine length of episode that hasn't started yet! "
-            "Call `_Episode.add_initial_obs(initial_observation=...)` first "
+            "Call `_Episode.add_initial_observation(initial_observation=...)` first "
             "(after which `len(_Episode)` will be 0)."
         )
         return len(self.observations) - 1
diff --git a/rllib/utils/tf_utils.py b/rllib/utils/tf_utils.py
index 7b43953c5b67f..3acbbad004c0f 100644
--- a/rllib/utils/tf_utils.py
+++ b/rllib/utils/tf_utils.py
@@ -679,7 +679,7 @@ def two_hot(
     # First make sure, values are clipped.
     value = tf.clip_by_value(value, lower_bound, upper_bound)
     # Tensor of batch indices: [0, B=batch size).
-    batch_indices = tf.range(0, value.shape[0], dtype=tf.float32)
+    batch_indices = tf.range(0, tf.shape(value)[0], dtype=tf.float32)
     # Calculate the step deltas (how much space between each bucket's central value?).
     bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1)
     # Compute the float indices (might be non-int numbers: sitting between two buckets).
@@ -690,12 +690,12 @@ def two_hot(
     kp1 = tf.math.ceil(idx)
     # In case k == kp1 (idx is exactly on the bucket boundary), move kp1 up by 1.0.
     # Otherwise, this would result in a NaN in the returned two-hot tensor.
-    kp1 = tf.where(k == kp1, kp1 + 1.0, kp1)
+    kp1 = tf.where(tf.equal(k, kp1), kp1 + 1.0, kp1)
     # Iff `kp1` is one beyond our last index (because incoming value is larger than
     # `upper_bound`), move it to one before k (kp1's weight is going to be 0.0 anyways,
     # so it doesn't matter where it points to; we are just avoiding an index error
     # with this).
-    kp1 = tf.where(kp1 == num_buckets, kp1 - 2.0, kp1)
+    kp1 = tf.where(tf.equal(kp1, num_buckets), kp1 - 2.0, kp1)
     # The actual values found at k and k+1 inside the set of buckets.
     values_k = lower_bound + k * bucket_delta
     values_kp1 = lower_bound + kp1 * bucket_delta
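A note on the two-hot trick patched above: a continuous value is spread over its two neighboring bucket centers, weighted by its distance to each, and the kp1 corrections in the TF code handle the exact-boundary and upper-bound edge cases. The following plain-numpy sketch only illustrates the idea (it is not the RLlib implementation; the bucket count and bounds here are arbitrary choices):

import numpy as np

def two_hot_np(value, num_buckets=11, lower_bound=-5.0, upper_bound=5.0):
    # Clip, then locate the (possibly fractional) bucket position of `value`.
    value = float(np.clip(value, lower_bound, upper_bound))
    bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1)
    idx = (value - lower_bound) / bucket_delta
    k = int(np.floor(idx))
    kp1 = min(k + 1, num_buckets - 1)  # Clamp at the last bucket.
    out = np.zeros(num_buckets)
    out[kp1] = idx - k           # Weight on the upper neighbor.
    out[k] += 1.0 - (idx - k)    # Weight on the lower neighbor (+= covers k == kp1).
    return out

# 1.2 lies between the bucket centers 1.0 (index 6) and 2.0 (index 7):
print(two_hot_np(1.2))  # 0.8 at index 6, 0.2 at index 7, zeros elsewhere.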
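As a usage sketch for the extended `_Episode` API in episode_replay_buffer.py above (the new `t`/`t_started` bookkeeping, `add_initial_observation()`, `add_timestep()`, and `create_successor()`), the snippet below shows how an env-runner-style loop might build a chunk and hand it off. This is not part of the patch; `_Episode` is an internal helper, and the gymnasium env and step count are arbitrary choices:

import gymnasium as gym

from ray.rllib.utils.replay_buffers.episode_replay_buffer import _Episode

env = gym.make("CartPole-v1")
obs, _ = env.reset()

# Fresh episode chunk: one observation, zero steps, t == t_started == 0.
episode = _Episode()
episode.add_initial_observation(initial_observation=obs)
assert len(episode) == 0 and episode.t == episode.t_started == 0

for _ in range(10):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, _ = env.step(action)
    episode.add_timestep(
        obs, action, reward, is_terminated=terminated, is_truncated=truncated
    )
    if episode.is_done:
        break

if not episode.is_done:
    # Hand this chunk off (e.g. to the episode replay buffer's add()) and keep
    # building the same env episode in a fresh, zero-length successor chunk.
    successor = episode.create_successor()
    assert successor.id_ == episode.id_
    assert len(successor) == 0 and successor.t_started == episode.t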
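Finally, for the tuned Pendulum example at the top of this section: besides running it through run_regression_tests.py, the config can presumably also be built and trained by hand via the usual AlgorithmConfig workflow. A rough sketch (iteration count and printed metric are my own choices; the exact result keys depend on the stack in use):

from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config

config = (
    DreamerV3Config()
    .environment("Pendulum-v1")
    .training(model_size="XS", training_ratio=1024)
)

algo = config.build()
for _ in range(3):
    results = algo.train()
    # Metric key may differ under the new EnvRunner/Learner stack, hence .get().
    print(results.get("episode_reward_mean"))
algo.stop()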