diff --git a/rllib/BUILD b/rllib/BUILD index b66d0ec983e41..221c2362b56cf 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1066,7 +1066,7 @@ py_test( srcs = ["algorithms/dqn/tests/test_repro_dqn.py"] ) -# Dreamer (V1) +# Dreamer py_test( name = "test_dreamer", tags = ["team:rllib", "algorithms_dir"], @@ -1074,16 +1074,6 @@ py_test( srcs = ["algorithms/dreamer/tests/test_dreamer.py"] ) -# DreamerV3 -# TODO (sven): Enable once the version conflict for gymnasium/supersuit/pettingzoo -# /shimmy/mujoco has been resolved. -#py_test( -# name = "test_dreamerv3", -# tags = ["team:rllib", "algorithms_dir"], -# size = "large", -# srcs = ["algorithms/dreamerv3/tests/test_dreamerv3.py"] -#) - # DT py_test( name = "test_segmentation_buffer", @@ -4355,7 +4345,6 @@ py_test_module_list( files = [ "tests/test_dnc.py", "tests/test_perf.py", - "algorithms/dreamerv3/tests/test_dreamerv3.py", "env/wrappers/tests/test_kaggle_wrapper.py", "examples/env/tests/test_cliff_walking_wall_env.py", "examples/env/tests/test_coin_game_non_vectorized_env.py", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 7e3c32a4efc51..29de0b01a3be5 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -706,19 +706,7 @@ def setup(self, config: AlgorithmConfig) -> None: # the two we need to loop through the policy modules and create a simple # MARLModule from the RLModule within each policy. local_worker = self.workers.local_worker() - policy_dict, _ = self.config.get_multi_agent_setup( - env=local_worker.env, - spaces=getattr(local_worker, "spaces", None), - ) - # TODO (Sven): Unify the inference of the MARLModuleSpec. Right now, - # we get this from the RolloutWorker's `marl_module_spec` property. - # However, this is hacky (information leak) and should not remain this - # way. For other EnvRunner classes (that don't have this property), - # Algorithm should infer this itself. - if hasattr(local_worker, "marl_module_spec"): - module_spec = local_worker.marl_module_spec - else: - module_spec = self.config.get_marl_module_spec(policy_dict=policy_dict) + module_spec = local_worker.marl_module_spec learner_group_config = self.config.get_learner_group_config(module_spec) self.learner_group = learner_group_config.build() @@ -883,7 +871,7 @@ def evaluate( # Sync weights to the evaluation WorkerSet. if self.evaluation_workers is not None: self.evaluation_workers.sync_weights( - from_worker_or_learner_group=self.workers.local_worker() + from_worker_or_trainer=self.workers.local_worker() ) self._sync_filters_if_needed( central_worker=self.workers.local_worker(), @@ -1421,7 +1409,7 @@ def training_step(self) -> ResultDict: if self.config._enable_learner_api: from_worker_or_trainer = self.learner_group self.workers.sync_weights( - from_worker_or_learner_group=from_worker_or_trainer, + from_worker_or_trainer=from_worker_or_trainer, policies=list(train_results.keys()), global_vars=global_vars, ) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 2510490d48426..a037f7bb052b3 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -303,8 +303,10 @@ def __init__(self, algo_class=None): self.normalize_actions = True self.clip_actions = False self.disable_env_checking = False + # Whether this env is an atari env (for atari-specific preprocessing). + # If not specified, we will try to auto-detect this. 
+ self.is_atari = None self.auto_wrap_old_gym_envs = True - self._is_atari = None # `self.rollouts()` self.env_runner_cls = None @@ -716,6 +718,31 @@ def freeze(self) -> None: # of themselves? This way, users won't even be able to alter those values # directly anymore. + def _detect_atari_env(self) -> bool: + """Returns whether this configured env is an Atari env or not. + + Returns: + True, if specified env is an Atari env, False otherwise. + """ + # Atari envs are usually specified via a string like "PongNoFrameskip-v4" + # or "ALE/Breakout-v5". + # We do NOT attempt to auto-detect Atari env for other specified types like + # a callable, to avoid running heavy logics in validate(). + # For these cases, users can explicitly set `environment(atari=True)`. + if not type(self.env) == str: + return False + + try: + if self.env.startswith("ALE/"): + env = gym.make("GymV26Environment-v0", env_id=self.env) + else: + env = gym.make(self.env) + except gym.error.NameNotFound: + # Not an Atari env if this is not a gym env. + return False + + return is_atari(env) + @OverrideToImplementCustomLogic_CallToSuperRecommended def validate(self) -> None: """Validates all values in this config.""" @@ -961,6 +988,10 @@ def validate(self) -> None: f"config.framework({self.framework_str})!" ) + # Detect if specified env is an Atari env. + if self.is_atari is None: + self.is_atari = self._detect_atari_env() + if self.input_ == "sampler" and self.off_policy_estimation_methods: raise ValueError( "Off-policy estimation methods can only be used if the input is a " @@ -1337,7 +1368,7 @@ def environment( disable_env_checking: If True, disable the environment pre-checking module. is_atari: This config can be used to explicitly specify whether the env is an Atari env or not. If not specified, RLlib will try to auto-detect - this. + this during config validation. auto_wrap_old_gym_envs: Whether to auto-wrap old gym environments (using the pre 0.24 gym APIs, e.g. reset() returning single obs and no info dict). If True, RLlib will automatically wrap the given gym env class @@ -1374,7 +1405,7 @@ def environment( if disable_env_checking is not NotProvided: self.disable_env_checking = disable_env_checking if is_atari is not NotProvided: - self._is_atari = is_atari + self.is_atari = is_atari if auto_wrap_old_gym_envs is not NotProvided: self.auto_wrap_old_gym_envs = auto_wrap_old_gym_envs @@ -2288,8 +2319,6 @@ def reporting( In case there are more than this many episodes collected in a single training iteration, use all of these episodes for metrics computation, meaning don't ever cut any "excess" episodes. - Set this to 1 to disable smoothing and to always report only the most - recently collected episode's return. min_time_s_per_iteration: Minimum time to accumulate within a single `train()` call. This value does not affect learning, only the number of times `Algorithm.training_step()` is called by @@ -2616,34 +2645,6 @@ def learner_class(self) -> Type["Learner"]: """ return self._learner_class or self.get_default_learner_class() - @property - def is_atari(self) -> bool: - """True if if specified env is an Atari env.""" - - # Not yet determined, try to figure this out. - if self._is_atari is None: - # Atari envs are usually specified via a string like "PongNoFrameskip-v4" - # or "ALE/Breakout-v5". - # We do NOT attempt to auto-detect Atari env for other specified types like - # a callable, to avoid running heavy logics in validate(). - # For these cases, users can explicitly set `environment(atari=True)`. 
- if not type(self.env) == str: - return False - try: - if self.env.startswith("ALE/"): - env = gym.make("GymV26Environment-v0", env_id=self.env) - else: - env = gym.make(self.env) - # Any gymnasium error -> Cannot be an Atari env. - except gym.error.Error: - return False - - self._is_atari = is_atari(env) - # Clean up env's resources, if any. - env.close() - - return self._is_atari - # TODO: Make rollout_fragment_length as read-only property and replace the current # self.rollout_fragment_length a private variable. def get_rollout_fragment_length(self, worker_index: int = 0) -> int: diff --git a/rllib/algorithms/dreamerv3/README.md b/rllib/algorithms/dreamerv3/README.md deleted file mode 100644 index 8c64b960b7b73..0000000000000 --- a/rllib/algorithms/dreamerv3/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# DreamerV3 -Implementation (TensorFlow/Keras) of the "DreamerV3" model-based reinforcement learning -(RL) algorithm by D. Hafner et al. 2023 - -DreamerV3 train a world model in supervised fashion using real environment -interactions. The world model utilizes a recurrent GRU-based architecture -("recurrent state space model" or RSSM) and uses it to predicts rewards, -episode continuation flags, as well as, observations. -With these predictions (dreams) made by the world model, both actor -and critic are trained in classic REINFORCE fashion. In other words, the -actual RL components of the model are never trained on actual environment data, -but on dreamed trajectories only. - -For more algorithm details, see: - -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -.. and the "DreamerV2" paper: - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf - -## Results -TODO diff --git a/rllib/algorithms/dreamerv3/__init__.py b/rllib/algorithms/dreamerv3/__init__.py deleted file mode 100644 index d4b2adb0d57ed..0000000000000 --- a/rllib/algorithms/dreamerv3/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3, DreamerV3Config - -__all__ = [ - "DreamerV3", - "DreamerV3Config", -] diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py deleted file mode 100644 index 515f6e3a22a29..0000000000000 --- a/rllib/algorithms/dreamerv3/dreamerv3.py +++ /dev/null @@ -1,660 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -import dataclasses -import gc -import logging -import tree # pip install dm_tree -from typing import Any, Dict, List, Optional - -import gymnasium as gym -import numpy as np - -from ray.rllib.algorithms.algorithm import Algorithm -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.dreamerv3.dreamerv3_catalog import DreamerV3Catalog -from ray.rllib.algorithms.dreamerv3.dreamerv3_learner import ( - DreamerV3LearnerHyperparameters, -) -from ray.rllib.algorithms.dreamerv3.utils import do_symlog_obs -from ray.rllib.algorithms.dreamerv3.utils.env_runner import DreamerV3EnvRunner -from ray.rllib.algorithms.dreamerv3.utils.summaries import ( - report_predicted_vs_sampled_obs, - report_sampling_and_replay_buffer, -) -from ray.rllib.core.learner.learner import LearnerHyperparameters -from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch -from ray.rllib.utils import deep_update -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.numpy import one_hot -from ray.rllib.utils.metrics import ( - ALL_MODULES, - GARBAGE_COLLECTION_TIMER, - LEARN_ON_BATCH_TIMER, - NUM_AGENT_STEPS_SAMPLED, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_SAMPLED, - NUM_ENV_STEPS_TRAINED, - NUM_GRAD_UPDATES_LIFETIME, - NUM_SYNCH_WORKER_WEIGHTS, - NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, - SAMPLE_TIMER, - SYNCH_WORKER_WEIGHTS_TIMER, -) -from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer -from ray.rllib.utils.typing import ResultDict - - -logger = logging.getLogger(__name__) - -_, tf, _ = try_import_tf() - - -class DreamerV3Config(AlgorithmConfig): - """Defines a configuration class from which a DreamerV3 can be built. - - Example: - >>> from ray.rllib.algorithms.dreamerv3 import DreamerV3Config - >>> config = DreamerV3Config() - >>> config = config.training( # doctest: +SKIP - ... batch_size_B=8, model_size="M" - ... ) - >>> config = config.resources(num_learner_workers=4) # doctest: +SKIP - >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Algorithm object from the config and run 1 training iteration. - >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP - >>> algo.train() # doctest: +SKIP - - Example: - >>> from ray.rllib.algorithms.dreamerv3 import DreamerV3Config - >>> from ray import air - >>> from ray import tune - >>> config = DreamerV3Config() - >>> # Print out some default values. - >>> print(config.training_ratio) # doctest: +SKIP - >>> # Update the config object. - >>> config = config.training( # doctest: +SKIP - ... training_ratio=tune.grid_search([256, 512, 1024]) - ... ) - >>> # Set the config object's env. - >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP - >>> # Use to_dict() to get the old-style python config dict - >>> # when running with tune. - >>> tune.Tuner( # doctest: +SKIP - ... "DreamerV3", - ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), - ... param_space=config.to_dict(), - ... 
).fit() - """ - - def __init__(self, algo_class=None): - """Initializes a DreamerV3Config instance.""" - super().__init__(algo_class=algo_class or DreamerV3) - - # fmt: off - # __sphinx_doc_begin__ - - # DreamerV3 specific settings: - self.model_size = "XS" - self.training_ratio = 1024 - - self.replay_buffer_config = { - "type": "EpisodeReplayBuffer", - "capacity": int(1e6), - } - self.world_model_lr = 1e-4 - self.actor_lr = 3e-5 - self.critic_lr = 3e-5 - self.batch_size_B = 16 - self.batch_length_T = 64 - self.horizon_H = 15 - self.gae_lambda = 0.95 # [1] eq. 7. - self.entropy_scale = 3e-4 # [1] eq. 11. - self.return_normalization_decay = 0.99 # [1] eq. 11 and 12. - self.train_critic = True - self.train_actor = True - self.intrinsic_rewards_scale = 0.1 - self.world_model_grad_clip_by_global_norm = 1000.0 - self.critic_grad_clip_by_global_norm = 100.0 - self.actor_grad_clip_by_global_norm = 100.0 - - # Reporting. - # DreamerV3 is super sample efficient and only needs very few episodes - # (normally) to learn. Leaving this at its default value would gravely - # underestimate the learning performance over the course of an experiment. - self.metrics_num_episodes_for_smoothing = 1 - self.report_individual_batch_item_stats = False - self.report_dream_data = False - self.report_images_and_videos = False - self.gc_frequency_train_steps = 100 - - # Override some of AlgorithmConfig's default values with DreamerV3-specific - # values. - self.lr = None - self.framework_str = "tf2" - self.gamma = 0.997 # [1] eq. 7. - # Do not use! Set `batch_size_B` and `batch_length_T` instead. - self.train_batch_size = None - self.env_runner_cls = DreamerV3EnvRunner - self.num_rollout_workers = 0 - self.rollout_fragment_length = 1 - # Since we are using a gymnasium-based EnvRunner, we can utilitze its - # vectorization capabilities w/o suffering performance losses (as we would - # with RLlib's `RemoteVectorEnv`). - self.remote_worker_envs = True - # Dreamer only runs on the new API stack. - self._enable_learner_api = True - self._enable_rl_module_api = True - # __sphinx_doc_end__ - # fmt: on - - @override(AlgorithmConfig) - def training( - self, - *, - model_size: Optional[str] = NotProvided, - training_ratio: Optional[float] = NotProvided, - gc_frequency_train_steps: Optional[int] = NotProvided, - batch_size_B: Optional[int] = NotProvided, - batch_length_T: Optional[int] = NotProvided, - horizon_H: Optional[int] = NotProvided, - gae_lambda: Optional[float] = NotProvided, - entropy_scale: Optional[float] = NotProvided, - return_normalization_decay: Optional[float] = NotProvided, - train_critic: Optional[bool] = NotProvided, - train_actor: Optional[bool] = NotProvided, - intrinsic_rewards_scale: Optional[float] = NotProvided, - world_model_grad_clip_by_global_norm: Optional[float] = NotProvided, - critic_grad_clip_by_global_norm: Optional[float] = NotProvided, - actor_grad_clip_by_global_norm: Optional[float] = NotProvided, - replay_buffer_config: Optional[dict] = NotProvided, - **kwargs, - ) -> "DreamerV3Config": - """Sets the training related configuration. - - Args: - model_size: The main switch for adjusting the overall model size. See [1] - (table B) for more information on the effects of this setting on the - model architecture. - Supported values are "XS", "S", "M", "L", "XL" (as per the paper), as - well as, "nano", "micro", "mini", and "XXS" (for RLlib's - implementation). See ray.rllib.algorithms.dreamerv3.utils. 
- __init__.py for the details on what exactly each size does to the layer - sizes, number of layers, etc.. - training_ratio: The ratio of total steps trained (sum of the sizes of all - batches ever sampled from the replay buffer) over the total env steps - taken (in the actual environment, not the dreamed one). For example, - if the training_ratio is 1024 and the batch size is 1024, we would take - 1 env step for every training update: 1024 / 1. If the training ratio - is 512 and the batch size is 1024, we would take 2 env steps and then - perform a single training update (on a 1024 batch): 1024 / 2. - gc_frequency_train_steps: The frequency (in training iterations) with which - we perform a `gc.collect()` calls at the end of a `training_step` - iteration. Doing this more often adds a (albeit very small) performance - overhead, but prevents memory leaks from becoming harmful. - TODO (sven): This might not be necessary anymore, but needs to be - confirmed experimentally. - batch_size_B: The batch size (B) interpreted as number of rows (each of - length `batch_length_T`) to sample from the replay buffer in each - iteration. - batch_length_T: The batch length (T) interpreted as the length of each row - sampled from the replay buffer in each iteration. Note that - `batch_size_B` rows will be sampled in each iteration. Rows normally - contain consecutive data (consecutive timesteps from the same episode), - but there might be episode boundaries in a row as well. - horizon_H: The horizon (in timesteps) used to create dreamed data from the - world model, which in turn is used to train/update both actor- and - critic networks. - gae_lambda: The lambda parameter used for computing the GAE-style - value targets for the actor- and critic losses. - entropy_scale: The factor with which to multiply the entropy loss term - inside the actor loss. - return_normalization_decay: The decay value to use when computing the - running EMA values for return normalization (used in the actor loss). - train_critic: Whether to train the critic network. If False, `train_actor` - must also be False (cannot train actor w/o training the critic). - train_actor: Whether to train the actor network. If True, `train_critic` - must also be True (cannot train actor w/o training the critic). - intrinsic_rewards_scale: The factor to multiply intrinsic rewards with - before adding them to the extrinsic (environment) rewards. - world_model_grad_clip_by_global_norm: World model grad clipping value - (by global norm). - critic_grad_clip_by_global_norm: Critic grad clipping value - (by global norm). - actor_grad_clip_by_global_norm: Actor grad clipping value (by global norm). - replay_buffer_config: Replay buffer config. - Only serves in DreamerV3 to set the capacity of the replay buffer. - Note though that in the paper ([1]) a size of 1M is used for all - benchmarks and there doesn't seem to be a good reason to change this - parameter. - Examples: - { - "type": "EpisodeReplayBuffer", - "capacity": 100000, - } - - Returns: - This updated AlgorithmConfig object. - """ - # Pass kwargs onto super's `training()` method. 
- super().training(**kwargs) - - if model_size is not NotProvided: - self.model_size = model_size - if training_ratio is not NotProvided: - self.training_ratio = training_ratio - if gc_frequency_train_steps is not NotProvided: - self.gc_frequency_train_steps = gc_frequency_train_steps - if batch_size_B is not NotProvided: - self.batch_size_B = batch_size_B - if batch_length_T is not NotProvided: - self.batch_length_T = batch_length_T - if horizon_H is not NotProvided: - self.horizon_H = horizon_H - if gae_lambda is not NotProvided: - self.gae_lambda = gae_lambda - if entropy_scale is not NotProvided: - self.entropy_scale = entropy_scale - if return_normalization_decay is not NotProvided: - self.return_normalization_decay = return_normalization_decay - if train_critic is not NotProvided: - self.train_critic = train_critic - if train_actor is not NotProvided: - self.train_actor = train_actor - if intrinsic_rewards_scale is not NotProvided: - self.intrinsic_rewards_scale = intrinsic_rewards_scale - if world_model_grad_clip_by_global_norm is not NotProvided: - self.world_model_grad_clip_by_global_norm = ( - world_model_grad_clip_by_global_norm - ) - if critic_grad_clip_by_global_norm is not NotProvided: - self.critic_grad_clip_by_global_norm = critic_grad_clip_by_global_norm - if actor_grad_clip_by_global_norm is not NotProvided: - self.actor_grad_clip_by_global_norm = actor_grad_clip_by_global_norm - if replay_buffer_config is not NotProvided: - # Override entire `replay_buffer_config` if `type` key changes. - # Update, if `type` key remains the same or is not specified. - new_replay_buffer_config = deep_update( - {"replay_buffer_config": self.replay_buffer_config}, - {"replay_buffer_config": replay_buffer_config}, - False, - ["replay_buffer_config"], - ["replay_buffer_config"], - ) - self.replay_buffer_config = new_replay_buffer_config["replay_buffer_config"] - - return self - - @override(AlgorithmConfig) - def reporting( - self, - *, - report_individual_batch_item_stats: Optional[bool] = NotProvided, - report_dream_data: Optional[bool] = NotProvided, - report_images_and_videos: Optional[bool] = NotProvided, - **kwargs, - ): - """Sets the reporting related configuration. - - Args: - report_individual_batch_item_stats: Whether to include loss and other stats - per individual timestep inside the training batch in the result dict - returned by `training_step()`. If True, besides the `CRITIC_L_total`, - the individual critic loss values per batch row and time axis step - in the train batch (CRITIC_L_total_B_T) will also be part of the - results. - report_dream_data: Whether to include the dreamed trajectory data in the - result dict returned by `training_step()`. If True, however, will - slice each reported item in the dream data down to the shape. - (H, B, t=0, ...), where H is the horizon and B is the batch size. The - original time axis will only be represented by the first timestep - to not make this data too large to handle. - report_images_and_videos: Whether to include any image/video data in the - result dict returned by `training_step()`. - **kwargs: - - Returns: - This updated AlgorithmConfig object. 
- """ - super().reporting(**kwargs) - - if report_individual_batch_item_stats is not NotProvided: - self.report_individual_batch_item_stats = report_individual_batch_item_stats - if report_dream_data is not NotProvided: - self.report_dream_data = report_dream_data - if report_images_and_videos is not NotProvided: - self.report_images_and_videos = report_images_and_videos - - return self - - @override(AlgorithmConfig) - def validate(self) -> None: - # Call the super class' validation method first. - super().validate() - - # Make sure, users are not using DreamerV3 yet for multi-agent: - if self.is_multi_agent(): - raise ValueError("DreamerV3 does NOT support multi-agent setups yet!") - - # Make sure, we are configure for the new API stack. - if not (self._enable_learner_api and self._enable_rl_module_api): - raise ValueError( - "DreamerV3 must be run with `config._enable_learner_api`=True AND " - "with `config._enable_rl_module_api`=True!" - ) - - # If run on several Learners, the provided batch_size_B must be a multiple - # of `num_learner_workers`. - if self.num_learner_workers > 1 and ( - self.batch_size_B % self.num_learner_workers != 0 - ): - raise ValueError( - f"Your `batch_size_B` ({self.batch_size_B}) must be a multiple of " - f"`num_learner_workers` ({self.num_learner_workers}) in order for " - "DreamerV3 to be able to split batches evenly across your Learner " - "processes." - ) - - # Cannot train actor w/o critic. - if self.train_actor and not self.train_critic: - raise ValueError( - "Cannot train actor network (`train_actor=True`) w/o training critic! " - "Make sure you either set `train_critic=True` or `train_actor=False`." - ) - # Use DreamerV3 specific batch size settings. - if self.train_batch_size is not None: - raise ValueError( - "`train_batch_size` should NOT be set! Use `batch_size_B` and " - "`batch_length_T` instead." - ) - # Must be run with `EpisodeReplayBuffer` type. - if self.replay_buffer_config.get("type") != "EpisodeReplayBuffer": - raise ValueError( - "DreamerV3 must be run with the `EpisodeReplayBuffer` type! None " - "other supported." 
- ) - - @override(AlgorithmConfig) - def get_learner_hyperparameters(self) -> LearnerHyperparameters: - base_hps = super().get_learner_hyperparameters() - return DreamerV3LearnerHyperparameters( - model_size=self.model_size, - training_ratio=self.training_ratio, - batch_size_B=self.batch_size_B // (self.num_learner_workers or 1), - batch_length_T=self.batch_length_T, - horizon_H=self.horizon_H, - gamma=self.gamma, - gae_lambda=self.gae_lambda, - entropy_scale=self.entropy_scale, - return_normalization_decay=self.return_normalization_decay, - train_actor=self.train_actor, - train_critic=self.train_critic, - world_model_lr=self.world_model_lr, - intrinsic_rewards_scale=self.intrinsic_rewards_scale, - actor_lr=self.actor_lr, - critic_lr=self.critic_lr, - world_model_grad_clip_by_global_norm=( - self.world_model_grad_clip_by_global_norm - ), - actor_grad_clip_by_global_norm=self.actor_grad_clip_by_global_norm, - critic_grad_clip_by_global_norm=self.critic_grad_clip_by_global_norm, - report_individual_batch_item_stats=( - self.report_individual_batch_item_stats - ), - report_dream_data=self.report_dream_data, - report_images_and_videos=self.report_images_and_videos, - **dataclasses.asdict(base_hps), - ) - - @override(AlgorithmConfig) - def get_default_learner_class(self): - if self.framework_str == "tf2": - from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_learner import ( - DreamerV3TfLearner, - ) - - return DreamerV3TfLearner - else: - raise ValueError(f"The framework {self.framework_str} is not supported.") - - @override(AlgorithmConfig) - def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: - if self.framework_str == "tf2": - from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_rl_module import ( - DreamerV3TfRLModule, - ) - - return SingleAgentRLModuleSpec( - module_class=DreamerV3TfRLModule, catalog_class=DreamerV3Catalog - ) - else: - raise ValueError(f"The framework {self.framework_str} is not supported.") - - @property - def share_module_between_env_runner_and_learner(self) -> bool: - # If we only have one local Learner (num_learner_workers=0) and only - # one local EnvRunner (num_rollout_workers=0), share the RLModule - # between these two to avoid having to sync weights, ever. - return self.num_learner_workers == 0 and self.num_rollout_workers == 0 - - -class DreamerV3(Algorithm): - """Implementation of the model-based DreamerV3 RL algorithm described in [1].""" - - @classmethod - @override(Algorithm) - def get_default_config(cls) -> AlgorithmConfig: - return DreamerV3Config() - - @override(Algorithm) - def setup(self, config: AlgorithmConfig): - super().setup(config) - - # Share RLModule between EnvRunner and single (local) Learner instance. - # To avoid possibly expensive weight synching step. - if self.config.share_module_between_env_runner_and_learner: - assert self.workers.local_worker().module is None - self.workers.local_worker().module = self.learner_group._learner.module[ - DEFAULT_POLICY_ID - ] - - # Summarize (single-agent) RLModule (only once) here. - if self.config.framework_str == "tf2": - self.workers.local_worker().module.dreamer_model.summary(expand_nested=True) - - # Create a replay buffer for storing actual env samples. 
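As an aside before the buffer construction below: a minimal NumPy sketch of what such an episode replay buffer does, assuming episodes arrive as dicts of per-timestep arrays that are at least `batch_length_T` steps long. The class and method names are illustrative stand-ins, not the RLlib `EpisodeReplayBuffer` API.

import numpy as np

class TinyEpisodeBuffer:
    """Sketch only: stores whole episodes and samples B rows of length T,
    each row cut from a single stored episode. No capacity eviction and no
    re-concatenation of episode chunks."""

    def __init__(self):
        self.episodes = []

    def add(self, episode):
        # episode: e.g. {"obs": (L, ...), "actions": (L, ...), "rewards": (L,)}
        self.episodes.append(episode)

    def get_num_timesteps(self):
        return sum(len(ep["rewards"]) for ep in self.episodes)

    def sample(self, batch_size_B, batch_length_T):
        rows = []
        for _ in range(batch_size_B):
            ep = self.episodes[np.random.randint(len(self.episodes))]
            start = np.random.randint(len(ep["rewards"]) - batch_length_T + 1)
            rows.append({k: v[start:start + batch_length_T] for k, v in ep.items()})
        # Stack per key into (B, T, ...) arrays.
        return {k: np.stack([r[k] for r in rows]) for k in rows[0]}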
- self.replay_buffer = EpisodeReplayBuffer( - capacity=self.config.replay_buffer_config["capacity"], - batch_size_B=self.config.batch_size_B, - batch_length_T=self.config.batch_length_T, - ) - - @override(Algorithm) - def training_step(self) -> ResultDict: - results = {} - - env_runner = self.workers.local_worker() - - # Push enough samples into buffer initially before we start training. - if self.training_iteration == 0: - logger.info( - "Filling replay buffer so it contains at least " - f"{self.config.batch_size_B * self.config.batch_length_T} timesteps " - "(required for a single train batch)." - ) - - # Have we sampled yet in this `training_step()` call? - have_sampled = False - with self._timers[SAMPLE_TIMER]: - # Continue sampling from the actual environment (and add collected samples - # to our replay buffer) as long as we: - while ( - # a) Don't have at least batch_size_B x batch_length_T timesteps stored - # in the buffer. This is the minimum needed to train. - self.replay_buffer.get_num_timesteps() - < (self.config.batch_size_B * self.config.batch_length_T) - # b) The computed `training_ratio` is >= the configured (desired) - # training ratio (meaning we should continue sampling). - or self.training_ratio >= self.config.training_ratio - # c) we have not sampled at all yet in this `training_step()` call. - or not have_sampled - ): - done_episodes, ongoing_episodes = env_runner.sample() - have_sampled = True - - # We took B x T env steps. - env_steps_last_sample = sum( - len(eps) for eps in done_episodes + ongoing_episodes - ) - self._counters[NUM_AGENT_STEPS_SAMPLED] += env_steps_last_sample - self._counters[NUM_ENV_STEPS_SAMPLED] += env_steps_last_sample - - # Add ongoing and finished episodes into buffer. The buffer will - # automatically take care of properly concatenating (by episode IDs) - # the different chunks of the same episodes, even if they come in via - # separate `add()` calls. - self.replay_buffer.add(episodes=done_episodes + ongoing_episodes) - - # Summarize environment interaction and buffer data. - results[ALL_MODULES] = report_sampling_and_replay_buffer( - replay_buffer=self.replay_buffer, - ) - - # Continue sampling batch_size_B x batch_length_T sized batches from the buffer - # and using these to update our models (`LearnerGroup.update()`) until the - # computed `training_ratio` is larger than the configured one, meaning we should - # go back and collect more samples again from the actual environment. - # However, when calculating the `training_ratio` here, we use only the - # trained steps in this very `training_step()` call over the most recent sample - # amount (`env_steps_last_sample`), not the global values. This is to avoid a - # heavy overtraining at the very beginning when we have just pre-filled the - # buffer with the minimum amount of samples. - replayed_steps_this_iter = sub_iter = 0 - while ( - replayed_steps_this_iter / env_steps_last_sample - ) < self.config.training_ratio: - - # Time individual batch updates. - with self._timers[LEARN_ON_BATCH_TIMER]: - logger.info(f"\tSub-iteration {self.training_iteration}/{sub_iter})") - - # Draw a new sample from the replay buffer. - sample = self.replay_buffer.sample( - batch_size_B=self.config.batch_size_B, - batch_length_T=self.config.batch_length_T, - ) - replayed_steps = self.config.batch_size_B * self.config.batch_length_T - replayed_steps_this_iter += replayed_steps - - # Convert some bool columns to float32 and one-hot actions. 
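A quick NumPy sketch of the conversion announced in the comment above, assuming a `Discrete(n)` action space and an integer action array of shape (B, T); the helper name is illustrative only.

import numpy as np

def one_hot_actions(actions_int, num_actions):
    # Index an identity matrix with the integer actions; this appends one
    # trailing axis of size `num_actions` holding the one-hot vectors.
    return np.eye(num_actions, dtype=np.float32)[actions_int]

# E.g. actions of shape (B=2, T=3) in Discrete(4) -> one-hots of shape (2, 3, 4):
# one_hot_actions(np.array([[0, 1, 3], [2, 2, 0]]), 4)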
- sample["is_first"] = sample["is_first"].astype(np.float32) - sample["is_last"] = sample["is_last"].astype(np.float32) - sample["is_terminated"] = sample["is_terminated"].astype(np.float32) - if isinstance(env_runner.env.single_action_space, gym.spaces.Discrete): - sample["actions_ints"] = sample[SampleBatch.ACTIONS] - sample[SampleBatch.ACTIONS] = one_hot( - sample["actions_ints"], - depth=env_runner.env.single_action_space.n, - ) - - # Perform the actual update via our learner group. - train_results = self.learner_group.update( - SampleBatch(sample).as_multi_agent(), - reduce_fn=self._reduce_results, - ) - self._counters[NUM_AGENT_STEPS_TRAINED] += replayed_steps - self._counters[NUM_ENV_STEPS_TRAINED] += replayed_steps - - # Perform additional (non-gradient updates), such as the critic EMA-copy - # update. - with self._timers["critic_ema_update"]: - self.learner_group.additional_update( - timestep=self._counters[NUM_ENV_STEPS_TRAINED], - reduce_fn=self._reduce_results, - ) - - if self.config.report_images_and_videos: - report_predicted_vs_sampled_obs( - # TODO (sven): DreamerV3 is single-agent only. - results=train_results[DEFAULT_POLICY_ID], - sample=sample, - batch_size_B=self.config.batch_size_B, - batch_length_T=self.config.batch_length_T, - symlog_obs=do_symlog_obs( - env_runner.env.single_observation_space, - self.config.model.get("symlog_obs", "auto"), - ), - ) - - res = train_results[DEFAULT_POLICY_ID] - logger.info( - f"\t\tWORLD_MODEL_L_total={res['WORLD_MODEL_L_total']:.5f} (" - f"L_pred={res['WORLD_MODEL_L_prediction']:.5f} (" - f"decoder/obs={res['WORLD_MODEL_L_decoder']} " - f"L_rew={res['WORLD_MODEL_L_reward']} " - f"L_cont={res['WORLD_MODEL_L_continue']}); " - f"L_dyn/rep={res['WORLD_MODEL_L_dynamics']:.5f})" - ) - msg = "\t\t" - if self.config.train_actor: - msg += f"L_actor={res['ACTOR_L_total']:.5f} " - if self.config.train_critic: - msg += f"L_critic={res['CRITIC_L_total']:.5f} " - logger.info(msg) - - sub_iter += 1 - self._counters[NUM_GRAD_UPDATES_LIFETIME] += 1 - - # Update weights - after learning on the LearnerGroup - on all EnvRunner - # workers. - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - # Only necessary if RLModule is not shared between (local) EnvRunner and - # (local) Learner. - if not self.config.share_module_between_env_runner_and_learner: - self._counters[ - NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS - ] = 0 - self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 - self.workers.sync_weights( - from_worker_or_learner_group=self.learner_group - ) - - # Try trick from https://medium.com/dive-into-ml-ai/dealing-with-memory-leak- - # issue-in-keras-model-training-e703907a6501 - if self.config.gc_frequency_train_steps and ( - self.training_iteration % self.config.gc_frequency_train_steps == 0 - ): - with self._timers[GARBAGE_COLLECTION_TIMER]: - gc.collect() - - # Add train results and the actual training ratio to stats. The latter should - # be close to the configured `training_ratio`. - results.update(train_results) - results[ALL_MODULES]["actual_training_ratio"] = self.training_ratio - - # Return all results. - return results - - @property - def training_ratio(self) -> float: - """Returns the actual training ratio of this Algorithm. - - The training ratio is copmuted by dividing the total number of steps - trained thus far (replayed from the buffer) over the total number of actual - env steps taken thus far. 
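A minimal sketch of that bookkeeping, assuming two running counters for trained and sampled env steps (illustrative helper, not RLlib API):

def training_ratio(num_env_steps_trained, num_env_steps_sampled):
    # Timesteps replayed from the buffer so far, divided by env steps taken so far.
    return num_env_steps_trained / max(num_env_steps_sampled, 1)

# With the configured `training_ratio=1024` and a train batch of
# B=16 x T=64 = 1024 timesteps, this settles at roughly one batch update per
# env step: after 10 env steps and 10 replayed batches, the ratio is
# (10 * 1024) / 10 = 1024. DreamerV3's `training_step()` keeps sampling from
# the env while this value is at or above the configured target and keeps
# replaying batches from the buffer otherwise.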
- """ - return self._counters[NUM_ENV_STEPS_TRAINED] / ( - self._counters[NUM_ENV_STEPS_SAMPLED] - ) - - @staticmethod - def _reduce_results(results: List[Dict[str, Any]]): - return tree.map_structure(lambda *s: np.mean(s, axis=0), *results) diff --git a/rllib/algorithms/dreamerv3/dreamerv3_catalog.py b/rllib/algorithms/dreamerv3/dreamerv3_catalog.py deleted file mode 100644 index 50568fe1875ab..0000000000000 --- a/rllib/algorithms/dreamerv3/dreamerv3_catalog.py +++ /dev/null @@ -1,80 +0,0 @@ -import gymnasium as gym - -from ray.rllib.core.models.catalog import Catalog -from ray.rllib.core.models.base import Encoder, Model -from ray.rllib.utils import override - - -class DreamerV3Catalog(Catalog): - """The Catalog class used to build all the models needed for DreamerV3 training.""" - - def __init__( - self, - observation_space: gym.Space, - action_space: gym.Space, - model_config_dict: dict, - ): - """Initializes a DreamerV3Catalog instance. - - Args: - observation_space: The observation space of the environment. - action_space: The action space of the environment. - model_config_dict: The model config to use. - """ - super().__init__( - observation_space=observation_space, - action_space=action_space, - model_config_dict=model_config_dict, - ) - - self.model_size = self.model_config_dict["model_size"] - self.is_img_space = len(self.observation_space.shape) in [2, 3] - self.is_gray_scale = ( - self.is_img_space and len(self.observation_space.shape) == 2 - ) - - # TODO (sven): We should work with sub-component configurations here, - # and even try replacing all current Dreamer model components with - # our default primitives. But for now, we'll construct the DreamerV3Model - # directly in our `build_...()` methods. - - @override(Catalog) - def build_encoder(self, framework: str) -> Encoder: - """Builds the World-Model's encoder network depending on the obs space.""" - if framework != "tf2": - raise NotImplementedError - - if self.is_img_space: - from ray.rllib.algorithms.dreamerv3.tf.models.components.cnn_atari import ( - CNNAtari, - ) - - return CNNAtari(model_size=self.model_size) - else: - from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP - - return MLP(model_size=self.model_size, name="vector_encoder") - - def build_decoder(self, framework: str) -> Model: - """Builds the World-Model's decoder network depending on the obs space.""" - if framework != "tf2": - raise NotImplementedError - - if self.is_img_space: - from ray.rllib.algorithms.dreamerv3.tf.models.components import ( - conv_transpose_atari, - ) - - return conv_transpose_atari.ConvTransposeAtari( - model_size=self.model_size, - gray_scaled=self.is_gray_scale, - ) - else: - from ray.rllib.algorithms.dreamerv3.tf.models.components import ( - vector_decoder, - ) - - return vector_decoder.VectorDecoder( - model_size=self.model_size, - observation_space=self.observation_space, - ) diff --git a/rllib/algorithms/dreamerv3/dreamerv3_learner.py b/rllib/algorithms/dreamerv3/dreamerv3_learner.py index 32c08d0a671f4..c35d1743c8b1a 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_learner.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_learner.py @@ -8,13 +8,11 @@ https://arxiv.org/pdf/2010.02193.pdf """ from dataclasses import dataclass -from typing import Any, DefaultDict, Dict +from typing import Any, Dict from ray.rllib.core.learner.learner import Learner, LearnerHyperparameters from ray.rllib.core.rl_module.rl_module import ModuleID -from ray.rllib.policy.sample_batch import MultiAgentBatch from 
ray.rllib.utils.annotations import override -from ray.rllib.utils.typing import TensorType @dataclass @@ -27,7 +25,7 @@ class to configure your algorithm. more details on the individual properties. """ - model_size: str = None + model_dimension: str = None training_ratio: float = None batch_size_B: int = None batch_length_T: int = None @@ -46,10 +44,6 @@ class to configure your algorithm. world_model_grad_clip_by_global_norm: float = None actor_grad_clip_by_global_norm: float = None critic_grad_clip_by_global_norm: float = None - # Reporting settings. - report_individual_batch_item_stats: bool = None - report_dream_data: bool = None - report_images_and_videos: bool = None class DreamerV3Learner(Learner): @@ -59,31 +53,6 @@ class DreamerV3Learner(Learner): for updating the critic EMA-copy after each training step. """ - @override(Learner) - def compile_results( - self, - *, - batch: MultiAgentBatch, - fwd_out: Dict[str, Any], - loss_per_module: Dict[str, TensorType], - metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]], - ) -> Dict[str, Any]: - results = super().compile_results( - batch=batch, - fwd_out=fwd_out, - loss_per_module=loss_per_module, - metrics_per_module=metrics_per_module, - ) - - # Add the predicted obs distributions for possible (video) summarization. - if self.hps.report_images_and_videos: - for module_id, res in results.items(): - if module_id in fwd_out: - res["WORLD_MODEL_fwd_out_obs_distribution_means_BxT"] = fwd_out[ - module_id - ]["obs_distribution_means_BxT"] - return results - @override(Learner) def additional_update_for_module( self, diff --git a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py index f1a112e7017d1..021fbb8646389 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py +++ b/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py @@ -14,7 +14,6 @@ from ray.rllib.core.models.base import STATE_IN, STATE_OUT from ray.rllib.core.models.specs.specs_dict import SpecDict from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.policy.eager_tf_policy import _convert_to_tf from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import ExperimentalAPI, override from ray.rllib.utils.nested_dict import NestedDict @@ -34,7 +33,7 @@ def setup(self): self.config.observation_space, self.config.model_config_dict.get("symlog_obs", "auto"), ) - model_size = self.config.model_config_dict["model_size"] + model_dimension = self.config.model_config_dict["model_dimension"] # Build encoder and decoder from catalog. catalog = self.config.get_catalog() @@ -43,34 +42,40 @@ def setup(self): # Build the world model (containing encoder and decoder). self.world_model = WorldModel( - model_size=model_size, + model_dimension=model_dimension, action_space=self.config.action_space, batch_length_T=T, + # num_gru_units=self.model_config.num_gru_units, encoder=self.encoder, decoder=self.decoder, symlog_obs=symlog_obs, ) self.actor = ActorNetwork( action_space=self.config.action_space, - model_size=model_size, + model_dimension=model_dimension, ) self.critic = CriticNetwork( - model_size=model_size, + model_dimension=model_dimension, ) # Build the final dreamer model (containing the world model). 
self.dreamer_model = DreamerModel( - model_size=self.config.model_config_dict["model_size"], + model_dimension=self.config.model_config_dict["model_dimension"], action_space=self.config.action_space, world_model=self.world_model, actor=self.actor, critic=self.critic, + # use_curiosity=use_curiosity, + # intrinsic_rewards_scale=intrinsic_rewards_scale, + batch_size_B=self.config.model_config_dict["batch_size_B"], + batch_length_T=T, + horizon_H=horizon_H, ) self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) # Perform a test `call()` to force building the dreamer model's variables. test_obs = np.tile( np.expand_dims(self.config.observation_space.sample(), (0, 1)), - reps=(B, T) + (1,) * len(self.config.observation_space.shape), + reps=(B, T, 1), ) test_actions = np.tile( np.expand_dims( @@ -82,13 +87,15 @@ def setup(self): reps=(B, T, 1), ) self.dreamer_model( - inputs=_convert_to_tf(test_obs), - actions=_convert_to_tf(test_actions.astype(np.float32)), - is_first=_convert_to_tf(np.ones((B, T), np.float32)), - start_is_terminated_BxT=_convert_to_tf(np.zeros((B * T,), np.float32)), + inputs=test_obs, + actions=test_actions.astype(np.float32), + is_first=np.ones((B, T), np.float32), + start_is_terminated_BxT=np.zeros((B * T,), np.float32), horizon_H=horizon_H, gamma=gamma, ) + # This should work now. + self.dreamer_model.summary(expand_nested=True) # Initialize the critic EMA net: self.critic.init_ema() @@ -122,7 +129,7 @@ def input_specs_train(self) -> SpecDict: def output_specs_train(self) -> SpecDict: return [ "sampled_obs_symlog_BxT", - "obs_distribution_means_BxT", + "obs_distribution_BxT", "reward_logits_BxT", "rewards_BxT", "continue_distribution_BxT", diff --git a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py deleted file mode 100644 index 2e8ef82fd6dbe..0000000000000 --- a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf - -[3] -D. Hafner's (author) original code repo (for JAX): -https://github.com/danijar/dreamerv3 -""" -import unittest - -import gymnasium as gym -import numpy as np - -import ray -from ray.rllib.algorithms.dreamerv3 import dreamerv3 -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.test_utils import framework_iterator - - -class TestDreamerV3(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_dreamerv3_compilation(self): - """Test whether DreamerV3 can be built with all frameworks.""" - - # Build a DreamerV3Config object. - config = ( - dreamerv3.DreamerV3Config() - .framework(eager_tracing=True) - .training( - # Keep things simple. Especially the long dream rollouts seem - # to take an enormous amount of time (initially). - batch_size_B=2 * 2, # shared w/ model AND learner AND env runner - batch_length_T=16, - horizon_H=5, - # TODO (sven): Fix having to provide this. - # Should be compiled automatically as `RLModuleConfig` by - # AlgorithmConfig (see comment below)? - model={ - "batch_length_T": 16, - "horizon_H": 5, - "model_size": "nano", # Use a tiny model for testing. 
- "gamma": 0.997, - "symlog_obs": True, - }, - ) - .resources( - num_learner_workers=2, # Try with 2 Learners. - num_cpus_per_learner_worker=1, - num_gpus_per_learner_worker=0, - ) - .debugging(log_level="INFO") - ) - - # TODO (sven): Add a `get_model_config` utility to AlgorithmConfig - # that - for now - merges the user provided model_dict (which only - # contains settings that only affect the model, e.g. model_size) - # with the AlgorithmConfig-wide settings that are relevant for the model - # (e.g. `batch_size_B`). - # config.get_model_config() - - num_iterations = 2 - - for _ in framework_iterator(config, frameworks="tf2"): - for env in ["ALE/MsPacman-v5", "FrozenLake-v1", "CartPole-v1"]: - print("Env={}".format(env)) - config.environment(env) - algo = config.build() - - for i in range(num_iterations): - results = algo.train() - print(results) - - algo.stop() - - def test_dreamerv3_dreamer_model_sizes(self): - """Tests, whether the different model sizes match the ones reported in [1].""" - - # For Atari, these are the exact numbers from the repo ([3]). - # However, for CartPole + size "S" and "M", the author's original code will not - # match for the world model count. This is due to the fact that the author uses - # encoder/decoder nets with 5x1024 nodes (which corresponds to XL) regardless of - # the `model_size` settings (iff >="S"). - expected_num_params_world_model = { - "XS_cartpole": 2435076, - "S_cartpole": 7493380, - "M_cartpole": 16206084, - "L_cartpole": 37802244, - "XL_cartpole": 108353796, - "XS_atari": 7538979, - "S_atari": 15687811, - "M_atari": 32461635, - "L_atari": 68278275, - "XL_atari": 181558659, - } - - # All values confirmed against [3] (100% match). - expected_num_params_actor = { - # hidden=[1280, 256] - # hidden_norm=[256], [256] - # pi (2 actions)=[256, 2], [2] - "XS_cartpole": 328706, - "S_cartpole": 1051650, - "M_cartpole": 2135042, - "L_cartpole": 4136450, - "XL_cartpole": 9449474, - "XS_atari": 329734, - "S_atari": 1053702, - "M_atari": 2137606, - "L_atari": 4139526, - "XL_atari": 9453574, - } - - # All values confirmed against [3] (100% match). - expected_num_params_critic = { - # hidden=[1280, 256] - # hidden_norm=[256], [256] - # vf (buckets)=[256, 255], [255] - "XS_cartpole": 393727, - "S_cartpole": 1181439, - "M_cartpole": 2297215, - "L_cartpole": 4331007, - "XL_cartpole": 9708799, - "XS_atari": 393727, - "S_atari": 1181439, - "M_atari": 2297215, - "L_atari": 4331007, - "XL_atari": 9708799, - } - - config = ( - dreamerv3.DreamerV3Config() - .framework("tf2", eager_tracing=True) - .training( - model={ - "batch_length_T": 16, - "horizon_H": 5, - "gamma": 0.997, - "symlog_obs": True, - } - ) - ) - - # Check all model_sizes described in the paper ([1]) on matching the number - # of parameters to RLlib's implementation. - for model_size in ["XS", "S", "M", "L", "XL"]: - config.model_size = model_size - config.training(model={"model_size": model_size}) - - # Atari and CartPole spaces. - for obs_space, num_actions, env_name in [ - (gym.spaces.Box(-1.0, 0.0, (4,), np.float32), 2, "cartpole"), - (gym.spaces.Box(-1.0, 0.0, (64, 64, 3), np.float32), 6, "atari"), - ]: - print(f"Testing model_size={model_size} on env-type: {env_name} ..") - config.environment( - observation_space=obs_space, - action_space=gym.spaces.Discrete(num_actions), - ) - - # Create our RLModule to compute actions with. 
- policy_dict, _ = config.get_multi_agent_setup() - module_spec = config.get_marl_module_spec(policy_dict=policy_dict) - rl_module = module_spec.build()[DEFAULT_POLICY_ID] - - # Count the generated RLModule's parameters and compare to the paper's - # reported numbers ([1] and [3]). - num_params_world_model = sum( - np.prod(v.shape.as_list()) - for v in rl_module.world_model.trainable_variables - ) - self.assertEqual( - num_params_world_model, - expected_num_params_world_model[f"{model_size}_{env_name}"], - ) - num_params_actor = sum( - np.prod(v.shape.as_list()) - for v in rl_module.actor.trainable_variables - ) - self.assertEqual( - num_params_actor, - expected_num_params_actor[f"{model_size}_{env_name}"], - ) - num_params_critic = sum( - np.prod(v.shape.as_list()) - for v in rl_module.critic.trainable_variables - ) - self.assertEqual( - num_params_critic, - expected_num_params_critic[f"{model_size}_{env_name}"], - ) - print("\tok") - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py index 366735f643d74..6f970a9117d9e 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py @@ -18,7 +18,7 @@ from ray.rllib.core.rl_module.marl_module import ModuleID from ray.rllib.core.learner.learner import ParamDict from ray.rllib.core.learner.tf.tf_learner import TfLearner -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf, try_import_tfp from ray.rllib.utils.tf_utils import symlog, two_hot, clip_gradients @@ -34,21 +34,16 @@ class DreamerV3TfLearner(DreamerV3Learner, TfLearner): The critic EMA-copy update step can be found in the `DreamerV3Learner` base class, as it is framework independent. - We define 3 local TensorFlow optimizers for the sub components "world_model", + We define 3 local tensorflow optimizers for the sub components "world_model", "actor", and "critic". Each of these optimizers might use a different learning rate, epsilon parameter, and gradient clipping thresholds and procedures. """ @override(TfLearner) - def configure_optimizers_for_module( + def configure_optimizer_for_module( self, module_id: ModuleID, hps: DreamerV3LearnerHyperparameters ): - """Create the 3 optimizers for Dreamer learning: world_model, actor, critic. - - The learning rates used are described in [1] and the epsilon values used here - - albeit probably not that important - are used by the author's own - implementation. - """ + """Create the 3 optimizers for Dreamer learning: world_model, actor, critic.""" dreamerv3_module = self._module[module_id] @@ -100,7 +95,7 @@ def postprocess_gradients_for_module( """Performs gradient clipping on the 3 module components' computed grads. Note that different grad global-norm clip values are used for the 3 - module components: world model, actor, and critic. + module components (world model, actor, and critic). """ for optimizer_name, optimizer in self.get_optimizers_for_module( module_id=module_id @@ -139,32 +134,6 @@ def postprocess_gradients_for_module( return module_gradients_dict - @override(TfLearner) - def compute_gradients( - self, - loss_per_module, - gradient_tape, - **kwargs, - ): - # Override of the default gradient computation method. 
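A minimal TensorFlow sketch of the per-component gradient idea spelled out in the comments here: each loss term is differentiated only with respect to its own sub-component's variables, so the world model's weights never receive gradients from the actor or critic losses. The names are placeholders, not the RLlib Learner API, and the tape is assumed to be persistent since gradient() is called once per component.

import tensorflow as tf

def per_component_grads_and_vars(tape, losses, variables):
    # losses:    {"world_model": L_wm, "actor": L_actor, "critic": L_critic}
    # variables: same keys, each a list of that component's tf.Variables.
    out = {}
    for name, loss in losses.items():
        grads = tape.gradient(loss, variables[name])   # needs persistent=True
        out[name] = list(zip(grads, variables[name]))  # ready for apply_gradients()
    return out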
- # For DreamerV3, we need to compute gradients over the individual loss terms - # as otherwise, the world model's parameters would have their gradients also - # be influenced by the actor- and critic loss terms/gradient computations. - grads = {} - for component in ["world_model", "actor", "critic"]: - grads.update( - gradient_tape.gradient( - # Take individual loss term from the registered metrics for - # the main module. - self._metrics[DEFAULT_POLICY_ID][component.upper() + "_L_total"], - self.filter_param_dict_for_optimizer( - self._params, self.get_optimizer(optimizer_name=component) - ), - ) - ) - del gradient_tape - return grads - @override(TfLearner) def compute_loss_for_module( self, @@ -201,11 +170,7 @@ def compute_loss_for_module( + 0.1 * L_rep_B_T ) - # In the paper, it says to sum up timesteps, and average over - # batch (see eq. 4 in [1]). But Danijar's implementation only does - # averaging (over B and T), so we'll do this here as well. This is generally - # true for all other loss terms as well (we'll always just average, no summing - # over T axis!). + # Sum up timesteps, and average over batch (see eq. 4 in [1]). L_world_model_total = tf.reduce_mean(L_world_model_total_B_T) # Register world model loss stats. @@ -217,36 +182,28 @@ def compute_loss_for_module( ), # Prediction losses. # Decoder (obs) loss. + "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], "WORLD_MODEL_L_decoder": prediction_losses["L_decoder"], # Reward loss. + "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], "WORLD_MODEL_L_reward": prediction_losses["L_reward"], # Continue loss. + "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], "WORLD_MODEL_L_continue": prediction_losses["L_continue"], # Total. + "WORLD_MODEL_L_prediction_B_T": prediction_losses["L_prediction_B_T"], "WORLD_MODEL_L_prediction": prediction_losses["L_prediction"], # Dynamics loss. + "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T, "WORLD_MODEL_L_dynamics": L_dyn, # Representation loss. + "WORLD_MODEL_L_representation_B_T": L_rep_B_T, "WORLD_MODEL_L_representation": L_rep, # Total loss. + "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T, "WORLD_MODEL_L_total": L_world_model_total, }, ) - if hps.report_individual_batch_item_stats: - self.register_metrics( - module_id=module_id, - metrics_dict={ - "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"], - "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"], - "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"], - "WORLD_MODEL_L_prediction_B_T": ( - prediction_losses["L_prediction_B_T"] - ), - "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T, - "WORLD_MODEL_L_representation_B_T": L_rep_B_T, - "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T, - }, - ) # Dream trajectories starting in all internal states (h + z_posterior) that were # computed during world model training. @@ -262,31 +219,17 @@ def compute_loss_for_module( timesteps_H=hps.horizon_H, gamma=hps.gamma, ) - if hps.report_dream_data: - # To reduce this massive mount of data a little, slice out a T=1 piece - # from each stats that has the shape (H, BxT), meaning convert e.g. - # `rewards_dreamed_t0_to_H_BxT` into `rewards_dreamed_t0_to_H_Bx1`. - # This will reduce the amount of data to be transferred and reported - # by the factor of `batch_length_T`. - self.register_metrics( - module_id, - { - # Replace 'T' with '1'. 
- "DREAM_DATA_" + key[:-1] + "1": value[:, hps.batch_size_B] - for key, value in dream_data.items() - if key.endswith("H_BxT") - }, - ) + self.register_metrics(module_id, {"dream_data": dream_data}) value_targets_t0_to_Hm1_BxT = self._compute_value_targets( hps=hps, # Learn critic in symlog'd space. - rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_BxT"], + rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_B"], intrinsic_rewards_t1_to_H_BxT=( dream_data["rewards_intrinsic_t1_to_H_B"] if hps.use_curiosity else None ), - continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_BxT"], - value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_BxT"], + continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_B"], + value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_B"], ) self.register_metric( module_id, "VALUE_TARGETS_H_BxT", value_targets_t0_to_Hm1_BxT @@ -294,7 +237,6 @@ def compute_loss_for_module( CRITIC_L_total = self._compute_critic_loss( module_id=module_id, - hps=hps, dream_data=dream_data, value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT, ) @@ -308,6 +250,16 @@ def compute_loss_for_module( else: ACTOR_L_total = 0.0 + # if hps.use_curiosity: + # L_disagree = self._compute_disagree_loss(dream_data=dream_data) + # results["DISAGREE_L_total"] = L_disagree + # results["DISAGREE_intrinsic_rewards_H_B"] = ( + # dream_data["rewards_intrinsic_t1_to_H_B"] + # ) + # results["DISAGREE_intrinsic_rewards"] = tf.reduce_mean( + # dream_data["rewards_intrinsic_t1_to_H_B"] + # ) + # Return the total loss as a sum of all individual losses. return L_world_model_total + CRITIC_L_total + ACTOR_L_total @@ -337,27 +289,16 @@ def _compute_world_model_prediction_losses( # If symlog is disabled (e.g. for uint8 image inputs), `obs_symlog_BxT` is the # same as `obs_BxT`. obs_BxT = fwd_out["sampled_obs_symlog_BxT"] - obs_distr_means = fwd_out["obs_distribution_means_BxT"] - # In case we wanted to construct a distribution object from the fwd out data, - # we would have to do it like this: - # obs_distr = tfp.distributions.MultivariateNormalDiag( - # loc=obs_distr_means, - # # Scale == 1.0. - # # [2]: "Distributions The image predictor outputs the mean of a diagonal - # # Gaussian likelihood with **unit variance** ..." - # scale_diag=tf.ones_like(obs_distr_means), - # ) - + obs_distr = fwd_out["obs_distribution_BxT"] # Leave time dim folded (BxT) and flatten all other (e.g. image) dims. obs_BxT = tf.reshape(obs_BxT, shape=[-1, tf.reduce_prod(obs_BxT.shape[1:])]) + # Neg logp loss. + # decoder_loss = - obs_distr.log_prob(observations) + # decoder_loss /= observations.shape.as_list()[1] # Squared diff loss w/ sum(!) over all (already folded) obs dims. - # decoder_loss_BxT = SUM[ (obs_distr.loc - observations)^2 ] - # Note: This is described strangely in the paper (stating a neglogp loss here), - # but the author's own implementation actually uses simple MSE with the loc - # of the Gaussian. decoder_loss_BxT = tf.reduce_sum( - tf.math.square(obs_distr_means - obs_BxT), axis=-1 + tf.math.square(obs_distr.loc - obs_BxT), axis=-1 ) # Unfold time rank back in. @@ -515,36 +456,30 @@ def _compute_actor_loss( """ actor = self.module[module_id].actor - # Note: `scaled_value_targets_t0_to_Hm1_B` are NOT stop_gradient'd yet. + # Note: `value_targets` are NOT stop_gradient'd yet. 
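For reference, a NumPy sketch of the return normalization applied in the next call, following [1] eq. 11 and 12: value targets are divided by an EMA of their 5th-to-95th percentile range, with the scale bounded below by 1 so small returns are never blown up. EMA initialization at the very first update is omitted and the names are illustrative.

import numpy as np

def scale_value_targets(value_targets, ema_pct5, ema_pct95, decay=0.99):
    pct5 = np.percentile(value_targets, 5)
    pct95 = np.percentile(value_targets, 95)
    # Update the running EMAs of both percentiles (cf. `return_normalization_decay`).
    ema_pct5 = decay * ema_pct5 + (1.0 - decay) * pct5
    ema_pct95 = decay * ema_pct95 + (1.0 - decay) * pct95
    # Divide by the EMA'd range, but never scale returns up (lower bound of 1).
    scale = max(ema_pct95 - ema_pct5, 1.0)
    return value_targets / scale, ema_pct5, ema_pct95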
scaled_value_targets_t0_to_Hm1_B = self._compute_scaled_value_targets( module_id=module_id, hps=hps, value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT, - value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_BxT"][ - :-1 - ], + value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_B"][:-1], ) # Actions actually taken in the dream. - actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_BxT"])[ - :-1 - ] - actions_dreamed_dist_params_t0_to_Hm1_B = dream_data[ - "actions_dreamed_dist_params_t0_to_H_BxT" + actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_B"])[:-1] + dist_actions_t0_to_Hm1_B = dream_data[ + "actions_dreamed_distributions_t0_to_H_B" ][:-1] - dist_t0_to_Hm1_B = actor.get_action_dist_object( - actions_dreamed_dist_params_t0_to_Hm1_B - ) - # Compute log(p)s of all possible actions in the dream. if isinstance(self.module[module_id].actor.action_space, gym.spaces.Discrete): # Note that when we create the Categorical action distributions, we compute # unimix probs, then math.log these and provide these log(p) as "logits" to # the Categorical. So here, we'll continue to work with log(p)s (not # really "logits")! - logp_actions_t0_to_Hm1_B = actions_dreamed_dist_params_t0_to_Hm1_B - + logp_actions_t0_to_Hm1_B = tf.stack( + [dist.logits for dist in dist_actions_t0_to_Hm1_B], + axis=0, + ) # Log probs of actions actually taken in the dream. logp_actions_dreamed_t0_to_Hm1_B = tf.reduce_sum( actions_dreamed * logp_actions_t0_to_Hm1_B, @@ -554,18 +489,29 @@ def _compute_actor_loss( logp_loss_H_B = logp_actions_dreamed_t0_to_Hm1_B * tf.stop_gradient( scaled_value_targets_t0_to_Hm1_B ) - # Box space. - else: - logp_actions_dreamed_t0_to_Hm1_B = dist_t0_to_Hm1_B.log_prob( - actions_dreamed + elif isinstance(actor.action_space, gym.spaces.Box): + # TODO (Rohan138, Sven): Figure out how to vectorize this instead! + logp_actions_dreamed_t0_to_Hm1_B = tf.stack( + [ + dist.log_prob(actions_dreamed[i]) + for i, dist in enumerate(dist_actions_t0_to_Hm1_B) + ] ) # First term of loss function. [1] eq. 11. logp_loss_H_B = scaled_value_targets_t0_to_Hm1_B + else: + raise ValueError(f"Invalid action space: {actor.action_space}") assert len(logp_loss_H_B.shape) == 2 # Add entropy loss term (second term [1] eq. 11). - entropy_H_B = dist_t0_to_Hm1_B.entropy() + entropy_H_B = tf.stack( + [ + dist.entropy() + for dist in dream_data["actions_dreamed_distributions_t0_to_H_B"][:-1] + ], + axis=0, + ) assert len(entropy_H_B.shape) == 2 entropy = tf.reduce_mean(entropy_H_B) @@ -574,44 +520,31 @@ def _compute_actor_loss( L_actor_H_B = L_actor_reinforce_term_H_B + L_actor_action_entropy_term_H_B # Mask out everything that goes beyond a predicted continue=False boundary. - L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[ - :-1 - ] + L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_B"])[:-1] L_actor = tf.reduce_mean(L_actor_H_B) self.register_metrics( module_id, metrics_dict={ + "ACTOR_L_total_H_B": L_actor_H_B, "ACTOR_L_total": L_actor, + "ACTOR_logp_actions_dreamed_H_B": logp_actions_dreamed_t0_to_Hm1_B, + "ACTOR_scaled_value_targets_H_B": scaled_value_targets_t0_to_Hm1_B, "ACTOR_value_targets_pct95_ema": actor.ema_value_target_pct95, "ACTOR_value_targets_pct5_ema": actor.ema_value_target_pct5, + "ACTOR_action_entropy_H_B": entropy_H_B, "ACTOR_action_entropy": entropy, # Individual loss terms. 
+ "ACTOR_L_neglogp_reinforce_term_H_B": L_actor_reinforce_term_H_B, "ACTOR_L_neglogp_reinforce_term": tf.reduce_mean( L_actor_reinforce_term_H_B ), + "ACTOR_L_neg_entropy_term_H_B": L_actor_action_entropy_term_H_B, "ACTOR_L_neg_entropy_term": tf.reduce_mean( L_actor_action_entropy_term_H_B ), }, ) - if hps.report_individual_batch_item_stats: - self.register_metrics( - module_id, - metrics_dict={ - "ACTOR_L_total_H_BxT": L_actor_H_B, - "ACTOR_logp_actions_dreamed_H_BxT": ( - logp_actions_dreamed_t0_to_Hm1_B - ), - "ACTOR_scaled_value_targets_H_BxT": ( - scaled_value_targets_t0_to_Hm1_B - ), - "ACTOR_action_entropy_H_BxT": entropy_H_B, - # Individual loss terms. - "ACTOR_L_neglogp_reinforce_term_H_BxT": L_actor_reinforce_term_H_B, - "ACTOR_L_neg_entropy_term_H_BxT": L_actor_action_entropy_term_H_B, - }, - ) return L_actor @@ -619,7 +552,6 @@ def _compute_critic_loss( self, *, module_id: ModuleID, - hps: DreamerV3LearnerHyperparameters, dream_data: Dict[str, TensorType], value_targets_t0_to_Hm1_BxT: TensorType, ) -> TensorType: @@ -627,7 +559,6 @@ def _compute_critic_loss( Args: module_id: The ModuleID for which to compute the critic loss. - hps: The DreamerV3LearnerHyperparameters to use. dream_data: The data generated by dreaming for H steps (horizon) starting from any BxT state (sampled from the buffer for the train batch). value_targets_t0_to_Hm1_BxT: The computed value function targets of the @@ -636,8 +567,7 @@ def _compute_critic_loss( Returns: The total critic loss tensor. """ - # B=BxT - H, B = dream_data["rewards_dreamed_t0_to_H_BxT"].shape[:2] + H, B = dream_data["rewards_dreamed_t0_to_H_B"].shape[:2] Hm1 = H - 1 # Note that value targets are NOT symlog'd and go from t0 to H-1, not H, like @@ -656,7 +586,7 @@ def _compute_critic_loss( ) # Get (B x T x probs) tensor from return distributions. - value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxBxT"] + value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxB"] # Unfold time rank and cut last time index to match value targets. value_symlog_logits_t0_to_Hm1_B = tf.reshape( value_symlog_logits_HxB, @@ -678,7 +608,7 @@ def _compute_critic_loss( # Expected values (dreamed) from the EMA (slow critic) net. # Note: Slow critic (EMA) outputs are already stop_gradient'd. value_symlog_ema_t0_to_Hm1_B = tf.stop_gradient( - dream_data["v_symlog_dreamed_ema_t0_to_H_BxT"] + dream_data["v_symlog_dreamed_ema_t0_to_H_B"] )[:-1] # Fold time rank (for two_hot'ing). value_symlog_ema_HxB = tf.reshape(value_symlog_ema_t0_to_Hm1_B, (-1,)) @@ -704,7 +634,7 @@ def _compute_critic_loss( L_critic_H_B = value_loss_two_hot_H_B + ema_regularization_loss_H_B # Mask out everything that goes beyond a predicted continue=False boundary. - L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[ + L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_B"])[ :-1 ] @@ -714,29 +644,21 @@ def _compute_critic_loss( self.register_metrics( module_id=module_id, metrics_dict={ + # Symlog'd value targets. Critic learns to predict symlog'd values. + "VALUE_TARGETS_symlog_H_B": value_symlog_targets_t0_to_Hm1_B, + # Critic loss terms. 
"CRITIC_L_total": L_critic, + "CRITIC_L_total_H_B": L_critic_H_B, + "CRITIC_L_neg_logp_of_value_targets_H_B": value_loss_two_hot_H_B, "CRITIC_L_neg_logp_of_value_targets": tf.reduce_mean( value_loss_two_hot_H_B ), + "CRITIC_L_slow_critic_regularization_H_B": ema_regularization_loss_H_B, "CRITIC_L_slow_critic_regularization": tf.reduce_mean( ema_regularization_loss_H_B ), }, ) - if hps.report_individual_batch_item_stats: - self.register_metrics( - module_id=module_id, - metrics_dict={ - # Symlog'd value targets. Critic learns to predict symlog'd values. - "VALUE_TARGETS_symlog_H_BxT": value_symlog_targets_t0_to_Hm1_B, - # Critic loss terms. - "CRITIC_L_total_H_BxT": L_critic_H_B, - "CRITIC_L_neg_logp_of_value_targets_H_BxT": value_loss_two_hot_H_B, - "CRITIC_L_slow_critic_regularization_H_BxT": ( - ema_regularization_loss_H_B - ), - }, - ) return L_critic @@ -802,7 +724,7 @@ def _compute_value_targets( # intermediates.shape=[2-16, BxT] # Loop through reversed timesteps (axis=1) from T+1 to t=2. - for t in reversed(range(discount.shape[0])): + for t in reversed(range(len(discount))): Rs.append(intermediates[t] + discount[t] * hps.gae_lambda * Rs[-1]) # Reverse along time axis and cut the last entry (value estimate at very end @@ -845,32 +767,21 @@ def _compute_scaled_value_targets( Per_R_5 = tfp.stats.percentile(value_targets_H_B, 5) Per_R_95 = tfp.stats.percentile(value_targets_H_B, 95) - # Update EMA values for 5 and 95 percentile, stored as tf variables under actor - # network. - # 5 percentile - new_val_pct5 = tf.where( - tf.math.is_nan(actor.ema_value_target_pct5), - # is NaN: Initial values: Just set. - Per_R_5, - # Later update (something already stored in EMA variable): Update EMA. - ( + # Update EMAs stored in actor network. + # Initial values: Just set. + if tf.math.is_nan(actor.ema_value_target_pct5): + actor.ema_value_target_pct5.assign(Per_R_5) + actor.ema_value_target_pct95.assign(Per_R_95) + # Later update (something already stored in EMA variable): Update EMA. + else: + actor.ema_value_target_pct5.assign( hps.return_normalization_decay * actor.ema_value_target_pct5 + (1.0 - hps.return_normalization_decay) * Per_R_5 - ), - ) - actor.ema_value_target_pct5.assign(new_val_pct5) - # 95 percentile - new_val_pct95 = tf.where( - tf.math.is_nan(actor.ema_value_target_pct95), - # is NaN: Initial values: Just set. - Per_R_95, - # Later update (something already stored in EMA variable): Update EMA. - ( + ) + actor.ema_value_target_pct95.assign( hps.return_normalization_decay * actor.ema_value_target_pct95 + (1.0 - hps.return_normalization_decay) * Per_R_95 - ), - ) - actor.ema_value_target_pct95.assign(new_val_pct95) + ) # [1] eq. 11 (first term). # Danijar's code: TODO: describe ... diff --git a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py index 77c4c285b21ba..0cb088e60fd95 100644 --- a/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py +++ b/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py @@ -1,12 +1,3 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" from typing import Mapping, Any from ray.rllib.algorithms.dreamerv3.dreamerv3_rl_module import DreamerV3RLModule diff --git a/rllib/algorithms/dreamerv3/tf/models/actor_network.py b/rllib/algorithms/dreamerv3/tf/models/actor_network.py index d865f85606a3a..f22617960b0a8 100644 --- a/rllib/algorithms/dreamerv3/tf/models/actor_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/actor_network.py @@ -8,12 +8,10 @@ import gymnasium as gym from gymnasium.spaces import Box, Discrete import numpy as np +import tensorflow as tf +import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP -from ray.rllib.utils.framework import try_import_tf, try_import_tfp - -_, tf, _ = try_import_tf() -tfp = try_import_tfp() class ActorNetwork(tf.keras.Model): @@ -30,19 +28,19 @@ class ActorNetwork(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", action_space: gym.Space, ): """Initializes an ActorNetwork instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. """ super().__init__(name="actor") - self.model_size = model_size + self.model_dimension = model_dimension self.action_space = action_space # The EMA decay variables used for the [Percentile(R, 95%) - Percentile(R, 5%)] @@ -57,23 +55,20 @@ def __init__( # For discrete actions, use a single MLP that computes logits. if isinstance(self.action_space, Discrete): self.mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=self.action_space.n, name="actor_mlp", ) # For cont. actions, use separate MLPs for Gaussian mean and stddev. - # TODO (sven): In the author's original code repo, this is NOT the case, - # inputs are pushed through a shared MLP, then only the two output linear - # layers are separate for std- and mean logits. elif isinstance(action_space, Box): output_layer_size = np.prod(action_space.shape) self.mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=output_layer_size, name="actor_mlp_mean", ) self.std_mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=output_layer_size, name="actor_mlp_std", ) @@ -81,15 +76,15 @@ def __init__( raise ValueError(f"Invalid action space: {action_space}") @tf.function - def call(self, h, z, return_distr_params=False): + def call(self, h, z, return_distribution=False): """Performs a forward pass through this policy network. Args: h: The deterministic hidden state of the sequence model. [B, dim(h)]. z: The stochastic discrete representations of the original observation input. [B, num_categoricals, num_classes]. - return_distr_params: Whether to return (as a second tuple item) the action - distribution parameter tensor created by the policy. + return_distribution: Whether to return (as a second tuple item) the action + distribution object created by the policy. """ # Flatten last two dims of z. assert len(z.shape) == 3 @@ -114,10 +109,8 @@ def call(self, h, z, return_distr_params=False): # Danijar's code does: distr = [Distr class](logits=tf.log(probs)). # Not sure why we don't directly use the already available probs instead. 
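# --- Illustrative aside on log(p)-as-logits and straight-through sampling ----
# Because the "logits" handed to the OneHotCategorical are literally log(probs)
# of the unimix'd action probabilities, softmax recovers those probabilities
# exactly, which is why the learner can treat the stored parameters as log(p)
# directly. The straight-through trick used a few lines below keeps the forward
# value a hard one-hot sample while gradients flow through the probabilities.
# The 1% uniform-mixture weight is the value reported in [1]; shapes here are
# assumptions for the example only.
import tensorflow as tf
import tensorflow_probability as tfp

net_probs = tf.constant([[0.9, 0.08, 0.02]])                   # actor output, (B, |A|)
action_probs = 0.99 * net_probs + 0.01 / net_probs.shape[-1]   # "unimix" probs
action_logits = tf.math.log(action_probs)                      # log(p), used as "logits"

distr = tfp.distributions.OneHotCategorical(logits=action_logits)
# softmax(log(p)) == p, so the distribution's probs equal `action_probs`.
sample = tf.cast(distr.sample(), tf.float32)
# Straight-through: forward value is the hard sample; the gradient w.r.t. the
# actor parameters is the gradient of `action_probs`.
action = tf.stop_gradient(sample) + (action_probs - tf.stop_gradient(action_probs))
# ----------------------------------------------------------------------------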
action_logits = tf.math.log(action_probs) - - # Distribution parameters are the log(probs) directly. - distr_params = action_logits - distr = self.get_action_dist_object(distr_params) + # Create the distribution object using the unimix'd logits. + distr = tfp.distributions.OneHotCategorical(logits=action_logits) action = tf.cast(tf.stop_gradient(distr.sample()), tf.float32) + ( action_probs - tf.stop_gradient(action_probs) @@ -129,48 +122,15 @@ def call(self, h, z, return_distr_params=False): # minstd, maxstd taken from [1] from configs.yaml minstd = 0.1 maxstd = 1.0 - - # Distribution parameters are the squashed std_logits and the tanh'd - # mean logits. # squash std_logits from (-inf, inf) to (minstd, maxstd) std_logits = (maxstd - minstd) * tf.sigmoid(std_logits + 2.0) + minstd - mean_logits = tf.tanh(action_logits) - - distr_params = tf.concat([mean_logits, std_logits], axis=-1) - distr = self.get_action_dist_object(distr_params) - - action = distr.sample() - - if return_distr_params: - return action, distr_params - return action - - def get_action_dist_object(self, action_dist_params_T_B): - """Helper method to create an action distribution object from (T, B, ..) params. - - Args: - action_dist_params_T_B: The time-major action distribution parameters. - This could be simply the logits (discrete) or a to-be-split-in-2 - tensor for mean and stddev (continuous). - - Returns: - The tfp action distribution object, from which one can sample, compute - log probs, entropy, etc.. - """ - if isinstance(self.action_space, gym.spaces.Discrete): - # Create the distribution object using the unimix'd logits. - distr = tfp.distributions.OneHotCategorical(logits=action_dist_params_T_B) - - elif isinstance(self.action_space, gym.spaces.Box): # Compute Normal distribution from action_logits and std_logits - loc, scale = tf.split(action_dist_params_T_B, 2, axis=-1) - distr = tfp.distributions.Normal(loc=loc, scale=scale) - + distr = tfp.distributions.Normal(tf.tanh(action_logits), std_logits) # If action_space is a box with multiple dims, make individual dims # independent. distr = tfp.distributions.Independent(distr, len(self.action_space.shape)) + action = distr.sample() - else: - raise ValueError(f"Action space {self.action_space} not supported!") - - return distr + if return_distribution: + return action, distr + return action diff --git a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py index 0700240f1bf8c..ba9ec38a0fa55 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py @@ -5,10 +5,9 @@ """ from typing import Optional -from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier -from ray.rllib.utils.framework import try_import_tf +import tensorflow as tf -_, tf, _ = try_import_tf() +from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier class CNNAtari(tf.keras.Model): @@ -17,13 +16,13 @@ class CNNAtari(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", cnn_multiplier: Optional[int] = None, ): """Initializes a CNNAtari instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN layer. 
Starting with @@ -33,7 +32,7 @@ def __init__( """ super().__init__(name="image_encoder") - cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier) + cnn_multiplier = get_cnn_multiplier(model_dimension, override=cnn_multiplier) # See appendix C in [1]: # "We use a similar network architecture but employ layer normalization and diff --git a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py index a23ddca856c87..41031c950e11b 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py @@ -5,11 +5,10 @@ """ from typing import Optional -from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP -from ray.rllib.utils.framework import try_import_tf, try_import_tfp +import tensorflow as tf +import tensorflow_probability as tfp -_, tf, _ = try_import_tf() -tfp = try_import_tfp() +from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP class ContinuePredictor(tf.keras.Model): @@ -24,15 +23,15 @@ class ContinuePredictor(tf.keras.Model): terminal. """ - def __init__(self, *, model_size: Optional[str] = "XS"): + def __init__(self, *, model_dimension: Optional[str] = "XS"): """Initializes a ContinuePredictor instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. """ super().__init__(name="continue_predictor") - self.mlp = MLP(model_size=model_size, output_layer_size=1) + self.mlp = MLP(model_dimension=model_dimension, output_layer_size=1) def call(self, h, z, return_distribution=False): """Performs a forward pass through the continue predictor. diff --git a/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py b/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py index ebc8649ccd79b..cffa73adb8029 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py @@ -10,11 +10,10 @@ from typing import Optional import numpy as np +import tensorflow as tf +import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class ConvTransposeAtari(tf.keras.Model): @@ -29,14 +28,14 @@ class ConvTransposeAtari(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", cnn_multiplier: Optional[int] = None, gray_scaled: bool, ): """Initializes a ConvTransposeAtari instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the `cnn_multiplier`. cnn_multiplier: Optional override for the additional factor used to multiply the number of filters with each CNN transpose layer. Starting with @@ -48,7 +47,7 @@ def __init__( """ super().__init__(name="image_decoder") - cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier) + cnn_multiplier = get_cnn_multiplier(model_dimension, override=cnn_multiplier) # The shape going into the first Conv2DTranspose layer. # We start with a 4x4 channels=8 "image". 
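# --- Illustrative aside on the unit-variance Gaussian image decoder ----------
# With the scale fixed to 1.0 (see the hunk below and [2]), the negative
# log-likelihood of the observations equals half the summed squared error on
# the mean plus a constant, so a plain MSE on `loc` yields the same gradients.
# Shapes below are assumptions for the example only.
import tensorflow as tf
import tensorflow_probability as tfp

loc = tf.random.normal([4, 64 * 64 * 3])   # decoder means, (B, flattened obs dims)
obs = tf.random.normal([4, 64 * 64 * 3])   # observations, same shape

distr = tfp.distributions.MultivariateNormalDiag(loc=loc, scale_diag=tf.ones_like(loc))
neg_logp = -distr.log_prob(obs)                                      # (B,)
half_sse = 0.5 * tf.reduce_sum(tf.math.square(loc - obs), axis=-1)   # (B,)
# neg_logp == half_sse + 0.5 * D * ln(2*pi) with D = 64*64*3; the additive
# constant does not affect gradients w.r.t. `loc`.
# ----------------------------------------------------------------------------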
@@ -147,9 +146,15 @@ def call(self, h, z): # From [2]: # "Distributions: The image predictor outputs the mean of a diagonal Gaussian # likelihood with unit variance, ..." - # Reshape `out` for the diagonal multi-variate Gaussian (each pixel is its own # independent (b/c diagonal co-variance matrix) variable). loc = tf.reshape(out, shape=(out_shape[0], -1)) - - return loc + distribution = tfp.distributions.MultivariateNormalDiag( + loc=loc, + # Scale == 1.0. + # [2]: "Distributions The image predictor outputs the mean of a diagonal + # Gaussian likelihood with **unit variance** ..." + scale_diag=tf.ones_like(loc), + ) + pred_obs = distribution.sample() + return pred_obs, distribution diff --git a/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py index 559009a44531f..fc69c8dd33f9c 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/dynamics_predictor.py @@ -5,13 +5,12 @@ """ from typing import Optional +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import ( RepresentationLayer, ) -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class DynamicsPredictor(tf.keras.Model): @@ -27,17 +26,17 @@ class DynamicsPredictor(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_categoricals: Optional[int] = None, num_classes_per_categorical: Optional[int] = None, ): """Initializes a DynamicsPredictor instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different parameters. num_categoricals: Overrides the number of categoricals used in the z-states. - In [1], 32 is used for any model size. + In [1], 32 is used for any model dimension. num_classes_per_categorical: Overrides the number of classes within each categorical used for the z-states. In [1], 32 is used for any model dimension. @@ -48,12 +47,12 @@ def __init__( # TODO: In Danijar's code, the Dynamics Net only has a single layer, no # matter the model size. num_dense_layers=1, - model_size=model_size, + model_dimension=model_dimension, output_layer_size=None, ) # The (prior) z-state generating layer. self.representation_layer = RepresentationLayer( - model_size=model_size, + model_dimension=model_dimension, num_categoricals=num_categoricals, num_classes_per_categorical=num_classes_per_categorical, ) diff --git a/rllib/algorithms/dreamerv3/tf/models/components/mlp.py b/rllib/algorithms/dreamerv3/tf/models/components/mlp.py index 435d9f8544ab3..30d4a7713ee1a 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/mlp.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/mlp.py @@ -9,13 +9,12 @@ """ from typing import Optional +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.utils import ( get_dense_hidden_units, get_num_dense_layers, ) -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class MLP(tf.keras.Model): @@ -23,13 +22,13 @@ class MLP(tf.keras.Model): MLP=multi-layer perceptron. - See Appendix B in [1] for the MLP sizes depending on the given `model_size`. + See Appendix B in [1] for the MLP sizes depending on the given `model_dimension`. 
""" def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_dense_layers: Optional[int] = None, dense_hidden_units: Optional[int] = None, output_layer_size=None, @@ -39,12 +38,12 @@ def __init__( """Initializes an MLP instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. num_dense_layers: The number of hidden layers in the MLP. If None, - will use `model_size` and appendix B to figure out this value. + will use `model_dimension` and appendix B to figure out this value. dense_hidden_units: The number of nodes in each hidden layer. If None, - will use `model_size` and appendix B to figure out this value. + will use `model_dimension` and appendix B to figure out this value. output_layer_size: The size of an optional linear (no activation) output layer. If None, no output layer will be added on top of the MLP dense stack. @@ -53,9 +52,11 @@ def __init__( """ super().__init__(name=name or "mlp") - num_dense_layers = get_num_dense_layers(model_size, override=num_dense_layers) + num_dense_layers = get_num_dense_layers( + model_dimension, override=num_dense_layers + ) dense_hidden_units = get_dense_hidden_units( - model_size, override=dense_hidden_units + model_dimension, override=dense_hidden_units ) self.dense_layers = [] diff --git a/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py b/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py index cf6b27b3c68ff..36e2ace631844 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/representation_layer.py @@ -9,14 +9,13 @@ """ from typing import Optional +import tensorflow as tf +import tensorflow_probability as tfp + from ray.rllib.algorithms.dreamerv3.utils import ( get_num_z_categoricals, get_num_z_classes, ) -from ray.rllib.utils.framework import try_import_tf, try_import_tfp - -_, tf, _ = try_import_tf() -tfp = try_import_tfp() class RepresentationLayer(tf.keras.layers.Layer): @@ -30,26 +29,26 @@ class RepresentationLayer(tf.keras.layers.Layer): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_categoricals: Optional[int] = None, num_classes_per_categorical: Optional[int] = None, ): """Initializes a RepresentationLayer instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different parameters. num_categoricals: Overrides the number of categoricals used in the z-states. - In [1], 32 is used for any model size. + In [1], 32 is used for any model dimension. num_classes_per_categorical: Overrides the number of classes within each categorical used for the z-states. In [1], 32 is used for any model dimension. 
""" self.num_categoricals = get_num_z_categoricals( - model_size, override=num_categoricals + model_dimension, override=num_categoricals ) self.num_classes_per_categorical = get_num_z_classes( - model_size, override=num_classes_per_categorical + model_dimension, override=num_classes_per_categorical ) super().__init__( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py index c8ce0fc260fd6..7af29664c6024 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor.py @@ -5,13 +5,12 @@ """ from typing import Optional +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, ) -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class RewardPredictor(tf.keras.Model): @@ -23,7 +22,7 @@ class RewardPredictor(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, @@ -31,7 +30,7 @@ def __init__( """Initializes a RewardPredictor instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. num_buckets: The number of buckets to create. Note that the number of possible symlog'd outcomes from the used distribution is @@ -52,7 +51,7 @@ def __init__( super().__init__(name="reward_predictor") self.mlp = MLP( - model_size=model_size, + model_dimension=model_dimension, output_layer_size=None, ) self.reward_layer = RewardPredictorLayer( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py index 185098b15b2bc..f9c92e92e7279 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py @@ -7,9 +7,7 @@ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba https://arxiv.org/pdf/2010.02193.pdf """ -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() +import tensorflow as tf class RewardPredictorLayer(tf.keras.layers.Layer): @@ -17,7 +15,7 @@ class RewardPredictorLayer(tf.keras.layers.Layer): This layer is used in two models in DreamerV3: The reward predictor of the world model and the value function. K is 255 by default (see [1]) and doesn't change - with the model size. + with the model dimension. Possible predicted reward/values range from symexp(-20.0) to symexp(20.0), which should cover any possible environment. 
Outputs of this layer are generated by diff --git a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py index d8ee68499625a..5f1d02f539ed8 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py @@ -6,12 +6,10 @@ from typing import Optional import gymnasium as gym +import tensorflow as tf from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.utils import get_gru_units -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class SequenceModel(tf.keras.Model): @@ -39,23 +37,23 @@ class SequenceModel(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", action_space: gym.Space, num_gru_units: Optional[int] = None, ): """Initializes a SequenceModel instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the number of GRU units used. action_space: The action space the our environment used. num_gru_units: Overrides the number of GRU units (dimension of the h-state). - If None, use the value given through `model_size` + If None, use the value given through `model_dimension` (see [1] Appendix B). """ super().__init__(name="sequence_model") - num_gru_units = get_gru_units(model_size, override=num_gru_units) + num_gru_units = get_gru_units(model_dimension, override=num_gru_units) self.action_space = action_space # In Danijar's code, there is an additional layer (units=[model_size]) @@ -63,7 +61,7 @@ def __init__( # the paper. self.pre_gru_layer = MLP( num_dense_layers=1, - model_size=model_size, + model_dimension=model_dimension, output_layer_size=None, ) self.gru_unit = tf.keras.layers.GRU( diff --git a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py index bcfdb164e6d0a..08dadaf6494d4 100644 --- a/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py +++ b/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py @@ -6,11 +6,10 @@ from typing import Optional import gymnasium as gym +import tensorflow as tf +import tensorflow_probability as tfp from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class VectorDecoder(tf.keras.Model): @@ -23,13 +22,13 @@ class VectorDecoder(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", observation_space: gym.Space, ): """Initializes a VectorDecoder instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Determines the exact size of the underlying MLP. observation_space: The observation space to decode back into. This must be a Box of shape (d,), where d >= 1. @@ -42,7 +41,7 @@ def __init__( ) self.mlp = MLP( - model_size=model_size, + model_dimension=model_dimension, output_layer_size=observation_space.shape[0], ) @@ -63,5 +62,13 @@ def call(self, h, z): # Send h-cat-z through MLP to get mean values of diag gaussian. loc = self.mlp(out) - # Return only the predicted observations (mean, no sample). - return loc + # Create the Gaussian diag distribution. 
+ distribution = tfp.distributions.MultivariateNormalDiag( + loc=loc, + # Scale == 1.0. + scale_diag=tf.ones_like(loc), + ) + pred_obs = distribution.sample() + + # Always return both predicted observations (sample0 and distribution. + return pred_obs, distribution diff --git a/rllib/algorithms/dreamerv3/tf/models/critic_network.py b/rllib/algorithms/dreamerv3/tf/models/critic_network.py index d40441e585baf..837ca68ccfdcf 100644 --- a/rllib/algorithms/dreamerv3/tf/models/critic_network.py +++ b/rllib/algorithms/dreamerv3/tf/models/critic_network.py @@ -5,13 +5,12 @@ """ from typing import Optional +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( RewardPredictorLayer, ) -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() class CriticNetwork(tf.keras.Model): @@ -28,7 +27,7 @@ class CriticNetwork(tf.keras.Model): def __init__( self, *, - model_size: Optional[str] = "XS", + model_dimension: Optional[str] = "XS", num_buckets: int = 255, lower_bound: float = -20.0, upper_bound: float = 20.0, @@ -37,7 +36,7 @@ def __init__( """Initializes a CriticNetwork instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. num_buckets: The number of buckets to create. Note that the number of possible symlog'd outcomes from the used distribution is @@ -64,7 +63,7 @@ def __init__( """ super().__init__(name="critic") - self.model_size = model_size + self.model_dimension = model_dimension self.ema_decay = ema_decay # "Fast" critic network(s) (mlp + reward-pred-layer). This is the network @@ -73,7 +72,7 @@ def __init__( # the critic loss term such that the weights of this fast critic stay close # to the EMA weights (see below). self.mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=None, ) self.return_layer = RewardPredictorLayer( @@ -86,7 +85,7 @@ def __init__( # target net, BUT not used to compute anything, just for the # weights regularizer term inside the critic loss). 
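# --- Illustrative aside on maintaining the slow ("EMA") critic copy ----------
# A minimal sketch of how the EMA weights referenced in the comments above are
# typically tracked. The helper name and call site are assumptions for
# illustration; only `ema_decay` comes from this class.
import tensorflow as tf

def update_ema_weights(
    fast_net: tf.keras.Model, ema_net: tf.keras.Model, ema_decay: float
) -> None:
    # After each critic update: w_ema <- decay * w_ema + (1 - decay) * w_fast.
    for w_fast, w_ema in zip(fast_net.variables, ema_net.variables):
        w_ema.assign(ema_decay * w_ema + (1.0 - ema_decay) * w_fast)
# ----------------------------------------------------------------------------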
self.mlp_ema = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=None, trainable=False, ) diff --git a/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py b/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py index 1a6f95245e302..d186fdcd39eba 100644 --- a/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py +++ b/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py @@ -4,14 +4,12 @@ https://arxiv.org/pdf/2301.04104v1.pdf """ +import tensorflow as tf + from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import ( RepresentationLayer, ) -from ray.rllib.utils.framework import try_import_tf, try_import_tfp - -_, tf, _ = try_import_tf() -tfp = try_import_tfp() class DisagreeNetworks(tf.keras.Model): @@ -23,10 +21,10 @@ class DisagreeNetworks(tf.keras.Model): TODO """ - def __init__(self, *, num_networks, model_size, intrinsic_rewards_scale): + def __init__(self, *, num_networks, model_dimension, intrinsic_rewards_scale): super().__init__(name="disagree_networks") - self.model_size = model_size + self.model_dimension = model_dimension self.num_networks = num_networks self.intrinsic_rewards_scale = intrinsic_rewards_scale @@ -36,13 +34,15 @@ def __init__(self, *, num_networks, model_size, intrinsic_rewards_scale): for _ in range(self.num_networks): self.mlps.append( MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=None, trainable=True, ) ) self.representation_layers.append( - RepresentationLayer(model_size=self.model_size, name="disagree") + RepresentationLayer( + model_dimension=self.model_dimension, name="disagree" + ) ) @tf.function diff --git a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py index f735b9e031ea3..9621c95ce3c22 100644 --- a/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py @@ -7,25 +7,20 @@ import gymnasium as gym import numpy as np +import tensorflow as tf from ray.rllib.algorithms.dreamerv3.tf.models.disagree_networks import DisagreeNetworks -from ray.rllib.algorithms.dreamerv3.tf.models.actor_network import ActorNetwork -from ray.rllib.algorithms.dreamerv3.tf.models.critic_network import CriticNetwork -from ray.rllib.algorithms.dreamerv3.tf.models.world_model import WorldModel -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import inverse_symlog -_, tf, _ = try_import_tf() +from ray.rllib.utils.tf_utils import inverse_symlog class DreamerModel(tf.keras.Model): """The main tf-keras model containing all necessary components for DreamerV3. Includes: - - The world model with encoder, decoder, sequence-model (RSSM), dynamics - (generates prior z-state), and "posterior" model (generates posterior z-state). - Predicts env dynamics and produces dreamed trajectories for actor- and critic - learning. + - The world model (with encoder, decoder, sequence-model (RSSM), dynamics + (prior z-state generating) model, and "posterior" model) for producing dreamed + trajectories. - The actor network (policy). - The critic network for value function prediction. 
""" @@ -33,29 +28,32 @@ class DreamerModel(tf.keras.Model): def __init__( self, *, - model_size: str = "XS", + model_dimension: str = "XS", action_space: gym.Space, - world_model: WorldModel, - actor: ActorNetwork, - critic: CriticNetwork, + batch_size_B, + batch_length_T, + horizon_H, + world_model, + actor, + critic, use_curiosity: bool = False, intrinsic_rewards_scale: float = 0.1, ): - """Initializes a DreamerModel instance. + """TODO Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. - world_model: The WorldModel component. - actor: The ActorNetwork component. - critic: The CriticNetwork component. """ super().__init__(name="dreamer_model") - self.model_size = model_size + self.model_dimension = model_dimension self.action_space = action_space self.use_curiosity = use_curiosity + self.batch_size_B = batch_size_B + self.batch_length_T = batch_length_T + self.horizon_H = horizon_H self.world_model = world_model self.actor = actor @@ -65,7 +63,7 @@ def __init__( if self.use_curiosity: self.disagree_nets = DisagreeNetworks( num_networks=8, - model_size=self.model_size, + model_dimension=self.model_dimension, intrinsic_rewards_scale=intrinsic_rewards_scale, ) @@ -99,11 +97,11 @@ def call( actions = self.actor( h=results["h_states_BxT"], z=results["z_posterior_states_BxT"] ) - # Actor (with returning distribution parameters). - _, distr_params = self.actor( + # Actor (with returning distribution). + _, distr = self.actor( h=results["h_states_BxT"], z=results["z_posterior_states_BxT"], - return_distr_params=True, + return_distribution=True, ) # Critic. values = self.critic( @@ -157,11 +155,8 @@ def forward_inference(self, observations, previous_states, is_first, training=No is_first=is_first, ) # Compute action using our actor network and the current states. - _, distr_params = self.actor( - h=states["h"], z=states["z"], return_distr_params=True - ) + _, distr = self.actor(h=states["h"], z=states["z"], return_distribution=True) # Use the mode of the distribution (Discrete=argmax, Normal=mean). - distr = self.actor.get_action_dist_object(distr_params) actions = distr.mode() return actions, {"h": states["h"], "z": states["z"], "a": actions} @@ -272,9 +267,9 @@ def dream_trajectory( timesteps_H: The number of timesteps to dream for. gamma: The discount factor gamma. """ - # Dreamed actions (one-hot encoded for discrete actions). + # Dreamed actions (one-hot for discrete actions). a_dreamed_t0_to_H = [] - a_dreamed_dist_params_t0_to_H = [] + a_dreamed_distributions_t0_to_H = [] h = start_states["h"] z = start_states["z"] @@ -286,7 +281,7 @@ def dream_trajectory( # Compute `a` using actor network (already the first step uses a dreamed action, # not a sampled one). - a, a_dist_params = self.actor( + a, a_dist = self.actor( # We have to stop the gradients through the states. B/c we are using a # differentiable Discrete action distribution (straight through gradients # with `a = stop_gradient(sample(probs)) + probs - stop_gradient(probs)`, @@ -294,10 +289,10 @@ def dream_trajectory( # term on actions further back in the trajectory. 
h=tf.stop_gradient(h), z=tf.stop_gradient(z), - return_distr_params=True, + return_distribution=True, ) a_dreamed_t0_to_H.append(a) - a_dreamed_dist_params_t0_to_H.append(a_dist_params) + a_dreamed_distributions_t0_to_H.append(a_dist) for i in range(timesteps_H): # Move one step in the dream using the RSSM. @@ -309,13 +304,13 @@ def dream_trajectory( z_states_prior_t0_to_H.append(z) # Compute `a` using actor network. - a, a_dist_params = self.actor( + a, a_dist = self.actor( h=tf.stop_gradient(h), z=tf.stop_gradient(z), - return_distr_params=True, + return_distribution=True, ) a_dreamed_t0_to_H.append(a) - a_dreamed_dist_params_t0_to_H.append(a_dist_params) + a_dreamed_distributions_t0_to_H.append(a_dist) h_states_H_B = tf.stack(h_states_t0_to_H, axis=0) # (T, B, ...) h_states_HxB = tf.reshape(h_states_H_B, [-1] + h_states_H_B.shape.as_list()[2:]) @@ -326,7 +321,6 @@ def dream_trajectory( ) a_dreamed_H_B = tf.stack(a_dreamed_t0_to_H, axis=0) # (T, B, ...) - a_dreamed_dist_params_H_B = tf.stack(a_dreamed_dist_params_t0_to_H, axis=0) # Compute r using reward predictor. r_dreamed_H_B = tf.reshape( @@ -395,20 +389,17 @@ def dream_trajectory( ) ret = { - "h_states_t0_to_H_BxT": h_states_H_B, - "z_states_prior_t0_to_H_BxT": z_states_prior_H_B, - "rewards_dreamed_t0_to_H_BxT": r_dreamed_H_B, - "continues_dreamed_t0_to_H_BxT": c_dreamed_H_B, - "actions_dreamed_t0_to_H_BxT": a_dreamed_H_B, - # "actions_dreamed_distributions_t0_to_H_BxT": ( - # a_dreamed_distributions_t0_to_H - # ), - "actions_dreamed_dist_params_t0_to_H_BxT": a_dreamed_dist_params_H_B, - "values_dreamed_t0_to_H_BxT": v_dreamed_H_B, - "values_symlog_dreamed_logits_t0_to_HxBxT": v_symlog_dreamed_logits_HxB, - "v_symlog_dreamed_ema_t0_to_H_BxT": v_symlog_dreamed_ema_H_B, + "h_states_t0_to_H_B": h_states_H_B, + "z_states_prior_t0_to_H_B": z_states_prior_H_B, + "rewards_dreamed_t0_to_H_B": r_dreamed_H_B, + "continues_dreamed_t0_to_H_B": c_dreamed_H_B, + "actions_dreamed_t0_to_H_B": a_dreamed_H_B, + "actions_dreamed_distributions_t0_to_H_B": a_dreamed_distributions_t0_to_H, + "values_dreamed_t0_to_H_B": v_dreamed_H_B, + "values_symlog_dreamed_logits_t0_to_HxB": v_symlog_dreamed_logits_HxB, + "v_symlog_dreamed_ema_t0_to_H_B": v_symlog_dreamed_ema_H_B, # Loss weights for critic- and actor losses. - "dream_loss_weights_t0_to_H_BxT": dream_loss_weights_H_B, + "dream_loss_weights_t0_to_H_B": dream_loss_weights_H_B, } if self.use_curiosity: @@ -546,20 +537,20 @@ def dream_trajectory_with_burn_in( # an original time dimension from the real env, from all of which we then branch # out our dream trajectories). ret = { - "h_states_t0_to_H_BxT": h_states_t0_to_H_B, - "z_states_prior_t0_to_H_BxT": z_states_prior_t0_to_H_B, + "h_states_t0_to_H_B": h_states_t0_to_H_B, + "z_states_prior_t0_to_H_B": z_states_prior_t0_to_H_B, # Unfold time-ranks in predictions. - "rewards_dreamed_t0_to_H_BxT": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)), - "continues_dreamed_t0_to_H_BxT": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)), + "rewards_dreamed_t0_to_H_B": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)), + "continues_dreamed_t0_to_H_B": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)), } # Figure out action key (random, sampled from env, dreamed?). 
if use_sampled_actions_in_dream: - key = "actions_sampled_t0_to_H_BxT" + key = "actions_sampled_t0_to_H_B" elif use_random_actions_in_dream: - key = "actions_random_t0_to_H_BxT" + key = "actions_random_t0_to_H_B" else: - key = "actions_dreamed_t0_to_H_BxT" + key = "actions_dreamed_t0_to_H_B" ret[key] = a_t0_to_H_B # Also provide int-actions, if discrete action space. diff --git a/rllib/algorithms/dreamerv3/tf/models/world_model.py b/rllib/algorithms/dreamerv3/tf/models/world_model.py index 73195fc8e1a0b..39fa3e587d6ef 100644 --- a/rllib/algorithms/dreamerv3/tf/models/world_model.py +++ b/rllib/algorithms/dreamerv3/tf/models/world_model.py @@ -6,6 +6,7 @@ from typing import Optional import gymnasium as gym +import tensorflow as tf import tree # pip install dm_tree from ray.rllib.algorithms.dreamerv3.tf.models.components.continue_predictor import ( @@ -25,13 +26,9 @@ SequenceModel, ) from ray.rllib.algorithms.dreamerv3.utils import get_gru_units -from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.tf_utils import symlog -_, tf, _ = try_import_tf() - - class WorldModel(tf.keras.Model): """WorldModel component of [1] w/ encoder, decoder, RSSM, reward/cont. predictors. @@ -59,7 +56,7 @@ class WorldModel(tf.keras.Model): def __init__( self, *, - model_size: str = "XS", + model_dimension: str = "XS", action_space: gym.Space, batch_length_T: int = 64, encoder: tf.keras.Model, @@ -70,7 +67,7 @@ def __init__( """Initializes a WorldModel instance. Args: - model_size: The "Model Size" used according to [1] Appendinx B. + model_dimension: The "Model Size" used according to [1] Appendinx B. Use None for manually setting the different network sizes. action_space: The action space the our environment used. batch_length_T: The length (T) of the sequences used for training. The @@ -90,7 +87,7 @@ def __init__( the last decoder layer produces the exact, normalized pixel values (not a Gaussian as described in [1]!). num_gru_units: The number of GRU units to use. If None, use - `model_size` to figure out this parameter. + `model_dimension` to figure out this parameter. symlog_obs: Whether to predict decoded observations in symlog space. This should be False for image based observations. According to the paper [1] Appendix E: "NoObsSymlog: This ablation @@ -101,7 +98,7 @@ def __init__( """ super().__init__(name="world_model") - self.model_size = model_size + self.model_dimension = model_dimension self.batch_length_T = batch_length_T self.symlog_obs = symlog_obs self.action_space = action_space @@ -112,7 +109,7 @@ def __init__( # Posterior predictor consisting of an MLP and a RepresentationLayer: # [ht, lt] -> zt. self.posterior_mlp = MLP( - model_size=self.model_size, + model_dimension=self.model_dimension, output_layer_size=None, # In Danijar's code, the posterior predictor only has a single layer, # no matter the model size: @@ -121,15 +118,17 @@ def __init__( ) # The (posterior) z-state generating layer. self.posterior_representation_layer = RepresentationLayer( - model_size=self.model_size, + model_dimension=self.model_dimension, ) # Dynamics (prior z-state) predictor: ht -> z^t - self.dynamics_predictor = DynamicsPredictor(model_size=self.model_size) + self.dynamics_predictor = DynamicsPredictor( + model_dimension=self.model_dimension + ) # GRU for the RSSM: [at, ht, zt] -> ht+1 self.num_gru_units = get_gru_units( - model_size=self.model_size, + model_dimension=self.model_dimension, override=num_gru_units, ) # Initial h-state variable (learnt). 
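# --- Illustrative aside on the symlog transform imported above ---------------
# symlog squashes targets of very different magnitudes into a small, symmetric
# range so one fixed output head can handle rewards, values, and observations
# across environments ([1]). A sketch of the formula; the RLlib helpers
# (ray.rllib.utils.tf_utils.symlog / inverse_symlog) are assumed to implement
# the same definition.
import tensorflow as tf

def symlog(x: tf.Tensor) -> tf.Tensor:
    # sign(x) * ln(|x| + 1): near-identity around 0, logarithmic for large |x|.
    return tf.math.sign(x) * tf.math.log(tf.math.abs(x) + 1.0)

def inverse_symlog(y: tf.Tensor) -> tf.Tensor:
    # Exact inverse ("symexp"): sign(y) * (exp(|y|) - 1).
    return tf.math.sign(y) * (tf.math.exp(tf.math.abs(y)) - 1.0)

# E.g. symlog(1000.) ~= 6.9, symlog(-1000.) ~= -6.9, inverse_symlog(symlog(x)) == x.
# ----------------------------------------------------------------------------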
@@ -143,15 +142,17 @@ def __init__( ) # The actual sequence model containing the GRU layer. self.sequence_model = SequenceModel( - model_size=self.model_size, + model_dimension=self.model_dimension, action_space=self.action_space, num_gru_units=self.num_gru_units, ) # Reward Predictor: [ht, zt] -> rt. - self.reward_predictor = RewardPredictor(model_size=self.model_size) + self.reward_predictor = RewardPredictor(model_dimension=self.model_dimension) # Continue Predictor: [ht, zt] -> ct. - self.continue_predictor = ContinuePredictor(model_size=self.model_size) + self.continue_predictor = ContinuePredictor( + model_dimension=self.model_dimension + ) # Decoder: [ht, zt] -> x^t. self.decoder = decoder @@ -275,7 +276,7 @@ def forward_train(self, observations, actions, is_first, training=None): # Make actions and `is_first` time-major. actions = tf.transpose( actions, - perm=[1, 0] + list(range(2, tf.shape(actions).shape.as_list()[0])), + perm=[1, 0] + list(range(2, len(actions.shape))), # .as_list() TODO ) is_first = tf.transpose(is_first, perm=[1, 0]) @@ -342,7 +343,7 @@ def forward_train(self, observations, actions, is_first, training=None): h_BxT = tf.reshape(h_t1_to_T, shape=[-1] + h_t1_to_T.shape.as_list()[2:]) z_BxT = tf.reshape(z_t1_to_T, shape=[-1] + z_t1_to_T.shape.as_list()[2:]) - obs_distribution_means = self.decoder(h=h_BxT, z=z_BxT) + _, obs_distribution = self.decoder(h=h_BxT, z=z_BxT) # Compute (predicted) reward distributions. rewards, reward_logits = self.reward_predictor( @@ -355,11 +356,11 @@ def forward_train(self, observations, actions, is_first, training=None): ) # Return outputs for loss computation. - # Note that all shapes are [BxT, ...] (time axis already folded). + # Note that all shapes are [B, ...] (no time axis). return { # Obs. "sampled_obs_symlog_BxT": observations, - "obs_distribution_means_BxT": obs_distribution_means, + "obs_distribution_BxT": obs_distribution, # Rewards. "reward_logits_BxT": reward_logits, "rewards_BxT": rewards, diff --git a/rllib/algorithms/dreamerv3/utils/__init__.py b/rllib/algorithms/dreamerv3/utils/__init__.py deleted file mode 100644 index 592bbf9b32e82..0000000000000 --- a/rllib/algorithms/dreamerv3/utils/__init__.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Utility functions for the DreamerV3 ([1]) algorithm. - -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf -""" - -_ALLOWED_MODEL_DIMS = [ - # RLlib debug sizes (not mentioned in [1]). - "nano", - "micro", - "mini", - "XXS", - # Regular sizes (listed in table B in [1]). 
- "XS", - "S", - "M", - "L", - "XL", -] - - -def get_cnn_multiplier(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - cnn_multipliers = { - "nano": 2, - "micro": 4, - "mini": 8, - "XXS": 16, - "XS": 24, - "S": 32, - "M": 48, - "L": 64, - "XL": 96, - } - return cnn_multipliers[model_size] - - -def get_dense_hidden_units(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - dense_units = { - "nano": 16, - "micro": 32, - "mini": 64, - "XXS": 128, - "XS": 256, - "S": 512, - "M": 640, - "L": 768, - "XL": 1024, - } - return dense_units[model_size] - - -def get_gru_units(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - gru_units = { - "nano": 16, - "micro": 32, - "mini": 64, - "XXS": 128, - "XS": 256, - "S": 512, - "M": 1024, - "L": 2048, - "XL": 4096, - } - return gru_units[model_size] - - -def get_num_z_categoricals(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - gru_units = { - "nano": 4, - "micro": 8, - "mini": 16, - "XXS": 32, - "XS": 32, - "S": 32, - "M": 32, - "L": 32, - "XL": 32, - } - return gru_units[model_size] - - -def get_num_z_classes(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - gru_units = { - "nano": 4, - "micro": 8, - "mini": 16, - "XXS": 32, - "XS": 32, - "S": 32, - "M": 32, - "L": 32, - "XL": 32, - } - return gru_units[model_size] - - -def get_num_curiosity_nets(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - num_curiosity_nets = { - "nano": 8, - "micro": 8, - "mini": 16, - "XXS": 8, - "XS": 8, - "S": 8, - "M": 8, - "L": 8, - "XL": 8, - } - return num_curiosity_nets[model_size] - - -def get_num_dense_layers(model_size, override=None): - if override is not None: - return override - - assert model_size in _ALLOWED_MODEL_DIMS - num_dense_layers = { - "nano": 1, - "micro": 1, - "mini": 1, - "XXS": 1, - "XS": 1, - "S": 2, - "M": 3, - "L": 4, - "XL": 5, - } - return num_dense_layers[model_size] - - -def do_symlog_obs(observation_space, symlog_obs_user_setting): - # If our symlog_obs setting is NOT set specifically (it's set to "auto"), return - # True if we don't have an image observation space, otherwise return False. - - # TODO (sven): Support mixed observation spaces. 
- - is_image_space = len(observation_space.shape) in [2, 3] - return ( - not is_image_space - if symlog_obs_user_setting == "auto" - else symlog_obs_user_setting - ) diff --git a/rllib/algorithms/dreamerv3/utils/debugging.py b/rllib/algorithms/dreamerv3/utils/debugging.py deleted file mode 100644 index 1a4cf515d9f41..0000000000000 --- a/rllib/algorithms/dreamerv3/utils/debugging.py +++ /dev/null @@ -1,185 +0,0 @@ -import gymnasium as gym -import numpy as np -from PIL import Image, ImageDraw - -from gymnasium.envs.classic_control.cartpole import CartPoleEnv - -from ray.rllib.utils.framework import try_import_tf - -_, tf, _ = try_import_tf() - - -class CartPoleDebug(CartPoleEnv): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - low = np.concatenate([np.array([0.0]), self.observation_space.low]) - high = np.concatenate([np.array([1000.0]), self.observation_space.high]) - - self.observation_space = gym.spaces.Box(low, high, shape=(5,), dtype=np.float32) - - self.timesteps_ = 0 - - def reset(self, *, seed=None, options=None): - ret = super().reset() - self.timesteps_ = 0 - obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) - return obs, ret[1] - - def step(self, action): - ret = super().step(action) - - self.timesteps_ += 1 - - obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) - reward = 0.1 * self.timesteps_ - return (obs, reward) + ret[2:] - - -gym.register("CartPoleDebug-v0", CartPoleDebug) -cartpole_env = gym.make("CartPoleDebug-v0", render_mode="rgb_array") -cartpole_env.reset() - -frozenlake_env = gym.make( - "FrozenLake-v1", render_mode="rgb_array", is_slippery=False, map_name="4x4" -) # desc=["SF", "HG"]) -frozenlake_env.reset() - - -def create_cartpole_dream_image( - dreamed_obs, # real space (not symlog'd) - dreamed_V, # real space (not symlog'd) - dreamed_a, - dreamed_r_tp1, # real space (not symlog'd) - dreamed_ri_tp1, # intrinsic reward - dreamed_c_tp1, # continue flag - value_target, # real space (not symlog'd) - initial_h, - as_tensor=False, -): - # CartPoleDebug - if dreamed_obs.shape == (5,): - # Set the state of our env to the given observation. - cartpole_env.unwrapped.state = np.array(dreamed_obs[1:], dtype=np.float32) - # Normal CartPole-v1 - else: - cartpole_env.unwrapped.state = np.array(dreamed_obs, dtype=np.float32) - - # Produce an RGB-image of the current state. - rgb_array = cartpole_env.render() - - # Add value-, action-, reward-, and continue-prediction information. - image = Image.fromarray(rgb_array) - draw_obj = ImageDraw.Draw(image) - - # fnt = ImageFont.load_default(size=40) - - draw_obj.text( - (5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0) - ) # , font=fnt.font, size=30) - draw_obj.text( - (5, 18), - f"at={'<--' if dreamed_a == 0 else '-->'} ({dreamed_a})", - fill=(0, 0, 0), - ) - draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) - if dreamed_ri_tp1 is not None: - draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) - draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) - draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) - - if dreamed_obs.shape == (5,): - draw_obj.text((20, 100), f"t={dreamed_obs[0]}", fill=(0, 0, 0)) - - # Return image. 
- np_img = np.asarray(image) - if as_tensor: - return tf.convert_to_tensor(np_img, dtype=tf.uint8) - return np_img - - -def create_frozenlake_dream_image( - dreamed_obs, # real space (not symlog'd) - dreamed_V, # real space (not symlog'd) - dreamed_a, - dreamed_r_tp1, # real space (not symlog'd) - dreamed_ri_tp1, # intrinsic reward - dreamed_c_tp1, # continue flag - value_target, # real space (not symlog'd) - initial_h, - as_tensor=False, -): - frozenlake_env.unwrapped.s = np.argmax(dreamed_obs, axis=0) - - # Produce an RGB-image of the current state. - rgb_array = frozenlake_env.render() - - # Add value-, action-, reward-, and continue-prediction information. - image = Image.fromarray(rgb_array) - draw_obj = ImageDraw.Draw(image) - - draw_obj.text((5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0)) - action_arrow = ( - "<--" - if dreamed_a == 0 - else "v" - if dreamed_a == 1 - else "-->" - if dreamed_a == 2 - else "^" - ) - draw_obj.text((5, 18), f"at={action_arrow} ({dreamed_a})", fill=(0, 0, 0)) - draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) - if dreamed_ri_tp1 is not None: - draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) - draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) - draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) - - # Return image. - np_img = np.asarray(image) - if as_tensor: - return tf.convert_to_tensor(np_img, dtype=tf.uint8) - return np_img - - -if __name__ == "__main__": - # CartPole debug. - rgb_array = create_cartpole_dream_image( - dreamed_obs=np.array([100.0, 1.0, -0.01, 1.5, 0.02]), - dreamed_V=4.3, - dreamed_a=1, - dreamed_r_tp1=1.0, - dreamed_c_tp1=True, - initial_h=0.0, - value_target=8.0, - ) - # ImageFont.load("arial.pil") - image = Image.fromarray(rgb_array) - image.show() - - # Normal CartPole. - rgb_array = create_cartpole_dream_image( - dreamed_obs=np.array([1.0, -0.01, 1.5, 0.02]), - dreamed_V=4.3, - dreamed_a=1, - dreamed_r_tp1=1.0, - dreamed_c_tp1=True, - initial_h=0.1, - value_target=8.0, - ) - # ImageFont.load("arial.pil") - image = Image.fromarray(rgb_array) - image.show() - - # Frozenlake - rgb_array = create_frozenlake_dream_image( - dreamed_obs=np.array([1.0] + [0.0] * (frozenlake_env.observation_space.n - 1)), - dreamed_V=4.3, - dreamed_a=1, - dreamed_r_tp1=1.0, - dreamed_c_tp1=True, - initial_h=0.1, - value_target=8.0, - ) - image = Image.fromarray(rgb_array) - image.show() diff --git a/rllib/algorithms/dreamerv3/utils/env_runner.py b/rllib/algorithms/dreamerv3/utils/env_runner.py deleted file mode 100644 index c8db4e8ebc073..0000000000000 --- a/rllib/algorithms/dreamerv3/utils/env_runner.py +++ /dev/null @@ -1,548 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from collections import defaultdict -from functools import partial -from typing import List, Tuple - -import gymnasium as gym -import numpy as np -from supersuit.generic_wrappers import resize_v1 -import tree # pip install dm_tree - -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.core.models.base import STATE_IN, STATE_OUT -from ray.rllib.env.env_runner import EnvRunner -from ray.rllib.env.wrappers.atari_wrappers import NoopResetEnv, MaxAndSkipEnv -from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv -from ray.rllib.evaluation.metrics import RolloutMetrics -from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.replay_buffers.episode_replay_buffer import _Episode as Episode -from ray.rllib.utils.numpy import one_hot - -_, tf, _ = try_import_tf() - - -class DreamerV3EnvRunner(EnvRunner): - """An environment runner to collect data from vectorized gymnasium environments.""" - - def __init__( - self, - config: AlgorithmConfig, - **kwargs, - ): - """Initializes a DreamerV3EnvRunner instance. - - Args: - config: The config to use to setup this EnvRunner. - """ - super().__init__(config=config) - - # Create the gym.vector.Env object. - # Atari env. - if self.config.env.startswith("ALE/"): - # [2]: "We down-scale the 84 × 84 grayscale images to 64 × 64 pixels so that - # we can apply the convolutional architecture of DreamerV1." - # ... - # "We follow the evaluation protocol of Machado et al. (2018) with 200M - # environment steps, action repeat of 4, a time limit of 108,000 steps per - # episode that correspond to 30 minutes of game play, no access to life - # information, full action space, and sticky actions. Because the world - # model integrates information over time, DreamerV2 does not use frame - # stacking." - # However, in Danijar's repo, Atari100k experiments are configured as: - # noop=30, 64x64x3 (no grayscaling), sticky actions=False, - # full action space=False, - wrappers = [ - partial(gym.wrappers.TimeLimit, max_episode_steps=108000), - partial(resize_v1, x_size=64, y_size=64), # resize to 64x64 - NormalizedImageEnv, - NoopResetEnv, - MaxAndSkipEnv, - ] - - self.env = gym.vector.make( - "GymV26Environment-v0", - env_id=self.config.env, - wrappers=wrappers, - num_envs=self.config.num_envs_per_worker, - asynchronous=self.config.remote_worker_envs, - make_kwargs=dict( - self.config.env_config, **{"render_mode": "rgb_array"} - ), - ) - # DeepMind Control. - elif self.config.env.startswith("DMC/"): - parts = self.config.env.split("/") - assert len(parts) == 3, ( - "ERROR: DMC env must be formatted as 'DMC/[task]/[domain]', e.g. " - f"'DMC/cartpole/swingup'! You provided '{self.config.env}'." - ) - gym.register( - "dmc_env-v0", - lambda from_pixels=True: DMCEnv( - parts[1], parts[2], from_pixels=from_pixels, channels_first=False - ), - ) - self.env = gym.vector.make( - "dmc_env-v0", - wrappers=[ActionClip], - num_envs=self.config.num_envs_per_worker, - asynchronous=self.config.remote_worker_envs, - **dict(self.config.env_config), - ) - # All other (gym) envs. 
- else: - wrappers = [] if self.config.env != "FrozenLake-v1" else [OneHot] - self.env = gym.vector.make( - self.config.env, - wrappers=wrappers, - num_envs=self.config.num_envs_per_worker, - asynchronous=self.config.remote_worker_envs, - **dict(self.config.env_config, **{"render_mode": "rgb_array"}), - ) - self.num_envs = self.env.num_envs - assert self.num_envs == self.config.num_envs_per_worker - - # Create our RLModule to compute actions with. - if self.config.share_module_between_env_runner_and_learner: - # DreamerV3 Algorithm will set this to the local Learner's module. - self.module = None - # Create our own instance of a DreamerV3RLModule (which then needs to be - # weight-synched each iteration). - else: - policy_dict, _ = self.config.get_multi_agent_setup(env=self.env) - module_spec = self.config.get_marl_module_spec(policy_dict=policy_dict) - # TODO (sven): DreamerV3 is currently single-agent only. - self.module = module_spec.build()[DEFAULT_POLICY_ID] - - self._needs_initial_reset = True - self._episodes = [None for _ in range(self.num_envs)] - - # TODO (sven): Move metrics temp storage and collection out of EnvRunner - # and RolloutWorkers. These classes should not continue tracking some data - # that they have already returned (in a call to `sample()`). Instead, the - # episode data should be analyzed where it was sent to (the Algorithm itself - # via its replay buffer, etc..). - self._done_episodes_for_metrics = [] - self._ongoing_episodes_for_metrics = defaultdict(list) - self._ts_since_last_metrics = 0 - - @override(EnvRunner) - def sample( - self, - *, - num_timesteps: int = None, - num_episodes: int = None, - explore: bool = True, - random_actions: bool = False, - with_render_data: bool = False, - ) -> Tuple[List[Episode], List[Episode]]: - """Runs and returns a sample (n timesteps or m episodes) on the environment(s). - - Timesteps or episodes are counted in total (across all vectorized - sub-environments). For example, if self.num_envs=2 and num_timesteps=10, each - sub-environment will be sampled for 5 steps. If self.num_envs=3 and - num_episodes=30, each sub-environment will be sampled for 10 episodes. - - Args: - num_timesteps: The number of timesteps to sample from the environment(s). - Note that only exactly one of `num_timesteps` or `num_episodes` must be - provided. - num_episodes: The number of full episodes to sample from the environment(s). - Note that only exactly one of `num_timesteps` or `num_episodes` must be - provided. - explore: Indicates whether to utilize exploration when picking actions. - random_actions: Whether to only use random actions. If True, the value of - `explore` is ignored. - force_reset: Whether to reset the environment(s) before starting to sample. - If False, will still reset the environment(s) if they were left in - a terminated or truncated state during previous sample calls. - with_render_data: If True, will record rendering images per timestep - in the returned Episodes. This data can be used to create video - reports. - TODO (sven): Note that this is only supported for runnign with - `num_episodes` yet. - - Returns: - A tuple consisting of a) list of Episode instances that are done and - b) list of Episode instances that are still ongoing. - """ - # If no execution details are provided, use self.config. 
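To make the sampling contract documented above concrete, a hedged usage sketch; `runner` is assumed to be an already-constructed instance of the DreamerV3EnvRunner being removed in this file:

# Timestep-based sampling returns both finished and still-ongoing episode chunks ...
done_eps, ongoing_eps = runner.sample(num_timesteps=64, random_actions=True)
# ... while episode-based sampling returns only finished episodes (the second element
# of the tuple is always an empty list).
done_eps, _ = runner.sample(num_episodes=4, with_render_data=True)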
- if num_timesteps is None and num_episodes is None: - if self.config.batch_mode == "truncate_episodes": - num_timesteps = self.config.rollout_fragment_length * self.num_envs - else: - num_episodes = self.num_envs - - # Sample n timesteps. - if num_timesteps is not None: - return self._sample_timesteps( - num_timesteps=num_timesteps, - explore=explore, - random_actions=random_actions, - force_reset=False, - ) - # Sample n episodes. - else: - # `_sample_episodes` returns only one list (with completed episodes) - # return empty list for incomplete ones. - return ( - self._sample_episodes( - num_episodes=num_episodes, - explore=explore, - random_actions=random_actions, - with_render_data=with_render_data, - ), - [], - ) - - def _sample_timesteps( - self, - num_timesteps: int, - explore: bool = True, - random_actions: bool = False, - force_reset: bool = False, - ) -> Tuple[List[Episode], List[Episode]]: - """Helper method to run n timesteps. - - See docstring of self.sample() for more details. - """ - done_episodes_to_return = [] - - # Get initial states for all `batch_size_B` rows in the forward batch. - initial_states = tree.map_structure( - lambda s: np.repeat(s, self.num_envs, axis=0), - self.module.get_initial_state(), - ) - - # Have to reset the env (on all vector sub-envs). - if force_reset or self._needs_initial_reset: - obs, _ = self.env.reset() - - self._episodes = [Episode() for _ in range(self.num_envs)] - states = initial_states - # Set is_first to True for all rows (all sub-envs just got reset). - is_first = np.ones((self.num_envs,), dtype=np.float32) - self._needs_initial_reset = False - - # Set initial obs and states in the episodes. - for i in range(self.num_envs): - self._episodes[i].add_initial_observation( - initial_observation=obs[i], - initial_state={k: s[i] for k, s in states.items()}, - ) - # Don't reset existing envs; continue in already started episodes. - else: - # Pick up stored observations and states from previous timesteps. - obs = np.stack([eps.observations[-1] for eps in self._episodes]) - # Compile the initial state for each batch row: If episode just started, use - # model's initial state, if not, use state stored last in Episode. - states = { - k: np.stack( - [ - initial_states[k][i] if eps.states is None else eps.states[k] - for i, eps in enumerate(self._episodes) - ] - ) - for k in initial_states.keys() - } - # If a batch row is at the beginning of an episode, set its `is_first` flag - # to 1.0, otherwise 0.0. - is_first = np.zeros((self.num_envs,), dtype=np.float32) - for i, eps in enumerate(self._episodes): - if eps.states is None: - is_first[i] = 1.0 - - # Loop through env for n timesteps. - ts = 0 - while ts < num_timesteps: - # Act randomly. - if random_actions: - actions = self.env.action_space.sample() - # Compute an action using our RLModule. - else: - batch = { - STATE_IN: tree.map_structure( - lambda s: tf.convert_to_tensor(s), states - ), - SampleBatch.OBS: tf.convert_to_tensor(obs), - "is_first": tf.convert_to_tensor(is_first), - } - # Explore or not. - if explore: - outs = self.module.forward_exploration(batch) - else: - outs = self.module.forward_inference(batch) - - # Model outputs one-hot actions (if discrete). Convert to int actions - # as well. 
- actions = outs[SampleBatch.ACTIONS].numpy() - if isinstance(self.env.single_action_space, gym.spaces.Discrete): - actions = np.argmax(actions, axis=-1) - states = tree.map_structure(lambda s: s.numpy(), outs[STATE_OUT]) - - obs, rewards, terminateds, truncateds, infos = self.env.step(actions) - ts += self.num_envs - - for i in range(self.num_envs): - s = {k: s[i] for k, s in states.items()} - # The last entry in self.observations[i] is already the reset - # obs of the new episode. - if terminateds[i] or truncateds[i]: - # Finish the episode with the actual terminal observation stored in - # the info dict. - self._episodes[i].add_timestep( - infos["final_observation"][i], - actions[i], - rewards[i], - state=s, - is_terminated=terminateds[i], - is_truncated=truncateds[i], - ) - # Reset h-states to the model's initial ones b/c we are starting a - # new episode. - for k, v in self.module.get_initial_state().items(): - states[k][i] = v.numpy() - is_first[i] = True - done_episodes_to_return.append(self._episodes[i]) - # Create a new episode object. - self._episodes[i] = Episode(observations=[obs[i]], states=s) - else: - self._episodes[i].add_timestep( - obs[i], actions[i], rewards[i], state=s - ) - is_first[i] = False - - # Return done episodes ... - self._done_episodes_for_metrics.extend(done_episodes_to_return) - # ... and all ongoing episode chunks. Also, make sure, we return - # a copy and start new chunks so that callers of this function - # don't alter our ongoing and returned Episode objects. - ongoing_episodes = self._episodes - self._episodes = [eps.create_successor() for eps in self._episodes] - for eps in ongoing_episodes: - self._ongoing_episodes_for_metrics[eps.id_].append(eps) - - self._ts_since_last_metrics += ts - - return done_episodes_to_return, ongoing_episodes - - def _sample_episodes( - self, - num_episodes: int, - explore: bool = True, - random_actions: bool = False, - with_render_data: bool = False, - ) -> List[Episode]: - """Helper method to run n episodes. - - See docstring of `self.sample()` for more details. - """ - done_episodes_to_return = [] - - obs, _ = self.env.reset() - episodes = [Episode() for _ in range(self.num_envs)] - - # Multiply states n times according to our vector env batch size (num_envs). 
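Two small NumPy idioms recur throughout this runner: converting the module's one-hot action outputs to int actions for Discrete spaces, and broadcasting a single initial state across all vector sub-envs. A minimal sketch with illustrative shapes:

import numpy as np
import tree  # pip install dm_tree

# One-hot action outputs -> int actions (the Discrete action-space case above).
one_hot_actions = np.array([[0.0, 1.0], [1.0, 0.0]], dtype=np.float32)
int_actions = np.argmax(one_hot_actions, axis=-1)  # -> array([1, 0])

# Repeat the module's (batch-size 1) initial state across num_envs=4 sub-envs.
initial_state = {"h": np.zeros((1, 8), dtype=np.float32)}
batched_state = tree.map_structure(lambda s: np.repeat(s, 4, axis=0), initial_state)
assert batched_state["h"].shape == (4, 8)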
- states = tree.map_structure( - lambda s: np.repeat(s, self.num_envs, axis=0), - self.module.get_initial_state(), - ) - is_first = np.ones((self.num_envs,), dtype=np.float32) - - render_images = [None] * self.num_envs - if with_render_data: - render_images = [e.render() for e in self.env.envs] - - for i in range(self.num_envs): - episodes[i].add_initial_observation( - initial_observation=obs[i], - initial_state={k: s[i] for k, s in states.items()}, - initial_render_image=render_images[i], - ) - - eps = 0 - while eps < num_episodes: - if random_actions: - actions = self.env.action_space.sample() - else: - batch = { - STATE_IN: tree.map_structure( - lambda s: tf.convert_to_tensor(s), states - ), - SampleBatch.OBS: tf.convert_to_tensor(obs), - "is_first": tf.convert_to_tensor(is_first), - } - - if explore: - outs = self.module.forward_exploration(batch) - else: - outs = self.module.forward_inference(batch) - - actions = outs[SampleBatch.ACTIONS].numpy() - if isinstance(self.env.single_action_space, gym.spaces.Discrete): - actions = np.argmax(actions, axis=-1) - states = tree.map_structure(lambda s: s.numpy(), outs[STATE_OUT]) - - obs, rewards, terminateds, truncateds, infos = self.env.step(actions) - if with_render_data: - render_images = [e.render() for e in self.env.envs] - - for i in range(self.num_envs): - s = {k: s[i] for k, s in states.items()} - # The last entry in self.observations[i] is already the reset - # obs of the new episode. - if terminateds[i] or truncateds[i]: - eps += 1 - - episodes[i].add_timestep( - infos["final_observation"][i], - actions[i], - rewards[i], - state=s, - is_terminated=terminateds[i], - is_truncated=truncateds[i], - ) - done_episodes_to_return.append(episodes[i]) - - # Also early-out if we reach the number of episodes within this - # for-loop. - if eps == num_episodes: - break - - # Reset h-states to the model's initial ones b/c we are starting a - # new episode. - for k, v in self.module.get_initial_state().items(): - states[k][i] = v.numpy() - is_first[i] = True - - episodes[i] = Episode( - observations=[obs[i]], - states=s, - render_images=[render_images[i]], - ) - else: - episodes[i].add_timestep( - obs[i], - actions[i], - rewards[i], - state=s, - render_image=render_images[i], - ) - is_first[i] = False - - self._done_episodes_for_metrics.extend(done_episodes_to_return) - self._ts_since_last_metrics += sum(len(eps) for eps in done_episodes_to_return) - - # If user calls sample(num_timesteps=..) after this, we must reset again - # at the beginning. - self._needs_initial_reset = True - - return done_episodes_to_return - - # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this - # API. Instead Algorithm should compile episode metrics itself via its local - # buffer. - def get_metrics(self) -> List[RolloutMetrics]: - # Compute per-episode metrics (only on already completed episodes). - metrics = [] - for eps in self._done_episodes_for_metrics: - episode_length = len(eps) - episode_reward = eps.get_return() - # Don't forget about the already returned chunks of this episode. 
- if eps.id_ in self._ongoing_episodes_for_metrics: - for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: - episode_length += len(eps2) - episode_reward += eps2.get_return() - del self._ongoing_episodes_for_metrics[eps.id_] - - metrics.append( - RolloutMetrics( - episode_length=episode_length, - episode_reward=episode_reward, - ) - ) - - self._done_episodes_for_metrics.clear() - self._ts_since_last_metrics = 0 - - return metrics - - # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this - # API. Replace by proper state overriding via `EnvRunner.set_state()` - def set_weights(self, weights, global_vars=None): - """Writes the weights of our (single-agent) RLModule.""" - if self.module is None: - assert self.config.share_module_between_env_runner_and_learner - else: - self.module.set_state(weights[DEFAULT_POLICY_ID]) - - @override(EnvRunner) - def assert_healthy(self): - # Make sure, we have built our gym.vector.Env and RLModule properly. - assert self.env and self.module - - @override(EnvRunner) - def stop(self): - # Close our env object via gymnasium's API. - self.env.close() - - -class NormalizedImageEnv(gym.ObservationWrapper): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.observation_space = gym.spaces.Box( - -1.0, - 1.0, - shape=self.observation_space.shape, - dtype=np.float32, - ) - - # Divide by scale and center around 0.0, such that observations are in the range - # of -1.0 and 1.0. - def observation(self, observation): - return (observation.astype(np.float32) / 128.0) - 1.0 - - -class OneHot(gym.ObservationWrapper): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.observation_space = gym.spaces.Box( - 0.0, 1.0, shape=(self.observation_space.n,), dtype=np.float32 - ) - - def reset(self, **kwargs): - ret = self.env.reset(**kwargs) - return self._get_obs(ret[0]), ret[1] - - def step(self, action): - ret = self.env.step(action) - return self._get_obs(ret[0]), ret[1], ret[2], ret[3], ret[4] - - def _get_obs(self, obs): - return one_hot(obs, depth=self.observation_space.shape[0]) - - -class ActionClip(gym.ActionWrapper): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._low = -1.0 - self._high = 1.0 - self.action_space = gym.spaces.Box( - self._low, - self._high, - self.action_space.shape, - self.action_space.dtype, - ) - - def action(self, action): - return np.clip(action, self._low, self._high) diff --git a/rllib/algorithms/dreamerv3/utils/summaries.py b/rllib/algorithms/dreamerv3/utils/summaries.py deleted file mode 100644 index d781a33e40d6b..0000000000000 --- a/rllib/algorithms/dreamerv3/utils/summaries.py +++ /dev/null @@ -1,329 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -import numpy as np - -from ray.rllib.algorithms.dreamerv3.utils.debugging import ( - create_cartpole_dream_image, - create_frozenlake_dream_image, -) -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.tf_utils import inverse_symlog - - -def _summarize(*, results, data_to_summarize, keys_to_log, include_histograms=False): - for k in keys_to_log: - if data_to_summarize[k].shape == (): - results.update({k: data_to_summarize[k]}) - elif include_histograms: - results.update({k: data_to_summarize[k]}) - - -def reconstruct_obs_from_h_and_z( - h_t0_to_H, - z_t0_to_H, - dreamer_model, - obs_dims_shape, -): - """Returns""" - shape = h_t0_to_H.shape - T = shape[0] # inputs are time-major - B = shape[1] - # Compute actual observations using h and z and the decoder net. - # Note that the last h-state (T+1) is NOT used here as it's already part of - # a new trajectory. - # Use mean() of the Gaussian, no sample! -> No need to construct dist object here. - reconstructed_obs_distr_means_TxB = dreamer_model.world_model.decoder( - # Fold time rank. - h=np.reshape(h_t0_to_H, (T * B, -1)), - z=np.reshape(z_t0_to_H, (T * B,) + z_t0_to_H.shape[2:]), - ) - # Unfold time rank again. - reconstructed_obs_T_B = np.reshape( - reconstructed_obs_distr_means_TxB, (T, B) + obs_dims_shape - ) - # Return inverse symlog'd (real env obs space) reconstructed observations. - return reconstructed_obs_T_B - - -def report_dreamed_trajectory( - *, - results, - env, - dreamer_model, - obs_dims_shape, - batch_indices=(0,), - desc=None, - include_images=True, -): - if not include_images: - return - - dream_data = results["dream_data"] - dreamed_obs_H_B = reconstruct_obs_from_h_and_z( - h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], - z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], - dreamer_model=dreamer_model, - obs_dims_shape=obs_dims_shape, - ) - func = ( - create_cartpole_dream_image - if env.startswith("CartPole") - else create_frozenlake_dream_image - ) - # Take 0th dreamed trajectory and produce series of images. - for b in batch_indices: - images = [] - for t in range(len(dreamed_obs_H_B) - 1): - images.append( - func( - dreamed_obs=dreamed_obs_H_B[t][b], - dreamed_V=dream_data["values_dreamed_t0_to_H_BxT"][t][b], - dreamed_a=(dream_data["actions_ints_dreamed_t0_to_H_BxT"][t][b]), - dreamed_r_tp1=(dream_data["rewards_dreamed_t0_to_H_BxT"][t + 1][b]), - # `DISAGREE_intrinsic_rewards_H_B` are shifted by 1 already - # (from t1 to H, not t0 to H like all other data here). - dreamed_ri_tp1=( - results["DISAGREE_intrinsic_rewards_H_BxT"][t][b] - if "DISAGREE_intrinsic_rewards_H_BxT" in results - else None - ), - dreamed_c_tp1=( - dream_data["continues_dreamed_t0_to_H_BxT"][t + 1][b] - ), - value_target=results["VALUE_TARGETS_H_BxT"][t][b], - initial_h=dream_data["h_states_t0_to_H_BxT"][t][b], - as_tensor=True, - ).numpy() - ) - # Concat images along width-axis (so they show as a "film sequence" next to each - # other). - results.update( - { - f"dreamed_trajectories{('_'+desc) if desc else ''}_B{b}": ( - np.concatenate(images, axis=1) - ), - } - ) - - -def report_predicted_vs_sampled_obs( - *, - results, - sample, - batch_size_B, - batch_length_T, - symlog_obs: bool = True, -): - """Summarizes sampled data (from the replay buffer) vs world-model predictions. - - World model predictions are based on the posterior states (z computed from actual - observation encoder input + the current h-states). 
- - Observations: Computes MSE (sampled vs predicted/recreated) over all features. - For image observations, also creates direct image comparisons (sampled images - vs predicted (posterior) ones). - Rewards: Compute MSE (sampled vs predicted). - Continues: Compute MSE (sampled vs predicted). - - Args: - results: The results dict that was returned by `LearnerGroup.update()`. - sample: The sampled data (dict) from the replay buffer. Already tf-tensor - converted. - batch_size_B: The batch size (B). This is the number of trajectories sampled - from the buffer. - batch_length_T: The batch length (T). This is the length of an individual - trajectory sampled from the buffer. - """ - predicted_observation_means_BxT = results[ - "WORLD_MODEL_fwd_out_obs_distribution_means_BxT" - ] - _report_obs( - results=results, - computed_float_obs_B_T_dims=np.reshape( - predicted_observation_means_BxT, - (batch_size_B, batch_length_T) + sample[SampleBatch.OBS].shape[2:], - ), - sampled_obs_B_T_dims=sample[SampleBatch.OBS], - descr_prefix="WORLD_MODEL", - descr_obs=f"predicted_posterior_T{batch_length_T}", - symlog_obs=symlog_obs, - ) - - -def report_dreamed_eval_trajectory_vs_samples( - *, - results, - dream_data, - sample, - burn_in_T, - dreamed_T, - dreamer_model, - symlog_obs: bool = True, -): - # Obs MSE. - dreamed_obs_T_B = reconstruct_obs_from_h_and_z( - h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], - z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], - dreamer_model=dreamer_model, - obs_dims_shape=sample[SampleBatch.OBS].shape[2:], - ) - t0 = burn_in_T - 1 - tH = t0 + dreamed_T - # Observation MSE and - if applicable - images comparisons. - mse_sampled_vs_dreamed_obs = _report_obs( - results=results, - # Have to transpose b/c dreamed data is time-major. - computed_float_obs_B_T_dims=np.transpose( - dreamed_obs_T_B, - axes=[1, 0] + list(range(2, len(dreamed_obs_T_B.shape))), - ), - sampled_obs_B_T_dims=sample[SampleBatch.OBS][:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_obs=f"dreamed_prior_H{dreamed_T}", - symlog_obs=symlog_obs, - ) - - # Reward MSE. - _report_rewards( - results=results, - computed_rewards=dream_data["rewards_dreamed_t0_to_H_BxT"], - sampled_rewards=sample[SampleBatch.REWARDS][:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_reward=f"dreamed_prior_H{dreamed_T}", - ) - - # Continues MSE. - _report_continues( - results=results, - computed_continues=dream_data["continues_dreamed_t0_to_H_BxT"], - sampled_continues=(1.0 - sample["is_terminated"])[:, t0 : tH + 1], - descr_prefix="EVALUATION", - descr_cont=f"dreamed_prior_H{dreamed_T}", - ) - return mse_sampled_vs_dreamed_obs - - -def report_sampling_and_replay_buffer(*, replay_buffer): - episodes_in_buffer = replay_buffer.get_num_episodes() - ts_in_buffer = replay_buffer.get_num_timesteps() - replayed_steps = replay_buffer.get_sampled_timesteps() - added_steps = replay_buffer.get_added_timesteps() - - # Summarize buffer, sampling, and train ratio stats. - return { - "BUFFER_capacity": replay_buffer.capacity, - "BUFFER_size_num_episodes": episodes_in_buffer, - "BUFFER_size_timesteps": ts_in_buffer, - "BUFFER_replayed_steps": replayed_steps, - "BUFFER_added_steps": added_steps, - } - - -def _report_obs( - *, - results, - computed_float_obs_B_T_dims, - sampled_obs_B_T_dims, - descr_prefix=None, - descr_obs, - symlog_obs, -): - """Summarizes computed- vs sampled observations: MSE and (if applicable) images. - - Args: - computed_float_obs_B_T_dims: Computed float observations - (not clipped, not cast'd). Shape=(B, T, [dims ...]). 
- sampled_obs_B_T_dims: Sampled observations (as-is from the environment, meaning - this could be uint8, 0-255 clipped images). Shape=(B, T, [dims ...]). - B: The batch size B (see shapes of `computed_float_obs_B_T_dims` and - `sampled_obs_B_T_dims` above). - T: The batch length T (see shapes of `computed_float_obs_B_T_dims` and - `sampled_obs_B_T_dims` above). - descr: A string used to describe the computed data to be used in the TB - summaries. - """ - # Videos: Create summary, comparing computed images with actual sampled ones. - # 4=[B, T, w, h] grayscale image; 5=[B, T, w, h, C] RGB image. - if len(sampled_obs_B_T_dims.shape) in [4, 5]: - descr_prefix = (descr_prefix + "_") if descr_prefix else "" - - if symlog_obs: - computed_float_obs_B_T_dims = inverse_symlog(computed_float_obs_B_T_dims) - - # Restore image pixels from normalized (non-symlog'd) data. - if not symlog_obs: - computed_float_obs_B_T_dims = (computed_float_obs_B_T_dims + 1.0) * 128 - sampled_obs_B_T_dims = (sampled_obs_B_T_dims + 1.0) * 128 - sampled_obs_B_T_dims = np.clip(sampled_obs_B_T_dims, 0.0, 255.0).astype( - np.uint8 - ) - computed_images = np.clip(computed_float_obs_B_T_dims, 0.0, 255.0).astype( - np.uint8 - ) - # Concat sampled and computed images along the height axis (3) such that - # real images show below respective predicted ones. - # (B, T, C, h, w) - sampled_vs_computed_images = np.concatenate( - [computed_images, sampled_obs_B_T_dims], - axis=3, - ) - # Add grayscale dim, if necessary. - if len(sampled_obs_B_T_dims.shape) == 2 + 2: - sampled_vs_computed_images = np.expand_dims(sampled_vs_computed_images, -1) - - results.update( - {f"{descr_prefix}sampled_vs_{descr_obs}_videos": sampled_vs_computed_images} - ) - - # return mse_sampled_vs_computed_obs - - -def _report_rewards( - *, - results, - computed_rewards, - sampled_rewards, - descr_prefix=None, - descr_reward, -): - descr_prefix = (descr_prefix + "_") if descr_prefix else "" - mse_sampled_vs_computed_rewards = np.mean( - np.square(computed_rewards - sampled_rewards) - ) - mse_sampled_vs_computed_rewards = np.mean(mse_sampled_vs_computed_rewards) - results.update( - { - f"{descr_prefix}sampled_vs_{descr_reward}_rewards_mse": ( - mse_sampled_vs_computed_rewards - ), - } - ) - - -def _report_continues( - *, - results, - computed_continues, - sampled_continues, - descr_prefix=None, - descr_cont, -): - descr_prefix = (descr_prefix + "_") if descr_prefix else "" - # Continue MSE. - mse_sampled_vs_computed_continues = np.mean( - np.square(computed_continues - sampled_continues.astype(np.float32)) - ) - results.update( - { - f"{descr_prefix}sampled_vs_{descr_cont}_continues_mse": ( - mse_sampled_vs_computed_continues - ), - } - ) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 81cb8d0627bde..d435e469b23ce 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -482,12 +482,12 @@ def training_step(self) -> ResultDict: # workers. 
with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: if self.workers.num_remote_workers() > 0: - from_worker_or_learner_group = None + from_worker_or_trainer = None if self.config._enable_learner_api: # sync weights from learner_group to all rollout workers - from_worker_or_learner_group = self.learner_group + from_worker_or_trainer = self.learner_group self.workers.sync_weights( - from_worker_or_learner_group=from_worker_or_learner_group, + from_worker_or_trainer=from_worker_or_trainer, policies=policies_to_update, global_vars=global_vars, ) diff --git a/rllib/algorithms/registry.py b/rllib/algorithms/registry.py index 5352814f5e4e4..5387420cc5230 100644 --- a/rllib/algorithms/registry.py +++ b/rllib/algorithms/registry.py @@ -114,12 +114,6 @@ def _import_dreamer(): return dreamer.Dreamer, dreamer.Dreamer.get_default_config() -def _import_dreamerv3(): - import ray.rllib.algorithms.dreamerv3 as dreamerv3 - - return dreamerv3.DreamerV3, dreamerv3.DreamerV3.get_default_config() - - def _import_dt(): import ray.rllib.algorithms.dt as dt @@ -245,7 +239,6 @@ def _import_leela_chess_zero(): "DDPPO": _import_ddppo, "DQN": _import_dqn, "Dreamer": _import_dreamer, - "DreamerV3": _import_dreamerv3, "DT": _import_dt, "IMPALA": _import_impala, "APPO": _import_appo, @@ -285,7 +278,6 @@ def _import_leela_chess_zero(): "DDPPO": "DDPPO", "DQN": "DQN", "Dreamer": "Dreamer", - "DreamerV3": "DreamerV3", "DT": "DT", "Impala": "IMPALA", "APPO": "APPO", diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 2bde70aa69ea4..9bbff1f7f0877 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -147,12 +147,15 @@ def test_detect_atari_env(self): config = AlgorithmConfig().environment( env="ALE/Breakout-v5", env_config={"frameskip": 1} ) + config.validate() self.assertTrue(config.is_atari) config = AlgorithmConfig().environment(env="ALE/Pong-v5") + config.validate() self.assertTrue(config.is_atari) config = AlgorithmConfig().environment(env="CartPole-v1") + config.validate() # We do not auto-detect callable env makers for Atari envs. self.assertFalse(config.is_atari) @@ -163,10 +166,12 @@ def test_detect_atari_env(self): make_kwargs={"frameskip": 1}, ) ) + config.validate() # We do not auto-detect callable env makers for Atari envs. self.assertFalse(config.is_atari) config = AlgorithmConfig().environment(env="NotAtari") + config.validate() self.assertFalse(config.is_atari) def test_rl_module_api(self): diff --git a/rllib/core/learner/tf/tf_learner.py b/rllib/core/learner/tf/tf_learner.py index 2cc22a725cf1b..2cb9cdeb049aa 100644 --- a/rllib/core/learner/tf/tf_learner.py +++ b/rllib/core/learner/tf/tf_learner.py @@ -376,7 +376,7 @@ def _make_distributed_strategy_if_necessary(self) -> "tf.distribute.Strategy": devices = tf.config.list_logical_devices("GPU") assert self._local_gpu_idx < len(devices), ( f"local_gpu_idx {self._local_gpu_idx} is not a valid GPU id or is " - "not available." + " not available." ) local_gpu = [devices[self._local_gpu_idx].name] strategy = tf.distribute.MirroredStrategy(devices=local_gpu) @@ -431,11 +431,10 @@ def helper(_batch): # in-efficient. However, for tf>=2.12, it works also w/o this conversion # so remove this after we upgrade officially to tf==2.12. 
_batch = NestedDict(_batch) - with tf.GradientTape(persistent=True) as tape: + with tf.GradientTape() as tape: fwd_out = self._module.forward_train(_batch) loss_per_module = self.compute_loss(fwd_out=fwd_out, batch=_batch) gradients = self.compute_gradients(loss_per_module, gradient_tape=tape) - del tape postprocessed_gradients = self.postprocess_gradients(gradients) self.apply_gradients(postprocessed_gradients) diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index 6aed0b9850521..b6478d51d09d0 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -285,19 +285,7 @@ class RLModule(abc.ABC): def __init__(self, config: RLModuleConfig): self.config = config - # Make sure, `setup()` is only called once, no matter what. In some cases - # of multiple inheritance (and with our __post_init__ functionality in place, - # this might get called twice. - if hasattr(self, "_is_setup") and self._is_setup: - raise RuntimeError( - "`RLModule.setup()` called twice within your RLModule implementation " - f"{self}! Make sure you are using the proper inheritance order " - "(TorchRLModule before [Algo]RLModule) or (TfRLModule before " - "[Algo]RLModule) and that you are using `super().__init__(...)` in " - "your custom constructor." - ) self.setup() - self._is_setup = True def __init_subclass__(cls, **kwargs): # Automatically add a __post_init__ method to all subclasses of RLModule. diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 21b2601b7e05f..100b815d2b621 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -356,9 +356,7 @@ def num_remote_worker_restarts(self) -> int: def sync_weights( self, policies: Optional[List[PolicyID]] = None, - from_worker_or_learner_group: Optional[ - Union[RolloutWorker, LearnerGroup] - ] = None, + from_worker_or_trainer: Optional[Union[RolloutWorker, LearnerGroup]] = None, to_worker_indices: Optional[List[int]] = None, global_vars: Optional[Dict[str, TensorType]] = None, timeout_seconds: Optional[int] = 0, @@ -371,7 +369,7 @@ def sync_weights( Args: policies: Optional list of PolicyIDs to sync weights for. If None (default), sync weights to/from all policies. - from_worker_or_learner_group: Optional (local) RolloutWorker instance or + from_worker_or_trainer: Optional (local) RolloutWorker instance or LearnerGroup instance to sync from. If None (default), sync from this WorkerSet's local worker. to_worker_indices: Optional list of worker indices to sync the @@ -383,16 +381,16 @@ def sync_weights( for any sync calls to finish). This significantly improves algorithm performance. """ - if self.local_worker() is None and from_worker_or_learner_group is None: + if self.local_worker() is None and from_worker_or_trainer is None: raise TypeError( - "No `local_worker` in WorkerSet, must provide " - "`from_worker_or_learner_group` arg in `sync_weights()`!" + "No `local_worker` in WorkerSet, must provide `from_worker` " + "arg in `sync_weights()`!" ) # Only sync if we have remote workers or `from_worker_or_trainer` is provided. 
weights = None - if self.num_remote_workers() or from_worker_or_learner_group is not None: - weights_src = from_worker_or_learner_group or self.local_worker() + if self.num_remote_workers() or from_worker_or_trainer is not None: + weights_src = from_worker_or_trainer or self.local_worker() if weights_src is None: raise ValueError( @@ -416,10 +414,10 @@ def set_weight(w): timeout_seconds=timeout_seconds, ) - # If `from_worker_or_learner_group` is provided, also sync to this WorkerSet's + # If `from_worker` is provided, also sync to this WorkerSet's # local worker. if self.local_worker() is not None: - if from_worker_or_learner_group is not None: + if from_worker_or_trainer is not None: self.local_worker().set_weights(weights, global_vars=global_vars) # If `global_vars` is provided and local worker exists -> Update its # global_vars. diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index 4df6b2724fb3d..8a4093fb0e2d5 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -870,12 +870,7 @@ def _compute_actions_helper_rl_module_explore( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - if action_dist is None: - raise KeyError( - "Your RLModule's `forward_exploration()` method must return a dict" - f"with either the {SampleBatch.ACTIONS} key or the " - f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" - ) + assert action_dist actions = action_dist.sample() # Anything but action_dist and state_out is an extra fetch @@ -931,12 +926,7 @@ def _compute_actions_helper_rl_module_inference( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - if action_dist is None: - raise KeyError( - "Your RLModule's `forward_inference()` method must return a dict" - f"with either the {SampleBatch.ACTIONS} key or the " - f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" - ) + assert action_dist actions = action_dist.sample() # Anything but action_dist and state_out is an extra fetch diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index bef3c070d81a4..4165da80a1f8d 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1147,12 +1147,7 @@ def _compute_action_helper( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - if action_dist is None: - raise KeyError( - "Your RLModule's `forward_exploration()` method must return" - f" a dict with either the {SampleBatch.ACTIONS} key or the " - f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" - ) + assert action_dist actions = action_dist.sample() # Compute action-logp and action-prob from distribution and add to @@ -1176,12 +1171,7 @@ def _compute_action_helper( actions = fwd_out[SampleBatch.ACTIONS] # Otherwise, sample actions from the distribution. else: - if action_dist is None: - raise KeyError( - "Your RLModule's `forward_inference()` method must return" - f" a dict with either the {SampleBatch.ACTIONS} key or the " - f"{SampleBatch.ACTION_DIST_INPUTS} key in it (or both)!" - ) + assert action_dist actions = action_dist.sample() # Anything but actions and state_out is an extra fetch. 
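A condensed usage sketch of the renamed `sync_weights()` argument, mirroring the Algorithm/PPO hunks above; `config`, `workers` (a WorkerSet), `learner_group`, `policies_to_update`, and `global_vars` are assumed to already exist in a `training_step()` context:

from_worker_or_trainer = None
if config._enable_learner_api:
    # With the Learner API enabled, the up-to-date weights live on the LearnerGroup.
    from_worker_or_trainer = learner_group
workers.sync_weights(
    from_worker_or_trainer=from_worker_or_trainer,  # None -> sync from the local worker
    policies=policies_to_update,
    global_vars=global_vars,
)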
diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index 0f945dd7db82c..0a9303a9e47d5 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -57,12 +57,6 @@ action="store_true", help="Run ray in local mode for easier debugging.", ) -parser.add_argument( - "--num-samples", - type=int, - default=1, - help="The number of seeds/samples to run with the given experiment config.", -) parser.add_argument( "--override-mean-reward", type=float, @@ -109,14 +103,12 @@ # Loop through all collected files. for file in files: - config_is_python = False # For python files, need to make sure, we only deliver the module name into the # `load_experiments_from_file` function (everything from "/ray/rllib" on). if file.endswith(".py"): if file.endswith("__init__.py"): # weird CI learning test (BAZEL) case continue experiments = load_experiments_from_file(file, SupportedFileType.python) - config_is_python = True else: experiments = load_experiments_from_file(file, SupportedFileType.yaml) @@ -126,16 +118,13 @@ exp = list(experiments.values())[0] - # Set the number of samples to run. - exp["num_samples"] = args.num_samples - # Override framework setting with the command line one, if provided. # Otherwise, will use framework setting in file (or default: torch). if args.framework is not None: exp["config"]["framework"] = args.framework # Override env setting if given on command line. if args.env is not None: - exp["config"]["env"] = exp["env"] = args.env + exp["config"]["env"] = args.env # Override the mean reward if specified. This is used by the ray ci # for overriding the episode reward mean for tf2 tests for off policy @@ -150,23 +139,19 @@ print(f"Skipping framework='{args.framework}' for QMIX.") continue - # Always run with eager-tracing when framework=tf2, if not in local-mode - # and unless the yaml explicitly tells us to disable eager tracing. + # Always run with eager-tracing when framework=tf2 if not in local-mode. + # Ignore this if the yaml explicitly tells us to disable eager tracing if ( - (args.framework == "tf2" or exp["config"].get("framework") == "tf2") + args.framework == "tf2" and not args.local_mode - # Note: This check will always fail for python configs, b/c normally, - # algorithm configs have `self.eager_tracing=False` by default. - # Thus, you'd have to set `eager_tracing` to True explicitly in your python - # config to make sure we are indeed using eager tracing. - and exp["config"].get("eager_tracing") is not False + and not exp["config"].get("eager_tracing") is False ): + exp["config"]["eager_tracing"] = True - # Print out the actual config (not for py files as yaml.dump weirdly fails). - if not config_is_python: - print("== Test config ==") - print(yaml.dump(experiments)) + # Print out the actual config. + print("== Test config ==") + print(yaml.dump(experiments)) # Try running each test 3 times and make sure it reaches the given # reward. diff --git a/rllib/tuned_examples/dreamerv3/__init__.py b/rllib/tuned_examples/dreamerv3/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/rllib/tuned_examples/dreamerv3/atari_100k.py b/rllib/tuned_examples/dreamerv3/atari_100k.py deleted file mode 100644 index ef6731d6e2e2a..0000000000000 --- a/rllib/tuned_examples/dreamerv3/atari_100k.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" - -# Run with: -# python run_regression_tests.py --dir [this file] --env ALE/[gym ID e.g. Pong-v5] - -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - - -# Number of GPUs to run on. -num_gpus = 1 - -config = ( - DreamerV3Config() - # Switch on eager_tracing by default. - .framework("tf2", eager_tracing=True) - .resources( - num_learner_workers=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner_worker=1 if num_gpus else 0, - num_cpus_for_local_worker=1, - ) - # TODO (sven): concretize this: If you use >1 GPU and increase the batch size - # accordingly, you might also want to increase the number of envs per worker - .rollouts( - num_envs_per_worker=(num_gpus or 1), - remote_worker_envs=True, - ) - .environment( - # [2]: "We follow the evaluation protocol of Machado et al. (2018) with 200M - # environment steps, action repeat of 4, a time limit of 108,000 steps per - # episode that correspond to 30 minutes of game play, no access to life - # information, full action space, and sticky actions. Because the world model - # integrates information over time, DreamerV2 does not use frame stacking. - # The experiments use a single-task setup where a separate agent is trained - # for each game. Moreover, each agent uses only a single environment instance. - env_config={ - # "sticky actions" but not according to Danijar's 100k configs. - "repeat_action_probability": 0.0, - # "full action space" but not according to Danijar's 100k configs. - "full_action_space": False, - # Already done by MaxAndSkip wrapper: "action repeat" == 4. - "frameskip": 1, - } - ) - .reporting( - metrics_num_episodes_for_smoothing=(num_gpus or 1), - report_images_and_videos=False, - report_dream_data=False, - report_individual_batch_item_stats=False, - ) - # See Appendix A. - .training( - model_size="S", - training_ratio=1024, - batch_size_B=16 * (num_gpus or 1), - # TODO - model={ - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "S", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/cartpole.py b/rllib/tuned_examples/dreamerv3/cartpole.py deleted file mode 100644 index b270d6c3b3137..0000000000000 --- a/rllib/tuned_examples/dreamerv3/cartpole.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] - -config = ( - DreamerV3Config() - .environment("CartPole-v1") - .training( - model_size="XS", - training_ratio=1024, - # TODO - model={ - "batch_size_B": 16, - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "XS", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py deleted file mode 100644 index a8938ce142123..0000000000000 --- a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. 
Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] --env DMC/[task]/[domain] -# e.g. --env=DMC/cartpole/swingup - -config = ( - DreamerV3Config() - # Use image observations. - .environment(env_config={"from_pixels": True}) - .resources( - num_learner_workers=1, - num_gpus_per_learner_worker=1, - num_cpus_for_local_worker=1, - ) - .rollouts(num_envs_per_worker=4, remote_worker_envs=True) - # See Appendix A. - .training( - model_size="S", - training_ratio=512, - # TODO - model={ - "batch_size_B": 16, - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "S", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py b/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py deleted file mode 100644 index 03e9b40def8a3..0000000000000 --- a/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] - -config = ( - DreamerV3Config() - .environment( - "FrozenLake-v1", - env_config={ - "desc": [ - "SF", - "HG", - ], - "is_slippery": False, - }, - ) - .training( - model_size="XS", - training_ratio=1024, - # TODO - model={ - "batch_size_B": 16, - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "XS", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py b/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py deleted file mode 100644 index 9b7b260d595e9..0000000000000 --- a/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] - -config = ( - DreamerV3Config() - .environment( - "FrozenLake-v1", - env_config={ - "map_name": "4x4", - "is_slippery": False, - }, - ) - .training( - model_size="nano", - training_ratio=1024, - # TODO - model={ - "batch_size_B": 16, - "batch_length_T": 64, - "horizon_H": 15, - "gamma": 0.997, - "model_size": "nano", - }, - ) -) diff --git a/rllib/tuned_examples/dreamerv3/pendulum.py b/rllib/tuned_examples/dreamerv3/pendulum.py deleted file mode 100644 index 4acc4b9aa85a9..0000000000000 --- a/rllib/tuned_examples/dreamerv3/pendulum.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -[1] Mastering Diverse Domains through World Models - 2023 -D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap -https://arxiv.org/pdf/2301.04104v1.pdf - -[2] Mastering Atari with Discrete World Models - 2021 -D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba -https://arxiv.org/pdf/2010.02193.pdf -""" -from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config - -# Run with: -# python run_regression_tests.py --dir [this file] - -config = ( - DreamerV3Config() - .environment("Pendulum-v1") - .training(model_size="XS", training_ratio=1024) -) diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index 0bee53bbd5590..6c9c9badd7a03 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -30,7 +30,6 @@ TRAINING_ITERATION_TIMER = "training_iteration" APPLY_GRADS_TIMER = "apply_grad" COMPUTE_GRADS_TIMER = "compute_grads" -GARBAGE_COLLECTION_TIMER = "garbage_collection" SYNCH_WORKER_WEIGHTS_TIMER = "synch_weights" GRAD_WAIT_TIMER = "grad_wait" SAMPLE_TIMER = "sample" diff --git a/rllib/utils/replay_buffers/episode_replay_buffer.py b/rllib/utils/replay_buffers/episode_replay_buffer.py index e95fc50432489..787c25b1aae01 100644 --- a/rllib/utils/replay_buffers/episode_replay_buffer.py +++ b/rllib/utils/replay_buffers/episode_replay_buffer.py @@ -1,5 +1,4 @@ from collections import deque -import copy from typing import Any, Dict, List, Optional, Union import uuid @@ -110,15 +109,6 @@ def add(self, episodes: Union[List["_Episode"], "_Episode"]): episodes = [episodes] for eps in episodes: - # Make sure we don't change what's coming in from the user. - # TODO (sven): It'd probably be better to make sure in the EnvRunner to not - # hold on to episodes (for metrics purposes only) that we are returning - # back to the user from `EnvRunner.sample()`. Then we wouldn't have to - # do any copying. Instead, either compile the metrics right away on the - # EnvRunner OR compile metrics entirely on the Algorithm side (this is - # actually preferred). - eps = copy.deepcopy(eps) - self._num_timesteps += len(eps) self._num_timesteps_added += len(eps) @@ -252,7 +242,7 @@ def sample( ) episode = self.episodes[episode_idx] - # Starting a new chunk, set is_first to True. + # Starting a new chunk, set continue to False. is_first[B][T] = True # Begin of new batch item (row). @@ -265,7 +255,7 @@ def sample( else: rewards[B].append(episode.rewards[episode_ts - 1]) # We are in the middle of a batch item (row). Concat next episode to this - # row from the next episode's beginning. In other words, we never concat + # row from the episode's beginning. In other words, we never concat # a middle of an episode to another truncated one. 
else: episode_ts = 0 @@ -331,10 +321,6 @@ def get_sampled_timesteps(self) -> int: """Returns number of timesteps that have been sampled in buffer's lifetime.""" return self.sampled_timesteps - def get_added_timesteps(self) -> int: - """Returns number of timesteps that have been added in buffer's lifetime.""" - return self._num_timesteps_added - @override(ReplayBufferInterface) def get_state(self) -> Dict[str, Any]: return { @@ -343,7 +329,6 @@ def get_state(self) -> Dict[str, Any]: "_num_episodes_evicted": self._num_episodes_evicted, "_indices": self._indices, "_num_timesteps": self._num_timesteps, - "_num_timesteps_added": self._num_timesteps_added, "sampled_timesteps": self.sampled_timesteps, } @@ -356,7 +341,6 @@ def set_state(self, state) -> None: self._num_episodes_evicted = state["_num_episodes_evicted"] self._indices = state["_indices"] self._num_timesteps = state["_num_timesteps"] - self._num_timesteps_added = state["_num_timesteps_added"] self.sampled_timesteps = state["sampled_timesteps"] @@ -372,9 +356,8 @@ def __init__( actions=None, rewards=None, states=None, - t: int = 0, - is_terminated: bool = False, - is_truncated: bool = False, + is_terminated=False, + is_truncated=False, render_images=None, ): self.id_ = id_ or uuid.uuid4().hex @@ -387,9 +370,6 @@ def __init__( # h-states: t0 (in case this episode is a continuation chunk, we need to know # about the initial h) to T. self.states = states - # The global last timestep of the episode and the timesteps when this chunk - # started. - self.t = self.t_started = t # obs[-1] is the final observation in the episode. self.is_terminated = is_terminated # obs[-1] is the last obs in a truncated-by-the-env episode (there will no more @@ -401,18 +381,13 @@ def __init__( self.render_images = [] if render_images is None else render_images def concat_episode(self, episode_chunk: "_Episode"): - """Adds the given `episode_chunk` to the right side of self.""" assert episode_chunk.id_ == self.id_ assert not self.is_done - # Make sure the timesteps match. - assert self.t == episode_chunk.t_started episode_chunk.validate() # Make sure, end matches other episode chunk's beginning. assert np.all(episode_chunk.observations[0] == self.observations[-1]) - # Make sure the timesteps match (our last t should be the same as their first). - assert self.t == episode_chunk.t_started # Pop out our end. self.observations.pop() @@ -421,7 +396,6 @@ def concat_episode(self, episode_chunk: "_Episode"): self.observations.extend(list(episode_chunk.observations)) self.actions.extend(list(episode_chunk.actions)) self.rewards.extend(list(episode_chunk.rewards)) - self.t = episode_chunk.t self.states = episode_chunk.states if episode_chunk.is_terminated: @@ -431,21 +405,6 @@ def concat_episode(self, episode_chunk: "_Episode"): # Validate. self.validate() - def add_initial_observation( - self, *, initial_observation, initial_state=None, initial_render_image=None - ): - assert not self.is_done - assert len(self.observations) == 0 - # Assume that this episode is completely empty and has not stepped yet. - # Leave self.t (and self.t_started) at 0. - assert self.t == self.t_started == 0 - - self.observations.append(initial_observation) - self.states = initial_state - if initial_render_image is not None: - self.render_images.append(initial_render_image) - self.validate() - def add_timestep( self, observation, @@ -457,25 +416,34 @@ def add_timestep( is_truncated=False, render_image=None, ): - # Cannot add data to an already done episode. 
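The `_Episode` hunks in this file restore a leaner chunk API (no `t`/`t_started` bookkeeping and no `create_successor()`). A minimal lifecycle sketch; the observation, action, and reward values are purely illustrative:

import numpy as np

eps = _Episode()
eps.add_initial_observation(initial_observation=np.array([0.0, 0.0], dtype=np.float32))
eps.add_timestep(np.array([0.1, 0.0], dtype=np.float32), 1, 1.0)
eps.add_timestep(np.array([0.2, 0.1], dtype=np.float32), 0, 1.0, is_terminated=True)
assert len(eps) == 2           # timesteps taken, i.e. len(observations) - 1
batch = eps.to_sample_batch()  # finished chunks convert to a SampleBatch for training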
         assert not self.is_done
         self.observations.append(observation)
         self.actions.append(action)
         self.rewards.append(reward)
         self.states = state
-        self.t += 1
         if render_image is not None:
             self.render_images.append(render_image)
         self.is_terminated = is_terminated
         self.is_truncated = is_truncated
         self.validate()

+    def add_initial_observation(
+        self, *, initial_observation, initial_state=None, initial_render_image=None
+    ):
+        assert not self.is_done
+        assert len(self.observations) == 0
+
+        self.observations.append(initial_observation)
+        self.states = initial_state
+        if initial_render_image is not None:
+            self.render_images.append(initial_render_image)
+        self.validate()
+
     def validate(self):
         # Make sure we always have one more obs stored than rewards (and actions)
         # due to the reset and last-obs logic of an MDP.
         assert len(self.observations) == len(self.rewards) + 1 == len(self.actions) + 1
-        assert len(self.rewards) == (self.t - self.t_started)

         # Convert all lists to numpy arrays, if we are terminated.
         if self.is_done:
@@ -486,43 +454,8 @@ def validate(self):

     @property
     def is_done(self):
-        """Whether the episode is actually done (terminated or truncated).
-
-        A done episode cannot be continued via `self.add_timestep()` or being
-        concatenated on its right-side with another episode chunk or being
-        succeeded via `self.create_successor()`.
-        """
         return self.is_terminated or self.is_truncated

-    def create_successor(self) -> "_Episode":
-        """Returns a successor episode chunk (of len=0) continuing with this one.
-
-        The successor will have the same ID and state as self and its only observation
-        will be the last observation in self. Its length will therefore be 0 (no
-        steps taken yet).
-
-        This method is useful if you would like to discontinue building an episode
-        chunk (b/c you have to return it from somewhere), but would like to have a new
-        episode (chunk) instance to continue building the actual env episode at a later
-        time.
-
-        Returns:
-            The successor Episode chunk of this one with the same ID and state and the
-            only observation being the last observation in self.
-        """
-        assert not self.is_done
-
-        return _Episode(
-            # Same ID.
-            id_=self.id_,
-            # First (and only) observation of successor is this episode's last obs.
-            observations=[self.observations[-1]],
-            # Same state.
-            states=self.states,
-            # Continue with self's current timestep.
-            t=self.t,
-        )
-
     def to_sample_batch(self):
         return SampleBatch(
             {
@@ -564,8 +497,6 @@ def get_state(self):
                 "actions": self.actions,
                 "rewards": self.rewards,
                 "states": self.states,
-                "t_started": self.t_started,
-                "t": self.t,
                 "is_terminated": self.is_terminated,
                 "is_truncated": self.is_truncated,
             }.items()
@@ -578,16 +509,14 @@ def from_state(state):
         eps.actions = state[2][1]
         eps.rewards = state[3][1]
         eps.states = state[4][1]
-        eps.t_started = state[5][1]
-        eps.t = state[6][1]
-        eps.is_terminated = state[7][1]
-        eps.is_truncated = state[8][1]
+        eps.is_terminated = state[5][1]
+        eps.is_truncated = state[6][1]
         return eps

     def __len__(self):
         assert len(self.observations) > 0, (
             "ERROR: Cannot determine length of episode that hasn't started yet! "
             "Call `_Episode.add_initial_observation(initial_observation=...)` first "
             "(after which `len(_Episode)` will be 0)."
) return len(self.observations) - 1 diff --git a/rllib/utils/tf_utils.py b/rllib/utils/tf_utils.py index 3acbbad004c0f..7b43953c5b67f 100644 --- a/rllib/utils/tf_utils.py +++ b/rllib/utils/tf_utils.py @@ -679,7 +679,7 @@ def two_hot( # First make sure, values are clipped. value = tf.clip_by_value(value, lower_bound, upper_bound) # Tensor of batch indices: [0, B=batch size). - batch_indices = tf.range(0, tf.shape(value)[0], dtype=tf.float32) + batch_indices = tf.range(0, value.shape[0], dtype=tf.float32) # Calculate the step deltas (how much space between each bucket's central value?). bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1) # Compute the float indices (might be non-int numbers: sitting between two buckets). @@ -690,12 +690,12 @@ def two_hot( kp1 = tf.math.ceil(idx) # In case k == kp1 (idx is exactly on the bucket boundary), move kp1 up by 1.0. # Otherwise, this would result in a NaN in the returned two-hot tensor. - kp1 = tf.where(tf.equal(k, kp1), kp1 + 1.0, kp1) + kp1 = tf.where(k == kp1, kp1 + 1.0, kp1) # Iff `kp1` is one beyond our last index (because incoming value is larger than # `upper_bound`), move it to one before k (kp1's weight is going to be 0.0 anyways, # so it doesn't matter where it points to; we are just avoiding an index error # with this). - kp1 = tf.where(tf.equal(kp1, num_buckets), kp1 - 2.0, kp1) + kp1 = tf.where(kp1 == num_buckets, kp1 - 2.0, kp1) # The actual values found at k and k+1 inside the set of buckets. values_k = lower_bound + k * bucket_delta values_kp1 = lower_bound + kp1 * bucket_delta
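For readers unfamiliar with the `two_hot()` utility touched in the last hunk: two-hot encoding represents a clipped scalar as weights over the two neighboring buckets of an evenly spaced grid, so that the weighted sum of the two bucket values recovers the scalar. The sketch below is a standalone NumPy re-implementation for illustration only; the name `two_hot_np`, the default bounds/bucket count, and the exact weight formula are assumptions, not RLlib's TF code.

import numpy as np

def two_hot_np(value, num_buckets=255, lower_bound=-20.0, upper_bound=20.0):
    # Clip incoming (1D) values into the supported range.
    value = np.clip(np.asarray(value, dtype=np.float64), lower_bound, upper_bound)
    bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1)
    # Fractional bucket index of each value.
    idx = (value - lower_bound) / bucket_delta
    k = np.floor(idx)
    kp1 = np.ceil(idx)
    # If idx sits exactly on a bucket, floor == ceil; nudge kp1 so k and kp1 differ.
    kp1 = np.where(k == kp1, kp1 + 1.0, kp1)
    # If kp1 ran past the last bucket, point it at a valid index (its weight is 0.0).
    kp1 = np.where(kp1 == num_buckets, kp1 - 2.0, kp1)
    values_k = lower_bound + k * bucket_delta
    values_kp1 = lower_bound + kp1 * bucket_delta
    # Bucket k's weight grows as the value moves away from bucket kp1 (and vice versa).
    weights_k = (value - values_kp1) / (values_k - values_kp1)
    weights_kp1 = 1.0 - weights_k
    out = np.zeros((value.shape[0], num_buckets))
    rows = np.arange(value.shape[0])
    out[rows, k.astype(int)] = weights_k
    out[rows, kp1.astype(int)] = weights_kp1
    return out

# Each row's weights sum to 1.0, and the weighted sum of bucket values
# reconstructs the original (clipped) scalar.
print(two_hot_np([0.0, 0.1]).sum(axis=1))  # -> [1. 1.]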