pytorch · vmoens · Sep 23, 2022 · Aug 30, 2022 · Aug 30, 2022 · Aug 31, 2022
diff --git a/test/test_modules.py b/test/test_modules.py
@@ -8,6 +8,7 @@
 import pytest
 import torch
 from _utils_internal import get_available_devices
+from mocking_classes import MockBatchedUnLockedEnv
 from torch import nn
 from torchrl.data import TensorDict
 from torchrl.data.tensor_specs import OneHotDiscreteTensorSpec
@@ -18,6 +19,7 @@
  ValueOperator,
  ProbabilisticActor,
  LSTMNet,
+ CEMPlanner,
 )
 from torchrl.modules.functional_modules import (
  FunctionalModule,
@@ -326,6 +328,31 @@ def test_func_transformer(self):
  torch.testing.assert_close(fmodule(params, buffers, x, x), module(x, x))
 
 
+class TestPlanner:
+    """Tests for the MPC planner modules."""
+
+    @pytest.mark.parametrize("device", get_available_devices())
+    @pytest.mark.parametrize("batch_size", [3, 5])
+    def test_CEM_model_free_env(self, device, batch_size, seed=1):
+        """Run a CEMPlanner on a mock env and check the produced action.
+
+        The action must have the shape given by the env action spec (after
+        the leading batch dimension), must belong to the action spec, and
+        every entry other than "action" must be left unchanged by the call.
+        """
+        env = MockBatchedUnLockedEnv(device=device)
+        torch.manual_seed(seed)
+        cem_kwargs = dict(
+            planning_horizon=10,
+            optim_steps=2,
+            num_candidates=100,
+            num_top_k_candidates=2,
+        )
+        planner = CEMPlanner(env, **cem_kwargs)
+
+        initial_td = env.reset(TensorDict({}, batch_size=batch_size))
+        # Snapshot taken before planning, used to detect unwanted changes.
+        reference = initial_td.clone()
+        planned_td = planner(initial_td)
+
+        action = planned_td.get("action")
+        assert action.shape[1:] == env.action_spec.shape
+
+        assert env.action_spec.is_in(action)
+
+        for key in planned_td.keys():
+            if key == "action":
+                continue
+            assert torch.allclose(planned_td[key], reference[key])
+
+
 if __name__ == "__main__":
  args, unknown = argparse.ArgumentParser().parse_known_args()
  pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)
diff --git a/torchrl/modules/__init__.py b/torchrl/modules/__init__.py
@@ -6,3 +6,4 @@
 from .distributions import *
 from .models import *
 from .tensordict_module import *
+from .planners import *
diff --git a/torchrl/modules/planners/__init__.py b/torchrl/modules/planners/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .common import *
+from .cem import *
diff --git a/torchrl/modules/planners/cem.py b/torchrl/modules/planners/cem.py
@@ -0,0 +1,127 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from torchrl.data.tensordict.tensordict import TensorDictBase
+from torchrl.envs import EnvBase
+from torchrl.modules.planners import MPCPlannerBase
+
+__all__ = ["CEMPlanner"]
+
+
+class CEMPlanner(MPCPlannerBase):
+    """CEMPlanner Module.
+
+    Reference: The cross-entropy method for optimization, Botev et al. 2013
+
+    This module performs a CEM planning step when given a TensorDict
+    containing initial states. Candidate action sequences are sampled from a
+    Gaussian distribution that starts with zero mean and unit standard
+    deviation, and are projected onto the action spec. Each candidate
+    sequence is rolled out in the environment and the cumulative rewards of
+    the rollouts are ranked. The top-k candidates are then used to update the
+    mean and standard deviation of the action distribution. This update is
+    repeated ``optim_steps`` times.
+
+    A call to the module returns the first action of the mean action sequence
+    obtained after the last update (the initial zero mean if ``optim_steps``
+    is 0).
+
+    Args:
+        env (EnvBase): The environment to perform the planning step on (can be
+            ``ModelBasedEnv`` or ``EnvBase``).
+        planning_horizon (int): The length of the simulated trajectories.
+        optim_steps (int): The number of optimization steps used by the MPC
+            planner.
+        num_candidates (int): The number of candidates to sample from the
+            Gaussian distributions.
+        num_top_k_candidates (int): The number of top candidates to use to
+            update the mean and standard deviation of the Gaussian
+            distribution. With a single top candidate the standard deviation
+            is set to zero.
+        reward_key (str, optional): The key in the TensorDict to use to
+            retrieve the reward. Defaults to ``"reward"``.
+        action_key (str, optional): The key in the TensorDict to use to store
+            the action. Defaults to ``"action"``.
+    """
+
+    def __init__(
+        self,
+        env: EnvBase,
+        planning_horizon: int,
+        optim_steps: int,
+        num_candidates: int,
+        num_top_k_candidates: int,
+        reward_key: str = "reward",
+        action_key: str = "action",
+    ):
+        super().__init__(env=env, action_key=action_key)
+        self.planning_horizon = planning_horizon
+        self.optim_steps = optim_steps
+        self.num_candidates = num_candidates
+        self.num_top_k_candidates = num_top_k_candidates
+        self.reward_key = reward_key
+
+    def planning(self, tensordict: TensorDictBase) -> torch.Tensor:
+        """Run the CEM optimization and return the first planned action.
+
+        Args:
+            tensordict (TensorDictBase): the initial states to plan from.
+
+        Returns:
+            A tensor of shape ``(*tensordict.batch_size, *action_spec.shape)``
+            holding the first step of the final mean action sequence.
+        """
+        batch_size = tensordict.batch_size
+        # One copy of every initial state per candidate, flattened so that the
+        # environment sees a single batch dimension.
+        expanded_original_tensordict = (
+            tensordict.unsqueeze(-1)
+            .expand(*batch_size, self.num_candidates)
+            .reshape(-1)
+        )
+        flatten_batch_size = batch_size.numel()
+        device = tensordict.device
+        dtype = self.env.action_spec.dtype
+        action_shape = self.action_spec.shape
+
+        # Distribution statistics are shared by all candidates of a batch
+        # element, hence the singleton candidate dimension.
+        stats_shape = (flatten_batch_size, 1, self.planning_horizon, *action_shape)
+        actions_means = torch.zeros(stats_shape, device=device, dtype=dtype)
+        actions_stds = torch.ones(stats_shape, device=device, dtype=dtype)
+
+        candidates_shape = (
+            flatten_batch_size,
+            self.num_candidates,
+            self.planning_horizon,
+            *action_shape,
+        )
+        # Row index used to gather the top-k candidates. It is loop-invariant
+        # and built on the same device as the actions it indexes.
+        batch_index = torch.arange(flatten_batch_size, device=device).unsqueeze(1)
+
+        for _ in range(self.optim_steps):
+            actions = actions_means + actions_stds * torch.randn(
+                candidates_shape,
+                device=device,
+                dtype=dtype,
+            )
+            actions = actions.flatten(0, 1)
+            actions = self.env.action_spec.project(actions)
+            optim_tensordict = expanded_original_tensordict.to_tensordict()
+            policy = PrecomputedActionsSequentialSetter(actions)
+            optim_tensordict = self.env.rollout(
+                max_steps=self.planning_horizon,
+                policy=policy,
+                auto_reset=False,
+                tensordict=optim_tensordict,
+            )
+            # Cumulative reward of every candidate, one row per batch element.
+            rewards = (
+                optim_tensordict.get(self.reward_key)
+                .sum(dim=1)
+                .reshape(flatten_batch_size, self.num_candidates)
+            )
+            _, top_k = rewards.topk(self.num_top_k_candidates, dim=1)
+
+            best_actions = actions.unflatten(
+                0, (flatten_batch_size, self.num_candidates)
+            )
+            best_actions = best_actions[batch_index, top_k]
+            actions_means = best_actions.mean(dim=1, keepdim=True)
+            if self.num_top_k_candidates > 1:
+                actions_stds = best_actions.std(dim=1, keepdim=True)
+            else:
+                # The Bessel-corrected std of a single sample is NaN, which
+                # would turn every candidate of the next step into NaN.
+                actions_stds = torch.zeros_like(actions_means)
+        return actions_means[:, :, 0].reshape(*batch_size, *action_shape)
+
+
+class PrecomputedActionsSequentialSetter:
+    """Callable that replays a precomputed sequence of actions.
+
+    Every call writes the next slice ``actions[:, i]`` under the ``"action"``
+    entry of the tensordict it receives, then advances an internal counter.
+
+    Args:
+        actions: the precomputed actions, indexed along their second
+            dimension (one index per call).
+    """
+
+    def __init__(self, actions):
+        self.actions = actions
+        # Index of the next slice to hand out.
+        self.cmpt = 0
+
+    def __call__(self, td):
+        """Set the next precomputed action in ``td`` and return the result.
+
+        Raises:
+            ValueError: if all the precomputed actions have been consumed.
+        """
+        step = self.cmpt
+        if self.actions.shape[1] <= step:
+            raise ValueError("Precomputed actions are too short")
+        updated = td.set("action", self.actions[:, step])
+        # The counter only moves once the action has been written.
+        self.cmpt = step + 1
+        return updated
diff --git a/torchrl/modules/planners/common.py b/torchrl/modules/planners/common.py
@@ -0,0 +1,69 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import abc
+from typing import Optional
+
+import torch
+
+from torchrl.data.tensordict.tensordict import TensorDictBase
+from torchrl.envs import EnvBase
+from torchrl.modules import TensorDictModule
+
+__all__ = ["MPCPlannerBase"]
+
+
+class MPCPlannerBase(TensorDictModule, metaclass=abc.ABCMeta):
+    """Abstract base class for Model Predictive Control (MPC) planners.
+
+    The class inherits from TensorDictModule. When called on a TensorDict it
+    runs :meth:`planning`, projects the returned action onto the environment
+    action spec and passes it to ``_write_to_tensordict`` (the module is
+    built with ``out_keys=[action_key]``).
+
+    Args:
+        env (EnvBase): The environment to perform the planning step on (can
+            be ModelBasedEnv or EnvBase).
+        action_key (str, optional): The key that will point to the computed
+            action. Defaults to ``"action"``.
+
+    Raises:
+        ValueError: if ``env.batch_locked`` is true.
+    """
+
+    def __init__(
+        self,
+        env: EnvBase,
+        action_key: str = "action",
+    ):
+        # A batch-locked env cannot take inputs of arbitrary batch size,
+        # which the planners require.
+        if env.batch_locked:
+            raise ValueError(
+                "Environment is batch_locked. MPCPlanners need an environnement "
+                "that accepts batched inputs with any batch size"
+            )
+        super().__init__(
+            env,
+            in_keys=list(env.observation_spec.keys()),
+            out_keys=[action_key],
+        )
+        self.env = env
+        self.action_spec = env.action_spec
+        self.to(env.device)
+
+    @abc.abstractmethod
+    def planning(self, td: TensorDictBase) -> torch.Tensor:
+        """Perform the MPC planning step.
+
+        Args:
+            td (TensorDict): The TensorDict to perform the planning step on.
+
+        Returns:
+            The planned action; ``forward`` projects it onto the action spec.
+        """
+        raise NotImplementedError()
+
+    def forward(
+        self,
+        tensordict: TensorDictBase,
+        tensordict_out: Optional[TensorDictBase] = None,
+        **kwargs,
+    ) -> TensorDictBase:
+        """Plan an action for ``tensordict`` and write it to the output.
+
+        Raises:
+            ValueError: if ``params`` or ``vmap`` is passed as a keyword
+                argument.
+        """
+        unsupported = ("params", "vmap")
+        if any(name in kwargs for name in unsupported):
+            raise ValueError("MPCPlannerBase does not support params or vmap for now.")
+        projected_action = self.action_spec.project(self.planning(tensordict))
+        return self._write_to_tensordict(
+            tensordict,
+            (projected_action,),
+            tensordict_out,
+        )
diff --git a/torchrl/trainers/helpers/envs.py b/torchrl/trainers/helpers/envs.py
@@ -161,7 +161,7 @@ def make_env_transforms(
  key
  for key in env.observation_spec.keys()
  if ("pixels" not in key)
- and (key.strip("next_") not in env.input_spec.keys())
+ and (key.replace("next_", "") not in env.input_spec.keys())
  ]
 
  # even if there is a single tensor, it'll be renamed in "next_observation_vector"