[Feature]: Support for planners and CEM #384

Merged: 60 commits, Sep 23, 2022
Changes shown from 47 commits

Commits
e5d3ed1
Added MPC planner
nicolas-dufour Aug 30, 2022
9098b39
Added CEM planning
nicolas-dufour Aug 30, 2022
fb43687
Planner refactoring
nicolas-dufour Aug 31, 2022
1bf3ba4
Bug fixes
nicolas-dufour Sep 5, 2022
316223d
Fixes
nicolas-dufour Sep 6, 2022
0c1ad85
Added proposed fixes and tests
nicolas-dufour Sep 7, 2022
dc655b8
Merge branch 'main' into mpcp
nicolas-dufour Sep 7, 2022
09fa7e7
Added stateful vs stateless distinction in EnvBase
nicolas-dufour Sep 7, 2022
7eea053
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 7, 2022
2940958
Fixed mock
nicolas-dufour Sep 7, 2022
18ce61f
Renamed is stateful in batch_locked
nicolas-dufour Sep 8, 2022
ff5330a
Reverted gym for CI stability
nicolas-dufour Sep 8, 2022
7f706cb
reverted gym version to pass CI
nicolas-dufour Sep 8, 2022
5d96e19
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 8, 2022
b13b93b
Changed is_stateful
nicolas-dufour Sep 8, 2022
608d5ba
Changed batched_lock to be a property that can't be changed and added…
nicolas-dufour Sep 9, 2022
27e9001
Formatting
nicolas-dufour Sep 9, 2022
15ec558
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 9, 2022
3e2a33d
merged batch locked
nicolas-dufour Sep 9, 2022
8d9dc8f
Changed to _new_
nicolas-dufour Sep 9, 2022
c758e13
fixed MockBatch_env
nicolas-dufour Sep 9, 2022
d0e13a3
fixed MockBatch_env
nicolas-dufour Sep 9, 2022
2fed37a
Added BatchedEnv and fixed batch_locked
nicolas-dufour Sep 12, 2022
58009d8
Changed for TransformedEnv
nicolas-dufour Sep 12, 2022
5a79da2
Added test from Transformed Env
nicolas-dufour Sep 12, 2022
65fe441
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 12, 2022
b3533e8
updated test
nicolas-dufour Sep 12, 2022
148f3f5
Formatting
nicolas-dufour Sep 12, 2022
6f3059c
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 12, 2022
aecbad2
Reverted gym downgrade
nicolas-dufour Sep 13, 2022
cc183ca
Merge branch 'main' into statefull_stateless
vmoens Sep 13, 2022
e05f963
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 13, 2022
f3766d3
Changed expand
nicolas-dufour Sep 13, 2022
88b537f
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 13, 2022
8e25244
Made suggested fix
nicolas-dufour Sep 14, 2022
dc1239c
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 14, 2022
3e6bc26
Removed attribute from init
nicolas-dufour Sep 14, 2022
fef3f3f
Fixed expand in planner
nicolas-dufour Sep 14, 2022
9ca7ff1
fixed TransformedEnv
nicolas-dufour Sep 14, 2022
6dcf7bd
Merge branch 'statefull_stateless' into mpcp
nicolas-dufour Sep 14, 2022
ed716a5
Merge branch 'main' into mpcp
nicolas-dufour Sep 16, 2022
e085bdf
Fixed in_place cls parameter passing
nicolas-dufour Sep 16, 2022
e84524a
Update common.py
nicolas-dufour Sep 16, 2022
fc5ca15
Merge branch 'main' into fix_in_place
nicolas-dufour Sep 16, 2022
926d818
Merge branch 'main' into fix_in_place
nicolas-dufour Sep 21, 2022
afdf9f3
Merge branch 'fix_in_place' of github.com:nicolas-dufour/torchrl into…
nicolas-dufour Sep 21, 2022
d941f2a
Merge branch 'fix_in_place' into mpcp
nicolas-dufour Sep 21, 2022
f8f2a9d
Merge branch 'main' of github.com:facebookresearch/rl into mpcp
nicolas-dufour Sep 22, 2022
bda0e59
Added requested changes
nicolas-dufour Sep 22, 2022
8f66c59
Doc fixes
nicolas-dufour Sep 23, 2022
4dd923f
fixed test
nicolas-dufour Sep 23, 2022
aecafa3
Fix test on gpu
nicolas-dufour Sep 23, 2022
1a6c6f1
Ran precommit
nicolas-dufour Sep 23, 2022
7a10cca
Merge branch 'main' into mpcp
nicolas-dufour Sep 23, 2022
182245a
fixed gpu tests
nicolas-dufour Sep 23, 2022
5934220
linting
nicolas-dufour Sep 23, 2022
e812ad2
Update common.py
vmoens Sep 23, 2022
82ce1b1
Merge branch 'main' into mpcp
vmoens Sep 23, 2022
9796a33
example in ds
vmoens Sep 23, 2022
7f54b8d
Merge branch 'main' into mpcp
vmoens Sep 23, 2022
27 changes: 27 additions & 0 deletions test/test_modules.py
@@ -8,6 +8,7 @@
import pytest
import torch
from _utils_internal import get_available_devices
from mocking_classes import MockBatchedUnLockedEnv
from torch import nn
from torchrl.data import TensorDict
from torchrl.data.tensor_specs import OneHotDiscreteTensorSpec
@@ -18,6 +19,7 @@
ValueOperator,
ProbabilisticActor,
LSTMNet,
CEMPlanner,
)
from torchrl.modules.functional_modules import (
FunctionalModule,
@@ -326,6 +328,31 @@ def test_func_transformer(self):
torch.testing.assert_close(fmodule(params, buffers, x, x), module(x, x))


class TestPlanner:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("batch_size", [3, 5])
def test_CEM_model_free_env(self, device, batch_size, seed=1):
env = MockBatchedUnLockedEnv(device=device)
env.set_seed(seed)
planner = CEMPlanner(
env,
planning_horizon=10,
optim_steps=2,
num_candidates=100,
num_top_k_candidates=2,
).to(device)
td = env.reset(TensorDict({}, batch_size=batch_size)).to(device)
td_copy = td.clone()
td = planner(td)
assert td.get("action").shape[1:] == env.action_spec.shape

assert env.action_spec.is_in(td.get("action"))

for key in td.keys():
if key != "action":
assert torch.allclose(td[key], td_copy[key])
Contributor:
Shouldn't you also test the values of the action?

Contributor Author:
What should I test them for? I'm already testing that they exist and have the right shape, but I don't have any information that would allow testing the values.
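One possible follow-up, sketched here rather than taken from the PR: pin the values down indirectly with a reproducibility check. Assuming MockBatchedUnLockedEnv steps deterministically for a fixed seed (an assumption, not verified here), two planner calls built from the same seeds should yield identical actions:

```python
# Hypothetical extra test for TestPlanner, not part of this PR.
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("batch_size", [3, 5])
def test_CEM_determinism(self, device, batch_size, seed=1):
    def plan_once():
        env = MockBatchedUnLockedEnv(device=device)
        env.set_seed(seed)
        planner = CEMPlanner(
            env,
            planning_horizon=10,
            optim_steps=2,
            num_candidates=100,
            num_top_k_candidates=2,
        ).to(device)
        td = env.reset(TensorDict({}, batch_size=batch_size)).to(device)
        torch.manual_seed(seed)  # CEM samples its candidates with torch.randn
        return planner(td).get("action")

    torch.testing.assert_close(plan_once(), plan_once())
```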



if __name__ == "__main__":
args, unknown = argparse.ArgumentParser().parse_known_args()
pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)
4 changes: 2 additions & 2 deletions torchrl/envs/common.py
@@ -226,12 +226,12 @@ def __init__(
self.batch_size = torch.Size([])

@classmethod
def __new__(cls, *args, _batch_locked=True, **kwargs):
def __new__(cls, *args, _inplace_update=False, _batch_locked=True, **kwargs):
# inplace update will write tensors in-place on the provided tensordict.
# This is risky, especially if gradients need to be passed (in-place copy
# for tensors that are part of computational graphs will result in an error).
# It can also lead to inconsistencies when calling rollout.
cls._inplace_update = False
cls._inplace_update = _inplace_update
cls._batch_locked = _batch_locked
return super().__new__(cls)

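With both flags exposed through __new__, a subclass or wrapper can declare its behaviour by forwarding them, exactly as TransformedEnv does in the next file. A minimal sketch for a hypothetical batch-unlocked environment (the class name and the omission of the abstract _step/_reset methods are illustrative; only the __new__ signature comes from this hunk):

```python
from torchrl.envs import EnvBase

class MyStatelessEnv(EnvBase):  # hypothetical example, not part of this PR
    def __new__(cls, *args, **kwargs):
        # Advertise that this env accepts input tensordicts of any batch size
        # (not batch-locked) and never writes into them in place.
        return super().__new__(
            cls, *args, _inplace_update=False, _batch_locked=False, **kwargs
        )
```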
10 changes: 10 additions & 0 deletions torchrl/envs/transforms/transforms.py
@@ -312,6 +312,16 @@ def __init__(
self._observation_spec = None
self.batch_size = self.base_env.batch_size

def __new__(cls, env, *args, **kwargs):
return super().__new__(
cls,
env,
*args,
_inplace_update=env._inplace_update,
_batch_locked=env.batch_locked,
**kwargs,
)

def _set_env(self, env: EnvBase, device) -> None:
self.base_env = env.to(device)
# updates need not be inplace, as transforms may modify values out-place
1 change: 1 addition & 0 deletions torchrl/modules/__init__.py
@@ -6,3 +6,4 @@
from .distributions import *
from .models import *
from .tensordict_module import *
from .planners import *
7 changes: 7 additions & 0 deletions torchrl/modules/planners/__init__.py
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .common import *
from .cem import *
130 changes: 130 additions & 0 deletions torchrl/modules/planners/cem.py
@@ -0,0 +1,130 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch

from torchrl.data.tensordict.tensordict import TensorDictBase
from torchrl.envs import EnvBase
from torchrl.modules.planners import MPCPlannerBase

__all__ = ["CEMPlanner"]


class CEMPlanner(MPCPlannerBase):
"""
CEMPlanner Module. This class inherits from TensorDictModule.

Provided a TensorDict, this module will perform a CEM planning step.
The CEM planning step is performed by sampling actions from a Gaussian distribution with zero mean and unit variance.
The actions are then used to perform a rollout in the environment.
The rewards are then used to update the mean and standard deviation of the Gaussian distribution.
The mean and standard deviation of the Gaussian distribution are then used to sample actions for the next planning step.
The CEM planning step is repeated for a specified number of steps.
At the end, we recover the best action which is the one that maximizes the reward given a planning horizon.

Args:
env (Environment): The environment to perform the planning step on (Can be ModelBasedEnv or EnvBase).
planning_horizon (int): The number of steps to perform the planning step for.
optim_steps (int): The number of steps to perform the MPC planning step for.
num_candidates (int): The number of candidates to sample from the Gaussian distribution.
num_top_k_candidates (int): The number of top candidates to use to update the mean and standard deviation of the Gaussian distribution.
reward_key (str): The key in the TensorDict to use to retrieve the reward.
action_key (str): The key in the TensorDict to use to store the action.

Returns:
TensorDict: The TensorDict with the action added.
"""

def __init__(
self,
env: EnvBase,
planning_horizon: int,
optim_steps: int,
num_candidates: int,
num_top_k_candidates: int,
reward_key: str = "reward",
action_key: str = "action",
):
super().__init__(env=env, action_key=action_key)
self.planning_horizon = planning_horizon
self.optim_steps = optim_steps
self.num_candidates = num_candidates
self.num_top_k_candidates = num_top_k_candidates
self.reward_key = reward_key

def planning(self, td: TensorDictBase) -> torch.Tensor:
Contributor:
Do we want to propagate gradients here, or do we want to set a no_grad decorator?
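For context on this question, torch.no_grad() used as a decorator disables graph construction for everything inside the call, which is what a no_grad decorator on planning() would do. A standalone illustration (plan is a hypothetical stand-in, not the method above):

```python
import torch

@torch.no_grad()  # autograd is switched off for the whole call
def plan(obs):
    # stand-in for an expensive planning rollout
    return (obs * 2.0).sum()

obs = torch.ones(3, requires_grad=True)
value = plan(obs)
print(value.requires_grad)  # False: no graph was built through the planner
```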

batch_size = td.batch_size
expanded_original_td = (
td.unsqueeze(-1)
.expand(*batch_size, self.num_candidates)
.contiguous()
.view(-1)
)
flatten_batch_size = batch_size.numel()
actions_means = torch.zeros(
flatten_batch_size,
1,
self.planning_horizon,
*self.action_spec.shape,
device=td.device,
Contributor:
FYI: soon we'll have tensordicts with device=None. No need to change anything here, but keep it in mind when you call tensordict.device.

dtype=self.env.action_spec.dtype,
)
actions_stds = torch.ones(
flatten_batch_size,
1,
self.planning_horizon,
*self.action_spec.shape,
device=td.device,
dtype=self.env.action_spec.dtype,
)

for _ in range(self.optim_steps):
actions = actions_means + actions_stds * torch.randn(
flatten_batch_size,
self.num_candidates,
self.planning_horizon,
*self.action_spec.shape,
device=td.device,
dtype=self.env.action_spec.dtype,
)
actions = actions.flatten(0, 1)
actions = self.env.action_spec.project(actions)
optim_td = expanded_original_td.to_tensordict()
policy = PrecomputedActionsSequentialSetter(actions)
optim_td = self.env.rollout(
max_steps=self.planning_horizon,
policy=policy,
auto_reset=False,
tensordict=optim_td,
)
rewards = (
optim_td.get(self.reward_key)
.sum(dim=1)
.reshape(flatten_batch_size, self.num_candidates)
)
_, top_k = rewards.topk(self.num_top_k_candidates, dim=1)

best_actions = actions.unflatten(
0, (flatten_batch_size, self.num_candidates)
)
best_actions = best_actions[
torch.arange(flatten_batch_size).unsqueeze(1), top_k
]
actions_means = best_actions.mean(dim=1, keepdim=True)
actions_stds = best_actions.std(dim=1, keepdim=True)
return (actions_means[:, :, 0]).reshape(*batch_size, *self.action_spec.shape)


class PrecomputedActionsSequentialSetter:
def __init__(self, actions):
self.actions = actions
self.cmpt = 0

def __call__(self, td):
if self.cmpt >= self.actions.shape[1]:
raise ValueError("Precomputed actions are too short")
td = td.set("action", self.actions[:, self.cmpt])
self.cmpt += 1
return td
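Distilled from the new test in test_modules.py above, a minimal usage sketch for the planner added in this file. The constructor arguments, MockBatchedUnLockedEnv and the reset call are taken from this PR; the device, seed and batch size are arbitrary, and the mock env is only importable from the test directory:

```python
from torchrl.data import TensorDict
from torchrl.modules import CEMPlanner
from mocking_classes import MockBatchedUnLockedEnv  # test-only env used in this PR

env = MockBatchedUnLockedEnv(device="cpu")
env.set_seed(0)

planner = CEMPlanner(
    env,
    planning_horizon=10,       # steps simulated for each candidate sequence
    optim_steps=2,             # CEM refinement iterations
    num_candidates=100,        # action sequences sampled per iteration
    num_top_k_candidates=2,    # elites used to refit the Gaussian
)

td = env.reset(TensorDict({}, batch_size=[3]))
td = planner(td)  # writes the planned first action under the "action" key
print(td.get("action").shape)
```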
72 changes: 72 additions & 0 deletions torchrl/modules/planners/common.py
@@ -0,0 +1,72 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import abc
from typing import Optional

import torch

from torchrl.data.tensordict.tensordict import TensorDictBase
from torchrl.envs import EnvBase
from torchrl.modules import TensorDictModule

__all__ = ["MPCPlannerBase"]


class MPCPlannerBase(TensorDictModule, metaclass=abc.ABCMeta):
"""
MPCPlannerBase Module. This is an abstract class and must be implemented by the user.
Contributor:
You can just say it is an abstract class. Also I would remove this from the headline, or just say "MPCPlannerBase base module", which says it all.

Contributor:
Don't forget: no new line between """ and the next word (I'm in the process of correcting the whole lib for these, so I'd appreciate it if new PRs don't have that).


This class inherits from TensorDictModule. Provided a TensorDict, this module will perform a Model Predictive Control (MPC) planning step.
At the end of the planning step, the MPCPlanner will return the action that should be taken.

Args:
env (Environment): The environment to perform the planning step on (Can be ModelBasedEnv or EnvBase).
action_key (str): The key in the TensorDict to use to store the action.

Returns:
TensorDict: The TensorDict with the action added.
"""

def __init__(
self,
env: EnvBase,
action_key: str = "action",
):
# Check if env is stateless
if env.batch_locked:
raise ValueError("Environment is not stateless")
out_keys = [action_key]
in_keys = list(env.observation_spec.keys())
super().__init__(env, in_keys=in_keys, out_keys=out_keys)
self.env = env
self.action_spec = env.action_spec

@abc.abstractmethod
def planning(self, td: TensorDictBase) -> torch.Tensor:
"""
Perform the MPC planning step.
Args:
td (TensorDict): The TensorDict to perform the planning step on.
Returns:
TensorDict: The TensorDict with the action added.
"""
raise NotImplementedError()

def forward(
self,
tensordict: TensorDictBase,
tensordict_out: Optional[TensorDictBase] = None,
**kwargs,
) -> TensorDictBase:
if "params" in kwargs or "vmap" in kwargs:
raise ValueError("params not supported")
action = self.planning(tensordict)
action = self.action_spec.project(action)
tensordict_out = self._write_to_tensordict(
tensordict,
(action,),
tensordict_out,
)
return tensordict_out
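To make the contract concrete: a subclass only implements planning(), returning an action tensor with the input tensordict's batch shape; the inherited forward() then projects it onto the action spec and writes it under action_key. A deliberately trivial sketch (hypothetical, not part of the PR, and not a real MPC scheme):

```python
import torch

from torchrl.data.tensordict.tensordict import TensorDictBase
from torchrl.modules.planners import MPCPlannerBase


class RandomPlanner(MPCPlannerBase):
    """Samples one action per batch element from the spec, ignoring the model."""

    def planning(self, td: TensorDictBase) -> torch.Tensor:
        # action_spec.rand(shape) returns a tensor of shape (*shape, *spec.shape),
        # which is what forward() expects back from planning().
        return self.action_spec.rand(td.batch_size)
```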
2 changes: 1 addition & 1 deletion torchrl/trainers/helpers/envs.py
@@ -161,7 +161,7 @@ def make_env_transforms(
key
for key in env.observation_spec.keys()
if ("pixels" not in key)
and (key.strip("next_") not in env.input_spec.keys())
and (key.replace("next_", "") not in env.input_spec.keys())
]

# even if there is a single tensor, it'll be renamed in "next_observation_vector"
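The strip-to-replace change in this last hunk is a real bug fix: str.strip treats its argument as a set of characters to trim from both ends, not as a prefix, so keys whose bodies start or end with letters from "next_" get mangled. A quick illustration with hypothetical key names:

```python
# str.strip removes any of the characters n, e, x, t, _ from both ends,
# so it can eat into the key itself:
print("next_state".strip("next_"))        # -> 'sta'
print("next_texture".strip("next_"))      # -> 'ur'
# str.replace only removes the literal substring:
print("next_state".replace("next_", ""))  # -> 'state'
```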