pytorch · vmoens · Jul 10, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 21, 2024
diff --git a/.github/unittest/linux_examples/scripts/run_test.sh b/.github/unittest/linux_examples/scripts/run_test.sh
@@ -149,8 +149,6 @@ python .github/unittest/helpers/coverage_run_parallel.py sota-implementations/di
  replay_buffer.size=120 \
  env.name=CartPole-v1 \
  logger.backend=
-# logger.record_video=True \
-# logger.record_frames=4 \
 python .github/unittest/helpers/coverage_run_parallel.py sota-implementations/crossq/crossq.py \
  collector.total_frames=48 \
  collector.init_random_frames=10 \

diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst
@@ -121,6 +121,15 @@ REDQ
 
  REDQLoss
 
+CrossQ
+------
+
+.. autosummary::
+ :toctree: generated/
+ :template: rl_template_noinherit.rst
+
+ CrossQLoss
+
 IQL
 ----
 

diff --git a/sota-check/run_crossq.sh b/sota-check/run_crossq.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+#SBATCH --job-name=crossq
+#SBATCH --ntasks=32
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --output=slurm_logs/crossq_%j.txt
+#SBATCH --error=slurm_errors/crossq_%j.txt
+
+# Runs the CrossQ example with wandb logging and appends a
+# "<group>_<job id>=success|error" line to report.log.
+
+current_commit=$(git rev-parse --short HEAD)
+project_name="torchrl-example-check-$current_commit"
+group_name="crossq"
+# Quote every expansion so a checkout path containing spaces is not word-split.
+PYTHONPATH="$(dirname "$(dirname "$PWD")")"
+export PYTHONPATH
+python "$PYTHONPATH/sota-implementations/crossq/crossq.py" \
+  logger.backend=wandb \
+  logger.project_name="$project_name" \
+  logger.group_name="$group_name"
+
+# Capture the exit status of the Python command
+exit_status=$?
+# Write the exit status to a file
+if [ "$exit_status" -eq 0 ]; then
+  echo "${group_name}_${SLURM_JOB_ID}=success" >> report.log
+else
+  echo "${group_name}_${SLURM_JOB_ID}=error" >> report.log
+fi
diff --git a/sota-implementations/crossq/batchrenorm.py b/sota-implementations/crossq/batchrenorm.py
@@ -0,0 +1,133 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+
+
+class BatchRenorm(nn.Module):
+    """BatchRenorm module (https://arxiv.org/abs/1702.03275).
+
+    BatchRenorm is an enhanced version of the standard BatchNorm. In training
+    mode the batch-normalized activations are additionally corrected by two
+    clamped factors, ``r`` and ``d``, derived from the running statistics.
+    This reduces the impact of "outlier" batches that may occur during
+    extended training periods.
+
+    When ``warmup_steps > 0`` the correction is blended in linearly over the
+    first ``warmup_steps`` training batches: on the very first batch the layer
+    behaves exactly like BatchNorm, and after ``warmup_steps`` batches the full
+    correction is applied. In eval mode only the running statistics are used.
+
+    Inputs must have at least 2 dimensions, with the features along dim 1.
+
+    Args:
+        num_features (int): Number of features in the input tensor.
+
+    Keyword Args:
+        momentum (float, optional): Momentum factor for computing the running
+            mean and variance. Default is 0.01.
+        eps (float, optional): Small value added to the variance to avoid
+            division by zero. Default is 1e-5.
+        max_r (float, optional): Maximum value for the scaling factor r, which
+            is clamped to ``[1 / max_r, max_r]``. Default is 3.0.
+        max_d (float, optional): Maximum value for the bias factor d, which is
+            clamped to ``[-max_d, max_d]``. Default is 5.0.
+        warmup_steps (int, optional): Number of training batches over which
+            the correction is ramped up. A value ``<= 0`` applies the full
+            correction from the first batch. Default is 10000.
+    """
+
+    def __init__(
+        self,
+        num_features: int,
+        momentum: float = 0.01,
+        eps: float = 1e-5,
+        max_r: float = 3.0,
+        max_d: float = 5.0,
+        warmup_steps: int = 10000,
+    ):
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.momentum = momentum
+        self.max_r = max_r
+        self.max_d = max_d
+        self.warmup_steps = warmup_steps
+
+        self.register_buffer(
+            "running_mean", torch.zeros(num_features, dtype=torch.float32)
+        )
+        self.register_buffer(
+            "running_var", torch.ones(num_features, dtype=torch.float32)
+        )
+        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.int64))
+        self.weight = nn.Parameter(torch.ones(num_features, dtype=torch.float32))
+        self.bias = nn.Parameter(torch.zeros(num_features, dtype=torch.float32))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalize ``x`` along dim 1 and apply the learned affine transform.
+
+        In training mode this also updates ``running_mean``, ``running_var``
+        and ``num_batches_tracked`` in place.
+
+        Raises:
+            ValueError: If ``x`` has fewer than 2 dimensions.
+        """
+        if x.dim() < 2:
+            raise ValueError(
+                f"{type(self).__name__} expects an input with at least 2 "
+                f"dimensions, got {x.dim()}."
+            )
+        # Shape used to broadcast per-feature vectors against x (features on dim 1).
+        view_dims = [1, x.shape[1]] + [1] * (x.dim() - 2)
+
+        def _v(v):
+            return v.view(view_dims)
+
+        running_std = (self.running_var + self.eps).sqrt_()
+
+        if self.training:
+            reduce_dims = [i for i in range(x.dim()) if i != 1]
+            # Number of values each per-feature statistic is computed from.
+            n_samples = 1
+            for i in reduce_dims:
+                n_samples *= x.shape[i]
+
+            b_mean = x.mean(reduce_dims)
+            b_var = x.var(reduce_dims, unbiased=False)
+            b_std = (b_var + self.eps).sqrt_()
+
+            r = torch.clamp((b_std.detach() / running_std), 1 / self.max_r, self.max_r)
+            d = torch.clamp(
+                (b_mean.detach() - self.running_mean) / running_std,
+                -self.max_d,
+                self.max_d,
+            )
+
+            # Ramp the correction in linearly: the factor is 0 on the first batch
+            # (plain BatchNorm) and reaches 1 after warmup_steps batches. With
+            # warmup_steps <= 0 there is no ramp (and no division by zero).
+            if self.warmup_steps > 0:
+                warmup_factor = torch.clamp(
+                    self.num_batches_tracked / self.warmup_steps, 0.0, 1.0
+                )
+                r = 1.0 + (r - 1.0) * warmup_factor
+                d = d * warmup_factor
+
+            x = (x - _v(b_mean)) / _v(b_std) * _v(r) + _v(d)
+
+            # Bessel's correction must use the number of reduced samples, not the
+            # number of features. Skipped for a single sample (n - 1 == 0).
+            unbiased_var = b_var.detach()
+            if n_samples > 1:
+                unbiased_var = unbiased_var * n_samples / (n_samples - 1)
+            self.running_var += self.momentum * (unbiased_var - self.running_var)
+            self.running_mean += self.momentum * (b_mean.detach() - self.running_mean)
+            self.num_batches_tracked += 1
+        else:
+            x = (x - _v(self.running_mean)) / _v(running_std)
+
+        x = _v(self.weight) * x + _v(self.bias)
+        return x
diff --git a/sota-implementations/crossq/config.yaml b/sota-implementations/crossq/config.yaml
@@ -28,20 +28,21 @@ optim:
  policy_update_delay: 3
  gamma: 0.99
  loss_function: l2
- lr: 3.0e-4
+ lr: 1.0e-3
  weight_decay: 0.0
  batch_size: 256
  alpha_init: 1.0
- # Adam β1 = 0.5
  adam_eps: 1.0e-8
+ beta1: 0.5
+ beta2: 0.999
 
 # network
 network:
  batch_norm_momentum: 0.01
- # warmup_steps: 100000 # 10^5
+ warmup_steps: 100000 
  critic_hidden_sizes: [2048, 2048]
  actor_hidden_sizes: [256, 256]
- critic_activation: tanh
+ critic_activation: relu
  actor_activation: relu
  default_policy_scale: 1.0
  scale_lb: 0.1

diff --git a/sota-implementations/crossq/crossq.py b/sota-implementations/crossq/crossq.py
@@ -36,6 +36,11 @@
 @hydra.main(version_base="1.1", config_path=".", config_name="config")
 def main(cfg: "DictConfig"):  # noqa: F821
-    device = torch.device(cfg.network.device)
+    # Resolve the device *before* building torch.device: the constructor never
+    # returns None, so an unset config entry has to be detected on the raw value.
+    device = cfg.network.device
+    if device in ("", None):
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device)
 
     # Create logger
     exp_name = generate_exp_name("CrossQ", cfg.logger.exp_name)
@@ -60,7 +62,7 @@ def main(cfg: "DictConfig"): # noqa: F821
  train_env, eval_env = make_environment(cfg)
 
  # Create agent
- model, exploration_policy = make_crossQ_agent(cfg, train_env, eval_env, device)
+ model, exploration_policy = make_crossQ_agent(cfg, train_env, device)
 
  # Create CrossQ loss
  loss_module = make_loss_module(cfg, model)
@@ -133,14 +135,12 @@ def main(cfg: "DictConfig"): # noqa: F821
  # Sample from replay buffer
  sampled_tensordict = replay_buffer.sample()
  if sampled_tensordict.device != device:
- sampled_tensordict = sampled_tensordict.to(
- device, non_blocking=True
- )
+ sampled_tensordict = sampled_tensordict.to(device)
  else:
  sampled_tensordict = sampled_tensordict.clone()
 
  # Compute loss
- q_loss, *_ = loss_module._qvalue_loss(sampled_tensordict)
+ q_loss, *_ = loss_module.qvalue_loss(sampled_tensordict)
  q_loss = q_loss.mean()
  # Update critic
  optimizer_critic.zero_grad()
@@ -149,14 +149,14 @@ def main(cfg: "DictConfig"): # noqa: F821
  q_losses.append(q_loss.detach().item())
 
  if update_actor:
- actor_loss, metadata_actor = loss_module._actor_loss(
+ actor_loss, metadata_actor = loss_module.actor_loss(
  sampled_tensordict
  )
  actor_loss = actor_loss.mean()
- alpha_loss = loss_module._alpha_loss(
+ alpha_loss = loss_module.alpha_loss(
  log_prob=metadata_actor["log_prob"]
- )
- alpha_loss = alpha_loss.mean()
+ ).mean()
+
  # Update actor
  optimizer_actor.zero_grad()
  actor_loss.backward()

diff --git a/sota-implementations/crossq/utils.py b/sota-implementations/crossq/utils.py
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+
+from batchrenorm import BatchRenorm
 from tensordict.nn import InteractionType, TensorDictModule
 from tensordict.nn.distributions import NormalParamExtractor
 from torch import nn, optim
@@ -26,7 +28,6 @@
 from torchrl.modules.distributions import TanhNormal
 from torchrl.objectives import CrossQLoss
 
-
 # ====================================================================
 # Environment utils
 # -----------------
@@ -120,7 +121,6 @@ def make_replay_buffer(
  storage=LazyMemmapStorage(
  buffer_size,
  scratch_dir=scratch_dir,
- device=device,
  ),
  batch_size=batch_size,
  )
@@ -131,10 +131,10 @@ def make_replay_buffer(
  storage=LazyMemmapStorage(
  buffer_size,
  scratch_dir=scratch_dir,
- device=device,
  ),
  batch_size=batch_size,
  )
+ replay_buffer.append_transform(lambda x: x.to(device, non_blocking=True))
  return replay_buffer
 
 
@@ -143,7 +143,7 @@ def make_replay_buffer(
 # -----
 
 
-def make_crossQ_agent(cfg, train_env, eval_env, device):
+def make_crossQ_agent(cfg, train_env, device):
  """Make CrossQ agent."""
  # Define Actor Network
  in_keys = ["observation"]
@@ -154,19 +154,20 @@ def make_crossQ_agent(cfg, train_env, eval_env, device):
  "num_cells": cfg.network.actor_hidden_sizes,
  "out_features": 2 * action_spec.shape[-1],
  "activation_class": get_activation(cfg.network.actor_activation),
- "norm_class": nn.BatchNorm1d, # Should be BRN (https://arxiv.org/abs/1702.03275) not sure if added to torch
+ "norm_class": BatchRenorm,
  "norm_kwargs": {
  "momentum": cfg.network.batch_norm_momentum,
  "num_features": cfg.network.actor_hidden_sizes[-1],
+ "warmup_steps": cfg.network.warmup_steps,
  },
  }
 
  actor_net = MLP(**actor_net_kwargs)
 
  dist_class = TanhNormal
  dist_kwargs = {
- "min": action_spec.space.low,
- "max": action_spec.space.high,
+ "low": action_spec.space.low,
+ "high": action_spec.space.high,
  "tanh_loc": False,
  }
 
@@ -200,10 +201,11 @@ def make_crossQ_agent(cfg, train_env, eval_env, device):
  "num_cells": cfg.network.critic_hidden_sizes,
  "out_features": 1,
  "activation_class": get_activation(cfg.network.critic_activation),
- "norm_class": nn.BatchNorm1d, # Should be BRN (https://arxiv.org/abs/1702.03275) not sure if added to torch
+ "norm_class": BatchRenorm,
  "norm_kwargs": {
  "momentum": cfg.network.batch_norm_momentum,
  "num_features": cfg.network.critic_hidden_sizes[-1],
+ "warmup_steps": cfg.network.warmup_steps,
  },
  }
 
@@ -220,14 +222,13 @@ def make_crossQ_agent(cfg, train_env, eval_env, device):
 
  # init nets
  with torch.no_grad(), set_exploration_type(ExplorationType.RANDOM):
- td = eval_env.reset()
+ td = train_env.fake_tensordict()
  td = td.to(device)
  for net in model:
  net.eval()
  net(td)
  net.train()
  del td
- eval_env.close()
 
  return model, model[0]
 
@@ -273,16 +274,18 @@ def make_crossQ_optimizer(cfg, loss_module):
  lr=cfg.optim.lr,
  weight_decay=cfg.optim.weight_decay,
  eps=cfg.optim.adam_eps,
+ betas=(cfg.optim.beta1, cfg.optim.beta2),
  )
  optimizer_critic = optim.Adam(
  critic_params,
  lr=cfg.optim.lr,
  weight_decay=cfg.optim.weight_decay,
  eps=cfg.optim.adam_eps,
+ betas=(cfg.optim.beta1, cfg.optim.beta2),
  )
  optimizer_alpha = optim.Adam(
  [loss_module.log_alpha],
- lr=3.0e-4,
+ lr=cfg.optim.lr,
  )
  return optimizer_actor, optimizer_critic, optimizer_alpha