[RLlib] Don't add a cpu to bundle for learner when using gpu #35529

Merged

Changes from 1 commit
Move kl check out of traced update, torch, tf
Signed-off-by: Avnish <[email protected]>
avnishn committed May 22, 2023
commit c496c344c1fbc41a4e84d9766e45725ebcf4eca3
44 changes: 31 additions & 13 deletions rllib/algorithms/ppo/tf/ppo_tf_learner.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, Mapping
+from typing import Any, DefaultDict, Dict, Mapping
 
 from ray.rllib.algorithms.ppo.ppo_learner import (
     LEARNER_RESULTS_KL_KEY,
@@ -12,7 +12,7 @@
 from ray.rllib.core.learner.tf.tf_learner import TfLearner
 from ray.rllib.core.rl_module.rl_module import ModuleID
 from ray.rllib.evaluation.postprocessing import Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_utils import explained_variance
 from ray.rllib.utils.annotations import override
Expand Down Expand Up @@ -60,17 +60,6 @@ def compute_loss_for_module(
if self.hps.kl_coeff > 0.0:
action_kl = prev_action_dist.kl(curr_action_dist)
mean_kl_loss = tf.reduce_mean(action_kl)
if tf.math.is_inf(mean_kl_loss):
logger.warning(
"KL divergence is non-finite, this will likely destabilize "
"your model and the training process. Action(s) in a "
"specific state have near-zero probability. "
"This can happen naturally in deterministic "
"environments where the optimal policy has zero mass "
"for a specific action. To fix this issue, consider "
"setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
"config."
)
else:
mean_kl_loss = tf.constant(0.0, dtype=logp_ratio.dtype)

@@ -151,3 +140,32 @@ def additional_update_for_module(
             results.update({LEARNER_RESULTS_CURR_KL_COEFF_KEY: curr_var.numpy()})
 
         return results
+
+    def compile_results(
+        self,
+        *,
+        batch: MultiAgentBatch,
+        fwd_out: Mapping[str, Any],
+        loss_per_module: Mapping[str, TensorType],
+        metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]]
+    ) -> Mapping[str, Any]:
+        if self.hps.kl_coeff > 0.0:
+            for metrics in metrics_per_module.values():
+                mean_kl_loss = metrics[LEARNER_RESULTS_KL_KEY]
+                if tf.math.is_inf(mean_kl_loss):
+                    logger.warning(
+                        "KL divergence is non-finite, this will likely destabilize "
+                        "your model and the training process. Action(s) in a "
+                        "specific state have near-zero probability. "
+                        "This can happen naturally in deterministic "
+                        "environments where the optimal policy has zero mass "
+                        "for a specific action. To fix this issue, consider "
+                        "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in "
+                        "your config."
+                    )
+        return super().compile_results(
+            batch=batch,
+            fwd_out=fwd_out,
+            loss_per_module=loss_per_module,
+            metrics_per_module=metrics_per_module,
+        )
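Note on the TF change above: the non-finite-KL warning is moved out of compute_loss_for_module, which runs inside the traced update, and into compile_results, which runs eagerly on the already-reduced metrics. A minimal sketch (standalone, not RLlib code) of why the eager placement matters: Python-level side effects such as logger.warning inside a tf.function body execute only while the function is being traced, so a data-dependent warning there cannot react to the per-step KL value, while an eager check on the compiled-out metric can.

# Minimal sketch (standalone, not RLlib code): Python side effects inside a
# tf.function fire at trace time only, so a data-dependent KL warning belongs
# outside the traced update, where the metric is a concrete value.
import tensorflow as tf

@tf.function
def traced_update(x):
    mean_kl = tf.reduce_mean(x)
    # This print runs once while the function is traced, regardless of the
    # actual KL value, so a warning placed here cannot monitor per-step KL.
    print("tracing traced_update")
    return mean_kl

def check_kl(mean_kl):
    # Eagerly, the tensor holds a concrete value and a plain Python branch works.
    if tf.math.is_inf(mean_kl):
        print("KL divergence is non-finite")

kl = traced_update(tf.constant([float("inf"), 1.0]))
check_kl(kl)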
44 changes: 31 additions & 13 deletions rllib/algorithms/ppo/torch/ppo_torch_learner.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, Mapping
+from typing import Any, DefaultDict, Dict, Mapping
 
 from ray.rllib.algorithms.ppo.ppo_learner import (
     LEARNER_RESULTS_KL_KEY,
@@ -12,7 +12,7 @@
 from ray.rllib.core.learner.torch.torch_learner import TorchLearner
 from ray.rllib.core.rl_module.rl_module import ModuleID
 from ray.rllib.evaluation.postprocessing import Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_torch
 from ray.rllib.utils.nested_dict import NestedDict
Expand Down Expand Up @@ -63,17 +63,6 @@ def compute_loss_for_module(
if self.hps.kl_coeff > 0.0:
action_kl = prev_action_dist.kl(curr_action_dist)
mean_kl_loss = torch.mean(action_kl)
if mean_kl_loss.isinf():
logger.warning(
"KL divergence is non-finite, this will likely destabilize "
"your model and the training process. Action(s) in a "
"specific state have near-zero probability. "
"This can happen naturally in deterministic "
"environments where the optimal policy has zero mass "
"for a specific action. To fix this issue, consider "
"setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
"config."
)
else:
mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device)

@@ -150,3 +139,32 @@ def additional_update_for_module(
             results.update({LEARNER_RESULTS_CURR_KL_COEFF_KEY: curr_var.item()})
 
         return results
+
+    def compile_results(
+        self,
+        *,
+        batch: MultiAgentBatch,
+        fwd_out: Mapping[str, Any],
+        loss_per_module: Mapping[str, TensorType],
+        metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]]
+    ) -> Mapping[str, Any]:
+        if self.hps.kl_coeff > 0.0:
+            for metrics in metrics_per_module.values():
+                mean_kl_loss = metrics[LEARNER_RESULTS_KL_KEY]
+                if mean_kl_loss.isinf():
+                    logger.warning(
+                        "KL divergence is non-finite, this will likely destabilize "
+                        "your model and the training process. Action(s) in a "
+                        "specific state have near-zero probability. "
+                        "This can happen naturally in deterministic "
+                        "environments where the optimal policy has zero mass "
+                        "for a specific action. To fix this issue, consider "
+                        "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in "
+                        "your config."
+                    )
+        return super().compile_results(
+            batch=batch,
+            fwd_out=fwd_out,
+            loss_per_module=loss_per_module,
+            metrics_per_module=metrics_per_module,
+        )
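The torch change follows the same pattern. A minimal sketch (standalone, not RLlib code), assuming the torch learner's update may similarly be traced or compiled, e.g. via torch.compile: a Python branch on mean_kl_loss.isinf() inside the compiled step is data-dependent control flow and forces a graph break, whereas checking the already-reduced metric after the step works with an ordinary Python branch.

# Minimal sketch (standalone, not RLlib code): keep data-dependent Python
# branching out of a compiled update and check the reduced metric afterwards.
import torch

@torch.compile
def compiled_update(x):
    mean_kl = x.mean()
    # A branch like `if mean_kl.isinf(): ...` here would be data-dependent
    # control flow and would force a graph break in the compiled update.
    return mean_kl

def check_kl(mean_kl):
    # Outside the compiled step the value is concrete, so this branch is fine.
    if mean_kl.isinf():
        print("KL divergence is non-finite")

kl = compiled_update(torch.tensor([float("inf"), 1.0]))
check_kl(kl)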