[RLlib] Don't add a cpu to bundle for learner when using gpu #35529

Merged
Changes from 1 commit
Move PPO KL infinite check to avoid TensorFlow AutoGraph errors
Signed-off-by: Avnish <[email protected]>
avnishn committed May 21, 2023
commit 9ba46de9acbc237c3865f0d4e05e1699d9946d83
34 changes: 33 additions & 1 deletion rllib/algorithms/ppo/ppo_learner.py
@@ -1,10 +1,13 @@
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union
import logging
import math
from typing import Any, Dict, List, Mapping, Optional, Union

from ray.rllib.core.learner.learner import LearnerHyperparameters
from ray.rllib.core.learner.learner import Learner
from ray.rllib.core.rl_module.rl_module import ModuleID
from ray.rllib.policy.sample_batch import MultiAgentBatch
from ray.rllib.utils.annotations import override
from ray.rllib.utils.schedules.scheduler import Scheduler

@@ -16,6 +19,9 @@
LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff"


logger = logging.getLogger(__name__)


@dataclass
class PPOLearnerHyperparameters(LearnerHyperparameters):
"""Hyperparameters for the PPOLearner sub-classes (framework specific).
@@ -73,3 +79,29 @@ def additional_update_per_module(
        results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff})

        return results

    @override(Learner)
    def compile_results(
        self,
        batch: MultiAgentBatch,
        fwd_out: Mapping[str, Any],
        postprocessed_loss: Mapping[str, Any],
        postprocessed_gradients: Mapping[str, Any],
    ) -> Mapping[str, Any]:
        for module_id, module_loss_results in postprocessed_loss.items():
            if module_id == self.TOTAL_LOSS_KEY:
                continue
            if math.isinf(module_loss_results[LEARNER_RESULTS_KL_KEY]):
                logger.warning(
                    "KL divergence is non-finite, this will likely destabilize "
                    "your model and the training process. Action(s) in a "
                    "specific state have near-zero probability. "
                    "This can happen naturally in deterministic "
                    "environments where the optimal policy has zero mass "
                    "for a specific action. To fix this issue, consider "
                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
                    "config."
                )
        return super().compile_results(
            batch, fwd_out, postprocessed_loss, postprocessed_gradients
        )
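For reference, here is a minimal, self-contained sketch of what the added `compile_results` check does. This is not RLlib code: the constant values and the `warn_on_nonfinite_kl` helper are hypothetical stand-ins for `LEARNER_RESULTS_KL_KEY`, `Learner.TOTAL_LOSS_KEY`, and the method body above. The point it illustrates is that by the time results are compiled, the per-module metrics are plain Python scalars, so the non-finite check can use `math.isinf` and ordinary logging without touching any traced TensorFlow code.

```python
import logging
import math
from typing import Any, Dict

logger = logging.getLogger(__name__)

# Hypothetical stand-ins for the RLlib constants referenced in the diff above.
LEARNER_RESULTS_KL_KEY = "mean_kl_loss"
TOTAL_LOSS_KEY = "total_loss"


def warn_on_nonfinite_kl(postprocessed_loss: Dict[str, Any]) -> None:
    """Warn for every module whose reduced KL metric came out infinite."""
    for module_id, module_results in postprocessed_loss.items():
        if module_id == TOTAL_LOSS_KEY:
            # The total-loss entry is a single scalar, not a per-module dict.
            continue
        if math.isinf(module_results[LEARNER_RESULTS_KL_KEY]):
            logger.warning("Non-finite KL divergence for module %r.", module_id)


# By this point the metrics are ordinary floats, so no framework code is involved.
warn_on_nonfinite_kl(
    {
        "total_loss": 1.23,
        "default_policy": {"mean_kl_loss": float("inf")},
    }
)
```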
13 changes: 0 additions & 13 deletions rllib/algorithms/ppo/tf/ppo_tf_learner.py
@@ -1,4 +1,3 @@
import logging
from typing import Any, Dict, Mapping

from ray.rllib.algorithms.ppo.ppo_learner import (
@@ -20,7 +19,6 @@


_, tf, _ = try_import_tf()
logger = logging.getLogger(__name__)


class PPOTfLearner(PPOLearner, TfLearner):
@@ -59,17 +57,6 @@ def compute_loss_per_module(
        if self.hps.kl_coeff > 0.0:
            action_kl = prev_action_dist.kl(curr_action_dist)
            mean_kl_loss = tf.reduce_mean(action_kl)
            if tf.math.is_inf(mean_kl_loss):
                logger.warning(
                    "KL divergence is non-finite, this will likely destabilize "
                    "your model and the training process. Action(s) in a "
                    "specific state have near-zero probability. "
                    "This can happen naturally in deterministic "
                    "environments where the optimal policy has zero mass "
                    "for a specific action. To fix this issue, consider "
                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
                    "config."
                )
        else:
            mean_kl_loss = tf.constant(0.0, dtype=logp_ratio.dtype)

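For context, a small sketch of why the in-graph check removed above was fragile (assuming TensorFlow 2.x; the `reduced_kl` function below is illustrative, not RLlib's loss): inside a `tf.function`-traced loss, the reduced KL is a symbolic tensor, so a Python `if` on `tf.math.is_inf(...)` wrapped around a Python-side `logger.warning` call has to be rewritten by AutoGraph, which the commit message says produced errors. Reading the value back after the traced function returns sidesteps AutoGraph entirely.

```python
import math

import tensorflow as tf


@tf.function
def reduced_kl(kl: tf.Tensor) -> tf.Tensor:
    # Traced by AutoGraph: kept purely tensor-in / tensor-out, with no Python-side
    # branching or logging, which is what the deletion above accomplishes.
    return tf.reduce_mean(kl)


# Outside the traced function the result is an eager tensor, so it can be
# converted to a plain Python float and checked with ordinary stdlib calls.
kl_value = float(reduced_kl(tf.constant([float("inf"), 1.0])))
if math.isinf(kl_value):
    print("Non-finite KL detected outside the traced loss.")
```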
14 changes: 0 additions & 14 deletions rllib/algorithms/ppo/torch/ppo_torch_learner.py
@@ -1,4 +1,3 @@
import logging
from typing import Any, Dict, Mapping

from ray.rllib.algorithms.ppo.ppo_learner import (
@@ -20,8 +19,6 @@

torch, nn = try_import_torch()

logger = logging.getLogger(__name__)


class PPOTorchLearner(PPOLearner, TorchLearner):
"""Implements torch-specific PPO loss logic on top of PPOLearner.
@@ -62,17 +59,6 @@ def compute_loss_per_module(
        if self.hps.kl_coeff > 0.0:
            action_kl = prev_action_dist.kl(curr_action_dist)
            mean_kl_loss = torch.mean(action_kl)
            if mean_kl_loss.isinf():
                logger.warning(
                    "KL divergence is non-finite, this will likely destabilize "
                    "your model and the training process. Action(s) in a "
                    "specific state have near-zero probability. "
                    "This can happen naturally in deterministic "
                    "environments where the optimal policy has zero mass "
                    "for a specific action. To fix this issue, consider "
                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
                    "config."
                )
        else:
            mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device)

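The torch check removed above did not hit the tracing issue, since this loss runs eagerly: a scalar result of `tensor.isinf()` can drive a Python `if` directly, as the tiny plain-PyTorch (non-RLlib) illustration below shows. The deletion here instead keeps both frameworks on the shared `PPOLearner.compile_results` warning path added in the first file.

```python
import torch

# Eager PyTorch: the reduced KL is a concrete scalar tensor, so a Python `if`
# on its isinf() result evaluates immediately, with no graph tracing involved.
mean_kl_loss = torch.mean(torch.tensor([float("inf"), 1.0]))
if mean_kl_loss.isinf():
    print("Non-finite KL detected eagerly.")
```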