[RLlib] Don't add a CPU to bundle for learner when using GPU #35529

Merged
Changes from 1 commit
Make level with master, fix docstring
Signed-off-by: Avnish <[email protected]>
avnishn committed May 22, 2023
commit 1a7820e1ddc87d85c2890f62f08df1c2dbfe7e28
34 changes: 1 addition & 33 deletions rllib/algorithms/ppo/ppo_learner.py
@@ -1,13 +1,10 @@
 from collections import defaultdict
 from dataclasses import dataclass
-import logging
-import math
-from typing import Any, Dict, List, Mapping, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from ray.rllib.core.learner.learner import LearnerHyperparameters
 from ray.rllib.core.learner.learner import Learner
 from ray.rllib.core.rl_module.rl_module import ModuleID
-from ray.rllib.policy.sample_batch import MultiAgentBatch
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.schedules.scheduler import Scheduler
 
@@ -19,9 +16,6 @@
 LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff"
 
 
-logger = logging.getLogger(__name__)
-
-
 @dataclass
 class PPOLearnerHyperparameters(LearnerHyperparameters):
     """Hyperparameters for the PPOLearner sub-classes (framework specific).
@@ -79,29 +73,3 @@ def additional_update_for_module
         results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff})
 
         return results
-
-    @override(Learner)
-    def compile_results(
-        self,
-        batch: MultiAgentBatch,
-        fwd_out: Mapping[str, Any],
-        postprocessed_loss: Mapping[str, Any],
-        postprocessed_gradients: Mapping[str, Any],
-    ) -> Mapping[str, Any]:
-        for module_id, module_loss_results in postprocessed_loss.items():
-            if module_id == self.TOTAL_LOSS_KEY:
-                continue
-            if math.isinf(module_loss_results[LEARNER_RESULTS_KL_KEY]):
-                logger.warning(
-                    "KL divergence is non-finite, this will likely destabilize "
-                    "your model and the training process. Action(s) in a "
-                    "specific state have near-zero probability. "
-                    "This can happen naturally in deterministic "
-                    "environments where the optimal policy has zero mass "
-                    "for a specific action. To fix this issue, consider "
-                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
-                    "config."
-                )
-        return super().compile_results(
-            batch, fwd_out, postprocessed_loss, postprocessed_gradients
-        )
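The warning removed above (and re-added per framework in the files below) points users at two PPO config knobs. As a minimal sketch only, not part of this commit, a user hitting that warning might adjust them roughly like this, assuming the usual `PPOConfig.training()` parameters; the environment name and the numeric values are placeholders:

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment(env="CartPole-v1")
    # Option 1: drop the KL penalty term entirely.
    .training(kl_coeff=0.0)
    # Option 2 (alternative): keep the KL penalty but add more exploration
    # pressure instead, e.g. .training(entropy_coeff=0.01)
)
algo = config.build()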
13 changes: 13 additions & 0 deletions rllib/algorithms/ppo/tf/ppo_tf_learner.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, Mapping
 
 from ray.rllib.algorithms.ppo.ppo_learner import (
@@ -20,6 +21,7 @@
 
 
 _, tf, _ = try_import_tf()
+logger = logging.getLogger(__name__)
 
 
 class PPOTfLearner(PPOLearner, TfLearner):
@@ -58,6 +60,17 @@ def compute_loss_for_module
         if self.hps.kl_coeff > 0.0:
             action_kl = prev_action_dist.kl(curr_action_dist)
             mean_kl_loss = tf.reduce_mean(action_kl)
+            if tf.math.is_inf(mean_kl_loss):
+                logger.warning(
+                    "KL divergence is non-finite, this will likely destabilize "
+                    "your model and the training process. Action(s) in a "
+                    "specific state have near-zero probability. "
+                    "This can happen naturally in deterministic "
+                    "environments where the optimal policy has zero mass "
+                    "for a specific action. To fix this issue, consider "
+                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
+                    "config."
+                )
         else:
             mean_kl_loss = tf.constant(0.0, dtype=logp_ratio.dtype)
 
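For intuition on the `tf.math.is_inf` guard added above: the mean KL blows up exactly when the current policy puts (near-)zero probability on an action the previous policy still takes. A small standalone sketch with plain TensorFlow ops rather than RLlib's action distributions:

import tensorflow as tf

prev = tf.constant([0.5, 0.5])  # previous policy's action probabilities
curr = tf.constant([1.0, 0.0])  # current policy puts zero mass on action 1

# KL(prev || curr) = sum_i prev_i * (log prev_i - log curr_i); the zero-mass
# term contributes 0.5 * (log 0.5 - (-inf)) = +inf.
kl = tf.reduce_sum(prev * (tf.math.log(prev) - tf.math.log(curr)))
print(tf.math.is_inf(kl))  # tf.Tensor(True, shape=(), dtype=bool)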
14 changes: 14 additions & 0 deletions rllib/algorithms/ppo/torch/ppo_torch_learner.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, Mapping
 
 from ray.rllib.algorithms.ppo.ppo_learner import (
@@ -20,6 +21,8 @@
 
 torch, nn = try_import_torch()
 
+logger = logging.getLogger(__name__)
+
 
 class PPOTorchLearner(PPOLearner, TorchLearner):
     """Implements torch-specific PPO loss logic on top of PPOLearner.
@@ -60,6 +63,17 @@ def compute_loss_for_module
         if self.hps.kl_coeff > 0.0:
             action_kl = prev_action_dist.kl(curr_action_dist)
             mean_kl_loss = torch.mean(action_kl)
+            if mean_kl_loss.isinf():
+                logger.warning(
+                    "KL divergence is non-finite, this will likely destabilize "
+                    "your model and the training process. Action(s) in a "
+                    "specific state have near-zero probability. "
+                    "This can happen naturally in deterministic "
+                    "environments where the optimal policy has zero mass "
+                    "for a specific action. To fix this issue, consider "
+                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
+                    "config."
+                )
         else:
             mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device)
 
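The torch branch catches the same condition via `Tensor.isinf()`. A small hedged sketch, with `torch.distributions` standing in for RLlib's action distributions (torch's categorical KL returns `inf` when the second distribution has zero mass where the first does not):

import torch
from torch.distributions import Categorical, kl_divergence

prev_action_dist = Categorical(probs=torch.tensor([0.5, 0.5]))
curr_action_dist = Categorical(probs=torch.tensor([1.0, 0.0]))  # zero mass on action 1

action_kl = kl_divergence(prev_action_dist, curr_action_dist)
mean_kl_loss = torch.mean(action_kl)
print(mean_kl_loss.isinf())  # tensor(True)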
2 changes: 0 additions & 2 deletions rllib/core/learner/learner.py
@@ -588,8 +588,6 @@ def compile_results
             loss_per_module: A dict mapping module IDs (including ALL_MODULES) to the
                 individual loss tensors as returned by calls to
                 `compute_loss_for_module(module_id=...)`.
-            postprocessed_gradients: The postprocessed gradients dict, (flat) mapping
-                gradient tensor refs to the already postprocessed gradient tensors.
             metrics_per_module: The collected metrics defaultdict mapping ModuleIDs to
                 metrics dicts. These metrics are collected during loss- and
                 gradient computation, gradient postprocessing, and gradient application.
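The two deleted lines above documented a `postprocessed_gradients` argument that `compile_results()` no longer takes. Purely for orientation, a subclass override consistent with the parameters still documented here might look roughly as follows; `MyCustomLearner` is hypothetical and the exact keyword names other than `loss_per_module` and `metrics_per_module` are inferred from the docstring, not taken from this diff:

# Hypothetical sketch; signature inferred from the surrounding docstring, not from this PR.
from ray.rllib.core.learner.learner import Learner
from ray.rllib.utils.annotations import override


class MyCustomLearner(Learner):
    @override(Learner)
    def compile_results(self, *, batch, fwd_out, loss_per_module, metrics_per_module):
        results = super().compile_results(
            batch=batch,
            fwd_out=fwd_out,
            loss_per_module=loss_per_module,
            metrics_per_module=metrics_per_module,
        )
        # Attach extra framework-agnostic stats before returning.
        results["custom_stat"] = 1.0
        return results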