[RLlib] Don't add a cpu to bundle for learner when using gpu #35529

Merged
Changes from 1 commit
Move PPO KL infinite check to avoid TensorFlow AutoGraph errors
Signed-off-by: Avnish <[email protected]>
avnishn committed May 21, 2023
commit 9ba46de9acbc237c3865f0d4e05e1699d9946d83
34 changes: 33 additions & 1 deletion rllib/algorithms/ppo/ppo_learner.py
@@ -1,10 +1,13 @@
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union
import logging
import math
from typing import Any, Dict, List, Mapping, Optional, Union

from ray.rllib.core.learner.learner import LearnerHyperparameters
from ray.rllib.core.learner.learner import Learner
from ray.rllib.core.rl_module.rl_module import ModuleID
from ray.rllib.policy.sample_batch import MultiAgentBatch
from ray.rllib.utils.annotations import override
from ray.rllib.utils.schedules.scheduler import Scheduler

@@ -16,6 +19,9 @@
LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff"


logger = logging.getLogger(__name__)


@dataclass
class PPOLearnerHyperparameters(LearnerHyperparameters):
"""Hyperparameters for the PPOLearner sub-classes (framework specific).
@@ -73,3 +79,29 @@ def additional_update_per_module(
        results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff})

        return results

    @override(Learner)
    def compile_results(
        self,
        batch: MultiAgentBatch,
        fwd_out: Mapping[str, Any],
        postprocessed_loss: Mapping[str, Any],
        postprocessed_gradients: Mapping[str, Any],
    ) -> Mapping[str, Any]:
        for module_id, module_loss_results in postprocessed_loss.items():
            if module_id == self.TOTAL_LOSS_KEY:
                continue
            if math.isinf(module_loss_results[LEARNER_RESULTS_KL_KEY]):
                logger.warning(
                    "KL divergence is non-finite, this will likely destabilize "
                    "your model and the training process. Action(s) in a "
                    "specific state have near-zero probability. "
                    "This can happen naturally in deterministic "
                    "environments where the optimal policy has zero mass "
                    "for a specific action. To fix this issue, consider "
                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
                    "config."
                )
        return super().compile_results(
            batch, fwd_out, postprocessed_loss, postprocessed_gradients
        )
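For reference, here is a minimal, self-contained sketch of what the added `compile_results` check does. This is not RLlib code: the constant values and the `warn_on_nonfinite_kl` helper are hypothetical stand-ins for `LEARNER_RESULTS_KL_KEY`, `Learner.TOTAL_LOSS_KEY`, and the method body above. The point it illustrates is that by the time results are compiled, the per-module metrics are plain Python scalars, so the non-finite check can use `math.isinf` and ordinary logging without touching any traced TensorFlow code.

```python
import logging
import math
from typing import Any, Dict

logger = logging.getLogger(__name__)

# Hypothetical stand-ins for the RLlib constants referenced in the diff above.
LEARNER_RESULTS_KL_KEY = "mean_kl_loss"
TOTAL_LOSS_KEY = "total_loss"


def warn_on_nonfinite_kl(postprocessed_loss: Dict[str, Any]) -> None:
    """Warn for every module whose reduced KL metric came out infinite."""
    for module_id, module_results in postprocessed_loss.items():
        if module_id == TOTAL_LOSS_KEY:
            # The total-loss entry is a single scalar, not a per-module dict.
            continue
        if math.isinf(module_results[LEARNER_RESULTS_KL_KEY]):
            logger.warning("Non-finite KL divergence for module %r.", module_id)


# By this point the metrics are ordinary floats, so no framework code is involved.
warn_on_nonfinite_kl(
    {
        "total_loss": 1.23,
        "default_policy": {"mean_kl_loss": float("inf")},
    }
)
```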
13 changes: 0 additions & 13 deletions rllib/algorithms/ppo/tf/ppo_tf_learner.py
@@ -1,4 +1,3 @@
import logging
from typing import Any, Dict, Mapping

from ray.rllib.algorithms.ppo.ppo_learner import (
@@ -20,7 +19,6 @@


_, tf, _ = try_import_tf()
logger = logging.getLogger(__name__)


class PPOTfLearner(PPOLearner, TfLearner):
@@ -59,17 +57,6 @@ def compute_loss_per_module(
        if self.hps.kl_coeff > 0.0:
            action_kl = prev_action_dist.kl(curr_action_dist)
            mean_kl_loss = tf.reduce_mean(action_kl)
            if tf.math.is_inf(mean_kl_loss):
                logger.warning(
                    "KL divergence is non-finite, this will likely destabilize "
                    "your model and the training process. Action(s) in a "
                    "specific state have near-zero probability. "
                    "This can happen naturally in deterministic "
                    "environments where the optimal policy has zero mass "
                    "for a specific action. To fix this issue, consider "
                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
                    "config."
                )
        else:
            mean_kl_loss = tf.constant(0.0, dtype=logp_ratio.dtype)

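For context, a small sketch of why the in-graph check removed above was fragile (assuming TensorFlow 2.x; the `reduced_kl` function below is illustrative, not RLlib's loss): inside a `tf.function`-traced loss, the reduced KL is a symbolic tensor, so a Python `if` on `tf.math.is_inf(...)` wrapped around a Python-side `logger.warning` call has to be rewritten by AutoGraph, which the commit message says produced errors. Reading the value back after the traced function returns sidesteps AutoGraph entirely.

```python
import math

import tensorflow as tf


@tf.function
def reduced_kl(kl: tf.Tensor) -> tf.Tensor:
    # Traced by AutoGraph: kept purely tensor-in / tensor-out, with no Python-side
    # branching or logging, which is what the deletion above accomplishes.
    return tf.reduce_mean(kl)


# Outside the traced function the result is an eager tensor, so it can be
# converted to a plain Python float and checked with ordinary stdlib calls.
kl_value = float(reduced_kl(tf.constant([float("inf"), 1.0])))
if math.isinf(kl_value):
    print("Non-finite KL detected outside the traced loss.")
```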
14 changes: 0 additions & 14 deletions rllib/algorithms/ppo/torch/ppo_torch_learner.py
@@ -1,4 +1,3 @@
import logging
from typing import Any, Dict, Mapping

from ray.rllib.algorithms.ppo.ppo_learner import (
@@ -20,8 +19,6 @@

torch, nn = try_import_torch()

logger = logging.getLogger(__name__)


class PPOTorchLearner(PPOLearner, TorchLearner):
"""Implements torch-specific PPO loss logic on top of PPOLearner.
@@ -62,17 +59,6 @@ def compute_loss_per_module(
        if self.hps.kl_coeff > 0.0:
            action_kl = prev_action_dist.kl(curr_action_dist)
            mean_kl_loss = torch.mean(action_kl)
            if mean_kl_loss.isinf():
                logger.warning(
                    "KL divergence is non-finite, this will likely destabilize "
                    "your model and the training process. Action(s) in a "
                    "specific state have near-zero probability. "
                    "This can happen naturally in deterministic "
                    "environments where the optimal policy has zero mass "
                    "for a specific action. To fix this issue, consider "
                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
                    "config."
                )
        else:
            mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device)

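The torch check removed above did not hit the tracing issue, since this loss runs eagerly: a scalar result of `tensor.isinf()` can drive a Python `if` directly, as the tiny plain-PyTorch (non-RLlib) illustration below shows. The deletion here instead keeps both frameworks on the shared `PPOLearner.compile_results` warning path added in the first file.

```python
import torch

# Eager PyTorch: the reduced KL is a concrete scalar tensor, so a Python `if`
# on its isinf() result evaluates immediately, with no graph tracing involved.
mean_kl_loss = torch.mean(torch.tensor([float("inf"), 1.0]))
if mean_kl_loss.isinf():
    print("Non-finite KL detected eagerly.")
```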