[RLlib] Don't add a CPU to bundle for learner when using GPU #35529

Merged
Changes from 1 commit
Make level with master, fix docstring
Signed-off-by: Avnish <[email protected]>
avnishn committed May 22, 2023
commit 1a7820e1ddc87d85c2890f62f08df1c2dbfe7e28
34 changes: 1 addition & 33 deletions rllib/algorithms/ppo/ppo_learner.py
@@ -1,13 +1,10 @@
 from collections import defaultdict
 from dataclasses import dataclass
-import logging
-import math
-from typing import Any, Dict, List, Mapping, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from ray.rllib.core.learner.learner import LearnerHyperparameters
 from ray.rllib.core.learner.learner import Learner
 from ray.rllib.core.rl_module.rl_module import ModuleID
-from ray.rllib.policy.sample_batch import MultiAgentBatch
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.schedules.scheduler import Scheduler
 
@@ -19,9 +16,6 @@
 LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff"
 
 
-logger = logging.getLogger(__name__)
-
-
 @dataclass
 class PPOLearnerHyperparameters(LearnerHyperparameters):
     """Hyperparameters for the PPOLearner sub-classes (framework specific).
@@ -79,29 +73,3 @@ def additional_update_for_module
         results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff})
 
         return results
-
-    @override(Learner)
-    def compile_results(
-        self,
-        batch: MultiAgentBatch,
-        fwd_out: Mapping[str, Any],
-        postprocessed_loss: Mapping[str, Any],
-        postprocessed_gradients: Mapping[str, Any],
-    ) -> Mapping[str, Any]:
-        for module_id, module_loss_results in postprocessed_loss.items():
-            if module_id == self.TOTAL_LOSS_KEY:
-                continue
-            if math.isinf(module_loss_results[LEARNER_RESULTS_KL_KEY]):
-                logger.warning(
-                    "KL divergence is non-finite, this will likely destabilize "
-                    "your model and the training process. Action(s) in a "
-                    "specific state have near-zero probability. "
-                    "This can happen naturally in deterministic "
-                    "environments where the optimal policy has zero mass "
-                    "for a specific action. To fix this issue, consider "
-                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
-                    "config."
-                )
-        return super().compile_results(
-            batch, fwd_out, postprocessed_loss, postprocessed_gradients
-        )
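The warning removed above (and re-added per framework in the files below) points users at two PPO config knobs. As a minimal sketch only, not part of this commit, a user hitting that warning might adjust them roughly like this, assuming the usual `PPOConfig.training()` parameters; the environment name and the numeric values are placeholders:

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment(env="CartPole-v1")
    # Option 1: drop the KL penalty term entirely.
    .training(kl_coeff=0.0)
    # Option 2 (alternative): keep the KL penalty but add more exploration
    # pressure instead, e.g. .training(entropy_coeff=0.01)
)
algo = config.build()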
13 changes: 13 additions & 0 deletions rllib/algorithms/ppo/tf/ppo_tf_learner.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, Mapping
 
 from ray.rllib.algorithms.ppo.ppo_learner import (
@@ -20,6 +21,7 @@
 
 
 _, tf, _ = try_import_tf()
+logger = logging.getLogger(__name__)
 
 
 class PPOTfLearner(PPOLearner, TfLearner):
@@ -58,6 +60,17 @@ def compute_loss_for_module
         if self.hps.kl_coeff > 0.0:
             action_kl = prev_action_dist.kl(curr_action_dist)
             mean_kl_loss = tf.reduce_mean(action_kl)
+            if tf.math.is_inf(mean_kl_loss):
+                logger.warning(
+                    "KL divergence is non-finite, this will likely destabilize "
+                    "your model and the training process. Action(s) in a "
+                    "specific state have near-zero probability. "
+                    "This can happen naturally in deterministic "
+                    "environments where the optimal policy has zero mass "
+                    "for a specific action. To fix this issue, consider "
+                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
+                    "config."
+                )
         else:
             mean_kl_loss = tf.constant(0.0, dtype=logp_ratio.dtype)
 
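For intuition on the `tf.math.is_inf` guard added above: the mean KL blows up exactly when the current policy puts (near-)zero probability on an action the previous policy still takes. A small standalone sketch with plain TensorFlow ops rather than RLlib's action distributions:

import tensorflow as tf

prev = tf.constant([0.5, 0.5])  # previous policy's action probabilities
curr = tf.constant([1.0, 0.0])  # current policy puts zero mass on action 1

# KL(prev || curr) = sum_i prev_i * (log prev_i - log curr_i); the zero-mass
# term contributes 0.5 * (log 0.5 - (-inf)) = +inf.
kl = tf.reduce_sum(prev * (tf.math.log(prev) - tf.math.log(curr)))
print(tf.math.is_inf(kl))  # tf.Tensor(True, shape=(), dtype=bool)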
14 changes: 14 additions & 0 deletions rllib/algorithms/ppo/torch/ppo_torch_learner.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, Mapping
 
 from ray.rllib.algorithms.ppo.ppo_learner import (
@@ -20,6 +21,8 @@
 
 torch, nn = try_import_torch()
 
+logger = logging.getLogger(__name__)
+
 
 class PPOTorchLearner(PPOLearner, TorchLearner):
     """Implements torch-specific PPO loss logic on top of PPOLearner.
@@ -60,6 +63,17 @@ def compute_loss_for_module
         if self.hps.kl_coeff > 0.0:
             action_kl = prev_action_dist.kl(curr_action_dist)
             mean_kl_loss = torch.mean(action_kl)
+            if mean_kl_loss.isinf():
+                logger.warning(
+                    "KL divergence is non-finite, this will likely destabilize "
+                    "your model and the training process. Action(s) in a "
+                    "specific state have near-zero probability. "
+                    "This can happen naturally in deterministic "
+                    "environments where the optimal policy has zero mass "
+                    "for a specific action. To fix this issue, consider "
+                    "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
+                    "config."
+                )
         else:
             mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device)
 
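The torch branch catches the same condition via `Tensor.isinf()`. A small hedged sketch, with `torch.distributions` standing in for RLlib's action distributions (torch's categorical KL returns `inf` when the second distribution has zero mass where the first does not):

import torch
from torch.distributions import Categorical, kl_divergence

prev_action_dist = Categorical(probs=torch.tensor([0.5, 0.5]))
curr_action_dist = Categorical(probs=torch.tensor([1.0, 0.0]))  # zero mass on action 1

action_kl = kl_divergence(prev_action_dist, curr_action_dist)
mean_kl_loss = torch.mean(action_kl)
print(mean_kl_loss.isinf())  # tensor(True)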
2 changes: 0 additions & 2 deletions rllib/core/learner/learner.py
@@ -588,8 +588,6 @@ def compile_results
             loss_per_module: A dict mapping module IDs (including ALL_MODULES) to the
                 individual loss tensors as returned by calls to
                 `compute_loss_for_module(module_id=...)`.
-            postprocessed_gradients: The postprocessed gradients dict, (flat) mapping
-                gradient tensor refs to the already postprocessed gradient tensors.
             metrics_per_module: The collected metrics defaultdict mapping ModuleIDs to
                 metrics dicts. These metrics are collected during loss- and
                 gradient computation, gradient postprocessing, and gradient application.
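The two deleted lines above documented a `postprocessed_gradients` argument that `compile_results()` no longer takes. Purely for orientation, a subclass override consistent with the parameters still documented here might look roughly as follows; `MyCustomLearner` is hypothetical and the exact keyword names other than `loss_per_module` and `metrics_per_module` are inferred from the docstring, not taken from this diff:

# Hypothetical sketch; signature inferred from the surrounding docstring, not from this PR.
from ray.rllib.core.learner.learner import Learner
from ray.rllib.utils.annotations import override


class MyCustomLearner(Learner):
    @override(Learner)
    def compile_results(self, *, batch, fwd_out, loss_per_module, metrics_per_module):
        results = super().compile_results(
            batch=batch,
            fwd_out=fwd_out,
            loss_per_module=loss_per_module,
            metrics_per_module=metrics_per_module,
        )
        # Attach extra framework-agnostic stats before returning.
        results["custom_stat"] = 1.0
        return results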