[RLlib] Don't add a cpu to bundle for learner when using gpu #35529

Merged

Changes from 1 commit
Move kl check out of traced update, torch, tf
Signed-off-by: Avnish <[email protected]>
avnishn committed May 22, 2023
commit c496c344c1fbc41a4e84d9766e45725ebcf4eca3
44 changes: 31 additions & 13 deletions rllib/algorithms/ppo/tf/ppo_tf_learner.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, Mapping
+from typing import Any, DefaultDict, Dict, Mapping
 
 from ray.rllib.algorithms.ppo.ppo_learner import (
     LEARNER_RESULTS_KL_KEY,
@@ -12,7 +12,7 @@
 from ray.rllib.core.learner.tf.tf_learner import TfLearner
 from ray.rllib.core.rl_module.rl_module import ModuleID
 from ray.rllib.evaluation.postprocessing import Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_utils import explained_variance
 from ray.rllib.utils.annotations import override
Expand Down Expand Up @@ -60,17 +60,6 @@ def compute_loss_for_module(
if self.hps.kl_coeff > 0.0:
action_kl = prev_action_dist.kl(curr_action_dist)
mean_kl_loss = tf.reduce_mean(action_kl)
if tf.math.is_inf(mean_kl_loss):
logger.warning(
"KL divergence is non-finite, this will likely destabilize "
"your model and the training process. Action(s) in a "
"specific state have near-zero probability. "
"This can happen naturally in deterministic "
"environments where the optimal policy has zero mass "
"for a specific action. To fix this issue, consider "
"setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
"config."
)
else:
mean_kl_loss = tf.constant(0.0, dtype=logp_ratio.dtype)

@@ -151,3 +140,32 @@ def additional_update_for_module(
             results.update({LEARNER_RESULTS_CURR_KL_COEFF_KEY: curr_var.numpy()})
 
         return results
+
+    def compile_results(
+        self,
+        *,
+        batch: MultiAgentBatch,
+        fwd_out: Mapping[str, Any],
+        loss_per_module: Mapping[str, TensorType],
+        metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]]
+    ) -> Mapping[str, Any]:
+        if self.hps.kl_coeff > 0.0:
+            for metrics in metrics_per_module.values():
+                mean_kl_loss = metrics[LEARNER_RESULTS_KL_KEY]
+                if tf.math.is_inf(mean_kl_loss):
+                    logger.warning(
+                        "KL divergence is non-finite, this will likely destabilize "
+                        "your model and the training process. Action(s) in a "
+                        "specific state have near-zero probability. "
+                        "This can happen naturally in deterministic "
+                        "environments where the optimal policy has zero mass "
+                        "for a specific action. To fix this issue, consider "
+                        "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in "
+                        "your config."
+                    )
+        return super().compile_results(
+            batch=batch,
+            fwd_out=fwd_out,
+            loss_per_module=loss_per_module,
+            metrics_per_module=metrics_per_module,
+        )
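Note on the TF change above: the non-finite-KL warning is moved out of compute_loss_for_module, which runs inside the traced update, and into compile_results, which runs eagerly on the already-reduced metrics. A minimal sketch (standalone, not RLlib code) of why the eager placement matters: Python-level side effects such as logger.warning inside a tf.function body execute only while the function is being traced, so a data-dependent warning there cannot react to the per-step KL value, while an eager check on the compiled-out metric can.

# Minimal sketch (standalone, not RLlib code): Python side effects inside a
# tf.function fire at trace time only, so a data-dependent KL warning belongs
# outside the traced update, where the metric is a concrete value.
import tensorflow as tf

@tf.function
def traced_update(x):
    mean_kl = tf.reduce_mean(x)
    # This print runs once while the function is traced, regardless of the
    # actual KL value, so a warning placed here cannot monitor per-step KL.
    print("tracing traced_update")
    return mean_kl

def check_kl(mean_kl):
    # Eagerly, the tensor holds a concrete value and a plain Python branch works.
    if tf.math.is_inf(mean_kl):
        print("KL divergence is non-finite")

kl = traced_update(tf.constant([float("inf"), 1.0]))
check_kl(kl)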
44 changes: 31 additions & 13 deletions rllib/algorithms/ppo/torch/ppo_torch_learner.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, Mapping
+from typing import Any, DefaultDict, Dict, Mapping
 
 from ray.rllib.algorithms.ppo.ppo_learner import (
     LEARNER_RESULTS_KL_KEY,
@@ -12,7 +12,7 @@
 from ray.rllib.core.learner.torch.torch_learner import TorchLearner
 from ray.rllib.core.rl_module.rl_module import ModuleID
 from ray.rllib.evaluation.postprocessing import Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_torch
 from ray.rllib.utils.nested_dict import NestedDict
Expand Down Expand Up @@ -63,17 +63,6 @@ def compute_loss_for_module(
if self.hps.kl_coeff > 0.0:
action_kl = prev_action_dist.kl(curr_action_dist)
mean_kl_loss = torch.mean(action_kl)
if mean_kl_loss.isinf():
logger.warning(
"KL divergence is non-finite, this will likely destabilize "
"your model and the training process. Action(s) in a "
"specific state have near-zero probability. "
"This can happen naturally in deterministic "
"environments where the optimal policy has zero mass "
"for a specific action. To fix this issue, consider "
"setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your "
"config."
)
else:
mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device)

@@ -150,3 +139,32 @@ def additional_update_for_module(
             results.update({LEARNER_RESULTS_CURR_KL_COEFF_KEY: curr_var.item()})
 
         return results
+
+    def compile_results(
+        self,
+        *,
+        batch: MultiAgentBatch,
+        fwd_out: Mapping[str, Any],
+        loss_per_module: Mapping[str, TensorType],
+        metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]]
+    ) -> Mapping[str, Any]:
+        if self.hps.kl_coeff > 0.0:
+            for metrics in metrics_per_module.values():
+                mean_kl_loss = metrics[LEARNER_RESULTS_KL_KEY]
+                if mean_kl_loss.isinf():
+                    logger.warning(
+                        "KL divergence is non-finite, this will likely destabilize "
+                        "your model and the training process. Action(s) in a "
+                        "specific state have near-zero probability. "
+                        "This can happen naturally in deterministic "
+                        "environments where the optimal policy has zero mass "
+                        "for a specific action. To fix this issue, consider "
+                        "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in "
+                        "your config."
+                    )
+        return super().compile_results(
+            batch=batch,
+            fwd_out=fwd_out,
+            loss_per_module=loss_per_module,
+            metrics_per_module=metrics_per_module,
+        )
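The torch change follows the same pattern. A minimal sketch (standalone, not RLlib code), assuming the torch learner's update may similarly be traced or compiled, e.g. via torch.compile: a Python branch on mean_kl_loss.isinf() inside the compiled step is data-dependent control flow and forces a graph break, whereas checking the already-reduced metric after the step works with an ordinary Python branch.

# Minimal sketch (standalone, not RLlib code): keep data-dependent Python
# branching out of a compiled update and check the reduced metric afterwards.
import torch

@torch.compile
def compiled_update(x):
    mean_kl = x.mean()
    # A branch like `if mean_kl.isinf(): ...` here would be data-dependent
    # control flow and would force a graph break in the compiled update.
    return mean_kl

def check_kl(mean_kl):
    # Outside the compiled step the value is concrete, so this branch is fine.
    if mean_kl.isinf():
        print("KL divergence is non-finite")

kl = compiled_update(torch.tensor([float("inf"), 1.0]))
check_kl(kl)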