From 47c003ccc1f2221ed6bd9e5fbe95df05eda3fe9d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 15 Apr 2023 11:30:59 +0200 Subject: [PATCH 01/34] wip Signed-off-by: sven1977 --- .../algorithms/appo/tf/appo_tf_policy_rlm.py | 212 +--------------- .../appo/torch/appo_torch_policy_rlm.py | 33 +++ .../impala/tf/impala_tf_policy_rlm.py | 157 +----------- .../impala/torch/impala_torch_policy_rlm.py | 161 +------------ rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py | 175 +------------- .../ppo/torch/ppo_torch_policy_rlm.py | 227 +----------------- 6 files changed, 86 insertions(+), 879 deletions(-) create mode 100644 rllib/algorithms/appo/torch/appo_torch_policy_rlm.py diff --git a/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py b/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py index f01235834d852..4539d8f7ccff9 100644 --- a/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py +++ b/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py @@ -1,226 +1,28 @@ -import logging -from typing import Dict, List, Union - from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - EntropyCoeffSchedule, - LearningRateSchedule, - KLCoeffMixin, - GradStatsMixin, - TargetNetworkMixin, -) - -from ray.rllib.algorithms.impala.impala_tf_policy import ( - VTraceClipGradients, - VTraceOptimizer, -) - from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import Deprecated -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import ( - explained_variance, -) - - -from ray.rllib.algorithms.impala.tf.vtrace_tf_v2 import make_time_major, vtrace_tf2 -from ray.rllib.utils.typing import TensorType - -tf1, tf, tfv = try_import_tf() -logger = logging.getLogger(__name__) - -class APPOTfPolicyWithRLModule( - VTraceClipGradients, - VTraceOptimizer, - LearningRateSchedule, - KLCoeffMixin, - EntropyCoeffSchedule, - TargetNetworkMixin, - GradStatsMixin, - EagerTFPolicyV2, -): +# TODO: Remove once we have a RLModule capable sampler class that can replace +# `Policy.compute_actions_from_input_dict()`. +class APPOTfPolicyWithRLModule(EagerTFPolicyV2): def __init__(self, observation_space, action_space, config): validate_config(config) EagerTFPolicyV2.enable_eager_execution_if_necessary() - # Initialize MixIns before super().__init__ because base class will call - # self.loss, which requires these MixIns to be initialized. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - # Although this is a no-op, we call __init__ here to make it clear - # that base.__init__ will use the make_model() call. - VTraceClipGradients.__init__(self) - VTraceOptimizer.__init__(self) self.framework = "tf2" - KLCoeffMixin.__init__(self, config) - GradStatsMixin.__init__(self) EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - # construct the target model and make its weights the same as the model - self.target_model = self.make_rl_module() - self.target_model.set_weights(self.model.get_weights()) - # Initiate TargetNetwork ops after loss initialization. 
self.maybe_initialize_optimizer_and_loss() - TargetNetworkMixin.__init__(self) - @Deprecated(new="APPOTfLearner.compute_loss_per_module()", error=False) @override(EagerTFPolicyV2) - def loss( - self, - model: Union[ModelV2, "tf.keras.Model"], - dist_class, - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - train_batch[SampleBatch.ACTIONS] + def loss(self, model, dist_class, train_batch): + self.stats = {} train_batch[SampleBatch.ACTION_LOGP] + train_batch[SampleBatch.ACTIONS] train_batch[SampleBatch.REWARDS] train_batch[SampleBatch.TERMINATEDS] - - seqs_len = train_batch.get(SampleBatch.SEQ_LENS) - rollout_frag_or_episode_len = ( - self.config["rollout_fragment_length"] if not seqs_len else None - ) - drop_last = self.config["vtrace_drop_last_ts"] - - target_policy_fwd_out = model.forward_train(train_batch) - values = target_policy_fwd_out[SampleBatch.VF_PREDS] - target_policy_dist = target_policy_fwd_out[SampleBatch.ACTION_DIST] - - old_target_policy_fwd_out = self.target_model.forward_train(train_batch) - old_target_policy_dist = old_target_policy_fwd_out[SampleBatch.ACTION_DIST] - - behaviour_actions_logp = train_batch[SampleBatch.ACTION_LOGP] - target_actions_logp = target_policy_dist.logp(train_batch[SampleBatch.ACTIONS]) - old_target_actions_logp = old_target_policy_dist.logp( - train_batch[SampleBatch.ACTIONS] - ) - behaviour_actions_logp_time_major = make_time_major( - behaviour_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - target_actions_logp_time_major = make_time_major( - target_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - old_target_actions_logp_time_major = make_time_major( - old_target_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - values_time_major = make_time_major( - values, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - train_batch[SampleBatch.REWARDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - - # how to compute discouts? - # should they be pre computed? 
- discounts_time_major = ( - 1.0 - - tf.cast( - make_time_major( - train_batch[SampleBatch.TERMINATEDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ), - dtype=tf.float32, - ) - ) * self.config["gamma"] - vtrace_adjusted_target_values, pg_advantages = vtrace_tf2( - target_action_log_probs=old_target_actions_logp_time_major, - behaviour_action_log_probs=behaviour_actions_logp_time_major, - rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, - clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], - clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], - discounts=discounts_time_major, - ) - - is_ratio = tf.clip_by_value( - tf.math.exp( - behaviour_actions_logp_time_major - target_actions_logp_time_major - ), - 0.0, - 2.0, - ) - logp_ratio = is_ratio * tf.math.exp( - target_actions_logp_time_major - behaviour_actions_logp_time_major - ) - - clip_param = self.config["clip_param"] - surrogate_loss = tf.math.minimum( - pg_advantages * logp_ratio, - ( - pg_advantages - * tf.clip_by_value(logp_ratio, 1 - clip_param, 1 + clip_param) - ), - ) - action_kl = old_target_policy_dist.kl(target_policy_dist) - mean_kl_loss = tf.math.reduce_mean(action_kl) - mean_pi_loss = -tf.math.reduce_mean(surrogate_loss) - - # The baseline loss. - delta = values_time_major - vtrace_adjusted_target_values - mean_vf_loss = 0.5 * tf.math.reduce_mean(delta**2) - - # The entropy loss. - mean_entropy_loss = -tf.math.reduce_mean(target_actions_logp_time_major) - - # The summed weighted loss. - total_loss = ( - mean_pi_loss - + (mean_vf_loss * self.config["vf_loss_coeff"]) - + (mean_entropy_loss * self.entropy_coeff) - + (mean_kl_loss * self.kl_coeff) - ) - - self.stats = { - "total_loss": total_loss, - "policy_loss": mean_pi_loss, - "vf_loss": mean_vf_loss, - "values": values_time_major, - "entropy_loss": mean_entropy_loss, - "vtrace_adjusted_target_values": vtrace_adjusted_target_values, - "mean_kl": mean_kl_loss, - } - return total_loss - - @override(EagerTFPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "policy_loss": self.stats["policy_loss"], - "entropy": self.stats["entropy_loss"], - "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), - "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables), - "vf_loss": self.stats["vf_loss"], - "vf_explained_var": explained_variance( - tf.reshape(self.stats["vtrace_adjusted_target_values"], [-1]), - tf.reshape(self.stats["values"], [-1]), - ), - "mean_kl": self.stats["mean_kl"], - } + return 0.0 @override(EagerTFPolicyV2) def get_batch_divisibility_req(self) -> int: diff --git a/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py b/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py new file mode 100644 index 0000000000000..4b4396a4f29b7 --- /dev/null +++ b/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py @@ -0,0 +1,33 @@ +import logging + +from ray.rllib.algorithms.ppo.ppo_torch_policy import validate_config +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch + +torch, _ = try_import_torch() + +logger = logging.getLogger(__name__) + + +# TODO: Remove once we have a RLModule capable sampler class that can replace +# `Policy.compute_actions_from_input_dict()`. 
+class APPOTorchPolicyWithRLModule(TorchPolicyV2): + def __init__(self, observation_space, action_space, config): + validate_config(config) + TorchPolicyV2.__init__(self, observation_space, action_space, config) + self._initialize_loss_from_dummy_batch() + + @override(TorchPolicyV2) + def loss(self, model, dist_class, train_batch): + self.stats = {} + train_batch[SampleBatch.ACTION_LOGP] + train_batch[SampleBatch.ACTIONS] + train_batch[SampleBatch.REWARDS] + train_batch[SampleBatch.TERMINATEDS] + return 0.0 + + @override(TorchPolicyV2) + def get_batch_divisibility_req(self) -> int: + return self.config["rollout_fragment_length"] diff --git a/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py b/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py index 0244a96c0ac21..f8447ca312c3e 100644 --- a/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py +++ b/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py @@ -1,162 +1,27 @@ -import logging -from typing import Dict, List, Union - from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - EntropyCoeffSchedule, - LearningRateSchedule, -) from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import Deprecated -from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import ( - explained_variance, -) -from ray.rllib.algorithms.impala.tf.vtrace_tf_v2 import make_time_major, vtrace_tf2 -from ray.rllib.utils.typing import TensorType - -tf1, tf, tfv = try_import_tf() -logger = logging.getLogger(__name__) - -class ImpalaTfPolicyWithRLModule( - LearningRateSchedule, - EntropyCoeffSchedule, - EagerTFPolicyV2, -): +# TODO: Remove once we have a RLModule capable sampler class that can replace +# `Policy.compute_actions_from_input_dict()`. +class ImpalaTfPolicyWithRLModule(EagerTFPolicyV2): def __init__(self, observation_space, action_space, config): validate_config(config) EagerTFPolicyV2.enable_eager_execution_if_necessary() - # Initialize MixIns before super().__init__ because base class will call - # self.loss, which requires these MixIns to be initialized. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) + self.framework = "tf2" EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - self.maybe_initialize_optimizer_and_loss() - @Deprecated(new="ImpalaTfLearner.compute_loss_per_module()", error=False) - @override(EagerTFPolicyV2) - def loss( - self, - model: PPOTfRLModule, - dist_class, - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - seq_len = train_batch.get(SampleBatch.SEQ_LENS) - rollout_frag_or_episode_len = ( - self.config["rollout_fragment_length"] if not seq_len else None - ) - drop_last = self.config["vtrace_drop_last_ts"] - - fwd_out = model.forward_train(train_batch) - - values = fwd_out[SampleBatch.VF_PREDS] - target_policy_dist = fwd_out[SampleBatch.ACTION_DIST] - - # this is probably a horribly inefficient way to do this. 
I should be able to - # compute this in a batch fashion - behaviour_actions_logp = train_batch[SampleBatch.ACTION_LOGP] - target_actions_logp = target_policy_dist.logp(train_batch[SampleBatch.ACTIONS]) - behaviour_actions_logp_time_major = make_time_major( - behaviour_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - target_actions_logp_time_major = make_time_major( - target_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - values_time_major = make_time_major( - values, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - train_batch[SampleBatch.REWARDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - - # how to compute discouts? - # should they be pre computed? - discounts_time_major = ( - 1.0 - - tf.cast( - make_time_major( - train_batch[SampleBatch.TERMINATEDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ), - dtype=tf.float32, - ) - ) * self.config["gamma"] - vtrace_adjusted_target_values, pg_advantages = vtrace_tf2( - target_action_log_probs=target_actions_logp_time_major, - behaviour_action_log_probs=behaviour_actions_logp_time_major, - rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, - clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], - clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], - discounts=discounts_time_major, - ) - - # The policy gradients loss. - pi_loss = -tf.reduce_sum(target_actions_logp_time_major * pg_advantages) - mean_pi_loss = -tf.reduce_mean(target_actions_logp_time_major * pg_advantages) - - # The baseline loss. - delta = values_time_major - vtrace_adjusted_target_values - vf_loss = 0.5 * tf.reduce_sum(tf.math.pow(delta, 2.0)) - mean_vf_loss = 0.5 * tf.reduce_mean(tf.math.pow(delta, 2.0)) - - # The entropy loss. - entropy_loss = -tf.reduce_sum(target_actions_logp_time_major) - - # The summed weighted loss. 
- total_loss = ( - pi_loss - + vf_loss * self.config["vf_loss_coeff"] - + entropy_loss * self.entropy_coeff - ) - self.stats = { - "total_loss": total_loss, - "pi_loss": mean_pi_loss, - "vf_loss": mean_vf_loss, - "values": values_time_major, - "entropy_loss": entropy_loss, - "vtrace_adjusted_target_values": vtrace_adjusted_target_values, - } - return total_loss - @override(EagerTFPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "policy_loss": self.stats["pi_loss"], - "entropy": self.stats["entropy_loss"], - "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), - "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables), - "vf_loss": self.stats["vf_loss"], - "vf_explained_var": explained_variance( - tf.reshape(self.stats["vtrace_adjusted_target_values"], [-1]), - tf.reshape(self.stats["values"], [-1]), - ), - } + def loss(self, model, dist_class, train_batch): + self.stats = {} + train_batch[SampleBatch.ACTION_LOGP] + train_batch[SampleBatch.ACTIONS] + train_batch[SampleBatch.REWARDS] + train_batch[SampleBatch.TERMINATEDS] + return 0.0 @override(EagerTFPolicyV2) def get_batch_divisibility_req(self) -> int: diff --git a/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py b/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py index 751b441098a88..a2d49650f68ec 100644 --- a/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py +++ b/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py @@ -1,164 +1,25 @@ -import logging -from typing import Dict, List, Union - -from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( - make_time_major, - vtrace_torch, -) from ray.rllib.algorithms.ppo.ppo_torch_policy import validate_config -from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.torch_utils import convert_to_torch_tensor -from ray.rllib.policy.torch_mixins import ( - EntropyCoeffSchedule, - LearningRateSchedule, -) from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override, Deprecated -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import ( - explained_variance, - global_norm, -) -from ray.rllib.utils.typing import TensorType - -torch, nn = try_import_torch() +from ray.rllib.utils.annotations import override -logger = logging.getLogger(__name__) - -class ImpalaTorchPolicyWithRLModule( - LearningRateSchedule, - EntropyCoeffSchedule, - TorchPolicyV2, -): +# TODO: Remove once we have a RLModule capable sampler class that can replace +# `Policy.compute_actions_from_input_dict()`. +class ImpalaTorchPolicyWithRLModule(TorchPolicyV2): def __init__(self, observation_space, action_space, config): validate_config(config) TorchPolicyV2.__init__(self, observation_space, action_space, config) - # Initialize MixIns. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - - # TODO: Don't require users to call this manually. 
self._initialize_loss_from_dummy_batch() - @Deprecated(new="ImpalaTorchLearner.compute_loss_per_module()", error=False) - @override(TorchPolicyV2) - def loss( - self, - model: PPOTorchRLModule, - dist_class, - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - seq_len = train_batch.get(SampleBatch.SEQ_LENS) - rollout_frag_or_episode_len = ( - self.config["rollout_fragment_length"] if not seq_len else None - ) - drop_last = self.config["vtrace_drop_last_ts"] - - fwd_out = model.forward_train(train_batch) - - values = fwd_out[SampleBatch.VF_PREDS] - target_policy_dist = fwd_out[SampleBatch.ACTION_DIST] - - # this is probably a horribly inefficient way to do this. I should be able to - # compute this in a batch fashion - behaviour_actions_logp = train_batch[SampleBatch.ACTION_LOGP] - target_actions_logp = target_policy_dist.logp(train_batch[SampleBatch.ACTIONS]) - behaviour_actions_logp_time_major = make_time_major( - behaviour_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - target_actions_logp_time_major = make_time_major( - target_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - values_time_major = make_time_major( - values, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - train_batch[SampleBatch.REWARDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - - # how to compute discouts? - # should they be pre computed? - discounts_time_major = ( - 1.0 - - make_time_major( - train_batch[SampleBatch.TERMINATEDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ).type(dtype=torch.float32) - ) * self.config["gamma"] - vtrace_adjusted_target_values, pg_advantages = vtrace_torch( - target_action_log_probs=target_actions_logp_time_major, - behaviour_action_log_probs=behaviour_actions_logp_time_major, - rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, - clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], - clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], - discounts=discounts_time_major, - ) - - # The policy gradients loss. - pi_loss = -torch.sum(target_actions_logp_time_major * pg_advantages) - mean_pi_loss = -torch.mean(target_actions_logp_time_major * pg_advantages) - - # The baseline loss. - delta = values_time_major - vtrace_adjusted_target_values - vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0)) - mean_vf_loss = 0.5 * torch.mean(torch.pow(delta, 2.0)) - - # The entropy loss. - entropy_loss = -torch.sum(target_actions_logp_time_major) - - # The summed weighted loss. 
- total_loss = ( - pi_loss - + vf_loss * self.config["vf_loss_coeff"] - + entropy_loss * self.entropy_coeff - ) - self.stats = { - "total_loss": total_loss, - "pi_loss": mean_pi_loss, - "vf_loss": mean_vf_loss, - "values": values_time_major, - "entropy_loss": entropy_loss, - "vtrace_adjusted_target_values": vtrace_adjusted_target_values, - } - return total_loss - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_lr": convert_to_torch_tensor(self.cur_lr).type(torch.float64), - "policy_loss": self.stats["pi_loss"], - "entropy": self.stats["entropy_loss"], - "entropy_coeff": convert_to_torch_tensor(self.entropy_coeff).type( - torch.float64 - ), - "var_gnorm": global_norm(self.model.parameters()), - "vf_loss": self.stats["vf_loss"], - "vf_explained_var": explained_variance( - torch.reshape(self.stats["vtrace_adjusted_target_values"], [-1]), - torch.reshape(self.stats["values"], [-1]), - ), - } + def loss(self, model, dist_class, train_batch): + self.stats = {} + train_batch[SampleBatch.ACTION_LOGP] + train_batch[SampleBatch.ACTIONS] + train_batch[SampleBatch.REWARDS] + train_batch[SampleBatch.TERMINATEDS] + return 0.0 @override(TorchPolicyV2) def get_batch_divisibility_req(self) -> int: diff --git a/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py b/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py index 66aa0e408d3ab..7f0957db84cf2 100644 --- a/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py +++ b/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py @@ -1,177 +1,28 @@ -import logging -from typing import Dict, List, Union - from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.evaluation.postprocessing import ( - Postprocessing, - compute_gae_for_sample_batch, -) -from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.evaluation.postprocessing import compute_gae_for_sample_batch from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - EntropyCoeffSchedule, - KLCoeffMixin, - LearningRateSchedule, -) from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf - -from ray.rllib.utils.tf_utils import ( - explained_variance, - warn_if_infinite_kl_divergence, -) - -from ray.rllib.utils.typing import TensorType - -tf1, tf, tfv = try_import_tf() - -logger = logging.getLogger(__name__) - - -class PPOTfPolicyWithRLModule( - LearningRateSchedule, - EntropyCoeffSchedule, - KLCoeffMixin, - EagerTFPolicyV2, -): - """PyTorch policy class used with PPO. - - This class is copied from PPOTFPolicy and is modified to support RLModules. - Some subtle differences: - - if config._enable_rl_module api is true make_rl_module should be implemented by - the policy the policy is assumed to be compatible with rl_modules (i.e. self.model - would be an RLModule) - - Tower stats no longer belongs to the model (i.e. RLModule) instead it belongs to - the policy itself. - - Connectors should be enabled to use this policy - - So far it only works for vectorized obs and action spaces (Fully connected neural - networks). we need model catalog to work for other obs and action spaces. - # TODO: In the future we will deprecate doing all phases of training, exploration, - # and inference via one policy abstraction. Instead, we will use separate - # abstractions for each phase. For training (i.e. 
gradient updates, given the - # sample that have been collected) we will use Learner which will own one or - # possibly many RLModules, and RLOptimizer. For exploration, we will use RLSampler - # which will own RLModule, and RLTrajectoryProcessor. The exploration and inference - # phase details are TBD but the whole point is to make rllib extremely modular. - """ +# TODO: Remove once we have a RLModule capable sampler class that can replace +# `Policy.compute_actions_from_input_dict()`. +class PPOTfPolicyWithRLModule(EagerTFPolicyV2): def __init__(self, observation_space, action_space, config): - # TODO: Move into Policy API, if needed at all here. Why not move this into - # `PPOConfig`?. - self.framework = "tf2" - EagerTFPolicyV2.enable_eager_execution_if_necessary() validate_config(config) - # Initialize MixIns. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - KLCoeffMixin.__init__(self, config) + EagerTFPolicyV2.enable_eager_execution_if_necessary() + self.framework = "tf2" EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - self.maybe_initialize_optimizer_and_loss() @override(EagerTFPolicyV2) - def loss( - self, - model: Union[ModelV2, "tf.keras.Model"], - dist_class, - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - - fwd_out = model.forward_train(train_batch) - curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] - - action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class.from_logits( - train_batch[SampleBatch.ACTION_DIST_INPUTS] - ) - - logp_ratio = tf.exp( - fwd_out[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTION_LOGP] - ) - - # Only calculate kl loss if necessary (kl-coeff > 0.0). - if self.config["kl_coeff"] > 0.0: - action_kl = prev_action_dist.kl(curr_action_dist) - mean_kl_loss = tf.reduce_mean(action_kl) - warn_if_infinite_kl_divergence(self, mean_kl_loss) - else: - mean_kl_loss = tf.constant(0.0) - - curr_entropy = fwd_out["entropy"] - mean_entropy = tf.reduce_mean(curr_entropy) - - surrogate_loss = tf.minimum( - train_batch[Postprocessing.ADVANTAGES] * logp_ratio, - train_batch[Postprocessing.ADVANTAGES] - * tf.clip_by_value( - logp_ratio, - 1 - self.config["clip_param"], - 1 + self.config["clip_param"], - ), - ) - - # Compute a value function loss. - if self.config["use_critic"]: - value_fn_out = fwd_out[SampleBatch.VF_PREDS] - vf_loss = tf.math.square( - value_fn_out - train_batch[Postprocessing.VALUE_TARGETS] - ) - vf_loss_clipped = tf.clip_by_value( - vf_loss, - 0, - self.config["vf_clip_param"], - ) - mean_vf_loss = tf.reduce_mean(vf_loss_clipped) - mean_vf_unclipped_loss = tf.reduce_mean(vf_loss) - # Ignore the value function. - else: - mean_vf_unclipped_loss = tf.constant(0.0) - value_fn_out = vf_loss_clipped = mean_vf_loss = tf.constant(0.0) - - total_loss = tf.reduce_mean( - -surrogate_loss - + self.config["vf_loss_coeff"] * vf_loss_clipped - - self.entropy_coeff * curr_entropy - ) - # Add mean_kl_loss (already processed through `reduce_mean_valid`), - # if necessary. - if self.config["kl_coeff"] > 0.0: - total_loss += self.kl_coeff * mean_kl_loss - - # Store stats in policy for stats_fn. 
- self._total_loss = total_loss - self._mean_policy_loss = tf.reduce_mean(-surrogate_loss) - self._mean_vf_loss = mean_vf_loss - self._unclipped_mean_vf_loss = mean_vf_unclipped_loss - self._mean_entropy = mean_entropy - # Backward compatibility: Deprecate self._mean_kl. - self._mean_kl_loss = self._mean_kl = mean_kl_loss - self._value_fn_out = value_fn_out - self._value_mean = tf.reduce_mean(value_fn_out) - - return total_loss - - @override(EagerTFPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_kl_coeff": tf.cast(self.kl_coeff, tf.float64), - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "total_loss": self._total_loss, - "policy_loss": self._mean_policy_loss, - "vf_loss": self._mean_vf_loss, - "unclipped_vf_loss": self._unclipped_mean_vf_loss, - "vf_explained_var": explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], self._value_fn_out - ), - "kl": self._mean_kl_loss, - "entropy": self._mean_entropy, - "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), - "value_mean": tf.cast(self._value_mean, tf.float64), - } + def loss(self, model, dist_class, train_batch): + self.stats = {} + train_batch[SampleBatch.ACTION_LOGP] + train_batch[SampleBatch.ACTIONS] + train_batch[SampleBatch.REWARDS] + train_batch[SampleBatch.TERMINATEDS] + return 0.0 @override(EagerTFPolicyV2) def postprocess_trajectory( diff --git a/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py b/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py index 04ac92fc2ba5b..ae3a07d41233e 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py @@ -1,69 +1,18 @@ -import logging -from typing import Dict, List, Type, Union - from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.evaluation.postprocessing import ( - Postprocessing, - compute_gae_for_sample_batch, -) -from ray.rllib.models.action_dist import ActionDistribution -from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.evaluation.postprocessing import compute_gae_for_sample_batch from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import ( - EntropyCoeffSchedule, - KLCoeffMixin, - LearningRateSchedule, -) from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.torch_utils import ( - apply_grad_clipping, - explained_variance, - sequence_mask, - warn_if_infinite_kl_divergence, -) -from ray.rllib.utils.typing import TensorType torch, nn = try_import_torch() -logger = logging.getLogger(__name__) - - -class PPOTorchPolicyWithRLModule( - LearningRateSchedule, - EntropyCoeffSchedule, - KLCoeffMixin, - TorchPolicyV2, -): - """PyTorch policy class used with PPO. - - This class is copied from PPOTorchPolicyV2 and is modified to support RLModules. - Some subtle differences: - - if config._enable_rl_module api is true make_rl_module should be implemented by - the policy the policy is assumed to be compatible with rl_modules (i.e. self.model - would be an RLModule) - - Tower stats no longer belongs to the model (i.e. RLModule) instead it belongs to - the policy itself. - - Connectors should be enabled to use this policy - - So far it only works for vectorized obs and action spaces (Fully connected neural - networks). we need model catalog to work for other obs and action spaces. 
- - # TODO: In the future we will deprecate doing all phases of training, exploration, - # and inference via one policy abstraction. Instead, we will use separate - # abstractions for each phase. For training (i.e. gradient updates, given the - # sample that have been collected) we will use Learner which will own one or - # possibly many RLModules, and RLOptimizer. For exploration, we will use RLSampler - # which will own RLModule, and RLTrajectoryProcessor. The exploration and inference - # phase details are TBD but the whole point is to make rllib extremely modular. - """ +# TODO: Remove once we have a RLModule capable sampler class that can replace +# `Policy.compute_actions_from_input_dict()`. +class PPOTorchPolicyWithRLModule(TorchPolicyV2): def __init__(self, observation_space, action_space, config): - # TODO: Move into Policy API, if needed at all here. Why not move this into - # `PPOConfig`?. validate_config(config) - TorchPolicyV2.__init__( self, observation_space, @@ -71,175 +20,21 @@ def __init__(self, observation_space, action_space, config): config, max_seq_len=config["model"]["max_seq_len"], ) - - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - KLCoeffMixin.__init__(self, config) - - # TODO: Don't require users to call this manually. self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) - def loss( - self, - model: ModelV2, - dist_class: Type[ActionDistribution], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - """Compute loss for Proximal Policy Objective. - - Args: - model: The Model to calculate the loss for. - dist_class: The action distr. class. - train_batch: The training data. - - Returns: - The PPO loss tensor given the input batch. - """ - - fwd_out = model.forward_train(train_batch) - curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] - state = fwd_out.get("state_out", {}) - - # TODO (Kourosh): come back to RNNs later - # RNN case: Mask away 0-padded chunks at end of time axis. - if state: - B = len(train_batch[SampleBatch.SEQ_LENS]) - max_seq_len = train_batch[SampleBatch.OBS].shape[0] // B - mask = sequence_mask( - train_batch[SampleBatch.SEQ_LENS], - max_seq_len, - time_major=self.config["model"]["_time_major"], - ) - mask = torch.reshape(mask, [-1]) - num_valid = torch.sum(mask) - - def reduce_mean_valid(t): - return torch.sum(t[mask]) / num_valid - - # non-RNN case: No masking. - else: - mask = None - reduce_mean_valid = torch.mean - - action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class.from_logits( - train_batch[SampleBatch.ACTION_DIST_INPUTS] - ) - - logp_ratio = torch.exp( - fwd_out[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTION_LOGP] - ) - - # Only calculate kl loss if necessary (kl-coeff > 0.0). - if self.config["kl_coeff"] > 0.0: - action_kl = prev_action_dist.kl(curr_action_dist) - mean_kl_loss = reduce_mean_valid(action_kl) - # TODO smorad: should we do anything besides warn? 
Could discard KL term - # for this update - warn_if_infinite_kl_divergence(self, mean_kl_loss) - else: - mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device) - - curr_entropy = fwd_out["entropy"] - mean_entropy = reduce_mean_valid(curr_entropy) - - surrogate_loss = torch.min( - train_batch[Postprocessing.ADVANTAGES] * logp_ratio, - train_batch[Postprocessing.ADVANTAGES] - * torch.clamp( - logp_ratio, 1 - self.config["clip_param"], 1 + self.config["clip_param"] - ), - ) - - # Compute a value function loss. - if self.config["use_critic"]: - value_fn_out = fwd_out[SampleBatch.VF_PREDS] - vf_loss = torch.pow( - value_fn_out - train_batch[Postprocessing.VALUE_TARGETS], 2.0 - ) - vf_loss_clipped = torch.clamp(vf_loss, 0, self.config["vf_clip_param"]) - mean_vf_loss = reduce_mean_valid(vf_loss_clipped) - mean_vf_unclipped_loss = reduce_mean_valid(vf_loss) - # Ignore the value function. - else: - value_fn_out = torch.tensor(0.0).to(surrogate_loss.device) - mean_vf_unclipped_loss = vf_loss_clipped = mean_vf_loss = torch.tensor( - 0.0 - ).to(surrogate_loss.device) - - total_loss = reduce_mean_valid( - -surrogate_loss - + self.config["vf_loss_coeff"] * vf_loss_clipped - - self.entropy_coeff * curr_entropy - ) - - # Add mean_kl_loss (already processed through `reduce_mean_valid`), - # if necessary. - if self.config["kl_coeff"] > 0.0: - total_loss += self.kl_coeff * mean_kl_loss - - # TODO (Kourosh) Where would tower_stats go? How should stats_fn be implemented - # here? - # Store values for stats function in model (tower), such that for - # multi-GPU, we do not override them during the parallel loss phase. - self.tower_stats[model]["total_loss"] = total_loss - self.tower_stats[model]["mean_policy_loss"] = reduce_mean_valid(-surrogate_loss) - self.tower_stats[model]["mean_vf_loss"] = mean_vf_loss - self.tower_stats[model]["unclipped_vf_loss"] = mean_vf_unclipped_loss - self.tower_stats[model]["vf_explained_var"] = explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], value_fn_out - ) - self.tower_stats[model]["mean_entropy"] = mean_entropy - self.tower_stats[model]["mean_kl_loss"] = mean_kl_loss - - return total_loss - - # TODO: Make this an event-style subscription (e.g.: - # "after_gradients_computed"). 
- @override(TorchPolicyV2) - def extra_grad_process(self, local_optimizer, loss): - return apply_grad_clipping(self, local_optimizer, loss) - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return convert_to_numpy( - { - "cur_kl_coeff": self.kl_coeff, - "cur_lr": self.cur_lr, - "total_loss": torch.mean( - torch.stack(self.get_tower_stats("total_loss")) - ), - "policy_loss": torch.mean( - torch.stack(self.get_tower_stats("mean_policy_loss")) - ), - "vf_loss": torch.mean( - torch.stack(self.get_tower_stats("mean_vf_loss")) - ), - "vf_explained_var": torch.mean( - torch.stack(self.get_tower_stats("vf_explained_var")) - ), - "kl": torch.mean(torch.stack(self.get_tower_stats("mean_kl_loss"))), - "entropy": torch.mean( - torch.stack(self.get_tower_stats("mean_entropy")) - ), - "entropy_coeff": self.entropy_coeff, - "unclipped_vf_loss": torch.mean( - torch.stack(self.get_tower_stats("unclipped_vf_loss")) - ), - } - ) + def loss(self, model, dist_class, train_batch): + self.stats = {} + train_batch[SampleBatch.ACTION_LOGP] + train_batch[SampleBatch.ACTIONS] + train_batch[SampleBatch.REWARDS] + train_batch[SampleBatch.TERMINATEDS] + return 0.0 @override(TorchPolicyV2) def postprocess_trajectory( self, sample_batch, other_agent_batches=None, episode=None ): - # Do all post-processing always with no_grad(). - # Not using this here will introduce a memory leak - # in torch (issue #6962). - # TODO: no_grad still necessary? with torch.no_grad(): return compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode From cc25d0caa07b7f05e0060d970ae889e1e410bf16 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 15 Apr 2023 11:56:29 +0200 Subject: [PATCH 02/34] wip Signed-off-by: sven1977 --- .../impala/tests/test_impala_learner.py | 10 ++-- .../algorithms/ppo/tests/test_ppo_learner.py | 10 ++-- rllib/algorithms/ppo/tf/ppo_tf_rl_module.py | 48 +++++++++---------- rllib/core/models/base.py | 22 ++++----- rllib/core/models/tf/encoder.py | 6 +-- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index 5358816b8195f..a7cec6898ea4f 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -92,6 +92,10 @@ def test_impala_loss(self): train_batch = convert_to_torch_tensor(SampleBatch(FAKE_BATCH)) policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) + # We shim'd the loss function of the PolicyRLM class. Always returns + # 0.0 b/c losses on Policies are no longer needed with the Learner API + # enabled. 
+ check(policy_loss, 0.0) train_batch = SampleBatch(FAKE_BATCH) algo_config = config.copy(copy_frozen=False) @@ -110,11 +114,7 @@ def test_impala_loss(self): learner_group_config.num_learner_workers = 0 learner_group = learner_group_config.build() learner_group.set_weights(trainer.get_weights()) - results = learner_group.update(train_batch.as_multi_agent()) - - learner_group_loss = results[ALL_MODULES]["total_loss"] - - check(learner_group_loss, policy_loss) + learner_group.update(train_batch.as_multi_agent()) if __name__ == "__main__": diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 12e910ed85996..214dcd5ffc620 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -92,6 +92,10 @@ def test_loss(self): ) policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) + # We shim'd the loss function of the PolicyRLM class. Always returns + # 0.0 b/c losses on Policies are no longer needed with the Learner API + # enabled. + check(policy_loss, 0.0) algo_config = config.copy(copy_frozen=False) algo_config.training(_enable_learner_api=True) @@ -111,11 +115,7 @@ def test_loss(self): # load the trainer weights onto the learner_group learner_group.set_weights(trainer.get_weights()) - results = learner_group.update(train_batch.as_multi_agent()) - - learner_group_loss = results[ALL_MODULES]["total_loss"] - - check(learner_group_loss, policy_loss) + learner_group.update(train_batch.as_multi_agent()) if __name__ == "__main__": diff --git a/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py b/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py index a4a134e8408e9..40558747df8d0 100644 --- a/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py +++ b/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py @@ -69,14 +69,14 @@ def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]: output = {} # TODO (Artur): Remove this once Policy supports RNN - if self.encoder.config.shared: - batch[STATE_IN] = None - else: - batch[STATE_IN] = { - ACTOR: None, - CRITIC: None, - } - batch[SampleBatch.SEQ_LENS] = None + # if self.encoder.config.shared: + # batch[STATE_IN] = None + # else: + # batch[STATE_IN] = { + # ACTOR: None, + # CRITIC: None, + # } + # batch[SampleBatch.SEQ_LENS] = None encoder_outs = self.encoder(batch) # TODO (Artur): Un-uncomment once Policy supports RNN @@ -99,14 +99,14 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: output = {} # TODO (Artur): Remove this once Policy supports RNN - if self.encoder.config.shared: - batch[STATE_IN] = None - else: - batch[STATE_IN] = { - ACTOR: None, - CRITIC: None, - } - batch[SampleBatch.SEQ_LENS] = None + # if self.encoder.config.shared: + # batch[STATE_IN] = None + # else: + # batch[STATE_IN] = { + # ACTOR: None, + # CRITIC: None, + # } + # batch[SampleBatch.SEQ_LENS] = None # Shared encoder encoder_outs = self.encoder(batch) @@ -132,14 +132,14 @@ def _forward_train(self, batch: NestedDict): output = {} # TODO (Artur): Remove this once Policy supports RNN - if self.encoder.config.shared: - batch[STATE_IN] = None - else: - batch[STATE_IN] = { - ACTOR: None, - CRITIC: None, - } - batch[SampleBatch.SEQ_LENS] = None + # if self.encoder.config.shared: + # batch[STATE_IN] = None + # else: + # batch[STATE_IN] = { + # ACTOR: None, + # CRITIC: None, + # } + # batch[SampleBatch.SEQ_LENS] = None # Shared encoder encoder_outs = self.encoder(batch) diff --git a/rllib/core/models/base.py b/rllib/core/models/base.py index 
1be7eaf3ef6ce..cbf2115360251 100644 --- a/rllib/core/models/base.py +++ b/rllib/core/models/base.py @@ -363,19 +363,19 @@ def __init__(self, config: ModelConfig) -> None: @override(Model) def get_input_specs(self) -> Optional[Spec]: - if self.config.shared: - state_in_spec = self.encoder.input_specs[STATE_IN] - else: - state_in_spec = { - ACTOR: self.actor_encoder.input_specs[STATE_IN], - CRITIC: self.critic_encoder.input_specs[STATE_IN], - } + #if self.config.shared: + # state_in_spec = self.encoder.input_specs[STATE_IN] + #else: + # state_in_spec = { + # ACTOR: self.actor_encoder.input_specs[STATE_IN], + # CRITIC: self.critic_encoder.input_specs[STATE_IN], + # } return SpecDict( { SampleBatch.OBS: None, - STATE_IN: state_in_spec, - SampleBatch.SEQ_LENS: None, + #STATE_IN: state_in_spec, + #SampleBatch.SEQ_LENS: None, } ) @@ -420,9 +420,9 @@ def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: } ) else: - actor_inputs = NestedDict({**inputs, **{STATE_IN: inputs[STATE_IN][ACTOR]}}) + actor_inputs = NestedDict({**inputs})#, **{STATE_IN: inputs[STATE_IN][ACTOR]}}) critic_inputs = NestedDict( - {**inputs, **{STATE_IN: inputs[STATE_IN][CRITIC]}} + {**inputs}#, **{STATE_IN: inputs[STATE_IN][CRITIC]}} ) actor_out = self.actor_encoder(actor_inputs, **kwargs) critic_out = self.critic_encoder(critic_inputs, **kwargs) diff --git a/rllib/core/models/tf/encoder.py b/rllib/core/models/tf/encoder.py index 0725931a41532..9f30e84b0686a 100644 --- a/rllib/core/models/tf/encoder.py +++ b/rllib/core/models/tf/encoder.py @@ -48,8 +48,8 @@ def get_input_specs(self) -> Optional[Spec]: return SpecDict( { SampleBatch.OBS: TfTensorSpec("b, d", d=self.config.input_dims[0]), - STATE_IN: None, - SampleBatch.SEQ_LENS: None, + #STATE_IN: None, + #SampleBatch.SEQ_LENS: None, } ) @@ -67,7 +67,7 @@ def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: return NestedDict( { ENCODER_OUT: self.net(inputs[SampleBatch.OBS]), - STATE_OUT: inputs[STATE_IN], + STATE_OUT: None,#inputs[STATE_IN], } ) From 0248ce85a72aa44d4d82d657299e7f3cfa09533b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 4 May 2023 17:23:34 +0200 Subject: [PATCH 03/34] wip Signed-off-by: sven1977 --- ci/lint/format.sh | 16 ++++++++-------- rllib/algorithms/algorithm_config.py | 16 ++++++++++------ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/ci/lint/format.sh b/ci/lint/format.sh index 3ada3d92c02a5..7dfc6be0dafe5 100755 --- a/ci/lint/format.sh +++ b/ci/lint/format.sh @@ -107,14 +107,14 @@ else echo "WARNING: clang-format is not installed!" fi -#if command -v java >/dev/null; then -# if [ ! -f "$GOOGLE_JAVA_FORMAT_JAR" ]; then -# echo "Java code format tool google-java-format.jar is not installed, start to install it." -# wget https://github.com/google/google-java-format/releases/download/google-java-format-1.7/google-java-format-1.7-all-deps.jar -O "$GOOGLE_JAVA_FORMAT_JAR" -# fi -#else -# echo "WARNING:java is not installed, skip format java files!" -#fi +if command -v java >/dev/null; then + if [ ! -f "$GOOGLE_JAVA_FORMAT_JAR" ]; then + echo "Java code format tool google-java-format.jar is not installed, start to install it." + wget https://github.com/google/google-java-format/releases/download/google-java-format-1.7/google-java-format-1.7-all-deps.jar -O "$GOOGLE_JAVA_FORMAT_JAR" + fi +else + echo "WARNING:java is not installed, skip format java files!" +fi if [[ $(flake8 --version) != *"flake8_quotes"* ]]; then echo "WARNING: Ray uses flake8 with flake8_quotes. Might error without it. 
Install with: pip install flake8-quotes" diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 6d20f874cdef6..20cbc161b228e 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -844,7 +844,7 @@ def validate(self) -> None: error=True, ) - # RLModule API only works with connectors. + # RLModule API only works with connectors and with Learner API. if not self.enable_connectors and self._enable_rl_module_api: raise ValueError( "RLModule API only works with connectors. " @@ -853,11 +853,15 @@ def validate(self) -> None: ) # Learner API requires RLModule API. - if self._enable_learner_api and not self._enable_rl_module_api: + if not ( + self._enable_learner_api is True == self._enable_rl_module_api is True + ): raise ValueError( - "Learner API requires RLModule API. " - "Please enable RLModule API via " - "`config.training(_enable_rl_module_api=True)`." + "Learner API requires RLModule API and vice-versa! " + "Enable RLModule API via " + "`config.rl_module(_enable_rl_module_api=True)` and the Learner API " + "via `config.training(_enable_learner_api=True)` (or set both to " + "False)." ) if bool(os.environ.get("RLLIB_ENABLE_RL_MODULE", False)): @@ -1676,7 +1680,7 @@ def training( deprecation_warning( old="AlgorithmConfig.training(_use_default_native_models=True)", help="_use_default_native_models is not supported " - "anymore. To get rid of this error, set `experimental(" + "anymore. To get rid of this error, set `rl_module(" "_enable_rl_module_api` to True. Native models will " "be better supported by the upcoming RLModule API.", # Error out if user tries to enable this From 04595b3a81173d0eb9374a020ad704381d96afdc Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 4 May 2023 17:53:56 +0200 Subject: [PATCH 04/34] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 2 +- rllib/algorithms/appo/tests/tf/test_appo_learner.py | 10 ++++++---- rllib/algorithms/impala/tests/test_impala_learner.py | 1 - rllib/algorithms/ppo/tests/test_ppo_learner.py | 6 ++---- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 20cbc161b228e..b3a61628361a8 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -854,7 +854,7 @@ def validate(self) -> None: # Learner API requires RLModule API. if not ( - self._enable_learner_api is True == self._enable_rl_module_api is True + (self._enable_learner_api is True) is (self._enable_rl_module_api is True) ): raise ValueError( "Learner API requires RLModule API and vice-versa! " diff --git a/rllib/algorithms/appo/tests/tf/test_appo_learner.py b/rllib/algorithms/appo/tests/tf/test_appo_learner.py index 8c3978fc30550..87f48156d83c1 100644 --- a/rllib/algorithms/appo/tests/tf/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/tf/test_appo_learner.py @@ -66,6 +66,7 @@ def test_appo_loss(self): fcnet_activation="linear", vf_share_layers=False, ), + _enable_learner_api=True, ) .rl_module( _enable_rl_module_api=True, @@ -86,9 +87,12 @@ def test_appo_loss(self): else: train_batch = SampleBatch(FAKE_BATCH) policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) + # We shim'd the loss function of the PolicyRLM class. Always returns + # 0.0 b/c losses on Policies are no longer needed with the Learner API + # enabled. 
+ check(policy_loss, 0.0) algo_config = config.copy(copy_frozen=False) - algo_config.training(_enable_learner_api=True) algo_config.validate() algo_config.freeze() @@ -104,10 +108,8 @@ def test_appo_loss(self): learner_group_config.num_learner_workers = 0 learner_group = learner_group_config.build() learner_group.set_weights(trainer.get_weights()) - results = learner_group.update(train_batch.as_multi_agent()) - learner_group_loss = results[ALL_MODULES]["total_loss"] - check(learner_group_loss, policy_loss) + learner_group.update(train_batch.as_multi_agent()) def test_kl_coeff_changes(self): initial_kl_coeff = 0.01 diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index a7cec6898ea4f..7e47d15bb18a0 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -7,7 +7,6 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch, try_import_tf -from ray.rllib.utils.metrics import ALL_MODULES from ray.rllib.utils.test_utils import check from ray.rllib.utils.test_utils import framework_iterator from ray.rllib.utils.torch_utils import convert_to_torch_tensor diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index e454a3c50e93a..b5ec88b618c73 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -12,7 +12,6 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.test_utils import check, framework_iterator -from ray.rllib.utils.metrics import ALL_MODULES from ray.rllib.evaluation.postprocessing import ( compute_gae_for_sample_batch, @@ -68,6 +67,7 @@ def test_loss(self): fcnet_activation="linear", vf_share_layers=False, ), + _enable_learner_api=True, ) .rl_module( _enable_rl_module_api=True, @@ -87,7 +87,6 @@ def test_loss(self): lambda x: torch.as_tensor(x).float(), train_batch ) else: - # tf train_batch = tree.map_structure( lambda x: tf.convert_to_tensor(x), train_batch ) @@ -99,7 +98,6 @@ def test_loss(self): check(policy_loss, 0.0) algo_config = config.copy(copy_frozen=False) - algo_config.training(_enable_learner_api=True) algo_config.validate() algo_config.freeze() @@ -142,7 +140,7 @@ def test_save_load_state(self): algo = config.build() policy = algo.get_policy() - for fw in framework_iterator(config, ("tf2", "torch"), with_eager_tracing=True): + for _ in framework_iterator(config, ("tf2", "torch"), with_eager_tracing=True): algo_config = config.copy(copy_frozen=False) algo_config.validate() algo_config.freeze() From bf78988dd399d55d5d684082c6b87b18e25b9c41 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 4 May 2023 18:19:40 +0200 Subject: [PATCH 05/34] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/tests/tf/test_appo_learner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rllib/algorithms/appo/tests/tf/test_appo_learner.py b/rllib/algorithms/appo/tests/tf/test_appo_learner.py index 87f48156d83c1..b75eb5ebb3b68 100644 --- a/rllib/algorithms/appo/tests/tf/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/tf/test_appo_learner.py @@ -8,7 +8,6 @@ ) from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import SampleBatch, DEFAULT_POLICY_ID -from 
ray.rllib.utils.metrics import ALL_MODULES from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.test_utils import check, framework_iterator From ceb8caaa4685db9bcfee24a9d6a43a31cc03e19e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 4 May 2023 20:47:22 +0200 Subject: [PATCH 06/34] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index b3a61628361a8..1b80e0c49226c 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -865,9 +865,10 @@ def validate(self) -> None: ) if bool(os.environ.get("RLLIB_ENABLE_RL_MODULE", False)): - # enable RLModule API and connectors if env variable is set + # Enable RLModule API and connectors if env variable is set # (to be used in unittesting) self.rl_module(_enable_rl_module_api=True) + self.training(_enable_learner_api=True) self.enable_connectors = True # Explore parameter cannot be False with RLModule API enabled. From b36304db51199480d0a54883a0ecdfe48cc69bfa Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 5 May 2023 22:13:45 +0200 Subject: [PATCH 07/34] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo.py | 17 +---- .../algorithms/appo/tf/appo_tf_policy_rlm.py | 3 +- .../appo/torch/appo_torch_policy_rlm.py | 3 +- rllib/algorithms/impala/impala.py | 67 +++++++------------ .../impala/tf/impala_tf_policy_rlm.py | 3 +- .../impala/torch/impala_torch_policy_rlm.py | 3 +- rllib/algorithms/ppo/ppo.py | 23 ++----- rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py | 3 +- .../ppo/torch/ppo_torch_policy_rlm.py | 3 +- 9 files changed, 42 insertions(+), 83 deletions(-) diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 5b503f239192e..8b27ef5100cc4 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -381,16 +381,9 @@ def get_default_policy_class( cls, config: AlgorithmConfig ) -> Optional[Type[Policy]]: if config["framework"] == "torch": - if config._enable_rl_module_api: - from ray.rllib.algorithms.appo.torch.appo_torch_policy_rlm import ( - APPOTorchPolicyWithRLModule, - ) - - return APPOTorchPolicyWithRLModule - else: - from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy + from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy - return APPOTorchPolicy + return APPOTorchPolicy elif config["framework"] == "tf": if config._enable_rl_module_api: raise ValueError( @@ -402,12 +395,6 @@ def get_default_policy_class( return APPOTF1Policy else: - if config._enable_rl_module_api: - from ray.rllib.algorithms.appo.tf.appo_tf_policy_rlm import ( - APPOTfPolicyWithRLModule, - ) - - return APPOTfPolicyWithRLModule from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF2Policy return APPOTF2Policy diff --git a/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py b/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py index 4539d8f7ccff9..6d7d4e92c7826 100644 --- a/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py +++ b/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py @@ -4,7 +4,7 @@ from ray.rllib.utils.annotations import override -# TODO: Remove once we have a RLModule capable sampler class that can replace +"""# TODO: Remove once we have a RLModule capable sampler class that can replace # `Policy.compute_actions_from_input_dict()`. 
class APPOTfPolicyWithRLModule(EagerTFPolicyV2): def __init__(self, observation_space, action_space, config): @@ -27,3 +27,4 @@ def loss(self, model, dist_class, train_batch): @override(EagerTFPolicyV2) def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"] +""" \ No newline at end of file diff --git a/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py b/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py index 4b4396a4f29b7..e3f387bec5bce 100644 --- a/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py +++ b/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -# TODO: Remove once we have a RLModule capable sampler class that can replace +"""# TODO: Remove once we have a RLModule capable sampler class that can replace # `Policy.compute_actions_from_input_dict()`. class APPOTorchPolicyWithRLModule(TorchPolicyV2): def __init__(self, observation_space, action_space, config): @@ -31,3 +31,4 @@ def loss(self, model, dist_class, train_batch): @override(TorchPolicyV2) def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"] +""" \ No newline at end of file diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 57c47e3ec9403..1548698e23077 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -561,58 +561,39 @@ def get_default_policy_class( if not config["vtrace"]: raise ValueError("IMPALA with the learner API does not support non-VTrace ") - if config._enable_rl_module_api: - if config["framework"] == "tf2": - from ray.rllib.algorithms.impala.tf.impala_tf_policy_rlm import ( - ImpalaTfPolicyWithRLModule, + if config["framework"] == "torch": + if config["vtrace"]: + from ray.rllib.algorithms.impala.impala_torch_policy import ( + ImpalaTorchPolicy, ) - return ImpalaTfPolicyWithRLModule - if config["framework"] == "torch": - from ray.rllib.algorithms.impala.torch.impala_torch_policy_rlm import ( - ImpalaTorchPolicyWithRLModule, - ) - - return ImpalaTorchPolicyWithRLModule + return ImpalaTorchPolicy else: - raise ValueError( - f"IMPALA with the learner API does not support framework " - f"{config['framework']} " - ) - else: - if config["framework"] == "torch": - if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_torch_policy import ( - ImpalaTorchPolicy, - ) + from ray.rllib.algorithms.a3c.a3c_torch_policy import A3CTorchPolicy - return ImpalaTorchPolicy - else: - from ray.rllib.algorithms.a3c.a3c_torch_policy import A3CTorchPolicy + return A3CTorchPolicy + elif config["framework"] == "tf": + if config["vtrace"]: + from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF1Policy, + ) - return A3CTorchPolicy - elif config["framework"] == "tf": - if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_tf_policy import ( - ImpalaTF1Policy, - ) + return ImpalaTF1Policy + else: + from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy - return ImpalaTF1Policy - else: - from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy + return A3CTFPolicy + else: + if config["vtrace"]: + from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF2Policy, + ) - return A3CTFPolicy + return ImpalaTF2Policy else: - if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_tf_policy import ( - ImpalaTF2Policy, - ) - - return ImpalaTF2Policy - else: - from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy + from ray.rllib.algorithms.a3c.a3c_tf_policy import 
A3CTFPolicy - return A3CTFPolicy + return A3CTFPolicy @override(Algorithm) def setup(self, config: AlgorithmConfig): diff --git a/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py b/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py index f8447ca312c3e..ba67c142f7765 100644 --- a/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py +++ b/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py @@ -4,7 +4,7 @@ from ray.rllib.utils.annotations import override -# TODO: Remove once we have a RLModule capable sampler class that can replace +"""# TODO: Remove once we have a RLModule capable sampler class that can replace # `Policy.compute_actions_from_input_dict()`. class ImpalaTfPolicyWithRLModule(EagerTFPolicyV2): def __init__(self, observation_space, action_space, config): @@ -26,3 +26,4 @@ def loss(self, model, dist_class, train_batch): @override(EagerTFPolicyV2) def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"] +""" \ No newline at end of file diff --git a/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py b/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py index a2d49650f68ec..2caa4f34ffab0 100644 --- a/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py +++ b/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py @@ -4,7 +4,7 @@ from ray.rllib.utils.annotations import override -# TODO: Remove once we have a RLModule capable sampler class that can replace +"""# TODO: Remove once we have a RLModule capable sampler class that can replace # `Policy.compute_actions_from_input_dict()`. class ImpalaTorchPolicyWithRLModule(TorchPolicyV2): def __init__(self, observation_space, action_space, config): @@ -24,3 +24,4 @@ def loss(self, model, dist_class, train_batch): @override(TorchPolicyV2) def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"] +""" \ No newline at end of file diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 89481a8352572..05ab3ee5ca3ea 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -371,32 +371,17 @@ def get_default_policy_class( ) -> Optional[Type[Policy]]: if config["framework"] == "torch": - if config._enable_rl_module_api: - from ray.rllib.algorithms.ppo.torch.ppo_torch_policy_rlm import ( - PPOTorchPolicyWithRLModule, - ) - - return PPOTorchPolicyWithRLModule - else: - from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy + from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy - return PPOTorchPolicy + return PPOTorchPolicy elif config["framework"] == "tf": from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy return PPOTF1Policy else: - if config._enable_rl_module_api: - from ray.rllib.algorithms.ppo.tf.ppo_tf_policy_rlm import ( - PPOTfPolicyWithRLModule, - ) - - return PPOTfPolicyWithRLModule - else: - - from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF2Policy + from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF2Policy - return PPOTF2Policy + return PPOTF2Policy @ExperimentalAPI def training_step(self) -> ResultDict: diff --git a/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py b/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py index 7f0957db84cf2..508cae595d15a 100644 --- a/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py +++ b/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py @@ -5,7 +5,7 @@ from ray.rllib.utils.annotations import override -# TODO: Remove once we have a RLModule capable sampler class that can replace +""""# TODO: Remove once we have a RLModule capable sampler class that can 
replace # `Policy.compute_actions_from_input_dict()`. class PPOTfPolicyWithRLModule(EagerTFPolicyV2): def __init__(self, observation_space, action_space, config): @@ -32,3 +32,4 @@ def postprocess_trajectory( return compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) +""" \ No newline at end of file diff --git a/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py b/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py index ae3a07d41233e..4cbe75b4cf4ec 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py @@ -8,7 +8,7 @@ torch, nn = try_import_torch() -# TODO: Remove once we have a RLModule capable sampler class that can replace +"""# TODO: Remove once we have a RLModule capable sampler class that can replace # `Policy.compute_actions_from_input_dict()`. class PPOTorchPolicyWithRLModule(TorchPolicyV2): def __init__(self, observation_space, action_space, config): @@ -39,3 +39,4 @@ def postprocess_trajectory( return compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) +""" From a112d5b080a925301331a6668d84af712a2e5ffe Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 5 May 2023 22:40:15 +0200 Subject: [PATCH 08/34] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 36 ++++++----- rllib/algorithms/appo/appo_torch_policy.py | 29 +++++---- .../appo/tests/test_appo_learner.py | 5 -- rllib/algorithms/impala/impala_tf_policy.py | 21 +++---- .../algorithms/impala/impala_torch_policy.py | 19 +++--- .../impala/tests/test_impala_learner.py | 6 -- rllib/algorithms/ppo/ppo_tf_policy.py | 21 +++---- rllib/algorithms/ppo/ppo_torch_policy.py | 17 +++--- .../algorithms/ppo/tests/test_ppo_learner.py | 6 -- rllib/policy/policy.py | 5 +- rllib/policy/torch_policy_v2.py | 59 ++++++++++--------- 11 files changed, 112 insertions(+), 112 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index d91b4516bfd7a..a254b35bf4371 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -81,10 +81,13 @@ def __init__( # First thing first, enable eager execution if necessary. base.enable_eager_execution_if_necessary() + old_stack = not config.get("_enable_learner_api", False) + # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will use the make_model() call. - VTraceClipGradients.__init__(self) - VTraceOptimizer.__init__(self) + if old_stack: + VTraceClipGradients.__init__(self) + VTraceOptimizer.__init__(self) # Initialize base class. base.__init__( @@ -96,22 +99,23 @@ def __init__( existing_model=existing_model, ) - # TF LearningRateSchedule depends on self.framework, so initialize - # after base.__init__() is called. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - ValueNetworkMixin.__init__(self, config) - KLCoeffMixin.__init__(self, config) - GradStatsMixin.__init__(self) + if old_stack: + # TF LearningRateSchedule depends on self.framework, so initialize + # after base.__init__() is called. 
+ LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) + GradStatsMixin.__init__(self) - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. - self.maybe_initialize_optimizer_and_loss() + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. + self.maybe_initialize_optimizer_and_loss() - # Initiate TargetNetwork ops after loss initialization. - TargetNetworkMixin.__init__(self) + # Initiate TargetNetwork ops after loss initialization. + TargetNetworkMixin.__init__(self) @override(base) def make_model(self) -> ModelV2: diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index b92b7c32fd510..2c2d2253874d0 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -69,10 +69,12 @@ class APPOTorchPolicy( def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config) - # Although this is a no-op, we call __init__ here to make it clear - # that base.__init__ will use the make_model() call. - VTraceOptimizer.__init__(self) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + old_stack = not config.get("_enable_learner_api", False) + if old_stack: + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. + VTraceOptimizer.__init__(self) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) TorchPolicyV2.__init__( self, @@ -82,17 +84,18 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - ValueNetworkMixin.__init__(self, config) - KLCoeffMixin.__init__(self, config) + if old_stack: + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() + # TODO: Don't require users to call this manually. + self._initialize_loss_from_dummy_batch() - # Initiate TargetNetwork ops after loss initialization. - TargetNetworkMixin.__init__(self) + # Initiate TargetNetwork ops after loss initialization. + TargetNetworkMixin.__init__(self) @override(TorchPolicyV2) def init_view_requirements(self): diff --git a/rllib/algorithms/appo/tests/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py index 005296c446f1d..1bc1bd1b0a087 100644 --- a/rllib/algorithms/appo/tests/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -90,11 +90,6 @@ def test_appo_loss(self): train_batch = SampleBatch( tree.map_structure(lambda x: convert_to_torch_tensor(x), FAKE_BATCH) ) - policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) - # We shim'd the loss function of the PolicyRLM class. Always returns - # 0.0 b/c losses on Policies are no longer needed with the Learner API - # enabled. 
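# The recurring pattern in this commit, shown as a standalone sketch (this is
# not RLlib code; class and method names below are placeholders): legacy
# per-policy setup (mixins, optimizers, dummy-batch loss tracing) only runs
# when the new Learner API is off.
class PolicySketch:
    def __init__(self, config: dict):
        self.config = config
        old_stack = not config.get("_enable_learner_api", False)
        if old_stack:
            # Old stack: the Policy still owns LR/entropy schedules, optimizers
            # and the loss, so they must be initialized here.
            self._init_mixins_and_loss()
        # New stack: the Learner owns optimizers, schedules and losses, so the
        # Policy-side setup is skipped entirely.

    def _init_mixins_and_loss(self):
        self._optimizers = []


assert not hasattr(PolicySketch({"_enable_learner_api": True}), "_optimizers")
assert hasattr(PolicySketch({}), "_optimizers")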
- check(policy_loss, 0.0) algo_config = config.copy(copy_frozen=False) algo_config.validate() diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index e1b66f5332127..5a495cdf16081 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -297,17 +297,18 @@ def __init__( existing_model=existing_model, ) - GradStatsMixin.__init__(self) - VTraceClipGradients.__init__(self) - VTraceOptimizer.__init__(self) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) + if not self.config.get("_enable_learner_api"): + GradStatsMixin.__init__(self) + VTraceClipGradients.__init__(self) + VTraceOptimizer.__init__(self) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. - self.maybe_initialize_optimizer_and_loss() + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. + self.maybe_initialize_optimizer_and_loss() @override(base) def loss( diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 73d4b3c7bd121..894d34a6260ac 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -201,13 +201,14 @@ def __init__(self, observation_space, action_space, config): ray.rllib.algorithms.impala.impala.ImpalaConfig().to_dict(), **config ) - VTraceOptimizer.__init__(self) - # Need to initialize learning rate variable before calling - # TorchPolicyV2.__init__. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) + if not self.config.get("_enable_learner_api"): + VTraceOptimizer.__init__(self) + # Need to initialize learning rate variable before calling + # TorchPolicyV2.__init__. + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) TorchPolicyV2.__init__( self, @@ -217,8 +218,8 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() + if not self.config.get("_enable_learner_api"): + self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) def loss( diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index f72466cfcc72b..0ed63b3557cfb 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -89,12 +89,6 @@ def test_impala_loss(self): elif fw == "torch": train_batch = convert_to_torch_tensor(SampleBatch(FAKE_BATCH)) - policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) - # We shim'd the loss function of the PolicyRLM class. Always returns - # 0.0 b/c losses on Policies are no longer needed with the Learner API - # enabled. 
- check(policy_loss, 0.0) - algo_config = config.copy(copy_frozen=False) algo_config.validate() algo_config.freeze() diff --git a/rllib/algorithms/ppo/ppo_tf_policy.py b/rllib/algorithms/ppo/ppo_tf_policy.py index a00f8c037eb6e..93a1382c3e382 100644 --- a/rllib/algorithms/ppo/ppo_tf_policy.py +++ b/rllib/algorithms/ppo/ppo_tf_policy.py @@ -87,17 +87,18 @@ def __init__( existing_model=existing_model, ) - # Initialize MixIns. - ValueNetworkMixin.__init__(self, config) - KLCoeffMixin.__init__(self, config) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + if not self.config.get("_enable_learner_api", False): + # Initialize MixIns. + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. - self.maybe_initialize_optimizer_and_loss() + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. + self.maybe_initialize_optimizer_and_loss() @override(base) def loss( diff --git a/rllib/algorithms/ppo/ppo_torch_policy.py b/rllib/algorithms/ppo/ppo_torch_policy.py index df45eefeb14c8..96adddbce90ea 100644 --- a/rllib/algorithms/ppo/ppo_torch_policy.py +++ b/rllib/algorithms/ppo/ppo_torch_policy.py @@ -56,15 +56,16 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) - ValueNetworkMixin.__init__(self, config) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - KLCoeffMixin.__init__(self, config) + if not self.config.get("_enable_learner_api", False): + ValueNetworkMixin.__init__(self, config) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + KLCoeffMixin.__init__(self, config) - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() + # TODO: Don't require users to call this manually. + self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) def loss( diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 9f224f0fd7265..e16ea35641e68 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -91,12 +91,6 @@ def test_loss(self): lambda x: tf.convert_to_tensor(x), train_batch ) - policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) - # We shim'd the loss function of the PolicyRLM class. Always returns - # 0.0 b/c losses on Policies are no longer needed with the Learner API - # enabled. - check(policy_loss, 0.0) - algo_config = config.copy(copy_frozen=False) algo_config.validate() algo_config.freeze() diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index bb534497e3d28..75fb3ccf91582 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1384,7 +1384,10 @@ def _initialize_loss_from_dummy_batch( the loss. 
""" - if self.config.get("_disable_initialize_loss_from_dummy_batch", False): + if ( + self.config.get("_disable_initialize_loss_from_dummy_batch", False) + or self.config.get("_enable_learner_api", False) + ): return # Signal Policy that currently we do not like to eager/jit trace # any function calls. This is to be able to track, which columns diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index ea648fd912ade..f82ce9ff3db47 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -182,34 +182,37 @@ def __init__( self.exploration = None else: self.exploration = self._create_exploration() - self._optimizers = force_list(self.optimizer()) - - # Backward compatibility workaround so Policy will call self.loss() directly. - # TODO(jungong): clean up after all policies are migrated to new sub-class - # implementation. - self._loss = None - - # Store, which params (by index within the model's list of - # parameters) should be updated per optimizer. - # Maps optimizer idx to set or param indices. - self.multi_gpu_param_groups: List[Set[int]] = [] - main_params = {p: i for i, p in enumerate(self.model.parameters())} - for o in self._optimizers: - param_indices = [] - for pg_idx, pg in enumerate(o.param_groups): - for p in pg["params"]: - param_indices.append(main_params[p]) - self.multi_gpu_param_groups.append(set(param_indices)) - - # Create n sample-batch buffers (num_multi_gpu_tower_stacks), each - # one with m towers (num_gpus). - num_buffers = self.config.get("num_multi_gpu_tower_stacks", 1) - self._loaded_batches = [[] for _ in range(num_buffers)] - - # If set, means we are using distributed allreduce during learning. - self.distributed_world_size = None - - self.batch_divisibility_req = self.get_batch_divisibility_req() + + if not self.config.get("_enable_learner_api", False): + self._optimizers = force_list(self.optimizer()) + + # Backward compatibility workaround so Policy will call self.loss() directly. + # TODO (jungong): clean up after all policies are migrated to new sub-class + # implementation. + self._loss = None + + # Store, which params (by index within the model's list of + # parameters) should be updated per optimizer. + # Maps optimizer idx to set or param indices. + self.multi_gpu_param_groups: List[Set[int]] = [] + main_params = {p: i for i, p in enumerate(self.model.parameters())} + for o in self._optimizers: + param_indices = [] + for pg_idx, pg in enumerate(o.param_groups): + for p in pg["params"]: + param_indices.append(main_params[p]) + self.multi_gpu_param_groups.append(set(param_indices)) + + # Create n sample-batch buffers (num_multi_gpu_tower_stacks), each + # one with m towers (num_gpus). + num_buffers = self.config.get("num_multi_gpu_tower_stacks", 1) + self._loaded_batches = [[] for _ in range(num_buffers)] + + # If set, means we are using distributed allreduce during learning. 
+ self.distributed_world_size = None + + self.batch_divisibility_req = self.get_batch_divisibility_req() + self.max_seq_len = max_seq_len # If model is an RLModule it won't have tower_stats instead there will be a From 4b600ef18d3cebccbc0599c07e25d5c6f77e9f91 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 07:15:21 +0200 Subject: [PATCH 09/34] wip Signed-off-by: sven1977 --- rllib/tuned_examples/impala/cartpole-impala.yaml | 4 ++-- rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml | 2 +- rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/rllib/tuned_examples/impala/cartpole-impala.yaml b/rllib/tuned_examples/impala/cartpole-impala.yaml index 46c37c52ea697..1df02c4313cb5 100644 --- a/rllib/tuned_examples/impala/cartpole-impala.yaml +++ b/rllib/tuned_examples/impala/cartpole-impala.yaml @@ -8,8 +8,8 @@ cartpole-impala: # Works for both torch and tf. framework: tf2 num_gpus: 0 - _enable_rl_module_api: True - _enable_learner_api: True + _enable_rl_module_api: true + _enable_learner_api: true grad_clip: 40 num_workers: 2 num_learner_workers: 1 diff --git a/rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml b/rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml index fbfb6905b4ace..2f6afebd53efa 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml +++ b/rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml @@ -19,5 +19,5 @@ cartpole-ppo: vf_share_layers: true enable_connectors: true _enable_rl_module_api: true - _enable_learner_api: false + _enable_learner_api: true eager_tracing: false \ No newline at end of file diff --git a/rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml b/rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml index 5b2888d709f79..98da67a36fdb9 100644 --- a/rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml +++ b/rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml @@ -21,6 +21,7 @@ pendulum-ppo: enable_connectors: true model: fcnet_activation: relu + _enable_learner_api: true _enable_rl_module_api: true # Need to unset this b/c we are using the RLModule API, which # provides exploration control via the RLModule's `forward_exploration` method. From 6dba48be4ff1080acc3e7dc471a6ae03f5e6bb7a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 07:32:35 +0200 Subject: [PATCH 10/34] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 21 +++++++++++---------- rllib/algorithms/appo/appo_torch_policy.py | 19 ++++++++++--------- rllib/algorithms/ppo/ppo_tf_policy.py | 16 ++++++++-------- rllib/algorithms/ppo/ppo_torch_policy.py | 17 +++++++---------- rllib/policy/policy.py | 4 ++++ 5 files changed, 40 insertions(+), 37 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index a254b35bf4371..a473c1a11f031 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -99,23 +99,24 @@ def __init__( existing_model=existing_model, ) + # TF LearningRateSchedule depends on self.framework, so initialize + # after base.__init__() is called. + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) + if old_stack: - # TF LearningRateSchedule depends on self.framework, so initialize - # after base.__init__() is called. 
- LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - ValueNetworkMixin.__init__(self, config) - KLCoeffMixin.__init__(self, config) GradStatsMixin.__init__(self) # Note: this is a bit ugly, but loss and optimizer initialization must # happen after all the MixIns are initialized. self.maybe_initialize_optimizer_and_loss() - # Initiate TargetNetwork ops after loss initialization. - TargetNetworkMixin.__init__(self) + # Initiate TargetNetwork ops after loss initialization. + TargetNetworkMixin.__init__(self) @override(base) def make_model(self) -> ModelV2: diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 2c2d2253874d0..47659361fc5ee 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -74,7 +74,8 @@ def __init__(self, observation_space, action_space, config): # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will use the make_model() call. VTraceOptimizer.__init__(self) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) TorchPolicyV2.__init__( self, @@ -84,18 +85,18 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) - if old_stack: - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - ValueNetworkMixin.__init__(self, config) - KLCoeffMixin.__init__(self, config) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) + if old_stack: # TODO: Don't require users to call this manually. self._initialize_loss_from_dummy_batch() - # Initiate TargetNetwork ops after loss initialization. - TargetNetworkMixin.__init__(self) + # Initiate TargetNetwork ops after loss initialization. + TargetNetworkMixin.__init__(self) @override(TorchPolicyV2) def init_view_requirements(self): diff --git a/rllib/algorithms/ppo/ppo_tf_policy.py b/rllib/algorithms/ppo/ppo_tf_policy.py index 93a1382c3e382..cc4d19c86f15f 100644 --- a/rllib/algorithms/ppo/ppo_tf_policy.py +++ b/rllib/algorithms/ppo/ppo_tf_policy.py @@ -87,15 +87,15 @@ def __init__( existing_model=existing_model, ) - if not self.config.get("_enable_learner_api", False): - # Initialize MixIns. - ValueNetworkMixin.__init__(self, config) - KLCoeffMixin.__init__(self, config) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + # Initialize MixIns. + ValueNetworkMixin.__init__(self, config) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + KLCoeffMixin.__init__(self, config) + if not self.config.get("_enable_learner_api", False): # Note: this is a bit ugly, but loss and optimizer initialization must # happen after all the MixIns are initialized. 
self.maybe_initialize_optimizer_and_loss() diff --git a/rllib/algorithms/ppo/ppo_torch_policy.py b/rllib/algorithms/ppo/ppo_torch_policy.py index 96adddbce90ea..fe235886bb73b 100644 --- a/rllib/algorithms/ppo/ppo_torch_policy.py +++ b/rllib/algorithms/ppo/ppo_torch_policy.py @@ -44,8 +44,6 @@ class PPOTorchPolicy( def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.algorithms.ppo.ppo.PPOConfig().to_dict(), **config) - # TODO: Move into Policy API, if needed at all here. Why not move this into - # `PPOConfig`?. validate_config(config) TorchPolicyV2.__init__( @@ -56,15 +54,14 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) - if not self.config.get("_enable_learner_api", False): - ValueNetworkMixin.__init__(self, config) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - KLCoeffMixin.__init__(self, config) + ValueNetworkMixin.__init__(self, config) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + KLCoeffMixin.__init__(self, config) - # TODO: Don't require users to call this manually. + if not self.config.get("_enable_learner_api", False): self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 75fb3ccf91582..88dcd7d4a9d31 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1358,6 +1358,10 @@ def _get_default_view_requirements(self): SampleBatch.UNROLL_ID: ViewRequirement(), SampleBatch.AGENT_INDEX: ViewRequirement(), SampleBatch.T: ViewRequirement(), + SampleBatch.VF_PREDS: ViewRequirement(), + SampleBatch.ACTION_DIST_INPUTS: ViewRequirement(), + SampleBatch.ACTION_LOGP: ViewRequirement(), + SampleBatch.ACTION_PROB: ViewRequirement(), } def _initialize_loss_from_dummy_batch( From 95fd464e3ac9d96cdca9d651b154f3deba3c0710 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 08:46:01 +0200 Subject: [PATCH 11/34] wip Signed-off-by: sven1977 --- rllib/policy/policy.py | 53 ++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 88dcd7d4a9d31..e1b594de12d1c 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1358,10 +1358,6 @@ def _get_default_view_requirements(self): SampleBatch.UNROLL_ID: ViewRequirement(), SampleBatch.AGENT_INDEX: ViewRequirement(), SampleBatch.T: ViewRequirement(), - SampleBatch.VF_PREDS: ViewRequirement(), - SampleBatch.ACTION_DIST_INPUTS: ViewRequirement(), - SampleBatch.ACTION_LOGP: ViewRequirement(), - SampleBatch.ACTION_PROB: ViewRequirement(), } def _initialize_loss_from_dummy_batch( @@ -1390,7 +1386,6 @@ def _initialize_loss_from_dummy_batch( if ( self.config.get("_disable_initialize_loss_from_dummy_batch", False) - or self.config.get("_enable_learner_api", False) ): return # Signal Policy that currently we do not like to eager/jit trace @@ -1477,29 +1472,31 @@ def _initialize_loss_from_dummy_batch( seq_len = sample_batch_size // B seq_lens = np.array([seq_len for _ in range(B)], dtype=np.int32) postprocessed_batch[SampleBatch.SEQ_LENS] = seq_lens - # Switch on lazy to-tensor conversion on `postprocessed_batch`. 
- train_batch = self._lazy_tensor_dict(postprocessed_batch) - # Calling loss, so set `is_training` to True. - train_batch.set_training(True) - if seq_lens is not None: - train_batch[SampleBatch.SEQ_LENS] = seq_lens - train_batch.count = self._dummy_batch.count - # Call the loss function, if it exists. - # TODO(jungong) : clean up after all agents get migrated. - # We should simply do self.loss(...) here. - if self._loss is not None: - self._loss(self, self.model, self.dist_class, train_batch) - elif ( - is_overridden(self.loss) or self.config.get("_enable_rl_module_api", False) - ) and not self.config["in_evaluation"]: - self.loss(self.model, self.dist_class, train_batch) - # Call the stats fn, if given. - # TODO(jungong) : clean up after all agents get migrated. - # We should simply do self.stats_fn(train_batch) here. - if stats_fn is not None: - stats_fn(self, train_batch) - if hasattr(self, "stats_fn") and not self.config["in_evaluation"]: - self.stats_fn(train_batch) + + if not self.config.get("_enable_learner_api"): + # Switch on lazy to-tensor conversion on `postprocessed_batch`. + train_batch = self._lazy_tensor_dict(postprocessed_batch) + # Calling loss, so set `is_training` to True. + train_batch.set_training(True) + if seq_lens is not None: + train_batch[SampleBatch.SEQ_LENS] = seq_lens + train_batch.count = self._dummy_batch.count + # Call the loss function, if it exists. + # TODO(jungong) : clean up after all agents get migrated. + # We should simply do self.loss(...) here. + if self._loss is not None: + self._loss(self, self.model, self.dist_class, train_batch) + elif ( + is_overridden(self.loss) or self.config.get("_enable_rl_module_api", False) + ) and not self.config["in_evaluation"]: + self.loss(self.model, self.dist_class, train_batch) + # Call the stats fn, if given. + # TODO(jungong) : clean up after all agents get migrated. + # We should simply do self.stats_fn(train_batch) here. + if stats_fn is not None: + stats_fn(self, train_batch) + if hasattr(self, "stats_fn") and not self.config["in_evaluation"]: + self.stats_fn(train_batch) # Re-enable tracing. self._no_tracing = False From 6e8ab3f08e970a1fecbd99cd751c027cd4c6ffa9 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 09:20:40 +0200 Subject: [PATCH 12/34] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 6 +++--- rllib/algorithms/appo/appo_torch_policy.py | 4 +--- rllib/algorithms/impala/impala_tf_policy.py | 6 +++--- rllib/algorithms/impala/impala_torch_policy.py | 3 +-- rllib/algorithms/ppo/ppo_tf_policy.py | 7 +++---- rllib/algorithms/ppo/ppo_torch_policy.py | 3 +-- rllib/policy/policy.py | 7 +++---- rllib/policy/torch_policy_v2.py | 7 +++---- 8 files changed, 18 insertions(+), 25 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index a473c1a11f031..6aad0c015dc23 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -111,9 +111,9 @@ def __init__( if old_stack: GradStatsMixin.__init__(self) - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. - self.maybe_initialize_optimizer_and_loss() + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. + self.maybe_initialize_optimizer_and_loss() # Initiate TargetNetwork ops after loss initialization. 
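# Sketch of the guard the policy.py hunk above adds around the dummy-batch loss
# call (simplified; not the actual `Policy._initialize_loss_from_dummy_batch`
# signature): with the Learner API enabled, the Policy no longer traces a loss
# from the dummy batch at all.
def maybe_trace_loss_from_dummy_batch(config: dict, loss_fn, dummy_batch) -> bool:
    if config.get("_enable_learner_api", False):
        # Losses are computed by the Learner now; nothing to trace on the Policy.
        return False
    # Legacy path: run the loss once so the accessed columns end up in the
    # view requirements.
    loss_fn(dummy_batch)
    return True


assert maybe_trace_loss_from_dummy_batch({"_enable_learner_api": True}, lambda b: None, {}) is False
assert maybe_trace_loss_from_dummy_batch({}, lambda b: None, {}) is True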
TargetNetworkMixin.__init__(self) diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 47659361fc5ee..373ac22285561 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -91,9 +91,7 @@ def __init__(self, observation_space, action_space, config): ValueNetworkMixin.__init__(self, config) KLCoeffMixin.__init__(self, config) - if old_stack: - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() + self._initialize_loss_from_dummy_batch() # Initiate TargetNetwork ops after loss initialization. TargetNetworkMixin.__init__(self) diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 5a495cdf16081..932366f648769 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -306,9 +306,9 @@ def __init__( self, config["entropy_coeff"], config["entropy_coeff_schedule"] ) - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. - self.maybe_initialize_optimizer_and_loss() + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. + self.maybe_initialize_optimizer_and_loss() @override(base) def loss( diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 894d34a6260ac..fca152342a902 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -218,8 +218,7 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) - if not self.config.get("_enable_learner_api"): - self._initialize_loss_from_dummy_batch() + self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) def loss( diff --git a/rllib/algorithms/ppo/ppo_tf_policy.py b/rllib/algorithms/ppo/ppo_tf_policy.py index cc4d19c86f15f..76e8d0161689a 100644 --- a/rllib/algorithms/ppo/ppo_tf_policy.py +++ b/rllib/algorithms/ppo/ppo_tf_policy.py @@ -95,10 +95,9 @@ def __init__( LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) KLCoeffMixin.__init__(self, config) - if not self.config.get("_enable_learner_api", False): - # Note: this is a bit ugly, but loss and optimizer initialization must - # happen after all the MixIns are initialized. - self.maybe_initialize_optimizer_and_loss() + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. 
+ self.maybe_initialize_optimizer_and_loss() @override(base) def loss( diff --git a/rllib/algorithms/ppo/ppo_torch_policy.py b/rllib/algorithms/ppo/ppo_torch_policy.py index fe235886bb73b..26a52dbe4d2b1 100644 --- a/rllib/algorithms/ppo/ppo_torch_policy.py +++ b/rllib/algorithms/ppo/ppo_torch_policy.py @@ -61,8 +61,7 @@ def __init__(self, observation_space, action_space, config): ) KLCoeffMixin.__init__(self, config) - if not self.config.get("_enable_learner_api", False): - self._initialize_loss_from_dummy_batch() + self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) def loss( diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index e1b594de12d1c..d40185f20297d 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1481,14 +1481,13 @@ def _initialize_loss_from_dummy_batch( if seq_lens is not None: train_batch[SampleBatch.SEQ_LENS] = seq_lens train_batch.count = self._dummy_batch.count + # Call the loss function, if it exists. # TODO(jungong) : clean up after all agents get migrated. # We should simply do self.loss(...) here. if self._loss is not None: self._loss(self, self.model, self.dist_class, train_batch) - elif ( - is_overridden(self.loss) or self.config.get("_enable_rl_module_api", False) - ) and not self.config["in_evaluation"]: + elif is_overridden(self.loss) and not self.config["in_evaluation"]: self.loss(self.model, self.dist_class, train_batch) # Call the stats fn, if given. # TODO(jungong) : clean up after all agents get migrated. @@ -1502,7 +1501,7 @@ def _initialize_loss_from_dummy_batch( self._no_tracing = False # Add new columns automatically to view-reqs. - if auto_remove_unneeded_view_reqs: + if not self.config.get("_enable_learner_api") and auto_remove_unneeded_view_reqs: # Add those needed for postprocessing and training. all_accessed_keys = ( train_batch.accessed_keys diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index f82ce9ff3db47..f08326250fd61 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -208,11 +208,10 @@ def __init__( num_buffers = self.config.get("num_multi_gpu_tower_stacks", 1) self._loaded_batches = [[] for _ in range(num_buffers)] - # If set, means we are using distributed allreduce during learning. - self.distributed_world_size = None - - self.batch_divisibility_req = self.get_batch_divisibility_req() + # If set, means we are using distributed allreduce during learning. 
+ self.distributed_world_size = None + self.batch_divisibility_req = self.get_batch_divisibility_req() self.max_seq_len = max_seq_len # If model is an RLModule it won't have tower_stats instead there will be a From b224d9fdb4b0346a842a724123c35a923237ea0b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 09:44:58 +0200 Subject: [PATCH 13/34] wip Signed-off-by: sven1977 --- .../algorithms/appo/tf/appo_tf_policy_rlm.py | 30 ------------- .../appo/torch/appo_torch_policy_rlm.py | 34 --------------- .../impala/tf/impala_tf_policy_rlm.py | 29 ------------- .../impala/torch/impala_torch_policy_rlm.py | 27 ------------ rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py | 35 ---------------- .../ppo/torch/ppo_torch_policy_rlm.py | 42 ------------------- rllib/examples/self_play_with_open_spiel.py | 5 ++- 7 files changed, 3 insertions(+), 199 deletions(-) delete mode 100644 rllib/algorithms/appo/tf/appo_tf_policy_rlm.py delete mode 100644 rllib/algorithms/appo/torch/appo_torch_policy_rlm.py delete mode 100644 rllib/algorithms/impala/tf/impala_tf_policy_rlm.py delete mode 100644 rllib/algorithms/impala/torch/impala_torch_policy_rlm.py delete mode 100644 rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py delete mode 100644 rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py diff --git a/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py b/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py deleted file mode 100644 index 6d7d4e92c7826..0000000000000 --- a/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py +++ /dev/null @@ -1,30 +0,0 @@ -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.utils.annotations import override - - -"""# TODO: Remove once we have a RLModule capable sampler class that can replace -# `Policy.compute_actions_from_input_dict()`. -class APPOTfPolicyWithRLModule(EagerTFPolicyV2): - def __init__(self, observation_space, action_space, config): - validate_config(config) - EagerTFPolicyV2.enable_eager_execution_if_necessary() - self.framework = "tf2" - EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - # Initiate TargetNetwork ops after loss initialization. - self.maybe_initialize_optimizer_and_loss() - - @override(EagerTFPolicyV2) - def loss(self, model, dist_class, train_batch): - self.stats = {} - train_batch[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTIONS] - train_batch[SampleBatch.REWARDS] - train_batch[SampleBatch.TERMINATEDS] - return 0.0 - - @override(EagerTFPolicyV2) - def get_batch_divisibility_req(self) -> int: - return self.config["rollout_fragment_length"] -""" \ No newline at end of file diff --git a/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py b/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py deleted file mode 100644 index e3f387bec5bce..0000000000000 --- a/rllib/algorithms/appo/torch/appo_torch_policy_rlm.py +++ /dev/null @@ -1,34 +0,0 @@ -import logging - -from ray.rllib.algorithms.ppo.ppo_torch_policy import validate_config -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch - -torch, _ = try_import_torch() - -logger = logging.getLogger(__name__) - - -"""# TODO: Remove once we have a RLModule capable sampler class that can replace -# `Policy.compute_actions_from_input_dict()`. 
-class APPOTorchPolicyWithRLModule(TorchPolicyV2): - def __init__(self, observation_space, action_space, config): - validate_config(config) - TorchPolicyV2.__init__(self, observation_space, action_space, config) - self._initialize_loss_from_dummy_batch() - - @override(TorchPolicyV2) - def loss(self, model, dist_class, train_batch): - self.stats = {} - train_batch[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTIONS] - train_batch[SampleBatch.REWARDS] - train_batch[SampleBatch.TERMINATEDS] - return 0.0 - - @override(TorchPolicyV2) - def get_batch_divisibility_req(self) -> int: - return self.config["rollout_fragment_length"] -""" \ No newline at end of file diff --git a/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py b/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py deleted file mode 100644 index ba67c142f7765..0000000000000 --- a/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py +++ /dev/null @@ -1,29 +0,0 @@ -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.utils.annotations import override - - -"""# TODO: Remove once we have a RLModule capable sampler class that can replace -# `Policy.compute_actions_from_input_dict()`. -class ImpalaTfPolicyWithRLModule(EagerTFPolicyV2): - def __init__(self, observation_space, action_space, config): - validate_config(config) - EagerTFPolicyV2.enable_eager_execution_if_necessary() - self.framework = "tf2" - EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - self.maybe_initialize_optimizer_and_loss() - - @override(EagerTFPolicyV2) - def loss(self, model, dist_class, train_batch): - self.stats = {} - train_batch[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTIONS] - train_batch[SampleBatch.REWARDS] - train_batch[SampleBatch.TERMINATEDS] - return 0.0 - - @override(EagerTFPolicyV2) - def get_batch_divisibility_req(self) -> int: - return self.config["rollout_fragment_length"] -""" \ No newline at end of file diff --git a/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py b/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py deleted file mode 100644 index 2caa4f34ffab0..0000000000000 --- a/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py +++ /dev/null @@ -1,27 +0,0 @@ -from ray.rllib.algorithms.ppo.ppo_torch_policy import validate_config -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override - - -"""# TODO: Remove once we have a RLModule capable sampler class that can replace -# `Policy.compute_actions_from_input_dict()`. 
-class ImpalaTorchPolicyWithRLModule(TorchPolicyV2): - def __init__(self, observation_space, action_space, config): - validate_config(config) - TorchPolicyV2.__init__(self, observation_space, action_space, config) - self._initialize_loss_from_dummy_batch() - - @override(TorchPolicyV2) - def loss(self, model, dist_class, train_batch): - self.stats = {} - train_batch[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTIONS] - train_batch[SampleBatch.REWARDS] - train_batch[SampleBatch.TERMINATEDS] - return 0.0 - - @override(TorchPolicyV2) - def get_batch_divisibility_req(self) -> int: - return self.config["rollout_fragment_length"] -""" \ No newline at end of file diff --git a/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py b/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py deleted file mode 100644 index 508cae595d15a..0000000000000 --- a/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py +++ /dev/null @@ -1,35 +0,0 @@ -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.evaluation.postprocessing import compute_gae_for_sample_batch -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.utils.annotations import override - - -""""# TODO: Remove once we have a RLModule capable sampler class that can replace -# `Policy.compute_actions_from_input_dict()`. -class PPOTfPolicyWithRLModule(EagerTFPolicyV2): - def __init__(self, observation_space, action_space, config): - validate_config(config) - EagerTFPolicyV2.enable_eager_execution_if_necessary() - self.framework = "tf2" - EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - self.maybe_initialize_optimizer_and_loss() - - @override(EagerTFPolicyV2) - def loss(self, model, dist_class, train_batch): - self.stats = {} - train_batch[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTIONS] - train_batch[SampleBatch.REWARDS] - train_batch[SampleBatch.TERMINATEDS] - return 0.0 - - @override(EagerTFPolicyV2) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - sample_batch = super().postprocess_trajectory(sample_batch) - return compute_gae_for_sample_batch( - self, sample_batch, other_agent_batches, episode - ) -""" \ No newline at end of file diff --git a/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py b/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py deleted file mode 100644 index 4cbe75b4cf4ec..0000000000000 --- a/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py +++ /dev/null @@ -1,42 +0,0 @@ -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.evaluation.postprocessing import compute_gae_for_sample_batch -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch - -torch, nn = try_import_torch() - - -"""# TODO: Remove once we have a RLModule capable sampler class that can replace -# `Policy.compute_actions_from_input_dict()`. 
-class PPOTorchPolicyWithRLModule(TorchPolicyV2): - def __init__(self, observation_space, action_space, config): - validate_config(config) - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - self._initialize_loss_from_dummy_batch() - - @override(TorchPolicyV2) - def loss(self, model, dist_class, train_batch): - self.stats = {} - train_batch[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTIONS] - train_batch[SampleBatch.REWARDS] - train_batch[SampleBatch.TERMINATEDS] - return 0.0 - - @override(TorchPolicyV2) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - with torch.no_grad(): - return compute_gae_for_sample_batch( - self, sample_batch, other_agent_batches, episode - ) -""" diff --git a/rllib/examples/self_play_with_open_spiel.py b/rllib/examples/self_play_with_open_spiel.py index 3c5360de15ded..33dfaf3b54fa8 100644 --- a/rllib/examples/self_play_with_open_spiel.py +++ b/rllib/examples/self_play_with_open_spiel.py @@ -274,6 +274,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): # Train the "main" policy to play really well using self-play. results = None if not args.from_checkpoint: + create_checkpoints = not bool(os.environ.get("RLLIB_ENABLE_RL_MODULE", False)) results = tune.Tuner( "PPO", param_space=config, @@ -294,8 +295,8 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): sort_by_metric=True, ), checkpoint_config=air.CheckpointConfig( - checkpoint_at_end=True, - checkpoint_frequency=10, + checkpoint_at_end=create_checkpoints, + checkpoint_frequency=10 if create_checkpoints else None, ), ), ).fit() From 26bdf8d8c3e2a16e8f18549c26034f5c9b2684ae Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 09:45:34 +0200 Subject: [PATCH 14/34] fix Signed-off-by: sven1977 --- rllib/algorithms/impala/impala_torch_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index fca152342a902..85f95d42f8c00 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -201,7 +201,7 @@ def __init__(self, observation_space, action_space, config): ray.rllib.algorithms.impala.impala.ImpalaConfig().to_dict(), **config ) - if not self.config.get("_enable_learner_api"): + if not config.get("_enable_learner_api"): VTraceOptimizer.__init__(self) # Need to initialize learning rate variable before calling # TorchPolicyV2.__init__. 
From 07409be030ae34be97585a4a27594d643d7a4114 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 09:46:50 +0200 Subject: [PATCH 15/34] fix Signed-off-by: sven1977 --- rllib/algorithms/impala/tests/test_impala_learner.py | 2 +- rllib/policy/policy.py | 9 +++++---- rllib/policy/torch_policy_v2.py | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index 0ed63b3557cfb..849489a5375e8 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -7,7 +7,7 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch, try_import_tf -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import framework_iterator from ray.rllib.utils.torch_utils import convert_to_torch_tensor torch, nn = try_import_torch() diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index d40185f20297d..c1ca60b83904a 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1384,9 +1384,7 @@ def _initialize_loss_from_dummy_batch( the loss. """ - if ( - self.config.get("_disable_initialize_loss_from_dummy_batch", False) - ): + if self.config.get("_disable_initialize_loss_from_dummy_batch", False): return # Signal Policy that currently we do not like to eager/jit trace # any function calls. This is to be able to track, which columns @@ -1501,7 +1499,10 @@ def _initialize_loss_from_dummy_batch( self._no_tracing = False # Add new columns automatically to view-reqs. - if not self.config.get("_enable_learner_api") and auto_remove_unneeded_view_reqs: + if ( + not self.config.get("_enable_learner_api") + and auto_remove_unneeded_view_reqs + ): # Add those needed for postprocessing and training. all_accessed_keys = ( train_batch.accessed_keys diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index f08326250fd61..86b0f3dbc9a3a 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -186,7 +186,8 @@ def __init__( if not self.config.get("_enable_learner_api", False): self._optimizers = force_list(self.optimizer()) - # Backward compatibility workaround so Policy will call self.loss() directly. + # Backward compatibility workaround so Policy will call self.loss() + # directly. # TODO (jungong): clean up after all policies are migrated to new sub-class # implementation. 
self._loss = None From 74f159f2993cd1854c195aa056cf1969760cc116 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 11:57:52 +0200 Subject: [PATCH 16/34] wip Signed-off-by: sven1977 --- .../tests/test_impala_off_policyness.py | 34 ++++++------------- .../ppo/tests/test_ppo_with_rl_module.py | 1 + 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/rllib/algorithms/impala/tests/test_impala_off_policyness.py b/rllib/algorithms/impala/tests/test_impala_off_policyness.py index 09600ff3f046b..c065efcd2acf6 100644 --- a/rllib/algorithms/impala/tests/test_impala_off_policyness.py +++ b/rllib/algorithms/impala/tests/test_impala_off_policyness.py @@ -28,43 +28,31 @@ def test_impala_off_policyness(self): .environment("CartPole-v1") .resources(num_gpus=0) .rollouts(num_rollout_workers=4) + .training(_enable_learner_api=True) + .rl_module(_enable_rl_module_api=True) ) num_iterations = 3 num_aggregation_workers_options = [0, 1] - enable_rlm_learner_group_options = [True, False] - default_exploration_config = config.exploration_config.copy() - for permutation in itertools.product( - num_aggregation_workers_options, enable_rlm_learner_group_options - ): - num_aggregation_workers, enable_learner_api = permutation - for fw in framework_iterator( - config, with_eager_tracing=True, frameworks=["tf2"] + for num_aggregation_workers in num_aggregation_workers_options: + for _ in framework_iterator( + config, frameworks=("tf2", "torch"), with_eager_tracing=True ): - # TODO(avnishn): Enable this for torch when we merge the torch learner. - if enable_learner_api and fw != "tf2": - continue - config.training(_enable_learner_api=enable_learner_api) - config.rl_module(_enable_rl_module_api=enable_learner_api) - if enable_learner_api: - # We have to set exploration_config here manually because setting - # it through config.exploration() only deepupdates it - config.exploration_config = {} - else: - config.exploration_config = default_exploration_config + # We have to set exploration_config here manually because setting + # it through config.exploration() only deepupdates it + config.exploration_config = {} config.num_aggregation_workers = num_aggregation_workers print("aggregation-workers={}".format(config.num_aggregation_workers)) algo = config.build() for i in range(num_iterations): results = algo.train() # TODO (Avnish): Add off-policiness check when the metrics are - # added back to the IMPALA Learner - if not enable_learner_api: - off_policy_ness = check_off_policyness(results, upper_limit=2.0) - print(f"off-policy'ness={off_policy_ness}") + # added back to the IMPALA Learner. 
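# A sketch of the test setup used above, assuming the usual ImpalaConfig import
# path: with the RLModule API, exploration is driven by the module's
# `forward_exploration()`, so the legacy exploration_config is cleared by
# direct assignment (calling `.exploration()` would only deep-update it).
from ray.rllib.algorithms.impala import ImpalaConfig

config = (
    ImpalaConfig()
    .environment("CartPole-v1")
    .rollouts(num_rollout_workers=4)
    .training(_enable_learner_api=True)
    .rl_module(_enable_rl_module_api=True)
)
config.exploration_config = {}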
+ # off_policy_ness = check_off_policyness(results, upper_limit=2.0) + # print(f"off-policy'ness={off_policy_ness}") check_compute_single_action( algo, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 3f2fc1d007b1c..5a446b3e6798b 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -159,6 +159,7 @@ def test_ppo_exploration_setup(self): enable_connectors=True, ) .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) ) obs = np.array(0) From 70e6127d30d6593a8b551ee9b6950a80a5b8c74b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 12:00:51 +0200 Subject: [PATCH 17/34] fix Signed-off-by: sven1977 --- rllib/policy/tf_mixins.py | 52 ++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/rllib/policy/tf_mixins.py b/rllib/policy/tf_mixins.py index 8ce18df5e9796..bf1df318d469a 100644 --- a/rllib/policy/tf_mixins.py +++ b/rllib/policy/tf_mixins.py @@ -208,37 +208,32 @@ class TargetNetworkMixin: """ def __init__(self): - if self.config.get("_enable_rl_module_api", False): - # In order to access the variables for rl modules, we need to - # use the underlying keras api model.trainable_variables. - model_vars = self.model.trainable_variables - target_model_vars = self.target_model.trainable_variables - else: + if not self.config.get("_enable_rl_module_api", False): model_vars = self.model.trainable_variables() target_model_vars = self.target_model.trainable_variables() - @make_tf_callable(self.get_session()) - def update_target_fn(tau): - tau = tf.convert_to_tensor(tau, dtype=tf.float32) - update_target_expr = [] - assert len(model_vars) == len(target_model_vars), ( - model_vars, - target_model_vars, - ) - for var, var_target in zip(model_vars, target_model_vars): - update_target_expr.append( - var_target.assign(tau * var + (1.0 - tau) * var_target) + @make_tf_callable(self.get_session()) + def update_target_fn(tau): + tau = tf.convert_to_tensor(tau, dtype=tf.float32) + update_target_expr = [] + assert len(model_vars) == len(target_model_vars), ( + model_vars, + target_model_vars, ) - logger.debug("Update target op {}".format(var_target)) - return tf.group(*update_target_expr) - - # Hard initial update. - self._do_update = update_target_fn - # TODO: The previous SAC implementation does an update(1.0) here. - # If this is changed to tau != 1.0 the sac_loss_function test fails. Why? - # Also the test is not very maintainable, we need to change that unittest - # anyway. - self.update_target(tau=1.0) # self.config.get("tau", 1.0)) + for var, var_target in zip(model_vars, target_model_vars): + update_target_expr.append( + var_target.assign(tau * var + (1.0 - tau) * var_target) + ) + logger.debug("Update target op {}".format(var_target)) + return tf.group(*update_target_expr) + + # Hard initial update. + self._do_update = update_target_fn + # TODO: The previous SAC implementation does an update(1.0) here. + # If this is changed to tau != 1.0 the sac_loss_function test fails. Why? + # Also the test is not very maintainable, we need to change that unittest + # anyway. + self.update_target(tau=1.0) # self.config.get("tau", 1.0)) @property def q_func_vars(self): @@ -276,7 +271,8 @@ def set_weights(self, weights): EagerTFPolicyV2.set_weights(self, weights) elif isinstance(self, EagerTFPolicy): # Handle TF2 policies. 
EagerTFPolicy.set_weights(self, weights) - self.update_target(self.config.get("tau", 1.0)) + if not self.config.get("_enable_rl_module_api", False): + self.update_target(self.config.get("tau", 1.0)) class ValueNetworkMixin: From e4a58eec28fd3dead71b92439a6c475b9e3fbffa Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 12:01:40 +0200 Subject: [PATCH 18/34] fix Signed-off-by: sven1977 --- rllib/examples/self_play_with_open_spiel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/examples/self_play_with_open_spiel.py b/rllib/examples/self_play_with_open_spiel.py index 33dfaf3b54fa8..f611cac7d1556 100644 --- a/rllib/examples/self_play_with_open_spiel.py +++ b/rllib/examples/self_play_with_open_spiel.py @@ -296,7 +296,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): ), checkpoint_config=air.CheckpointConfig( checkpoint_at_end=create_checkpoints, - checkpoint_frequency=10 if create_checkpoints else None, + checkpoint_frequency=10 if create_checkpoints else 0, ), ), ).fit() From c40cb8d433f8794e623c06e5179f9aa72644dcfe Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 12:03:35 +0200 Subject: [PATCH 19/34] fix Signed-off-by: sven1977 --- rllib/evaluation/tests/test_trajectory_view_api.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 0b81dc45bd3e2..47d9e06c8183f 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -7,9 +7,6 @@ from ray.rllib.algorithms.callbacks import DefaultCallbacks import ray.rllib.algorithms.dqn as dqn import ray.rllib.algorithms.ppo as ppo -from ray.rllib.algorithms.ppo.torch.ppo_torch_policy_rlm import ( - PPOTorchPolicyWithRLModule, -) from ray.rllib.examples.env.debug_counter_env import MultiAgentDebugCounterEnv from ray.rllib.examples.env.multi_agent import MultiAgentPendulum from ray.rllib.evaluation.rollout_worker import RolloutWorker @@ -239,9 +236,7 @@ def test_traj_view_next_action(self): enable_rl_module_api = config._enable_rl_module_api rollout_worker_w_api = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v1"), - default_policy_class=PPOTorchPolicyWithRLModule - if enable_rl_module_api - else ppo.PPOTorchPolicy, + default_policy_class=ppo.PPOTorchPolicy, config=config, ) # Add the next action (a') and 2nd next action (a'') to the view From a2bc97ae52fdb983fce2c9ab4061568cefa0909b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 12:42:44 +0200 Subject: [PATCH 20/34] fix Signed-off-by: sven1977 --- rllib/policy/eager_tf_policy_v2.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index 7e1e543b08ab0..161a4bea0a225 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -432,16 +432,17 @@ def _init_view_requirements(self): self.view_requirements[SampleBatch.INFOS].used_for_training = False def maybe_initialize_optimizer_and_loss(self): - optimizers = force_list(self.optimizer()) - if self.exploration: - # Policies with RLModules don't have an exploration object. - optimizers = self.exploration.get_exploration_optimizer(optimizers) - - # The list of local (tf) optimizers (one per loss term). 
- self._optimizers: List[LocalOptimizer] = optimizers - # Backward compatibility: A user's policy may only support a single - # loss term and optimizer (no lists). - self._optimizer: LocalOptimizer = optimizers[0] if optimizers else None + if not self.config.get("_enable_learner_api", False): + optimizers = force_list(self.optimizer()) + if self.exploration: + # Policies with RLModules don't have an exploration object. + optimizers = self.exploration.get_exploration_optimizer(optimizers) + + # The list of local (tf) optimizers (one per loss term). + self._optimizers: List[LocalOptimizer] = optimizers + # Backward compatibility: A user's policy may only support a single + # loss term and optimizer (no lists). + self._optimizer: LocalOptimizer = optimizers[0] if optimizers else None self._initialize_loss_from_dummy_batch( auto_remove_unneeded_view_reqs=True, From 81bcbd5f420b0cc629489f93e0ceb27cfb076888 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 12:47:25 +0200 Subject: [PATCH 21/34] fix Signed-off-by: sven1977 --- .../algorithms/tests/test_algorithm_config.py | 19 ++++++++++++------- rllib/core/models/tests/test_catalog.py | 1 + rllib/models/tests/test_preprocessors.py | 4 +--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index cc949c45f0e5a..13c44c20e936c 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -179,6 +179,7 @@ def test_rl_module_api(self): .framework("torch") .rollouts(enable_connectors=True) .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) ) config.validate() @@ -328,7 +329,11 @@ def get_default_rl_module_spec(self): ######################################## # This is the simplest case where we have to construct the marl module based on # the default specs only. - config = SingleAgentAlgoConfig().rl_module(_enable_rl_module_api=True) + config = ( + SingleAgentAlgoConfig() + .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) + ) config.validate() spec, expected = self._get_expected_marl_spec(config, DiscreteBCTorchModule) @@ -351,7 +356,7 @@ def get_default_rl_module_spec(self): "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), } ), - ) + ).training(_enable_learner_api=True) config.validate() spec, expected = self._get_expected_marl_spec(config, CustomRLModule1) @@ -363,7 +368,7 @@ def get_default_rl_module_spec(self): config = SingleAgentAlgoConfig().rl_module( _enable_rl_module_api=True, rl_module_spec=SingleAgentRLModuleSpec(module_class=CustomRLModule1), - ) + ).training(_enable_learner_api=True) config.validate() spec, expected = self._get_expected_marl_spec(config, CustomRLModule1) @@ -382,7 +387,7 @@ def get_default_rl_module_spec(self): rl_module_spec=MultiAgentRLModuleSpec( module_specs=SingleAgentRLModuleSpec(module_class=CustomRLModule1) ), - ) + ).training(_enable_learner_api=True) config.validate() spec, expected = self._get_expected_marl_spec(config, CustomRLModule1) @@ -407,7 +412,7 @@ def get_default_rl_module_spec(self): "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), }, ), - ) + ).training(_enable_learner_api=True) config.validate() spec, expected = self._get_expected_marl_spec( @@ -437,7 +442,7 @@ def get_default_rl_module_spec(self): # SingleAgentRLmoduleSpecs. 
config = MultiAgentAlgoConfigWithNoSingleAgentSpec().rl_module( _enable_rl_module_api=True - ) + ).training(_enable_learner_api=True) self.assertRaisesRegex( ValueError, @@ -449,7 +454,7 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, and the MultiAgentRLModuleSpec has defined its # SingleAgentRLmoduleSpecs. - config = MultiAgentAlgoConfig().rl_module(_enable_rl_module_api=True) + config = MultiAgentAlgoConfig().rl_module(_enable_rl_module_api=True).training(_enable_learner_api=True) config.validate() spec, expected = self._get_expected_marl_spec( diff --git a/rllib/core/models/tests/test_catalog.py b/rllib/core/models/tests/test_catalog.py index bb3bb52c71106..9d17766ee8c4f 100644 --- a/rllib/core/models/tests/test_catalog.py +++ b/rllib/core/models/tests/test_catalog.py @@ -385,6 +385,7 @@ def build_vf_head(self, framework): _enable_rl_module_api=True, rl_module_spec=SingleAgentRLModuleSpec(catalog_class=MyCatalog), ) + .training(_enable_learner_api=True) .framework("torch") ) diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index d26ba8b028ba6..ecd0824e0d463 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -56,9 +56,7 @@ def test_rlms_and_preprocessing(self): .framework("tf2") ) - # TODO (Artur): No need to manually enable RLModules here since we have not - # fully migrated. Clear this up after migration. - config.rl_module(_enable_rl_module_api=True) + config.rl_module(_enable_rl_module_api=True).training(_enable_learner_api=True) for _ in framework_iterator(config, frameworks=("torch", "tf2")): algo = config.build() From 499d818e989070f67d1a15bbb060a7cc7d956ec4 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 14:40:56 +0200 Subject: [PATCH 22/34] wip Signed-off-by: sven1977 --- .../impala/tests/test_impala_learner.py | 5 +++-- .../ppo/tests/test_ppo_with_rl_module.py | 21 +++++++++---------- rllib/models/tests/test_preprocessors.py | 15 +++++++------ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index 849489a5375e8..5669a88b5d460 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -1,6 +1,7 @@ import unittest import numpy as np +import tree # pip install dm_tree import ray from ray.rllib.algorithms.impala import ImpalaConfig @@ -83,9 +84,9 @@ def test_impala_loss(self): policy = algo.get_policy() if fw == "tf2": - train_batch = tf.nest.map_structure( + train_batch = SampleBatch(tree.map_structure( lambda x: tf.convert_to_tensor(x), FAKE_BATCH - ) + )) elif fw == "torch": train_batch = convert_to_torch_tensor(SampleBatch(FAKE_BATCH)) diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 5a446b3e6798b..dcfc05a9fa32d 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -167,14 +167,14 @@ def test_ppo_exploration_setup(self): config, frameworks=("torch", "tf2"), with_eager_tracing=True ): # Default Agent should be setup with StochasticSampling. - trainer = config.build() + algo = config.build() # explore=False, always expect the same (deterministic) action. 
- a_ = trainer.compute_single_action( + a_ = algo.compute_single_action( obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0) ) for _ in range(50): - a = trainer.compute_single_action( + a = algo.compute_single_action( obs, explore=False, prev_action=np.array(2), @@ -186,12 +186,12 @@ def test_ppo_exploration_setup(self): actions = [] for _ in range(300): actions.append( - trainer.compute_single_action( + algo.compute_single_action( obs, prev_action=np.array(2), prev_reward=np.array(1.0) ) ) check(np.mean(actions), 1.5, atol=0.2) - trainer.stop() + algo.stop() def test_ppo_free_log_std_with_rl_modules(self): """Tests the free log std option works.""" @@ -217,8 +217,8 @@ def test_ppo_free_log_std_with_rl_modules(self): ) for fw in framework_iterator(config, frameworks=("torch", "tf2")): - trainer = config.build() - policy = trainer.get_policy() + algo = config.build() + policy = algo.get_policy() # Check the free log std var is created. if fw == "torch": @@ -245,14 +245,13 @@ def get_value(fw=fw, policy=policy, log_std_var=log_std_var): init_std = get_value() assert init_std == 0.0, init_std batch = compute_gae_for_sample_batch(policy, PENDULUM_FAKE_BATCH.copy()) - if fw == "torch": - batch = policy._lazy_tensor_dict(batch) - policy.learn_on_batch(batch) + batch = policy._lazy_tensor_dict(batch) + algo.learner_group.update(batch) # Check the variable is updated. post_std = get_value() assert post_std != 0.0, post_std - trainer.stop() + algo.stop() if __name__ == "__main__": diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index ecd0824e0d463..4093260634589 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -39,6 +39,7 @@ def tearDownClass(cls) -> None: def test_rlms_and_preprocessing(self): config = ( ppo.PPOConfig() + .framework("tf2") .environment( env="ray.rllib.examples.env.random_env.RandomEnv", env_config={ @@ -48,16 +49,18 @@ def test_rlms_and_preprocessing(self): }, ) # Run this very quickly locally. - .rollouts(rollout_fragment_length=10) - .rollouts(num_rollout_workers=0) - .training(train_batch_size=10, sgd_minibatch_size=1, num_sgd_iter=1) + .rollouts(num_rollout_workers=0, rollout_fragment_length=10) + .training( + train_batch_size=10, + sgd_minibatch_size=1, + num_sgd_iter=1, + _enable_learner_api=True, + ) + .rl_module(_enable_rl_module_api=True) # Set this to True to enforce no preprocessors being used. 
.experimental(_disable_preprocessor_api=True) - .framework("tf2") ) - config.rl_module(_enable_rl_module_api=True).training(_enable_learner_api=True) - for _ in framework_iterator(config, frameworks=("torch", "tf2")): algo = config.build() results = algo.train() From fd61ef6d6d184b8e548a2566dd9c1e09d4ec6e2b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 14:47:00 +0200 Subject: [PATCH 23/34] LINT Signed-off-by: sven1977 --- rllib/algorithms/impala/tests/test_impala_off_policyness.py | 6 +----- rllib/evaluation/tests/test_trajectory_view_api.py | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/rllib/algorithms/impala/tests/test_impala_off_policyness.py b/rllib/algorithms/impala/tests/test_impala_off_policyness.py index c065efcd2acf6..82a92916172f6 100644 --- a/rllib/algorithms/impala/tests/test_impala_off_policyness.py +++ b/rllib/algorithms/impala/tests/test_impala_off_policyness.py @@ -1,4 +1,3 @@ -import itertools import unittest import ray @@ -6,7 +5,6 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.test_utils import ( check_compute_single_action, - check_off_policyness, framework_iterator, ) @@ -34,8 +32,6 @@ def test_impala_off_policyness(self): num_iterations = 3 num_aggregation_workers_options = [0, 1] - default_exploration_config = config.exploration_config.copy() - for num_aggregation_workers in num_aggregation_workers_options: for _ in framework_iterator( config, frameworks=("tf2", "torch"), with_eager_tracing=True @@ -48,7 +44,7 @@ def test_impala_off_policyness(self): print("aggregation-workers={}".format(config.num_aggregation_workers)) algo = config.build() for i in range(num_iterations): - results = algo.train() + algo.train() # TODO (Avnish): Add off-policiness check when the metrics are # added back to the IMPALA Learner. 
# off_policy_ness = check_off_policyness(results, upper_limit=2.0) diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 47d9e06c8183f..6bcf6225444a4 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -233,7 +233,6 @@ def test_traj_view_next_action(self): .rollouts(rollout_fragment_length=200, num_rollout_workers=0) ) config.validate() - enable_rl_module_api = config._enable_rl_module_api rollout_worker_w_api = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=ppo.PPOTorchPolicy, From a53b0432677857811fa7ed0c3a8d679c6973f381 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 16:47:07 +0200 Subject: [PATCH 24/34] fix Signed-off-by: sven1977 --- rllib/BUILD | 2 +- rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py | 2 +- rllib/policy/torch_policy_v2.py | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 30df6d9413f69..e32276939ee44 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -805,7 +805,7 @@ py_test( py_test( name = "test_algorithm_config", tags = ["team:rllib", "algorithms_dir", "algorithms_dir_generic"], - size = "small", + size = "medium", srcs = ["algorithms/tests/test_algorithm_config.py"], ) diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index dcfc05a9fa32d..48a3c0b1f6bdf 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -246,7 +246,7 @@ def get_value(fw=fw, policy=policy, log_std_var=log_std_var): assert init_std == 0.0, init_std batch = compute_gae_for_sample_batch(policy, PENDULUM_FAKE_BATCH.copy()) batch = policy._lazy_tensor_dict(batch) - algo.learner_group.update(batch) + algo.learner_group.update(batch.as_multi_agent()) # Check the variable is updated. 
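The free-log-std test was already switched to `algo.learner_group.update()` earlier in this series; the fix above additionally wraps the batch via `as_multi_agent()`, presumably because the learner-group update path operates on multi-agent batches. A condensed sketch of that path (variable and helper names are borrowed from the test, so this fragment is not self-contained):

# Sketch only: the new-stack update path exercised by the test.
batch = compute_gae_for_sample_batch(policy, PENDULUM_FAKE_BATCH.copy())
batch = policy._lazy_tensor_dict(batch)                      # framework tensors
results = algo.learner_group.update(batch.as_multi_agent())  # MultiAgentBatch in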
post_std = get_value() diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 86b0f3dbc9a3a..36cf71bd2e19d 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1101,8 +1101,11 @@ def _compute_action_helper( if self.model: self.model.eval() + print(self.model) + extra_fetches = {} if isinstance(self.model, RLModule): + print("is RLModule!") if explore: fwd_out = self.model.forward_exploration(input_dict) else: From f175a3999c420b4f4c99cbe2e3aeb433bef79f0e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 17:11:34 +0200 Subject: [PATCH 25/34] LINT Signed-off-by: sven1977 --- .../impala/tests/test_impala_learner.py | 6 +- .../algorithms/tests/test_algorithm_config.py | 88 ++++++++++++------- 2 files changed, 58 insertions(+), 36 deletions(-) diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index 5669a88b5d460..8725434b97515 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -84,9 +84,9 @@ def test_impala_loss(self): policy = algo.get_policy() if fw == "tf2": - train_batch = SampleBatch(tree.map_structure( - lambda x: tf.convert_to_tensor(x), FAKE_BATCH - )) + train_batch = SampleBatch( + tree.map_structure(lambda x: tf.convert_to_tensor(x), FAKE_BATCH) + ) elif fw == "torch": train_batch = convert_to_torch_tensor(SampleBatch(FAKE_BATCH)) diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 13c44c20e936c..7b6804ea92cc6 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -348,15 +348,19 @@ def get_default_rl_module_spec(self): ######################################## # This is the case where we pass in a multi-agent RLModuleSpec that asks the # algorithm to assign a specific type of RLModule class to certain module_ids. - config = SingleAgentAlgoConfig().rl_module( - _enable_rl_module_api=True, - rl_module_spec=MultiAgentRLModuleSpec( - module_specs={ - "p1": SingleAgentRLModuleSpec(module_class=CustomRLModule1), - "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), - } - ), - ).training(_enable_learner_api=True) + config = ( + SingleAgentAlgoConfig() + .rl_module( + _enable_rl_module_api=True, + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "p1": SingleAgentRLModuleSpec(module_class=CustomRLModule1), + "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), + }, + ), + ) + .training(_enable_learner_api=True) + ) config.validate() spec, expected = self._get_expected_marl_spec(config, CustomRLModule1) @@ -365,10 +369,14 @@ def get_default_rl_module_spec(self): ######################################## # This is the case where we ask the algorithm to assign a specific type of # RLModule class to ALL module_ids. 
- config = SingleAgentAlgoConfig().rl_module( - _enable_rl_module_api=True, - rl_module_spec=SingleAgentRLModuleSpec(module_class=CustomRLModule1), - ).training(_enable_learner_api=True) + config = ( + SingleAgentAlgoConfig() + .rl_module( + _enable_rl_module_api=True, + rl_module_spec=SingleAgentRLModuleSpec(module_class=CustomRLModule1), + ) + .training(_enable_learner_api=True) + ) config.validate() spec, expected = self._get_expected_marl_spec(config, CustomRLModule1) @@ -382,12 +390,16 @@ def get_default_rl_module_spec(self): ######################################## # This is an alternative way to ask the algorithm to assign a specific type of # RLModule class to ALL module_ids. - config = SingleAgentAlgoConfig().rl_module( - _enable_rl_module_api=True, - rl_module_spec=MultiAgentRLModuleSpec( - module_specs=SingleAgentRLModuleSpec(module_class=CustomRLModule1) - ), - ).training(_enable_learner_api=True) + config = ( + SingleAgentAlgoConfig() + .rl_module( + _enable_rl_module_api=True, + rl_module_spec=MultiAgentRLModuleSpec( + module_specs=SingleAgentRLModuleSpec(module_class=CustomRLModule1) + ), + ) + .training(_enable_learner_api=True) + ) config.validate() spec, expected = self._get_expected_marl_spec(config, CustomRLModule1) @@ -403,16 +415,20 @@ def get_default_rl_module_spec(self): # This is not only assigning a specific type of RLModule class to EACH # module_id, but also defining a new custom MultiAgentRLModule class to be used # in the multi-agent scenario. - config = SingleAgentAlgoConfig().rl_module( - _enable_rl_module_api=True, - rl_module_spec=MultiAgentRLModuleSpec( - marl_module_class=CustomMARLModule1, - module_specs={ - "p1": SingleAgentRLModuleSpec(module_class=CustomRLModule1), - "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), - }, - ), - ).training(_enable_learner_api=True) + config = ( + SingleAgentAlgoConfig() + .rl_module( + _enable_rl_module_api=True, + rl_module_spec=MultiAgentRLModuleSpec( + marl_module_class=CustomMARLModule1, + module_specs={ + "p1": SingleAgentRLModuleSpec(module_class=CustomRLModule1), + "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), + }, + ), + ) + .training(_enable_learner_api=True) + ) config.validate() spec, expected = self._get_expected_marl_spec( @@ -440,9 +456,11 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, but the MultiAgentRLModuleSpec has not defined its # SingleAgentRLmoduleSpecs. - config = MultiAgentAlgoConfigWithNoSingleAgentSpec().rl_module( - _enable_rl_module_api=True - ).training(_enable_learner_api=True) + config = ( + MultiAgentAlgoConfigWithNoSingleAgentSpec() + .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) + ) self.assertRaisesRegex( ValueError, @@ -454,7 +472,11 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, and the MultiAgentRLModuleSpec has defined its # SingleAgentRLmoduleSpecs. 
- config = MultiAgentAlgoConfig().rl_module(_enable_rl_module_api=True).training(_enable_learner_api=True) + config = ( + MultiAgentAlgoConfig() + .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) + ) config.validate() spec, expected = self._get_expected_marl_spec( From 3e95159b6149ced4a5c54fc818185902498349d9 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 17:57:14 +0200 Subject: [PATCH 26/34] fix Signed-off-by: sven1977 --- rllib/policy/torch_policy_v2.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 36cf71bd2e19d..82e0c0c2f29ad 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1101,11 +1101,8 @@ def _compute_action_helper( if self.model: self.model.eval() - print(self.model) - - extra_fetches = {} + extra_fetches = None if isinstance(self.model, RLModule): - print("is RLModule!") if explore: fwd_out = self.model.forward_exploration(input_dict) else: @@ -1166,7 +1163,7 @@ def _compute_action_helper( ) # Add default and custom fetches. - if not extra_fetches: + if extra_fetches is None: extra_fetches = self.extra_action_out( input_dict, state_batches, self.model, action_dist ) From 0ad186f6a693e31db2ed2ccaf6ccc001b7894a07 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 17:57:14 +0200 Subject: [PATCH 27/34] fix Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 4 +--- rllib/algorithms/appo/appo_tf_policy.py | 14 ++++++++------ rllib/algorithms/appo/appo_torch_policy.py | 7 +++++-- rllib/algorithms/impala/impala_tf_policy.py | 4 ++++ rllib/algorithms/impala/impala_torch_policy.py | 4 ++++ rllib/policy/torch_policy_v2.py | 7 ++----- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 5dfaa7e6a956f..48fff3dead043 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -855,9 +855,7 @@ def validate(self) -> None: ) # Learner API requires RLModule API. - if not ( - (self._enable_learner_api is True) is (self._enable_rl_module_api is True) - ): + if self._enable_learner_api is not self._enable_rl_module_api: raise ValueError( "Learner API requires RLModule API and vice-versa! " "Enable RLModule API via " diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 6aad0c015dc23..8441f8032ede8 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -81,11 +81,13 @@ def __init__( # First thing first, enable eager execution if necessary. base.enable_eager_execution_if_necessary() - old_stack = not config.get("_enable_learner_api", False) - - # Although this is a no-op, we call __init__ here to make it clear - # that base.__init__ will use the make_model() call. - if old_stack: + # If Learner API is used, we don't need any loss-specific mixins. + # However, we also would like to avoid creating special Policy-subclasses + # for this as the entire Policy concept will soon not be used anymore with + # the new Learner- and RLModule APIs. + if not config.get("_enable_learner_api", False): + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. 
VTraceClipGradients.__init__(self) VTraceOptimizer.__init__(self) @@ -108,7 +110,7 @@ def __init__( ValueNetworkMixin.__init__(self, config) KLCoeffMixin.__init__(self, config) - if old_stack: + if not config.get("_enable_learner_api", False): GradStatsMixin.__init__(self) # Note: this is a bit ugly, but loss and optimizer initialization must diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 373ac22285561..4a7754830f321 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -69,8 +69,11 @@ class APPOTorchPolicy( def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config) - old_stack = not config.get("_enable_learner_api", False) - if old_stack: + # If Learner API is used, we don't need any loss-specific mixins. + # However, we also would like to avoid creating special Policy-subclasses + # for this as the entire Policy concept will soon not be used anymore with + # the new Learner- and RLModule APIs. + if not config.get("_enable_learner_api", False): # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will use the make_model() call. VTraceOptimizer.__init__(self) diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 932366f648769..d8b830ef7653b 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -297,6 +297,10 @@ def __init__( existing_model=existing_model, ) + # If Learner API is used, we don't need any loss-specific mixins. + # However, we also would like to avoid creating special Policy-subclasses + # for this as the entire Policy concept will soon not be used anymore with + # the new Learner- and RLModule APIs. if not self.config.get("_enable_learner_api"): GradStatsMixin.__init__(self) VTraceClipGradients.__init__(self) diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 85f95d42f8c00..71aed03206015 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -201,6 +201,10 @@ def __init__(self, observation_space, action_space, config): ray.rllib.algorithms.impala.impala.ImpalaConfig().to_dict(), **config ) + # If Learner API is used, we don't need any loss-specific mixins. + # However, we also would like to avoid creating special Policy-subclasses + # for this as the entire Policy concept will soon not be used anymore with + # the new Learner- and RLModule APIs. if not config.get("_enable_learner_api"): VTraceOptimizer.__init__(self) # Need to initialize learning rate variable before calling diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 36cf71bd2e19d..82e0c0c2f29ad 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1101,11 +1101,8 @@ def _compute_action_helper( if self.model: self.model.eval() - print(self.model) - - extra_fetches = {} + extra_fetches = None if isinstance(self.model, RLModule): - print("is RLModule!") if explore: fwd_out = self.model.forward_exploration(input_dict) else: @@ -1166,7 +1163,7 @@ def _compute_action_helper( ) # Add default and custom fetches. 
- if not extra_fetches: + if extra_fetches is None: extra_fetches = self.extra_action_out( input_dict, state_batches, self.model, action_dist ) From 3a85eae4935d64d0e37825da9026e80321b2a361 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 20:04:01 +0200 Subject: [PATCH 28/34] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 23 ++++++++++++++++++- rllib/algorithms/impala/impala.py | 21 +++++++++++++++++ rllib/algorithms/impala/impala_learner.py | 5 ++-- rllib/algorithms/ppo/ppo.py | 21 +++++++++++++++++ rllib/algorithms/ppo/ppo_learner.py | 5 ++-- .../ppo/tests/test_ppo_with_rl_module.py | 5 +++- rllib/algorithms/ppo/tf/ppo_tf_learner.py | 2 +- rllib/policy/tf_mixins.py | 10 ++++++-- rllib/policy/torch_mixins.py | 12 ++++++---- 9 files changed, 91 insertions(+), 13 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 48fff3dead043..c9a220cac4b03 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -871,6 +871,27 @@ def validate(self) -> None: self.training(_enable_learner_api=True) self.enable_connectors = True + # LR-schedule checking. + # For the new Learner API stack, any schedule must start at ts 0 to avoid + # ambiguity (user might think that the `lr` setting plays a role as well + # of that that's the initial LR, when it isn't). + if self._enable_learner_api and self.lr_schedule is not None: + if not isinstance(self.lr_schedule, (list, tuple)) or ( + len(self.lr_schedule) < 2 + ): + raise ValueError( + f"Invalid `lr_schedule` ({self.lr_schedule}) specified! Must be a " + "list of at least 2 tuples, each of the form " + "(`timestep`, `learning rate to reach`), e.g. " + "`[(0, 0.001), (1e6, 0.0001), (2e6, 0.00005)]`." + ) + elif self.lr_schedule[0][0] != 0: + raise ValueError( + "When providing a `lr_schedule`, the first timestep must be 0 and " + "the corresponding lr value is the initial learning rate! You " + f"provided ts={self.lr_schedule[0][0]} lr={self.lr_schedule[0][1]}." + ) + # Explore parameter cannot be False with RLModule API enabled. # The reason is that `explore` is not just a parameter that will get passed # down to the policy.compute_actions() anymore. It is a phase in which RLModule. @@ -1611,7 +1632,7 @@ def training( lr_schedule: Learning rate schedule. In the format of [[timestep, lr-value], [timestep, lr-value], ...] Intermediary timesteps will be assigned to interpolated learning rate - values. A schedule should normally start from timestep 0. + values. A schedule must start from timestep 0. grad_clip: The value to use for gradient clipping. Depending on the `grad_clip_by` setting, gradients will either be clipped by value, norm, or global_norm (see docstring on `grad_clip_by` below for more diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 1548698e23077..231530264c629 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -371,6 +371,27 @@ def validate(self) -> None: # Check `entropy_coeff` for correctness. if self.entropy_coeff < 0.0: raise ValueError("`entropy_coeff` must be >= 0.0!") + # Entropy coeff schedule checking. + # For the new Learner API stack, any schedule must start at ts 0 to avoid + # ambiguity (user might think that the `entropy_coeff` setting plays a role as + # well of that that's the initial entropy coeff, when it isn't). 
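The likely motivation for the `extra_fetches` change in `torch_policy_v2.py` above: on the RLModule code path the fetches dict can legitimately end up empty, and an empty dict is falsy, so the old `if not extra_fetches:` would have clobbered it with the default `extra_action_out()` result. Initializing to `None` and testing `is None` keeps "nothing computed yet" distinct from "computed, but empty". The pitfall in isolation:

# Why the None sentinel matters: an empty dict is falsy.
rlmodule_fetches = {}    # RLModule path: computed, possibly empty -> keep it
not_computed = None      # non-RLModule path before the dist-based fallback

assert not rlmodule_fetches           # `if not extra_fetches:` would fire here,
assert rlmodule_fetches is not None   # while `is None` correctly leaves it alone.
assert not_computed is None           # Only this case falls back to extra_action_out().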
+ if self._enable_learner_api and self.entropy_coeff_schedule is not None: + if not isinstance(self.entropy_coeff_schedule, (list, tuple)) or ( + len(self.entropy_coeff_schedule) < 2 + ): + raise ValueError( + f"Invalid `entropy_coeff_schedule` ({self.entropy_coeff_schedule}) " + "specified! Must be a list of at least 2 tuples, each of the form " + "(`timestep`, `coeff to reach`), e.g. " + "`[(0, 0.01, (1e6, 0.001), (2e6, 0.0005)]`." + ) + elif self.entropy_coeff_schedule[0][0] != 0: + raise ValueError( + "When providing a `entropy_coeff_schedule`, the first timestep must" + " be 0 and the corresponding value is the initial coefficient! " + f"You provided ts={self.entropy_coeff_schedule[0][0]} " + f"entropy_coeff={self.entropy_coeff_schedule[0][1]}." + ) # Check whether worker to aggregation-worker ratio makes sense. if self.num_aggregation_workers > self.num_rollout_workers: diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index db0d28e1f9a9f..34eaae94d3f0a 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -49,7 +49,6 @@ def build(self) -> None: super().build() # Build entropy coeff scheduling tools. - self.entropy_coeff_scheduler = None if self.hps.entropy_coeff_schedule: # Custom schedule, based on list of # ([ts], [value to be reached by ts])-tuples. @@ -60,8 +59,10 @@ def build(self) -> None: framework=None, ) ) + # As initial entropy coeff value, use the first timestep's (must be 0) + # value. self.curr_entropy_coeffs_per_module = defaultdict( - lambda: self._get_tensor_variable(self.hps.entropy_coeff) + lambda: self._get_tensor_variable(self.hps.entropy_coeff_schedule[0][1]) ) @override(Learner) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 05ab3ee5ca3ea..ed82bec06393c 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -325,6 +325,27 @@ def validate(self) -> None: # Check `entropy_coeff` for correctness. if self.entropy_coeff < 0.0: raise ValueError("`entropy_coeff` must be >= 0.0") + # Entropy coeff schedule checking. + # For the new Learner API stack, any schedule must start at ts 0 to avoid + # ambiguity (user might think that the `entropy_coeff` setting plays a role as + # well of that that's the initial entropy coeff, when it isn't). + if self._enable_learner_api and self.entropy_coeff_schedule is not None: + if not isinstance(self.entropy_coeff_schedule, (list, tuple)) or ( + len(self.entropy_coeff_schedule) < 2 + ): + raise ValueError( + f"Invalid `entropy_coeff_schedule` ({self.entropy_coeff_schedule}) " + "specified! Must be a list of at least 2 tuples, each of the form " + "(`timestep`, `coeff to reach`), e.g. " + "`[(0, 0.01, (1e6, 0.001), (2e6, 0.0005)]`." + ) + elif self.entropy_coeff_schedule[0][0] != 0: + raise ValueError( + "When providing a `entropy_coeff_schedule`, the first timestep must" + " be 0 and the corresponding value is the initial coefficient! " + f"You provided ts={self.entropy_coeff_schedule[0][0]} " + f"entropy_coeff={self.entropy_coeff_schedule[0][1]}." + ) class UpdateKL: diff --git a/rllib/algorithms/ppo/ppo_learner.py b/rllib/algorithms/ppo/ppo_learner.py index 576471b285edd..ee7d4df482e8f 100644 --- a/rllib/algorithms/ppo/ppo_learner.py +++ b/rllib/algorithms/ppo/ppo_learner.py @@ -45,7 +45,6 @@ def build(self) -> None: super().build() # Build entropy coeff scheduling tools. 
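Taken together with the `lr_schedule` check added to `AlgorithmConfig.validate()` and the identical entropy block added to PPO, these checks pin down the schedule format for the new stack: a list (or tuple) of at least two `(timestep, value)` pairs whose first timestep is 0, with that first value serving as the initial coefficient; the plain `entropy_coeff` / `lr` setting no longer seeds a schedule, as the learner-side change above also notes. For example:

# Accepted when `_enable_learner_api=True`: starts at ts 0, at least 2 entries.
entropy_coeff_schedule = [(0, 0.01), (1_000_000, 0.001), (2_000_000, 0.0005)]
lr_schedule = [(0, 0.001), (1_000_000, 0.0001)]

# Both of these now make `validate()` raise a ValueError:
too_short = [(0, 0.01)]                               # fewer than 2 entries
late_start = [(100_000, 0.01), (1_000_000, 0.001)]    # first timestep is not 0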
- self.entropy_coeff_scheduler = None if self.hps.entropy_coeff_schedule: # Custom schedule, based on list of # ([ts], [value to be reached by ts])-tuples. @@ -56,8 +55,10 @@ def build(self) -> None: framework=None, ) ) + # As initial entropy coeff value, use the first timestep's (must be 0) + # value. self.curr_entropy_coeffs_per_module = defaultdict( - lambda: self._get_tensor_variable(self.hps.entropy_coeff) + lambda: self._get_tensor_variable(self.hps.entropy_coeff_schedule[0][1]) ) # If no schedule, pin entropy coeff to its given (fixed) value. else: diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 48a3c0b1f6bdf..44e7a836a60b6 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -126,10 +126,13 @@ def test_ppo_compilation_and_schedule_mixins(self): config.training(model=get_model_config(fw, lstm=lstm)) algo = config.build(env=env) + learner = algo.learner_group._learner optim = algo.learner_group._learner._named_optimizers[ DEFAULT_POLICY_ID ] - entropy_coeff = algo.get_policy().entropy_coeff + entropy_coeff = ( + learner.curr_entropy_coeffs_per_module[DEFAULT_POLICY_ID] + ) lr = optim.param_groups[0]["lr"] if fw == "torch" else optim.lr check(entropy_coeff, 0.1) # Check initial LR directly set in optimizer. diff --git a/rllib/algorithms/ppo/tf/ppo_tf_learner.py b/rllib/algorithms/ppo/tf/ppo_tf_learner.py index e65f658688252..4a780e83f8528 100644 --- a/rllib/algorithms/ppo/tf/ppo_tf_learner.py +++ b/rllib/algorithms/ppo/tf/ppo_tf_learner.py @@ -25,7 +25,7 @@ class PPOTfLearner(PPOLearner, TfLearner): - """Implements tf-specific PPO loss logic on top of PPOLearner. + """Implements curr_entropy_coeffs_per_moduletf-specific PPO loss logic on top of PPOLearner. This class implements the ppo loss under `_compute_loss_per_module()`. """ diff --git a/rllib/policy/tf_mixins.py b/rllib/policy/tf_mixins.py index bf1df318d469a..fe5e23a330e84 100644 --- a/rllib/policy/tf_mixins.py +++ b/rllib/policy/tf_mixins.py @@ -33,7 +33,9 @@ class LearningRateSchedule: @DeveloperAPI def __init__(self, lr, lr_schedule): self._lr_schedule = None - if lr_schedule is None: + # Disable any scheduling behavior related to learning if Learner API is active. + # Schedules are handled by Learner class. + if lr_schedule is None or self.config.get("_enable_learner_api", False): self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False) else: self._lr_schedule = PiecewiseSchedule( @@ -78,7 +80,11 @@ class EntropyCoeffSchedule: @DeveloperAPI def __init__(self, entropy_coeff, entropy_coeff_schedule): self._entropy_coeff_schedule = None - if entropy_coeff_schedule is None: + # Disable any scheduling behavior related to learning if Learner API is active. + # Schedules are handled by Learner class. + if entropy_coeff_schedule is None or ( + self.config.get("_enable_learner_api", False) + ): self.entropy_coeff = get_variable( entropy_coeff, framework="tf", tf_name="entropy_coeff", trainable=False ) diff --git a/rllib/policy/torch_mixins.py b/rllib/policy/torch_mixins.py index d6c4b03a935d3..0d21e27a58b6e 100644 --- a/rllib/policy/torch_mixins.py +++ b/rllib/policy/torch_mixins.py @@ -8,8 +8,6 @@ torch, nn = try_import_torch() -# TODO: (sven) Unify hyperparam annealing procedures across RLlib (tf/torch) -# and for all possible hyperparams, not just lr. 
@DeveloperAPI class LearningRateSchedule: """Mixin for TorchPolicy that adds a learning rate schedule.""" @@ -17,7 +15,9 @@ class LearningRateSchedule: @DeveloperAPI def __init__(self, lr, lr_schedule): self._lr_schedule = None - if lr_schedule is None: + # Disable any scheduling behavior related to learning if Learner API is active. + # Schedules are handled by Learner class. + if lr_schedule is None or self.config.get("_enable_learner_api", False): self.cur_lr = lr else: self._lr_schedule = PiecewiseSchedule( @@ -42,7 +42,11 @@ class EntropyCoeffSchedule: @DeveloperAPI def __init__(self, entropy_coeff, entropy_coeff_schedule): self._entropy_coeff_schedule = None - if entropy_coeff_schedule is None: + # Disable any scheduling behavior related to learning if Learner API is active. + # Schedules are handled by Learner class. + if entropy_coeff_schedule is None or ( + self.config.get("_enable_learner_api", False) + ): self.entropy_coeff = entropy_coeff else: # Allows for custom schedule similar to lr_schedule format From 05103ba67d448d8ea8446861ef2a95ca5014a271 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 20:53:30 +0200 Subject: [PATCH 29/34] fix Signed-off-by: sven1977 --- rllib/algorithms/ppo/tf/ppo_tf_learner.py | 2 +- rllib/policy/torch_mixins.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rllib/algorithms/ppo/tf/ppo_tf_learner.py b/rllib/algorithms/ppo/tf/ppo_tf_learner.py index 4a780e83f8528..e65f658688252 100644 --- a/rllib/algorithms/ppo/tf/ppo_tf_learner.py +++ b/rllib/algorithms/ppo/tf/ppo_tf_learner.py @@ -25,7 +25,7 @@ class PPOTfLearner(PPOLearner, TfLearner): - """Implements curr_entropy_coeffs_per_moduletf-specific PPO loss logic on top of PPOLearner. + """Implements tf-specific PPO loss logic on top of PPOLearner. This class implements the ppo loss under `_compute_loss_per_module()`. """ diff --git a/rllib/policy/torch_mixins.py b/rllib/policy/torch_mixins.py index 0d21e27a58b6e..b258c1d74560f 100644 --- a/rllib/policy/torch_mixins.py +++ b/rllib/policy/torch_mixins.py @@ -17,7 +17,7 @@ def __init__(self, lr, lr_schedule): self._lr_schedule = None # Disable any scheduling behavior related to learning if Learner API is active. # Schedules are handled by Learner class. 
- if lr_schedule is None or self.config.get("_enable_learner_api", False): + if lr_schedule is None: self.cur_lr = lr else: self._lr_schedule = PiecewiseSchedule( @@ -28,7 +28,7 @@ def __init__(self, lr, lr_schedule): @override(Policy) def on_global_var_update(self, global_vars): super().on_global_var_update(global_vars) - if self._lr_schedule: + if self._lr_schedule and not self.config.get("_enable_learner_api", False): self.cur_lr = self._lr_schedule.value(global_vars["timestep"]) for opt in self._optimizers: for p in opt.param_groups: From f0c1145397dc899b81283bff474a836f3d49d10c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 6 May 2023 20:54:09 +0200 Subject: [PATCH 30/34] LINT Signed-off-by: sven1977 --- rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 44e7a836a60b6..3d80e8c74f1d6 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -130,9 +130,9 @@ def test_ppo_compilation_and_schedule_mixins(self): optim = algo.learner_group._learner._named_optimizers[ DEFAULT_POLICY_ID ] - entropy_coeff = ( - learner.curr_entropy_coeffs_per_module[DEFAULT_POLICY_ID] - ) + entropy_coeff = learner.curr_entropy_coeffs_per_module[ + DEFAULT_POLICY_ID + ] lr = optim.param_groups[0]["lr"] if fw == "torch" else optim.lr check(entropy_coeff, 0.1) # Check initial LR directly set in optimizer. From 90ee0abcb42b836f31c58f2a73e01ddc7b3ded37 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 7 May 2023 13:08:55 +0200 Subject: [PATCH 31/34] Add new Scheduler API. Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 22 +---- rllib/algorithms/impala/impala.py | 30 ++---- rllib/algorithms/impala/impala_learner.py | 23 ++--- .../algorithms/impala/tf/impala_tf_learner.py | 22 +---- .../impala/torch/impala_torch_learner.py | 22 +---- rllib/algorithms/ppo/ppo.py | 36 +++----- rllib/algorithms/ppo/ppo_learner.py | 31 ++----- rllib/algorithms/ppo/tf/ppo_tf_learner.py | 11 +-- .../algorithms/ppo/torch/ppo_torch_learner.py | 8 -- rllib/core/learner/learner.py | 28 ++---- rllib/core/learner/tf/tf_learner.py | 24 +++-- rllib/core/learner/torch/torch_learner.py | 11 ++- rllib/utils/schedules/scheduler.py | 91 +++++++++++++++++++ 13 files changed, 158 insertions(+), 201 deletions(-) create mode 100644 rllib/utils/schedules/scheduler.py diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 45c2b52353d5c..8b2720e9c15b3 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -52,6 +52,7 @@ try_import_gymnasium_and_gym, ) from ray.rllib.utils.policy import validate_policy_id +from ray.rllib.utils.schedules.scheduler import Scheduler from ray.rllib.utils.serialization import ( deserialize_type, NOT_SERIALIZABLE, @@ -872,25 +873,8 @@ def validate(self) -> None: self.enable_connectors = True # LR-schedule checking. - # For the new Learner API stack, any schedule must start at ts 0 to avoid - # ambiguity (user might think that the `lr` setting plays a role as well - # of that that's the initial LR, when it isn't). - if self._enable_learner_api and self.lr_schedule is not None: - if not isinstance(self.lr_schedule, (list, tuple)) or ( - len(self.lr_schedule) < 2 - ): - raise ValueError( - f"Invalid `lr_schedule` ({self.lr_schedule}) specified! 
Must be a " - "list of at least 2 tuples, each of the form " - "(`timestep`, `learning rate to reach`), e.g. " - "`[(0, 0.001), (1e6, 0.0001), (2e6, 0.00005)]`." - ) - elif self.lr_schedule[0][0] != 0: - raise ValueError( - "When providing a `lr_schedule`, the first timestep must be 0 and " - "the corresponding lr value is the initial learning rate! You " - f"provided ts={self.lr_schedule[0][0]} lr={self.lr_schedule[0][1]}." - ) + if self._enable_learner_api: + Scheduler.validate(self.lr_schedule, "lr_schedule", "learning rate") # Validate grad clipping settings. if self.grad_clip_by not in ["value", "norm", "global_norm"]: diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 231530264c629..d1ecfda9d6e4a 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -46,10 +46,10 @@ SYNCH_WORKER_WEIGHTS_TIMER, SAMPLE_TIMER, ) +from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ReplayMode from ray.rllib.utils.replay_buffers.replay_buffer import _ALL_POLICIES - -from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder +from ray.rllib.utils.schedules.scheduler import Scheduler from ray.rllib.utils.typing import ( PartialAlgorithmConfigDict, PolicyID, @@ -372,26 +372,12 @@ def validate(self) -> None: if self.entropy_coeff < 0.0: raise ValueError("`entropy_coeff` must be >= 0.0!") # Entropy coeff schedule checking. - # For the new Learner API stack, any schedule must start at ts 0 to avoid - # ambiguity (user might think that the `entropy_coeff` setting plays a role as - # well of that that's the initial entropy coeff, when it isn't). - if self._enable_learner_api and self.entropy_coeff_schedule is not None: - if not isinstance(self.entropy_coeff_schedule, (list, tuple)) or ( - len(self.entropy_coeff_schedule) < 2 - ): - raise ValueError( - f"Invalid `entropy_coeff_schedule` ({self.entropy_coeff_schedule}) " - "specified! Must be a list of at least 2 tuples, each of the form " - "(`timestep`, `coeff to reach`), e.g. " - "`[(0, 0.01, (1e6, 0.001), (2e6, 0.0005)]`." - ) - elif self.entropy_coeff_schedule[0][0] != 0: - raise ValueError( - "When providing a `entropy_coeff_schedule`, the first timestep must" - " be 0 and the corresponding value is the initial coefficient! " - f"You provided ts={self.entropy_coeff_schedule[0][0]} " - f"entropy_coeff={self.entropy_coeff_schedule[0][1]}." - ) + if self._enable_learner_api: + Scheduler.validate( + self.entropy_coeff_schedule, + "entropy_coeff_schedule", + "entropy coefficient", + ) # Check whether worker to aggregation-worker ratio makes sense. if self.num_aggregation_workers > self.num_rollout_workers: diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index 34eaae94d3f0a..c9d9e0b592a8b 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -1,4 +1,3 @@ -from collections import defaultdict from dataclasses import dataclass from typing import Any, List, Mapping, Optional, Union @@ -13,7 +12,7 @@ NUM_AGENT_STEPS_TRAINED, NUM_ENV_STEPS_TRAINED, ) -from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule +from ray.rllib.utils.schedules.scheduler import Scheduler from ray.rllib.utils.typing import ResultDict @@ -49,21 +48,11 @@ def build(self) -> None: super().build() # Build entropy coeff scheduling tools. 
- if self.hps.entropy_coeff_schedule: - # Custom schedule, based on list of - # ([ts], [value to be reached by ts])-tuples. - self.entropy_coeff_schedule_per_module = defaultdict( - lambda: PiecewiseSchedule( - self.hps.entropy_coeff_schedule, - outside_value=self.hps.entropy_coeff_schedule[-1][-1], - framework=None, - ) - ) - # As initial entropy coeff value, use the first timestep's (must be 0) - # value. - self.curr_entropy_coeffs_per_module = defaultdict( - lambda: self._get_tensor_variable(self.hps.entropy_coeff_schedule[0][1]) - ) + self.entropy_coeff_scheduler = Scheduler( + fixed_value=self.hps.entropy_coeff, + schedule=self.hps.entropy_coeff_schedule, + framework=self.framework, + ) @override(Learner) def compile_results( diff --git a/rllib/algorithms/impala/tf/impala_tf_learner.py b/rllib/algorithms/impala/tf/impala_tf_learner.py index fa2481b5bbd5b..fd04a6bf8d23c 100644 --- a/rllib/algorithms/impala/tf/impala_tf_learner.py +++ b/rllib/algorithms/impala/tf/impala_tf_learner.py @@ -1,11 +1,9 @@ -from typing import Any, Dict, Mapping +from typing import Mapping from ray.rllib.algorithms.impala.impala_learner import ImpalaLearner from ray.rllib.algorithms.impala.tf.vtrace_tf_v2 import make_time_major, vtrace_tf2 -from ray.rllib.algorithms.ppo.ppo_learner import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.learner.learner import ENTROPY_KEY from ray.rllib.core.learner.tf.tf_learner import TfLearner -from ray.rllib.core.rl_module.rl_module import ModuleID from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf @@ -102,21 +100,3 @@ def compute_loss_per_module( "vf_loss": mean_vf_loss, ENTROPY_KEY: -mean_entropy_loss, } - - @override(ImpalaLearner) - def additional_update_per_module( - self, module_id: ModuleID, timestep: int - ) -> Dict[str, Any]: - results = super().additional_update_per_module( - module_id, - timestep=timestep, - ) - - # Update entropy coefficient. 
- value = self.hps.entropy_coeff - if self.hps.entropy_coeff_schedule is not None: - value = self.entropy_coeff_schedule_per_module[module_id].value(t=timestep) - self.curr_entropy_coeffs_per_module[module_id].assign(value) - results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: value}) - - return results diff --git a/rllib/algorithms/impala/torch/impala_torch_learner.py b/rllib/algorithms/impala/torch/impala_torch_learner.py index 907c2d4e32611..50dd9911823fb 100644 --- a/rllib/algorithms/impala/torch/impala_torch_learner.py +++ b/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -1,14 +1,12 @@ -from typing import Any, Dict, Mapping +from typing import Mapping from ray.rllib.algorithms.impala.impala_learner import ImpalaLearner from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( vtrace_torch, make_time_major, ) -from ray.rllib.algorithms.ppo.ppo_learner import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.learner.learner import ENTROPY_KEY from ray.rllib.core.learner.torch.torch_learner import TorchLearner -from ray.rllib.core.rl_module.rl_module import ModuleID from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch @@ -118,21 +116,3 @@ def compute_loss_per_module( "vf_loss": mean_vf_loss, ENTROPY_KEY: -mean_entropy_loss, } - - @override(ImpalaLearner) - def additional_update_per_module( - self, module_id: ModuleID, timestep: int - ) -> Dict[str, Any]: - results = super().additional_update_per_module( - module_id, - timestep=timestep, - ) - - # Update entropy coefficient. - value = self.hps.entropy_coeff - if self.hps.entropy_coeff_schedule is not None: - value = self.entropy_coeff_schedule_per_module[module_id].value(t=timestep) - self.curr_entropy_coeffs_per_module[module_id].data = torch.tensor(value) - results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: value}) - - return results diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index ed82bec06393c..05f531117901e 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -13,7 +13,6 @@ import logging from typing import List, Optional, Type, Union, TYPE_CHECKING -from ray.util.debug import log_once from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided from ray.rllib.algorithms.pg import PGConfig @@ -25,21 +24,19 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.execution.rollout_ops import ( standardize_fields, + synchronous_parallel_sample, ) from ray.rllib.execution.train_ops import ( train_one_step, multi_gpu_train_one_step, ) -from ray.rllib.utils.annotations import ExperimentalAPI from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override +from ray.rllib.utils.annotations import ExperimentalAPI, override from ray.rllib.utils.deprecation import ( DEPRECATED_VALUE, deprecation_warning, ) from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY -from ray.rllib.utils.typing import ResultDict -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample from ray.rllib.utils.metrics import ( NUM_AGENT_STEPS_SAMPLED, NUM_ENV_STEPS_SAMPLED, @@ -47,6 +44,9 @@ SAMPLE_TIMER, ALL_MODULES, ) +from ray.rllib.utils.schedules.scheduler import Scheduler +from ray.rllib.utils.typing import ResultDict +from ray.util.debug import log_once if TYPE_CHECKING: from ray.rllib.core.learner.learner import 
Learner @@ -326,26 +326,12 @@ def validate(self) -> None: if self.entropy_coeff < 0.0: raise ValueError("`entropy_coeff` must be >= 0.0") # Entropy coeff schedule checking. - # For the new Learner API stack, any schedule must start at ts 0 to avoid - # ambiguity (user might think that the `entropy_coeff` setting plays a role as - # well of that that's the initial entropy coeff, when it isn't). - if self._enable_learner_api and self.entropy_coeff_schedule is not None: - if not isinstance(self.entropy_coeff_schedule, (list, tuple)) or ( - len(self.entropy_coeff_schedule) < 2 - ): - raise ValueError( - f"Invalid `entropy_coeff_schedule` ({self.entropy_coeff_schedule}) " - "specified! Must be a list of at least 2 tuples, each of the form " - "(`timestep`, `coeff to reach`), e.g. " - "`[(0, 0.01, (1e6, 0.001), (2e6, 0.0005)]`." - ) - elif self.entropy_coeff_schedule[0][0] != 0: - raise ValueError( - "When providing a `entropy_coeff_schedule`, the first timestep must" - " be 0 and the corresponding value is the initial coefficient! " - f"You provided ts={self.entropy_coeff_schedule[0][0]} " - f"entropy_coeff={self.entropy_coeff_schedule[0][1]}." - ) + if self._enable_learner_api: + Scheduler.validate( + self.entropy_coeff_schedule, + "entropy_coeff_schedule", + "entropy coefficient", + ) class UpdateKL: diff --git a/rllib/algorithms/ppo/ppo_learner.py b/rllib/algorithms/ppo/ppo_learner.py index ee7d4df482e8f..0e3763638aa4b 100644 --- a/rllib/algorithms/ppo/ppo_learner.py +++ b/rllib/algorithms/ppo/ppo_learner.py @@ -5,7 +5,7 @@ from ray.rllib.core.learner.learner import LearnerHyperparameters from ray.rllib.core.learner.learner import Learner from ray.rllib.utils.annotations import override -from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule +from ray.rllib.utils.schedules.scheduler import Scheduler LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY = "vf_loss_unclipped" @@ -34,10 +34,6 @@ class to configure your algorithm. entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = None vf_loss_coeff: float = None - # TODO: Move to base LearnerHyperparameter class (and handling of this setting - # into base Learners). - lr_schedule: Optional[List[List[Union[int, float]]]] = None - class PPOLearner(Learner): @override(Learner) @@ -45,26 +41,11 @@ def build(self) -> None: super().build() # Build entropy coeff scheduling tools. - if self.hps.entropy_coeff_schedule: - # Custom schedule, based on list of - # ([ts], [value to be reached by ts])-tuples. - self.entropy_coeff_schedule_per_module = defaultdict( - lambda: PiecewiseSchedule( - self.hps.entropy_coeff_schedule, - outside_value=self.hps.entropy_coeff_schedule[-1][-1], - framework=None, - ) - ) - # As initial entropy coeff value, use the first timestep's (must be 0) - # value. - self.curr_entropy_coeffs_per_module = defaultdict( - lambda: self._get_tensor_variable(self.hps.entropy_coeff_schedule[0][1]) - ) - # If no schedule, pin entropy coeff to its given (fixed) value. - else: - self.curr_entropy_coeffs_per_module = defaultdict( - lambda: self.hps.entropy_coeff - ) + self.entropy_coeff_scheduler = Scheduler( + fixed_value=self.hps.entropy_coeff, + schedule=self.hps.entropy_coeff_schedule, + framework=self.framework, + ) # Set up KL coefficient variables (per module). 
# Note that the KL coeff is not controlled by a schedul, but seeks diff --git a/rllib/algorithms/ppo/tf/ppo_tf_learner.py b/rllib/algorithms/ppo/tf/ppo_tf_learner.py index e65f658688252..6ab8960a326a5 100644 --- a/rllib/algorithms/ppo/tf/ppo_tf_learner.py +++ b/rllib/algorithms/ppo/tf/ppo_tf_learner.py @@ -3,7 +3,6 @@ from ray.rllib.algorithms.ppo.ppo_learner import ( LEARNER_RESULTS_KL_KEY, - LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY, LEARNER_RESULTS_CURR_KL_COEFF_KEY, LEARNER_RESULTS_VF_EXPLAINED_VAR_KEY, LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY, @@ -97,7 +96,8 @@ def compute_loss_per_module( total_loss = tf.reduce_mean( -surrogate_loss + self.hps.vf_loss_coeff * vf_loss_clipped - - self.curr_entropy_coeffs_per_module[module_id] * curr_entropy + - self.entropy_coeff_scheduler.get_current_value(module_id) * curr_entropy + # - self.curr_entropy_coeffs_per_module[module_id] * curr_entropy ) # Add mean_kl_loss (already processed through `reduce_mean_valid`), @@ -139,11 +139,4 @@ def additional_update_per_module( curr_var.assign(curr_var * 0.5) results.update({LEARNER_RESULTS_CURR_KL_COEFF_KEY: curr_var.numpy()}) - # Update entropy coefficient. - value = self.hps.entropy_coeff - if self.hps.entropy_coeff_schedule is not None: - value = self.entropy_coeff_schedule_per_module[module_id].value(t=timestep) - self.curr_entropy_coeffs_per_module[module_id].assign(value) - results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: value}) - return results diff --git a/rllib/algorithms/ppo/torch/ppo_torch_learner.py b/rllib/algorithms/ppo/torch/ppo_torch_learner.py index 675539f50f5c8..95fd17e09331a 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_learner.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_learner.py @@ -3,7 +3,6 @@ from ray.rllib.algorithms.ppo.ppo_learner import ( LEARNER_RESULTS_KL_KEY, - LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY, LEARNER_RESULTS_CURR_KL_COEFF_KEY, LEARNER_RESULTS_VF_EXPLAINED_VAR_KEY, LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY, @@ -135,11 +134,4 @@ def additional_update_per_module( curr_var.data *= 0.5 results.update({LEARNER_RESULTS_CURR_KL_COEFF_KEY: curr_var.item()}) - # Update entropy coefficient. - value = self.hps.entropy_coeff - if self.hps.entropy_coeff_schedule is not None: - value = self.entropy_coeff_schedule_per_module[module_id].value(t=timestep) - self.curr_entropy_coeffs_per_module[module_id].data = torch.tensor(value) - results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: value}) - return results diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 688d1574d0342..5e4f60a678029 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -47,7 +47,7 @@ OverrideToImplementCustomLogic, OverrideToImplementCustomLogic_CallToSuperRecommended, ) -from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule +from ray.rllib.utils.schedules.scheduler import Scheduler torch, _ = try_import_torch() tf1, tf, tfv = try_import_tf() @@ -622,24 +622,12 @@ def build(self) -> None: return self._is_built = True - # Generic LR scheduling tools. - self.lr_scheduler = None - if self.hps.lr_schedule is not None: - # Custom schedule, based on list of - # ([ts], [value to be reached by ts])-tuples. 
- self.lr_schedule_per_module = defaultdict( - lambda: PiecewiseSchedule( - self.hps.lr_schedule, - outside_value=self.hps.lr_schedule[-1][-1], - framework=None, - ) - ) - self.curr_lr_per_module = defaultdict( - lambda: self._get_tensor_variable(self._optimizer_config["lr"]) - ) - # If no schedule, pin learning rate to its given (fixed) value. - else: - self.curr_lr_per_module = defaultdict(lambda: self._optimizer_config["lr"]) + # Build learning rate scheduling tools. + self.lr_scheduler = Scheduler( + fixed_value=self._optimizer_config["lr"], + schedule=self.hps.lr_schedule, + framework=self.framework, + ) self._module = self._make_module() for param_seq, optimizer in self.configure_optimizers(): @@ -773,7 +761,7 @@ def additional_update_per_module(self, module_id: ModuleID, tau: float): return results_all_modules - @OverrideToImplementCustomLogic + @OverrideToImplementCustomLogic_CallToSuperRecommended def additional_update_per_module( self, module_id: ModuleID, **kwargs ) -> Dict[str, Any]: diff --git a/rllib/core/learner/tf/tf_learner.py b/rllib/core/learner/tf/tf_learner.py index 45f98d4f75205..eef34f78cc29c 100644 --- a/rllib/core/learner/tf/tf_learner.py +++ b/rllib/core/learner/tf/tf_learner.py @@ -528,17 +528,23 @@ def filter_fwd_out(x): def additional_update_per_module( self, module_id: ModuleID, *, timestep: int, **kwargs ) -> Mapping[str, Any]: + + results = super().additional_update_per_module(module_id, timestep=timestep) + # Handle lr scheduling updates and apply new learning rates to the optimizers. + new_lr = self.lr_scheduler.update(timestep=timestep) + + # Not sure why we need to do this here besides setting the original + # tf Variable `self.curr_lr_per_module[module_id]`. But when tf creates the + # optimizer, it seems to detach its lr value from the given variable. + # Updating this variable is NOT sufficient to update the actual optimizer's + # learning rate, so we have to explicitly set it here. if self.hps.lr_schedule is not None: - value = self.lr_schedule_per_module[module_id].value(t=timestep) - self.curr_lr_per_module[module_id].assign(value) - # Not sure why we need to do this here besides setting the original - # tf Variable `self.curr_lr_per_module[module_id]`. When tf creates the - # optimizer, maybe it detaches its lr value from the given variable? - self._named_optimizers[module_id].lr = value - return { - LEARNER_RESULTS_CURR_LR_KEY: self._named_optimizers[module_id].lr.numpy() - } + self._named_optimizers[module_id].lr = new_lr + + results.update({LEARNER_RESULTS_CURR_LR_KEY: new_lr}) + + return results @override(Learner) def _get_tensor_variable(self, value, dtype=None, trainable=False) -> "tf.Tensor": diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py index 43eb0ac9910be..a6c67f5afc1e4 100644 --- a/rllib/core/learner/torch/torch_learner.py +++ b/rllib/core/learner/torch/torch_learner.py @@ -100,12 +100,13 @@ def compute_gradients( def additional_update_per_module( self, module_id: ModuleID, *, timestep: int, **kwargs ) -> Mapping[str, Any]: + results = super().additional_update_per_module(module_id, timestep=timestep) + # Handle lr scheduling updates and apply new learning rates to the optimizers. 
-        value = self._optimizer_config["lr"]
-        if self.hps.lr_schedule is not None:
-            value = self.lr_schedule_per_module[module_id].value(t=timestep)
-            self.curr_lr_per_module[module_id].data = torch.tensor(value)
-        return {LEARNER_RESULTS_CURR_LR_KEY: value}
+        new_lr = self.lr_scheduler.update(timestep=timestep)
+        results.update({LEARNER_RESULTS_CURR_LR_KEY: new_lr})
+
+        return results

     @override(Learner)
     def postprocess_gradients(
diff --git a/rllib/utils/schedules/scheduler.py b/rllib/utils/schedules/scheduler.py
new file mode 100644
index 0000000000000..6b46e90423a4f
--- /dev/null
+++ b/rllib/utils/schedules/scheduler.py
@@ -0,0 +1,91 @@
+from collections import defaultdict
+
+from ray.rllib.utils.framework import try_import_tf, try_import_torch
+from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule
+
+
+_, tf, _ = try_import_tf()
+torch, _ = try_import_torch()
+
+
+class Scheduler:
+    def __init__(self, fixed_value, schedule, framework):
+
+        self.use_schedule = schedule is not None
+        self.framework = framework
+
+        if self.use_schedule:
+            # Custom schedule, based on list of
+            # ([ts], [value to be reached by ts])-tuples.
+            self.entropy_coeff_schedule_per_module = defaultdict(
+                lambda: PiecewiseSchedule(
+                    schedule,
+                    outside_value=schedule[-1][-1],
+                    framework=None,
+                )
+            )
+            # As initial entropy coeff value, use the first timestep's (must be 0)
+            # value.
+            self.curr_entropy_coeffs_per_module = defaultdict(
+                lambda: self._get_tensor_variable(
+                    schedule[0][1],
+                    framework=self.framework,
+                )
+            )
+        # If no schedule, pin entropy coeff to its given (fixed) value.
+        else:
+            self.curr_entropy_coeffs_per_module = defaultdict(lambda: fixed_value)
+
+    @staticmethod
+    def validate(schedule, schedule_name, value_name):
+        # Schedule checking: Any schedule data must start at ts=0 to avoid
+        # ambiguity (user might think that the fixed setting (e.g. `lr` vs
+        # `lr_schedule`) plays a role as well of that that's the initial value,
+        # when it isn't). If a schedule is provided (not None), we ignore the
+        # corresponding fixed value.
+        if schedule is not None:
+            if not isinstance(schedule, (list, tuple)) or (len(schedule) < 2):
+                raise ValueError(
+                    f"Invalid `{schedule_name}` ({schedule}) specified! Must be a "
+                    "list of at least 2 tuples, each of the form "
+                    f"(`timestep`, `{value_name} to reach`), e.g. "
+                    "`[(0, 0.001), (1e6, 0.0001), (2e6, 0.00005)]`."
+                )
+            elif schedule[0][0] != 0:
+                raise ValueError(
+                    f"When providing a `{schedule_name}`, the first timestep must be 0 "
+                    f"and the corresponding value is the initial {value_name}! You "
+                    f"provided ts={schedule[0][0]} {value_name}={schedule[0][1]}."
+ ) + + def get_current_value(self, module_id): + return self.curr_entropy_coeffs_per_module[module_id] + + def update(self, module_id, timestep: int) -> float: + if self.use_schedule: + value = self.entropy_coeff_schedule_per_module[module_id].value(t=timestep) + if self.framework == "torch": + self.curr_entropy_coeffs_per_module[module_id].data = torch.tensor( + value + ) + else: + self.curr_entropy_coeffs_per_module[module_id].assign(value) + else: + value = self.curr_entropy_coeffs_per_module[module_id] + + return value + + def _get_tensor_variable(self, initial_value, framework, device=None): + if framework == "torch": + return torch.tensor( + initial_value, + requires_grad=False, + dtype=torch.float32, + device=device, + ) + else: + return tf.Variable( + initial_value, + trainable=False, + dtype=tf.float32, + ) From 6e9b0cdbc820ddf41881eae2be00488c22639f74 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 7 May 2023 19:54:43 +0200 Subject: [PATCH 32/34] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 3 +- rllib/algorithms/impala/impala_learner.py | 21 ++- rllib/algorithms/ppo/ppo_learner.py | 22 ++- .../ppo/tests/test_ppo_with_rl_module.py | 15 ++- .../algorithms/ppo/torch/ppo_torch_learner.py | 2 +- rllib/core/learner/learner.py | 4 + rllib/core/learner/tf/tf_learner.py | 6 +- rllib/core/learner/torch/torch_learner.py | 6 +- rllib/utils/schedules/scheduler.py | 127 +++++++++++++----- 9 files changed, 159 insertions(+), 47 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 8b2720e9c15b3..241645f632a7a 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1595,7 +1595,8 @@ def training( lr_schedule: Learning rate schedule. In the format of [[timestep, lr-value], [timestep, lr-value], ...] Intermediary timesteps will be assigned to interpolated learning rate - values. A schedule must start from timestep 0. + values. A schedule config's first entry must start with timestep 0, + i.e.: [[0, initial_value], [...]]. grad_clip: The value to use for gradient clipping. Depending on the `grad_clip_by` setting, gradients will either be clipped by value, norm, or global_norm (see docstring on `grad_clip_by` below for more diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index c9d9e0b592a8b..3bfea8e28ed07 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -1,10 +1,11 @@ from dataclasses import dataclass -from typing import Any, List, Mapping, Optional, Union +from typing import Any, Dict, List, Mapping, Optional, Union import numpy as np import tree # pip install dm_tree from ray.rllib.core.learner.learner import Learner, LearnerHyperparameters +from ray.rllib.core.rl_module.rl_module import ModuleID from ray.rllib.policy.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.metrics import ( @@ -16,6 +17,9 @@ from ray.rllib.utils.typing import ResultDict +LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff" + + @dataclass class ImpalaHyperparameters(LearnerHyperparameters): """Hyperparameters for the ImpalaLearner sub-classes (framework specific). 
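For orientation, a minimal usage sketch of the `Scheduler` utility introduced above. The module ID "default_policy", the example schedule, and the concrete values are illustrative assumptions, not part of this patch; only the `Scheduler` API itself (the constructor, `validate()`, `get_current_value()`, and `update()`) comes from rllib/utils/schedules/scheduler.py:

    # Illustrative sketch (assumed names/values); API per rllib/utils/schedules/scheduler.py.
    from ray.rllib.utils.schedules.scheduler import Scheduler

    entropy_coeff_schedule = [[0, 0.01], [1_000_000, 0.001]]

    # Config-time check: a schedule needs at least 2 entries and must start at ts=0.
    Scheduler.validate(
        entropy_coeff_schedule, "entropy_coeff_schedule", "entropy coefficient"
    )

    # Built once per scheduled hyperparameter (e.g. in a Learner's `build()`).
    scheduler = Scheduler(
        fixed_value=0.01,  # only used when `schedule` is None
        schedule=entropy_coeff_schedule,
        framework="torch",
    )

    # Loss code reads the current value as a framework tensor variable ...
    coeff = scheduler.get_current_value("default_policy")

    # ... and the per-module additional-update step advances it with the global
    # timestep (linear interpolation between the schedule's endpoints).
    new_coeff = scheduler.update("default_policy", timestep=500_000)  # -> 0.0055
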
@@ -52,8 +56,23 @@ def build(self) -> None: fixed_value=self.hps.entropy_coeff, schedule=self.hps.entropy_coeff_schedule, framework=self.framework, + device=self._device, ) + @override(Learner) + def additional_update_per_module( + self, module_id: ModuleID, timestep: int + ) -> Dict[str, Any]: + results = super().additional_update_per_module(module_id, timestep=timestep) + + # Update entropy coefficient via our Scheduler. + new_entropy_coeff = ( + self.entropy_coeff_scheduler.update(module_id, timestep=timestep) + ) + results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff}) + + return results + @override(Learner) def compile_results( self, diff --git a/rllib/algorithms/ppo/ppo_learner.py b/rllib/algorithms/ppo/ppo_learner.py index 0e3763638aa4b..7f75b9e1a06a5 100644 --- a/rllib/algorithms/ppo/ppo_learner.py +++ b/rllib/algorithms/ppo/ppo_learner.py @@ -1,9 +1,10 @@ from collections import defaultdict from dataclasses import dataclass -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from ray.rllib.core.learner.learner import LearnerHyperparameters from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.rl_module.rl_module import ModuleID from ray.rllib.utils.annotations import override from ray.rllib.utils.schedules.scheduler import Scheduler @@ -45,6 +46,7 @@ def build(self) -> None: fixed_value=self.hps.entropy_coeff, schedule=self.hps.entropy_coeff_schedule, framework=self.framework, + device=self._device, ) # Set up KL coefficient variables (per module). @@ -53,3 +55,21 @@ def build(self) -> None: self.curr_kl_coeffs_per_module = defaultdict( lambda: self._get_tensor_variable(self.hps.kl_coeff) ) + + @override(Learner) + def additional_update_per_module( + self, module_id: ModuleID, sampled_kl_values: dict, timestep: int + ) -> Dict[str, Any]: + results = super().additional_update_per_module( + module_id, + sampled_kl_values=sampled_kl_values, + timestep=timestep, + ) + + # Update entropy coefficient via our Scheduler. + new_entropy_coeff = ( + self.entropy_coeff_scheduler.update(module_id, timestep=timestep) + ) + results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff}) + + return results diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 3d80e8c74f1d6..5b9eaaf2d3973 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -126,17 +126,22 @@ def test_ppo_compilation_and_schedule_mixins(self): config.training(model=get_model_config(fw, lstm=lstm)) algo = config.build(env=env) + # TODO: Maybe add an API to get the Learner(s) instances within + # a learner group, remote or not. learner = algo.learner_group._learner optim = algo.learner_group._learner._named_optimizers[ DEFAULT_POLICY_ID ] - entropy_coeff = learner.curr_entropy_coeffs_per_module[ - DEFAULT_POLICY_ID - ] + # Check initial LR directly set in optimizer vs the first (ts=0) + # value from the schedule. lr = optim.param_groups[0]["lr"] if fw == "torch" else optim.lr + check(lr, config.lr_schedule[0][1]) + + # Check current entropy coeff value using the respective Scheduler. + entropy_coeff = learner.entropy_coeff_scheduler.get_current_value( + DEFAULT_POLICY_ID + ) check(entropy_coeff, 0.1) - # Check initial LR directly set in optimizer. 
- check(lr, config.lr) for i in range(num_iterations): results = algo.train() diff --git a/rllib/algorithms/ppo/torch/ppo_torch_learner.py b/rllib/algorithms/ppo/torch/ppo_torch_learner.py index 95fd17e09331a..aa32a956d9780 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_learner.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_learner.py @@ -92,7 +92,7 @@ def compute_loss_per_module( total_loss = torch.mean( -surrogate_loss + self.hps.vf_loss_coeff * vf_loss_clipped - - self.curr_entropy_coeffs_per_module[module_id] * curr_entropy + - self.entropy_coeff_scheduler.get_current_value(module_id) * curr_entropy ) # Add mean_kl_loss (already processed through `reduce_mean_valid`), diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 5e4f60a678029..ef2e3fc012174 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -245,6 +245,7 @@ def __init__( self._module_obj = module self._optimizer_config = optimizer_config self._hps = learner_hyperparameters or LearnerHyperparameters() + self._device = None # pick the configs that we need for the learner from scaling config self._learner_group_scaling_config = ( @@ -623,10 +624,13 @@ def build(self) -> None: self._is_built = True # Build learning rate scheduling tools. + # TODO (sven): Move lr from optimizer config to Learner HPs? + # We might not need optimizer config. self.lr_scheduler = Scheduler( fixed_value=self._optimizer_config["lr"], schedule=self.hps.lr_schedule, framework=self.framework, + device=self._device, ) self._module = self._make_module() diff --git a/rllib/core/learner/tf/tf_learner.py b/rllib/core/learner/tf/tf_learner.py index eef34f78cc29c..bfe03311cadca 100644 --- a/rllib/core/learner/tf/tf_learner.py +++ b/rllib/core/learner/tf/tf_learner.py @@ -88,9 +88,7 @@ def configure_optimizer_per_module( self, module_id: ModuleID ) -> Union[ParamOptimizerPair, NamedParamOptimizerPairs]: module = self._module[module_id] - # TODO (sven): Move lr from optimizer config to Learner HPs? - # We might not need optimizer config. - lr = self.curr_lr_per_module[module_id] + lr = self.lr_scheduler.get_current_value(module_id) optim = tf.keras.optimizers.Adam(learning_rate=lr) pair: ParamOptimizerPair = ( self.get_parameters(module), @@ -532,7 +530,7 @@ def additional_update_per_module( results = super().additional_update_per_module(module_id, timestep=timestep) # Handle lr scheduling updates and apply new learning rates to the optimizers. - new_lr = self.lr_scheduler.update(timestep=timestep) + new_lr = self.lr_scheduler.update(module_id=module_id, timestep=timestep) # Not sure why we need to do this here besides setting the original # tf Variable `self.curr_lr_per_module[module_id]`. But when tf creates the diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py index a6c67f5afc1e4..29c4a2e4fc456 100644 --- a/rllib/core/learner/torch/torch_learner.py +++ b/rllib/core/learner/torch/torch_learner.py @@ -74,9 +74,7 @@ def configure_optimizer_per_module( self, module_id: ModuleID ) -> Union[ParamOptimizerPair, NamedParamOptimizerPairs]: module = self._module[module_id] - # TODO (sven): Move lr from optimizer config to Learner HPs? - # We might not need optimizer config. 
-        lr = self.curr_lr_per_module[module_id]
+        lr = self.lr_scheduler.get_current_value(module_id)
         pair: ParamOptimizerPair = (
             self.get_parameters(module),
             torch.optim.Adam(self.get_parameters(module), lr=lr),
@@ -103,7 +101,7 @@ def additional_update_per_module(
         results = super().additional_update_per_module(module_id, timestep=timestep)

         # Handle lr scheduling updates and apply new learning rates to the optimizers.
-        new_lr = self.lr_scheduler.update(timestep=timestep)
+        new_lr = self.lr_scheduler.update(module_id=module_id, timestep=timestep)
         results.update({LEARNER_RESULTS_CURR_LR_KEY: new_lr})

         return results
diff --git a/rllib/utils/schedules/scheduler.py b/rllib/utils/schedules/scheduler.py
index 6b46e90423a4f..37a409680c78b 100644
--- a/rllib/utils/schedules/scheduler.py
+++ b/rllib/utils/schedules/scheduler.py
@@ -1,7 +1,10 @@
 from collections import defaultdict
+from typing import List, Optional, Tuple

+from ray.rllib.core.rl_module.rl_module import ModuleID
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule
+from ray.rllib.utils.typing import TensorType


 _, tf, _ = try_import_tf()
@@ -9,40 +12,79 @@


 class Scheduler:
-    def __init__(self, fixed_value, schedule, framework):
+    """Class to manage a scheduled (framework-dependent) tensor variable.

+    Uses the PiecewiseSchedule (for maximum configuration flexibility)
+    """
+    def __init__(
+        self,
+        *,
+        fixed_value: Optional[float] = None,
+        schedule: Optional[List[Tuple[int, float]]] = None,
+        framework: str = "torch",
+        device: Optional[str] = None,
+    ):
+        """Initializes a Scheduler instance.
+
+        Args:
+            fixed_value: A fixed, constant value (in case no schedule should be used).
+                Set `schedule` to None to always just use this fixed value.
+                If `fixed_value` is None, `schedule` must be provided.
+            schedule: The schedule configuration to use. In the format of
+                [[timestep, value], [timestep, value], ...]
+                Intermediary timesteps will be assigned to interpolated values (linear
+                interpolation will be used). A schedule config's first entry must
+                start with timestep 0, i.e.: [[0, initial_value], [...]].
+            framework: The framework string, for which to create the tensor variable
+                that holds the current value. This is the variable that can be used in
+                the graph, e.g. in a loss function.
+            device: Optional device (for torch) to place the tensor variable on.
+        """
         self.use_schedule = schedule is not None
         self.framework = framework
+        self.device = device

         if self.use_schedule:
             # Custom schedule, based on list of
             # ([ts], [value to be reached by ts])-tuples.
-            self.entropy_coeff_schedule_per_module = defaultdict(
+            self.schedule_per_module = defaultdict(
                 lambda: PiecewiseSchedule(
                     schedule,
                     outside_value=schedule[-1][-1],
                     framework=None,
                 )
             )
-            # As initial entropy coeff value, use the first timestep's (must be 0)
-            # value.
-            self.curr_entropy_coeffs_per_module = defaultdict(
-                lambda: self._get_tensor_variable(
-                    schedule[0][1],
-                    framework=self.framework,
-                )
+            # As initial tensor value, use the first timestep's (must be 0) value.
+            self.curr_value_per_module = defaultdict(
+                lambda: self._create_tensor_variable(initial_value=schedule[0][1])
             )
-        # If no schedule, pin entropy coeff to its given (fixed) value.
+        # If no schedule, pin (fix) given value.
         else:
-            self.curr_entropy_coeffs_per_module = defaultdict(lambda: fixed_value)
+            self.curr_value_per_module = defaultdict(lambda: fixed_value)

     @staticmethod
-    def validate(schedule, schedule_name, value_name):
-        # Schedule checking: Any schedule data must start at ts=0 to avoid
-        # ambiguity (user might think that the fixed setting (e.g. `lr` vs
-        # `lr_schedule`) plays a role as well of that that's the initial value,
-        # when it isn't). If a schedule is provided (not None), we ignore the
-        # corresponding fixed value.
+    def validate(
+        schedule: Optional[List[Tuple[int, float]]],
+        schedule_name: str,
+        value_name: str,
+    ) -> None:
+        """Performs checking of a certain schedule configuration.
+
+        The first entry in `schedule` must have a timestep of 0.
+
+        Args:
+            schedule: The schedule configuration to check. In the format of
+                [[timestep, value], [timestep, value], ...]
+                Intermediary timesteps will be assigned to interpolated values (linear
+                interpolation will be used). A schedule config's first entry must
+                start with timestep 0, i.e.: [[0, initial_value], [...]].
+            schedule_name: The name of the schedule, e.g. `lr_schedule`.
+            value_name: A full text description of the variable that's being scheduled,
+                e.g. `learning rate`.
+
+        Raises:
+            ValueError: In case errors are found in the schedule's format.
+        """
         if schedule is not None:
             if not isinstance(schedule, (list, tuple)) or (len(schedule) < 2):
                 raise ValueError(
@@ -58,30 +100,55 @@ def validate(schedule, schedule_name, value_name):
                     f"provided ts={schedule[0][0]} {value_name}={schedule[0][1]}."
                 )

-    def get_current_value(self, module_id):
-        return self.curr_entropy_coeffs_per_module[module_id]
+    def get_current_value(self, module_id: ModuleID) -> TensorType:
+        """Returns the current value (as a tensor variable), given a ModuleID.
+
+        Args:
+            module_id: The module ID, for which to retrieve the current tensor value.
+
+        Returns:
+            The tensor variable (holding the current value to be used).
+        """
+        return self.curr_value_per_module[module_id]

-    def update(self, module_id, timestep: int) -> float:
+    def update(self, module_id: ModuleID, timestep: int) -> float:
+        """Updates the underlying (framework specific) tensor variable.
+
+        Args:
+            module_id: The module ID, for which to update the tensor variable.
+            timestep: The current timestep.
+
+        Returns:
+            The current value of the tensor variable as a python float.
+        """
         if self.use_schedule:
-            value = self.entropy_coeff_schedule_per_module[module_id].value(t=timestep)
+            python_value = (
+                self.schedule_per_module[module_id].value(t=timestep)
+            )
             if self.framework == "torch":
-                self.curr_entropy_coeffs_per_module[module_id].data = torch.tensor(
-                    value
-                )
+                self.curr_value_per_module[module_id].data = torch.tensor(python_value)
             else:
-                self.curr_entropy_coeffs_per_module[module_id].assign(value)
+                self.curr_value_per_module[module_id].assign(python_value)
         else:
-            value = self.curr_entropy_coeffs_per_module[module_id]
+            python_value = self.curr_value_per_module[module_id]
+
+        return python_value
+
+    def _create_tensor_variable(self, initial_value: float) -> TensorType:
+        """Creates a framework-specific tensor variable to be scheduled.

-        return value
+        Args:
+            initial_value: The initial (float) value for the variable to hold.

-    def _get_tensor_variable(self, initial_value, framework, device=None):
-        if framework == "torch":
+        Returns:
+            The created framework-specific tensor variable.
+ """ + if self.framework == "torch": return torch.tensor( initial_value, requires_grad=False, dtype=torch.float32, - device=device, + device=self.device, ) else: return tf.Variable( From 7f85d0acc7fd73fc99edd735e5068f1dadf4aa7a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 7 May 2023 22:10:44 +0200 Subject: [PATCH 33/34] LINT Signed-off-by: sven1977 --- rllib/algorithms/impala/impala_learner.py | 4 ++-- rllib/algorithms/ppo/ppo_learner.py | 4 ++-- rllib/utils/schedules/scheduler.py | 5 ++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index 3bfea8e28ed07..7e1153f4acc40 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -66,8 +66,8 @@ def additional_update_per_module( results = super().additional_update_per_module(module_id, timestep=timestep) # Update entropy coefficient via our Scheduler. - new_entropy_coeff = ( - self.entropy_coeff_scheduler.update(module_id, timestep=timestep) + new_entropy_coeff = self.entropy_coeff_scheduler.update( + module_id, timestep=timestep ) results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff}) diff --git a/rllib/algorithms/ppo/ppo_learner.py b/rllib/algorithms/ppo/ppo_learner.py index 7f75b9e1a06a5..be16bbb531121 100644 --- a/rllib/algorithms/ppo/ppo_learner.py +++ b/rllib/algorithms/ppo/ppo_learner.py @@ -67,8 +67,8 @@ def additional_update_per_module( ) # Update entropy coefficient via our Scheduler. - new_entropy_coeff = ( - self.entropy_coeff_scheduler.update(module_id, timestep=timestep) + new_entropy_coeff = self.entropy_coeff_scheduler.update( + module_id, timestep=timestep ) results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff}) diff --git a/rllib/utils/schedules/scheduler.py b/rllib/utils/schedules/scheduler.py index 37a409680c78b..7d349329791bd 100644 --- a/rllib/utils/schedules/scheduler.py +++ b/rllib/utils/schedules/scheduler.py @@ -16,6 +16,7 @@ class Scheduler: Uses the PiecewiseSchedule (for maximum configuration flexibility) """ + def __init__( self, *, @@ -122,9 +123,7 @@ def update(self, module_id: ModuleID, timestep: int) -> float: The current value of the tensor variable as a python float. """ if self.use_schedule: - python_value = ( - self.schedule_per_module[module_id].value(t=timestep) - ) + python_value = self.schedule_per_module[module_id].value(t=timestep) if self.framework == "torch": self.curr_value_per_module[module_id].data = torch.tensor(python_value) else: From 1bb3ec253db01905b5058c5302536fffbeb6a077 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 7 May 2023 22:17:20 +0200 Subject: [PATCH 34/34] fix Signed-off-by: sven1977 --- .../algorithms/ppo/tests/test_ppo_with_rl_module.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 5b9eaaf2d3973..c700ff7ab16e0 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -227,23 +227,20 @@ def test_ppo_free_log_std_with_rl_modules(self): for fw in framework_iterator(config, frameworks=("torch", "tf2")): algo = config.build() policy = algo.get_policy() + learner = algo.learner_group._learner + module = learner.module[DEFAULT_POLICY_ID] # Check the free log std var is created. 
if fw == "torch": - matching = [ - v for (n, v) in policy.model.named_parameters() if "log_std" in n - ] + matching = [v for (n, v) in module.named_parameters() if "log_std" in n] else: matching = [ - v for v in policy.model.trainable_variables if "log_std" in str(v) + v for v in module.trainable_variables if "log_std" in str(v) ] assert len(matching) == 1, matching log_std_var = matching[0] - # linter yells at you if you don't pass in the parameters. - # reason: https://docs.python-guide.org/writing/gotchas/ - # #late-binding-closures - def get_value(fw=fw, policy=policy, log_std_var=log_std_var): + def get_value(): if fw == "torch": return log_std_var.detach().cpu().numpy()[0] else: