[build_base][RLlib] APPO TF with RLModule and Learner API #33310

Merged: 51 commits, Mar 26, 2023
Changes from 1 commit

Commits (51)
517e0d6 Temp (avnishn, Mar 13, 2023)
293ac57 Temp (avnishn, Mar 13, 2023)
996d6af Temp (avnishn, Mar 14, 2023)
d83f4de Temp (avnishn, Mar 14, 2023)
3f4e27c Temp (avnishn, Mar 14, 2023)
3495576 Merge branch 'master' of https://github.com/ray-project/ray into appo_tf (avnishn, Mar 14, 2023)
606f093 Make all TfModels tf.keras.Models (ArturNiederfahrenhorst, Mar 15, 2023)
134593e Merge branch 'fixtensorflowacmodels' into appo_tf (avnishn, Mar 15, 2023)
8c5ae9b Running appo (avnishn, Mar 15, 2023)
8a2a0f5 Lint, small updates (avnishn, Mar 16, 2023)
efc74fe Move adding params to learner hps to validate in order to be compatib… (avnishn, Mar 16, 2023)
13548c1 Move adding params to learner hps to validate in order to be compatib… (avnishn, Mar 16, 2023)
cd1270c Move learner_hp assignment from builder functions to validate (avnishn, Mar 16, 2023)
1f00e42 Merge branch 'move_learner_hp_assignment' into appo_tf (avnishn, Mar 16, 2023)
2ec8d08 Temp (avnishn, Mar 17, 2023)
62753d1 Temp (avnishn, Mar 17, 2023)
6c91172 Clip is ratio (avnishn, Mar 17, 2023)
61be19b Merge branch 'master' of https://github.com/ray-project/ray into appo_tf (avnishn, Mar 17, 2023)
f0ea920 Wrote appo tf policy rlm which has working loss but isn't seemingly updating? (avnishn, Mar 19, 2023)
47d4b5a Merge branch 'master' of https://github.com/ray-project/ray into appo_tf (avnishn, Mar 19, 2023)
fb69db2 Add option for minibatching in impala/appo with the learner group (avnishn, Mar 20, 2023)
bb0daeb Merge branch 'master' of https://github.com/ray-project/ray into appo_tf (avnishn, Mar 20, 2023)
68cd9df Store most recent result for results reporting (avnishn, Mar 21, 2023)
5008690 dmc wrapper types (avnishn, Mar 21, 2023)
064adee Merge branch 'appo_tf' of https://github.com/avnishn/ray into appo_tf (avnishn, Mar 21, 2023)
b015015 Add back in UpdateTargetAndKL (avnishn, Mar 21, 2023)
7842087 Merge branch 'appo_tf' of https://github.com/avnishn/ray; branch 'mas… (avnishn, Mar 21, 2023)
2b9fec4 Fix broken tests (avnishn, Mar 21, 2023)
3fb3615 More tf related fixes (avnishn, Mar 21, 2023)
31adb97 More tf related fixes (avnishn, Mar 21, 2023)
25fdcac Fix impala test (avnishn, Mar 21, 2023)
c89e151 Fixing remaining broken tests (avnishn, Mar 21, 2023)
2f0cea9 More tf related fixes (avnishn, Mar 22, 2023)
d61d198 More tf fixes with try catch (avnishn, Mar 22, 2023)
e83b0d4 Addressing comments (avnishn, Mar 22, 2023)
2a5acbe Address comments (avnishn, Mar 23, 2023)
44bf47a Add rl module with target networks mixin interface (avnishn, Mar 23, 2023)
3f113e0 Temp (avnishn, Mar 24, 2023)
5746a8f Address comments (avnishn, Mar 24, 2023)
6d28903 Address comments (avnishn, Mar 24, 2023)
2b3bfb4 Address comments (avnishn, Mar 24, 2023)
288620d Merge branch 'master' of https://github.com/ray-project/ray into appo_tf (avnishn, Mar 24, 2023)
24a1d6e Fix broken import (avnishn, Mar 24, 2023)
9594a6c Lint (avnishn, Mar 24, 2023)
b8e4ec2 Merge branch 'master' of https://github.com/ray-project/ray into appo_tf (avnishn, Mar 24, 2023)
3837d36 Touching a file (avnishn, Mar 24, 2023)
2d6ac06 triggering the tests (kouroshHakha, Mar 25, 2023)
5fabfc6 Merge branch 'master' of https://github.com/ray-project/ray into appo_tf (avnishn, Mar 26, 2023)
117d9dd Merge branch 'appo_tf' of https://github.com/avnishn/ray into appo_tf (avnishn, Mar 26, 2023)
68a19ab Merge branch 'master' into appo_tf (kouroshHakha, Mar 26, 2023)
9e0d54b Merge branch 'appo_tf' of github.com:avnishn/ray into appo_tf (kouroshHakha, Mar 26, 2023)
Wrote appo tf policy rlm which has working loss but isn't seemingly updating?

Signed-off-by: Avnish <[email protected]>
avnishn committed Mar 19, 2023
commit f0ea9206e22693032e4547ee9871710710b56045
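The message above flags that the loss computes but the policy does not appear to be updating. A generic way to sanity-check that (a sketch, not code from this PR) is to snapshot the module weights before and after a `learner_group.update(...)` call and diff them:

```python
import numpy as np

def weights_changed(weights_before, weights_after, atol=1e-8):
    """Return True if any parameter tensor moved by more than atol."""
    return any(
        not np.allclose(before, after, atol=atol)
        for before, after in zip(weights_before, weights_after)
    )
```

Here `weights_before` and `weights_after` are assumed to be parallel lists of NumPy arrays captured around a single update step.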
7 changes: 7 additions & 0 deletions rllib/BUILD
@@ -925,6 +925,13 @@ py_test(
srcs = ["algorithms/appo/tests/test_appo_off_policyness.py"]
)

py_test(
name = "test_appo_learner",
tags = ["team:rllib", "algorithms_dir"],
size = "medium",
srcs = ["algorithms/appo/tests/tf/test_appo_learner.py"]
)

# ARS
py_test(
name = "test_ars",
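Assuming Ray's standard Bazel setup, this registers `test_appo_learner` as a runnable target. Because the test file also defines a pytest `__main__` block (see its diff below), it can presumably be invoked directly as well; a sketch, with the path taken from the `srcs` entry above:

```python
# Sketch: run the new learner test via pytest from a Ray source checkout.
# The path comes from the BUILD entry above; everything else is plain pytest.
import sys

import pytest

sys.exit(pytest.main(["-v", "rllib/algorithms/appo/tests/tf/test_appo_learner.py"]))
```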
6 changes: 3 additions & 3 deletions rllib/algorithms/appo/appo.py
@@ -24,8 +24,6 @@
NUM_AGENT_STEPS_SAMPLED,
NUM_ENV_STEPS_SAMPLED,
NUM_TARGET_UPDATES,
-    NUM_AGENT_STEPS_TRAINED,
-    NUM_ENV_STEPS_TRAINED,
)
from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY
from ray.rllib.utils.typing import (
@@ -378,7 +376,9 @@ def get_default_policy_class(
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2

return EagerTFPolicyV2
-        # from ray.rllib.algorithms.appo.tf.appo_tf_policy_rlm import APPOTfPolicyWithRLModule
+        # TODO(avnishn): This policy class doesn't work just yet
+        # from ray.rllib.algorithms.appo.tf.appo_tf_policy_rlm import(
+        # ) APPOTfPolicyWithRLModule
# return APPOTfPolicyWithRLModule
from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF2Policy

Empty file.
109 changes: 109 additions & 0 deletions rllib/algorithms/appo/tests/tf/test_appo_learner.py
@@ -0,0 +1,109 @@
import unittest
import numpy as np

import ray
import ray.rllib.algorithms.appo as appo
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.metrics import ALL_MODULES
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check, framework_iterator


tf1, tf, _ = try_import_tf()

tf1.enable_eager_execution()

frag_length = 32

FAKE_BATCH = {
SampleBatch.OBS: np.random.uniform(low=0, high=1, size=(frag_length, 4)).astype(
np.float32
),
SampleBatch.ACTIONS: np.random.choice(2, frag_length).astype(np.float32),
SampleBatch.REWARDS: np.random.uniform(low=-1, high=1, size=(frag_length,)).astype(
np.float32
),
SampleBatch.TERMINATEDS: np.array(
[False for _ in range(frag_length - 1)] + [True]
).astype(np.float32),
SampleBatch.VF_PREDS: np.array(
list(reversed(range(frag_length))), dtype=np.float32
),
SampleBatch.ACTION_LOGP: np.log(
np.random.uniform(low=0, high=1, size=(frag_length,))
).astype(np.float32),
}


class TestImpalaTfLearner(unittest.TestCase):
@classmethod
def setUpClass(cls):
ray.init()

@classmethod
def tearDownClass(cls):
ray.shutdown()

def test_appo_loss(self):
"""Test that appo_policy_rlm loss matches the appo learner loss."""
config = (
appo.APPOConfig()
.environment("CartPole-v1")
.rollouts(
num_rollout_workers=0,
rollout_fragment_length=frag_length,
)
.resources(num_gpus=0)
.training(
gamma=0.99,
model=dict(
fcnet_hiddens=[10, 10],
fcnet_activation="linear",
vf_share_layers=False,
),
)
.rl_module(
_enable_rl_module_api=True,
)
)

for fw in framework_iterator(config, ("tf2")):
trainer = config.build()
policy = trainer.get_policy()

if fw == "tf2":
train_batch = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x), FAKE_BATCH
)
train_batch = SampleBatch(FAKE_BATCH)
policy_loss = policy.loss(policy.model, policy.dist_class, train_batch)

algo_config = config.copy(copy_frozen=False)
algo_config.training(_enable_learner_api=True)
algo_config.validate()
algo_config.freeze()

learner_group_config = algo_config.get_learner_group_config(
SingleAgentRLModuleSpec(
module_class=algo_config.rl_module_spec.module_class,
observation_space=policy.observation_space,
action_space=policy.action_space,
model_config_dict=policy.config["model"],
catalog_class=algo_config.rl_module_spec.catalog_class,
)
)
learner_group_config.num_learner_workers = 0
learner_group = learner_group_config.build()
learner_group.set_weights(trainer.get_weights())
results = learner_group.update(train_batch.as_multi_agent())
learner_group_loss = results[ALL_MODULES]["total_loss"]

check(learner_group_loss, policy_loss)


if __name__ == "__main__":
import pytest
import sys

sys.exit(pytest.main(["-v", __file__]))
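For orientation, here is a minimal sketch, distilled from the test above and not itself part of the diff, of switching APPO onto the new stack. The `_enable_rl_module_api` and `_enable_learner_api` flags are taken verbatim from the test; the `framework("tf2")` call is standard RLlib configuration:

```python
import ray
import ray.rllib.algorithms.appo as appo

ray.init()

# Enable the new stack: RLModule for the model, Learner API for updates.
config = (
    appo.APPOConfig()
    .environment("CartPole-v1")
    .framework("tf2")
    .rollouts(num_rollout_workers=0)
    .rl_module(_enable_rl_module_api=True)
    .training(_enable_learner_api=True)
)

algo = config.build()
print(algo.train())
ray.shutdown()
```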
10 changes: 7 additions & 3 deletions rllib/algorithms/appo/tf/appo_tf_learner.py
@@ -147,9 +147,13 @@ def compute_loss_per_module(
)

# The policy gradients loss.
-        is_ratio = tf.clip_by_value(tf.math.exp(
-            behaviour_actions_logp_time_major - old_actions_logp_time_major
-        ), 0.0, 2.0)
+        is_ratio = tf.clip_by_value(
+            tf.math.exp(
+                behaviour_actions_logp_time_major - old_actions_logp_time_major
+            ),
+            0.0,
+            2.0,
+        )
logp_ratio = is_ratio * tf.math.exp(
target_actions_logp_time_major - behaviour_actions_logp_time_major
)
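Functionally, the hunk is a pure reformatting of the clipped importance-sampling ratio between the behaviour policy and the old target policy, which is then folded into the current/behaviour log-prob ratio used by the surrogate loss. A standalone sketch with hypothetical values (names shortened; the real tensors are time-major):

```python
import tensorflow as tf

# Hypothetical per-step action log-probs standing in for the
# *_logp_time_major tensors in the learner code.
behaviour_logp = tf.constant([-0.5, -1.2])  # behaviour (sampling) policy
old_logp = tf.constant([-0.7, -0.4])        # old target policy
target_logp = tf.constant([-0.6, -1.0])     # current policy being trained

# Importance-sampling ratio pi_behaviour / pi_old, clipped to [0, 2]
# to bound the off-policy correction, as in the hunk above.
is_ratio = tf.clip_by_value(tf.math.exp(behaviour_logp - old_logp), 0.0, 2.0)

# Folded into the current/behaviour ratio for the surrogate objective.
logp_ratio = is_ratio * tf.math.exp(target_logp - behaviour_logp)
```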