[RLlib] Learner API: Policies using RLModules (for sampler only) do not need loss/stats/mixins. #34445

Merged

Changes from 1 commit. All 41 commits in this PR:
47c003c  wip (sven1977, Apr 15, 2023)
cc25d0c  wip (sven1977, Apr 15, 2023)
e6f2d3b  merge (sven1977, May 4, 2023)
0248ce8  wip (sven1977, May 4, 2023)
04595b3  wip (sven1977, May 4, 2023)
bf78988  wip (sven1977, May 4, 2023)
ceb8caa  wip (sven1977, May 4, 2023)
cc632cb  Merge branch 'master' of https://github.com/ray-project/ray into lear… (sven1977, May 5, 2023)
904bfda  merge (sven1977, May 5, 2023)
b36304d  wip (sven1977, May 5, 2023)
a112d5b  wip (sven1977, May 5, 2023)
4b600ef  wip (sven1977, May 6, 2023)
6dba48b  wip (sven1977, May 6, 2023)
95fd464  wip (sven1977, May 6, 2023)
6e8ab3f  wip (sven1977, May 6, 2023)
b224d9f  wip (sven1977, May 6, 2023)
26bdf8d  fix (sven1977, May 6, 2023)
07409be  fix (sven1977, May 6, 2023)
74f159f  wip (sven1977, May 6, 2023)
70e6127  fix (sven1977, May 6, 2023)
e4a58ee  fix (sven1977, May 6, 2023)
c40cb8d  fix (sven1977, May 6, 2023)
a2bc97a  fix (sven1977, May 6, 2023)
81bcbd5  fix (sven1977, May 6, 2023)
499d818  wip (sven1977, May 6, 2023)
fd61ef6  LINT (sven1977, May 6, 2023)
a53b043  fix (sven1977, May 6, 2023)
f175a39  LINT (sven1977, May 6, 2023)
3e95159  fix (sven1977, May 6, 2023)
0ad186f  fix (sven1977, May 6, 2023)
3a85eae  wip (sven1977, May 6, 2023)
1380cb6  Merge remote-tracking branch 'origin/learner_rlm_policies_simplificat… (sven1977, May 6, 2023)
e9ad050  merge (sven1977, May 6, 2023)
05103ba  fix (sven1977, May 6, 2023)
f0c1145  LINT (sven1977, May 6, 2023)
90ee0ab  Add new Scheduler API. (sven1977, May 7, 2023)
a628043  Merge branch 'master' of https://github.com/ray-project/ray into lear… (sven1977, May 7, 2023)
6e9b0cd  wip (sven1977, May 7, 2023)
7f85d0a  LINT (sven1977, May 7, 2023)
1bb3ec2  fix (sven1977, May 7, 2023)
21486d7  Merge branch 'master' of https://github.com/ray-project/ray into lear… (sven1977, May 8, 2023)
Viewing commit 499d818e989070f67d1a15bbb060a7cc7d956ec4 ("wip")
sven1977 committed May 6, 2023
Signed-off-by: sven1977 <[email protected]>
rllib/algorithms/impala/tests/test_impala_learner.py (5 changes: 3 additions & 2 deletions)

@@ -1,6 +1,7 @@
 import unittest
 
 import numpy as np
+import tree  # pip install dm_tree
 
 import ray
 from ray.rllib.algorithms.impala import ImpalaConfig

@@ -83,9 +84,9 @@ def test_impala_loss(self):
             policy = algo.get_policy()
 
             if fw == "tf2":
-                train_batch = tf.nest.map_structure(
+                train_batch = SampleBatch(tree.map_structure(
                     lambda x: tf.convert_to_tensor(x), FAKE_BATCH
-                )
+                ))
             elif fw == "torch":
                 train_batch = convert_to_torch_tensor(SampleBatch(FAKE_BATCH))

Review comment on the tf2 branch (Contributor): "wait, is this necessary? why did this test work before when SampleBatch was in numpy fmt?"
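
For context on the review question above: the updated test feeds the loss a SampleBatch of framework tensors rather than a plain numpy-backed batch. A minimal, self-contained sketch of what the changed tf2 lines do; the FAKE_BATCH contents below are illustrative stand-ins for the test's actual fixture:

import numpy as np
import tensorflow as tf
import tree  # pip install dm_tree

from ray.rllib.policy.sample_batch import SampleBatch

# Stand-in for the test's FAKE_BATCH fixture: a flat dict of numpy arrays.
FAKE_BATCH = {
    "obs": np.random.random((2, 4)).astype(np.float32),
    "actions": np.array([0, 1]),
    "rewards": np.array([1.0, -1.0], dtype=np.float32),
}

# tree.map_structure applies the lambda to every leaf of the (possibly
# nested) dict, turning each numpy array into a tf.Tensor ...
tensor_dict = tree.map_structure(lambda x: tf.convert_to_tensor(x), FAKE_BATCH)

# ... and wrapping the result in a SampleBatch restores the container type
# that the rest of the test passes around.
train_batch = SampleBatch(tensor_dict)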
rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py (21 changes: 10 additions & 11 deletions)

@@ -167,14 +167,14 @@ def test_ppo_exploration_setup(self):
             config, frameworks=("torch", "tf2"), with_eager_tracing=True
         ):
             # Default Agent should be setup with StochasticSampling.
-            trainer = config.build()
+            algo = config.build()
             # explore=False, always expect the same (deterministic) action.
-            a_ = trainer.compute_single_action(
+            a_ = algo.compute_single_action(
                 obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0)
             )
 
             for _ in range(50):
-                a = trainer.compute_single_action(
+                a = algo.compute_single_action(
                     obs,
                     explore=False,
                     prev_action=np.array(2),

@@ -186,12 +186,12 @@ def test_ppo_exploration_setup(self):
             actions = []
             for _ in range(300):
                 actions.append(
-                    trainer.compute_single_action(
+                    algo.compute_single_action(
                         obs, prev_action=np.array(2), prev_reward=np.array(1.0)
                     )
                 )
             check(np.mean(actions), 1.5, atol=0.2)
-            trainer.stop()
+            algo.stop()
 
     def test_ppo_free_log_std_with_rl_modules(self):
         """Tests the free log std option works."""

@@ -217,8 +217,8 @@ def test_ppo_free_log_std_with_rl_modules(self):
         )
 
         for fw in framework_iterator(config, frameworks=("torch", "tf2")):
-            trainer = config.build()
-            policy = trainer.get_policy()
+            algo = config.build()
+            policy = algo.get_policy()
 
             # Check the free log std var is created.
             if fw == "torch":

@@ -245,14 +245,13 @@ def get_value(fw=fw, policy=policy, log_std_var=log_std_var):
             init_std = get_value()
             assert init_std == 0.0, init_std
             batch = compute_gae_for_sample_batch(policy, PENDULUM_FAKE_BATCH.copy())
-            if fw == "torch":
-                batch = policy._lazy_tensor_dict(batch)
-            policy.learn_on_batch(batch)
+            batch = policy._lazy_tensor_dict(batch)
+            algo.learner_group.update(batch)
 
             # Check the variable is updated.
             post_std = get_value()
             assert post_std != 0.0, post_std
-            trainer.stop()
+            algo.stop()
 
 
 if __name__ == "__main__":
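
Beyond the trainer-to-algo rename, the substantive change in this file is the last hunk: the gradient update no longer goes through the Policy. A condensed before/after sketch, assuming `algo`, `policy`, and the fixtures `PENDULUM_FAKE_BATCH` and `compute_gae_for_sample_batch` are set up exactly as in the test above:

from ray.rllib.evaluation.postprocessing import compute_gae_for_sample_batch

# `algo` and `policy` are built from the test's PPO config;
# PENDULUM_FAKE_BATCH is the test's fake-rollout fixture.
batch = compute_gae_for_sample_batch(policy, PENDULUM_FAKE_BATCH.copy())

# Before this PR: the Policy owned loss/stats/mixins, so updates went
# through the Policy itself (with a torch-only tensor conversion):
#
#     if fw == "torch":
#         batch = policy._lazy_tensor_dict(batch)
#     policy.learn_on_batch(batch)

# After this PR: the Policy only serves the sampler; updates are routed
# through the LearnerGroup for both frameworks:
batch = policy._lazy_tensor_dict(batch)
algo.learner_group.update(batch)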
rllib/models/tests/test_preprocessors.py (15 changes: 9 additions & 6 deletions)

@@ -39,6 +39,7 @@ def tearDownClass(cls) -> None:
     def test_rlms_and_preprocessing(self):
         config = (
             ppo.PPOConfig()
+            .framework("tf2")
             .environment(
                 env="ray.rllib.examples.env.random_env.RandomEnv",
                 env_config={

@@ -48,16 +49,18 @@ def test_rlms_and_preprocessing(self):
                 },
             )
             # Run this very quickly locally.
-            .rollouts(rollout_fragment_length=10)
-            .rollouts(num_rollout_workers=0)
-            .training(train_batch_size=10, sgd_minibatch_size=1, num_sgd_iter=1)
+            .rollouts(num_rollout_workers=0, rollout_fragment_length=10)
+            .training(
+                train_batch_size=10,
+                sgd_minibatch_size=1,
+                num_sgd_iter=1,
+                _enable_learner_api=True,
+            )
+            .rl_module(_enable_rl_module_api=True)
             # Set this to True to enforce no preprocessors being used.
             .experimental(_disable_preprocessor_api=True)
-            .framework("tf2")
         )
 
-        config.rl_module(_enable_rl_module_api=True).training(_enable_learner_api=True)
-
         for _ in framework_iterator(config, frameworks=("torch", "tf2")):
            algo = config.build()
            results = algo.train()
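
For readers who want to try the updated config pattern outside the test harness, here is a self-contained sketch. CartPole-v1 stands in for the test's RandomEnv; the two API-enabling flags are exactly the ones this commit folds into the fluent builder chain:

import ray
from ray.rllib.algorithms.ppo import PPOConfig

ray.init()

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .framework("tf2")
    # Run this very quickly locally.
    .rollouts(num_rollout_workers=0, rollout_fragment_length=10)
    .training(
        train_batch_size=10,
        sgd_minibatch_size=1,
        num_sgd_iter=1,
        _enable_learner_api=True,
    )
    .rl_module(_enable_rl_module_api=True)
    # Enforce that no preprocessors are used.
    .experimental(_disable_preprocessor_api=True)
)

algo = config.build()
results = algo.train()
algo.stop()
ray.shutdown()

The test itself additionally loops over frameworks via framework_iterator and calls algo.train() once per framework.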