PG unify/cleanup tf vs torch and PG functionality test cases (tf + torch). (#6650)

* Unify the code for PGTrainer/Policy w.r.t. tf vs torch.
Add loss function test cases for the PGAgent (confirming equivalence of tf and torch).

* Fix LINT line-len errors.

* Fix LINT errors.

* Fix `tf_pg_policy` imports (formerly: `pg_policy`).

* Rename tf_pg_... to pg_tf_..., following the <alg>_<framework>_... convention,
where ... = policy/loss/agent/trainer.
Retire the `PGAgent` class (use `PGTrainer` instead).

* Move the PG test into the agents/pg/tests directory. All test cases will be
located near the classes they test and then built into the Bazel/Travis test suite.

* Move post_process_advantages into pg.py (from pg_tf_policy.py), because
the function is not tf-specific.

* Fix remaining import errors for agents/pg/...

* Fix circular dependency in pg imports.

* Add pg tests to Jenkins test suite.
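
For reference, a minimal usage sketch of the unified setup (the env name and iteration count below are arbitrary choices for illustration; the `use_pytorch` flag is what switches PGTrainer between PGTFPolicy and PGTorchPolicy):

    import ray
    import ray.rllib.agents.pg as pg

    ray.init()
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0      # Run locally.
    config["use_pytorch"] = True   # False selects the tf policy.
    # The same PGTrainer class serves both frameworks.
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    for _ in range(2):
        result = trainer.train()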
sven1977 authored and ericl committed Jan 3, 2020
1 parent d206445 commit f1b56fa
Showing 21 changed files with 215 additions and 102 deletions.
3 changes: 3 additions & 0 deletions ci/jenkins_tests/run_rllib_tests.sh
@@ -165,6 +165,9 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    --stop '{"training_iteration": 1}' \
    --config '{"num_workers": 2}'

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output python /ray/rllib/agents/pg/tests/test_pg.py

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output /ray/rllib/train.py \
    --env CartPole-v0 \
7 changes: 6 additions & 1 deletion python/ray/tune/trainable.py
@@ -124,8 +124,13 @@ def default_resource_request(cls, config):

    @classmethod
    def resource_help(cls, config):
        """Returns a help string for configuring this trainable's resources."""
        """
        Args:
            config (dict): The Trainer's config dict.
        Returns:
            str: A help string for configuring this trainable's resources.
        """
        return ""

    def current_ip(self):
2 changes: 1 addition & 1 deletion rllib/agents/a3c/a3c.py
@@ -12,7 +12,7 @@
DEFAULT_CONFIG = with_common_config({
    # Size of rollout batch
    "sample_batch_size": 10,
    # Use PyTorch as backend - no LSTM support
    # Use PyTorch as framework - no LSTM support
    "use_pytorch": False,
    # GAE(gamma) parameter
    "lambda": 1.0,
9 changes: 5 additions & 4 deletions rllib/agents/pg/__init__.py
@@ -1,6 +1,7 @@
from ray.rllib.agents.pg.pg import PGTrainer, DEFAULT_CONFIG
from ray.rllib.utils import renamed_agent
from ray.rllib.agents.pg.pg_tf_policy import pg_tf_loss, \
    post_process_advantages
from ray.rllib.agents.pg.pg_torch_policy import pg_torch_loss

PGAgent = renamed_agent(PGTrainer)

__all__ = ["PGAgent", "PGTrainer", "DEFAULT_CONFIG"]
__all__ = ["PGTrainer", "pg_tf_loss", "pg_torch_loss",
"post_process_advantages", "DEFAULT_CONFIG"]
16 changes: 6 additions & 10 deletions rllib/agents/pg/pg.py
@@ -1,28 +1,24 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg_policy import PGTFPolicy
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # No remote workers by default
    # No remote workers by default.
    "num_workers": 0,
    # Learning rate
    # Learning rate.
    "lr": 0.0004,
    # Use PyTorch as backend
    "use_pytorch": False,
    # Use PyTorch as framework?
    "use_pytorch": False
})
# __sphinx_doc_end__
# yapf: enable


def get_policy_class(config):
    if config["use_pytorch"]:
        from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy
        from ray.rllib.agents.pg.pg_torch_policy import PGTorchPolicy
        return PGTorchPolicy
    else:
        return PGTFPolicy
37 changes: 0 additions & 37 deletions rllib/agents/pg/pg_policy.py

This file was deleted.

30 changes: 30 additions & 0 deletions rllib/agents/pg/pg_tf_policy.py
@@ -0,0 +1,30 @@
import ray
from ray.rllib.evaluation.postprocessing import Postprocessing, \
    compute_advantages
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def post_process_advantages(policy, sample_batch, other_agent_batches=None,
                            episode=None):
    """This adds the "advantages" column to the sample train_batch."""
    return compute_advantages(sample_batch, 0.0, policy.config["gamma"],
                              use_gae=False)


def pg_tf_loss(policy, model, dist_class, train_batch):
    """The basic policy gradients loss."""
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    return -tf.reduce_mean(action_dist.logp(train_batch[SampleBatch.ACTIONS])
                           * train_batch[Postprocessing.ADVANTAGES])


PGTFPolicy = build_tf_policy(
    name="PGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
    postprocess_fn=post_process_advantages,
    loss_fn=pg_tf_loss)
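
For reference, `pg_tf_loss` above (and its torch counterpart below) is the plain REINFORCE objective, with A_t the simple discounted return produced by `post_process_advantages` (use_gae=False, last_r=0.0):

    L(\theta) = -\frac{1}{T} \sum_{t=0}^{T-1} \log \pi_\theta(a_t \mid s_t) \, A_t,
    \qquad A_t = \sum_{k=t}^{T-1} \gamma^{k-t} r_k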
35 changes: 35 additions & 0 deletions rllib/agents/pg/pg_torch_policy.py
@@ -0,0 +1,35 @@
import ray
from ray.rllib.agents.pg.pg_tf_policy import post_process_advantages
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_template import build_torch_policy
from ray.rllib.utils.framework import try_import_torch

torch, _ = try_import_torch()


def pg_torch_loss(policy, model, dist_class, train_batch):
    """The basic policy gradients loss."""
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS])
    # Save the error in the policy object.
    # policy.pi_err = -train_batch[Postprocessing.ADVANTAGES].dot(
    #     log_probs.reshape(-1)) / len(log_probs)
    policy.pi_err = -torch.mean(
        log_probs * train_batch[Postprocessing.ADVANTAGES]
    )
    return policy.pi_err


def pg_loss_stats(policy, train_batch):
    """The error is recorded when computing the loss."""
    return {"policy_loss": policy.pi_err.item()}


PGTorchPolicy = build_torch_policy(
    name="PGTorchPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
    loss_fn=pg_torch_loss,
    stats_fn=pg_loss_stats,
    postprocess_fn=post_process_advantages)
Empty file.
108 changes: 108 additions & 0 deletions rllib/agents/pg/tests/test_pg.py
@@ -0,0 +1,108 @@
import numpy as np
import unittest

import ray
import ray.rllib.agents.pg as pg
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.models.torch.torch_action_dist import TorchCategorical
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils import check, fc


class TestPG(unittest.TestCase):

    ray.init()

    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.

        # tf.
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")

        num_iterations = 2
        for i in range(num_iterations):
            trainer.train()

        # Torch.
        config["use_pytorch"] = True
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        for i in range(num_iterations):
            trainer.train()

    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["eager"] = True
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n timesteps.
        train_batch = {
            SampleBatch.CUR_OBS: np.array([
                [0.1, 0.2, 0.3, 0.4],
                [0.5, 0.6, 0.7, 0.8],
                [0.9, 1.0, 1.1, 1.2]
            ]),
            SampleBatch.ACTIONS: np.array([0, 1, 1]),
            SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES: np.array([False, False, True])
        }

        # tf.
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()

        # Post-process (calculate simple (non-GAE) advantages) and attach to
        # train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch = pg.post_process_advantages(policy, train_batch)
        # Check Advantage values.
        check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        results = pg.pg_tf_loss(
            policy, policy.model, dist_class=Categorical,
            train_batch=train_batch
        )

        # Calculate expected results.
        expected_logits = fc(
            fc(
                train_batch[SampleBatch.CUR_OBS],
                vars[0].numpy(), vars[1].numpy()
            ),
            vars[2].numpy(), vars[3].numpy()
        )
        expected_logp = Categorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS]
        )
        expected_loss = -np.mean(
            expected_logp * train_batch[Postprocessing.ADVANTAGES]
        )
        check(results.numpy(), expected_loss, decimals=4)

        # Torch.
        config["use_pytorch"] = True
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        train_batch = policy._lazy_tensor_dict(train_batch)
        results = pg.pg_torch_loss(
            policy, policy.model, dist_class=TorchCategorical,
            train_batch=train_batch
        )
        expected_logits = policy.model._last_output
        expected_logp = TorchCategorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS]
        )
        expected_loss = -np.mean(
            expected_logp.detach().numpy() *
            train_batch[Postprocessing.ADVANTAGES].numpy()
        )
        check(results.detach().numpy(), expected_loss, decimals=4)
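
Besides the Jenkins entry above, the test class can also be driven locally through plain unittest, e.g. (a sketch, assuming the module is importable as `ray.rllib.agents.pg.tests.test_pg` in the installed package):

    import unittest
    from ray.rllib.agents.pg.tests.test_pg import TestPG

    # Runs both PG test cases (compilation + loss math, tf and torch).
    suite = unittest.TestLoader().loadTestsFromTestCase(TestPG)
    unittest.TextTestRunner(verbosity=2).run(suite)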
40 changes: 0 additions & 40 deletions rllib/agents/pg/torch_pg_policy.py

This file was deleted.

12 changes: 10 additions & 2 deletions rllib/evaluation/rollout_worker.py
@@ -666,10 +666,18 @@ def foreach_policy(self, func):

    @DeveloperAPI
    def foreach_trainable_policy(self, func):
        """Apply the given function to each (policy, policy_id) tuple.

        This only applies func to policies in `self.policies_to_train`."""
        """
        Applies the given function to each (policy, policy_id) tuple, which
        can be found in `self.policies_to_train`.

        Args:
            func (callable): A function - taking a Policy and its ID - that is
                called on all Policies within `self.policies_to_train`.

        Returns:
            List[any]: The list of n return values of all
                `func([policy], [ID])`-calls.
        """
        return [
            func(policy, pid) for pid, policy in self.policy_map.items()
            if pid in self.policies_to_train
2 changes: 1 addition & 1 deletion rllib/examples/rock_paper_scissors_multiagent.py
@@ -15,7 +15,7 @@

from ray import tune
from ray.rllib.agents.pg.pg import PGTrainer
from ray.rllib.agents.pg.pg_policy import PGTFPolicy
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.policy.policy import Policy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import try_import_tf
1 change: 0 additions & 1 deletion rllib/optimizers/async_samples_optimizer.py
@@ -26,7 +26,6 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
    This class coordinates the data transfers between the learner thread
    and remote workers (IMPALA actors).
    """

    def __init__(self,
                 workers,
                 train_batch_size=500,
1 change: 0 additions & 1 deletion rllib/optimizers/segment_tree.py
@@ -142,5 +142,4 @@ def __init__(self, capacity):

    def min(self, start=0, end=None):
        """Returns min(arr[start], ..., arr[end])"""

        return super(MinSegmentTree, self).reduce(start, end)
3 changes: 3 additions & 0 deletions rllib/tests/agents/__init__.py
@@ -0,0 +1,3 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
3 changes: 3 additions & 0 deletions rllib/tests/agents/functionality/__init__.py
@@ -0,0 +1,3 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
2 changes: 1 addition & 1 deletion rllib/tests/test_external_multi_agent_env.py
@@ -8,7 +8,7 @@
import unittest

import ray
from ray.rllib.agents.pg.pg_policy import PGTFPolicy
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.evaluation.worker_set import WorkerSet