Allow PPO to turn off advantage normalization (DLR-RM#763)
* Allow PPO to turn off advantage normalization

* update changelog

* Add a test case

* Update test and sanity check

* Fix tests

Co-authored-by: Antonin RAFFIN <[email protected]>
vwxyzjn and araffin authored Feb 22, 2022
1 parent 7ce4bb8 commit d2ebd2e
Showing 3 changed files with 17 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/misc/changelog.rst
@@ -16,6 +16,7 @@ New Features:
^^^^^^^^^^^^^
- Makes the length of keys and values in ``HumanOutputFormat`` configurable,
depending on desired maximum width of output.
- Allow PPO to turn off advantage normalization (see `PR #763 <https://github.com/DLR-RM/stable-baselines3/pull/763>`_) @vwxyzjn

SB3-Contrib
^^^^^^^^^^^
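With this change, advantage normalization can be disabled directly from the PPO constructor. A minimal usage sketch (the environment and hyperparameters below are illustrative choices, not part of the commit):

from stable_baselines3 import PPO

# Disable the per-minibatch advantage normalization made optional by this PR.
# "CartPole-v1" and n_steps=64 are arbitrary illustrative choices.
model = PPO("MlpPolicy", "CartPole-v1", n_steps=64, normalize_advantage=False)
model.learn(total_timesteps=1_000)
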
13 changes: 9 additions & 4 deletions stable_baselines3/ppo/ppo.py
@@ -42,6 +42,7 @@ class PPO(OnPolicyAlgorithm):
This is a parameter specific to the OpenAI implementation. If None is passed (default),
no clipping will be done on the value function.
IMPORTANT: this clipping depends on the reward scaling.
:param normalize_advantage: Whether to normalize the advantage or not
:param ent_coef: Entropy coefficient for the loss calculation
:param vf_coef: Value function coefficient for the loss calculation
:param max_grad_norm: The maximum value for the gradient clipping
@@ -76,6 +77,7 @@ def __init__(
gae_lambda: float = 0.95,
clip_range: Union[float, Schedule] = 0.2,
clip_range_vf: Union[None, float, Schedule] = None,
normalize_advantage: bool = True,
ent_coef: float = 0.0,
vf_coef: float = 0.5,
max_grad_norm: float = 0.5,
@@ -120,9 +122,10 @@ def __init__(

# Sanity check, otherwise it will lead to noisy gradient and NaN
# because of the advantage normalization
assert (
batch_size > 1
), "`batch_size` must be greater than 1. See https://github.com/DLR-RM/stable-baselines3/issues/440"
if normalize_advantage:
assert (
batch_size > 1
), "`batch_size` must be greater than 1. See https://github.com/DLR-RM/stable-baselines3/issues/440"

if self.env is not None:
# Check that `n_steps * n_envs > 1` to avoid NaN
@@ -146,6 +149,7 @@ def __init__(
self.n_epochs = n_epochs
self.clip_range = clip_range
self.clip_range_vf = clip_range_vf
self.normalize_advantage = normalize_advantage
self.target_kl = target_kl

if _init_setup_model:
@@ -200,7 +204,8 @@ def train(self) -> None:
values = values.flatten()
# Normalize advantage
advantages = rollout_data.advantages
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
if self.normalize_advantage:
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

# ratio between old and new policy, should be one at the first iteration
ratio = th.exp(log_prob - rollout_data.old_log_prob)
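For context, the normalization step that the new flag guards is plain standardization, and the `batch_size` > 1 assertion exists because the unbiased standard deviation of a single sample is NaN in PyTorch. A standalone sketch (the helper name `normalize` is illustrative, not part of the library):

import torch as th

def normalize(advantages: th.Tensor) -> th.Tensor:
    # Standardize to zero mean and unit std; the 1e-8 term avoids division by zero.
    return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

print(normalize(th.tensor([1.0, 2.0, 3.0])))  # approximately tensor([-1., 0., 1.])
# With a single sample, the unbiased std is NaN, which would propagate into the loss;
# this is why the assertion above is only needed when normalize_advantage=True.
print(th.tensor([1.0]).std())  # tensor(nan)
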
7 changes: 7 additions & 0 deletions tests/test_run.py
@@ -34,6 +34,13 @@ def test_a2c(env_id):
model.learn(total_timesteps=1000, eval_freq=500)


@pytest.mark.parametrize("model_class", [A2C, PPO])
@pytest.mark.parametrize("normalize_advantage", [False, True])
def test_advantage_normalization(model_class, normalize_advantage):
model = model_class("MlpPolicy", "CartPole-v1", n_steps=64, normalize_advantage=normalize_advantage)
model.learn(64)


@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v1"])
@pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2])
def test_ppo(env_id, clip_range_vf):
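To exercise just the new test from a Python session, something along these lines should work, assuming a checkout of the repository with pytest installed (the exact invocation is a suggestion, not part of the commit):

import pytest

# Select only the new parametrized test; the path is relative to the repository root.
pytest.main(["tests/test_run.py", "-k", "test_advantage_normalization", "-v"])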
