PG solution code updated for proper normalization and baselines
Alessio committed May 28, 2024
1 parent 470e1c6 commit 741f81b
Showing 1 changed file with 104 additions and 35 deletions.
139 changes: 104 additions & 35 deletions solution/04_PG.ipynb
@@ -70,6 +70,18 @@
"ACTION_SIZE = 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def adjust_reward(state, reward):\n",
" angle = state[2] if len(state.shape) == 1 else state[:, 2]\n",
" position = state[0] if len(state.shape) == 1 else state[:, 0]\n",
" return reward - np.abs(angle) / 0.418 - np.abs(position) / 4.8"
]
},
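
A quick sanity check of the shaping term above, assuming the standard CartPole-v1 observation layout (cart position, cart velocity, pole angle, pole angular velocity) and the 4.8 m / 0.418 rad thresholds used in the cell; the snippet simply re-evaluates `adjust_reward` for a single state and for a small batch, so the exact numbers are illustrative only.

```python
import numpy as np

def adjust_reward(state, reward):
    # Same shaping as in the cell above: penalize pole angle and cart position
    # relative to the CartPole-v1 termination thresholds (0.418 rad, 4.8 m).
    angle = state[2] if len(state.shape) == 1 else state[:, 2]
    position = state[0] if len(state.shape) == 1 else state[:, 0]
    return reward - np.abs(angle) / 0.418 - np.abs(position) / 4.8

upright = np.array([0.0, 0.0, 0.0, 0.0])    # centred cart, upright pole
tilted = np.array([2.4, 0.0, 0.209, 0.0])   # halfway to both termination limits

print(adjust_reward(upright, 1.0))   # 1.0: no penalty
print(adjust_reward(tilted, 1.0))    # ~0.0: half a point lost on each term

# Batched states, as returned by a vector env, broadcast the same way.
batch = np.stack([upright, tilted])
print(adjust_reward(batch, np.ones(2)))   # ~[1.0, 0.0]
```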
{
"cell_type": "markdown",
"metadata": {},
@@ -120,7 +132,8 @@
" x = self.fc1(x)\n",
" x = F.relu(x)\n",
" x = self.fc2(x)\n",
" return F.softmax(x, dim=1)"
" x = F.softmax(x, dim=-1)\n",
" return x"
]
},
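
A minimal sketch of why the softmax is taken over `dim=-1` here, assuming a 2-unit output as in `ACTION_SIZE = 2`: the last dimension is the action dimension whether the network sees a single unbatched state or a batch coming from a vector env, whereas `dim=1` only exists in the 2-D case.

```python
import torch
import torch.nn.functional as F

single = torch.tensor([0.2, -1.3])   # logits for one state, shape (2,)
batch = torch.randn(5, 2)            # logits for 5 parallel envs, shape (5, 2)

print(F.softmax(single, dim=-1).sum())       # ~1, normalized over the 2 actions
print(F.softmax(batch, dim=-1).sum(dim=-1))  # ~[1, 1, 1, 1, 1], one distribution per env

# F.softmax(single, dim=1) would raise an IndexError: the unbatched input has no dim 1.
```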
{
@@ -134,24 +147,27 @@
" self.policy = PolicyNetwork()\n",
" self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)\n",
"\n",
" def sample_action(self, state: np.array):\n",
" state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)\n",
" probs = self.policy.forward(state).cpu()\n",
" def sample_action(self, state: np.array) -> tuple[np.array, torch.Tensor]:\n",
" state = torch.from_numpy(state).float().to(DEVICE)\n",
" probs = self.policy.forward(state)\n",
" cdist = Categorical(probs)\n",
" action = cdist.sample()\n",
" return action.item(), cdist.log_prob(action)\n",
" logprob = cdist.log_prob(action)\n",
" return action.cpu().numpy(), logprob\n",
"\n",
" def learn(self, log_probs: list[torch.Tensor], returns: Union[np.float64, np.array]):\n",
" returns = torch.tensor(returns, dtype=torch.float64, device=DEVICE)\n",
" policy_loss = -(torch.cat(log_probs) * returns).sum()\n",
" log_probs = torch.stack(log_probs)\n",
" policy_loss = -(log_probs * returns).sum()\n",
" self.optimizer.zero_grad()\n",
" policy_loss.backward()\n",
" self.optimizer.step()\n",
" \n",
" @torch.no_grad\n",
" def act(self, state):\n",
" \"\"\"Convenient method for the agent to select an action during simulation.\"\"\"\n",
" return self.sample_action(state)[0]"
" action, _ = self.sample_action(state[np.newaxis, :])\n",
" return action[0]"
]
},
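
A standalone sketch of the batched sampling that `sample_action` now performs, assuming 5 parallel envs and 2 actions: `Categorical` accepts a `(5, 2)` probability tensor, samples one action per env, and `log_prob` returns one log-probability per env, which `learn` later stacks into a `(T, 5)` tensor.

```python
import torch
from torch.distributions import Categorical

probs = torch.tensor([[0.9, 0.1]] * 5)   # (5, 2): action probabilities for 5 envs
dist = Categorical(probs)

actions = dist.sample()                  # shape (5,): one action index per env
log_probs = dist.log_prob(actions)       # shape (5,): one log-probability per env
print(actions.shape, log_probs.shape)    # torch.Size([5]) torch.Size([5])

# Stacking T such vectors, as Agent.learn does, gives a (T, 5) tensor that
# broadcasts against (T, 5) returns in the policy-loss product.
print(torch.stack([log_probs, log_probs]).shape)   # torch.Size([2, 5])
```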
{
@@ -160,17 +176,17 @@
"metadata": {},
"outputs": [],
"source": [
"def REINFORCE(env, agent, max_episodes=10_000, max_t=1_000, gamma=1.0):\n",
"def REINFORCE(env, agent, max_episodes=10_000, max_t=1_000, gamma=0.9999):\n",
" scores = []\n",
" for i_episode in range(1, max_episodes + 1):\n",
" rewards = []\n",
" log_probs = []\n",
" state, _ = env.reset()\n",
"\n",
" for _ in range(max_t):\n",
" for t in range(max_t):\n",
" action, log_prob = agent.sample_action(state)\n",
" state, reward, terminated, truncated, _ = env.step(action)\n",
" rewards.append(reward)\n",
" rewards.append(adjust_reward(state, reward))\n",
" log_probs.append(log_prob)\n",
" if terminated or truncated:\n",
" break\n",
@@ -181,12 +197,12 @@
" agent.learn(log_probs, R)\n",
"\n",
" # Track scores and print statistics.\n",
" scores.append(sum(rewards))\n",
" avg_score = np.mean(scores[-100:])\n",
" scores.append(t)\n",
" avg_duration = np.mean(scores[-100:])\n",
" if i_episode % 100 == 0:\n",
" print(f'Episode {i_episode}\\tAverage Score: {avg_score:.2f}')\n",
" if avg_score >= 490.0: # Solved\n",
" print(f'Environment solved at episode {i_episode}\\tAverage Score: {avg_score:.2f}')\n",
" print(f'Episode {i_episode}\\tAverage duration: {avg_duration:.2f}')\n",
" if avg_duration >= 490.0: # Solved\n",
" print(f'Environment solved at episode {i_episode}\\Avg. duration: {avg_duration:.2f}')\n",
" break\n",
"\n",
" return scores"
@@ -270,43 +286,48 @@
"metadata": {},
"outputs": [],
"source": [
"def REINFORCE_v2(env, agent, max_episodes=10_000, max_t=1_000, gamma=1.0):\n",
"def REINFORCE_v2(env: gym.vector.VectorEnv, agent, max_episodes=10_000, max_t=1_000, gamma=0.9999,\n",
" with_normalization=True, with_baseline=True):\n",
" scores = []\n",
" for i_episode in range(1, max_episodes + 1):\n",
" rewards = []\n",
" log_probs = []\n",
" states, rewards, log_probs = ([], [], [])\n",
" state, _ = env.reset()\n",
"\n",
" for _ in range(max_t):\n",
" for t in range(max_t):\n",
" action, log_prob = agent.sample_action(state)\n",
" log_probs.append(log_prob)\n",
" state, reward, terminated, truncated, _ = env.step(action)\n",
" rewards.append(reward)\n",
" if terminated or truncated:\n",
" rewards.append(adjust_reward(state, reward))\n",
" states.append(state)\n",
" if terminated.any() or truncated.any():\n",
" break\n",
"\n",
" discounts = np.power(gamma, np.arange(len(rewards)))\n",
" discounted_rewards = discounts * rewards\n",
" future_returns = discounted_rewards[::-1].cumsum()[::-1]\n",
" discounted_rewards = discounts[:, np.newaxis] * rewards\n",
" future_returns = discounted_rewards[::-1].cumsum(axis=0)[::-1] # (batch, n_bots)\n",
"\n",
" baseline = np.mean(future_returns)\n",
" future_returns = future_returns - baseline\n",
" if with_baseline:\n",
" # Velocity would be: np.abs(states[t][:, 1] * states[t][:, 3])\n",
" # This one is a weird product between angle and angular velocity...\n",
" baseline = np.asarray([states[t][:, 2] * states[t][:, 3] for t in range(len(rewards))])\n",
" future_returns = future_returns - baseline\n",
"\n",
" returns_mean = np.mean(future_returns)\n",
" returns_std = np.std(future_returns) + 1.0e-10 # To avoid a zero division\n",
" normalized_returns = (future_returns - returns_mean) / returns_std\n",
" if with_normalization:\n",
" returns_mean = np.mean(future_returns, axis=1)[:, np.newaxis]\n",
" returns_std = np.std(future_returns, axis=1)[:, np.newaxis] + 1.0e-10 # avoid 0 division\n",
" future_returns = (future_returns - returns_mean) / returns_std\n",
"\n",
" # copy() for negative strides :(\n",
" # https://discuss.pytorch.org/t/negative-strides-in-tensor-error/134287/2\n",
" agent.learn(log_probs, normalized_returns.copy())\n",
" agent.learn(log_probs, future_returns.copy())\n",
"\n",
" # Track scores and print statistics\n",
" scores.append(sum(rewards))\n",
" avg_score = np.mean(scores[-100:])\n",
" scores.append(t)\n",
" avg_duration = np.mean(scores[-100:])\n",
" if i_episode % 100 == 0:\n",
" print(f'Episode {i_episode}\\tAverage Score: {avg_score:.2f}')\n",
" if avg_score >= 490.0: # Solved\n",
" print(f'Environment solved at episode {i_episode}\\tAverage Score: {avg_score:.2f}')\n",
" print(f'Episode {i_episode}\\tAverage duration: {avg_duration:.2f}')\n",
" if avg_duration >= 490.0: # Solved\n",
" print(f'Environment solved at episode {i_episode}\\tAvg. duration: {avg_duration:.2f}')\n",
" break\n",
"\n",
" return scores"
@@ -318,7 +339,7 @@
"metadata": {},
"outputs": [],
"source": [
"with init_random(gym.make('CartPole-v1')) as env:\n",
"with init_random(gym.vector.make('CartPole-v1', num_envs=5)) as env:\n",
" agent_v2 = Agent()\n",
" scores_v2 = REINFORCE_v2(env, agent_v2)\n",
"plot_scores(scores_v2)"
@@ -371,6 +392,54 @@
"\n",
"More details on [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html#id14).\n"
]
},
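
For reference, a sketch (in notation consistent with the Spinning Up material linked above) of the estimator that `REINFORCE_v2` approximates over N sampled trajectories (here, the parallel envs): reward-to-go weights with a state-dependent baseline b(s), which leaves the gradient unbiased because the expected grad-log-prob of any function of the state alone is zero. The hand-crafted angle-times-angular-velocity term in the code plays the role of b(s).

$$
\hat{g} \;=\; \frac{1}{N}\sum_{i=1}^{N}\sum_{t=0}^{T}
\nabla_\theta \log \pi_\theta\!\left(a^{(i)}_t \mid s^{(i)}_t\right)
\left(\hat{R}^{(i)}_t - b\!\left(s^{(i)}_t\right)\right),
\qquad
\hat{R}^{(i)}_t \;=\; \sum_{k \ge t} \gamma^{k}\, r^{(i)}_k .
$$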
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"init_random()\n",
"base, norm, all = ([], [], [])\n",
"random_seeds = np.random.randint(3_141_592, size=10)\n",
"for seed in random_seeds:\n",
" with init_random(gym.vector.make('CartPole-v1', num_envs=5), seed=int(seed)) as env:\n",
" print('Future rewards only:')\n",
" agent_v3 = Agent()\n",
" scores_v3 = REINFORCE_v2(env, agent_v3, with_normalization=False, with_baseline=False)\n",
" base.append(len(scores_v3))\n",
"\n",
" with init_random(gym.vector.make('CartPole-v1', num_envs=5), seed=int(seed)) as env:\n",
" print('Future rewards + normalization:')\n",
" agent_v3_b = Agent()\n",
" scores_v3_b = REINFORCE_v2(env, agent_v3_b, with_normalization=True, with_baseline=False)\n",
" norm.append(len(scores_v3_b))\n",
"\n",
" with init_random(gym.vector.make('CartPole-v1', num_envs=5), seed=int(seed)) as env:\n",
" print('Future rewards + normalization + baseline:') \n",
" agent_v3_bn = Agent()\n",
" scores_v3_bn = REINFORCE_v2(env, agent_v3_bn, with_normalization=True, with_baseline=True)\n",
" all.append(len(scores_v3_bn))\n",
" print()\n",
"\n",
"x = np.arange(len(norm))\n",
"plt.figure('Episode scores')\n",
"plt.plot(x, base, label='Future rewards')\n",
"plt.plot(x, norm, 'r', label='Future rewards + normalization')\n",
"plt.plot(x, all, 'g', label='Future rewards + normalization + baseline')\n",
"plt.ylabel('Score')\n",
"plt.xlabel('Episode #')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
