From 741f81b6719199d2cad2c7099cb6e37c91489f0a Mon Sep 17 00:00:00 2001
From: Alessio
Date: Mon, 27 May 2024 23:36:27 -0700
Subject: [PATCH] PG solution code updated for proper normalization and baselines

---
 solution/04_PG.ipynb | 139 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 104 insertions(+), 35 deletions(-)

diff --git a/solution/04_PG.ipynb b/solution/04_PG.ipynb
index 2c8c714..d23b8a4 100644
--- a/solution/04_PG.ipynb
+++ b/solution/04_PG.ipynb
@@ -70,6 +70,18 @@
     "ACTION_SIZE = 2"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def adjust_reward(state, reward):\n",
+    "    angle = state[2] if len(state.shape) == 1 else state[:, 2]\n",
+    "    position = state[0] if len(state.shape) == 1 else state[:, 0]\n",
+    "    return reward - np.abs(angle) / 0.418 - np.abs(position) / 4.8"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -120,7 +132,8 @@
     "        x = self.fc1(x)\n",
     "        x = F.relu(x)\n",
     "        x = self.fc2(x)\n",
-    "        return F.softmax(x, dim=1)"
+    "        x = F.softmax(x, dim=-1)\n",
+    "        return x"
    ]
   },
   {
@@ -134,16 +147,18 @@
     "        self.policy = PolicyNetwork()\n",
     "        self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)\n",
     "\n",
-    "    def sample_action(self, state: np.array):\n",
-    "        state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)\n",
-    "        probs = self.policy.forward(state).cpu()\n",
+    "    def sample_action(self, state: np.array) -> tuple[np.array, torch.Tensor]:\n",
+    "        state = torch.from_numpy(state).float().to(DEVICE)\n",
+    "        probs = self.policy.forward(state)\n",
     "        cdist = Categorical(probs)\n",
     "        action = cdist.sample()\n",
-    "        return action.item(), cdist.log_prob(action)\n",
+    "        logprob = cdist.log_prob(action)\n",
+    "        return action.cpu().numpy(), logprob\n",
     "\n",
     "    def learn(self, log_probs: list[torch.Tensor], returns: Union[np.float64, np.array]):\n",
     "        returns = torch.tensor(returns, dtype=torch.float64, device=DEVICE)\n",
-    "        policy_loss = -(torch.cat(log_probs) * returns).sum()\n",
+    "        log_probs = torch.stack(log_probs)\n",
+    "        policy_loss = -(log_probs * returns).sum()\n",
     "        self.optimizer.zero_grad()\n",
     "        policy_loss.backward()\n",
     "        self.optimizer.step()\n",
@@ -151,7 +166,8 @@
     "    @torch.no_grad\n",
     "    def act(self, state):\n",
     "        \"\"\"Convenient method for the agent to select an action during simulation.\"\"\"\n",
-    "        return self.sample_action(state)[0]"
+    "        action, _ = self.sample_action(state[np.newaxis, :])\n",
+    "        return action[0]"
    ]
   },
   {
@@ -160,17 +176,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def REINFORCE(env, agent, max_episodes=10_000, max_t=1_000, gamma=1.0):\n",
+    "def REINFORCE(env, agent, max_episodes=10_000, max_t=1_000, gamma=0.9999):\n",
     "    scores = []\n",
     "    for i_episode in range(1, max_episodes + 1):\n",
     "        rewards = []\n",
     "        log_probs = []\n",
     "        state, _ = env.reset()\n",
     "\n",
-    "        for _ in range(max_t):\n",
+    "        for t in range(max_t):\n",
     "            action, log_prob = agent.sample_action(state)\n",
     "            state, reward, terminated, truncated, _ = env.step(action)\n",
-    "            rewards.append(reward)\n",
+    "            rewards.append(adjust_reward(state, reward))\n",
     "            log_probs.append(log_prob)\n",
     "            if terminated or truncated:\n",
     "                break\n",
@@ -181,12 +197,12 @@
     "        agent.learn(log_probs, R)\n",
     "\n",
     "        # Track scores and print statistics.\n",
-    "        scores.append(sum(rewards))\n",
-    "        avg_score = np.mean(scores[-100:])\n",
+    "        scores.append(t)\n",
+    "        avg_duration = np.mean(scores[-100:])\n",
     "        if i_episode % 100 == 0:\n",
-    "            print(f'Episode {i_episode}\\tAverage Score: {avg_score:.2f}')\n",
-    "            if avg_score >= 490.0:  # Solved\n",
-    "                print(f'Environment solved at episode {i_episode}\\tAverage Score: {avg_score:.2f}')\n",
+    "            print(f'Episode {i_episode}\\tAverage duration: {avg_duration:.2f}')\n",
+    "            if avg_duration >= 490.0:  # Solved\n",
+    "                print(f'Environment solved at episode {i_episode}\\tAvg. duration: {avg_duration:.2f}')\n",
     "                break\n",
     "\n",
     "    return scores"
@@ -270,43 +286,48 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def REINFORCE_v2(env, agent, max_episodes=10_000, max_t=1_000, gamma=1.0):\n",
+    "def REINFORCE_v2(env: gym.vector.VectorEnv, agent, max_episodes=10_000, max_t=1_000, gamma=0.9999,\n",
+    "                 with_normalization=True, with_baseline=True):\n",
     "    scores = []\n",
     "    for i_episode in range(1, max_episodes + 1):\n",
-    "        rewards = []\n",
-    "        log_probs = []\n",
+    "        states, rewards, log_probs = ([], [], [])\n",
     "        state, _ = env.reset()\n",
     "\n",
-    "        for _ in range(max_t):\n",
+    "        for t in range(max_t):\n",
     "            action, log_prob = agent.sample_action(state)\n",
     "            log_probs.append(log_prob)\n",
     "            state, reward, terminated, truncated, _ = env.step(action)\n",
-    "            rewards.append(reward)\n",
-    "            if terminated or truncated:\n",
+    "            rewards.append(adjust_reward(state, reward))\n",
+    "            states.append(state)\n",
+    "            if terminated.any() or truncated.any():\n",
     "                break\n",
     "\n",
     "        discounts = np.power(gamma, np.arange(len(rewards)))\n",
-    "        discounted_rewards = discounts * rewards\n",
-    "        future_returns = discounted_rewards[::-1].cumsum()[::-1]\n",
+    "        discounted_rewards = discounts[:, np.newaxis] * rewards\n",
+    "        future_returns = discounted_rewards[::-1].cumsum(axis=0)[::-1]  # (batch, n_bots)\n",
     "\n",
-    "        baseline = np.mean(future_returns)\n",
-    "        future_returns = future_returns - baseline\n",
+    "        if with_baseline:\n",
+    "            # Velocity would be: np.abs(states[t][:, 1] * states[t][:, 3])\n",
+    "            # This one is a weird product between angle and angular velocity...\n",
+    "            baseline = np.asarray([states[t][:, 2] * states[t][:, 3] for t in range(len(rewards))])\n",
+    "            future_returns = future_returns - baseline\n",
     "\n",
-    "        returns_mean = np.mean(future_returns)\n",
-    "        returns_std = np.std(future_returns) + 1.0e-10  # To avoid a zero division\n",
-    "        normalized_returns = (future_returns - returns_mean) / returns_std\n",
+    "        if with_normalization:\n",
+    "            returns_mean = np.mean(future_returns, axis=1)[:, np.newaxis]\n",
+    "            returns_std = np.std(future_returns, axis=1)[:, np.newaxis] + 1.0e-10  # avoid 0 division\n",
+    "            future_returns = (future_returns - returns_mean) / returns_std\n",
     "\n",
     "        # copy() for negative strides :(\n",
     "        # https://discuss.pytorch.org/t/negative-strides-in-tensor-error/134287/2\n",
-    "        agent.learn(log_probs, normalized_returns.copy())\n",
+    "        agent.learn(log_probs, future_returns.copy())\n",
     "\n",
     "        # Track scores and print statistics\n",
-    "        scores.append(sum(rewards))\n",
-    "        avg_score = np.mean(scores[-100:])\n",
+    "        scores.append(t)\n",
+    "        avg_duration = np.mean(scores[-100:])\n",
     "        if i_episode % 100 == 0:\n",
-    "            print(f'Episode {i_episode}\\tAverage Score: {avg_score:.2f}')\n",
-    "            if avg_score >= 490.0:  # Solved\n",
-    "                print(f'Environment solved at episode {i_episode}\\tAverage Score: {avg_score:.2f}')\n",
+    "            print(f'Episode {i_episode}\\tAverage duration: {avg_duration:.2f}')\n",
+    "            if avg_duration >= 490.0:  # Solved\n",
+    "                print(f'Environment solved at episode {i_episode}\\tAvg. duration: {avg_duration:.2f}')\n",
     "                break\n",
     "\n",
     "    return scores"
@@ -318,7 +339,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "with init_random(gym.make('CartPole-v1')) as env:\n",
+    "with init_random(gym.vector.make('CartPole-v1', num_envs=5)) as env:\n",
     "    agent_v2 = Agent()\n",
     "    scores_v2 = REINFORCE_v2(env, agent_v2)\n",
     "plot_scores(scores_v2)"
@@ -371,6 +392,54 @@
     "\n",
     "More details on [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html#id14).\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "init_random()\n",
+    "base, norm, all = ([], [], [])\n",
+    "random_seeds = np.random.randint(3_141_592, size=10)\n",
+    "for seed in random_seeds:\n",
+    "    with init_random(gym.vector.make('CartPole-v1', num_envs=5), seed=int(seed)) as env:\n",
+    "        print('Future rewards only:')\n",
+    "        agent_v3 = Agent()\n",
+    "        scores_v3 = REINFORCE_v2(env, agent_v3, with_normalization=False, with_baseline=False)\n",
+    "        base.append(len(scores_v3))\n",
+    "\n",
+    "    with init_random(gym.vector.make('CartPole-v1', num_envs=5), seed=int(seed)) as env:\n",
+    "        print('Future rewards + normalization:')\n",
+    "        agent_v3_b = Agent()\n",
+    "        scores_v3_b = REINFORCE_v2(env, agent_v3_b, with_normalization=True, with_baseline=False)\n",
+    "        norm.append(len(scores_v3_b))\n",
+    "\n",
+    "    with init_random(gym.vector.make('CartPole-v1', num_envs=5), seed=int(seed)) as env:\n",
+    "        print('Future rewards + normalization + baseline:')\n",
+    "        agent_v3_bn = Agent()\n",
+    "        scores_v3_bn = REINFORCE_v2(env, agent_v3_bn, with_normalization=True, with_baseline=True)\n",
+    "        all.append(len(scores_v3_bn))\n",
+    "    print()\n",
+    "\n",
+    "x = np.arange(len(norm))\n",
+    "plt.figure('Episodes to solve')\n",
+    "plt.plot(x, base, label='Future rewards')\n",
+    "plt.plot(x, norm, 'r', label='Future rewards + normalization')\n",
+    "plt.plot(x, all, 'g', label='Future rewards + normalization + baseline')\n",
+    "plt.ylabel('Episodes to solve')\n",
+    "plt.xlabel('Seed #')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
  ],
  "metadata": {