In [46]:
import gym
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is given, so this also hides deprecation notices from the libraries.
warnings.filterwarnings('ignore')

# Restore the default rc parameters, then apply the seaborn theme with figure
# overrides: 72 dpi on screen, 300 dpi for saved files, automatic layout on.
# set_style / set_context are called after set_theme, so 'ticks' and 'paper'
# are the style and context that end up in effect.
sns.reset_defaults()
sns.set_theme(rc={'figure.dpi': 72, 'savefig.dpi': 300,
 'figure.autolayout': True})
sns.set_style('ticks')
sns.set_context('paper')

# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)


In [47]:
# Module-level FrozenLake environment with slipping disabled; QEval and QTrain
# read this global directly.
env = gym.make("FrozenLake-v1", is_slippery=False)

# Report the size of the (discrete) action and observation spaces.
for label, space in (
    ("Number of actions:", env.action_space),
    ("Number of observations:", env.observation_space),
):
    print(label, space.n)

Number of actions: 4
Number of observations: 16


In [48]:
# Part b

def QEval(episodes: int, Q: np.ndarray = None, environment=None):
    """Play a number of episodes and print the win rate and the win count.

    The win count is the sum of all rewards returned by the environment,
    converted to ``int``.

    Args:
        episodes (int): Number of episodes to play.
        Q (np.ndarray, optional): Q-table indexed as ``Q[state]`` -> action
            values. When given, the greedy action ``argmax(Q[state])`` is
            played at every step; when None, actions are drawn with
            ``action_space.sample()``.
        environment (optional): Environment to evaluate on. Defaults to the
            module-level ``env``. It must follow the 5-tuple ``step`` API
            (observation, reward, terminated, truncated, info) and return
            ``(observation, info)`` from ``reset``.

    Returns:
        None. The results are printed.
    """
    sim = env if environment is None else environment

    win_cnt = 0
    for _ in range(episodes):
        observation = sim.reset()[0]
        done = False
        while not done:
            if Q is None:
                action = sim.action_space.sample()
            else:
                action = np.argmax(Q[observation])
            observation, reward, terminated, truncated, *_ = sim.step(action)
            # An episode also ends when it is truncated (e.g. by a step
            # limit). Without this check a greedy policy that never reaches
            # a terminal state would loop forever.
            done = terminated or truncated
            win_cnt += reward

    win_cnt = int(win_cnt)
    # Avoid a ZeroDivisionError when no episode was requested.
    win_rate = win_cnt / episodes if episodes > 0 else 0.0
    print("Win rate:", round(win_rate, 4))
    print("Number of wins:", win_cnt)


# Baseline: no Q-table is passed, so QEval draws every action with
# env.action_space.sample().
QEval(1000)


Win rate: 0.009
Number of wins: 9


In [49]:
# Part c

def QTrain(
    episodes: int = 100_000,
    alpha: float = 0.01,
    gamma: float = 0.9,
    epsilon: float = 1.0,
    epsilon_decay: float = 0.0001,
    environment=None,
):
    """Tabular Q-learning with an epsilon-greedy behaviour policy.

    After every step the table is updated with
    ``Q[s, a] += alpha * (r + gamma * max(Q[s']) - Q[s, a])``.
    After every episode ``epsilon`` is reduced by ``epsilon_decay`` and
    clipped at 0.

    Args:
        episodes (int): Total number of training episodes.
        alpha (float): Learning rate.
        gamma (float): Discount factor.
        epsilon (float): Initial probability of taking a random action.
        epsilon_decay (float): Fixed amount subtracted from ``epsilon`` after
            each episode.
        environment (optional): Environment to train on. Defaults to the
            module-level ``env``. It must expose ``observation_space.n`` and
            ``action_space.n``, return ``(observation, info)`` from ``reset``
            and follow the 5-tuple ``step`` API.

    Returns:
        np.ndarray: Q-table of shape (number of observations, number of actions).
    """
    sim = env if environment is None else environment
    Q = np.zeros((sim.observation_space.n, sim.action_space.n))

    for _ in range(episodes):
        state = sim.reset()[0]
        done = False

        # Run the episode until the environment reports a terminal state or
        # truncates it (e.g. a step limit).
        while not done:
            if np.random.random() < epsilon:
                # Exploration: take a random action.
                action = sim.action_space.sample()
            else:
                # Exploitation: take the highest-valued action in this state.
                action = np.argmax(Q[state])

            new_state, reward, terminated, truncated, *_ = sim.step(action)

            # Temporal-difference update towards reward + discounted best
            # value of the next state.
            td_target = reward + gamma * np.max(Q[new_state])
            Q[state, action] += alpha * (td_target - Q[state, action])

            state = new_state
            # Honour truncation as well: once epsilon has decayed to 0, a
            # greedy policy that never reaches a terminal state would
            # otherwise keep this loop running forever.
            done = terminated or truncated

        epsilon = max(epsilon - epsilon_decay, 0)

    return Q


In [50]:
# Part d

# Seed both sources of randomness used during training (np.random.random for
# the epsilon test, env.action_space.sample for exploration) so that the
# printed Q-table can be reproduced on a fresh run.
SEED = 42
np.random.seed(SEED)
env.action_space.seed(SEED)

qtable = QTrain()
print('Q-table:')
print(qtable)

Q-table:
[[0.5303 0.5905 0.5453 0.5295]
 [0.3349 0. 0.6333 0.2903]
 [0.2797 0.7225 0.141 0.3965]
 [0.3313 0. 0.0453 0.0395]
 [0.5885 0.6561 0. 0.5298]
 [0. 0. 0. 0. ]
 [0. 0.8096 0. 0.3792]
 [0. 0. 0. 0. ]
 [0.6513 0. 0.729 0.5857]
 [0.6437 0.7935 0.81 0. ]
 [0.7153 0.9 0. 0.7094]
 [0. 0. 0. 0. ]
 [0. 0. 0. 0. ]
 [0. 0.5566 0.8995 0.446 ]
 [0.7902 0.891 1. 0.7967]
 [0. 0. 0. 0. ]]


In [51]:
# Part e

# Evaluate the trained table: with a Q-table supplied, QEval plays
# argmax(Q[state]) at every step instead of sampling actions.
QEval(1000, qtable)

Win rate: 1.0
Number of wins: 1000


In [52]:
# Part f

def save_frames(frames, file_path):
    """Save a sequence of image frames as an animation and show it inline.

    Args:
        frames: Non-empty sequence of image arrays accepted by ``plt.imshow``.
            The figure is sized from the first frame (rows x columns at
            72 pixels per inch).
        file_path: Destination path of the animation, written with
            matplotlib's 'pillow' writer.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    # Validate first so that an empty recording fails with a clear message
    # instead of an IndexError on frames[0].
    if len(frames) == 0:
        raise ValueError("frames must contain at least one image")

    from matplotlib import animation
    from IPython.display import HTML, display

    pixels_per_inch = 72
    first = frames[0]
    width = first.shape[1] / pixels_per_inch
    height = first.shape[0] / pixels_per_inch
    fig = plt.figure(figsize=(width, height))

    patch = plt.imshow(first)
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(
        fig, animate, frames=len(frames), interval=300)
    anim.save(file_path, writer='pillow')
    plt.close(fig)

    # A random query string makes the browser reload the file instead of
    # showing a cached copy from a previous run.
    rnd = np.random.randint(0, 2_000_000_000)
    display(HTML(f'<img src="{file_path}?{rnd}">'))


# QEval reads the module-level `env`, so rebinding it here is what makes the
# rollout below run on the frame-recording environment.
env = gym.make(
    "FrozenLake-v1",
    is_slippery=False,
    render_mode="rgb_array_list",
)

# Play one greedy episode, then fetch the frames and save them as a GIF.
QEval(1, qtable)
episode_frames = env.render()
save_frames(episode_frames, 'P4_f.gif')


Win rate: 1.0
Number of wins: 1
