Commit

update algorithm & function's name
wisnunugroho21 committed Oct 26, 2020
1 parent 3f51f38 commit 54fff8d
Showing 10 changed files with 165 additions and 102 deletions.
4 changes: 2 additions & 2 deletions PPO/pytorch/ppo_pong_pytorch.py
@@ -78,7 +78,7 @@ def save_eps(self, state, action, reward, done, next_state):
self.dones.append(done)
self.next_states.append(next_state)

def clearMemory(self):
def clear_memory(self):
del self.actions[:]
del self.states[:]
del self.rewards[:]
@@ -250,7 +250,7 @@ def update_ppo(self):
self.training_ppo(states.float().to(device), actions.float().to(device), rewards.float().to(device), dones.float().to(device), next_states.float().to(device))

# Clear the memory
self.memory.clearMemory()
self.memory.clear_memory()

# Copy new weights into old policy:
self.actor_old.load_state_dict(self.actor.state_dict())
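Note: the two hunks in this file are just the camelCase-to-snake_case rename of clearMemory (definition and call site). For context, a minimal sketch of the on-policy buffer they touch, reconstructed from the lines shown here; the list names match the diff, everything else is illustrative:

    class Memory:
        def __init__(self):
            # Plain Python lists holding one rollout's transitions
            self.states, self.actions, self.rewards = [], [], []
            self.dones, self.next_states = [], []

        def save_eps(self, state, action, reward, done, next_state):
            self.states.append(state)
            self.actions.append(action)
            self.rewards.append(reward)
            self.dones.append(done)
            self.next_states.append(next_state)

        def clear_memory(self):
            # Called after every PPO update so the next rollout starts empty
            del self.states[:]
            del self.actions[:]
            del self.rewards[:]
            del self.dones[:]
            del self.next_states[:]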
49 changes: 32 additions & 17 deletions PPO/pytorch/ppo_pytorch.py
@@ -1,4 +1,5 @@
import gym
import slimevolleygym
from gym.envs.registration import register

import torch
@@ -31,9 +32,9 @@ def __init__(self, state_dim, action_dim):
super(Actor_Model, self).__init__()

self.nn_layer = nn.Sequential(
nn.Linear(state_dim, 64),
nn.Linear(state_dim, 256),
nn.ReLU(),
nn.Linear(64, 64),
nn.Linear(256, 64),
nn.ReLU(),
nn.Linear(64, action_dim),
nn.Softmax(-1)
@@ -47,9 +48,9 @@ def __init__(self, state_dim, action_dim):
super(Critic_Model, self).__init__()

self.nn_layer = nn.Sequential(
nn.Linear(state_dim, 64),
nn.Linear(state_dim, 256),
nn.ReLU(),
nn.Linear(64, 64),
nn.Linear(256, 64),
nn.ReLU(),
nn.Linear(64, 1)
).float().to(device)
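Note: both hunks above widen the first hidden layer from 64 to 256 units. Read together with the unchanged context lines, the actor head after this commit looks roughly like the sketch below (the critic is identical except that it ends in a single linear output instead of a softmax); treat it as a reconstruction under those assumptions, not the verbatim file:

    import torch.nn as nn

    class Actor_Model(nn.Module):
        def __init__(self, state_dim, action_dim):
            super(Actor_Model, self).__init__()
            self.nn_layer = nn.Sequential(
                nn.Linear(state_dim, 256),  # widened input layer (was 64)
                nn.ReLU(),
                nn.Linear(256, 64),         # funnels back down to 64 units
                nn.ReLU(),
                nn.Linear(64, action_dim),
                nn.Softmax(-1)              # probabilities over discrete actions
            )

        def forward(self, states):
            return self.nn_layer(states)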
@@ -78,7 +79,7 @@ def save_eps(self, state, action, reward, done, next_state):
self.dones.append(done)
self.next_states.append(next_state)

def clearMemory(self):
def clear_memory(self):
del self.actions[:]
del self.states[:]
del self.rewards[:]
@@ -250,7 +251,7 @@ def update_ppo(self):
self.training_ppo(states.float().to(device), actions.float().to(device), rewards.float().to(device), dones.float().to(device), next_states.float().to(device))

# Clear the memory
self.memory.clearMemory()
self.memory.clear_memory()

# Copy new weights into old policy:
self.actor_old.load_state_dict(self.actor.state_dict())
@@ -260,19 +261,19 @@ def save_weights(self):
torch.save({
'model_state_dict': self.actor.state_dict(),
'optimizer_state_dict': self.actor_optimizer.state_dict()
}, '/test/My Drive/Bipedal4/actor.tar')
}, 'SlimeVolley/actor.tar')

torch.save({
'model_state_dict': self.critic.state_dict(),
'optimizer_state_dict': self.critic_optimizer.state_dict()
}, '/test/My Drive/Bipedal4/critic.tar')
}, 'SlimeVolley/critic.tar')

def load_weights(self):
actor_checkpoint = torch.load('/test/My Drive/Bipedal4/actor.tar')
actor_checkpoint = torch.load('SlimeVolley/actor.tar')
self.actor.load_state_dict(actor_checkpoint['model_state_dict'])
self.actor_optimizer.load_state_dict(actor_checkpoint['optimizer_state_dict'])

critic_checkpoint = torch.load('/test/My Drive/Bipedal4/critic.tar')
critic_checkpoint = torch.load('SlimeVolley/critic.tar')
self.critic.load_state_dict(critic_checkpoint['model_state_dict'])
self.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])
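Note: the checkpoint paths move from an absolute Google Drive folder to the relative SlimeVolley/ directory. torch.save does not create missing directories, so a fresh clone will hit a FileNotFoundError unless that folder already exists; a small guard along these lines avoids it (save_checkpoint is a hypothetical helper, not part of this commit):

    import os
    import torch

    def save_checkpoint(model, optimizer, path):
        # Make sure the target directory exists before torch.save writes the file
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }, path)

    # e.g. save_checkpoint(agent.actor, agent.actor_optimizer, 'SlimeVolley/actor.tar')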

@@ -299,7 +300,21 @@ def run_episode(env, agent, state_dim, render, training_mode, t_updates, n_updat

while not done:
action = int(agent.act(state))
next_state, reward, done, _ = env.step(action)

if action == 0:
action_gym = [0, 0, 0] # NOOP
elif action == 1:
action_gym = [1, 0, 0] # LEFT (forward)
elif action == 2:
action_gym = [0, 1, 0] # RIGHT (backward)
elif action == 3:
action_gym = [0, 0, 1] # UP (jump)
elif action == 4:
action_gym = [1, 0, 1] # UPLEFT (forward jump)
elif action == 5:
action_gym = [0, 1, 1] # UPRIGHT (backward jump)

next_state, reward, done, _ = env.step(action_gym)

eps_time += 1
t_updates += 1
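Note: slimevolleygym takes a three-element binary action (forward, backward, jump), while the policy outputs one of six discrete indices, so the new branch above translates each index into a 3-bit action. A lookup table is an equivalent, more compact alternative; the sketch below mirrors the mapping in this hunk (the table itself is illustrative, not part of the commit):

    # Discrete index -> [forward, backward, jump], matching the branches above
    ACTION_TABLE = [
        [0, 0, 0],  # 0: NOOP
        [1, 0, 0],  # 1: forward
        [0, 1, 0],  # 2: backward
        [0, 0, 1],  # 3: jump
        [1, 0, 1],  # 4: forward + jump
        [0, 1, 1],  # 5: backward + jump
    ]

    action_gym = ACTION_TABLE[action]
    next_state, reward, done, _ = env.step(action_gym)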
@@ -323,13 +338,13 @@ def main():

def main():
############## Hyperparameters ##############
load_weights = False # If you want to load the agent, set this to True
save_weights = False # If you want to save the agent, set this to True
load_weights = True # If you want to load the agent, set this to True
save_weights = True # If you want to save the agent, set this to True
training_mode = True # If you want to train the agent, set this to True. Set it to False if you only want to test the agent
reward_threshold = 300 # Reward threshold. Learning stops once the reward passes this value. Set to None to turn this off
using_google_drive = False

render = False # If you want to display the image, set this to True. Turn this off if you run this in Google Colab
render = True # If you want to display the image, set this to True. Turn this off if you run this in Google Colab
n_update = 128 # How many episodes before you update the policy. 128 is recommended for discrete action spaces
n_plot_batch = 100000000 # How many episodes between plotting results
n_episode = 100000 # How many episodes you want to run
@@ -347,11 +362,11 @@ def main():
lam = 0.95 # Just set to 0.95
learning_rate = 2.5e-4 # Just set to 2.5e-4
#############################################
env_name = 'Env Name' # Set the env you want
env_name = 'SlimeVolley-v0' # Set the env you want
env = gym.make(env_name)

state_dim = env.observation_space.n
action_dim = env.action_space.n
state_dim = env.observation_space.shape[0]
action_dim = 6 # env.action_space.n

agent = Agent(state_dim, action_dim, training_mode, policy_kl_range, policy_params, value_clip, entropy_coef, vf_loss_coef,
minibatch, PPO_epochs, gamma, lam, learning_rate)
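Note: the environment switch also changes how the dimensions are read. The previous code assumed a Discrete observation space (observation_space.n); SlimeVolley returns a flat continuous state vector, so state_dim now comes from observation_space.shape[0], and action_dim is hard-coded to the six combinations handled in run_episode. A minimal smoke test, assuming slimevolleygym is installed and registers SlimeVolley-v0 on import:

    import gym
    import slimevolleygym  # registering SlimeVolley-v0 is a side effect of the import

    env = gym.make('SlimeVolley-v0')
    print(env.observation_space.shape)  # a flat Box, e.g. (12,) in the standard env
    print(env.action_space)             # MultiBinary(3): forward, backward, jump

    state_dim = env.observation_space.shape[0]
    action_dim = 6  # discrete combinations mapped to the 3-bit action in run_episode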
4 changes: 2 additions & 2 deletions PPO/tensorflow 2/ppo_pong_tensorflow.py
@@ -72,7 +72,7 @@ def save_eps(self, state, action, reward, done, next_state):
self.dones.append(done)
self.next_states.append(next_state)

def clearMemory(self):
def clear_memory(self):
del self.actions[:]
del self.states[:]
del self.rewards[:]
@@ -233,7 +233,7 @@ def update_ppo(self):
self.training_ppo(states, actions, rewards, dones, next_states)

# Clear the memory
self.memory.clearMemory()
self.memory.clear_memory()

# Copy new weights into old policy:
self.actor_old.set_weights(self.actor.get_weights())
4 changes: 2 additions & 2 deletions PPO/tensorflow 2/ppo_tensorflow.py
@@ -72,7 +72,7 @@ def save_eps(self, state, action, reward, done, next_state):
self.dones.append(done)
self.next_states.append(next_state)

def clearMemory(self):
def clear_memory(self):
del self.actions[:]
del self.states[:]
del self.rewards[:]
@@ -233,7 +233,7 @@ def update_ppo(self):
self.training_ppo(states, actions, rewards, dones, next_states)

# Clear the memory
self.memory.clearMemory()
self.memory.clear_memory()

# Copy new weights into old policy:
self.actor_old.set_weights(self.actor.get_weights())
55 changes: 34 additions & 21 deletions PPO_RND/pytorch/ppo_rnd_frozen_notslippery_pytorch.py
@@ -7,6 +7,7 @@
from torch.distributions.kl import kl_divergence
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
import numpy as np
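Note: the new tensorflow.keras.utils.to_categorical import in an otherwise PyTorch file is presumably there to one-hot encode FrozenLake's discrete state index before it is fed to the networks (the call site falls outside the hunks shown). A quick illustration of what it produces:

    from tensorflow.keras.utils import to_categorical

    state = 3                                        # discrete state index
    one_hot = to_categorical(state, num_classes=16)  # e.g. 16 states on the 4x4 map
    print(one_hot)  # [0. 0. 0. 1. 0. ... 0.], a length-16 float32 vector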
@@ -26,17 +27,17 @@ def prepro(self, I):
X = I.astype(np.float32).ravel() # Flatten the observation into a single float32 array
return X

def countNewMean(self, prevMean, prevLen, newData):
def count_new_mean(self, prevMean, prevLen, newData):
return ((prevMean * prevLen) + newData.sum(0)) / (prevLen + newData.shape[0])

def countNewStd(self, prevStd, prevLen, newData):
def count_new_std(self, prevStd, prevLen, newData):
return (((prevStd.pow(2) * prevLen) + (newData.var(0) * newData.shape[0])) / (prevLen + newData.shape[0])).sqrt()

def normalize(self, data, mean = None, std = None, clip = None):
if isinstance(mean, torch.Tensor) and isinstance(std, torch.Tensor):
data_normalized = (data - mean) / (std + 1e-8)
else:
data_normalized = (data - torch.mean(data)) / (torch.std(data) + 1e-8)
data_normalized = (data - data.mean()) / (data.std() + 1e-8)

if clip:
data_normalized = torch.clamp(data_normalized, -1 * clip, clip)
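Note: count_new_mean and count_new_std keep running observation statistics by folding each new batch into the previous mean and std using the running element count; they feed update_obs_normalization_param and update_rwd_normalization_param below. A self-contained usage sketch of the same update rule (tensor shapes are illustrative):

    import torch

    def count_new_mean(prev_mean, prev_len, new_data):
        # Weighted average of the old mean and the new batch sum
        return ((prev_mean * prev_len) + new_data.sum(0)) / (prev_len + new_data.shape[0])

    def count_new_std(prev_std, prev_len, new_data):
        # Pools the old and new variances by element count
        return (((prev_std.pow(2) * prev_len) + (new_data.var(0) * new_data.shape[0]))
                / (prev_len + new_data.shape[0])).sqrt()

    prev_mean, prev_std, prev_len = torch.zeros(4), torch.ones(4), 100
    batch = torch.randn(32, 4)  # 32 new observations with 4 features each
    mean = count_new_mean(prev_mean, prev_len, batch)
    std = count_new_std(prev_std, prev_len, batch)

The pooled-variance form ignores the shift between the old and new batch means, a common simplification when the batches are reasonably large.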
@@ -105,6 +106,9 @@ def __len__(self):
def __getitem__(self, idx):
return np.array(self.observations[idx], dtype = np.float32)

def get_all(self):
return torch.FloatTensor(self.observations)

def save_eps(self, obs):
self.observations.append(obs)

@@ -117,7 +121,7 @@ def save_rewards_normalize_parameter(self, std_in_rewards, total_number_rwd):
self.std_in_rewards = std_in_rewards
self.total_number_rwd = total_number_rwd

def clearMemory(self):
def clear_memory(self):
del self.observations[:]

class Memory(Dataset):
@@ -133,15 +137,15 @@ def __len__(self):

def __getitem__(self, idx):
return np.array(self.states[idx], dtype = np.float32), np.array(self.actions[idx], dtype = np.float32), np.array([self.rewards[idx]], dtype = np.float32), np.array([self.dones[idx]], dtype = np.float32), np.array(self.next_states[idx], dtype = np.float32)

def save_eps(self, state, action, reward, done, next_state):
self.rewards.append(reward)
self.states.append(state)
self.actions.append(action)
self.dones.append(done)
self.next_states.append(next_state)

def clearMemory(self):
def clear_memory(self):
del self.actions[:]
del self.states[:]
del self.rewards[:]
@@ -253,17 +257,17 @@ def save_eps(self, state, action, reward, done, next_state):
def save_observation(self, obs):
self.obs_memory.save_eps(obs)

def updateObsNormalizationParam(self, obs):
def update_obs_normalization_param(self, obs):
obs = torch.FloatTensor(obs).to(device).detach()

mean_obs = self.utils.countNewMean(self.obs_memory.mean_obs, self.obs_memory.total_number_obs, obs)
std_obs = self.utils.countNewStd(self.obs_memory.std_obs, self.obs_memory.total_number_obs, obs)
mean_obs = self.utils.count_new_mean(self.obs_memory.mean_obs, self.obs_memory.total_number_obs, obs)
std_obs = self.utils.count_new_std(self.obs_memory.std_obs, self.obs_memory.total_number_obs, obs)
total_number_obs = len(obs) + self.obs_memory.total_number_obs

self.obs_memory.save_observation_normalize_parameter(mean_obs, std_obs, total_number_obs)

def updateRwdNormalizationParam(self, in_rewards):
std_in_rewards = self.utils.countNewStd(self.obs_memory.std_in_rewards, self.obs_memory.total_number_rwd, in_rewards)
def update_rwd_normalization_param(self, in_rewards):
std_in_rewards = self.utils.count_new_std(self.obs_memory.std_in_rewards, self.obs_memory.total_number_rwd, in_rewards)
total_number_rwd = len(in_rewards) + self.obs_memory.total_number_rwd

self.obs_memory.save_rewards_normalize_parameter(std_in_rewards, total_number_rwd)
@@ -287,13 +291,13 @@ def get_PPO_loss(self, action_probs, ex_values, old_action_probs, old_ex_values,
# Getting the external generalized advantage estimates
External_Advantages = self.policy_function.generalized_advantage_estimation(ex_values, ex_rewards, next_ex_values, dones)
External_Returns = (External_Advantages + ex_values).detach()
External_Advantages = ((External_Advantages - External_Advantages.mean()) / (External_Advantages.std() + 1e-6)).detach()
External_Advantages = self.utils.normalize(External_Advantages).detach()

# Computing the internal reward, then getting the internal generalized advantage estimates
in_rewards = (state_targets - state_preds).pow(2) * 0.5 / (std_in_rewards.mean() + 1e-8)
Internal_Advantages = self.policy_function.generalized_advantage_estimation(in_values, in_rewards, next_in_values, dones)
Internal_Returns = (Internal_Advantages + in_values).detach()
Internal_Advantages = ((Internal_Advantages - Internal_Advantages.mean()) / (Internal_Advantages.std() + 1e-6)).detach()
Internal_Advantages = self.utils.normalize(Internal_Advantages).detach()

# Getting overall advantages
Advantages = (self.ex_advantages_coef * External_Advantages + self.in_advantages_coef * Internal_Advantages).detach()
@@ -348,6 +352,14 @@ def act(self, state):

return action.cpu().item()

def compute_intrinsic_reward(self, obs, mean_obs, std_obs):
obs = self.utils.normalize(obs, mean_obs, std_obs)

state_pred = self.rnd_predict(obs)
state_target = self.rnd_target(obs)

return (state_target - state_pred)

# Get loss and Do backpropagation
def training_rnd(self, obs, mean_obs, std_obs):
obs = self.utils.normalize(obs, mean_obs, std_obs)
@@ -393,16 +405,17 @@ def update_rnd(self):
dataloader = DataLoader(self.obs_memory, batch_size, shuffle = False)

# Optimize policy for K epochs:
intrinsic_rewards = 0
for _ in range(self.RND_epochs):
for obs in dataloader:
intrinsic_rewards = self.training_rnd(obs.float().to(device), self.obs_memory.mean_obs.float().to(device), self.obs_memory.std_obs.float().to(device))
self.training_rnd(obs.float().to(device), self.obs_memory.mean_obs.float().to(device), self.obs_memory.std_obs.float().to(device))

self.updateObsNormalizationParam(self.obs_memory.observations)
self.updateRwdNormalizationParam(intrinsic_rewards)
intrinsic_rewards = self.compute_intrinsic_reward(self.obs_memory.get_all().to(device), self.obs_memory.mean_obs.to(device), self.obs_memory.std_obs.to(device))

self.update_obs_normalization_param(self.obs_memory.observations)
self.update_rwd_normalization_param(intrinsic_rewards)

# Clear the memory
self.obs_memory.clearMemory()
self.obs_memory.clear_memory()

# Update the model
def update_ppo(self):
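Note: the reworked update_rnd now separates two concerns: the epochs loop only trains the predictor against the frozen random target, and compute_intrinsic_reward is then called once over the whole stored batch to produce the curiosity signal whose std feeds update_rwd_normalization_param (in this file the raw target-minus-prediction difference is returned and squared later in get_PPO_loss). A minimal sketch of the RND idea itself, with hypothetical small networks rather than the ones defined in this file:

    import torch
    import torch.nn as nn

    obs_dim = 16
    target = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, 5))
    predictor = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, 5))
    for p in target.parameters():
        p.requires_grad_(False)  # the randomly initialized target stays fixed

    def intrinsic_reward(obs):
        # Prediction error against the frozen random net: observations the
        # predictor has not been trained on get a larger exploration bonus.
        with torch.no_grad():
            return (target(obs) - predictor(obs)).pow(2).mean(-1)

    obs = torch.randn(8, obs_dim)   # a batch of 8 observations
    print(intrinsic_reward(obs))    # one intrinsic reward per observation, shape (8,)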
@@ -416,7 +429,7 @@ def update_ppo(self):
self.obs_memory.mean_obs.float().to(device), self.obs_memory.std_obs.float().to(device), self.obs_memory.std_in_rewards.float().to(device))

# Clear the memory
self.memory.clearMemory()
self.memory.clear_memory()

# Copy new weights into old policy:
self.actor_old.load_state_dict(self.actor.state_dict())
@@ -481,8 +494,8 @@ def run_inits_episode(env, agent, state_dim, render, n_init_episode):
if done:
env.reset()

agent.updateObsNormalizationParam(agent.obs_memory.observations)
agent.obs_memory.clearMemory()
agent.update_obs_normalization_param(agent.obs_memory.observations)
agent.obs_memory.clear_memory()

return agent
