
Commit

complete mujoco gail, vail
dongminlee94 committed Feb 21, 2019
1 parent 3f3033f commit cfe1757
Showing 14 changed files with 73 additions and 77 deletions.
Binary file modified .DS_Store
Binary file modified mujoco/gail/__pycache__/model.cpython-36.pyc
Binary file modified mujoco/gail/__pycache__/train_model.cpython-36.pyc
57 changes: 27 additions & 30 deletions mujoco/gail/main.py
@@ -80,9 +80,7 @@ def main():
     demonstrations = np.array(expert_demo)
     print("demonstrations.shape", demonstrations.shape)

-    # writer = SummaryWriter(args.logdir)
-
-    writer = SummaryWriter(comment="-gail_discrim-" + str(args.discrim_update_num))
+    writer = SummaryWriter(args.logdir)

     if args.load_model is not None:
         saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
@@ -100,7 +98,8 @@ def main():


     episodes = 0
-    train_rewards = True
+    train_discrim_flag = True
+
     for iter in range(args.max_iter_num):
         actor.eval(), critic.eval()
         memory = deque()
@@ -146,36 +145,34 @@ def main():
         score_avg = np.mean(scores)
         print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
         writer.add_scalar('log/score', float(score_avg), iter)
-        # writer.add_scalar('log/score', float(score_avg), iter)

         actor.train(), critic.train(), discrim.train()
-        if train_rewards:
-            exp_acc, gen_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
-            print("Experts: %.2f%% | Generated: %.2f%%" % (exp_acc * 100, gen_acc * 100))
-            if exp_acc > args.suspend_accu_exp and gen_acc > args.suspend_accu_gen:
-                train_rewards = False
-        #train_discrim(discrim, memory, discrim_optim, demonstrations, args)
+        if train_discrim_flag:
+            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
+            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
+            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
+                train_discrim_flag = False
         train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

-        # if iter % 100:
-        #     score_avg = int(score_avg)
-
-        #     model_path = os.path.join(os.getcwd(),'save_model')
-        #     if not os.path.isdir(model_path):
-        #         os.makedirs(model_path)
-
-        #     ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')
-
-        #     save_checkpoint({
-        #         'actor': actor.state_dict(),
-        #         'critic': critic.state_dict(),
-        #         'discrim': discrim.state_dict(),
-        #         'z_filter_n':running_state.rs.n,
-        #         'z_filter_m': running_state.rs.mean,
-        #         'z_filter_s': running_state.rs.sum_square,
-        #         'args': args,
-        #         'score': score_avg
-        #     }, filename=ckpt_path)
+        if iter % 100:
+            score_avg = int(score_avg)
+
+            model_path = os.path.join(os.getcwd(),'save_model')
+            if not os.path.isdir(model_path):
+                os.makedirs(model_path)
+
+            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')
+
+            save_checkpoint({
+                'actor': actor.state_dict(),
+                'critic': critic.state_dict(),
+                'discrim': discrim.state_dict(),
+                'z_filter_n':running_state.rs.n,
+                'z_filter_m': running_state.rs.mean,
+                'z_filter_s': running_state.rs.sum_square,
+                'args': args,
+                'score': score_avg
+            }, filename=ckpt_path)

 if __name__=="__main__":
     main()
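
Note on the mujoco/gail/main.py changes: the train_rewards flag is renamed to train_discrim_flag, TensorBoard logging goes to args.logdir instead of a comment-tagged run directory, and the checkpoint-saving block is re-enabled. One caveat worth checking: "if iter % 100:" is truthy on every iteration except exact multiples of 100, so a checkpoint is written on almost every iteration; "if iter % 100 == 0:" would save every 100th iteration if periodic saving is the intent. The sketch below restates the alternating update schedule this loop implements; every function name in it is a placeholder, not the repo's API.

import random

# Hypothetical stand-ins for the repo's rollout and update routines (placeholders, not the repo's API).
def collect_rollouts():
    return [("state", "action")] * 8

def update_discriminator(memory):
    # The real code returns expert_acc, learner_acc from train_discrim(); here they are faked.
    return random.uniform(0.5, 1.0), random.uniform(0.5, 1.0)

def update_policy(memory):
    pass

suspend_accu_exp, suspend_accu_gen = 0.8, 0.8   # mirrors --suspend_accu_exp / --suspend_accu_gen
train_discrim_flag = True

for it in range(10):
    memory = collect_rollouts()
    if train_discrim_flag:
        expert_acc, learner_acc = update_discriminator(memory)
        # Suspend discriminator training once it separates expert and learner data well enough.
        if expert_acc > suspend_accu_exp and learner_acc > suspend_accu_gen:
            train_discrim_flag = False
    update_policy(memory)
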
10 changes: 5 additions & 5 deletions mujoco/gail/train_model.py
@@ -13,9 +13,9 @@ def train_discrim(discrim, memory, discrim_optim, demonstrations, args):
     criterion = torch.nn.BCELoss()

     for _ in range(args.discrim_update_num):
-        expert_state_action = torch.Tensor(demonstrations)
         learner = discrim(torch.cat([states, actions], dim=1))
-        expert = discrim(torch.Tensor(demonstrations))
+        demonstrations = torch.Tensor(demonstrations)
+        expert = discrim(demonstrations)

         discrim_loss = criterion(learner, torch.ones((states.shape[0], 1))) + \
                        criterion(expert, torch.zeros((demonstrations.shape[0], 1)))
@@ -24,10 +24,10 @@ def train_discrim(discrim, memory, discrim_optim, demonstrations, args):
         discrim_loss.backward()
         discrim_optim.step()

-    exp_acc = ((discrim(expert_state_action) < 0.5).float()).mean()
-    gen_acc = ((discrim(torch.cat([states, actions], dim=1)) > 0.5).float()).mean()
+    expert_acc = ((discrim(demonstrations) < 0.5).float()).mean()
+    learner_acc = ((discrim(torch.cat([states, actions], dim=1)) > 0.5).float()).mean()

-    return exp_acc, gen_acc
+    return expert_acc, learner_acc


 def train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args):
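
Note on train_discrim: learner state-action pairs are pushed toward label 1 and expert demonstrations toward label 0, so expert_acc is the fraction of expert samples scored below 0.5 and learner_acc the fraction of learner samples scored above 0.5; these are the values compared against the suspend thresholds in main.py. Because demonstrations is rebound to a Tensor inside the update loop, the conversion repeats on every pass. A sketch of the same update with the conversion hoisted out of the loop follows; it is a variant under the stated assumptions, not the repo's code.

import torch
import torch.nn as nn

def train_discrim_sketch(discrim, states, actions, demonstrations, discrim_optim, update_num):
    # Assumed shapes: states (N, state_dim), actions (N, action_dim),
    # demonstrations (M, state_dim + action_dim); discrim maps that concatenation to a probability.
    criterion = nn.BCELoss()
    demonstrations = torch.as_tensor(demonstrations, dtype=torch.float32)  # convert once, outside the loop
    learner_input = torch.cat([states, actions], dim=1)

    for _ in range(update_num):
        learner = discrim(learner_input)
        expert = discrim(demonstrations)
        # Learner samples are labeled 1, expert samples 0 (the convention used in this commit).
        loss = criterion(learner, torch.ones_like(learner)) + \
               criterion(expert, torch.zeros_like(expert))
        discrim_optim.zero_grad()
        loss.backward()
        discrim_optim.step()

    expert_acc = (discrim(demonstrations) < 0.5).float().mean()
    learner_acc = (discrim(learner_input) > 0.5).float().mean()
    return expert_acc, learner_acc
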
Binary file modified mujoco/gail/utils/__pycache__/utils.cpython-36.pyc
Binary file not shown.
Binary file modified mujoco/gail/utils/__pycache__/zfilter.cpython-36.pyc
Binary file not shown.
4 changes: 2 additions & 2 deletions mujoco/ppo/main.py
@@ -38,8 +38,8 @@
                     help='total sample size to collect before PPO update (default: 2048)')
 parser.add_argument('--batch_size', type=int, default=64,
                     help='batch size to update (default: 64)')
-parser.add_argument('--max_iter_num', type=int, default=4000,
-                    help='maximal number of main iterations (default: 4000)')
+parser.add_argument('--max_iter_num', type=int, default=15000,
+                    help='maximal number of main iterations (default: 15000)')
 parser.add_argument('--seed', type=int, default=500,
                     help='random seed (default: 500)')
 parser.add_argument('--logdir', type=str, default='logs',
Binary file modified mujoco/vail/__pycache__/model.cpython-36.pyc
Binary file modified mujoco/vail/__pycache__/train_model.cpython-36.pyc
63 changes: 31 additions & 32 deletions mujoco/vail/main.py
@@ -12,7 +12,7 @@
 from utils.utils import *
 from utils.zfilter import ZFilter
 from model import Actor, Critic, VDB
-from train_model import train_ppo, train_vdb
+from train_model import train_actor_critic, train_vdb

 parser = argparse.ArgumentParser(description='PyTorch VAIL')
 parser.add_argument('--env_name', type=str, default="Hopper-v2",
@@ -86,8 +86,8 @@ def main():
     demonstrations = np.array(expert_demo)
     print("demonstrations.shape", demonstrations.shape)

-    writer = SummaryWriter(comment="-vail_" + str(args.z_size) + str(args.alpha_beta) + str(args.vdb_update_num))
+    writer = SummaryWriter(args.logdir)

     if args.load_model is not None:
         saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
         ckpt = torch.load(saved_ckpt_path)
@@ -104,7 +104,8 @@ def main():


     episodes = 0
-    train_rewards = True
+    train_discrim_flag = True
+
     for iter in range(args.max_iter_num):
         actor.eval(), critic.eval()
         memory = deque()
@@ -152,34 +153,32 @@ def main():
         writer.add_scalar('log/score', float(score_avg), iter)

         actor.train(), critic.train(), vdb.train()
-        if train_rewards:
-            exp_acc, gen_acc = train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
-            print("Experts: %.2f%% | Generated: %.2f%%" % (exp_acc * 100, gen_acc * 100))
-            if exp_acc > args.suspend_accu_exp and gen_acc > args.suspend_accu_gen:
-                train_rewards = False
-
-        #train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
-        train_ppo(actor, critic, memory, actor_optim, critic_optim, args)
-
-        # if iter % 100:
-        #     score_avg = int(score_avg)
-
-        #     model_path = os.path.join(os.getcwd(),'save_model')
-        #     if not os.path.isdir(model_path):
-        #         os.makedirs(model_path)
-
-        #     ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')
-
-        #     save_checkpoint({
-        #         'actor': actor.state_dict(),
-        #         'critic': critic.state_dict(),
-        #         'vdb': vdb.state_dict(),
-        #         'z_filter_n':running_state.rs.n,
-        #         'z_filter_m': running_state.rs.mean,
-        #         'z_filter_s': running_state.rs.sum_square,
-        #         'args': args,
-        #         'score': score_avg
-        #     }, filename=ckpt_path)
+        if train_discrim_flag:
+            expert_acc, learner_acc = train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
+            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
+            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
+                train_discrim_flag = False
+        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)
+
+        if iter % 100:
+            score_avg = int(score_avg)
+
+            model_path = os.path.join(os.getcwd(),'save_model')
+            if not os.path.isdir(model_path):
+                os.makedirs(model_path)
+
+            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')
+
+            save_checkpoint({
+                'actor': actor.state_dict(),
+                'critic': critic.state_dict(),
+                'vdb': vdb.state_dict(),
+                'z_filter_n':running_state.rs.n,
+                'z_filter_m': running_state.rs.mean,
+                'z_filter_s': running_state.rs.sum_square,
+                'args': args,
+                'score': score_avg
+            }, filename=ckpt_path)

 if __name__=="__main__":
     main()
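
Note on the mujoco/vail/main.py changes: the same renames as in the GAIL driver (train_rewards to train_discrim_flag, train_ppo to train_actor_critic), the SummaryWriter now writes to args.logdir, and checkpoint saving is re-enabled (the same iter % 100 caveat applies). The saved dict also carries the ZFilter running statistics under z_filter_n / z_filter_m / z_filter_s, which the --load_model branch would need to restore. A hedged sketch of such a restore helper follows; the key names are taken from the save dict above, everything else is an assumption.

import torch

def load_checkpoint(path, actor, critic, vdb, running_state):
    # Assumed inverse of save_checkpoint(); key names match the dict saved above, the rest is a guess.
    ckpt = torch.load(path)
    actor.load_state_dict(ckpt['actor'])
    critic.load_state_dict(ckpt['critic'])
    vdb.load_state_dict(ckpt['vdb'])
    # Restore the observation-normalization (ZFilter) running statistics so inputs are scaled consistently.
    running_state.rs.n = ckpt['z_filter_n']
    running_state.rs.mean = ckpt['z_filter_m']
    running_state.rs.sum_square = ckpt['z_filter_s']
    return ckpt['args'], ckpt['score']
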
16 changes: 8 additions & 8 deletions mujoco/vail/train_model.py
@@ -13,9 +13,9 @@ def train_vdb(vdb, memory, vdb_optim, demonstrations, beta, args):
     criterion = torch.nn.BCELoss()

     for _ in range(args.vdb_update_num):
-        expert_state_action = torch.Tensor(demonstrations)
         learner, l_mu, l_logvar = vdb(torch.cat([states, actions], dim=1))
-        expert, e_mu, e_logvar = vdb(torch.Tensor(demonstrations))
+        demonstrations = torch.Tensor(demonstrations)
+        expert, e_mu, e_logvar = vdb(demonstrations)

         l_kld = kl_divergence(l_mu, l_logvar)
         l_kld = l_kld.mean()
@@ -29,20 +29,20 @@ def train_vdb(vdb, memory, vdb_optim, demonstrations, beta, args):
         beta = max(0, beta + args.alpha_beta * bottleneck_loss)

         vdb_loss = criterion(learner, torch.ones((states.shape[0], 1))) + \
-                   criterion(expert, torch.zeros((demonstrations.shape[0], 1))) + \
-                   beta * bottleneck_loss
+                   criterion(expert, torch.zeros((demonstrations.shape[0], 1))) + \
+                   beta * bottleneck_loss

         vdb_optim.zero_grad()
         vdb_loss.backward(retain_graph=True)
         vdb_optim.step()

-    exp_acc = ((vdb(expert_state_action)[0] < 0.5).float()).mean()
-    gen_acc = ((vdb(torch.cat([states, actions], dim=1))[0] > 0.5).float()).mean()
+    expert_acc = ((vdb(demonstrations)[0] < 0.5).float()).mean()
+    learner_acc = ((vdb(torch.cat([states, actions], dim=1))[0] > 0.5).float()).mean()

-    return exp_acc, gen_acc
+    return expert_acc, learner_acc


-def train_ppo(actor, critic, memory, actor_optim, critic_optim, args):
+def train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args):
     memory = np.array(memory)
     states = np.vstack(memory[:, 0])
     actions = list(memory[:, 1])
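
Note on train_vdb: besides the two BCE terms, the loss carries beta * bottleneck_loss, where bottleneck_loss (computed just above the shown hunk) is presumably the mean KL between the encoder's latent posterior and a standard normal prior minus the information constraint i_c, and beta is adapted by dual gradient ascent, beta <- max(0, beta + alpha_beta * (mean KL - i_c)). For a diagonal Gaussian with mean mu and log-variance logvar, that KL has the standard closed form 0.5 * sum(mu^2 + exp(logvar) - logvar - 1). A minimal sketch of such a kl_divergence helper and the beta update follows; these are the standard formulas, not necessarily the repo's exact implementation, and i_c / alpha_beta are just the usual VDB hyperparameter names.

import torch

def kl_divergence(mu, logvar):
    # KL( N(mu, diag(exp(logvar))) || N(0, I) ), summed over latent dimensions, one value per sample.
    return 0.5 * torch.sum(mu.pow(2) + logvar.exp() - logvar - 1, dim=1)

# Dual ascent on the Lagrange multiplier beta for the constraint E[KL] <= i_c.
mu, logvar = torch.zeros(4, 2), torch.zeros(4, 2)   # dummy encoder outputs
i_c, alpha_beta, beta = 0.5, 1e-4, 0.0
bottleneck_loss = kl_divergence(mu, logvar).mean() - i_c
beta = max(0, beta + alpha_beta * bottleneck_loss.item())
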
Binary file modified mujoco/vail/utils/__pycache__/utils.cpython-36.pyc
Binary file modified mujoco/vail/utils/__pycache__/zfilter.cpython-36.pyc
