Commit

fixed default hyperparameters
qfettes committed Jun 19, 2018
1 parent 1c67ef9 commit 296c7ad
Showing 11 changed files with 779 additions and 558 deletions.
78 changes: 42 additions & 36 deletions 01.DQN.ipynb
@@ -38,7 +38,9 @@
"from timeit import default_timer as timer\n",
"from datetime import timedelta\n",
"import math\n",
"from utils.wrappers import make_atari, wrap_deepmind, wrap_pytorch"
"from utils.wrappers import make_atari, wrap_deepmind, wrap_pytorch\n",
"\n",
"from utils.hyperparameters import Config"
]
},
{
@@ -54,26 +56,28 @@
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"config = Config()\n",
"\n",
"config.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"#epsilon variables\n",
"epsilon_start = 1.0\n",
"epsilon_final = 0.01\n",
"epsilon_decay = 30000\n",
"epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)\n",
"config.epsilon_start = 1.0\n",
"config.epsilon_final = 0.01\n",
"config.epsilon_decay = 30000\n",
"config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (config.epsilon_start - config.epsilon_final) * math.exp(-1. * frame_idx / config.epsilon_decay)\n",
"\n",
"#misc agent variables\n",
"GAMMA=0.99\n",
"LR=1e-4\n",
"config.GAMMA=0.99\n",
"config.LR=1e-4\n",
"\n",
"#memory\n",
"TARGET_NET_UPDATE_FREQ = 1000\n",
"EXP_REPLAY_SIZE = 100000\n",
"BATCH_SIZE = 32\n",
"config.TARGET_NET_UPDATE_FREQ = 1000\n",
"config.EXP_REPLAY_SIZE = 100000\n",
"config.BATCH_SIZE = 32\n",
"\n",
"#Learning control variables\n",
"LEARN_START = 10000\n",
"MAX_FRAMES=1000000"
"config.LEARN_START = 10000\n",
"config.MAX_FRAMES=1000000"
]
},
{
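The Config object imported above comes from utils/hyperparameters.py, which is not among the hunks shown in this diff, so the notebook only reveals it through use. A minimal sketch of what such a container could look like, assuming it is simply an attribute bag whose defaults the notebook overrides (illustrative only, not the repository's actual file):

    import math
    import torch

    class Config(object):
        def __init__(self):
            # device selection, mirroring the cell above
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

            # epsilon-greedy schedule defaults
            self.epsilon_start = 1.0
            self.epsilon_final = 0.01
            self.epsilon_decay = 30000
            self.epsilon_by_frame = lambda idx: self.epsilon_final + \
                (self.epsilon_start - self.epsilon_final) * math.exp(-1. * idx / self.epsilon_decay)

            # agent, memory, and learning-control defaults
            self.GAMMA = 0.99
            self.LR = 1e-4
            self.TARGET_NET_UPDATE_FREQ = 1000
            self.EXP_REPLAY_SIZE = 100000
            self.BATCH_SIZE = 32
            self.LEARN_START = 10000
            self.MAX_FRAMES = 1000000

Nothing in the notebook requires Config to do more than hold these attributes; the assignments in the cell above simply overwrite whatever defaults the real class ships with.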
@@ -161,16 +165,18 @@
"outputs": [],
"source": [
"class Model(object):\n",
" def __init__(self, static_policy=False, env=None):\n",
" def __init__(self, static_policy=False, env=None, config=None):\n",
" super(Model, self).__init__()\n",
" self.gamma=GAMMA\n",
" self.lr = LR\n",
" self.target_net_update_freq = TARGET_NET_UPDATE_FREQ\n",
" self.experience_replay_size = EXP_REPLAY_SIZE\n",
" self.batch_size = BATCH_SIZE\n",
" self.learn_start = LEARN_START\n",
"\n",
" self.static_policy=static_policy\n",
" self.device = config.device\n",
"\n",
" self.gamma = config.GAMMA\n",
" self.lr = config.LR\n",
" self.target_net_update_freq = config.TARGET_NET_UPDATE_FREQ\n",
" self.experience_replay_size = config.EXP_REPLAY_SIZE\n",
" self.batch_size = config.BATCH_SIZE\n",
" self.learn_start = config.LEARN_START\n",
"\n",
" self.static_policy = static_policy\n",
" self.num_feats = env.observation_space.shape\n",
" self.num_actions = env.action_space.n\n",
" self.env = env\n",
@@ -181,8 +187,8 @@
" self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)\n",
" \n",
" #move to correct device\n",
" self.model = self.model.to(device)\n",
" self.target_model.to(device)\n",
" self.model = self.model.to(self.device)\n",
" self.target_model.to(self.device)\n",
"\n",
" if self.static_policy:\n",
" self.model.eval()\n",
@@ -209,19 +215,19 @@
"\n",
" def prep_minibatch(self):\n",
" # random transition batch is taken from experience replay memory\n",
" transitions = self.memory.sample(BATCH_SIZE)\n",
" transitions = self.memory.sample(self.batch_size)\n",
" \n",
" batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)\n",
"\n",
" shape = (-1,)+self.num_feats\n",
"\n",
" batch_state = torch.tensor(batch_state, device=device, dtype=torch.float).view(shape)\n",
" batch_action = torch.tensor(batch_action, device=device, dtype=torch.long).squeeze().view(-1, 1)\n",
" batch_reward = torch.tensor(batch_reward, device=device, dtype=torch.float).squeeze().view(-1, 1)\n",
" batch_state = torch.tensor(batch_state, device=self.device, dtype=torch.float).view(shape)\n",
" batch_action = torch.tensor(batch_action, device=self.device, dtype=torch.long).squeeze().view(-1, 1)\n",
" batch_reward = torch.tensor(batch_reward, device=self.device, dtype=torch.float).squeeze().view(-1, 1)\n",
" \n",
" non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch_next_state)), device=device, dtype=torch.uint8)\n",
" non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch_next_state)), device=self.device, dtype=torch.uint8)\n",
" try: #sometimes all next states are false\n",
" non_final_next_states = torch.tensor([s for s in batch_next_state if s is not None], device=device, dtype=torch.float).view(shape)\n",
"            non_final_next_states = torch.tensor([s for s in batch_next_state if s is not None], device=self.device, dtype=torch.float).view(shape)\n",
" empty_next_state_values = False\n",
" except:\n",
" non_final_next_states = None\n",
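Out of context the masking above is easy to misread: terminal transitions store None as their next state, so the batch has to be split into rows that can be bootstrapped and rows that cannot. A standalone sketch of the same pattern on toy data (the notebook uses dtype=torch.uint8, the mask type of PyTorch 0.4; on current PyTorch a bool mask is preferred):

    import torch

    # toy batch: None marks a transition whose episode terminated
    batch_next_state = [[0.1, 0.2], None, [0.3, 0.4]]

    non_final_mask = torch.tensor([s is not None for s in batch_next_state], dtype=torch.bool)
    non_final_next_states = torch.tensor([s for s in batch_next_state if s is not None], dtype=torch.float)

    # terminal rows keep a value of 0, i.e. they are never bootstrapped
    max_next_q_values = torch.zeros(len(batch_next_state), 1)
    max_next_q_values[non_final_mask] = torch.rand(non_final_next_states.size(0), 1)

The try/except guards the case where every sampled next state is None (all transitions terminal): building the tensor from an empty list fails, and the code falls back to the empty_next_state_values flag instead.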
@@ -237,7 +243,7 @@
" \n",
" #target\n",
" with torch.no_grad():\n",
" max_next_q_values = torch.zeros(self.batch_size, device=device, dtype=torch.float).unsqueeze(dim=1)\n",
" max_next_q_values = torch.zeros(self.batch_size, device=self.device, dtype=torch.float).unsqueeze(dim=1)\n",
" if not empty_next_state_values:\n",
" max_next_action = self.get_max_next_state_action(non_final_next_states)\n",
" max_next_q_values[non_final_mask] = self.target_model(non_final_next_states).gather(1, max_next_action)\n",
@@ -276,7 +282,7 @@
" def get_action(self, s, eps=0.1):\n",
" with torch.no_grad():\n",
" if np.random.random() >= eps or self.static_policy:\n",
" X = torch.tensor([s], device=device, dtype=torch.float)\n",
" X = torch.tensor([s], device=self.device, dtype=torch.float)\n",
" a = self.model(X).max(1)[1].view(1, 1)\n",
" return a.item()\n",
" else:\n",
@@ -351,15 +357,15 @@
"env = make_atari(env_id)\n",
"env = wrap_deepmind(env, frame_stack=False)\n",
"env = wrap_pytorch(env)\n",
"model = Model(env=env)\n",
"model = Model(env=env, config=config)\n",
"\n",
"losses = []\n",
"all_rewards = []\n",
"episode_reward = 0\n",
"\n",
"observation = env.reset()\n",
"for frame_idx in range(1, MAX_FRAMES + 1):\n",
" epsilon = epsilon_by_frame(frame_idx)\n",
"for frame_idx in range(1, config.MAX_FRAMES + 1):\n",
" epsilon = config.epsilon_by_frame(frame_idx)\n",
"\n",
" action = model.get_action(observation, epsilon)\n",
" prev_observation=observation\n",
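For reference, with the schedule set in the config cell (start 1.0, final 0.01, decay 30000), the epsilon passed to get_action decays roughly as below; a quick check using only those three values:

    import math

    eps = lambda idx: 0.01 + (1.0 - 0.01) * math.exp(-1. * idx / 30000)
    for idx in (0, 30000, 90000, 300000):
        print(idx, round(eps(idx), 3))
    # 0 -> 1.0,  30000 -> 0.374,  90000 -> 0.059,  300000 -> 0.01

So exploration is still substantial a tenth of the way into MAX_FRAMES and is effectively at its floor by 300k frames.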
@@ -412,7 +418,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.6.3"
}
},
"nbformat": 4,
96 changes: 53 additions & 43 deletions 02.NStep_DQN.ipynb
@@ -36,7 +36,9 @@
"from utils.wrappers import *\n",
"from networks.networks import DQN\n",
"from networks.network_bodies import AtariBody\n",
"from utils.ReplayMemory import ExperienceReplayMemory"
"from utils.ReplayMemory import ExperienceReplayMemory\n",
"\n",
"from utils.hyperparameters import Config"
]
},
{
@@ -52,29 +54,31 @@
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"config = Config()\n",
"\n",
"config.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"#epsilon variables\n",
"epsilon_start = 1.0\n",
"epsilon_final = 0.01\n",
"epsilon_decay = 30000\n",
"epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)\n",
"config.epsilon_start = 1.0\n",
"config.epsilon_final = 0.01\n",
"config.epsilon_decay = 30000\n",
"config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (config.epsilon_start - config.epsilon_final) * math.exp(-1. * frame_idx / config.epsilon_decay)\n",
"\n",
"#misc agent variables\n",
"GAMMA=0.99\n",
"LR=1e-4\n",
"config.GAMMA=0.99\n",
"config.LR=1e-4\n",
"\n",
"#memory\n",
"TARGET_NET_UPDATE_FREQ = 1000\n",
"EXP_REPLAY_SIZE = 100000\n",
"BATCH_SIZE = 32\n",
"config.TARGET_NET_UPDATE_FREQ = 1000\n",
"config.EXP_REPLAY_SIZE = 100000\n",
"config.BATCH_SIZE = 32\n",
"\n",
"#Learning control variables\n",
"LEARN_START = 10000\n",
"MAX_FRAMES=1000000\n",
"config.LEARN_START = 10000\n",
"config.MAX_FRAMES=1000000\n",
"\n",
"#Nstep controls\n",
"N_STEPS=3"
"config.N_STEPS=3"
]
},
{
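config.N_STEPS=3 switches this agent to 3-step returns: each replayed transition stores the discounted sum of the next N rewards rather than a single reward, and the bootstrap term in compute_loss uses gamma**nsteps to match. A toy sketch of that accumulation with the hyperparameters above (illustrative only; the notebook's own buffer handling appears in the hunks below):

    GAMMA, N_STEPS = 0.99, 3
    rewards = [1.0, 0.0, 2.0]   # r_t, r_{t+1}, r_{t+2} taken from the n-step buffer
    R = sum((GAMMA ** i) * r for i, r in enumerate(rewards))
    # R = 1.0 + 0.99*0.0 + 0.9801*2.0 = 2.9602, pushed to replay alongside the state N steps ahead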
@@ -91,16 +95,22 @@
"outputs": [],
"source": [
"class Model(object):\n",
" def __init__(self, static_policy=False, env=None):\n",
" def __init__(self, static_policy=False, env=None, config=None):\n",
" super(Model, self).__init__()\n",
" self.gamma=GAMMA\n",
" self.lr = LR\n",
" self.target_net_update_freq = TARGET_NET_UPDATE_FREQ\n",
" self.experience_replay_size = EXP_REPLAY_SIZE\n",
" self.batch_size = BATCH_SIZE\n",
" self.learn_start = LEARN_START\n",
"\n",
" self.static_policy=static_policy\n",
" self.device = config.device\n",
"\n",
" self.gamma = config.GAMMA\n",
" self.lr = config.LR\n",
" self.target_net_update_freq = config.TARGET_NET_UPDATE_FREQ\n",
" self.experience_replay_size = config.EXP_REPLAY_SIZE\n",
" self.batch_size = config.BATCH_SIZE\n",
" self.learn_start = config.LEARN_START\n",
" self.sigma_init= config.SIGMA_INIT\n",
" self.priority_beta_start = config.PRIORITY_BETA_START\n",
" self.priority_beta_frames = config.PRIORITY_BETA_FRAMES\n",
" self.priority_alpha = config.PRIORITY_ALPHA\n",
"\n",
" self.static_policy = static_policy\n",
" self.num_feats = env.observation_space.shape\n",
" self.num_actions = env.action_space.n\n",
" self.env = env\n",
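Note that __init__ now also reads config.SIGMA_INIT, config.PRIORITY_BETA_START, config.PRIORITY_BETA_FRAMES and config.PRIORITY_ALPHA, none of which are set in this notebook's config cell; they therefore have to come from defaults inside Config itself, which is presumably what the commit message refers to. Plausible defaults for such fields, shown only as a guess (the real values are not visible in this diff):

    # hypothetical defaults inside utils/hyperparameters.Config
    self.SIGMA_INIT = 0.5               # initial sigma of noisy linear layers
    self.PRIORITY_ALPHA = 0.6           # how strongly TD error shapes sampling priority
    self.PRIORITY_BETA_START = 0.4      # initial importance-sampling correction
    self.PRIORITY_BETA_FRAMES = 100000  # frames over which beta anneals toward 1.0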
@@ -111,8 +121,8 @@
" self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)\n",
" \n",
" #move to correct device\n",
" self.model = self.model.to(device)\n",
" self.target_model.to(device)\n",
" self.model = self.model.to(self.device)\n",
" self.target_model.to(self.device)\n",
"\n",
" if self.static_policy:\n",
" self.model.eval()\n",
@@ -125,7 +135,7 @@
"\n",
" self.declare_memory()\n",
"\n",
" self.nsteps = N_STEPS\n",
" self.nsteps = config.N_STEPS\n",
" self.nstep_buffer = []\n",
"\n",
" def declare_networks(self):\n",
@@ -146,22 +156,21 @@
"\n",
" self.memory.push((state, action, R, s_))\n",
"\n",
"\n",
" def prep_minibatch(self):\n",
" # random transition batch is taken from experience replay memory\n",
" transitions, indices, weights = self.memory.sample(BATCH_SIZE)\n",
" transitions, indices, weights = self.memory.sample(self.batch_size)\n",
" \n",
" batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)\n",
"\n",
" shape = (-1,)+self.num_feats\n",
"\n",
" batch_state = torch.tensor(batch_state, device=device, dtype=torch.float).view(shape)\n",
" batch_action = torch.tensor(batch_action, device=device, dtype=torch.long).squeeze().view(-1, 1)\n",
" batch_reward = torch.tensor(batch_reward, device=device, dtype=torch.float).squeeze().view(-1, 1)\n",
" batch_state = torch.tensor(batch_state, device=self.device, dtype=torch.float).view(shape)\n",
" batch_action = torch.tensor(batch_action, device=self.device, dtype=torch.long).squeeze().view(-1, 1)\n",
" batch_reward = torch.tensor(batch_reward, device=self.device, dtype=torch.float).squeeze().view(-1, 1)\n",
" \n",
" non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch_next_state)), device=device, dtype=torch.uint8)\n",
" non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch_next_state)), device=self.device, dtype=torch.uint8)\n",
" try: #sometimes all next states are false\n",
" non_final_next_states = torch.tensor([s for s in batch_next_state if s is not None], device=device, dtype=torch.float).view(shape)\n",
" non_final_next_states = torch.tensor([s for s in batch_next_state if s is not None], device=self.device, dtype=torch.float).view(shape)\n",
" empty_next_state_values = False\n",
" except:\n",
" non_final_next_states = None\n",
@@ -173,13 +182,15 @@
" batch_state, batch_action, batch_reward, non_final_next_states, non_final_mask, empty_next_state_values, indices, weights = batch_vars\n",
"\n",
" #estimate\n",
" self.model.sample_noise()\n",
" current_q_values = self.model(batch_state).gather(1, batch_action)\n",
" \n",
" #target\n",
" with torch.no_grad():\n",
" max_next_q_values = torch.zeros(self.batch_size, device=device, dtype=torch.float).unsqueeze(dim=1)\n",
" max_next_q_values = torch.zeros(self.batch_size, device=self.device, dtype=torch.float).unsqueeze(dim=1)\n",
" if not empty_next_state_values:\n",
" max_next_action = self.get_max_next_state_action(non_final_next_states)\n",
" self.target_model.sample_noise()\n",
" max_next_q_values[non_final_mask] = self.target_model(non_final_next_states).gather(1, max_next_action)\n",
" expected_q_values = batch_reward + ((self.gamma**self.nsteps)*max_next_q_values)\n",
"\n",
@@ -205,18 +216,17 @@
" # Optimize the model\n",
" self.optimizer.zero_grad()\n",
" loss.backward()\n",
" #for param in self.model.parameters():\n",
" # param.grad.data.clamp_(-1, 1)\n",
" for param in self.model.parameters():\n",
" param.grad.data.clamp_(-1, 1)\n",
" self.optimizer.step()\n",
"\n",
" self.update_target_model()\n",
" return loss.item()\n",
"\n",
"\n",
" def get_action(self, s, eps=0.1):\n",
" with torch.no_grad():\n",
" if np.random.random() >= eps or self.static_policy:\n",
" X = torch.tensor([s], device=device, dtype=torch.float)\n",
" X = torch.tensor([s], device=self.device, dtype=torch.float)\n",
" a = self.model(X).max(1)[1].view(1, 1)\n",
" return a.item()\n",
" else:\n",
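The per-parameter clamp re-enabled in this hunk bounds every gradient element to [-1, 1] before the optimizer step. On current PyTorch the same effect is available as a single utility call; a minimal sketch on a stand-in module, assuming the same clip value of 1.0:

    import torch
    import torch.nn as nn
    from torch.nn.utils import clip_grad_value_

    model = nn.Linear(4, 2)                       # stand-in for self.model
    loss = model(torch.randn(8, 4)).pow(2).mean()
    loss.backward()
    clip_grad_value_(model.parameters(), 1.0)     # element-wise clamp of every grad to [-1, 1]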
@@ -239,7 +249,7 @@
" self.memory.push((state, action, R, None))\n",
"\n",
" def huber(self, x):\n",
" cond = (x < 1.0).float().detach()\n",
" cond = (x.abs() < 1.0).float().detach()\n",
" return 0.5 * x.pow(2) * cond + (x.abs() - 0.5) * (1.0 - cond)"
]
},
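The huber fix in the hunk above matters for negative TD errors: the old mask (x < 1.0) kept any large negative error in the quadratic branch, while the corrected (x.abs() < 1.0) switches to the linear branch whenever |x| >= 1. A quick check of the fixed version:

    import torch

    def huber(x):
        cond = (x.abs() < 1.0).float().detach()
        return 0.5 * x.pow(2) * cond + (x.abs() - 0.5) * (1.0 - cond)

    print(huber(torch.tensor([-3.0, -0.5, 0.5, 3.0])))
    # tensor([2.5000, 0.1250, 0.1250, 2.5000]); the old mask would have produced 4.5 for -3.0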
@@ -302,15 +312,15 @@
"env = wrap_pytorch(env)\n",
"#env = gym.make('CartPole-v0')\n",
"#env = wrappers.Monitor(env, 'Delete', force=True)\n",
"model = Model(env=env)\n",
"model = Model(env=env, config=config)\n",
"\n",
"losses = []\n",
"all_rewards = []\n",
"episode_reward = 0\n",
"\n",
"observation = env.reset()\n",
"for frame_idx in range(1, MAX_FRAMES + 1):\n",
" epsilon = epsilon_by_frame(frame_idx)\n",
"for frame_idx in range(1, config.MAX_FRAMES + 1):\n",
" epsilon = config.epsilon_by_frame(frame_idx)\n",
"\n",
" action = model.get_action(observation, epsilon)\n",
" prev_observation=observation\n",
@@ -364,7 +374,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.6.3"
}
},
"nbformat": 4,