
Commit c04bdf1

comments + adam

Kelvin Xu committed May 7, 2015
1 parent 163aad6

optimizers.py (64 additions, 5 deletions)
@@ -6,9 +6,29 @@
def itemlist(tparams):
    return [vv for kk, vv in tparams.iteritems()]

# optimizers
# name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
"""
General Optimizer Structure: (adadelta, adam, rmsprop, sgd)
Parameters
----------
lr : theano shared variable
learning rate, currently only necessaary for sgd
tparams : OrderedDict()
dictionary of shared variables {name: variable}
grads :
dictionary of gradients
inputs :
inputs required to compute gradients
cost :
objective of optimization
hard_attn_up :
additional updates required for hard attention mechanism learning
Returns
-------
f_grad_shared : compute cost, update optimizer shared variables
f_update : update parameters
"""
# See "ADADELTA: An adaptive learning rate method", Matt Zeiler (2012) arXiv
# preprint http:https://arxiv.org/abs/1212.5701
def adadelta(lr, tparams, grads, inp, cost, hard_attn_up):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rup2'%k) for k, p in tparams.iteritems()]
@@ -27,6 +47,12 @@ def adadelta(lr, tparams, grads, inp, cost, hard_attn_up):

    return f_grad_shared, f_update
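
The hunk above elides the middle of adadelta, so as a reference for the update rule itself, here is a minimal NumPy sketch of one AdaDelta step (Zeiler 2012). The decay rho=0.95 and eps=1e-6 are the conventional choices and are assumptions here, not values read off this diff:

import numpy as np

def adadelta_step(p, g, rg2, ru2, rho=0.95, eps=1e-6):
    # rg2, ru2: running averages of g**2 and of the squared updates,
    # i.e. the roles of running_grads2 and running_up2 above
    rg2 = rho * rg2 + (1. - rho) * g ** 2
    ud = -np.sqrt(ru2 + eps) / np.sqrt(rg2 + eps) * g  # update direction
    ru2 = rho * ru2 + (1. - rho) * ud ** 2
    return p + ud, rg2, ru2

p, rg2, ru2 = adadelta_step(np.array([1.0]), np.array([0.5]), np.zeros(1), np.zeros(1))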

# See Lecture 6.5 of Coursera: Neural Networks for Machine Learning (2012),
# Tieleman, T. and Hinton, G., for the original method.
#
# This implementation (with Nesterov momentum) is described well in:
# "Generating Sequences with Recurrent Neural Networks", Alex Graves, arXiv
# preprint https://arxiv.org/abs/1308.0850
def rmsprop(lr, tparams, grads, inp, cost, hard_attn_up):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad'%k) for k, p in tparams.iteritems()]
@@ -45,6 +71,41 @@ def rmsprop(lr, tparams, grads, inp, cost, hard_attn_up):

    return f_grad_shared, f_update
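
As with adadelta, the function body is elided by the diff. The Graves (2013) variant named above keeps leaky averages of both g and g**2 and drives the step with momentum; a NumPy sketch of one step follows, with every constant illustrative rather than taken from this file:

import numpy as np

def rmsprop_step(p, g, rg, rg2, ud, lr=1e-4, rho=0.95, mom=0.9, eps=1e-4):
    # rg, rg2: leaky averages of g and g**2 (cf. running_grads, running_grads2)
    rg = rho * rg + (1. - rho) * g
    rg2 = rho * rg2 + (1. - rho) * g ** 2
    # rg2 - rg**2 is a running estimate of the gradient variance
    ud = mom * ud - lr * g / np.sqrt(rg2 - rg ** 2 + eps)
    return p + ud, rg, rg2, ud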

# See "Adam: A Method for Stochastic Optimization" Kingma et al. (ICLR 2015)
# Theano implementation adapted from Soren Kaae Sonderby (https://github.com/skaae)
# preprint: http:https://arxiv.org/abs/1412.6980
def adam(lr, tparams, grads, inp, cost, hard_attn_up):
    gshared = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup+hard_attn_up)

    # note: b1 and b2 here correspond to (1. - beta1) and (1. - beta2)
    # in the paper's notation
    lr0 = 0.0002
    b1 = 0.1
    b2 = 0.001
    e = 1e-8

    updates = []

    i = theano.shared(numpy.float32(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * numpy.float32(0.))
        v = theano.shared(p.get_value() * numpy.float32(0.))
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    # the shared step counter is updated once, outside the per-parameter loop
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')

    return f_grad_shared, f_update
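
Numerically, one pass through the update graph built above reduces to the following NumPy sketch (same parameterization, with b1 and b2 equal to one minus the paper's beta1 and beta2; the helper name adam_step is mine, not the repo's):

import numpy as np

def adam_step(p, g, m, v, t, lr0=0.0002, b1=0.1, b2=0.001, e=1e-8):
    t = t + 1.
    m = b1 * g + (1. - b1) * m           # first-moment estimate (m_t)
    v = b2 * g ** 2 + (1. - b2) * v      # second-moment estimate (v_t)
    lr_t = lr0 * np.sqrt(1. - b2 ** t) / (1. - b1 ** t)  # fix1/fix2 as above
    p = p - lr_t * m / (np.sqrt(v) + e)
    return p, m, v, t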

def sgd(lr, tparams, grads, inp, cost, hard_attn_up):
    gshared = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
@@ -55,5 +116,3 @@ def sgd(lr, tparams, grads, inp, cost, hard_attn_up):
    f_update = theano.function([lr], [], updates=pup, profile=False)

    return f_grad_shared, f_update

