This repository has been archived by the owner on Jan 3, 2023. It is now read-only.

Nesterov momentum was updating velocities incorrectly
The test didn't catch it because the error doesn't appear on the first iteration. The test should now run for 10 iterations.
tyler-nervana authored and Jennifer Myers committed Jan 12, 2017
1 parent 7ff1df8 commit 720f84a
Showing 2 changed files with 34 additions and 30 deletions.
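
For context, the NAG variant used here (following torch's sgd.lua reformulation of Sutskever et al.) updates the velocity as v ← μ·v − η·g and the parameter as p ← p + μ·v − η·g. Per step that is algebraically the same as the old p ← p + (1 + μ)·v − μ·v_prev form, but the old code wrote that corrected quantity back into the stored velocity, so the state carried into the next call was wrong. The sketch below is illustrative NumPy with made-up scalar gradients (not neon code; weight decay, gradient scaling and clipping omitted): it shows the two variants agreeing on the first iteration and diverging on the second, which is why a single-iteration test passed.

    import numpy as np

    mu, lr = 0.9, 0.1                          # momentum_coef, learning rate
    grads = [np.array([1.0]), np.array([0.5])] # made-up gradients

    p_fix, v_fix = np.zeros(1), np.zeros(1)    # fixed update (this commit)
    p_bug, v_bug = np.zeros(1), np.zeros(1)    # old update (corrupted velocity)

    for it, g in enumerate(grads, 1):
        # Fixed: the velocity keeps the plain momentum recursion; the NAG
        # correction touches only the parameter.
        v_fix = mu * v_fix - lr * g
        p_fix = p_fix + mu * v_fix - lr * g

        # Old: the NAG-corrected value was written back into the velocity,
        # so the state carried to the next iteration is wrong.
        v_prev = v_bug.copy()
        v_bug = mu * v_bug - lr * g
        v_bug = (1 + mu) * v_bug - mu * v_prev
        p_bug = p_bug + v_bug

        print(it, p_fix, p_bug, np.allclose(p_fix, p_bug))
    # Prints True for iteration 1 and False for iteration 2.
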
24 changes: 11 additions & 13 deletions neon/optimizers/optimizer.py
@@ -487,6 +487,8 @@ def __init__(self, learning_rate, momentum_coef, stochastic_round=False,
         self.schedule = schedule
         self.stochastic_round = stochastic_round
         self.nesterov = nesterov
+        if self.momentum_coef == 0 and self.nesterov:
+            raise ValueError("nesterov requires non-zero momentum")
 
     def optimize(self, layer_list, epoch):
         """
@@ -505,31 +507,27 @@
             param.rounding = self.stochastic_round
             if len(states) == 0 and self.momentum_coef != 0:
                 states.append(self.be.zeros_like(grad))
-                if self.nesterov:
-                    states.append(self.be.zeros_like(grad))
 
             grad = grad / self.be.bsz
             grad = self.clip_gradient_value(grad, self.gradient_clip_value)
+            grad = scale_factor * grad + self.wdecay * param
 
             if self.momentum_coef == 0:
-                velocity = - lrate * (scale_factor * grad + self.wdecay * param)
+                velocity = - lrate * grad
                 param[:] = param + velocity
             else:
                 velocity = states[0]
-                if self.nesterov:
-                    velocity_backup = states[-1]
-                    velocity_backup[:] = velocity
-
-                velocity[:] = velocity * self.momentum_coef \
-                    - lrate * (scale_factor * grad + self.wdecay * param)
+                velocity[:] = self.momentum_coef * velocity - lrate * grad
 
                 # Nesterov accelerated gradient (NAG) is implemented the same
                 # as in torch's "sgd.lua". It's a reformulation of Sutskever's
                 # NAG equation found in "On the importance of initialization
                 # and momentum in deep learning".
                 if self.nesterov:
-                    velocity[:] = (1 + self.momentum_coef) * velocity - \
-                        self.momentum_coef * velocity_backup
-
-                param[:] = param + velocity
+                    param[:] = param + self.momentum_coef * velocity -\
+                        lrate * grad
+                else:
+                    param[:] = param + velocity
 
 
 class RMSProp(Optimizer):
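
Stripped of neon's backend tensors, the corrected per-parameter update above boils down to the following minimal NumPy sketch (an illustration with ad hoc names, not neon's API; gradient clipping and stochastic rounding are omitted). The ValueError added in __init__ rules out momentum_coef == 0 together with nesterov=True, so the zero-momentum branch never needs a NAG path.

    import numpy as np

    def gdm_step(param, grad, velocity, lrate, momentum_coef, wdecay,
                 bsz, scale_factor=1.0, nesterov=False):
        # Mirror of the fixed optimize() arithmetic for a single NumPy array;
        # clipping and stochastic rounding from the real code are left out.
        grad = grad / bsz
        grad = scale_factor * grad + wdecay * param

        if momentum_coef == 0:
            param += -lrate * grad
            return param, velocity

        # Velocity keeps the plain momentum recursion across calls.
        velocity[:] = momentum_coef * velocity - lrate * grad
        if nesterov:
            # NAG correction is applied to the parameter only.
            param += momentum_coef * velocity - lrate * grad
        else:
            param += velocity
        return param, velocity

    # Example call with shapes matching the test below.
    rng = np.random.default_rng(0)
    w = rng.normal(size=(200, 128))
    v = np.zeros_like(w)
    g = 0.01 * rng.normal(size=w.shape)
    gdm_step(w, g, v, lrate=0.1, momentum_coef=0.9, wdecay=0.005,
             bsz=128, nesterov=True)
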
40 changes: 23 additions & 17 deletions tests/test_optimizer.py
@@ -70,24 +70,29 @@ def test_gdm_nesterov(backend_default):
     lrate, mom, wdecay = 0.1, 0.9, 0.005
     gdm = GradientDescentMomentum(learning_rate=lrate, momentum_coef=mom,
                                   wdecay=wdecay, nesterov=True)
+    data_shape = (200, 128)
 
     # params to be updated using GDM
-    param = np.random.rand(200, 128)
-    grad = 0.01 * np.random.rand(200, 128)
+    np_param = np.random.rand(*data_shape)
+    param = wrap(np_param)
 
-    # params to be update manually
-    param2 = copy.deepcopy(param)
-    grad2 = grad / 128.
-    states = [0.01 * np.random.rand(200, 128),
-              0.01 * np.zeros_like(grad)]
-    velocity = states[0]
-    velocity_backup = states[1]
-    velocity_backup[:] = velocity
+    # Optimizer states
+    velocity = 0.01 * np.random.rand(*data_shape)
+    states = [wrap(velocity)]
 
-    param2[:] = param2 + (1 + mom) * (velocity * mom - grad2 * lrate
-                                      - wdecay * lrate * param) - mom * velocity_backup
-    param_list = [((wrap(param), wrap(grad)),
-                   [wrap(states[0]), wrap(states[1])])]
-    compare_tensors(gdm, param_list, param2, tol=1e-7)
+    # Check a few iterations in a row
+    for ii in range(20):
+        # Choose a gradient
+        np_grad = 0.01 * np.random.rand(*data_shape)
+        grad = wrap(np_grad)
+
+        # Update manually
+        np_grad = np_grad / data_shape[1]
+        velocity[:] = mom * velocity - lrate * (np_grad + wdecay * np_param)
+        np_param[:] = np_param + mom * velocity - lrate * (np_grad + wdecay * np_param)
+        param_list = [((param, grad),
+                       states)]
+        compare_tensors(gdm, param_list, np_param, tol=1e-6)
 
 
 def test_rmsprop(backend_default):
@@ -240,5 +245,6 @@ def test_multi_optimizer(backend_default):
     assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
 
 if __name__ == '__main__':
-    be = gen_backend(backend='gpu', batch_size=50)
-    test_multi_optimizer(be)
+    be = gen_backend(backend='gpu', batch_size=128)
+    # test_multi_optimizer(be)
+    test_gdm_nesterov(be)

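A side note on the __main__ change above: the manual reference update divides the gradient by data_shape[1] == 128, while optimize() divides by the backend batch size, so the comparison appears to line up only when the backend is generated with batch_size=128 — hence the switch from 50. On a machine without a GPU one would presumably generate a CPU backend instead, along these lines (the import path for the test module is assumed):

    from neon.backends import gen_backend
    from test_optimizer import test_gdm_nesterov  # module path assumed

    be = gen_backend(backend='cpu', batch_size=128)  # 128 matches data_shape[1]
    test_gdm_nesterov(be)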