diff --git a/neon/optimizers/optimizer.py b/neon/optimizers/optimizer.py
index 3db46631..6f6522a7 100644
--- a/neon/optimizers/optimizer.py
+++ b/neon/optimizers/optimizer.py
@@ -487,6 +487,8 @@ def __init__(self, learning_rate, momentum_coef, stochastic_round=False,
         self.schedule = schedule
         self.stochastic_round = stochastic_round
         self.nesterov = nesterov
+        if self.momentum_coef == 0 and self.nesterov:
+            raise ValueError("nesterov requires non-zero momentum")
 
     def optimize(self, layer_list, epoch):
         """
@@ -505,31 +507,27 @@ def optimize(self, layer_list, epoch):
             param.rounding = self.stochastic_round
             if len(states) == 0 and self.momentum_coef != 0:
                 states.append(self.be.zeros_like(grad))
-                if self.nesterov:
-                    states.append(self.be.zeros_like(grad))
+
             grad = grad / self.be.bsz
             grad = self.clip_gradient_value(grad, self.gradient_clip_value)
+            grad = scale_factor * grad + self.wdecay * param
 
             if self.momentum_coef == 0:
-                velocity = - lrate * (scale_factor * grad + self.wdecay * param)
+                velocity = - lrate * grad
+                param[:] = param + velocity
             else:
                 velocity = states[0]
-                if self.nesterov:
-                    velocity_backup = states[-1]
-                    velocity_backup[:] = velocity
-
-                velocity[:] = velocity * self.momentum_coef \
-                    - lrate * (scale_factor * grad + self.wdecay * param)
+                velocity[:] = self.momentum_coef * velocity - lrate * grad
 
                 # Nesterov accelerated gradient (NAG) is implemented the same
                 # as in torch's "sgd.lua". It's a reformulation of Sutskever's
                 # NAG equation found in "On the importance of initialization
                 # and momentum in deep learning".
                 if self.nesterov:
-                    velocity[:] = (1 + self.momentum_coef) * velocity - \
-                        self.momentum_coef * velocity_backup
-
-            param[:] = param + velocity
+                    param[:] = param + self.momentum_coef * velocity -\
+                        lrate * grad
+                else:
+                    param[:] = param + velocity
 
 
 class RMSProp(Optimizer):
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
index c85dad76..82ec83d9 100644
--- a/tests/test_optimizer.py
+++ b/tests/test_optimizer.py
@@ -70,24 +70,29 @@ def test_gdm_nesterov(backend_default):
     lrate, mom, wdecay = 0.1, 0.9, 0.005
     gdm = GradientDescentMomentum(learning_rate=lrate, momentum_coef=mom,
                                   wdecay=wdecay, nesterov=True)
+    data_shape = (200, 128)
+
     # params to be updated using GDM
-    param = np.random.rand(200, 128)
-    grad = 0.01 * np.random.rand(200, 128)
+    np_param = np.random.rand(*data_shape)
+    param = wrap(np_param)
 
-    # params to be update manually
-    param2 = copy.deepcopy(param)
-    grad2 = grad / 128.
-    states = [0.01 * np.random.rand(200, 128),
-              0.01 * np.zeros_like(grad)]
-    velocity = states[0]
-    velocity_backup = states[1]
-    velocity_backup[:] = velocity
+    # Optimizer states
+    velocity = 0.01 * np.random.rand(*data_shape)
+    states = [wrap(velocity)]
 
-    param2[:] = param2 + (1 + mom) * (velocity * mom - grad2 * lrate -
-                                      wdecay * lrate * param) - mom * velocity_backup
-    param_list = [((wrap(param), wrap(grad)),
-                   [wrap(states[0]), wrap(states[1])])]
-    compare_tensors(gdm, param_list, param2, tol=1e-7)
+    # Check a few iterations in a row
+    for ii in range(20):
+        # Choose a gradient
+        np_grad = 0.01 * np.random.rand(*data_shape)
+        grad = wrap(np_grad)
+
+        # Update manually
+        np_grad = np_grad / data_shape[1]
+        velocity[:] = mom * velocity - lrate * (np_grad + wdecay * np_param)
+        np_param[:] = np_param + mom * velocity - lrate * (np_grad + wdecay * np_param)
+        param_list = [((param, grad),
+                       states)]
+        compare_tensors(gdm, param_list, np_param, tol=1e-6)
 
 
 def test_rmsprop(backend_default):
@@ -240,5 +245,6 @@ def test_multi_optimizer(backend_default):
     assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
 
 if __name__ == '__main__':
-    be = gen_backend(backend='gpu', batch_size=50)
-    test_multi_optimizer(be)
+    be = gen_backend(backend='gpu', batch_size=128)
+    # test_multi_optimizer(be)
+    test_gdm_nesterov(be)
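Note: the per-parameter update performed by the patched optimizer can be sketched in plain NumPy as below. This is a minimal sketch, assuming scale_factor = 1 and no gradient clipping; the function name and default arguments are illustrative and not part of neon.

    import numpy as np

    def sgd_nesterov_step(param, grad_raw, velocity, lrate=0.1, mom=0.9,
                          wdecay=0.005, bsz=128, nesterov=True):
        # Illustrative sketch of the torch "sgd.lua"-style NAG update in the patch.
        grad = grad_raw / bsz + wdecay * param       # normalize by batch size, add weight decay
        velocity[:] = mom * velocity - lrate * grad  # classical momentum buffer update
        if nesterov:
            # look ahead along the freshly updated velocity
            param[:] = param + mom * velocity - lrate * grad
        else:
            param[:] = param + velocity
        return param, velocity

This is the same arithmetic the rewritten test_gdm_nesterov applies manually on NumPy arrays, so optimizer output and the reference update should agree to within the stated tolerance.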