This repository has been archived by the owner on Jan 3, 2023. It is now read-only.

Nesterov momentum was updating velocities incorrectly
The test didn't catch it because the error doesn't appear on the first iteration. The test should now run for 10 iterations.
tyler-nervana authored and Jennifer Myers committed Jan 12, 2017
1 parent 7ff1df8 commit 720f84a
Showing 2 changed files with 34 additions and 30 deletions.
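
For context, the NAG variant used here (following torch's sgd.lua reformulation of Sutskever et al.) updates the velocity as v ← μ·v − η·g and the parameter as p ← p + μ·v − η·g. Per step that is algebraically the same as the old p ← p + (1 + μ)·v − μ·v_prev form, but the old code wrote that corrected quantity back into the stored velocity, so the state carried into the next call was wrong. The sketch below is illustrative NumPy with made-up scalar gradients (not neon code; weight decay, gradient scaling and clipping omitted): it shows the two variants agreeing on the first iteration and diverging on the second, which is why a single-iteration test passed.

    import numpy as np

    mu, lr = 0.9, 0.1                          # momentum_coef, learning rate
    grads = [np.array([1.0]), np.array([0.5])] # made-up gradients

    p_fix, v_fix = np.zeros(1), np.zeros(1)    # fixed update (this commit)
    p_bug, v_bug = np.zeros(1), np.zeros(1)    # old update (corrupted velocity)

    for it, g in enumerate(grads, 1):
        # Fixed: the velocity keeps the plain momentum recursion; the NAG
        # correction touches only the parameter.
        v_fix = mu * v_fix - lr * g
        p_fix = p_fix + mu * v_fix - lr * g

        # Old: the NAG-corrected value was written back into the velocity,
        # so the state carried to the next iteration is wrong.
        v_prev = v_bug.copy()
        v_bug = mu * v_bug - lr * g
        v_bug = (1 + mu) * v_bug - mu * v_prev
        p_bug = p_bug + v_bug

        print(it, p_fix, p_bug, np.allclose(p_fix, p_bug))
    # Prints True for iteration 1 and False for iteration 2.
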
24 changes: 11 additions & 13 deletions neon/optimizers/optimizer.py
@@ -487,6 +487,8 @@ def __init__(self, learning_rate, momentum_coef, stochastic_round=False,
         self.schedule = schedule
         self.stochastic_round = stochastic_round
         self.nesterov = nesterov
+        if self.momentum_coef == 0 and self.nesterov:
+            raise ValueError("nesterov requires non-zero momentum")
 
     def optimize(self, layer_list, epoch):
         """
@@ -505,31 +507,27 @@
             param.rounding = self.stochastic_round
             if len(states) == 0 and self.momentum_coef != 0:
                 states.append(self.be.zeros_like(grad))
-                if self.nesterov:
-                    states.append(self.be.zeros_like(grad))
 
             grad = grad / self.be.bsz
             grad = self.clip_gradient_value(grad, self.gradient_clip_value)
+            grad = scale_factor * grad + self.wdecay * param
 
             if self.momentum_coef == 0:
-                velocity = - lrate * (scale_factor * grad + self.wdecay * param)
+                velocity = - lrate * grad
                 param[:] = param + velocity
             else:
                 velocity = states[0]
-                if self.nesterov:
-                    velocity_backup = states[-1]
-                    velocity_backup[:] = velocity
-
-                velocity[:] = velocity * self.momentum_coef \
-                    - lrate * (scale_factor * grad + self.wdecay * param)
+                velocity[:] = self.momentum_coef * velocity - lrate * grad
 
                 # Nesterov accelerated gradient (NAG) is implemented the same
                 # as in torch's "sgd.lua". It's a reformulation of Sutskever's
                 # NAG equation found in "On the importance of initialization
                 # and momentum in deep learning".
                 if self.nesterov:
-                    velocity[:] = (1 + self.momentum_coef) * velocity - \
-                        self.momentum_coef * velocity_backup
-
-                param[:] = param + velocity
+                    param[:] = param + self.momentum_coef * velocity -\
+                        lrate * grad
+                else:
+                    param[:] = param + velocity
 
 
 class RMSProp(Optimizer):
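
Stripped of neon's backend tensors, the corrected per-parameter update above boils down to the following minimal NumPy sketch (an illustration with ad hoc names, not neon's API; gradient clipping and stochastic rounding are omitted). The ValueError added in __init__ rules out momentum_coef == 0 together with nesterov=True, so the zero-momentum branch never needs a NAG path.

    import numpy as np

    def gdm_step(param, grad, velocity, lrate, momentum_coef, wdecay,
                 bsz, scale_factor=1.0, nesterov=False):
        # Mirror of the fixed optimize() arithmetic for a single NumPy array;
        # clipping and stochastic rounding from the real code are left out.
        grad = grad / bsz
        grad = scale_factor * grad + wdecay * param

        if momentum_coef == 0:
            param += -lrate * grad
            return param, velocity

        # Velocity keeps the plain momentum recursion across calls.
        velocity[:] = momentum_coef * velocity - lrate * grad
        if nesterov:
            # NAG correction is applied to the parameter only.
            param += momentum_coef * velocity - lrate * grad
        else:
            param += velocity
        return param, velocity

    # Example call with shapes matching the test below.
    rng = np.random.default_rng(0)
    w = rng.normal(size=(200, 128))
    v = np.zeros_like(w)
    g = 0.01 * rng.normal(size=w.shape)
    gdm_step(w, g, v, lrate=0.1, momentum_coef=0.9, wdecay=0.005,
             bsz=128, nesterov=True)
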
40 changes: 23 additions & 17 deletions tests/test_optimizer.py
@@ -70,24 +70,29 @@ def test_gdm_nesterov(backend_default):
     lrate, mom, wdecay = 0.1, 0.9, 0.005
     gdm = GradientDescentMomentum(learning_rate=lrate, momentum_coef=mom,
                                   wdecay=wdecay, nesterov=True)
+    data_shape = (200, 128)
 
     # params to be updated using GDM
-    param = np.random.rand(200, 128)
-    grad = 0.01 * np.random.rand(200, 128)
+    np_param = np.random.rand(*data_shape)
+    param = wrap(np_param)
 
-    # params to be update manually
-    param2 = copy.deepcopy(param)
-    grad2 = grad / 128.
-    states = [0.01 * np.random.rand(200, 128),
-              0.01 * np.zeros_like(grad)]
-    velocity = states[0]
-    velocity_backup = states[1]
-    velocity_backup[:] = velocity
+    # Optimizer states
+    velocity = 0.01 * np.random.rand(*data_shape)
+    states = [wrap(velocity)]
 
-    param2[:] = param2 + (1 + mom) * (velocity * mom - grad2 * lrate
-                                      - wdecay * lrate * param) - mom * velocity_backup
-    param_list = [((wrap(param), wrap(grad)),
-                   [wrap(states[0]), wrap(states[1])])]
-    compare_tensors(gdm, param_list, param2, tol=1e-7)
+    # Check a few iterations in a row
+    for ii in range(20):
+        # Choose a gradient
+        np_grad = 0.01 * np.random.rand(*data_shape)
+        grad = wrap(np_grad)
+
+        # Update manually
+        np_grad = np_grad / data_shape[1]
+        velocity[:] = mom * velocity - lrate * (np_grad + wdecay * np_param)
+        np_param[:] = np_param + mom * velocity - lrate * (np_grad + wdecay * np_param)
+        param_list = [((param, grad),
+                       states)]
+        compare_tensors(gdm, param_list, np_param, tol=1e-6)
 
 
 def test_rmsprop(backend_default):
@@ -240,5 +245,6 @@ def test_multi_optimizer(backend_default):
     assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
 
 if __name__ == '__main__':
-    be = gen_backend(backend='gpu', batch_size=50)
-    test_multi_optimizer(be)
+    be = gen_backend(backend='gpu', batch_size=128)
+    # test_multi_optimizer(be)
+    test_gdm_nesterov(be)

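A side note on the __main__ change above: the manual reference update divides the gradient by data_shape[1] == 128, while optimize() divides by the backend batch size, so the comparison appears to line up only when the backend is generated with batch_size=128 — hence the switch from 50. On a machine without a GPU one would presumably generate a CPU backend instead, along these lines (the import path for the test module is assumed):

    from neon.backends import gen_backend
    from test_optimizer import test_gdm_nesterov  # module path assumed

    be = gen_backend(backend='cpu', batch_size=128)  # 128 matches data_shape[1]
    test_gdm_nesterov(be)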