Added mathematical description to Noisy SGD #857
Changes from 2 commits
```diff
@@ -992,11 +992,27 @@ def noisy_sgd(
     gamma: float = 0.55,
     seed: int = 0
 ) -> base.GradientTransformation:
-  r"""A variant of SGD with added noise.
+  r"""Noisy SGD is a variant of :func:`optax.sgd` that incorporates Gaussian
+  noise into the updates. It has been found that adding noise to the gradients
+  can improve both the training error and the generalization error in very deep
+  networks.
+
+  The update :math:`u_t` is modified to include this noise as follows:
+
+  .. math::
+    u_t \leftarrow -\alpha_t g_t + N(0, \sigma_t^2),
```
Review thread on the update formula above:

Reviewer: Thanks for doing that!

Author: Yes, you're right. We first add the noise to the gradient and then scale by the learning rate. I will review my changes more carefully in the future.
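A minimal sketch of the distinction raised in the thread, in plain JAX; the gradient, step size, and noise scale are illustrative values, not taken from the PR:

```python
import jax
import jax.numpy as jnp

key = jax.random.PRNGKey(0)
g = jnp.array([0.5, -1.0])  # illustrative gradient
lr, sigma = 0.1, 0.2        # illustrative step size and noise std. dev.
noise = sigma * jax.random.normal(key, g.shape)

# What the formula as written in this commit describes:
# noise is added after the gradient is scaled by the learning rate.
u_formula = -lr * g + noise

# What the thread says the implementation does:
# noise is added to the gradient first, then the sum is scaled.
u_actual = -lr * (g + noise)
```

The two expressions differ only in whether the noise term itself is scaled by the learning rate.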
The diff continues:

```diff
+
+  where :math:`N(0, \sigma_t^2)` represents Gaussian noise with zero mean and a
+  variance of :math:`\sigma_t^2`.
+
+  The variance of this noise decays over time according to the formula
+
+  .. math::
+    \sigma_t^2 = \frac{\eta}{(1+t)^\gamma},
+
+  where :math:`\gamma` is the decay rate parameter ``gamma`` and :math:`\eta`
+  represents the initial variance ``eta``.
-
-  It has been found that adding noise to the gradients can improve
-  both the training error and the generalization error in very deep networks.
 
   Examples:
     >>> import optax
     >>> import jax
```
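For context, a minimal runnable sketch of the function being documented, using the defaults shown in the diff (``eta=0.01``, ``gamma=0.55``); the quadratic loss and parameter values are illustrative:

```python
import jax
import jax.numpy as jnp
import optax

def loss(params):
  # Simple quadratic objective, purely illustrative.
  return jnp.sum(params ** 2)

params = jnp.array([1.0, 2.0, 3.0])
opt = optax.noisy_sgd(learning_rate=0.1, eta=0.01, gamma=0.55, seed=0)
state = opt.init(params)

for t in range(3):
  grads = jax.grad(loss)(params)
  updates, state = opt.update(grads, state)
  params = optax.apply_updates(params, updates)
  # Noise variance at this step, per the docstring: eta / (1 + t) ** gamma.
  print(t, float(loss(params)), 0.01 / (1 + t) ** 0.55)
```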
Reviewer: First line needs to end with ".", so I would leave the first line as it is and add your description below.
Reviewer: And the line after should be blank, so something like the sketch that follows.
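A hypothetical sketch of the shape being asked for, per PEP 257: a one-line summary ending in a period, a blank line, then the longer description (the signature is simplified):

```python
def noisy_sgd(learning_rate, eta=0.01, gamma=0.55, seed=0):
  r"""A variant of SGD with added noise.

  Noisy SGD is a variant of :func:`optax.sgd` that incorporates Gaussian
  noise into the updates; the longer description continues here, after
  the blank line that follows the one-line summary.
  """
```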
Author: Could you clarify whether the requirement for the first line to end with "." is driven by stylistic guidelines or technical reasons? The list of optimizers at the beginning of the documentation isn't affected and displays the first sentence of the docstring correctly.
Reviewer: It's a requirement of the Python docstring style: https://peps.python.org/pep-0257/#multi-line-docstrings. Internally, we get an error when importing the code if the docstring doesn't follow this convention.
Reviewer: You're right, though, that this would be something worth adding to https://optax.readthedocs.io/en/latest/development.html.