From 3db7a99f2fcb049dd07e5edf6a472d7af8322c8b Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Mon, 15 Jul 2024 21:50:44 +0530
Subject: [PATCH 1/8] feat: add a mathematical description of AdaGrad optimizer

---
 optax/_src/alias.py | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/optax/_src/alias.py b/optax/_src/alias.py
index 39e0d96f..42e04d87 100644
--- a/optax/_src/alias.py
+++ b/optax/_src/alias.py
@@ -282,10 +282,31 @@ def adagrad(
     initial_accumulator_value: float = 0.1,
     eps: float = 1e-7
 ) -> base.GradientTransformation:
-  """The Adagrad optimizer.
+  r"""The Adagrad optimizer.
 
-  Adagrad is an algorithm for gradient based optimization that anneals the
-  learning rate for each parameter during the course of training.
+  AdaGrad’s concept is to modify the learning rate for every parameter in a model
+  depending on the parameter’s previous gradients.
+
+  .. math::
+
+    w_{t+1}^{(i)} = w_{t}^{(i)} - \eta \frac{g_{t}^{(i)}}{\sqrt{\sum_{\tau=1}^{t} (g_{\tau}^{(i)})^2 + \epsilon}}
+
+  where:
+  - \( w_t^{(i)} \) is the parameter \( i \) at time step \( t \),
+  - \( \eta \) is the learning rate,
+  - \( g_t^{(i)} \) is the gradient of parameter \( i \) at time step \( t \),
+  - \( \epsilon \) is a small constant to ensure numerical stability.
+
+  When there is no regularization term, the update simplifies to:
+
+  .. math::
+
+    w_{t+1} = w_{t} - \eta \cdot \text{diag}(G)^{-\frac{1}{2}} \cdot g_t
+
+  where \( \text{diag}(G)^{-\frac{1}{2}} \) is a diagonal matrix with elements \( \frac{1}{\sqrt{\sum_{\tau=1}^{t} (g_{\tau}^{(i)})^2}} \).
+
+  This formulation ensures that each parameter update is scaled according to the accumulated sum of squared gradients,
+  effectively adapting the learning rate to each parameter's specific gradient behavior over time.
 
   .. warning::
     Adagrad's main limit is the monotonic accumulation of squared

From 2047885cbe93e9bdb035e16fa68250b6f43db71a Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Mon, 15 Jul 2024 21:57:14 +0530
Subject: [PATCH 2/8] chores: change definition

---
 optax/_src/alias.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/optax/_src/alias.py b/optax/_src/alias.py
index 42e04d87..6cc2c062 100644
--- a/optax/_src/alias.py
+++ b/optax/_src/alias.py
@@ -284,8 +284,9 @@ def adagrad(
 ) -> base.GradientTransformation:
   r"""The Adagrad optimizer.
 
-  AdaGrad’s concept is to modify the learning rate for every parameter in a model
-  depending on the parameter’s previous gradients.
+  AdaGrad is a sub-gradient algorithm for stochastic optimization that adapts the learning rate individually for
+  each feature based on its gradient history. It assigns higher learning rates to infrequent features, ensuring
+  updates prioritize less frequent but potentially more informative parameters in the optimization process.
 
   .. math::
 
From 290192c8bc4766c4281901d42e659fe529293a03 Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Mon, 15 Jul 2024 22:08:14 +0530
Subject: [PATCH 3/8] Update optax/_src/alias.py

Co-authored-by: Fabian Pedregosa
---
 optax/_src/alias.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optax/_src/alias.py b/optax/_src/alias.py
index 6cc2c062..dd851951 100644
--- a/optax/_src/alias.py
+++ b/optax/_src/alias.py
@@ -287,7 +287,7 @@ def adagrad(
   AdaGrad is a sub-gradient algorithm for stochastic optimization that adapts the learning rate individually for
   each feature based on its gradient history. It assigns higher learning rates to infrequent features, ensuring
   updates prioritize less frequent but potentially more informative parameters in the optimization process.
-
+The updated parameters adopt the form:
   .. math::
 
     w_{t+1}^{(i)} = w_{t}^{(i)} - \eta \frac{g_{t}^{(i)}}{\sqrt{\sum_{\tau=1}^{t} (g_{\tau}^{(i)})^2 + \epsilon}}

From 6f5d4d4dea882eac2b8dc3be4ea69903cb7b1be0 Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Mon, 15 Jul 2024 22:11:49 +0530
Subject: [PATCH 4/8] chores: minor fixes

---
 optax/_src/alias.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/optax/_src/alias.py b/optax/_src/alias.py
index dd851951..1c6d0610 100644
--- a/optax/_src/alias.py
+++ b/optax/_src/alias.py
@@ -284,13 +284,17 @@ def adagrad(
 ) -> base.GradientTransformation:
   r"""The Adagrad optimizer.
 
-  AdaGrad is a sub-gradient algorithm for stochastic optimization that adapts the learning rate individually for
-  each feature based on its gradient history. It assigns higher learning rates to infrequent features, ensuring
-  updates prioritize less frequent but potentially more informative parameters in the optimization process.
-The updated parameters adopt the form:
+  AdaGrad is a sub-gradient algorithm for stochastic optimization that adapts
+  the learning rate individually for each feature based on its gradient history.
+  It assigns higher learning rates to infrequent features, ensuring updates
+  prioritize less frequent but potentially more informative parameters in the
+  optimization process.
+
+  The updated parameters adopt the form:
   .. math::
 
-    w_{t+1}^{(i)} = w_{t}^{(i)} - \eta \frac{g_{t}^{(i)}}{\sqrt{\sum_{\tau=1}^{t} (g_{\tau}^{(i)})^2 + \epsilon}}
+    w_{t+1}^{(i)} = w_{t}^{(i)} - \eta \frac{g_{t}^{(i)}}
+    {\sqrt{\sum_{\tau=1}^{t} (g_{\tau}^{(i)})^2 + \epsilon}}
 
   where:
   - \( w_t^{(i)} \) is the parameter \( i \) at time step \( t \),
@@ -304,10 +308,12 @@ def adagrad(
 
     w_{t+1} = w_{t} - \eta \cdot \text{diag}(G)^{-\frac{1}{2}} \cdot g_t
 
-  where \( \text{diag}(G)^{-\frac{1}{2}} \) is a diagonal matrix with elements \( \frac{1}{\sqrt{\sum_{\tau=1}^{t} (g_{\tau}^{(i)})^2}} \).
+  where \( \text{diag}(G)^{-\frac{1}{2}} \) is a diagonal matrix with elements
+  \( \frac{1}{\sqrt{\sum_{\tau=1}^{t} (g_{\tau}^{(i)})^2}} \).
 
-  This formulation ensures that each parameter update is scaled according to the accumulated sum of squared gradients,
-  effectively adapting the learning rate to each parameter's specific gradient behavior over time.
+  This formulation ensures that each parameter update is scaled according to
+  the accumulated sum of squared gradients, effectively adapting the learning
+  rate to each parameter's specific gradient behavior over time.
 
   .. warning::
     Adagrad's main limit is the monotonic accumulation of squared

From b2240ad8eae3fa7a4d2788d5ee46f395c24d212d Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Tue, 16 Jul 2024 14:13:50 +0530
Subject: [PATCH 5/8] chores: remove extra info

---
 optax/_src/alias.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/optax/_src/alias.py b/optax/_src/alias.py
index 1c6d0610..9c856bfa 100644
--- a/optax/_src/alias.py
+++ b/optax/_src/alias.py
@@ -286,9 +286,6 @@ def adagrad(
 
   AdaGrad is a sub-gradient algorithm for stochastic optimization that adapts
   the learning rate individually for each feature based on its gradient history.
-  It assigns higher learning rates to infrequent features, ensuring updates
-  prioritize less frequent but potentially more informative parameters in the
-  optimization process.
 
   The updated parameters adopt the form:
   .. math::

From 05d14b9dea7f5c9fd7cbb9b4dd0e848daf82f053 Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Tue, 16 Jul 2024 14:23:03 +0530
Subject: [PATCH 6/8] chores: replaced paper version

---
 optax/_src/alias.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/optax/_src/alias.py b/optax/_src/alias.py
index 9c856bfa..110228b8 100644
--- a/optax/_src/alias.py
+++ b/optax/_src/alias.py
@@ -301,16 +301,14 @@ def adagrad(
 
   When there is no regularization term, the update simplifies to:
 
-  .. math::
-
-    w_{t+1} = w_{t} - \eta \cdot \text{diag}(G)^{-\frac{1}{2}} \cdot g_t
+  Defining \(G = \sum_{t=1}^\tau g_t g_t^\top\), the update can be written as
 
-  where \( \text{diag}(G)^{-\frac{1}{2}} \) is a diagonal matrix with elements
-  \( \frac{1}{\sqrt{\sum_{\tau=1}^{t} (g_{\tau}^{(i)})^2}} \).
+  .. math::
+
+    w_{t+1} = w_{t} - \eta \cdot \text{diag}(G + \epsilon I)^{-1/2} \cdot g_t
 
-  This formulation ensures that each parameter update is scaled according to
-  the accumulated sum of squared gradients, effectively adapting the learning
-  rate to each parameter's specific gradient behavior over time.
+  where \(\text{diag} (G) = (G_{ii})_{i=1}^p\) is the vector of diagonal entries of
+  \(G \in \mathbb{R}^p\) and \(I\) is the identity matrix in \(\mathbb{R}^p\).
 
   .. warning::
     Adagrad's main limit is the monotonic accumulation of squared

From ae8e7bbe9b9514aafefbd472ea13f7de0d36bf1f Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Tue, 16 Jul 2024 14:31:07 +0530
Subject: [PATCH 7/8] chores: fix CI

---
 optax/_src/alias.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/optax/_src/alias.py b/optax/_src/alias.py
index 110228b8..b00ff4a5 100644
--- a/optax/_src/alias.py
+++ b/optax/_src/alias.py
@@ -305,10 +305,11 @@ def adagrad(
 
   .. math::
 
-    w_{t+1} = w_{t} - \eta \cdot \text{diag}(G + \epsilon I)^{-1/2} \cdot g_t
+    w_{t+1} = w_{t} - \eta \cdot \text{diag}(G + \epsilon I)^{-1/2} \cdot g_t
 
-  where \(\text{diag} (G) = (G_{ii})_{i=1}^p\) is the vector of diagonal entries of
-  \(G \in \mathbb{R}^p\) and \(I\) is the identity matrix in \(\mathbb{R}^p\).
+  where \(\text{diag} (G) = (G_{ii})_{i=1}^p\) is the vector of diagonal
+  entries of \(G \in \mathbb{R}^p\) and \(I\) is the identity matrix
+  in \(\mathbb{R}^p\).
 
   .. warning::
     Adagrad's main limit is the monotonic accumulation of squared

From 199e1d1c5ba63aff416306c5de454220daba37a6 Mon Sep 17 00:00:00 2001
From: Abhinav kumar <126642111+Abhinavcode13@users.noreply.github.com>
Date: Wed, 17 Jul 2024 13:46:15 +0530
Subject: [PATCH 8/8] chores: fix terms

---
 optax/_src/alias.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/optax/_src/alias.py b/optax/_src/alias.py
index b00ff4a5..ca2366a5 100644
--- a/optax/_src/alias.py
+++ b/optax/_src/alias.py
@@ -299,8 +299,6 @@ def adagrad(
   - \( g_t^{(i)} \) is the gradient of parameter \( i \) at time step \( t \),
   - \( \epsilon \) is a small constant to ensure numerical stability.
 
-  When there is no regularization term, the update simplifies to:
-
   Defining \(G = \sum_{t=1}^\tau g_t g_t^\top\), the update can be written as
 
   .. math::
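
Example (not taken from the patches above): the docstring formula translates directly into code. The sketch below is a minimal illustration; the toy data X and y, the quadratic loss, the step size, and the manual_adagrad_step helper are assumptions made for this example only. It applies the per-coordinate rule w_{t+1} = w_t - eta * g_t / sqrt(accumulated squared gradients + eps) by hand, next to the optax.adagrad transformation whose signature appears in the diffs above; the two parameter trajectories should agree closely.

import jax
import jax.numpy as jnp
import optax

# Toy least-squares problem (illustrative data only).
X = jnp.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y = jnp.array([1.0, 2.0, 3.0])

def loss(w):
  return jnp.mean((X @ w - y) ** 2)

lr, eps, init_acc = 0.1, 1e-7, 0.1

def manual_adagrad_step(w, acc, g):
  # Accumulate squared gradients, then scale each coordinate's step by
  # 1 / sqrt(accumulated squared gradients + eps), as in the docstring formula.
  acc = acc + g ** 2
  return w - lr * g / jnp.sqrt(acc + eps), acc

w_manual = jnp.zeros(2)
acc = jnp.full_like(w_manual, init_acc)  # start at initial_accumulator_value

# The same update via the library transformation.
opt = optax.adagrad(learning_rate=lr, initial_accumulator_value=init_acc, eps=eps)
w_optax = jnp.zeros(2)
opt_state = opt.init(w_optax)

for _ in range(5):
  g = jax.grad(loss)(w_manual)
  w_manual, acc = manual_adagrad_step(w_manual, acc, g)

  g = jax.grad(loss)(w_optax)
  updates, opt_state = opt.update(g, opt_state)
  w_optax = optax.apply_updates(w_optax, updates)

print(w_manual)
print(w_optax)  # the two trajectories should agree closely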