From b09672650b69d4494f87be19eb5219cb6bf5cc07 Mon Sep 17 00:00:00 2001
From: runame <re393@cam.ac.uk>
Date: Mon, 16 Sep 2024 23:30:13 -0400
Subject: [PATCH] Add inverse EKFAC support

---
 curvlinops/inverse.py | 195 ++++++++++++++++--------------------------
 1 file changed, 72 insertions(+), 123 deletions(-)

diff --git a/curvlinops/inverse.py b/curvlinops/inverse.py
index cd5482b..4b563a5 100644
--- a/curvlinops/inverse.py
+++ b/curvlinops/inverse.py
@@ -10,11 +10,7 @@
 from torch import Tensor, cat, cholesky_inverse, eye, float64, outer
 from torch.linalg import cholesky, eigh
 
-from curvlinops.kfac import KFACLinearOperator, ParameterMatrixType
-
-KFACInvType = TypeVar(
-    "KFACInvType", Optional[Tensor], Tuple[Optional[Tensor], Optional[Tensor]]
-)
+from curvlinops.kfac import KFACLinearOperator, KFACType, ParameterMatrixType
 
 
 class _InverseLinearOperator(LinearOperator):
@@ -355,6 +351,8 @@ def __init__(
             raise ValueError(
                 "Heuristic and exact damping require a single damping value."
             )
+        if self._A._correct_eigenvalues and not use_exact_damping:
+            raise ValueError("Only exact damping is supported for EKFAC.")
 
         self._damping = damping
         self._use_heuristic_damping = use_heuristic_damping
@@ -362,8 +360,8 @@ def __init__(
         self._use_exact_damping = use_exact_damping
         self._cache = cache
         self._retry_double_precision = retry_double_precision
-        self._inverse_input_covariances: Dict[str, KFACInvType] = {}
-        self._inverse_gradient_covariances: Dict[str, KFACInvType] = {}
+        self._inverse_input_covariances: Dict[str, KFACType] = {}
+        self._inverse_gradient_covariances: Dict[str, KFACType] = {}
 
     def _compute_damping(
         self, aaT: Optional[Tensor], ggT: Optional[Tensor]
@@ -408,18 +406,20 @@ def _damped_cholesky(self, M: Tensor, damping: float) -> Tensor:
         )
 
     def _compute_inverse_factors(
-        self, aaT: Optional[Tensor], ggT: Optional[Tensor]
-    ) -> Tuple[KFACInvType, KFACInvType]:
+        self, aaT: Optional[Tensor], ggT: Optional[Tensor], name: str
+    ) -> Tuple[KFACType, KFACType, Optional[Tensor]]:
         """Compute the inverses of the Kronecker factors for a given layer.
 
         Args:
             aaT: Input covariance matrix. ``None`` for biases.
             ggT: Gradient covariance matrix.
+            name: Name of the layer for which to invert Kronecker factors.
 
         Returns:
             Tuple of inverses (or eigendecompositions) of the input and gradient
-            covariance Kronecker factors. Can be ``None`` if the input or gradient
-            covariance is ``None`` (e.g. the input covariances for biases).
+            covariance Kronecker factors, followed by the inverse damped eigenvalues
+            (``None`` for plain inverses). A factor can be ``None`` if its covariance is
+            ``None`` (e.g. the input covariances for biases).
 
         Raises:
             RuntimeError: If a Cholesky decomposition (and optionally the retry in
@@ -430,7 +430,27 @@ def _compute_inverse_factors(
             # Kronecker-factored eigenbasis (KFE).
             aaT_eigvals, aaT_eigvecs = (None, None) if aaT is None else eigh(aaT)
             ggT_eigvals, ggT_eigvecs = (None, None) if ggT is None else eigh(ggT)
-            return (aaT_eigvecs, aaT_eigvals), (ggT_eigvecs, ggT_eigvals)
+            param_pos = self._A._mapping[name]
+            if (
+                not self._A._separate_weight_and_bias
+                and "weight" in param_pos
+                and "bias" in param_pos
+            ):
+                inv_damped_eigenvalues = (
+                    outer(ggT_eigvals, aaT_eigvals).add_(self._damping).pow_(-1)
+                )
+            else:
+                inv_damped_eigenvalues = {}
+                for p_name, pos in param_pos.items():
+                    if p_name == "weight":
+                        inv_damped_eigenvalues[pos] = (
+                            outer(ggT_eigvals, aaT_eigvals).add_(self._damping).pow_(-1)
+                        )
+                    else:
+                        inv_damped_eigenvalues[pos] = ggT_eigvals.add(
+                            self._damping
+                        ).pow_(-1)
+            return aaT_eigvecs, ggT_eigvecs, inv_damped_eigenvalues
         else:
             damping_aaT, damping_ggT = self._compute_damping(aaT, ggT)
 
@@ -476,11 +496,11 @@ def _compute_inverse_factors(
                     raise error
             ggT_inv = None if ggT_chol is None else cholesky_inverse(ggT_chol)
 
-            return aaT_inv, ggT_inv
+            return aaT_inv, ggT_inv, None
 
     def _compute_or_get_cached_inverse(
         self, name: str
-    ) -> Tuple[KFACInvType, KFACInvType]:
+    ) -> Tuple[KFACType, KFACType, Optional[Tensor]]:
         """Invert the Kronecker factors of the KFACLinearOperator or retrieve them.
 
         Args:
@@ -488,117 +508,37 @@ def _compute_or_get_cached_inverse(
 
         Returns:
             Tuple of inverses (or eigendecompositions) of the input and gradient
-            covariance Kronecker factors. Can be ``None`` if the input or gradient
-            covariance is ``None`` (e.g. the input covariances for biases).
+            covariance Kronecker factors, followed by the inverse damped eigenvalues
+            (``None`` for plain inverses and on cache hits). A factor can be ``None`` if
+            its covariance is ``None`` (e.g. the input covariances for biases).
         """
         if name in self._inverse_input_covariances:
             aaT_inv = self._inverse_input_covariances.get(name)
             ggT_inv = self._inverse_gradient_covariances.get(name)
-            return aaT_inv, ggT_inv
-
-        aaT = self._A._input_covariances.get(name)
-        ggT = self._A._gradient_covariances.get(name)
-        aaT_inv, ggT_inv = self._compute_inverse_factors(aaT, ggT)
-
-        if self._cache:
-            self._inverse_input_covariances[name] = aaT_inv
-            self._inverse_gradient_covariances[name] = ggT_inv
-
-        return aaT_inv, ggT_inv
-
-    def _left_and_right_multiply(
-        self, M_joint: Tensor, aaT_inv: KFACInvType, ggT_inv: KFACInvType
-    ) -> Tensor:
-        """Left and right multiply matrix with inverse Kronecker factors.
-
-        Args:
-            M_joint: Matrix for multiplication.
-            aaT_inv: Inverse of the input covariance Kronecker factor. ``None`` for
-                biases.
-            ggT_inv: Inverse of the gradient covariance Kronecker factor.
-
-        Returns:
-            Matrix-multiplication result ``KFAC⁻¹ @ M_joint``.
-        """
-        if self._use_exact_damping:
-            # Perform damped preconditioning in KFE, e.g. see equation (21) in
-            # https://arxiv.org/abs/2308.03296.
-            aaT_eigvecs, aaT_eigvals = aaT_inv
-            ggT_eigvecs, ggT_eigvals = ggT_inv
-            # Transform in eigenbasis.
-            M_joint = einsum(
-                ggT_eigvecs, M_joint, aaT_eigvecs, "i j, m i k, k l -> m j l"
-            )
-            # Divide by damped eigenvalues to perform the inversion.
-            M_joint.div_(outer(ggT_eigvals, aaT_eigvals).add_(self._damping))
-            # Transform back to standard basis.
-            M_joint = einsum(
-                ggT_eigvecs, M_joint, aaT_eigvecs, "i j, m j k, l k -> m i l"
-            )
+            return aaT_inv, ggT_inv, None
+
+        if self._A._correct_eigenvalues:
+            aaT_inv = self._A._input_covariances_eigenvectors.get(name)
+            ggT_inv = self._A._gradient_covariances_eigenvectors.get(name)
+            eigenvalues = self._A._corrected_eigenvalues.get(name)
+            if isinstance(eigenvalues, dict):
+                inv_damped_eigenvalues = {}
+                for key, val in eigenvalues.items():
+                    inv_damped_eigenvalues[key] = val.add(self._damping).pow_(-1)
+            elif isinstance(eigenvalues, Tensor):
+                inv_damped_eigenvalues = eigenvalues.add(self._damping).pow_(-1)
         else:
-            M_joint = einsum(ggT_inv, M_joint, aaT_inv, "i j, m j k, k l -> m i l")
-        return M_joint
-
-    def _separate_left_and_right_multiply(
-        self,
-        M_torch: Tensor,
-        param_pos: Dict[str, int],
-        aaT_inv: KFACInvType,
-        ggT_inv: KFACInvType,
-    ) -> Tensor:
-        """Multiply matrix with inverse Kronecker factors for separated weight and bias.
-
-        Args:
-            M_torch: Matrix for multiplication.
-            param_pos: Dictionary with positions of the weight and bias parameters.
-            aaT_inv: Inverse of the input covariance Kronecker factor. ``None`` for
-                biases.
-            ggT_inv: Inverse of the gradient covariance Kronecker factor.
-
-        Returns:
-            Matrix-multiplication result ``KFAC⁻¹ @ M_torch``.
-        """
-        if self._use_exact_damping:
-            # Perform damped preconditioning in KFE, e.g. see equation (21) in
-            # https://arxiv.org/abs/2308.03296.
-            aaT_eigvecs, aaT_eigvals = aaT_inv
-            ggT_eigvecs, ggT_eigvals = ggT_inv
-
-        for p_name, pos in param_pos.items():
-            # for weights we need to multiply from the right with aaT
-            # for weights and biases we need to multiply from the left with ggT
-            if p_name == "weight":
-                M_w = rearrange(M_torch[pos], "m c_out ... -> m c_out (...)")
-                aaT_fac = aaT_eigvecs if self._use_exact_damping else aaT_inv
-                # If `use_exact_damping` is `True`, we transform to eigenbasis
-                M_torch[pos] = einsum(M_w, aaT_fac, "m i j, j k -> m i k")
-
-            ggT_fac = ggT_eigvecs if self._use_exact_damping else ggT_inv
-            dims = (
-                "m i ... -> m j ..."
-                if self._use_exact_damping
-                else " m j ... -> m i ..."
+            aaT = self._A._input_covariances.get(name)
+            ggT = self._A._gradient_covariances.get(name)
+            aaT_inv, ggT_inv, inv_damped_eigenvalues = self._compute_inverse_factors(
+                aaT, ggT, name
             )
-            # If `use_exact_damping` is `True`, we transform to eigenbasis
-            M_torch[pos] = einsum(ggT_fac, M_torch[pos], f"i j, {dims}")
-
-            if self._use_exact_damping:
-                # Divide by damped eigenvalues to perform the inversion and transform
-                # back to standard basis.
-                if p_name == "weight":
-                    M_torch[pos].div_(
-                        outer(ggT_eigvals, aaT_eigvals).add_(self._damping)
-                    )
-                    M_torch[pos] = einsum(
-                        M_torch[pos], aaT_eigvecs, "m i j, k j -> m i k"
-                    )
-                else:
-                    M_torch[pos].div_(ggT_eigvals.add_(self._damping))
-                M_torch[pos] = einsum(
-                    ggT_eigvecs, M_torch[pos], "i j, m j ... -> m i ..."
-                )
 
-        return M_torch
+            if self._cache:
+                self._inverse_input_covariances[name] = aaT_inv
+                self._inverse_gradient_covariances[name] = ggT_inv
+
+        return aaT_inv, ggT_inv, inv_damped_eigenvalues
 
     def torch_matmat(self, M_torch: ParameterMatrixType) -> ParameterMatrixType:
         """Apply the inverse of KFAC to a matrix (multiple vectors) in PyTorch.
@@ -621,12 +561,19 @@ def torch_matmat(self, M_torch: ParameterMatrixType) -> ParameterMatrixType:
             ``[D, K]`` with some ``K``.
         """
         return_tensor, M_torch = self._A._check_input_type_and_preprocess(M_torch)
-        if not self._A._input_covariances and not self._A._gradient_covariances:
+        if (
+            not self._A._input_covariances
+            and not self._A._gradient_covariances
+            and not self._A._input_covariances_eigenvectors
+            and not self._A._gradient_covariances_eigenvectors
+        ):
             self._A._compute_kfac()
 
         for mod_name, param_pos in self._A._mapping.items():
             # retrieve the inverses of the Kronecker factors from cache or invert them
-            aaT_inv, ggT_inv = self._compute_or_get_cached_inverse(mod_name)
+            aaT_inv, ggT_inv, inv_damped_eigenvalues = (
+                self._compute_or_get_cached_inverse(mod_name)
+            )
             # cache the weight shape to ensure correct shapes are returned
             if "weight" in param_pos:
                 weight_shape = M_torch[param_pos["weight"]].shape
@@ -640,12 +587,14 @@ def torch_matmat(self, M_torch: ParameterMatrixType) -> ParameterMatrixType:
                 w_pos, b_pos = param_pos["weight"], param_pos["bias"]
                 M_w = rearrange(M_torch[w_pos], "m c_out ... -> m c_out (...)")
                 M_joint = cat([M_w, M_torch[b_pos].unsqueeze(2)], dim=2)
-                M_joint = self._left_and_right_multiply(M_joint, aaT_inv, ggT_inv)
+                M_joint = self._A._left_and_right_multiply(
+                    M_joint, aaT_inv, ggT_inv, inv_damped_eigenvalues
+                )
                 w_cols = M_w.shape[2]
                 M_torch[w_pos], M_torch[b_pos] = M_joint.split([w_cols, 1], dim=2)
             else:
-                M_torch = self._separate_left_and_right_multiply(
-                    M_torch, param_pos, aaT_inv, ggT_inv
+                M_torch = self._A._separate_left_and_right_multiply(
+                    M_torch, param_pos, aaT_inv, ggT_inv, inv_damped_eigenvalues
                 )
 
             # restore original shapes