back-propagating Gramians #157
PierreQuinton started this conversation in Ideas (Closed · 0 replies)
We need to explore the possibility of traversing the PyTorch autograd graph to be able to backpropagate Gramians. Here is some code that implements a simplified version of `backward` (no freeing of the graph and such), with an example:
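The original snippet is not reproduced here; below is a minimal sketch of the same idea, assuming a scalar output and single-output `grad_fn` nodes. It walks the graph through `grad_fn.next_functions` in topological order and calls each node directly (the helper names `simple_backward` and `_incoming_counts` are hypothetical, not part of any library):

```python
from collections import defaultdict, deque

import torch


def _incoming_counts(root):
    # Count the incoming edges of every node so that a node fires only
    # once all of its gradient contributions have been accumulated.
    counts = defaultdict(int)
    seen, queue = {root}, deque([root])
    while queue:
        node = queue.popleft()
        for child, _ in node.next_functions:
            if child is None:
                continue
            counts[child] += 1
            if child not in seen:
                seen.add(child)
                queue.append(child)
    return counts


def simple_backward(output: torch.Tensor) -> None:
    # Simplified backward: scalar output, single-output nodes,
    # and the graph is never freed.
    root = output.grad_fn
    pending = _incoming_counts(root)
    grads = {root: torch.ones_like(output)}
    ready = deque([root])
    while ready:
        node = ready.popleft()
        out_grads = node(grads.pop(node))  # run this node's backward
        if not isinstance(out_grads, tuple):
            out_grads = (out_grads,)
        for (child, _), g in zip(node.next_functions, out_grads):
            if child is None or g is None:
                continue
            pending[child] -= 1
            if hasattr(child, "variable"):
                # AccumulateGrad node: accumulate into the leaf's .grad
                leaf = child.variable
                leaf.grad = g if leaf.grad is None else leaf.grad + g
            else:
                grads[child] = g if child not in grads else grads[child] + g
                if pending[child] == 0:
                    ready.append(child)


a = torch.randn(3, requires_grad=True)
loss = (a * a).sum()
simple_backward(loss)
print(torch.allclose(a.grad, 2 * a))  # True
```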
Here is some code that sometimes works for backpropagating the Gramian:
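A hedged sketch of this approach (the `gramian` helper is hypothetical): materialize the full Jacobian of `output` w.r.t. the parameters row by row with `torch.autograd.grad`, then form the Gramian $J J^\top$:

```python
import torch


def gramian(output: torch.Tensor, params: list[torch.Tensor]) -> torch.Tensor:
    # Build the Jacobian one output coordinate at a time, then return J J^T.
    rows = []
    flat_out = output.reshape(-1)
    for i in range(flat_out.numel()):
        grads = torch.autograd.grad(
            flat_out[i], params, retain_graph=True, allow_unused=True
        )
        row = torch.cat(
            [
                (g if g is not None else torch.zeros_like(p)).reshape(-1)
                for g, p in zip(grads, params)
            ]
        )
        rows.append(row)
    jac = torch.stack(rows)  # full Jacobian, shape (m, n_params)
    return jac @ jac.T       # Gramian of the Jacobian, shape (m, m)


a = torch.randn(3, requires_grad=True)
y = a + a
G = gramian(y, [a])  # J = 2 I here, so G = 4 I
```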
Observe that we had to wait until the Jacobian was fully backpropagated before computing the Gramian. Indeed, if the tensor is `a + a`, we have to wait until the full Jacobian of the output w.r.t. `a` is computed, since the map from a Jacobian to its Gramian is not linear: if two paths contribute Jacobians $J_1$ and $J_2$, then $(J_1 + J_2)(J_1 + J_2)^\top = J_1 J_1^\top + J_1 J_2^\top + J_2 J_1^\top + J_2 J_2^\top$, so accumulating per-path Gramians would drop the cross terms. I think this makes such a low-level implementation useless, since we want to consider only sequential compositions of models that have different parameters. Maybe I can still find a nice theoretical idea on how to work around this non-linearity.
Also related: