Add Rotary Positional Embeddings (RoPE) - part 2 of parallel attention blocks #450

Open · wants to merge 9 commits into base: main
40 changes: 39 additions & 1 deletion tests/modules/layers/test_position_embedding.py
@@ -4,13 +4,16 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import math

import pytest
import torch
from tests.test_utils import assert_expected
from torch import nn

from torchmultimodal.modules.layers.position_embedding import (
BroadcastedPositionEmbedding,
RotaryPositionalEmbeddings,
SinusoidalPositionEmbeddings,
)

@@ -112,3 +115,38 @@ def test_forward(self, data, emb):
actual = emb(data)
expected = torch.Size([3, 5])
assert_expected(actual.shape, expected)


def test_rotary_embeddings_math():
Contributor: Can we put these unit tests into a class? (Similar to the other tests in this file)

Contributor Author: yes, will do.

q = (
torch.tensor([[1, 0], [1, 0]], dtype=torch.float).unsqueeze(0).unsqueeze(0)
) # b h s e

k = 2 * torch.tensor([[1, 0], [1, 0]], dtype=torch.float).unsqueeze(0).unsqueeze(
0
) # b h s e

rotary_embeddings = RotaryPositionalEmbeddings(2, 2, 1)
qr, kr = rotary_embeddings(q, k, 0)
rot0 = torch.tensor([[math.cos(0), -math.sin(0)], [math.sin(0), math.cos(0)]])
rot1 = torch.tensor([[math.cos(1), -math.sin(1)], [math.sin(1), math.cos(1)]])

assert_expected(torch.matmul(rot0, q[..., 0, :].squeeze()), qr[..., 0, :].squeeze())
assert_expected(torch.matmul(rot1, q[..., 1, :].squeeze()), qr[..., 1, :].squeeze())
assert_expected(torch.matmul(rot0, k[..., 0, :].squeeze()), kr[..., 0, :].squeeze())
assert_expected(torch.matmul(rot1, k[..., 1, :].squeeze()), kr[..., 1, :].squeeze())


def test_rotary_embeddings_left_padding():
q = torch.ones(2, 1, 4, 16, dtype=torch.float) # b h s e
k = 2 * torch.ones(2, 1, 4, 16, dtype=torch.float) # b h s e
rotary_embeddings = RotaryPositionalEmbeddings(16, 32)

qr, kr = rotary_embeddings(q, k, 0)
qr2, kr2 = rotary_embeddings(q, k, torch.tensor([0, 1]))

assert_expected(qr[0], qr2[0])
assert_expected(qr[0, :, 1], qr2[1, :, 0])

assert_expected(kr[0], kr2[0])
assert_expected(kr[0, :, 1], kr2[1, :, 0])
Contributor: Can we also add a test for updating the cached frequencies? (As far as I can tell this second test is not hitting that block in L262-268, lmk if I'm misunderstanding)

Contributor Author: yes, that's a good idea.
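
Picking up both suggestions above (grouping the tests into a class and exercising the cache update), a rough sketch of what that could look like; the class and test names are hypothetical and not part of this PR, the two functions above would become methods of the same class, and the checks assume the cached_freqs buffer and max_seq_len_cached attribute from the current diff stay observable:

class TestRotaryPositionalEmbeddings:
    def test_cached_freqs_update(self):
        # The cache is precomputed for 4 positions at construction time.
        rotary_embeddings = RotaryPositionalEmbeddings(16, max_position_embeddings=4)
        assert rotary_embeddings.cached_freqs.shape[0] == 4

        q = torch.ones(1, 1, 4, 16, dtype=torch.float)  # b h s e
        k = 2 * torch.ones(1, 1, 4, 16, dtype=torch.float)  # b h s e

        # start_pos + seq_len = 2 + 4 runs past the 4 cached positions, so
        # forward should recompute the cache to cover 6 positions.
        rotary_embeddings(q, k, 2)
        assert rotary_embeddings.max_seq_len_cached == 6
        assert rotary_embeddings.cached_freqs.shape[0] == 6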

107 changes: 106 additions & 1 deletion torchmultimodal/modules/layers/position_embedding.py
@@ -5,7 +5,7 @@
# LICENSE file in the root directory of this source tree.

import itertools
from typing import Tuple
from typing import Optional, Tuple, Union

import torch
from torch import nn, Tensor
@@ -169,3 +169,108 @@ def forward(self, t: Tensor) -> Tensor:
if self.embed_dim % 2 == 1:
embeddings = nn.functional.pad(embeddings, (0, 1))
return embeddings


class RotaryPositionalEmbeddings(nn.Module):
def __init__(
self,
dim: int,
max_position_embeddings: Union[int, float] = 2048,
ratio: int = 10000,
device: Optional[torch.device] = None,
):
"""
Implements Rotary Positional Embeddings (RoPE)
proposed in: https://arxiv.org/abs/2104.09864

Args
----
dim : int
Per-head embedding dimension
max_position_embeddings : int
Maximum expected sequence length for the model; if exceeded, the cached freqs will be recomputed
ratio : int
The ratio for the geometric progression used to compute the rotation angles
"""
Contributor: It'd be nice to add more in the docstring on the exact details of these embeddings, e.g. at least the [[cos, -sin], [sin, cos]] matrix and maybe even a small example (like the simple 2D one you wrote for the unit test)
super().__init__()
self.register_buffer(
"freqs",
1.0
/ (
ratio
** (torch.arange(0, dim, 2, device=device)[: (dim // 2)].float() / dim)
),
)
self.compute_freqs_cis(max_position_embeddings)

def compute_freqs_cis(
Contributor: Random q: what does cis mean here?

Contributor Author: it's short form for the rotation transform: technically it's doing e^(alpha*i) = cos(alpha) + i * sin(alpha), or shortened, cos + i * sin = cis.

Contributor Author: should probably add that in the docstring actually, otherwise it's too cryptic.

self, max_position_embeddings: Union[int, float] = 2048
) -> None:
t = torch.arange(
max_position_embeddings, device=self.freqs.device, dtype=self.freqs.dtype
)
freqs = torch.outer(t, self.freqs).float()
self.max_seq_len_cached = max_position_embeddings
self.register_buffer(
"cached_freqs",
torch.stack(
[
torch.cos(freqs),
-torch.sin(freqs),
torch.sin(freqs),
torch.cos(freqs),
],
dim=2,
).view(*freqs.shape, 2, 2),
)

def reshape_for_broadcast(
self, x: torch.Tensor, cur_freqs: torch.Tensor
) -> torch.Tensor:
ndim = x.ndim
assert 1 < ndim
assert cur_freqs.shape[:2] == (x.shape[2], x.shape[-2])
shape = [d if i == 2 or i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
return cur_freqs.view(*shape, 2)

def forward(
self,
q: torch.Tensor,
k: torch.Tensor,
start_pos: Union[int, float, torch.LongTensor],
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args
----
q : torch.Tensor
Embedded query tensor, expected size is B x H x S x Eh
k : torch.Tensor
Embedded key tensor, expected size is B x H x S x Eh
start_pos : Union[int, torch.LongTensor]
The starting position of the tokens encoded in q and k. This is important for
kv-caching and left-padding situations, where the rotation to be applied might
not always start at the pre-cached positions 0...S. For kv-caching without dynamic
batching, start_pos is shared across the whole batch.
"""
seq_len = q.shape[2]
q_ = q.float().reshape(*q.shape[:-1], -1, 2) # B H L D/2 2
k_ = k.float().reshape(*k.shape[:-1], -1, 2) # B H L D/2 2

if isinstance(start_pos, int):
if start_pos + seq_len > self.max_seq_len_cached:
Contributor: Some comments here about when the frequencies need to be recomputed might be helpful

Contributor Author: sounds good - offhand should be changing dtype, changing device, and resetting seq len > max_seq_len.

# The requested positions run past the precomputed table, so rebuild the
# cache to cover start_pos + seq_len positions before indexing into it.
self.compute_freqs_cis(start_pos + seq_len)
cur_freqs = self.cached_freqs[start_pos : start_pos + seq_len]
freqs = self.reshape_for_broadcast(q_, cur_freqs)
else:
max_start_pos = torch.max(start_pos).item()
if max_start_pos + seq_len > self.max_seq_len_cached:
self.compute_freqs_cis(max_start_pos + seq_len)
freqs_idxs = torch.arange(0, seq_len, dtype=torch.long).repeat(
start_pos.shape[0]
).view(-1, seq_len) + start_pos.view(-1, 1)
freqs = self.cached_freqs[freqs_idxs].unsqueeze(1)

freqs = freqs.float() # 1 1 L D/2 2 2
q_out = freqs.mul(q_.unsqueeze(-2)).sum(5).flatten(3)
k_out = freqs.mul(k_.unsqueeze(-2)).sum(5).flatten(3)
return q_out.type_as(q).contiguous(), k_out.type_as(k).contiguous()
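
Not part of the PR, but a small sketch tying together the "cis" discussion above and the two start_pos call patterns described in the forward docstring. The identity check uses plain Python complex numbers; the module call assumes the class as defined in this diff, and the tensor shapes are illustrative only.

import math

import torch

from torchmultimodal.modules.layers.position_embedding import RotaryPositionalEmbeddings

# "cis" identity: multiplying x + iy by e^(i*theta) = cos(theta) + i*sin(theta)
# is the same as applying the 2x2 rotation matrix [[cos, -sin], [sin, cos]]
# to the vector [x, y]; the module caches exactly these 2x2 blocks per position.
theta = 0.3
x, y = 1.0, 2.0
rotated_complex = complex(x, y) * complex(math.cos(theta), math.sin(theta))
rot = torch.tensor(
    [[math.cos(theta), -math.sin(theta)], [math.sin(theta), math.cos(theta)]]
)
rotated_vec = rot @ torch.tensor([x, y])
assert torch.allclose(
    rotated_vec, torch.tensor([rotated_complex.real, rotated_complex.imag])
)

# start_pos can be a shared int (kv-caching without dynamic batching) or a
# per-sample LongTensor (left padding / dynamic batching).
rope = RotaryPositionalEmbeddings(64)
q = torch.randn(2, 8, 16, 64)  # B x H x S x Eh
k = torch.randn(2, 8, 16, 64)
qr, kr = rope(q, k, 0)  # every sample starts at position 0
qr2, kr2 = rope(q, k, torch.tensor([0, 3]))  # second sample starts at position 3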