Add option to remove bias from q,k,v projections in MHA (#436)

Summary: Pull Request resolved: #436 Adds an option to remove bias q,k,v projections in MHA, This diff adds a `add_bias` arg to `MultiHeadAttentionWithCache`. The options defaults to True and will not change the behavior of existing use sites. Reviewed By: ankitade, pikapecan Differential Revision: D47594542 fbshipit-source-id: 667680dd238efb7ecc3531d28da9cfe8f631f2b4
facebookresearch · Jul 19, 2023 · ff92860 · ff92860
1 parent fe379e4
commit ff92860
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 5 deletions.
diff --git a/tests/modules/layers/test_multi_head_attention.py b/tests/modules/layers/test_multi_head_attention.py
@@ -70,6 +70,10 @@ def dim_kv(self):
     def q(self):
         return torch.Tensor([[[1, 2, 3, 1], [4, 3, 2, 1], [1, 1, 1, 1]]])
 
+    @pytest.fixture
+    def kv(self):
+        return torch.Tensor([[[3, 2], [1, 1]]])
+
     @pytest.fixture
     def current_key_value(self):
         return torch.Tensor(
@@ -106,6 +110,13 @@ def multi_head_cross_attn(self, dim_q, dim_kv):
         mha.eval()
         return mha
 
+    @pytest.fixture
+    def multi_head_cross_attn_without_bias(self, dim_q, dim_kv):
+        mha = MultiHeadAttentionWithCache(dim_q, dim_kv, num_heads=2, add_bias=False)
+        init_weights_with_constant(mha)
+        mha.eval()
+        return mha
+
     def test_multi_head_self_attention_use_cache(
         self,
         multi_head_self_attn_use_cache,
@@ -136,8 +147,7 @@ def test_multi_head_self_attention_use_cache(
             torch.cat([past_key_value, current_key_value], dim=2),
         )
 
-    def test_multi_head_cross_attention(self, multi_head_cross_attn, q):
-        kv = torch.Tensor([[[3, 2], [1, 1]]])
+    def test_multi_head_cross_attention(self, multi_head_cross_attn, q, kv):
         actual = multi_head_cross_attn(q, kv, kv)
         expected = torch.tensor(
             [
@@ -150,6 +160,21 @@ def test_multi_head_cross_attention(self, multi_head_cross_attn, q):
         )
         assert_expected(actual, expected, rtol=0, atol=1e-4)
 
+    def test_multi_head_cross_attention_without_bias(
+        self, multi_head_cross_attn_without_bias, q, kv
+    ):
+        actual = multi_head_cross_attn_without_bias(q, kv, kv)
+        expected = torch.tensor(
+            [
+                [
+                    [21.0, 21.0, 21.0, 21.0],
+                    [21.0, 21.0, 21.0, 21.0],
+                    [21.0, 21.0, 21.0, 21.0],
+                ],
+            ]
+        )
+        assert_expected(actual, expected, rtol=0, atol=1e-4)
+
     def test_scripting(
         self,
         multi_head_self_attn_use_cache,

diff --git a/torchmultimodal/modules/layers/multi_head_attention.py b/torchmultimodal/modules/layers/multi_head_attention.py
@@ -89,6 +89,8 @@ class MultiHeadAttentionWithCache(nn.Module):
             same as dim_q for SA; equals to encoder dimension for cross-attention
         num_heads (int): number of attention heads
         dropout (float): dropout rate
+        add_bias (bool): if true, adds a learnable bias to query, key, value.
+            Defaults to True.
     """
 
     def __init__(
@@ -97,12 +99,13 @@ def __init__(
         dim_kv: int,
         num_heads: int,
         dropout: float = 0.0,
+        add_bias: bool = True,
     ) -> None:
         super().__init__()
         self.num_heads = num_heads
-        self.q_proj = nn.Linear(dim_q, dim_q)
-        self.k_proj = nn.Linear(dim_kv, dim_q)
-        self.v_proj = nn.Linear(dim_kv, dim_q)
+        self.q_proj = nn.Linear(dim_q, dim_q, bias=add_bias)
+        self.k_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
+        self.v_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
         self.output_proj = nn.Linear(dim_q, dim_q)
         self.dropout = dropout