Add option to remove bias from q,k,v projections in MHA (facebookrese…

…arch#436) Summary: Pull Request resolved: facebookresearch#436 Adds an option to remove the bias from the q, k, v projections in MHA. This diff adds an `add_bias` arg to `MultiHeadAttentionWithCache`. The option defaults to True and will not change the behavior of existing use sites. Reviewed By: ankitade, pikapecan Differential Revision: D47594542 fbshipit-source-id: f2cea27842e2adcb264aed766895676fc723ac1b
abhinavarora · Jul 19, 2023 · 43e1ed1 · 43e1ed1
1 parent fe379e4
commit 43e1ed1
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 3 deletions.
diff --git a/tests/modules/layers/test_multi_head_attention.py b/tests/modules/layers/test_multi_head_attention.py
@@ -106,6 +106,13 @@ def multi_head_cross_attn(self, dim_q, dim_kv):
  mha.eval()
  return mha
 
+ @pytest.fixture
+ def multi_head_cross_attn_without_bias(self, dim_q, dim_kv):
+ mha = MultiHeadAttentionWithCache(dim_q, dim_kv, num_heads=2, add_bias=False)
+ init_weights_with_constant(mha)
+ mha.eval()
+ return mha
+
  def test_multi_head_self_attention_use_cache(
  self,
  multi_head_self_attn_use_cache,
@@ -150,6 +157,22 @@ def test_multi_head_cross_attention(self, multi_head_cross_attn, q):
  )
  assert_expected(actual, expected, rtol=0, atol=1e-4)
 
+ def test_multi_head_cross_attention_without_bias(
+ self, multi_head_cross_attn_without_bias, q
+ ):
+ kv = torch.Tensor([[[3, 2], [1, 1]]])
+ actual = multi_head_cross_attn_without_bias(q, kv, kv)
+ expected = torch.tensor(
+ [
+ [
+ [21.0, 21.0, 21.0, 21.0],
+ [21.0, 21.0, 21.0, 21.0],
+ [21.0, 21.0, 21.0, 21.0],
+ ],
+ ]
+ )
+ assert_expected(actual, expected, rtol=0, atol=1e-4)
+
  def test_scripting(
  self,
  multi_head_self_attn_use_cache,

diff --git a/torchmultimodal/modules/layers/multi_head_attention.py b/torchmultimodal/modules/layers/multi_head_attention.py
@@ -89,6 +89,8 @@ class MultiHeadAttentionWithCache(nn.Module):
  same as dim_q for SA; equals to encoder dimension for cross-attention
  num_heads (int): number of attention heads
  dropout (float): dropout rate
+ add_bias (bool): if true, adds a learnable bias to query, key, value.
+ Defaults to True.
  """
 
  def __init__(
@@ -97,12 +99,13 @@ def __init__(
  dim_kv: int,
  num_heads: int,
  dropout: float = 0.0,
+ add_bias: bool = True,
  ) -> None:
  super().__init__()
  self.num_heads = num_heads
- self.q_proj = nn.Linear(dim_q, dim_q)
- self.k_proj = nn.Linear(dim_kv, dim_q)
- self.v_proj = nn.Linear(dim_kv, dim_q)
+ self.q_proj = nn.Linear(dim_q, dim_q, bias=add_bias)
+ self.k_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
+ self.v_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
  self.output_proj = nn.Linear(dim_q, dim_q)
  self.dropout = dropout