Add option to remove bias from q,k,v projections in MHA (#436)
Summary:
Pull Request resolved: #436

Adds an option to remove the bias from the q, k, v projections in MHA. This diff adds an `add_bias` arg to `MultiHeadAttentionWithCache`. The option defaults to True and does not change the behavior of existing use sites.
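A minimal usage sketch of the new flag, assuming the constructor and forward signatures shown in the diff below (sizes are illustrative and taken from the cross-attention test):

```python
import torch

from torchmultimodal.modules.layers.multi_head_attention import (
    MultiHeadAttentionWithCache,
)

# Illustrative sizes matching the cross-attention test below.
dim_q, dim_kv, num_heads = 4, 2, 2

# add_bias=False creates the q/k/v projections without bias terms.
mha = MultiHeadAttentionWithCache(dim_q, dim_kv, num_heads=num_heads, add_bias=False)
assert mha.q_proj.bias is None and mha.k_proj.bias is None and mha.v_proj.bias is None

q = torch.randn(1, 3, dim_q)    # (batch, query_seq_len, dim_q)
kv = torch.randn(1, 2, dim_kv)  # (batch, kv_seq_len, dim_kv)
out = mha(q, kv, kv)            # cross-attention: query attends over key/value
print(out.shape)                # torch.Size([1, 3, 4])
```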

Reviewed By: ankitade, pikapecan

Differential Revision: D47594542

fbshipit-source-id: 8389e7b488a631bd9ac62a395fba98a2df68a486
Abhinav Arora authored and facebook-github-bot committed Jul 20, 2023
1 parent fe379e4 commit 1940623
Showing 2 changed files with 33 additions and 5 deletions.
29 changes: 27 additions & 2 deletions tests/modules/layers/test_multi_head_attention.py
@@ -70,6 +70,10 @@ def dim_kv(self):
def q(self):
return torch.Tensor([[[1, 2, 3, 1], [4, 3, 2, 1], [1, 1, 1, 1]]])

@pytest.fixture
def kv(self):
return torch.Tensor([[[3, 2], [1, 1]]])

@pytest.fixture
def current_key_value(self):
return torch.Tensor(
@@ -106,6 +110,13 @@ def multi_head_cross_attn(self, dim_q, dim_kv):
mha.eval()
return mha

@pytest.fixture
def multi_head_cross_attn_without_bias(self, dim_q, dim_kv):
mha = MultiHeadAttentionWithCache(dim_q, dim_kv, num_heads=2, add_bias=False)
init_weights_with_constant(mha)
mha.eval()
return mha

def test_multi_head_self_attention_use_cache(
self,
multi_head_self_attn_use_cache,
@@ -136,8 +147,7 @@ def test_multi_head_self_attention_use_cache(
torch.cat([past_key_value, current_key_value], dim=2),
)

def test_multi_head_cross_attention(self, multi_head_cross_attn, q):
kv = torch.Tensor([[[3, 2], [1, 1]]])
def test_multi_head_cross_attention(self, multi_head_cross_attn, q, kv):
actual = multi_head_cross_attn(q, kv, kv)
expected = torch.tensor(
[
@@ -150,6 +160,21 @@ def test_multi_head_cross_attention(self, multi_head_cross_attn, q):
)
assert_expected(actual, expected, rtol=0, atol=1e-4)

def test_multi_head_cross_attention_without_bias(
self, multi_head_cross_attn_without_bias, q, kv
):
actual = multi_head_cross_attn_without_bias(q, kv, kv)
expected = torch.tensor(
[
[
[21.0, 21.0, 21.0, 21.0],
[21.0, 21.0, 21.0, 21.0],
[21.0, 21.0, 21.0, 21.0],
],
]
)
assert_expected(actual, expected, rtol=0, atol=1e-4)

def test_scripting(
self,
multi_head_self_attn_use_cache,
9 changes: 6 additions & 3 deletions torchmultimodal/modules/layers/multi_head_attention.py
@@ -89,6 +89,8 @@ class MultiHeadAttentionWithCache(nn.Module):
same as dim_q for SA; equals to encoder dimension for cross-attention
num_heads (int): number of attention heads
dropout (float): dropout rate
add_bias (bool): if true, adds a learnable bias to query, key, value.
Defaults to True.
"""

def __init__(
@@ -97,12 +99,13 @@ def __init__(
dim_kv: int,
num_heads: int,
dropout: float = 0.0,
add_bias: bool = True,
) -> None:
super().__init__()
self.num_heads = num_heads
self.q_proj = nn.Linear(dim_q, dim_q)
self.k_proj = nn.Linear(dim_kv, dim_q)
self.v_proj = nn.Linear(dim_kv, dim_q)
self.q_proj = nn.Linear(dim_q, dim_q, bias=add_bias)
self.k_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
self.v_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
self.output_proj = nn.Linear(dim_q, dim_q)
self.dropout = dropout

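To see which layers the flag affects, here is a quick parameter dump (a sketch, assuming the module path above; per the diff, only `q_proj`, `k_proj`, and `v_proj` honor `add_bias`, while `output_proj` keeps its bias):

```python
from torchmultimodal.modules.layers.multi_head_attention import (
    MultiHeadAttentionWithCache,
)

mha = MultiHeadAttentionWithCache(dim_q=4, dim_kv=2, num_heads=2, add_bias=False)

# Only the q/k/v projections drop their bias; the output projection is unchanged.
for name, param in mha.named_parameters():
    print(name, tuple(param.shape))
# Expected output (illustrative, showing the projection layers):
# q_proj.weight (4, 4)
# k_proj.weight (4, 2)
# v_proj.weight (4, 2)
# output_proj.weight (4, 4)
# output_proj.bias (4,)
```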
