make l2 distance attention work with flash attention
lucidrains committed Oct 1, 2024
1 parent 2d26af6 commit adaa1d3
Showing 2 changed files with 12 additions and 2 deletions.
setup.py: 2 changes (1 addition & 1 deletion)
@@ -3,7 +3,7 @@
  setup(
    name = 'x-transformers',
    packages = find_packages(exclude=['examples']),
-   version = '1.37.6',
+   version = '1.37.7',
    license='MIT',
    description = 'X-Transformers - Pytorch',
    author = 'Phil Wang',
x_transformers/attend.py: 12 changes (11 additions & 1 deletion)
@@ -145,7 +145,6 @@ def __init__(

        # l2 distance attention

-        assert not (flash and l2_distance), 'l2 distance attention does not work with flash attention just yet'
        self.l2_distance = l2_distance

        # add a key / value token composed of zeros
@@ -208,6 +207,17 @@ def flash_attn(
        if v.ndim == 3:
            v = repeat(v, 'b ... -> b h ...', h = q.shape[1])

+        # handle maybe l2 distance
+
+        if self.l2_distance:
+            k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
+            k = F.pad(k, (0, 1), value = 1.)
+            k = torch.cat((k, -k_norm_sq), dim = -1)
+
+            q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
+            q = torch.cat((2 * q, -q_norm_sq), dim = -1)
+            q = F.pad(q, (0, 1), value = 1.)
+
        # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention

        if exists(self.scale):
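Why this works: the squared L2 distance expands as |q - k|^2 = |q|^2 - 2 q.k + |k|^2, so the negative distance can be written as an ordinary dot product between augmented vectors, with k extended to [k, 1, -|k|^2] and q to [2q, -|q|^2, 1]. The flash attention kernel only ever computes q.k, so after this augmentation it produces distance-based attention scores with no custom kernel, which is why the assert in __init__ can be dropped. Below is a minimal sketch (not part of the commit; the shapes and variable names are illustrative) that checks the identity numerically:

```python
# Sketch: verify that the augmented dot product equals the negative squared L2 distance.
# Nothing here comes from the repo besides torch; shapes are arbitrary examples.

import torch
import torch.nn.functional as F

q = torch.randn(2, 8, 16, 64)  # (batch, heads, query len, dim_head)
k = torch.randn(2, 8, 32, 64)  # (batch, heads, key len, dim_head)

# augment k -> [k, 1, -|k|^2] and q -> [2q, -|q|^2, 1], mirroring the diff above
k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
k_aug = torch.cat((F.pad(k, (0, 1), value = 1.), -k_norm_sq), dim = -1)

q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
q_aug = F.pad(torch.cat((2 * q, -q_norm_sq), dim = -1), (0, 1), value = 1.)

# augmented dot product: 2 q.k - |q|^2 - |k|^2
sim = torch.einsum('b h i d, b h j d -> b h i j', q_aug, k_aug)

# negative squared L2 distance, computed directly
neg_sq_dist = -((q.unsqueeze(-2) - k.unsqueeze(-3)) ** 2).sum(dim = -1)

assert torch.allclose(sim, neg_sq_dist, atol = 1e-3)
```

Since softmax is invariant to adding a per-query constant, the -|q|^2 term does not change the attention weights; keeping it simply makes the pre-softmax scores equal to the exact negative squared distances.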
