make l2 distance attention work with flash attention
lucidrains committed Oct 1, 2024
1 parent 2d26af6 commit adaa1d3
Showing 2 changed files with 12 additions and 2 deletions.
setup.py: 2 changes (1 addition & 1 deletion)
@@ -3,7 +3,7 @@
  setup(
    name = 'x-transformers',
    packages = find_packages(exclude=['examples']),
-   version = '1.37.6',
+   version = '1.37.7',
    license='MIT',
    description = 'X-Transformers - Pytorch',
    author = 'Phil Wang',
x_transformers/attend.py: 12 changes (11 additions & 1 deletion)
@@ -145,7 +145,6 @@ def __init__(

        # l2 distance attention

-        assert not (flash and l2_distance), 'l2 distance attention does not work with flash attention just yet'
        self.l2_distance = l2_distance

        # add a key / value token composed of zeros
@@ -208,6 +207,17 @@ def flash_attn(
        if v.ndim == 3:
            v = repeat(v, 'b ... -> b h ...', h = q.shape[1])

+        # handle maybe l2 distance
+
+        if self.l2_distance:
+            k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
+            k = F.pad(k, (0, 1), value = 1.)
+            k = torch.cat((k, -k_norm_sq), dim = -1)
+
+            q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
+            q = torch.cat((2 * q, -q_norm_sq), dim = -1)
+            q = F.pad(q, (0, 1), value = 1.)
+
        # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention

        if exists(self.scale):
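Why this works: the squared L2 distance expands as |q - k|^2 = |q|^2 - 2 q.k + |k|^2, so the negative distance can be written as an ordinary dot product between augmented vectors, with k extended to [k, 1, -|k|^2] and q to [2q, -|q|^2, 1]. The flash attention kernel only ever computes q.k, so after this augmentation it produces distance-based attention scores with no custom kernel, which is why the assert in __init__ can be dropped. Below is a minimal sketch (not part of the commit; the shapes and variable names are illustrative) that checks the identity numerically:

```python
# Sketch: verify that the augmented dot product equals the negative squared L2 distance.
# Nothing here comes from the repo besides torch; shapes are arbitrary examples.

import torch
import torch.nn.functional as F

q = torch.randn(2, 8, 16, 64)  # (batch, heads, query len, dim_head)
k = torch.randn(2, 8, 32, 64)  # (batch, heads, key len, dim_head)

# augment k -> [k, 1, -|k|^2] and q -> [2q, -|q|^2, 1], mirroring the diff above
k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
k_aug = torch.cat((F.pad(k, (0, 1), value = 1.), -k_norm_sq), dim = -1)

q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
q_aug = F.pad(torch.cat((2 * q, -q_norm_sq), dim = -1), (0, 1), value = 1.)

# augmented dot product: 2 q.k - |q|^2 - |k|^2
sim = torch.einsum('b h i d, b h j d -> b h i j', q_aug, k_aug)

# negative squared L2 distance, computed directly
neg_sq_dist = -((q.unsqueeze(-2) - k.unsqueeze(-3)) ** 2).sum(dim = -1)

assert torch.allclose(sim, neg_sq_dist, atol = 1e-3)
```

Since softmax is invariant to adding a per-query constant, the -|q|^2 term does not change the attention weights; keeping it simply makes the pre-softmax scores equal to the exact negative squared distances.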
