From adaa1d366df89f943cea0d2ff1b0792d217b2bc3 Mon Sep 17 00:00:00 2001
From: lucidrains
Date: Tue, 1 Oct 2024 10:38:14 -0700
Subject: [PATCH] make l2 distance attention work with flash attention

---
 setup.py                 |  2 +-
 x_transformers/attend.py | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 4abe9a96..8b7a52bb 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'x-transformers',
   packages = find_packages(exclude=['examples']),
-  version = '1.37.6',
+  version = '1.37.7',
   license='MIT',
   description = 'X-Transformers - Pytorch',
   author = 'Phil Wang',
diff --git a/x_transformers/attend.py b/x_transformers/attend.py
index 1557fa48..48828a7d 100644
--- a/x_transformers/attend.py
+++ b/x_transformers/attend.py
@@ -145,7 +145,6 @@ def __init__(
 
         # l2 distance attention
 
-        assert not (flash and l2_distance), 'l2 distance attention does not work with flash attention just yet'
         self.l2_distance = l2_distance
 
         # add a key / value token composed of zeros
@@ -208,6 +207,17 @@ def flash_attn(
         if v.ndim == 3:
             v = repeat(v, 'b ... -> b h ...', h = q.shape[1])
 
+        # handle maybe l2 distance
+
+        if self.l2_distance:
+            k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
+            k = F.pad(k, (0, 1), value = 1.)
+            k = torch.cat((k, -k_norm_sq), dim = -1)
+
+            q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
+            q = torch.cat((2 * q, -q_norm_sq), dim = -1)
+            q = F.pad(q, (0, 1), value = 1.)
+
         # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention
 
         if exists(self.scale):
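
Note on the added block: it rewrites the L2-distance logits as a plain dot product by augmenting q and k with extra coordinates, which is what allows the computation to be routed through the flash attention kernel. The identity used is -|q - k|^2 = 2 q.k - |q|^2 - |k|^2, so k is extended to [k, 1, -|k|^2] and q to [2q, -|q|^2, 1]. Below is a minimal standalone sketch (not part of the patch; tensor shapes and variable names are assumed for illustration) that checks this identity numerically.

    import torch
    import torch.nn.functional as F

    # assumed layout: (batch, heads, seq, dim)
    q = torch.randn(2, 4, 16, 64)
    k = torch.randn(2, 4, 16, 64)

    # reference logits: negated squared euclidean distance between every query and key
    ref = -((q.unsqueeze(-2) - k.unsqueeze(-3)) ** 2).sum(dim = -1)

    # augmentation mirroring the patch
    k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
    k_aug = F.pad(k, (0, 1), value = 1.)               # [k, 1]
    k_aug = torch.cat((k_aug, -k_norm_sq), dim = -1)   # [k, 1, -|k|^2]

    q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
    q_aug = torch.cat((2 * q, -q_norm_sq), dim = -1)   # [2q, -|q|^2]
    q_aug = F.pad(q_aug, (0, 1), value = 1.)           # [2q, -|q|^2, 1]

    # dot product of augmented vectors recovers 2 q.k - |q|^2 - |k|^2 = -|q - k|^2
    out = q_aug @ k_aug.transpose(-1, -2)

    assert torch.allclose(ref, out, atol = 1e-3)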