From adaa1d366df89f943cea0d2ff1b0792d217b2bc3 Mon Sep 17 00:00:00 2001
From: lucidrains
Date: Tue, 1 Oct 2024 10:38:14 -0700
Subject: [PATCH] make l2 distance attention work with flash attention

---
 setup.py                 |  2 +-
 x_transformers/attend.py | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 4abe9a96..8b7a52bb 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'x-transformers',
   packages = find_packages(exclude=['examples']),
-  version = '1.37.6',
+  version = '1.37.7',
   license='MIT',
   description = 'X-Transformers - Pytorch',
   author = 'Phil Wang',
diff --git a/x_transformers/attend.py b/x_transformers/attend.py
index 1557fa48..48828a7d 100644
--- a/x_transformers/attend.py
+++ b/x_transformers/attend.py
@@ -145,7 +145,6 @@ def __init__(
 
         # l2 distance attention
 
-        assert not (flash and l2_distance), 'l2 distance attention does not work with flash attention just yet'
         self.l2_distance = l2_distance
 
         # add a key / value token composed of zeros
@@ -208,6 +207,17 @@ def flash_attn(
         if v.ndim == 3:
             v = repeat(v, 'b ... -> b h ...', h = q.shape[1])
 
+        # handle maybe l2 distance
+
+        if self.l2_distance:
+            k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
+            k = F.pad(k, (0, 1), value = 1.)
+            k = torch.cat((k, -k_norm_sq), dim = -1)
+
+            q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
+            q = torch.cat((2 * q, -q_norm_sq), dim = -1)
+            q = F.pad(q, (0, 1), value = 1.)
+
         # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention
 
         if exists(self.scale):
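
Note on the added block: it rewrites the L2-distance logits as a plain dot product by augmenting q and k with extra coordinates, which is what allows the computation to be routed through the flash attention kernel. The identity used is -|q - k|^2 = 2 q.k - |q|^2 - |k|^2, so k is extended to [k, 1, -|k|^2] and q to [2q, -|q|^2, 1]. Below is a minimal standalone sketch (not part of the patch; tensor shapes and variable names are assumed for illustration) that checks this identity numerically.

    import torch
    import torch.nn.functional as F

    # assumed layout: (batch, heads, seq, dim)
    q = torch.randn(2, 4, 16, 64)
    k = torch.randn(2, 4, 16, 64)

    # reference logits: negated squared euclidean distance between every query and key
    ref = -((q.unsqueeze(-2) - k.unsqueeze(-3)) ** 2).sum(dim = -1)

    # augmentation mirroring the patch
    k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
    k_aug = F.pad(k, (0, 1), value = 1.)               # [k, 1]
    k_aug = torch.cat((k_aug, -k_norm_sq), dim = -1)   # [k, 1, -|k|^2]

    q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
    q_aug = torch.cat((2 * q, -q_norm_sq), dim = -1)   # [2q, -|q|^2]
    q_aug = F.pad(q_aug, (0, 1), value = 1.)           # [2q, -|q|^2, 1]

    # dot product of augmented vectors recovers 2 q.k - |q|^2 - |k|^2 = -|q - k|^2
    out = q_aug @ k_aug.transpose(-1, -2)

    assert torch.allclose(ref, out, atol = 1e-3)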