Commit
Update flash attention op (#616)
This commit updates the flash attention op to match the is_causal and scale
arguments added in commit 2d46caa.
Without this change, we see an fp8 attention export failure.
saienduri authored Nov 27, 2024
1 parent cdb4ccd commit d6be43f
Showing 1 changed file with 1 addition and 1 deletion.

sharktank/sharktank/ops/attention_impls.py
@@ -47,7 +47,7 @@ def _extract_linear_scale(t):
     return unbox_tensor(t), None


-def flash_attention(q, k, v, a):
+def flash_attention(q, k, v, a, is_causal, scale):
     scale = torch.scalar_tensor(1.0 / math.sqrt(q.shape[-1]), dtype=torch.float32)

     q, qscale = _extract_linear_scale(q)
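
For context, a minimal sketch (hypothetical, not the repository's code) of how an op with the updated signature might consume the new arguments: honor a caller-provided scale, fall back to 1/sqrt(head_dim) otherwise, and forward is_causal to the underlying kernel, with PyTorch's scaled_dot_product_attention standing in for the fused flash path.

import math
import torch

def flash_attention_sketch(q, k, v, a, is_causal, scale):
    # Hypothetical sketch: honor the caller-provided scale, otherwise
    # default to 1/sqrt(head_dim) as the original op did.
    if scale is None:
        scale = 1.0 / math.sqrt(q.shape[-1])
    # attn_mask and is_causal are mutually exclusive in SDPA, so only
    # pass the mask when the causal path is not requested.
    return torch.nn.functional.scaled_dot_product_attention(
        q, k, v,
        attn_mask=None if is_causal else a,
        is_causal=is_causal,
        scale=float(scale),
    )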
