diff --git a/flax/linen/attention.py b/flax/linen/attention.py
index 2c6400be07..5a86157379 100644
--- a/flax/linen/attention.py
+++ b/flax/linen/attention.py
@@ -15,10 +15,10 @@
 """Attention core modules for Flax."""
 
 import functools
-from typing import (Any, Callable, Optional, Tuple, Union)
-from flax.linen.dtypes import promote_dtype
+from typing import Any, Callable, Optional, Tuple
 
 from flax.linen import initializers
+from flax.linen.dtypes import promote_dtype
 from flax.linen.linear import default_kernel_init
 from flax.linen.linear import DenseGeneral
 from flax.linen.linear import DotGeneralT
@@ -26,12 +26,13 @@
 from flax.linen.module import compact
 from flax.linen.module import merge_param
 from flax.linen.module import Module
-
+from flax.linen.normalization import LayerNorm
 import jax
 from jax import lax
 from jax import random
 import jax.numpy as jnp
 
+
 PRNGKey = Any
 Shape = Tuple[int, ...]
 Dtype = Any
@@ -57,19 +58,17 @@ def dot_product_attention_weights(
   you can directly call this function and call einsum yourself.
 
   Args:
-    query: queries for calculating attention with shape of
-      `[batch..., q_length, num_heads, qk_depth_per_head]`.
-    key: keys for calculating attention with shape of
-      `[batch..., kv_length, num_heads, qk_depth_per_head]`.
+    query: queries for calculating attention with shape of `[batch..., q_length,
+      num_heads, qk_depth_per_head]`.
+    key: keys for calculating attention with shape of `[batch..., kv_length,
+      num_heads, qk_depth_per_head]`.
     bias: bias for the attention weights. This should be broadcastable to the
-      shape `[batch..., num_heads, q_length, kv_length]`.
-      This can be used for incorporating causal masks, padding masks,
-      proximity bias, etc.
+      shape `[batch..., num_heads, q_length, kv_length]`. This can be used for
+      incorporating causal masks, padding masks, proximity bias, etc.
     mask: mask for the attention weights. This should be broadcastable to the
-      shape `[batch..., num_heads, q_length, kv_length]`.
-      This can be used for incorporating causal masks.
-      Attention weights are masked out if their corresponding mask value
-      is `False`.
+      shape `[batch..., num_heads, q_length, kv_length]`. This can be used for
+      incorporating causal masks. Attention weights are masked out if their
+      corresponding mask value is `False`.
     broadcast_dropout: bool: use a broadcasted dropout along batch dims.
     dropout_rng: JAX PRNGKey: to be used for dropout
     dropout_rate: dropout rate
@@ -145,21 +144,19 @@ def dot_product_attention(
   Note: query, key, value needn't have any batch dimensions.
 
   Args:
-    query: queries for calculating attention with shape of
-      `[batch..., q_length, num_heads, qk_depth_per_head]`.
-    key: keys for calculating attention with shape of
-      `[batch..., kv_length, num_heads, qk_depth_per_head]`.
-    value: values to be used in attention with shape of
-      `[batch..., kv_length, num_heads, v_depth_per_head]`.
+    query: queries for calculating attention with shape of `[batch..., q_length,
+      num_heads, qk_depth_per_head]`.
+    key: keys for calculating attention with shape of `[batch..., kv_length,
+      num_heads, qk_depth_per_head]`.
+    value: values to be used in attention with shape of `[batch..., kv_length,
+      num_heads, v_depth_per_head]`.
     bias: bias for the attention weights. This should be broadcastable to the
-      shape `[batch..., num_heads, q_length, kv_length]`.
-      This can be used for incorporating causal masks, padding masks,
-      proximity bias, etc.
+      shape `[batch..., num_heads, q_length, kv_length]`. This can be used for
+      incorporating causal masks, padding masks, proximity bias, etc.
     mask: mask for the attention weights. This should be broadcastable to the
-      shape `[batch..., num_heads, q_length, kv_length]`.
-      This can be used for incorporating causal masks.
-      Attention weights are masked out if their corresponding mask value
-      is `False`.
+      shape `[batch..., num_heads, q_length, kv_length]`. This can be used for
+      incorporating causal masks. Attention weights are masked out if their
+      corresponding mask value is `False`.
     broadcast_dropout: bool: use a broadcasted dropout along batch dims.
     dropout_rng: JAX PRNGKey: to be used for dropout
     dropout_rate: dropout rate
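The hunks above are docstring rewraps for `dot_product_attention_weights` and `dot_product_attention`; they change no behavior. As the first docstring notes, callers who need the attention weights can call `dot_product_attention_weights` directly and do the final contraction themselves. A minimal sketch of that pattern (toy shapes and the seed are made up; the einsum mirrors what `dot_product_attention` does internally):

```python
import jax
import jax.numpy as jnp
import flax.linen as nn

# Toy inputs with shape [batch, length, num_heads, head_dim].
q_rng, k_rng, v_rng = jax.random.split(jax.random.PRNGKey(0), 3)
query = jax.random.normal(q_rng, (2, 5, 4, 8))
key = jax.random.normal(k_rng, (2, 5, 4, 8))
value = jax.random.normal(v_rng, (2, 5, 4, 8))

# Attention weights with shape [batch, num_heads, q_length, kv_length].
attn_weights = nn.dot_product_attention_weights(query, key)

# Contract the weights with the values, as dot_product_attention does
# internally.
out = jnp.einsum('...hqk,...khd->...qhd', attn_weights, value)
assert out.shape == (2, 5, 4, 8)
```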
@@ -225,6 +222,7 @@ class MultiHeadDotProductAttention(Module):
       key, value, and returns output of shape `[bs, dim1, dim2, ..., dimN,,
       num_heads, value_channels]``
     decode: whether to prepare and use an autoregressive cache.
+    normalize_qk: whether to apply QK normalization (arxiv.org/abs/2302.05442).
   """
 
   num_heads: int
@@ -243,6 +241,7 @@
   use_bias: bool = True
   attention_fn: Callable[..., Array] = dot_product_attention
   decode: bool = False
+  normalize_qk: bool = False
   qkv_dot_general: DotGeneralT = lax.dot_general
   out_dot_general: DotGeneralT = lax.dot_general
 
@@ -260,17 +259,13 @@ def __call__(
     applies dot-product attention and project the results to an output vector.
 
     Args:
-      inputs_q: input queries of shape
-        `[batch_sizes..., length, features]`.
-      inputs_kv: key/values of shape
-        `[batch_sizes..., length, features]`.
-      mask: attention mask of shape
-        `[batch_sizes..., num_heads, query_length, key/value_length]`.
-        Attention weights are masked out if their corresponding mask value
-        is `False`.
-      deterministic: if false, the attention weight is masked randomly
-        using dropout, whereas if true, the attention weights
-        are deterministic.
+      inputs_q: input queries of shape `[batch_sizes..., length, features]`.
+      inputs_kv: key/values of shape `[batch_sizes..., length, features]`.
+      mask: attention mask of shape `[batch_sizes..., num_heads, query_length,
+        key/value_length]`. Attention weights are masked out if their
+        corresponding mask value is `False`.
+      deterministic: if false, the attention weight is masked randomly using
+        dropout, whereas if true, the attention weights are deterministic.
 
     Returns:
      output of shape `[batch_sizes..., length, features]`.
@@ -303,6 +298,12 @@ def __call__(
         dense(name='value')(inputs_kv),
     )
 
+    if self.normalize_qk:
+      # Normalizing query and key projections stabilizes training with higher
+      # LR. See ViT-22B paper http://arxiv.org/abs/2302.05442 for analysis.
+      query = LayerNorm(name='query_ln', use_bias=False)(query)  # type: ignore[call-arg]
+      key = LayerNorm(name='key_ln', use_bias=False)(key)  # type: ignore[call-arg]
+
     # During fast autoregressive decoding, we feed one position at a time,
     # and cache the keys and values step by step.
     if self.decode:
@@ -413,15 +414,12 @@ def __call__(  # type: ignore
     applies dot-product attention and project the results to an output vector.
 
     Args:
-      inputs_q: input queries of shape
-        `[batch_sizes..., length, features]`.
-      mask: attention mask of shape
-        `[batch_sizes..., num_heads, query_length, key/value_length]`.
-        Attention weights are masked out if their corresponding mask value
-        is `False`.
-      deterministic: if false, the attention weight is masked randomly
-        using dropout, whereas if true, the attention weights
-        are deterministic.
+      inputs_q: input queries of shape `[batch_sizes..., length, features]`.
+      mask: attention mask of shape `[batch_sizes..., num_heads, query_length,
+        key/value_length]`. Attention weights are masked out if their
+        corresponding mask value is `False`.
+      deterministic: if false, the attention weight is masked randomly using
+        dropout, whereas if true, the attention weights are deterministic.
 
     Returns:
       output of shape `[batch_sizes..., length, features]`.
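A usage sketch for the new flag (the shape and hyperparameter choices here are illustrative, not part of the diff): with `normalize_qk=True`, `init` should create the `query_ln` and `key_ln` LayerNorm scales introduced above alongside the usual projection kernels, and the call signature is otherwise unchanged.

```python
import jax
import jax.numpy as jnp
import flax.linen as nn

attn = nn.MultiHeadDotProductAttention(
    num_heads=4, qkv_features=32, normalize_qk=True)

x = jnp.ones((2, 7, 32))  # [batch, length, features]
variables = attn.init(jax.random.PRNGKey(0), x, x)

# Expect 'query_ln' and 'key_ln' next to 'query', 'key', 'value', 'out'.
print(sorted(variables['params'].keys()))

y = attn.apply(variables, x, x)
assert y.shape == (2, 7, 32)
```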
@@ -451,8 +449,8 @@ def make_attention_mask(
     query_input: a batched, flat input of query_length size
     key_input: a batched, flat input of key_length size
     pairwise_fn: broadcasting elementwise comparison function
-    extra_batch_dims: number of extra batch dims to add singleton
-      axes for, none by default
+    extra_batch_dims: number of extra batch dims to add singleton axes for, none
+      by default
     dtype: mask return dtype
 
   Returns:
@@ -477,8 +475,8 @@ def make_causal_mask(
 
   Args:
     x: input array of shape `[batch..., len]`
-    extra_batch_dims: number of batch dims to add singleton axes for,
-      none by default
+    extra_batch_dims: number of batch dims to add singleton axes for, none by
+      default
     dtype: mask return dtype
 
   Returns:
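For context on the two helpers whose docstrings are rewrapped above: a decoder typically ANDs a causal mask from `make_causal_mask` with a padding mask from `make_attention_mask` using `nn.combine_masks`. A small sketch with a made-up token batch:

```python
import jax.numpy as jnp
import flax.linen as nn

tokens = jnp.array([[5, 8, 3, 0, 0]])  # [batch, len]; 0 marks padding

# [batch, 1, len, len]: position q may only attend to positions k <= q.
causal = nn.make_causal_mask(tokens)

# [batch, 1, len, len]: only non-padding queries/keys participate.
padding = nn.make_attention_mask(tokens > 0, tokens > 0)

# Logical AND of both; the result broadcasts to the
# [batch, num_heads, q_length, kv_length] shape the attention docs expect.
mask = nn.combine_masks(causal, padding)
assert mask.shape == (1, 1, 5, 5)
```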