From c4a5cb85bb0aacea52c3387e0e94d4a727d7bc6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com>
Date: Fri, 27 Sep 2024 19:48:43 +0200
Subject: [PATCH 1/3] [PyTorch] Add GroupedLinear to the docs and fix typos (#1206)

* Docs fixes

Signed-off-by: Pawel Gadzinski

* docs fix

Signed-off-by: Pawel Gadzinski

* docs fix

Signed-off-by: Pawel Gadzinski

---------

Signed-off-by: Pawel Gadzinski
Co-authored-by: Pawel Gadzinski
---
 docs/api/pytorch.rst                                  | 3 +++
 transformer_engine/pytorch/attention.py               | 2 +-
 transformer_engine/pytorch/module/grouped_linear.py   | 6 +++---
 transformer_engine/pytorch/module/layernorm.py        | 2 +-
 transformer_engine/pytorch/module/layernorm_linear.py | 4 ++--
 transformer_engine/pytorch/module/layernorm_mlp.py    | 2 +-
 transformer_engine/pytorch/module/linear.py           | 6 +++---
 transformer_engine/pytorch/module/rmsnorm.py          | 2 +-
 transformer_engine/pytorch/transformer.py             | 2 +-
 9 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst
index a210019dc1..b097f14475 100644
--- a/docs/api/pytorch.rst
+++ b/docs/api/pytorch.rst
@@ -9,6 +9,9 @@ pyTorch
 .. autoapiclass:: transformer_engine.pytorch.Linear(in_features, out_features, bias=True, **kwargs)
   :members: forward, set_tensor_parallel_group
 
+.. autoapiclass:: transformer_engine.pytorch.GroupedLinear(in_features, out_features, bias=True, **kwargs)
+  :members: forward, set_tensor_parallel_group
+
 .. autoapiclass:: transformer_engine.pytorch.LayerNorm(hidden_size, eps=1e-5, **kwargs)
 
 .. autoapiclass:: transformer_engine.pytorch.RMSNorm(hidden_size, eps=1e-5, **kwargs)
diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py
index 192f430ae1..bc80b389d3 100644
--- a/transformer_engine/pytorch/attention.py
+++ b/transformer_engine/pytorch/attention.py
@@ -7853,7 +7853,7 @@ class MultiheadAttention(torch.nn.Module):
     bias : bool, default = `True`
           if set to `False`, the transformer layer will not learn any additive biases.
     device : Union[torch.device, str], default = "cuda"
-          The device on which the parameters of the model will allocated. It is the user's
+          The device on which the parameters of the model will be allocated. It is the user's
           responsibility to ensure all parameters are moved to the GPU before running the
           forward pass.
     qkv_format: str, default = `sbhd`
diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py
index 10c8d91551..0bad1306c3 100644
--- a/transformer_engine/pytorch/module/grouped_linear.py
+++ b/transformer_engine/pytorch/module/grouped_linear.py
@@ -528,11 +528,11 @@ class GroupedLinear(TransformerEngineBaseModule):
          used for initializing weights in the following way: `init_method(weight)`.
          When set to `None`, defaults to `torch.nn.init.normal_(mean=0.0, std=0.023)`.
     get_rng_state_tracker : Callable, default = `None`
-         used to get the random number generator state tracker for initilizeing weights.
+         used to get the random number generator state tracker for initializing weights.
     rng_tracker_name : str, default = `None`
          the param passed to get_rng_state_tracker to get the specific rng tracker.
     device : Union[torch.device, str], default = "cuda"
-         The device on which the parameters of the model will allocated. It is the user's
+         The device on which the parameters of the model will be allocated. It is the user's
          responsibility to ensure all parameters are moved to the GPU before running the
          forward pass.
 
@@ -548,7 +548,7 @@ class GroupedLinear(TransformerEngineBaseModule):
          `set_tensor_parallel_group(tp_group)` method on the initialized module before the
          forward pass to supply the tensor parallel group needed for tensor and sequence
          parallel collectives.
-    parallel_mode : {None, 'Column', 'Row'}, default = `None`
+    parallel_mode : {None, 'column', 'row'}, default = `None`
          used to decide whether this GroupedLinear layer is Column Parallel Linear or Row
          Parallel Linear as described `here `_.
          When set to `None`, no communication is performed.
diff --git a/transformer_engine/pytorch/module/layernorm.py b/transformer_engine/pytorch/module/layernorm.py
index ec33ad2033..292fcd06de 100644
--- a/transformer_engine/pytorch/module/layernorm.py
+++ b/transformer_engine/pytorch/module/layernorm.py
@@ -110,7 +110,7 @@ class LayerNorm(torch.nn.Module):
         y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * (1 + \gamma) + \beta
 
     device : Union[torch.device, str], default = "cuda"
-         The device on which the parameters of the model will allocated. It is the user's
+         The device on which the parameters of the model will be allocated. It is the user's
          responsibility to ensure all parameters are moved to the GPU before running the
          forward pass.
     """
diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py
index da77879e06..92030a7f7a 100644
--- a/transformer_engine/pytorch/module/layernorm_linear.py
+++ b/transformer_engine/pytorch/module/layernorm_linear.py
@@ -816,7 +816,7 @@ class LayerNormLinear(TransformerEngineBaseModule):
         y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * (1 + \gamma) + \beta
 
     device : Union[torch.device, str], default = "cuda"
-         The device on which the parameters of the model will allocated. It is the user's
+         The device on which the parameters of the model will be allocated. It is the user's
          responsibility to ensure all parameters are moved to the GPU before running the
          forward pass.
 
@@ -832,7 +832,7 @@ class LayerNormLinear(TransformerEngineBaseModule):
          `set_tensor_parallel_group(tp_group)` method on the initialized module before the
          forward pass to supply the tensor parallel group needed for tensor and sequence
          parallel collectives.
-    parallel_mode : {None, 'Column', 'Row'}, default = `None`
+    parallel_mode : {None, 'column', 'row'}, default = `None`
          used to decide whether this Linear layer is Column Parallel Linear or Row Parallel
          Linear as described `here `_.
          When set to `None`, no communication is performed.
diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py
index b802c972d4..6d5609ccd2 100644
--- a/transformer_engine/pytorch/module/layernorm_mlp.py
+++ b/transformer_engine/pytorch/module/layernorm_mlp.py
@@ -1193,7 +1193,7 @@ class LayerNormMLP(TransformerEngineBaseModule):
         y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * (1 + \gamma) + \beta
 
     device : Union[torch.device, str], default = "cuda"
-         The device on which the parameters of the model will allocated. It is the user's
+         The device on which the parameters of the model will be allocated. It is the user's
          responsibility to ensure all parameters are moved to the GPU before running the
          forward pass.
 
diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
index a7be82ccf1..8e19a65a28 100644
--- a/transformer_engine/pytorch/module/linear.py
+++ b/transformer_engine/pytorch/module/linear.py
@@ -650,7 +650,7 @@ class Linear(TransformerEngineBaseModule):
          used for initializing weights in the following way: `init_method(weight)`.
          When set to `None`, defaults to `torch.nn.init.normal_(mean=0.0, std=0.023)`.
     get_rng_state_tracker : Callable, default = `None`
-         used to get the random number generator state tracker for initilizeing weights.
+         used to get the random number generator state tracker for initializing weights.
     rng_tracker_name : str, default = `None`
          the param passed to get_rng_state_tracker to get the specific rng tracker.
     parameters_split : Optional[Union[Tuple[str, ...], Dict[str, int]]], default = None
@@ -662,7 +662,7 @@ class Linear(TransformerEngineBaseModule):
          names that end in `_weight` or `_bias`, so trailing underscores are stripped from any
          provided names.
     device : Union[torch.device, str], default = "cuda"
-         The device on which the parameters of the model will allocated. It is the user's
+         The device on which the parameters of the model will be allocated. It is the user's
          responsibility to ensure all parameters are moved to the GPU before running the
          forward pass.
 
@@ -678,7 +678,7 @@ class Linear(TransformerEngineBaseModule):
          `set_tensor_parallel_group(tp_group)` method on the initialized module before the
          forward pass to supply the tensor parallel group needed for tensor and sequence
          parallel collectives.
-    parallel_mode : {None, 'Column', 'Row'}, default = `None`
+    parallel_mode : {None, 'column', 'row'}, default = `None`
          used to decide whether this Linear layer is Column Parallel Linear or Row Parallel
          Linear as described `here `_.
          When set to `None`, no communication is performed.
diff --git a/transformer_engine/pytorch/module/rmsnorm.py b/transformer_engine/pytorch/module/rmsnorm.py
index 969a468426..d5dc400206 100644
--- a/transformer_engine/pytorch/module/rmsnorm.py
+++ b/transformer_engine/pytorch/module/rmsnorm.py
@@ -120,7 +120,7 @@ class RMSNorm(torch.nn.Module):
     .. math::
         y = \frac{x}{RMS_\varepsilon(x)} * (1 + \gamma)
     device : Union[torch.device, str], default = "cuda"
-         The device on which the parameters of the model will allocated. It is the user's
+         The device on which the parameters of the model will be allocated. It is the user's
          responsibility to ensure all parameters are moved to the GPU before running the
          forward pass.
     """
diff --git a/transformer_engine/pytorch/transformer.py b/transformer_engine/pytorch/transformer.py
index 958c7019ba..020d262be2 100644
--- a/transformer_engine/pytorch/transformer.py
+++ b/transformer_engine/pytorch/transformer.py
@@ -173,7 +173,7 @@ class TransformerLayer(torch.nn.Module):
          Type of activation used in MLP block.
          Options are: 'gelu', 'relu', 'reglu', 'geglu', 'swiglu', 'qgelu' and 'srelu'.
     device : Union[torch.device, str], default = "cuda"
-         The device on which the parameters of the model will allocated. It is the user's
+         The device on which the parameters of the model will be allocated. It is the user's
          responsibility to ensure all parameters are moved to the GPU before running the
          forward pass.
     attn_input_format: {'sbhd', 'bshd'}, default = 'sbhd'

From 8a1b7ee2852a46f1497849a7dcb2ecf9cbfff0f6 Mon Sep 17 00:00:00 2001
From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:33:51 -0700
Subject: [PATCH 2/3] [PyTorch] Fix detection of 3 in 3hd/h3d layouts (#1187)

* fix detection of 3 in 3hd/h3d layouts

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* error out when invalid layout group is provided

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../pytorch/csrc/extensions/attention.cu | 36 +++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cu b/transformer_engine/pytorch/csrc/extensions/attention.cu
index fb1fc97a33..b2968a688d 100644
--- a/transformer_engine/pytorch/csrc/extensions/attention.cu
+++ b/transformer_engine/pytorch/csrc/extensions/attention.cu
@@ -95,9 +95,21 @@ std::vector fused_attn_fwd_qkvpacked(
   auto qkv_sizes = QKV.sizes().vec();
   std::vector qkv_shape{qkv_sizes.begin(), qkv_sizes.end()};
   std::vector q_shape;
-  for (auto i : qkv_shape) {
-    if (i != 3) {
-      q_shape.push_back(i);
+  NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout);
+  int loc_3 = 0;
+  switch (layout_group) {
+    case NVTE_3HD:
+      loc_3 = qkv_sizes.size() - 3;
+      break;
+    case NVTE_H3D:
+      loc_3 = qkv_sizes.size() - 2;
+      break;
+    default:
+      NVTE_ERROR("Invalid QKV layout group.");
+  }
+  for (auto it = qkv_shape.begin(); it != qkv_shape.end(); ++it) {
+    if (it - qkv_shape.begin() != loc_3) {
+      q_shape.push_back(*it);
     }
   }
   std::vector o_shape{q_shape.begin(), q_shape.end()};
@@ -252,9 +264,21 @@ std::vector fused_attn_bwd_qkvpacked(
   auto qkv_sizes = QKV.sizes().vec();
   std::vector qkv_shape{qkv_sizes.begin(), qkv_sizes.end()};
   std::vector q_shape;
-  for (auto i : qkv_shape) {
-    if (i != 3) {
-      q_shape.push_back(i);
+  NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout);
+  int loc_3 = 0;
+  switch (layout_group) {
+    case NVTE_3HD:
+      loc_3 = qkv_sizes.size() - 3;
+      break;
+    case NVTE_H3D:
+      loc_3 = qkv_sizes.size() - 2;
+      break;
+    default:
+      NVTE_ERROR("Invalid QKV layout group.");
+  }
+  for (auto it = qkv_shape.begin(); it != qkv_shape.end(); ++it) {
+    if (it - qkv_shape.begin() != loc_3) {
+      q_shape.push_back(*it);
     }
   }
   auto h = q_shape[q_shape.size() - 2];

From 7b152a83475085355f4a14406dd690a3753267e4 Mon Sep 17 00:00:00 2001
From: Xiaowei Ren <103958965+xrennvidia@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:56:03 -0700
Subject: [PATCH 3/3] Fix CP unit test on A100 and L40s (#1211)

skip FP8 CP tests if hardware does not support FP8

Signed-off-by: Xiaowei Ren
---
 tests/pytorch/fused_attn/test_fused_attn_with_cp.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/pytorch/fused_attn/test_fused_attn_with_cp.py b/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
index d6358d1062..c1c18ffe47 100644
--- a/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
+++ b/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
@@ -112,6 +112,8 @@ def test_cp_with_fused_attention(dtype, model, qkv_format, cp_comm_type):
         pytest.skip("THD format is only supported on sm90+!")
     if cp_comm_type == "all_gather" and get_cudnn_version() < (9, 3, 0):
         pytest.skip("CP implementation with KV all-gather is only supported with cuDNN >= 9.3.0!")
+    if dtype == "fp8" and get_device_compute_capability() < (9, 0):
+        pytest.skip("FP8 attention is only supported on sm90+!")
     config = model_configs_fused_attn[model]
     if qkv_format == "thd" and config.num_heads != config.num_gqa_groups:
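
Note on [PATCH 2/3]: the change replaces value-based filtering (drop any QKV dimension whose size is 3) with index-based filtering (drop the stacking axis located from the layout group: third from last for 3HD, second from last for H3D). The short Python sketch below is not part of the patches and uses made-up function and variable names; it only illustrates why the old logic can drop the wrong dimension when another axis, such as the number of heads, also happens to be 3.

    # Illustrative sketch only -- mirrors the idea of the fix, not the actual C++ code.
    def q_shape_old(qkv_shape):
        # Old logic: drop every dimension whose size happens to be 3.
        return [d for d in qkv_shape if d != 3]

    def q_shape_new(qkv_shape, layout_group):
        # New logic: locate the stacking axis from the layout group and drop it by index.
        if layout_group == "3HD":
            loc_3 = len(qkv_shape) - 3
        elif layout_group == "H3D":
            loc_3 = len(qkv_shape) - 2
        else:
            raise ValueError("Invalid QKV layout group.")
        return [d for i, d in enumerate(qkv_shape) if i != loc_3]

    # A packed "t3hd" tensor with 3 attention heads: [tokens, 3, heads, head_dim].
    qkv_shape = [128, 3, 3, 64]
    print(q_shape_old(qkv_shape))         # [128, 64]    -- heads dimension dropped too
    print(q_shape_new(qkv_shape, "3HD"))  # [128, 3, 64] -- only the stacking axis dropped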