From ef19e0260a87f0d1c8e9f6992935eeac359450ea Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Tue, 21 Nov 2023 11:31:16 -0500
Subject: [PATCH 1/3] add cpu init check (#7889)

Signed-off-by: Chen Cui
---
 .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 9b05862f0de0..bec3cfbc2e89 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -246,7 +246,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
 
         if self.megatron_amp_O2:
 
-            if not self.with_distributed_adam:
+            if not self.with_distributed_adam and not self.cfg.get("use_cpu_initialization", False):
                 # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
                 if isinstance(self.model, list):
                     for module in self.model:

From 29a90a31d1f3c01528ddb4b5332fc136c6ad44f4 Mon Sep 17 00:00:00 2001
From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com>
Date: Tue, 21 Nov 2023 16:47:03 -0800
Subject: [PATCH 2/3] Fix pinned triton version (#7925)

* Fix pinned triton version

Signed-off-by: Cheng-Ping Hsieh

* Remove comment

Signed-off-by: Cheng-Ping Hsieh

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* Change README

Signed-off-by: Cheng-Ping Hsieh

* Remove flash-attn in Dockerfile

Signed-off-by: Cheng-Ping Hsieh

* Revert

Signed-off-by: Cheng-Ping Hsieh

---------

Signed-off-by: Cheng-Ping Hsieh
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 Dockerfile                                        |  4 +---
 README.rst                                        |  2 +-
 .../nlp/modules/common/megatron/attention.py      | 17 ++++++++++++++---
 requirements/requirements.txt                     |  1 +
 tests/collections/nlp/test_flash_attention.py     |  6 +++++-
 5 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 4cbc898cc8ac..5d3311c7cdfd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -85,10 +85,8 @@ WORKDIR /tmp/nemo
 COPY requirements .
 RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done
 
-# install flash attention dependencies
+# install flash attention
 RUN pip install flash-attn
-# pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3
-RUN pip install triton==2.0.0.dev20221202
 
 # install numba for latest containers
 RUN pip install numba>=0.57.1

diff --git a/README.rst b/README.rst
index 58a39008f704..d07b07434b20 100644
--- a/README.rst
+++ b/README.rst
@@ -319,7 +319,7 @@ Transformer Engine requires PyTorch to be built with CUDA 11.8.
 
 Flash Attention
 ~~~~~~~~~~~~~~~~~~~~
-Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models or use with attention bias (introduced from position encoding, e.g. Alibi), please install `flash-attn <https://github.com/HazyResearch/flash-attention>`_.
+Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models, please install `flash-attn <https://github.com/HazyResearch/flash-attention>`_. If you want to use Flash Attention with attention bias (introduced from position encoding, e.g. Alibi), please also install triton pinned version following the `implementation <https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3>`_.
 
 .. code-block:: bash

diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py
index 09a9251ce46e..76c4d6bec6d3 100644
--- a/nemo/collections/nlp/modules/common/megatron/attention.py
+++ b/nemo/collections/nlp/modules/common/megatron/attention.py
@@ -65,11 +65,23 @@
     HAVE_MEGATRON_CORE = False
 
 
+try:
+    # Flash Attention Triton
+    import pkg_resources
+    from flash_attn.flash_attn_triton import flash_attn_func as flash_attn_func_triton
+
+    # pinned triton version for flash-attention triton https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3
+    assert pkg_resources.get_distribution("triton").version == '2.0.0.dev20221202'
+
+except (ImportError, ModuleNotFoundError, AssertionError):
+
+    flash_attn_func_triton = None
+
+
 try:
     # Flash Attention 1.X
     from flash_attn.bert_padding import pad_input, unpad_input
     from flash_attn.flash_attn_interface import flash_attn_unpadded_func
-    from flash_attn.flash_attn_triton import flash_attn_func as flash_attn_func_triton
 
     HAVE_FLASH_ATTENTION = True
     flash_attn_func = None
@@ -85,8 +97,7 @@
 except (ImportError, ModuleNotFoundError):
 
     HAVE_FLASH_ATTENTION = False
-
-    flash_attn_unpadded_func, flash_attn_func_triton, flash_attn_func = None, None, None
+    flash_attn_unpadded_func, flash_attn_func = None, None
     unpad_input, pad_input = None, None
 
 try:

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index a9a8c1e98100..05b4531ff083 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -10,5 +10,6 @@ tensorboard
 text-unidecode
 torch
 tqdm>=4.41.0
+triton
 wget
 wrapt

diff --git a/tests/collections/nlp/test_flash_attention.py b/tests/collections/nlp/test_flash_attention.py
index 1453ab5220bb..3560229e847b 100644
--- a/tests/collections/nlp/test_flash_attention.py
+++ b/tests/collections/nlp/test_flash_attention.py
@@ -39,10 +39,14 @@
     HAVE_FA = False
 
 try:
+    import pkg_resources
     import triton
 
+    # pinned triton version for flash-attention triton https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3
+    assert pkg_resources.get_distribution("triton").version == '2.0.0.dev20221202'
+
     HAVE_TRITON = True
-except (ImportError, ModuleNotFoundError):
+except (ImportError, ModuleNotFoundError, AssertionError):
     HAVE_TRITON = False
 
 try:

From 9c7926db4ae375b77dae7eb57656213de1dd76a5 Mon Sep 17 00:00:00 2001
From: Xiaowei Ren <103958965+xrennvidia@users.noreply.github.com>
Date: Tue, 21 Nov 2023 23:19:07 -0800
Subject: [PATCH 3/3] fix tp_overlap config var name (#7928)

Signed-off-by: Xiaowei Ren
---
 .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index bec3cfbc2e89..e66708d2d2dd 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -1585,7 +1585,7 @@ def build_transformer_config(self) -> TransformerConfig:
             'recompute_method': recompute_method,
             'recompute_num_layers': recompute_num_layers,
             'distribute_saved_activations': False,  # not currently used in NeMo
-            'ub_tp_comm_overlap': ub_tp_comm_overlap,
+            'tp_comm_overlap': ub_tp_comm_overlap,
             'fp8': fp8,
         }
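
A note on the first patch, as an illustrative sketch rather than the actual NeMo code: `use_cpu_initialization` builds the model weights on the host, so the O2 pre-allocation step (which moves the model to the GPU so master parameters share a device with the model parameters) has to be skipped in that case. The standalone flags and config below are hypothetical stand-ins for the attributes used in `__init__`:

    from omegaconf import OmegaConf

    # Hypothetical stand-ins for self.cfg, self.megatron_amp_O2, self.with_distributed_adam.
    cfg = OmegaConf.create({"use_cpu_initialization": True})
    megatron_amp_O2 = True
    with_distributed_adam = False

    # Mirrors the guard added by the patch: only pre-allocate on the GPU when the
    # weights are actually meant to start out there.
    if megatron_amp_O2 and not with_distributed_adam and not cfg.get("use_cpu_initialization", False):
        print("pre-allocating model on GPU for O2 master parameters")
    else:
        print("skipping GPU pre-allocation (distributed Adam or CPU initialization)")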
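
The second patch replaces the Dockerfile's hard pin on `triton==2.0.0.dev20221202` with a runtime check: `triton` is now listed in `requirements.txt`, and the Triton flash-attention kernel is only imported when the installed build matches the version the kernel was written against; otherwise the symbol is set to `None` so the Triton path stays disabled. A standalone sketch of that guarded-import pattern (package names and the pinned version are taken from the patch itself):

    import pkg_resources

    PINNED_TRITON = "2.0.0.dev20221202"  # pin documented in flash_attn/flash_attn_triton.py#L3

    try:
        # Same pattern as attention.py: import the Triton kernel, then verify the pin.
        from flash_attn.flash_attn_triton import flash_attn_func as flash_attn_func_triton

        assert pkg_resources.get_distribution("triton").version == PINNED_TRITON
    except (ImportError, ModuleNotFoundError, AssertionError):
        # Missing flash-attn or a mismatched triton build: disable the Triton path.
        flash_attn_func_triton = None

    if flash_attn_func_triton is None:
        print("Triton flash attention (attention-bias support) unavailable; using fallback path")
    else:
        print("Triton flash attention enabled")

Users who want the attention-bias (e.g. Alibi) path therefore install the pinned build themselves, e.g. `pip install triton==2.0.0.dev20221202`, the line removed from the Dockerfile.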