Commit ed46ff3

Merge branch 'main' into akoumparouli/nemo_ux_mistral_nemo
Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
akoumpa authored Sep 26, 2024
2 parents a02dbfc + 38e5e09 commit ed46ff3
Showing 19 changed files with 1,027 additions and 86 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cherry-pick-release-commit.yml
@@ -120,7 +120,7 @@ jobs:
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":alert: Cherrypick bot 🤖: Hey @'$USERNAME': Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: @okoenig"
"text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: <@${{ secrets.SLACK_WEBHOOK_ADMIN }}>"
}
}
]
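
The fix above switches to Slack's <@MEMBER_ID> mention syntax, which renders as a real mention instead of plain text. Purely as an illustration (the webhook URL and member ID below are placeholders, and the requests package is assumed to be available), posting a comparable Block Kit payload from Python looks roughly like this:

    import json
    import requests

    # Placeholders only; a real Slack incoming-webhook URL and member ID are needed.
    WEBHOOK_URL = "https://hooks.slack.com/services/T000/B000/XXXX"
    USER_ID = "U0123456789"

    payload = {
        "blocks": [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    # <@U...> renders as a clickable mention; a bare @name stays plain text.
                    "text": f":alert: Cherrypick bot 🤖: Hey <@{USER_ID}>: cherry-pick failed, please resolve manually.",
                },
            }
        ]
    }

    requests.post(WEBHOOK_URL, data=json.dumps(payload), headers={"Content-Type": "application/json"})
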
37 changes: 36 additions & 1 deletion .github/workflows/cicd-main.yml
@@ -5223,6 +5223,24 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }}

  L2_NeMo_2_T5_Pretraining:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
    with:
      RUNNER: self-hosted-azure
      SCRIPT: |
        NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \
          --devices=2 \
          --max-steps=3 \
          --experiment-dir=tests/collections/llm/t5_pretrain_results \
          --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \
          --index-mapping-dir=tests/collections/llm/t5_index_mappings
      AFTER_SCRIPT: |
        rm -rf tests/collections/llm/t5_pretrain_results
        rm -rf tests/collections/llm/t5_index_mappings
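
For reference, the same smoke test can be launched from Python; this is only a sketch mirroring the SCRIPT block above (the data path exists only on the CI runners; NVTE_FUSED_ATTN=0 and NVTE_FLASH_ATTN=0 disable Transformer Engine's fused and flash attention backends):

    import os
    import subprocess

    # Same command as the CI step above, launched via subprocess.
    env = dict(os.environ, NVTE_FUSED_ATTN="0", NVTE_FLASH_ATTN="0")
    subprocess.run(
        [
            "python", "tests/collections/llm/megatron_t5_pretraining.py",
            "--devices=2",
            "--max-steps=3",
            "--experiment-dir=tests/collections/llm/t5_pretrain_results",
            "--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document",
            "--index-mapping-dir=tests/collections/llm/t5_index_mappings",
        ],
        env=env,
        check=True,
    )
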
  Nemo_CICD_Test:
    needs:
      - pre-flight
@@ -5359,6 +5377,7 @@ jobs:
      - L2_NeMo_2_GPT_DDP_Param_Parity_check
      - L2_NeMo_2_SSM_Pretraining
      - L2_NeMo_2_SSM_Finetuning
      - L2_NeMo_2_T5_Pretraining
    if: always()
    runs-on: ubuntu-latest
    steps:
@@ -5377,6 +5396,23 @@ jobs:
      - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }}
        run: exit 0

      - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' && github.event_name == 'pull_request' }}
        uses: peter-evans/create-or-update-comment@v4
        with:
          issue-number: ${{ github.event.number }}
          body: |
            [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
            I just wanted to let you know that a <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|CICD pipeline> for this PR just finished successfully ✨
            So it might be time to merge this PR or to get some approvals 🚀
            But I'm just a 🤖 so I'll leave it to you what to do next.
            Have a great day!
            //cc @ko3n1g
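
The success comment above is posted with peter-evans/create-or-update-comment (which defaults to the built-in GITHUB_TOKEN). Purely as an illustration of what that boils down to (the token, repository, and PR number below are placeholders), the equivalent GitHub REST call is:

    import requests

    TOKEN = "ghp_placeholder"   # placeholder; not how the workflow authenticates
    REPO = "NVIDIA/NeMo"        # repository in owner/name form
    PR_NUMBER = 1               # placeholder pull request number

    requests.post(
        f"https://api.github.com/repos/{REPO}/issues/{PR_NUMBER}/comments",
        headers={"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"},
        json={"body": "[🤖]: CICD pipeline for this PR finished successfully ✨"},
    )
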
      - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
        name: Checkout repository
        uses: actions/checkout@v4
@@ -5452,4 +5488,3 @@ jobs:
      - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }}
        run: |
          exit 1
28 changes: 17 additions & 11 deletions nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
@@ -27,7 +27,7 @@

class AutoTokenizer(TokenizerSpec):
    """
    Wrapper of HuggingFace AutoTokenizer https://huggingface.co/transformers/model_doc/auto.html#autotokenizer.
    """

@@ -46,15 +46,14 @@ def __init__(
        use_fast: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
    ):
        """
        Args:
            pretrained_model_name: corresponds to HuggingFace-AutoTokenizer's 'pretrained_model_name_or_path' input argument.
                For more details please refer to https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained.
                The list of all supported models can be found here: ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
            vocab_file: path to file with vocabulary which consists
                of characters separated by newlines.
            mask_token: mask token
            bos_token: the beginning of sequence token
            eos_token: the end of sequence token. Usually equal to sep_token
            pad_token: token to use for padding
@@ -132,24 +131,24 @@ def __init__(

        if len(new_tokens_in_vocab) > 0:
            """
            Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
            the vocabulary and, as a result, the model should be resized, for example:

            # define your model
            pretrained_model_name = 'roberta-base'
            model = nemo_nlp.modules.get_lm_model(pretrained_model_name=pretrained_model_name)
            # define pretrained tokenizer
            tokenizer_default = nemo_nlp.modules.get_tokenizer(tokenizer_name=pretrained_model_name)
            special_tokens = {'bos_token': '<BOS>',
                              'cls_token': '<CSL>',
                              'additional_special_tokens': ['<MY_NER_TOKEN>', '<ANOTHER_TOKEN>']}
            tokenizer_default.add_special_tokens(special_tokens_dict=special_tokens)
            # resize your model so that the embeddings for newly added tokens are updated during training/finetuning
            model.resize_token_embeddings(tokenizer_default.vocab_size)
            See NLP_Tokenizers.ipynb for more details.
            """
            logging.warning(
@@ -159,6 +158,7 @@ def __init__(
            )
            self.add_special_tokens(special_tokens_dict)
        self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y')
        self._inv_vocab_dict = {}

    @property
    def vocab_size(self):
@@ -226,6 +226,12 @@ def vocab(self):
        id2vocab = {v: k for k, v in self.tokenizer.vocab.items()}
        return [id2vocab[i] for i in range(len(id2vocab))]

    @property
    def inv_vocab(self):
        if self._inv_vocab_dict == {}:
            self._inv_vocab_dict = {v: k for k, v in self.tokenizer.vocab.items()}
        return self._inv_vocab_dict

    @property
    def pad_id(self):
        if getattr(self, 'pad_token') is None:
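
The new inv_vocab property builds the id-to-token mapping once and caches it in self._inv_vocab_dict instead of recomputing the inversion on every call. A tiny self-contained sketch of the same pattern (the dict below is just a stand-in for self.tokenizer.vocab):

    # Stand-in vocabulary; the real one comes from the HuggingFace tokenizer.
    vocab = {"hello": 0, "world": 1, "<pad>": 2}
    _inv_vocab_cache = {}

    def inv_vocab():
        global _inv_vocab_cache
        if _inv_vocab_cache == {}:  # build lazily on first access, then reuse
            _inv_vocab_cache = {v: k for k, v in vocab.items()}
        return _inv_vocab_cache

    print(inv_vocab()[1])  # -> 'world'
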
5 changes: 5 additions & 0 deletions nemo/collections/llm/__init__.py
@@ -102,13 +102,18 @@
    gpt_data_step,
    gpt_forward_step,
)
from nemo.collections.llm.t5.model import T5Config, T5Model, t5_data_step, t5_forward_step

__all__ = [
    "MockDataModule",
    "GPTModel",
    "GPTConfig",
    "gpt_data_step",
    "gpt_forward_step",
    "T5Model",
    "T5Config",
    "t5_data_step",
    "t5_forward_step",
    "MaskedTokenLossReduction",
    "MistralConfig7B",
    "MistralNeMoConfig12B",
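
With these additions the T5 entry points are importable straight from the collection root, mirroring the existing GPT exports. A minimal sketch, assuming a NeMo installation that includes this commit (constructor arguments are not shown because they are defined in nemo/collections/llm/t5/model):

    # Only verifies that the new symbols resolve from the package root.
    from nemo.collections.llm import T5Config, T5Model, t5_data_step, t5_forward_step

    print(T5Model, T5Config)
    print(t5_data_step, t5_forward_step)
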
2 changes: 1 addition & 1 deletion nemo/collections/llm/peft/lora.py
@@ -50,7 +50,7 @@ def forward(self, x):
            linear_output, bias, layernorm_output = linear_output
            x = layernorm_output

-        adapter_output = self.adapter(x)
+        adapter_output = self.adapter(x.contiguous())
        return linear_output + adapter_output, bias


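
The one-line change calls .contiguous() on the adapter input. A quick PyTorch illustration (assumes PyTorch is installed; not NeMo-specific) of why a non-contiguous activation, such as one produced by a transpose, can need this before being handed to another module:

    import torch

    x = torch.randn(4, 8).t()     # a transposed view is not contiguous in memory
    print(x.is_contiguous())      # False

    x_c = x.contiguous()          # materializes a contiguous copy with the same values
    print(x_c.is_contiguous())    # True
    print(torch.equal(x, x_c))    # True: only the memory layout changed
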
Empty file.
3 changes: 3 additions & 0 deletions nemo/collections/llm/t5/data/__init__.py
@@ -0,0 +1,3 @@
from nemo.collections.llm.t5.data.pre_training import PreTrainingDataModule

__all__ = ["PreTrainingDataModule"]