Commit 7b8cfd9
Merge remote-tracking branch 'origin/main' into huvu/t5_nemo2.0_nemoci
Huy Vu2 committed Nov 19, 2024
2 parents dd2c4aa + da79f6a
Showing 615 changed files with 11,960 additions and 1,857 deletions.
28 changes: 14 additions & 14 deletions .github/workflows/cicd-main.yml
@@ -2943,7 +2943,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+ python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
@@ -2975,7 +2975,7 @@ jobs:
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+ python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
@@ -3398,8 +3398,8 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \
- --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
+ python examples/nlp/language_modeling/megatron_t5_eval.py \
+ --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
--prompt "How do I fix my GPU memory issue? I am seeing <mask> out of memory." \
--tensor_model_parallel_size 1
@@ -3410,7 +3410,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
+ python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
@@ -3421,7 +3421,7 @@
exp_manager.exp_dir=/tmp/nlp_mcore_t5_lora_tuning_tp2 \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
- model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
+ model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -3433,8 +3433,8 @@
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
- model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
+ python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
+ model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
model.peft.restore_from_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \
model.peft.restore_from_ckpt_name=null \
model.peft.restore_from_hparams_path=null \
@@ -3852,14 +3852,14 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \
+ python tests/collections/llm/megatron_t5_pretraining.py \
--devices=2 \
--max-steps=3 \
--experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \
--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \
--index-mapping-dir=tests/collections/llm/t5_index_mappings/${{ github.run_id }}
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \
+ python tests/collections/llm/megatron_t5_pretraining.py \
--devices=2 \
--max-steps=6 \
--experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \
@@ -3876,11 +3876,11 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \
+ python tests/collections/llm/megatron_t5_finetuning.py \
--devices=2 \
--max-steps=250 \
--experiment-dir=tests/collections/llm/t5_finetune_results/${{ github.run_id }} \
- --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps
+ --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_padding_attnmasktype_150steps
AFTER_SCRIPT: |
rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }}
@@ -3891,12 +3891,12 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \
+ python tests/collections/llm/megatron_t5_finetuning.py \
--devices=2 \
--max-steps=250 \
--peft=lora \
--experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \
- --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps
+ --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_padding_attnmasktype_150steps
AFTER_SCRIPT: |
rm -rf tests/collections/llm/t5_peft_results/${{ github.run_id }}
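A note on the recurring edit in this workflow: NVTE_FUSED_ATTN=0 and NVTE_FLASH_ATTN=0 are Transformer Engine environment variables that force-disable the fused and flash attention backends, and every T5 job above drops them while switching to the new padding_attnmasktype checkpoints. A minimal Python sketch of how such an env-var toggle is typically read (the helper below is hypothetical; Transformer Engine's actual parsing is not part of this diff):

    import os

    # Hypothetical reader for the NVTE_* kill switches removed above:
    # "0" disables a backend; unset (or "1") leaves it enabled.
    def backend_enabled(var: str, default: str = "1") -> bool:
        return os.environ.get(var, default) != "0"

    use_flash = backend_enabled("NVTE_FLASH_ATTN")
    use_fused = backend_enabled("NVTE_FUSED_ATTN")
    print(f"flash attention: {use_flash}, fused attention: {use_fused}")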
2 changes: 1 addition & 1 deletion .github/workflows/monitor-vms.yml
@@ -27,7 +27,7 @@ jobs:
| jq -c '[
.runners[]
| select(.status == "online")
- | select(.name | contains("gpu"))
+ | select(.name | contains("cpu") | not)
| {
"vm": .name,
"n_gpus": [
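The jq change above loosens runner selection: the monitor previously kept only online runners whose name contains "gpu"; it now keeps every online runner whose name does not contain "cpu". A small Python sketch of the same predicate, using made-up runner names:

    # Same selection logic as the new jq filter, over hypothetical runner records.
    runners = [
        {"name": "azure-gpu-01", "status": "online"},
        {"name": "azure-cpu-01", "status": "online"},
        {"name": "azure-h100-02", "status": "online"},  # no "gpu" in the name: the old filter missed it
    ]
    selected = [r for r in runners if r["status"] == "online" and "cpu" not in r["name"]]
    print([r["name"] for r in selected])  # ['azure-gpu-01', 'azure-h100-02']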
3 changes: 2 additions & 1 deletion .github/workflows/release.yml
@@ -23,7 +23,7 @@ on:

jobs:
release:
- uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.10.0
+ uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3
with:
release-ref: ${{ inputs.release-ref }}
image-name: nemo_container
@@ -39,3 +39,4 @@ jobs:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+ PAT: ${{ secrets.PAT }}
8 changes: 6 additions & 2 deletions .secrets.baseline
@@ -90,6 +90,10 @@
{
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
},
+ {
+ "path": "detect_secrets.filters.common.is_baseline_file",
+ "filename": ".secrets.baseline"
+ },
{
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
"min_level": 2
@@ -273,7 +277,7 @@
"filename": "scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py",
"hashed_secret": "e0308bd21bffc156d79208f9ecf130370a015002",
"is_verified": false,
"line_number": 460
"line_number": 471
}
],
"scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py": [
@@ -2083,5 +2087,5 @@
}
]
},
"generated_at": "2024-10-25T13:43:17Z"
"generated_at": "2024-11-14T09:37:19Z"
}
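The block added to .secrets.baseline registers detect-secrets' is_baseline_file filter, which keeps the baseline file itself out of secret scanning. Its contract amounts to a filename match; a rough sketch of that behavior (an assumption based on the filter's name and the "filename" argument recorded above, not the library's actual code):

    # Rough sketch: findings located in the baseline file itself are filtered out.
    def is_baseline_file(filename: str, baseline_filename: str = ".secrets.baseline") -> bool:
        return filename == baseline_filename

    print(is_baseline_file(".secrets.baseline"))  # True  -> finding dropped
    print(is_baseline_file("src/app.py"))         # False -> finding kept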
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -54,7 +54,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.19.0
- ARG MCORE_TAG=aded519cfb1de2abf96f36ca059f992294b7876f
+ ARG MCORE_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
58 changes: 0 additions & 58 deletions docs/source/nlp/distillation.rst

This file was deleted.

67 changes: 0 additions & 67 deletions docs/source/nlp/nemo_megatron/model_distillation/drop_layers.rst

This file was deleted.

2 changes: 1 addition & 1 deletion docs/source/nlp/punctuation_and_capitalization.rst
@@ -240,7 +240,7 @@ An example of a config file is
- trainer config
-
- Parameters of
- `pytorch_lightning.Trainer <https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api>`_.
+ `lightning.pytorch.Trainer <https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api>`_.
* - **exp_manager**
- exp manager config
-
2 changes: 1 addition & 1 deletion docs/source/starthere/fundamentals.rst
@@ -116,7 +116,7 @@ Below is an example training script for our ``ExampleEncDecModel`` model. We hig
:linenos:
:emphasize-lines: 10, 11, 12
- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from nemo.collections.path_to_model_class import ExampleEncDecModel
from nemo.core.config import hydra_runner
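The documentation edits above and every remaining file in this commit apply the same one-line rename: the standalone pytorch_lightning import becomes lightning.pytorch, the namespace shipped by the unified lightning package. Because the module is aliased to pl, call sites stay untouched; a minimal sketch (assuming the lightning package is installed):

    # Old: import pytorch_lightning as pl
    import lightning.pytorch as pl

    # The pl alias keeps downstream code identical, e.g. building a Trainer:
    trainer = pl.Trainer(devices=1, max_steps=10)
    print(type(trainer).__module__)  # lightning.pytorch.trainer.trainer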
2 changes: 1 addition & 1 deletion examples/asr/asr_adapters/eval_asr_adapter.py
@@ -36,7 +36,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf, open_dict

from nemo.collections.asr.models import ASRModel
2 changes: 1 addition & 1 deletion examples/asr/asr_adapters/train_asr_adapter.py
@@ -84,7 +84,7 @@
import os
from dataclasses import is_dataclass

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import DictConfig, OmegaConf, open_dict

from nemo.collections.asr.models import ASRModel
@@ -49,7 +49,7 @@
from dataclasses import dataclass
from typing import Optional

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
import torch
from omegaconf import OmegaConf

@@ -42,7 +42,7 @@
from dataclasses import dataclass
from typing import Optional

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
import torch
from omegaconf import OmegaConf

@@ -64,7 +64,7 @@
from dataclasses import dataclass
from typing import Optional

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
import torch
from omegaconf import OmegaConf, open_dict

2 changes: 1 addition & 1 deletion examples/asr/asr_ctc/speech_to_text_ctc.py
@@ -68,7 +68,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecCTCModel
2 changes: 1 addition & 1 deletion examples/asr/asr_ctc/speech_to_text_ctc_bpe.py
@@ -64,7 +64,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
@@ -58,7 +58,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
@@ -69,7 +69,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecHybridRNNTCTCModel
2 changes: 1 addition & 1 deletion examples/asr/asr_transducer/speech_to_text_rnnt.py
@@ -67,7 +67,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecRNNTModel