Commit

Merge branch 'NVIDIA:main' into auto_cudagraph
JimmyZhang12 committed Aug 28, 2024
2 parents aaada57 + f45422a commit a0c0445
Showing 139 changed files with 12,395 additions and 599 deletions.
115 changes: 74 additions & 41 deletions .github/workflows/cicd-main.yml
@@ -159,6 +159,21 @@ jobs:
rm -f /home/TestData/nlp/megatron_ir/sbert/sbert.nemo
rm -rf /home/TestData/nlp/megatron_ir/sbert/model_weights
L2_Community_LLM_Checkpoints_tests_Mamba2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
--input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \
--output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo \
--precision=bf16 \
--mamba_ssm_ngroups 1
AFTER_SCRIPT: |
rm -f /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
rm -rf /home/TestData/nlp/megatron_mamba/model_weights
L2_Community_LLM_Checkpoints_tests_Llama:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -484,41 +499,41 @@ jobs:
AFTER_SCRIPT: |
rm -rf examples/asr/speech_finetuning_results
OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |-
python examples/asr/speech_to_text_finetune.py \
--config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \
~model.train_ds.hf_data_cfg \
model.train_ds.num_workers=1 \
model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \
model.train_ds.streaming=true \
+model.train_ds.hf_data_cfg.path="librispeech_asr" \
+model.train_ds.hf_data_cfg.name=null \
+model.train_ds.hf_data_cfg.split="test.clean" \
+model.train_ds.hf_data_cfg.streaming=true \
~model.validation_ds.hf_data_cfg \
model.validation_ds.streaming=true \
+model.validation_ds.hf_data_cfg.path="librispeech_asr" \
+model.validation_ds.hf_data_cfg.name=null \
+model.validation_ds.hf_data_cfg.split="test.clean" \
+model.validation_ds.hf_data_cfg.streaming=true \
~model.test_ds \
init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
model.tokenizer.update_tokenizer=False \
model.optim.sched.warmup_steps=0 \
+model.optim.sched.max_steps=3 \
trainer.max_epochs=null \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_finetuning_results
AFTER_SCRIPT: |
rm -rf examples/asr/speech_finetuning_results
IS_OPTIONAL: true
# OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
# needs: [cicd-test-container-setup]
# uses: ./.github/workflows/_test_template.yml
# with:
# RUNNER: self-hosted-azure-gpus-1
# SCRIPT: |-
# python examples/asr/speech_to_text_finetune.py \
# --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \
# ~model.train_ds.hf_data_cfg \
# model.train_ds.num_workers=1 \
# model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \
# model.train_ds.streaming=true \
# +model.train_ds.hf_data_cfg.path="librispeech_asr" \
# +model.train_ds.hf_data_cfg.name=null \
# +model.train_ds.hf_data_cfg.split="test.clean" \
# +model.train_ds.hf_data_cfg.streaming=true \
# ~model.validation_ds.hf_data_cfg \
# model.validation_ds.streaming=true \
# +model.validation_ds.hf_data_cfg.path="librispeech_asr" \
# +model.validation_ds.hf_data_cfg.name=null \
# +model.validation_ds.hf_data_cfg.split="test.clean" \
# +model.validation_ds.hf_data_cfg.streaming=true \
# ~model.test_ds \
# init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
# model.tokenizer.update_tokenizer=False \
# model.optim.sched.warmup_steps=0 \
# +model.optim.sched.max_steps=3 \
# trainer.max_epochs=null \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=True \
# exp_manager.exp_dir=examples/asr/speech_finetuning_results
# AFTER_SCRIPT: |
# rm -rf examples/asr/speech_finetuning_results
# IS_OPTIONAL: true

ASR_dev_run_Speech_to_Text_WPE_-_Conformer:
needs: [cicd-test-container-setup]
@@ -2046,7 +2061,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2076,7 +2091,7 @@ jobs:
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2113,7 +2128,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2144,7 +2159,7 @@ jobs:
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2184,7 +2199,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2214,7 +2229,7 @@ jobs:
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -4738,13 +4753,30 @@ jobs:
rm -rf examples/llm/gpt_pretrain_results
rm -rf examples/llm/gpt_index_mappings
L2_NeMo_2_GPT_DDP_Param_Parity_check:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/lightning/test_ddp_parity_checker.py \
--vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
--merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
--data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document
AFTER_SCRIPT: |
rm -rf examples/llm/gpt_pretrain_results
rm -rf examples/llm/gpt_index_mappings
Nemo_CICD_Test:
needs:
- gpu-test
- cicd-test-container-setup
- L0_Unit_Tests_GPU
#- OPTIONAL_L0_Unit_Tests_CPU
- L2_Community_LLM_Checkpoints_tests_Bert
- L2_Community_LLM_Checkpoints_tests_Mamba2
- L2_Community_LLM_Checkpoints_tests_Llama
- L2_Community_LLM_Checkpoints_tests_StarCoder
- L2_Community_LLM_Checkpoints_tests_Falcon
@@ -4843,6 +4875,7 @@ jobs:
- Speech_Checkpoints_tests
#- OPTIONAL_L2_Stable_Diffusion_Training
- L2_NeMo_2_GPT_Pretraining_no_transformer_engine
- L2_NeMo_2_GPT_DDP_Param_Parity_check
if: always()
runs-on: ubuntu-latest
steps:
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.15.0
ARG MCORE_TAG=2fd6e2b74efca73a1f2d27b89bb5419384b4d3bf
ARG MCORE_TAG=34e607ef41cf1c0ed481a678df9c76952d0ec00c
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
63 changes: 63 additions & 0 deletions docs/source/asr/asr_language_modeling_and_customization.rst
@@ -547,6 +547,69 @@ The following is the list of the arguments for the opengrm script:
| force | bool | ``False`` | Whether to recompile and rewrite all files |
+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+

.. _wfst-ctc-decoding:

WFST CTC decoding
=================
Weighted Finite-State Transducers (WFSTs) are finite-state machines whose transitions carry input and output symbols and a weight from a semiring. WFSTs can act as N-gram LMs in a special type of LM-forced beam search called WFST decoding.

.. note::

More precisely, WFST decoding is closer to a greedy N-depth search with an LM.
It is therefore asymptotically worse than conventional beam search decoding algorithms, but faster.

**WARNING**
At the moment, NeMo supports WFST decoding only for CTC models and word-based LMs.

To run WFST decoding in NeMo, one needs to provide a NeMo ASR model and either an ARPA LM or a WFST LM (advanced). An ARPA LM can be built from source text with KenLM as follows: ``<kenlm_bin_path>/lmplz -o <ngram_length> --arpa <out_arpa_path> --prune <ngram_prune>``.
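
For reference, here is a minimal Python sketch of the same ARPA-building step, wrapping the ``lmplz`` command above via ``subprocess``. The binary path, corpus file, n-gram order, and prune thresholds below are placeholder assumptions, not values required by NeMo:

.. code-block:: python

    import subprocess

    kenlm_bin_path = "/path/to/kenlm/build/bin"  # assumption: adjust to your KenLM build
    corpus_path = "train_text.txt"               # source text, one sentence per line
    out_arpa_path = "lm_3gram.arpa"

    # lmplz reads the corpus from stdin and writes the ARPA LM to the --arpa path.
    with open(corpus_path, "rb") as corpus:
        subprocess.run(
            [f"{kenlm_bin_path}/lmplz", "-o", "3", "--arpa", out_arpa_path, "--prune", "0", "1", "1"],
            stdin=corpus,
            check=True,
        )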

The script to evaluate an ASR model with WFST decoding and N-gram models can be found at
`scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py
<https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py>`__.

This script has a large number of possible argument overrides; therefore, it is advised to run ``python eval_wfst_decoding_ctc.py --help`` to see the full list of arguments.

You can evaluate an ASR model as follows:

.. code-block::
python eval_wfst_decoding_ctc.py nemo_model_file=<path to the .nemo file of the model> \
input_manifest=<path to the evaluation JSON manifest file> \
arpa_model_file=<path to the ARPA LM model> \
decoding_wfst_file=<path to the decoding WFST file> \
beam_width=[<list of the beam widths, separated with commas>] \
lm_weight=[<list of the LM weight multipliers, separated with commas>] \
open_vocabulary_decoding=<whether to use open vocabulary mode for WFST decoding> \
decoding_mode=<decoding mode, affects output. Usually "nbest"> \
decoding_search_type=<WFST decoding library. Usually "riva"> \
preds_output_folder=<optional folder to store the predictions> \
probs_cache_file=null
.. note::

Since WFST decoding is LM-forced (the search goes over the WFST graph), only word sequences accepted by the WFST can appear in the decoding results.
To circumvent this restriction, one can pass ``open_vocabulary_decoding=true`` (experimental feature).


Quick start example
-------------------

.. code-block::
wget -O - https://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz | \
gunzip -c | tr '[:upper:]' '[:lower:]' > 3-gram.pruned.1e-7.arpa && \
python eval_wfst_decoding_ctc.py nemo_model_file="stt_en_conformer_ctc_small_ls" \
input_manifest="<data_dir>/Librispeech/test_other.json" \
arpa_model_file="3-gram.pruned.1e-7.arpa" \
decoding_wfst_file="3-gram.pruned.1e-7.fst" \
beam_width=[8] \
lm_weight=[0.5,0.6,0.7,0.8,0.9]
.. note::

Building a decoding WFST is a long process, so it is better to provide a ``decoding_wfst_file`` path even if the file does not exist yet.
This way, the decoding WFST will be cached at the specified file path and will not need to be rebuilt on the next run.


***************************************************
Context-biasing (word boosting) without external LM
2 changes: 1 addition & 1 deletion docs/source/multimodal/text2img/sd.rst
@@ -163,7 +163,7 @@ Optimization related configurations
Training with precached latents
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Since the VAE and text encoder remain frozed during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.
Since the VAE and text encoder remain frozen during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.

Reference
-----------
3 changes: 2 additions & 1 deletion docs/source/nlp/nemo_megatron/intro.rst
@@ -20,6 +20,7 @@ To learn more about using NeMo to train Large Language Models at scale, please r
peft/landing_page
positional_embeddings
mcore_customization
rampup_batch_size


References
@@ -28,4 +29,4 @@ References
.. bibliography:: ../nlp_all.bib
:style: plain
:labelprefix: nlp-megatron
:keyprefix: nlp-megatron-
:keyprefix: nlp-megatron-
62 changes: 62 additions & 0 deletions docs/source/nlp/nemo_megatron/rampup_batch_size.rst
@@ -0,0 +1,62 @@
.. _rampup_batch_size:

Ramp Up Batch Size
------------------

Ramp up batch size is a feature that allows training to start with a smaller global batch size and increase it linearly, in specified increments, to a target global batch size over a given number of training samples.

Usage
-----

To enable global batch size ramp-up during training, set the ``rampup_batch_size`` parameter under the ``model`` section of the training configuration. This parameter is a list of three values:

* ``start_batch_size``: The initial batch size.
* ``batch_size_increment``: The amount by which the batch size will increase at each step.
* ``rampup_samples``: The number of training samples over which the batch size will be ramped up.

``model.global_batch_size=1024 model.rampup_batch_size=[256, 128, 50000000]``

In this example, training starts with a global batch size of 256, increments it by 128 at each ramp-up stage, and reaches the target global batch size of 1024 over 50,000,000 training samples.
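
As a rough illustration of how these three values interact, the implied schedule can be computed with a short sketch (hedged, not NeMo code; it assumes the ramp-up samples are split evenly across the batch-size increments):

.. code-block:: python

    # Derive the ramp-up schedule implied by
    # model.global_batch_size=1024 model.rampup_batch_size=[256, 128, 50000000].
    start_batch_size, batch_size_increment, rampup_samples = 256, 128, 50_000_000
    target_global_batch_size = 1024

    num_increments = (target_global_batch_size - start_batch_size) // batch_size_increment
    samples_per_increment = rampup_samples / num_increments  # assumed equal-sized stages

    for i in range(num_increments + 1):
        batch_size = start_batch_size + i * batch_size_increment
        stage_start_sample = int(i * samples_per_increment)
        print(f"from sample {stage_start_sample:>11,}: global_batch_size = {batch_size}")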

Ramp Up Stages and Training Interruption
----------------------------------------

Once the next ramp-up stage is reached (the point in training at which the global batch size increases), NeMo stops the training. This allows you to rerun the training job with a larger number of GPUs or nodes for the next ramp-up stage.

Automatic Node Scheduling
-------------------------

In the `NeMo-Framework-Launcher <https://github.com/NVIDIA/NeMo-Framework-Launcher>`_, a node scheduler is created automatically when ramp-up batch size is used. This scheduler uses a smaller number of nodes for the smaller batch-size stages and scales up according to the ``training.trainer.num_nodes`` parameter, which corresponds to the maximum number of nodes to use at the maximum global batch size.

Example
-------

The following is a detailed example of using the ramp-up batch size feature with a GPT-3 5B model and the `NeMo-Framework-Launcher <https://github.com/NVIDIA/NeMo-Framework-Launcher>`_. In this example, training started with a global batch size of 256, increased it by 256 at each ramp-up stage, and reached the target global batch size of 2048 over 10,000,000 training samples.

The node schedule looks as follows:

+--------------------+--------------------+
| global_batch_size | num_nodes |
+====================+====================+
| 256 | 8 |
+--------------------+--------------------+
| 512 | 8 |
+--------------------+--------------------+
| 768 | 8 |
+--------------------+--------------------+
| 1024 | 8 |
+--------------------+--------------------+
| 1280 | 10 |
+--------------------+--------------------+
| 1536 | 12 |
+--------------------+--------------------+
| 1792 | 14 |
+--------------------+--------------------+
| 2048 | 16 |
+--------------------+--------------------+
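
For illustration, the schedule above can be reproduced with a short sketch under one assumed scheduling rule (nodes proportional to the global batch size, with a floor of 8 nodes in this example); the launcher's actual logic may differ:

.. code-block:: python

    # Reproduce the node schedule table above under an assumed proportional rule.
    max_nodes, target_gbs, min_nodes = 16, 2048, 8  # example values taken from the table

    for gbs in range(256, target_gbs + 1, 256):
        num_nodes = max(min_nodes, gbs * max_nodes // target_gbs)
        print(f"global_batch_size={gbs:>4}  num_nodes={num_nodes}")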

Plot of ``global_batch_size`` increase during training:

.. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-rampup-batch-size-example.png
:alt: Ramp-up of global_batch_size during training
:width: 1080px