Commit

Merge branch 'NVIDIA:main' into auto_cudagraph
JimmyZhang12 committed Aug 28, 2024
2 parents aaada57 + f45422a commit a0c0445
Showing 139 changed files with 12,395 additions and 599 deletions.
115 changes: 74 additions & 41 deletions .github/workflows/cicd-main.yml
@@ -159,6 +159,21 @@ jobs:
rm -f /home/TestData/nlp/megatron_ir/sbert/sbert.nemo
rm -rf /home/TestData/nlp/megatron_ir/sbert/model_weights
L2_Community_LLM_Checkpoints_tests_Mamba2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
--input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \
--output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo \
--precision=bf16 \
--mamba_ssm_ngroups 1
AFTER_SCRIPT: |
rm -f /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
rm -rf /home/TestData/nlp/megatron_mamba/model_weights
L2_Community_LLM_Checkpoints_tests_Llama:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -484,41 +499,41 @@ jobs:
AFTER_SCRIPT: |
rm -rf examples/asr/speech_finetuning_results
OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |-
python examples/asr/speech_to_text_finetune.py \
--config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \
~model.train_ds.hf_data_cfg \
model.train_ds.num_workers=1 \
model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \
model.train_ds.streaming=true \
+model.train_ds.hf_data_cfg.path="librispeech_asr" \
+model.train_ds.hf_data_cfg.name=null \
+model.train_ds.hf_data_cfg.split="test.clean" \
+model.train_ds.hf_data_cfg.streaming=true \
~model.validation_ds.hf_data_cfg \
model.validation_ds.streaming=true \
+model.validation_ds.hf_data_cfg.path="librispeech_asr" \
+model.validation_ds.hf_data_cfg.name=null \
+model.validation_ds.hf_data_cfg.split="test.clean" \
+model.validation_ds.hf_data_cfg.streaming=true \
~model.test_ds \
init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
model.tokenizer.update_tokenizer=False \
model.optim.sched.warmup_steps=0 \
+model.optim.sched.max_steps=3 \
trainer.max_epochs=null \
trainer.devices=1 \
trainer.accelerator="gpu" \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_finetuning_results
AFTER_SCRIPT: |
rm -rf examples/asr/speech_finetuning_results
IS_OPTIONAL: true
# OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
# needs: [cicd-test-container-setup]
# uses: ./.github/workflows/_test_template.yml
# with:
# RUNNER: self-hosted-azure-gpus-1
# SCRIPT: |-
# python examples/asr/speech_to_text_finetune.py \
# --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \
# ~model.train_ds.hf_data_cfg \
# model.train_ds.num_workers=1 \
# model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \
# model.train_ds.streaming=true \
# +model.train_ds.hf_data_cfg.path="librispeech_asr" \
# +model.train_ds.hf_data_cfg.name=null \
# +model.train_ds.hf_data_cfg.split="test.clean" \
# +model.train_ds.hf_data_cfg.streaming=true \
# ~model.validation_ds.hf_data_cfg \
# model.validation_ds.streaming=true \
# +model.validation_ds.hf_data_cfg.path="librispeech_asr" \
# +model.validation_ds.hf_data_cfg.name=null \
# +model.validation_ds.hf_data_cfg.split="test.clean" \
# +model.validation_ds.hf_data_cfg.streaming=true \
# ~model.test_ds \
# init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \
# model.tokenizer.update_tokenizer=False \
# model.optim.sched.warmup_steps=0 \
# +model.optim.sched.max_steps=3 \
# trainer.max_epochs=null \
# trainer.devices=1 \
# trainer.accelerator="gpu" \
# +trainer.fast_dev_run=True \
# exp_manager.exp_dir=examples/asr/speech_finetuning_results
# AFTER_SCRIPT: |
# rm -rf examples/asr/speech_finetuning_results
# IS_OPTIONAL: true

ASR_dev_run_Speech_to_Text_WPE_-_Conformer:
needs: [cicd-test-container-setup]
@@ -2046,7 +2061,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2076,7 +2091,7 @@ jobs:
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2113,7 +2128,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2144,7 +2159,7 @@ jobs:
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2184,7 +2199,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -2214,7 +2229,7 @@ jobs:
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings
NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \
python examples/nlp/language_modeling/megatron_bert_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
@@ -4738,13 +4753,30 @@ jobs:
rm -rf examples/llm/gpt_pretrain_results
rm -rf examples/llm/gpt_index_mappings
L2_NeMo_2_GPT_DDP_Param_Parity_check:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/lightning/test_ddp_parity_checker.py \
--vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
--merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
--data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document
AFTER_SCRIPT: |
rm -rf examples/llm/gpt_pretrain_results
rm -rf examples/llm/gpt_index_mappings
Nemo_CICD_Test:
needs:
- gpu-test
- cicd-test-container-setup
- L0_Unit_Tests_GPU
#- OPTIONAL_L0_Unit_Tests_CPU
- L2_Community_LLM_Checkpoints_tests_Bert
- L2_Community_LLM_Checkpoints_tests_Mamba2
- L2_Community_LLM_Checkpoints_tests_Llama
- L2_Community_LLM_Checkpoints_tests_StarCoder
- L2_Community_LLM_Checkpoints_tests_Falcon
@@ -4843,6 +4875,7 @@ jobs:
- Speech_Checkpoints_tests
#- OPTIONAL_L2_Stable_Diffusion_Training
- L2_NeMo_2_GPT_Pretraining_no_transformer_engine
- L2_NeMo_2_GPT_DDP_Param_Parity_check
if: always()
runs-on: ubuntu-latest
steps:
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.15.0
ARG MCORE_TAG=2fd6e2b74efca73a1f2d27b89bb5419384b4d3bf
ARG MCORE_TAG=34e607ef41cf1c0ed481a678df9c76952d0ec00c
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
63 changes: 63 additions & 0 deletions docs/source/asr/asr_language_modeling_and_customization.rst
@@ -547,6 +547,69 @@ The following is the list of the arguments for the opengrm script:
| force | bool | ``False`` | Whether to recompile and rewrite all files |
+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+

.. _wfst-ctc-decoding:

WFST CTC decoding
=================
Weighted Finite-State Transducers (WFSTs) are finite-state machines whose transitions carry input and output symbols and a weight from a semiring. WFSTs can act as N-gram LMs in a special type of LM-forced beam search called WFST decoding.

.. note::

More precisely, WFST decoding is closer to a greedy N-depth search with an LM.
It is therefore asymptotically worse than conventional beam search decoding algorithms, but faster.

**WARNING**
At the moment, NeMo supports WFST decoding only for CTC models and word-based LMs.

To run WFST decoding in NeMo, one needs to provide a NeMo ASR model and either an ARPA LM or a WFST LM (advanced). An ARPA LM can be built from source text with KenLM as follows: ``<kenlm_bin_path>/lmplz -o <ngram_length> --arpa <out_arpa_path> --prune <ngram_prune>``.
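
For reference, here is a minimal Python sketch of the same ARPA-building step, wrapping the ``lmplz`` command above via ``subprocess``. The binary path, corpus file, n-gram order, and prune thresholds below are placeholder assumptions, not values required by NeMo:

.. code-block:: python

    import subprocess

    kenlm_bin_path = "/path/to/kenlm/build/bin"  # assumption: adjust to your KenLM build
    corpus_path = "train_text.txt"               # source text, one sentence per line
    out_arpa_path = "lm_3gram.arpa"

    # lmplz reads the corpus from stdin and writes the ARPA LM to the --arpa path.
    with open(corpus_path, "rb") as corpus:
        subprocess.run(
            [f"{kenlm_bin_path}/lmplz", "-o", "3", "--arpa", out_arpa_path, "--prune", "0", "1", "1"],
            stdin=corpus,
            check=True,
        )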

The script to evaluate an ASR model with WFST decoding and N-gram models can be found at
`scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py
<https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py>`__.

This script has a large number of possible argument overrides; therefore, it is advised to run ``python eval_wfst_decoding_ctc.py --help`` to see the full list of arguments.

You can evaluate an ASR model as follows:

.. code-block::
python eval_wfst_decoding_ctc.py nemo_model_file=<path to the .nemo file of the model> \
input_manifest=<path to the evaluation JSON manifest file> \
arpa_model_file=<path to the ARPA LM model> \
decoding_wfst_file=<path to the decoding WFST file> \
beam_width=[<list of the beam widths, separated with commas>] \
lm_weight=[<list of the LM weight multipliers, separated with commas>] \
open_vocabulary_decoding=<whether to use open vocabulary mode for WFST decoding> \
decoding_mode=<decoding mode, affects output. Usually "nbest"> \
decoding_search_type=<WFST decoding library. Usually "riva"> \
preds_output_folder=<optional folder to store the predictions> \
probs_cache_file=null
.. note::

Since WFST decoding is LM-forced (the search goes over the WFST graph), only word sequences accepted by the WFST can appear in the decoding results.
To circumvent this restriction, one can pass ``open_vocabulary_decoding=true`` (experimental feature).


Quick start example
-------------------

.. code-block::
wget -O - https://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz | \
gunzip -c | tr '[:upper:]' '[:lower:]' > 3-gram.pruned.1e-7.arpa && \
python eval_wfst_decoding_ctc.py nemo_model_file="stt_en_conformer_ctc_small_ls" \
input_manifest="<data_dir>/Librispeech/test_other.json" \
arpa_model_file="3-gram.pruned.1e-7.arpa" \
decoding_wfst_file="3-gram.pruned.1e-7.fst" \
beam_width=[8] \
lm_weight=[0.5,0.6,0.7,0.8,0.9]
.. note::

Building a decoding WFST is a long process, so it is better to provide a ``decoding_wfst_file`` path even if the file does not exist yet.
This way, the decoding WFST will be cached at the specified file path and will not need to be rebuilt on the next run.


***************************************************
Context-biasing (word boosting) without external LM
2 changes: 1 addition & 1 deletion docs/source/multimodal/text2img/sd.rst
@@ -163,7 +163,7 @@ Optimization related configurations
Training with precached latents
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Since the VAE and text encoder remain frozed during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.
Since the VAE and text encoder remain frozen during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.

Reference
-----------
3 changes: 2 additions & 1 deletion docs/source/nlp/nemo_megatron/intro.rst
@@ -20,6 +20,7 @@ To learn more about using NeMo to train Large Language Models at scale, please r
peft/landing_page
positional_embeddings
mcore_customization
rampup_batch_size


References
@@ -28,4 +29,4 @@ References
.. bibliography:: ../nlp_all.bib
:style: plain
:labelprefix: nlp-megatron
:keyprefix: nlp-megatron-
:keyprefix: nlp-megatron-
62 changes: 62 additions & 0 deletions docs/source/nlp/nemo_megatron/rampup_batch_size.rst
@@ -0,0 +1,62 @@
.. _rampup_batch_size:

Ramp Up Batch Size
------------------

Ramp up batch size is a feature that allows training to start with a smaller global batch size and increase it linearly, in specified increments, to a target global batch size over a given number of training samples.

Usage
-----

To enable global batch size ramp-up during training, set the ``rampup_batch_size`` parameter under the ``model`` section of the training configuration. This parameter is a list of three values:

* ``start_batch_size``: The initial batch size.
* ``batch_size_increment``: The amount by which the batch size will increase at each step.
* ``rampup_samples``: The number of training samples over which the batch size will be ramped up.

``model.global_batch_size=1024 model.rampup_batch_size=[256, 128, 50000000]``

In this example, training starts with a global batch size of 256, increments it by 128 at each ramp-up stage, and reaches the target global batch size of 1024 over 50,000,000 training samples.
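
As a rough illustration of how these three values interact, the implied schedule can be computed with a short sketch (hedged, not NeMo code; it assumes the ramp-up samples are split evenly across the batch-size increments):

.. code-block:: python

    # Derive the ramp-up schedule implied by
    # model.global_batch_size=1024 model.rampup_batch_size=[256, 128, 50000000].
    start_batch_size, batch_size_increment, rampup_samples = 256, 128, 50_000_000
    target_global_batch_size = 1024

    num_increments = (target_global_batch_size - start_batch_size) // batch_size_increment
    samples_per_increment = rampup_samples / num_increments  # assumed equal-sized stages

    for i in range(num_increments + 1):
        batch_size = start_batch_size + i * batch_size_increment
        stage_start_sample = int(i * samples_per_increment)
        print(f"from sample {stage_start_sample:>11,}: global_batch_size = {batch_size}")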

Ramp Up Stages and Training Interruption
----------------------------------------

Once the next ramp-up stage is reached (the point in training at which the global batch size increases), NeMo stops the training. This allows you to rerun the training job with a larger number of GPUs or nodes for the next ramp-up stage.

Automatic Node Scheduling
-------------------------

In the `NeMo-Framework-Launcher <https://github.com/NVIDIA/NeMo-Framework-Launcher>`_, a node scheduler is created automatically when ramp-up batch size is used. This scheduler uses a smaller number of nodes for the smaller batch-size stages and scales up according to the ``training.trainer.num_nodes`` parameter, which corresponds to the maximum number of nodes to use at the maximum global batch size.

Example
-------

The following is a detailed example of using the ramp-up batch size feature with a GPT-3 5B model and the `NeMo-Framework-Launcher <https://github.com/NVIDIA/NeMo-Framework-Launcher>`_. In this example, training started with a global batch size of 256, increased it by 256 at each ramp-up stage, and reached the target global batch size of 2048 over 10,000,000 training samples.

The node schedule looks as follows:

+--------------------+--------------------+
| global_batch_size | num_nodes |
+====================+====================+
| 256 | 8 |
+--------------------+--------------------+
| 512 | 8 |
+--------------------+--------------------+
| 768 | 8 |
+--------------------+--------------------+
| 1024 | 8 |
+--------------------+--------------------+
| 1280 | 10 |
+--------------------+--------------------+
| 1536 | 12 |
+--------------------+--------------------+
| 1792 | 14 |
+--------------------+--------------------+
| 2048 | 16 |
+--------------------+--------------------+
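
For illustration, the schedule above can be reproduced with a short sketch under one assumed scheduling rule (nodes proportional to the global batch size, with a floor of 8 nodes in this example); the launcher's actual logic may differ:

.. code-block:: python

    # Reproduce the node schedule table above under an assumed proportional rule.
    max_nodes, target_gbs, min_nodes = 16, 2048, 8  # example values taken from the table

    for gbs in range(256, target_gbs + 1, 256):
        num_nodes = max(min_nodes, gbs * max_nodes // target_gbs)
        print(f"global_batch_size={gbs:>4}  num_nodes={num_nodes}")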

Plot of ``global_batch_size`` increase during training:

.. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-rampup-batch-size-example.png
:alt: Ramp-up of global_batch_size during training
:width: 1080px