diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index ebdc99cef847..8c61c767b4f1 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -34,39 +34,32 @@ on: description: Last 2000 characters of the test step's log value: ${{ jobs.main.outputs.log }} jobs: - runner-auto-clean: - runs-on: ${{ inputs.RUNNER }} - steps: - - name: Docker system cleanup - run: | - docker system prune -a --filter "until=48h" --force - + main: runs-on: ${{ inputs.RUNNER }} outputs: conclusion: ${{ steps.main.conclusion }} log: ${{ steps.main.outputs.log }} - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData steps: - - name: Checkout repository - uses: actions/checkout@v4 + - name: Docker system cleanup + run: | + docker system prune -a --filter "until=48h" --force + + - name: Docker pull image + run: | + docker pull nemoci.azurecr.io/nemo_container_${{ github.run_id }} + - id: main name: Run main script timeout-minutes: ${{ inputs.TIMEOUT }} run: | + mkdir -p ${{ github.run_id }} + cd ${{ github.run_id }}/ set +e ( set -e - ${{ inputs.SCRIPT }} + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}' ) 2> >(tee err.log) EXIT_CODE=$? @@ -79,4 +72,5 @@ jobs: if: failure() && inputs.IS_OPTIONAL == false - name: after_script if: always() && inputs.AFTER_SCRIPT != ':' - run: ${{ inputs.AFTER_SCRIPT }} \ No newline at end of file + run: | + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}' \ No newline at end of file diff --git a/.github/workflows/changelog-build.yml b/.github/workflows/changelog-build.yml index 7e16c344acb8..b6d836f71cec 100644 --- a/.github/workflows/changelog-build.yml +++ b/.github/workflows/changelog-build.yml @@ -1,13 +1,13 @@ name: 'Changelog Build (Release)' on: + workflow_dispatch: push: tags: - '*' - + jobs: changelog: - if: startsWith(github.ref, 'refs/tags/') runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -39,7 +39,7 @@ jobs: ignorePreReleases: "false" failOnError: "false" fromTag: ${{ steps.previous_tag.outputs.tag_name }} - toTag: ${{ github.ref_name }} + toTag: ${{ github.ref_name || github.sha }} - name: Print Changelog run: | diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 4094d15ee4c2..797b7888b01e 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -19,13 +19,32 @@ on: - 'main' - 'r**' types: [ labeled ] + workflow_dispatch: + inputs: + test_to_run: + required: false + default: all + type: string + description: Comma-separated list of tests to run. Use "all" to run the full test suite. 
concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: + pre-flight: + runs-on: ubuntu-latest + outputs: + test_to_run: ${{ steps.main.outputs.test_to_run }} + steps: + - name: Parse test_to_run + id: main + run: | + parsed_string=$(echo ${{ inputs.test_to_run }} | jq -c --raw-input 'split(",")') + echo "test_to_run=${parsed_string}" >> "$GITHUB_ENV" + gpu-test: + needs: [pre-flight] runs-on: self-hosted-azure if: ${{ github.event.label.name == 'Run CICD' }} steps: @@ -36,6 +55,7 @@ jobs: cicd-cluster-clean: runs-on: self-hosted-azure-builder + needs: [pre-flight] if: ${{ github.event.label.name == 'Run CICD' }} steps: - name: Clean server from old files @@ -93,8 +113,7 @@ jobs: exit 0 ' ### \'\' - - + L0_Unit_Tests_GPU: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -105,14 +124,16 @@ jobs: NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads IS_OPTIONAL: true - L0_Unit_Tests_CPU: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 60 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + # # TODO refactor: Commenting this test out until it is fixed & works properly again (test passes again) + # OPTIONAL_L0_Unit_Tests_CPU: + # needs: [cicd-test-container-setup] + # uses: ./.github/workflows/_test_template.yml + # with: + # RUNNER: self-hosted-azure-cpu + # TIMEOUT: 60 + # SCRIPT: | + # CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat + # IS_OPTIONAL: true L0_Setup_Test_Data_And_Models: needs: [cicd-test-container-setup] @@ -125,6 +146,19 @@ jobs: ## - name: L2: Multimodal Imagen Train # L2: Community LLM Checkpoints tests + L2_Community_LLM_Checkpoints_tests_Bert: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python scripts/checkpoint_converters/convert_bert_hf_to_nemo.py \ + --input_name_or_path /home/TestData/nlp/megatron_ir/sbert/hf_model/bert-base-uncased \ + --output_path /home/TestData/nlp/megatron_ir/sbert/sbert.nemo + AFTER_SCRIPT: | + rm -f /home/TestData/nlp/megatron_ir/sbert/sbert.nemo + rm -rf /home/TestData/nlp/megatron_ir/sbert/model_weights + L2_Community_LLM_Checkpoints_tests_Llama: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -187,6 +221,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | + mkdir /home/TestData/multimodal/video_neva/llama3-ci-hf/${{ github.run_id }} export PYTHONPATH=/home/TestData/multimodal/video_neva/LLaVA:$PYTHONPATH CUDA_VISIBLE_DEVICES=0 python examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py \ --in-file /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/llm \ @@ -194,13 +229,12 @@ jobs: --mm-vision-tower /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/vision_tower \ --tokenizer-model /home/TestData/multimodal/video_neva/vita-tokenizer/ \ --config-file vita_config.yaml \ - --out-file=/home/TestData/multimodal/video_neva/llama3-ci-hf/llama3_ci.nemo \ + --out-file=/home/TestData/multimodal/video_neva/llama3-ci-hf/${{ github.run_id }}/llama3_ci.nemo \ --model-type VITA \ --conv-template llama_3 AFTER_SCRIPT: | - rm -f /home/TestData/multimodal/video_neva/llama3-ci-hf/llama3_ci.nemo - rm -rf 
/home/TestData/multimodal/video_neva/llama3-ci-hf/model_weights - + rm -rf /home/TestData/multimodal/video_neva/llama3-ci-hf/${{ github.run_id }} + # this test is using a 7B model which is too large for GitHub CI # replace the model in this test with a toy model or move the test # to the nightly CI @@ -256,6 +290,7 @@ jobs: quantization.num_calib_size=8 \ inference.batch_size=2 \ export.inference_tensor_parallel=2 \ + export.sample_output=False \ export.save_path=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo AFTER_SCRIPT: | rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo @@ -268,12 +303,13 @@ jobs: TIMEOUT: 15 SCRIPT: | python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int8_sq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo + model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int8_sq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + export.sample_output=False \ + export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo AFTER_SCRIPT: | rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo IS_OPTIONAL: true @@ -348,6 +384,34 @@ jobs: # rm -rf llama2_qat_results + L2_Distill_Llama2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_distillation.py \ + trainer.devices=2 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + trainer.max_steps=5 \ + trainer.log_every_n_steps=5 \ + trainer.val_check_interval=5 \ + trainer.limit_val_batches=2 \ + model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + model.tensor_model_parallel_size=2 \ + model.pipeline_model_parallel_size=1 \ + model.micro_batch_size=1 \ + model.global_batch_size=4 \ + model.optim.name=distributed_fused_adam \ + model.optim.sched.warmup_steps=1 \ + model.data.data_prefix=[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + exp_manager.exp_dir=examples/nlp/megatron_llama_distill + AFTER_SCRIPT: | + rm -rf examples/nlp/megatron_llama_distill + # L2: ASR dev run ASR_dev_run_Speech_to_Text: needs: [cicd-test-container-setup] @@ -517,55 +581,45 @@ jobs: AFTER_SCRIPT: | rm -rf examples/asr/speech_to_text_results - - # L2_Speech_to_Text_AED: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure-gpus-1 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/asr/speech_multitask/speech_to_text_aed.py \ - # model.prompt_format=canary \ - # model.model_defaults.asr_enc_hidden=256 \ - # model.model_defaults.lm_dec_hidden=256 \ - # model.encoder.n_layers=12 \ - # model.transf_encoder.num_layers=0 \ - # 
model.transf_decoder.config_dict.num_layers=12 \ - # model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ - # ++model.train_ds.is_tarred=false \ - # model.train_ds.batch_duration=60 \ - # +model.train_ds.text_field="answer" \ - # +model.train_ds.lang_field="target_lang" \ - # model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - # +model.validation_ds.text_field="answer" \ - # +model.validation_ds.lang_field="target_lang" \ - # model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - # +model.test_ds.text_field="answer" \ - # +model.test_ds.lang_field="target_lang" \ - # model.tokenizer.langs.spl_tokens.dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ - # model.tokenizer.langs.spl_tokens.type="bpe" \ - # model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ - # model.tokenizer.langs.en.type=bpe \ - # ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ - # ++model.tokenizer.langs.es.type=bpe \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.use_distributed_sampler=false \ - # +trainer.fast_dev_run=True \ - # exp_manager.exp_dir=examples/asr/speech_to_text_aed_results - # rm -rf examples/asr/speech_to_text_results - + L2_Speech_to_Text_AED: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_multitask/speech_to_text_aed.py \ + model.prompt_format=canary \ + model.model_defaults.asr_enc_hidden=256 \ + model.model_defaults.lm_dec_hidden=256 \ + model.encoder.n_layers=12 \ + model.transf_encoder.num_layers=0 \ + model.transf_decoder.config_dict.num_layers=12 \ + model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ + model.train_ds.batch_duration=60 \ + model.train_ds.use_bucketing=false \ + model.train_ds.shuffle_buffer_size=100 \ + model.train_ds.num_workers=0 \ + +model.train_ds.text_field="answer" \ + +model.train_ds.lang_field="target_lang" \ + model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ + +model.validation_ds.text_field="answer" \ + +model.validation_ds.lang_field="target_lang" \ + model.validation_ds.num_workers=0 \ + model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ + +model.test_ds.text_field="answer" \ + +model.test_ds.lang_field="target_lang" \ + model.test_ds.num_workers=0 \ + spl_tokens.model_dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ + model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ + model.tokenizer.langs.en.type=bpe \ + ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ + ++model.tokenizer.langs.es.type=bpe \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_aed_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results # L2: Speaker dev run L2_Speaker_dev_run_Speaker_Recognition: @@ -787,6 +841,67 @@ jobs: AFTER_SCRIPT: | rm -rf examples/asr/speech_to_text_adapters_mha_results + # L2: OOMptimizer + L2_Speech_Estimate_Duration_Bins: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + 
set -x + # 1D buckets [SSL, CTC] + python scripts/speech_recognition/estimate_duration_bins.py \ + /home/TestData/an4_dataset/an4_train.json \ + --buckets 5 + # 2D buckets [CTC, RNNT, TDT] / with tokenizer + python scripts/speech_recognition/estimate_duration_bins_2d.py \ + /home/TestData/an4_dataset/an4_train_lang.json \ + --tokenizer /home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ + --buckets 5 \ + --sub-buckets 2 + # TODO(pzelasko): Figure out how to quote the value in the test properly for CI to accept it... + # 2D buckets with prompt [AED/Canary, SpeechLM] / with aggregate tokenizer + prompt format + # python scripts/speech_recognition/estimate_duration_bins_2d.py \ + # /home/TestData/an4_dataset/an4_train_lang.json \ + # --tokenizer /home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32/tokenizer.model \ + # /home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ + # /home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ + # --langs spl_tokens en es \ + # --prompt-format canary \ + # --prompt '[{"role":"user","slots":{"source_lang":"en","target_lang":"en","task":"asr","pnc":"yes"}}]' \ + # --buckets 5 \ + # --sub-buckets 2 + + # L2: OOMptimizer + L2_Speech_Batch_Size_OOMptimizer: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + # 1D bucketing + python scripts/speech_recognition/oomptimizer.py \ + -c /home/TestData/oomptimizer/fast-conformer_ctc_bpe.yaml \ + -m nemo.collections.asr.models.EncDecCTCModelBPE \ + -b "[5.0,10.0]" + # 2D bucketing + python scripts/speech_recognition/oomptimizer.py \ + -c /home/TestData/oomptimizer/fast-conformer_ctc_bpe.yaml \ + -m nemo.collections.asr.models.EncDecCTCModelBPE \ + -b "[[5.0,30],[5.0,45],[10.0,57],[10.0,71]]" + + # L2: OOMptimizer Canary (has a different batch schema) + L2_Speech_Batch_Size_OOMptimizer_Canary: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python scripts/speech_recognition/oomptimizer.py \ + -c /home/TestData/oomptimizer/fast-conformer_aed.yaml \ + -m nemo.collections.asr.models.EncDecMultiTaskModel \ + -b "[[5.0,30],[5.0,45],[10.0,57],[10.0,71]]" + # L2: Speech Transcription L2_Speech_Transcription_Speech_to_Text_Transcribe: needs: [cicd-test-container-setup] @@ -802,6 +917,66 @@ jobs: AFTER_SCRIPT: | rm -rf stt_test_res.json + # L2: Speech Transcription + L2_Speech_Transcription_Canary_Transcribe_Full_Manifest: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10-canary-fields.json \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + matmul_precision=medium + AFTER_SCRIPT: | + rm -rf preds.json transcribe.log + + L2_Speech_Transcription_Canary_Transcribe_With_Prompt: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10.json \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + 
matmul_precision=medium \ + +prompt.source_lang="en" \ + +prompt.target_lang="en" \ + +prompt.task="asr" \ + +prompt.pnc="no" + AFTER_SCRIPT: | + rm -rf preds.json transcribe.log + + L2_Speech_Transcription_Canary_Transcribe_Audio_Dir: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + audio_dir=/home/TestData/asr/canary/dev-other-wav \ + output_filename=preds.json \ + batch_size=10 \ + pretrained_name=nvidia/canary-1b \ + num_workers=0 \ + amp=false \ + compute_dtype=bfloat16 \ + matmul_precision=medium + AFTER_SCRIPT: | + rm -rf preds.json + + # L2: Transducer alignment L2_Transducer_alignment_Running_pytest: needs: [cicd-test-container-setup] @@ -960,7 +1135,6 @@ jobs: data.test_ds.use_cache=false \ data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] @@ -998,11 +1172,11 @@ jobs: rm -rf checkpoints2 # TODO: add when megatron-bert is supported again - # stage('L2: Model Parallel Size 2 Megatron Text Classification') { + # stage("L2: Model Parallel Size 2 Megatron Text Classification") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1027,11 +1201,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Autoresume') { + # stage("L2: Model Parallel Size 2 Megatron Autoresume") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1058,11 +1232,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { + # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1079,11 +1253,11 @@ jobs: # } # } - # stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { + # stage("L2: Model Parallel Size 2 Megatron Train from .nemo") { # when { # anyOf{ - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true @@ -1485,18 +1659,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ @@ -1530,18 +1704,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ 
model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ @@ -1583,24 +1757,24 @@ jobs: model.encoder.hidden_size=64 \ model.encoder.arch=perceiver \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ model.data.data_impl=text_mmap \ model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ + model.data.splits_string="\"800,100,100\"" \ model.data.whole_word_masking=False \ model.tokenizer.library=sentencepiece \ model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ @@ -1628,24 +1802,24 @@ jobs: model.encoder.hidden_size=64 \ model.encoder.arch=perceiver \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ + model.encoder.activation="swiglu" \ model.encoder.masked_softmax_fusion=False \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=2 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ + model.decoder.activation="swiglu" \ model.decoder.masked_softmax_fusion=False \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ model.micro_batch_size=2 \ model.global_batch_size=4 \ model.data.data_impl=text_mmap \ model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ + model.data.splits_string="\"800,100,100\"" \ model.data.whole_word_masking=False \ model.tokenizer.library=sentencepiece \ model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ @@ -1657,16 +1831,16 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/megatron_mim_results - # stage('L2: NMT Bottleneck Fallback') { + # stage("L2: NMT Bottleneck Fallback") { # when 
{ # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('L2: seq2seq (no bottleneck)') { + # stage("L2: seq2seq (no bottleneck)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1703,16 +1877,16 @@ jobs: # } # } # } - # stage('L2: NMT Bottleneck Architecture') { + # stage("L2: NMT Bottleneck Architecture") { # when { # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('Bridge Encoder (identity)') { + # stage("Bridge Encoder (identity)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1747,7 +1921,7 @@ jobs: # exp_manager=null # } # } - # stage('Perceiver Encoder (params)') { + # stage("Perceiver Encoder (params)") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1784,16 +1958,16 @@ jobs: # } # } # } - # stage('L2: NMT Bottleneck LVM') { + # stage("L2: NMT Bottleneck LVM") { # when { # anyOf { - # branch 'main' - # changeRequest target: 'main' + # branch "main" + # changeRequest target: "main" # } # } # failFast true # parallel { - # stage('VAE') { + # stage("VAE") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1828,7 +2002,7 @@ jobs: # exp_manager=null # } # } - # stage('MIM') { + # stage("MIM") { # steps { # cd examples/nlp/machine_translation && \ # enc_dec_nmt-bottleneck.py \ @@ -1868,250 +2042,210 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + 
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings L2_Megatron_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - runs-on: 
self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + 
trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method="block" \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method="block" \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings L2_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] @@ -2124,7 +2258,7 @@ jobs: trainer.devices=2 \ trainer.precision=bf16 \ trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ + model.data.data_prefix=["none"] \ exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ model.mcore_gpt=True \ model.tensor_model_parallel_size=1 \ @@ -2139,7 +2273,7 @@ jobs: model.init_method_std=0.023 \ model.optim.lr=6.0e-4 \ model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ + model.data.splits_string="\"98,2,0\"" \ model.data.dataloader_type=cyclic \ trainer.max_steps=10 @@ -2148,7 +2282,7 @@ jobs: trainer.devices=2 \ trainer.precision=bf16 \ trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ + model.data.data_prefix=["none"] \ exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ model.mcore_gpt=True \ 
model.tensor_model_parallel_size=1 \ @@ -2163,7 +2297,7 @@ jobs: model.init_method_std=0.023 \ model.optim.lr=6.0e-4 \ model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ + model.data.splits_string="\"98,2,0\"" \ model.data.dataloader_type=cyclic \ trainer.max_steps=20 AFTER_SCRIPT: | @@ -2317,16 +2451,16 @@ jobs: # from pandas.testing import assert_frame_equal # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator # import torch - # if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): + # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): # import sys # sys.exit(0) - # event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] + # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] # ea = EventAccumulator(str(event_file)).Reload() # vals = [] - # for i in ea.Scalars('reduced_train_loss'): + # for i in ea.Scalars("reduced_train_loss"): # vals.append(i.value) - # training_curve = pd.DataFrame({'loss': vals}) - # gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') + # training_curve = pd.DataFrame({"loss": vals}) + # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" # rm -rf examples/nlp/language_modeling/retro_results @@ -2335,65 +2469,37 @@ jobs: L2_RAG_Pipeline_Indexing: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/rag/rag_indexing.py \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - trainer.precision='bf16-mixed' \ - indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ - indexing.embedder.embed_batch_size=128 \ - indexing.data.data_path='/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data' \ - indexing.data.chunk_size=256 \ - indexing.data.chunk_overlap=10 \ - indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/rag/rag_indexing.py \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + trainer.precision="bf16-mixed" \ + indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ + indexing.embedder.embed_batch_size=128 \ + indexing.data.data_path="/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data" \ + indexing.data.chunk_size=256 \ + indexing.data.chunk_overlap=10 \ + indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" L2_RAG_Pipeline_Generating: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - 
--env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/rag/rag_generating.py \ - trainer.devices=1 \ - trainer.precision='bf16-mixed' \ - indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ - indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' \ - generating.llm.model_path='/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo' \ - generating.inference.tokens_to_generate=50 \ - generating.inference.greedy=False \ - generating.inference.temperature=1.0 \ - generating.query='Which art schools did I applied to?' - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/rag/rag_generating.py \ + trainer.devices=1 \ + trainer.precision="bf16-mixed" \ + indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ + indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" \ + generating.llm.model_path="/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo" \ + generating.inference.tokens_to_generate=50 \ + generating.inference.greedy=False \ + generating.inference.temperature=1.0 \ + generating.query="Which art schools did I applied to?" L2_BioMegatron_Bert_NER_Task: needs: [cicd-test-container-setup] @@ -2412,7 +2518,7 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-2-h100 timeout-minutes: 10 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} @@ -2424,6 +2530,21 @@ jobs: --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData + env: + # This is to improve p2p overlap on H100 + NVTE_FWD_LAYERNORM_SM_MARGIN: 8 + NVTE_BWD_LAYERNORM_SM_MARGIN: 8 + TORCH_NCCL_AVOID_RECORD_STREAMS: 1 + NCCL_MIN_NCHANNELS: 4 + # TP overlap is not supported in docker environment + #NVTE_UB_SPLIT_RS: 0 + #NVTE_UB_ATOMIC_GEMM_RS: 1 + #NVTE_RS_STRIDED_ATOMIC: 1 + #NVTE_UB_FP8_RS: 1 + # Increase p2p chunksize to 2MB + NCCL_P2P_NET_CHUNKSIZE: 2097152 + # Disable gc when switching to/from validation steps + NEMO_MANUAL_GC_IN_VALIDATION: 0 steps: - name: Checkout repository uses: actions/checkout@v4 @@ -2438,8 +2559,17 @@ jobs: trainer.max_steps=3 \ trainer.gradient_clip_val=1.0 \ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ + model.optim.name=distributed_fused_adam \ model.optim.lr=2e-4 \ model.optim.sched.warmup_steps=1 \ model.optim.sched.constant_steps=1 \ @@ -2473,8 +2603,17 @@ jobs: trainer.gradient_clip_val=1.0 \ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ exp_manager.resume_if_exists=True \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + 
++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ + model.optim.name=distributed_fused_adam \ model.optim.lr=2e-4 \ model.optim.sched.warmup_steps=2 \ model.optim.sched.constant_steps=2 \ @@ -2504,99 +2643,85 @@ jobs: L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + # trainer.devices=2 \ + # trainer.accelerator=gpu \ + # trainer.log_every_n_steps=1 \ + # trainer.val_check_interval=2 \ + # trainer.limit_val_batches=1 \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=6 \ + # trainer.gradient_clip_val=1.0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + # exp_manager.resume_if_exists=True \ + # model.tensor_model_parallel_size=2 \ + # model.optim.name=fused_adam \ + # model.optim.lr=2e-4 \ + # model.optim.sched.warmup_steps=2 \ + # model.optim.sched.constant_steps=2 \ + # model.optim.sched.min_lr=8e-5 \ + # model.max_position_embeddings=128 \ + # model.encoder_seq_length=128 \ + # model.data.seq_length=128 \ + # model.position_embedding_type=rope \ + # model.rotary_percentage=0.5 \ + # model.normalization=rmsnorm \ + # model.bias=False \ + # model.bias_activation_fusion=False \ + # model.bias_dropout_add_fusion=False \ + # 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # model.num_layers=8 \ + # model.hidden_size=256 \ + # model.num_attention_heads=8 \ + # model.activations_checkpoint_method=block \ + # model.activations_checkpoint_granularity=full \ + # model.activations_checkpoint_num_layers=1 \ + # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings # This test requires Ampere but some of the test GPUs are Volta # Need to add a check for compute capability before uncommenting this test @@ -2692,284 +2817,243 @@ jobs: L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=3 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=3 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.reset_lr=True \ - model.tensor_model_parallel_size=2 \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - 
model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=3 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=3 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.reset_lr=True \ + model.tensor_model_parallel_size=2 \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf 
examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - 
#model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # not testing resume functionality to save time on ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=alibi \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + 
#model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - 
#model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=kerple \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + 
#model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: - RUNNER: self-hosted-azure + RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ + trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ trainer.val_check_interval=2 \ trainer.limit_val_batches=2 \ @@ -2978,6 +3062,15 @@ jobs: trainer.precision=bf16 \ trainer.gradient_clip_val=1.0 \ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ model.mcore_gpt=True \ @@ -3002,12 +3095,15 @@ jobs: model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ + trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ trainer.val_check_interval=2 \ trainer.limit_val_batches=2 \ @@ -3019,6 +3115,15 @@ jobs: model.megatron_amp_O2=True \ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ exp_manager.resume_if_exists=True \ + ++model.transformer_engine=True \ + ++model.fp8=True \ + ++model.fp8_hybrid=True \ + ++model.fp8_amax_history_len=1024 \ + ++model.fp8_amax_compute_algo=max \ + ++model.reduce_amax=True \ + ++model.use_te_rng_tracker=True \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.ub_tp_comm_overlap=False \ model.pipeline_model_parallel_size=2 \ model.tensor_model_parallel_size=1 \ model.optim.name=distributed_fused_adam \ @@ -3041,7 +3146,9 @@ jobs: model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings AFTER_SCRIPT: | @@ -3118,7 +3225,39 @@ jobs: L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + trainer.max_steps=4 \ + trainer.val_check_interval=4 \ + trainer.enable_checkpointing=False \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + exp_manager.checkpoint_callback_params.save_best_model=False \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.peft.peft_scheme=none \ + model.optim.name=distributed_fused_adam \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.num_workers=0 \ + model.data.train_ds.concat_sampling_probabilities=[1.0] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_sft_results + + L2_Megatron_GPT_Reranker: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure timeout-minutes: 10 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} @@ -3134,34 +3273,29 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + rm -rf /home/TestData/nlp/megatron_ir/working_dir + + python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ + exp_manager.exp_dir="/home/TestData/nlp/megatron_ir/working_dir" \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ trainer.devices=1 \ trainer.num_nodes=1 \ - trainer.precision=bf16 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.peft.peft_scheme=none \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - - rm -rf examples/nlp/language_modeling/gpt_sft_results + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ + model.peft.lora_tuning.adapter_dim=8 \ + 
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + rm -rf /home/TestData/nlp/megatron_ir/working_dir - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -3171,7 +3305,7 @@ jobs: rm -rf /home/TestData/nlp/megatron_ir/working_dir python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ - exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + exp_manager.exp_dir="/home/TestData/nlp/megatron_ir/working_dir" \ model.global_batch_size=4 \ model.micro_batch_size=4 \ trainer.devices=1 \ @@ -3179,25 +3313,25 @@ jobs: trainer.max_epochs=null \ trainer.max_steps=20 \ trainer.val_check_interval=10 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ model.peft.lora_tuning.adapter_dim=8 \ model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ trainer.devices=1 \ trainer.num_nodes=1 \ - model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ - model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ + model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ + model.peft.restore_from_path="/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo" \ model.global_batch_size=4 \ model.micro_batch_size=4 \ model.peft.lora_tuning.adapter_dim=8 \ model.data.test_ds.write_embeddings_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/test_embs" \ model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] AFTER_SCRIPT: | @@ -3243,15 +3377,15 @@ jobs: trainer.devices=2 \ model.megatron_amp_O2=True \ model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ + model.data.test_ds.names=["quarel4"] \ model.global_batch_size=2 \ model.micro_batch_size=1 \ model.data.test_ds.tokens_to_generate=10 \ model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/lora_tuning_pp2/out" \ inference.greedy=True \ 
inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl' + inference.outfile_path="/home/TestData/nlp/lora_tuning_pp2/out.jsonl" AFTER_SCRIPT: | rm -rf /home/TestData/nlp/lora_tuning_pp2 @@ -3275,7 +3409,7 @@ jobs: model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.peft_scheme='lora' \ + model.peft.peft_scheme="lora" \ model.answer_only_loss=True \ model.micro_batch_size=1 \ model.global_batch_size=1 \ @@ -3292,15 +3426,15 @@ jobs: model.tensor_model_parallel_size=2 \ trainer.devices=2 \ model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ + model.data.test_ds.names=["quarel4"] \ model.global_batch_size=2 \ model.micro_batch_size=1 \ model.data.test_ds.tokens_to_generate=10 \ model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ + model.data.test_ds.output_file_path_prefix="/home/TestData/nlp/lora_tuning_tp2/out" \ inference.greedy=True \ inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' + inference.outfile_path="/home/TestData/nlp/lora_tuning_tp2/out.jsonl" AFTER_SCRIPT: | rm -rf /home/TestData/nlp/lora_tuning_tp2 @@ -3340,12 +3474,12 @@ jobs: +model.tp_comm_overlap_ag=False \ +model.tp_comm_overlap_rs=False \ +model.tp_comm_overlap_disable_qkv=True \ - model.peft.peft_scheme='lora' \ + model.peft.peft_scheme="lora" \ model.peft.lora_tuning.adapter_dim=16 \ model.peft.lora_tuning.alpha=32 \ model.peft.lora_tuning.column_init_method="kaiming" \ - +model.peft.lora_tuning.dropout_position='pre' \ - model.peft.lora_tuning.target_modules=['attention'] \ + +model.peft.lora_tuning.dropout_position="pre" \ + model.peft.lora_tuning.target_modules=["attention"] \ model.peft.lora_tuning.adapter_dropout=0.1 \ +model.peft.lora_tuning.a2a_experimental=1 \ model.answer_only_loss=True \ @@ -3368,7 +3502,7 @@ jobs: SCRIPT: | python examples/nlp/language_modeling/megatron_gpt_eval.py \ gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? A:'] \ + prompts=["How to fix GPU memory? 
A:"] \ tensor_model_parallel_size=1 \ inference.tokens_to_generate=32 \ trainer.precision=32 @@ -3595,8 +3729,8 @@ jobs: model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ + model.encoder.transformer_block_type="pre_ln" \ + model.decoder.transformer_block_type="pre_ln" \ model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ model.data.data_impl=text_mmap \ @@ -3628,8 +3762,8 @@ jobs: model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ + model.encoder.transformer_block_type="pre_ln" \ + model.decoder.transformer_block_type="pre_ln" \ model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ model.data.data_impl=text_mmap \ @@ -4024,7 +4158,7 @@ jobs: SCRIPT: | python examples/nlp/language_modeling/megatron_t5_eval.py \ --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' \ + --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ --tensor_model_parallel_size 1 L2_Megatron_Core_T5_Eval: @@ -4035,7 +4169,7 @@ jobs: SCRIPT: | NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \ --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' \ + --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ --tensor_model_parallel_size 1 L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: @@ -4060,18 +4194,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ + model.encoder.activation="reglu" \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ + model.decoder.activation="reglu" \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" python examples/nlp/language_modeling/megatron_bart_pretraining.py \ trainer.devices=2 \ @@ -4090,18 +4224,18 @@ jobs: model.encoder.num_layers=4 \ model.encoder.hidden_size=64 \ model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ + model.encoder.activation="reglu" \ model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_method="block" \ model.encoder.activations_checkpoint_num_layers=1 \ model.decoder.num_layers=4 \ model.decoder.hidden_size=64 \ model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ + model.decoder.activation="reglu" \ model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_method="block" \ model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/bart_pretrain_results @@ -4522,7 +4656,7 @@ jobs: AFTER_SCRIPT: | rm -f examples/asr/evaluation_transcripts.json - L2_Stable_Diffusion_Training: + OPTIONAL_L2_Stable_Diffusion_Training: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml with: @@ -4570,55 +4704,47 @@ jobs: model.data.synthetic_data=True AFTER_SCRIPT: | rm -rf examples/multimodal/text_to_image/sd_train_results + IS_OPTIONAL: true L2_NeMo_2_GPT_Pretraining_no_transformer_engine: needs: [cicd-test-container-setup] - runs-on: 
self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - pip uninstall -y apex ## TODO: remove when apex is no longer a dependency - pip uninstall -y transformer_engine - - python examples/llm/megatron_gpt_pretraining.py \ - --devices=2 \ - --max-steps=3 \ - --experiment-dir=examples/llm/gpt_pretrain_results \ - --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --index-mapping-dir=examples/llm/gpt_index_mappings - - python examples/llm/megatron_gpt_pretraining.py \ - --devices=2 \ - --max-steps=6 \ - --experiment-dir=examples/llm/gpt_pretrain_results \ - --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --index-mapping-dir=examples/llm/gpt_index_mappings - - rm -rf examples/llm/gpt_pretrain_results - rm -rf examples/llm/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + pip uninstall -y apex ## TODO: remove when apex is no longer a dependency + pip uninstall -y transformer_engine + + python examples/llm/megatron_gpt_pretraining.py \ + --devices=2 \ + --max-steps=3 \ + --experiment-dir=examples/llm/gpt_pretrain_results \ + --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ + --index-mapping-dir=examples/llm/gpt_index_mappings \ + --no-masked-softmax-fusion + + python examples/llm/megatron_gpt_pretraining.py \ + --devices=2 \ + --max-steps=6 \ + --experiment-dir=examples/llm/gpt_pretrain_results \ + --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ + --index-mapping-dir=examples/llm/gpt_index_mappings \ + --no-masked-softmax-fusion + AFTER_SCRIPT: | + rm -rf examples/llm/gpt_pretrain_results + rm -rf examples/llm/gpt_index_mappings Nemo_CICD_Test: needs: - gpu-test - cicd-test-container-setup - L0_Unit_Tests_GPU - - L0_Unit_Tests_CPU + #- OPTIONAL_L0_Unit_Tests_CPU + - L2_Community_LLM_Checkpoints_tests_Bert - L2_Community_LLM_Checkpoints_tests_Llama - L2_Community_LLM_Checkpoints_tests_StarCoder - L2_Community_LLM_Checkpoints_tests_Falcon @@ -4715,7 +4841,7 @@ jobs: - L2_TTS_Fast_dev_runs_1_Mixer-TTS - L2_TTS_Fast_dev_runs_1_Hifigan - Speech_Checkpoints_tests - - L2_Stable_Diffusion_Training + #- OPTIONAL_L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine if: always() runs-on: ubuntu-latest diff --git a/.github/workflows/import-test.yml b/.github/workflows/import-test.yml index 6f2f52bfb0ae..3af15294b2a2 100644 --- a/.github/workflows/import-test.yml +++ b/.github/workflows/import-test.yml @@ -12,7 +12,7 @@ jobs: 
test-asr-imports: runs-on: ubuntu-latest container: - image: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime steps: - name: Checkout repo uses: actions/checkout@v2 @@ -43,7 +43,7 @@ jobs: test-tts-imports: runs-on: ubuntu-latest container: - image: pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime steps: - name: Checkout repo uses: actions/checkout@v2 @@ -70,4 +70,4 @@ jobs: # Run import checks python tests/core_ptl/check_imports.py --domain "tts" # Uninstall NeMo - pip uninstall -y nemo_toolkit \ No newline at end of file + pip uninstall -y nemo_toolkit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3f4c4f3c19de..af09fa241c59 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,112 +1,60 @@ name: "NeMo Code release" on: - issue_comment: - types: [created] - + workflow_dispatch: + inputs: + branch: + description: Branch to release + required: true + type: string jobs: main: - if: > - github.event_name == 'issue_comment' && - github.event.issue.pull_request && - startsWith(github.event.comment.body, '/release-please') && - contains(fromJSON('["ko3n1g"]'), github.actor) + if: contains(fromJSON('["ko3n1g"]'), github.actor) runs-on: ubuntu-latest environment: name: main steps: - - name: Update PR issue comment - shell: bash - env: - message: ${{ github.event.comment.body }} - run: | - message="$message - - --- - - Releasebot 🤖: Release processes started... - " - message="${message//$'\n'/
}" - - curl -L \ - -X PATCH \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }} \ - -d '{"body":"'"$message"'"}' - - - name: Get PR number - shell: bash - id: get-pr-num - run: | - PR_URL="${{ github.event.issue.pull_request.url }}" - PR_NUM=${PR_URL##*/} - echo "pr_number=$PR_NUM" >> $GITHUB_OUTPUT - - - name: Get Pull Request Information - uses: actions/github-script@v6 - id: get-pr-branch - with: - result-encoding: string - script: | - const pr = await github.rest.pulls.get({ - owner: context.repo.owner, - repo: context.repo.repo, - pull_number: ${{ steps.get-pr-num.outputs.pr_number }} - }); - console.log('Pull Request Information:', pr.data); - return pr.data.head.ref; - name: Checkout repository uses: actions/checkout@v4 with: path: ${{ github.run_id }} - ref: ${{ steps.get-pr-branch.outputs.result }} + ref: ${{ inputs.branch }} - - name: Get version number + - name: Create release id: version-number run: | cd ${{ github.run_id }} VERSION=$(python -c "import nemo; print(nemo.__version__)") - echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT" - - - name: Extract changelog - id: extract-changelog - uses: peter-evans/find-comment@v3 - with: - issue-number: ${{ steps.get-pr-num.outputs.pr_number }} - body-includes: '# Detailed Changelogs' - - - name: Extract summary - id: extract-summary - uses: peter-evans/find-comment@v3 - with: - issue-number: ${{ steps.get-pr-num.outputs.pr_number }} - body-includes: '# Highlights' - - - name: Create Release doc - id: create-release-doc - env: - SUMMARY: ${{ steps.extract-summary.outputs.comment-body }} - CHANGELOG: ${{ steps.extract-changelog.outputs.comment-body }} - run: | - echo "TITLE<> $GITHUB_ENV - echo "NVIDIA Neural Modules ${{ steps.version-number.outputs.VERSION }}" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV + NAME="NVIDIA Neural Modules ${VERSION}" + CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md) + CHANGELOG=$(echo "$CHANGELOG" | sed '/./,$!d' | sed ':a;N;$!ba;s/\n$//') + + PAYLOAD=$(jq \ + -n \ + -c \ + --arg CI_COMMIT_BRANCH "${{ inputs.branch }}" \ + --arg NAME "$NAME" \ + --arg BODY "$CHANGELOG" \ + '{ + "tag_name": $CI_COMMIT_BRANCH, + "target_commitish": $CI_COMMIT_BRANCH, + "name": $NAME, + "body": $BODY, + "draft": false, + "prerelease": false, + "generate_release_notes": false + }' + ) - echo "BODY<> $GITHUB_ENV - echo "$SUMMARY" >> $GITHUB_ENV - echo "$CHANGELOG" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - - name: Create Release - uses: softprops/action-gh-release@v2 - with: - name: ${{ env.TITLE }} - tag_name: ${{ steps.version-number.outputs.VERSION }} - body: ${{ env.BODY }} + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.PAT }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/NeMo/releases \ + -d "$PAYLOAD" - name: Build, test, and release wheel env: @@ -114,6 +62,8 @@ jobs: TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} run: | cd ${{ github.run_id }} + EXPECTED_VERSION=$(python -c 'import nemo; print(nemo.__version__)') + python3 -m pip install --upgrade build python3 -m build @@ -122,7 +72,6 @@ jobs: cd ../ INSTALLED_VERSION=$(python -c 'import nemo; print(nemo.__version__)') - EXPECTED_VERSION=${{ steps.version-number.outputs.VERSION }} if [[ "$INSTALLED_VERSION" != "$EXPECTED_VERSION" ]]; then echo 
'Wheel has an outdated version, mission abort immediately!' @@ -134,34 +83,6 @@ jobs: python3 -m pip install --upgrade twine python3 -m twine upload --repository pypi dist/* - - name: Update PR issue comment - shell: bash - env: - message: ${{ github.event.comment.body }} - run: | - message="$message - - --- - - Releasebot 🤖: Release done 🎉 - " - message="${message//$'\n'/
}" - - curl -L \ - -X PATCH \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }} \ - -d '{"body":"'"$message"'"}' - - - name: Close Pull - run: | - cd ${{ github.run_id }} - gh pr close --comment "Releasebot 🤖: Closing PR" "${{ steps.get-pr-num.outputs.pr_number }}" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: notify run: | MESSAGE='{ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000000..7fd5cd00b352 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,921 @@ +# Changelog + +## NVIDIA Neural Modules 2.0.0rc1 + +### Highlights + +#### Large language models + +- PEFT: QLoRA support, LoRA/QLora for Mixture-of-Experts (MoE) dense layer +- State Space Models & Hybrid Architecture support (Mamba2 and NV-Mamba2-hybrid) +- Support Nemotron, Minitron, Gemma2, Qwen, RAG +- Custom Tokenizer training in NeMo +- Update the Auto-Configurator for EP, CP and FSDP + +#### Multimodal + +- NeVA: Add SOTA LLM backbone support (Mixtral/LLaMA3) and suite of model parallelism support (PP/EP) +- Support Language Instructed Temporal-Localization Assistant (LITA) on top of video NeVA + +#### ASR + +- SpeechLM and SALM +- Adapters for Canary Customization +- Pytorch allocator in PyTorch 2.2 improves training speed up to 30% for all ASR models +- Cuda Graphs for Transducer Inference +- Replaced webdataset with Lhotse - gives up to 2x speedup +- Transcription Improvements - Speedup and QoL Changes +- ASR Prompt Formatter for multimodal Canary + +#### Export & Deploy + +- In framework PyTriton deployment with backends: - PyTorch - vLLM - TRT-LLM update to 0.10 +- TRT-LLM C++ runtime + +### Detailed Changelogs + +#### ASR + +
Changelog + +- Support dataloader as input to `audio` for transcription by @titu1994 :: PR: #9201 +- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205 +- Fix Online_Offline_Microphone_VAD_Demo.ipynb by @stevehuang52 :: PR: #9251 +- Remove .nemo instead of renaming by @mikolajblaz :: PR: #9281 +- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. by @galv :: PR: #9347 +- Revert "Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer." by @titu1994 :: PR: #9351 +- Prompt formatter API and canary transcribe tensor input support by @pzelasko :: PR: #9206 +- Fix prompt formatter's defaults=None case in multi-task model by @pzelasko :: PR: #9366 +- move AED chunked infer script by @stevehuang52 :: PR: #9367 +- Use model-cast-to-bfloat16 rather than AMP-to-bfloat16 for inference. by @galv :: PR: #9198 +- ci: Fix `L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_C… by @ko3n1g :: PR: #9399 +- Fix logging message for ASR by @titu1994 :: PR: #9469 +- Add support to change Multi task model prompt by @titu1994 :: PR: #9542 +- Enable encoder adapters for Canary and MultiTaskAED models by @titu1994 :: PR: #9409 +- Audio model collection by @anteju :: PR: #9263 +- TitaNet Batch Verify Speaker by @monica-sekoyan :: PR: #9337 +- Fix the arguments of forward_for_export function in msdd_models by @tango4j :: PR: #9624 +- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697 +- refactor: notebook branch release by @ko3n1g :: PR: #9711 +- Canary Adapters tutorial (#9670) by @nithinraok :: PR: #9777 +- typos and branch name update to r2.0.0rc1 by @nithinraok :: PR: #9846 +- Fix RNNT alignments test by @artbataev :: PR: #9770 +- By default trust remote code from HF Datasets by @nithinraok :: PR: #9886 +- Temporarily disable cuda graph based RNN-T greedy inference for r2.0.0rc1 by @galv :: PR: #9904 +- Enable CUDA graphs by default, but require CUDA 12.6 for full graphs by @artbataev :: PR: #9919 +- update branch name for script by @nithinraok :: PR: #9936 +- updte branch by @nithinraok :: PR: #9942 +
+ +#### TTS + +
Changelog + +- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205 +- Add mel codec checkpoints by @anteju :: PR: #9228 +- GPU unit tests: Mark flaky tests to be fixed by @pablo-garay :: PR: #9559 +- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697 +- refactor: notebook branch release by @ko3n1g :: PR: #9711 + +
+ +#### LLM/Multimodal + +
Changelog + +- Update nemo.export module for quantized models by @janekl :: PR: #9218 +- Add save option to the TRT-LLM export test script by @oyilmaz-nvidia :: PR: #9221 +- Checkpoint resuming compatible for 2403 container by @suiyoubi :: PR: #9199 +- Clean up dev docs collection section by @yaoyu-33 :: PR: #9205 +- use get with fallback when reading checkpoint_callback_params by @akoumpa :: PR: #9223 +- Revert rope fusion defaults by @cuichenx :: PR: #9237 +- fix import by @akoumpa :: PR: #9240 +- Add TRT-LLM params like max_num_tokens and opt_num_tokens by @oyilmaz-nvidia :: PR: #9210 +- sum-reduce grad_norm in DP+CP domain by @erhoo82 :: PR: #9262 +- Alit/bert convert fix by @JRD971000 :: PR: #9285 +- conv1d stable version by @JRD971000 :: PR: #9330 +- Fix trainer builder when exp_manager is not in config by @yaoyu-33 :: PR: #9293 +- Fix Peft Weights Loading in NeVA by @yaoyu-33 :: PR: #9341 +- Skip sequence_parallel allreduce when using Mcore DistOpt by @akoumpa :: PR: #9344 +- Fix FSDP gradient calculation with orig params by @janEbert :: PR: #9335 +- TRT-LLM Export Code Cleanup by @oyilmaz-nvidia :: PR: #9270 +- support null/None truncation field by @arendu :: PR: #9355 +- NeVa token fusion by @paul-gibbons :: PR: #9245 +- bugfix if using mcore distOpt with sft by @akoumpa :: PR: #9356 +- Re-org export code by @oyilmaz-nvidia :: PR: #9353 +- QLoRA by @cuichenx :: PR: #9340 +- PeFT fix for distOpt by @akoumpa :: PR: #9392 +- [NeMo-UX] Integrating mcore's DistributedDataParallel into MegatronStrategy by @marcromeyn :: PR: #9387 +- cherry pick of #9266 by @dimapihtar :: PR: #9411 +- Enable specifying alpha for PTQ INT8 SmoothQuant method by @janekl :: PR: #9423 +- add support for new mcore ds features by @dimapihtar :: PR: #9388 +- LoRA for MoE Layer by @cuichenx :: PR: #9396 +- Mistral-7B: apply user's precision to output checkpoint by @akoumpa :: PR: #9222 +- Add option to merge distributed optimizer buckets by @timmoon10 :: PR: #9414 +- TRT-LLM 0.10 Update by @oyilmaz-nvidia :: PR: #9402 +- In-framework deployment by @oyilmaz-nvidia :: PR: #9438 +- Bugfix missing variables and argument changes to MegatronPretrainingRandomSampler by @jstjohn :: PR: #9458 +- Hyena Operator by @guyjacob :: PR: #9264 +- Refactor Quantizer for reusing in QAT by @kevalmorabia97 :: PR: #9276 +- move load state dict after initialize parallel state in nlp_model by @ryxli :: PR: #9382 +- Enable user to optionally upgrade Megatron by @jstjohn :: PR: #9478 +- Fix unwrap model by @cuichenx :: PR: #9480 +- fix operator precedence by @akoumpa :: PR: #9403 +- [NeMo-UX] Adding context- & expert-parallelism to MegatronStrategy by @marcromeyn :: PR: #9525 +- update mcoreddp call by @akoumpa :: PR: #9345 +- mcore distOpt restore fix by @akoumpa :: PR: #9421 +- vLLM Export Support by @apanteleev :: PR: #9381 +- PL: Delete precision if using plugin. 
TODO switch to MegatronTrainerB… by @akoumpa :: PR: #9535 +- extend get_gpt_layer_modelopt_spec to support MoE by @akoumpa :: PR: #9532 +- fix mock data generation for legacy dataset by @dimapihtar :: PR: #9530 +- add reset learning rate functionality by @dimapihtar :: PR: #9372 +- Use closed-formula to round by multiple by @akoumpa :: PR: #9307 +- GPU unit tests: Mark flaky tests to be fixed by @pablo-garay :: PR: #9559 +- Consolidate gpt continue training script into pretraining script by @yaoyu-33 :: PR: #9413 +- Enable encoder adapters for Canary and MultiTaskAED models by @titu1994 :: PR: #9409 +- PTQ refinements by @janekl :: PR: #9574 +- Add ModelOpt QAT example for Llama2 SFT model by @kevalmorabia97 :: PR: #9326 +- Multimodal projection layer adapter fix for PP>1 by @paul-gibbons :: PR: #9445 +- Add offline quantization script for QLoRA deployment by @cuichenx :: PR: #9455 +- Make QLoRA more model-agnostic by @cuichenx :: PR: #9488 +- Set n_gpu to None in nemo export by @oyilmaz-nvidia :: PR: #9593 +- [NeMo-UX] Fix Megatron-optimizer by @marcromeyn :: PR: #9599 +- Chat template support for megatron_gpt_eval.py by @akoumpa :: PR: #9354 +- [NeMo-UX] Add PEFT by @cuichenx :: PR: #9490 +- Alit/mamba tmp by @JRD971000 :: PR: #9612 +- Enable MCore checkpointing optimizations by @mikolajblaz :: PR: #9505 +- Change mixtral moe key name for trt-llm by @oyilmaz-nvidia :: PR: #9620 +- fix ckpt load bug by @dimapihtar :: PR: #9621 +- Alit/mamba by @JRD971000 :: PR: #9575 +- Unwrap ckpt_io for model opt (async save) by @mikolajblaz :: PR: #9622 +- MCore T5 support for NeMo - Training by @huvunvidia :: PR: #9432 +- [Nemo-UX] Expose transformer_layer_spec inside GPTConfig by @marcromeyn :: PR: #9592 +- Update NeMo Clip to Use MCore Modules by @yaoyu-33 :: PR: #9594 +- Mistral + Mixtral Support for NeVa by @paul-gibbons :: PR: #9459 +- Adding support for mcore generate by @shanmugamr1992 :: PR: #9566 +- Improve error messaging during trt-llm export by @oyilmaz-nvidia :: PR: #9638 +- [Cherrypick] support lora when kv_channel != hidden_size / num_heads by @cuichenx :: PR: #9644 +- Parametrize FPS group by @mikolajblaz :: PR: #9648 +- Cherry-pick megatron export fix from main by @borisfom :: PR: #9643 +- add documentation for reset_lr feature by @dimapihta +- chore: Pin branch in notebooks by @ko3n1g :: PR: #9697 +- Cherry pick: LITA Integration by @Slyne :: PR: #9684 +- SDXL improvements (and support for Draft+) by @rohitrango :: PR: #9654 +- Gemma 2 by @cuichenx :: PR: #9672 +- Allows non-strict load with distributed checkpoints by @mikolajblaz :: PR: #9613 +- refactor: notebook branch release by @ko3n1g :: PR: #9711 +- [NeMo-UX] Make TE and Apex dependencies optional by @ashors1 :: PR: #9550 +- Alit/r2.0.0 by @JRD971000 :: PR: #9718 +- Manually cherry-pick from PR 9679 (PR to main - Support SFT/Eval/PEFT for mcore T5) by @huvunvidia :: PR: #9737 +- In framework export by @oyilmaz-nvidia :: PR: #9658 +- T5 changes based on mcore changes by @pablo-garay :: PR: #9829 +- [NeMo-UX] Use single instance of loss reductions in GPTModel by @hemildesai :: PR: #9801 +- deprecate NeMo NLP tutorial by @dimapihtar :: PR: #9864 +- Disable nvFuser setup with PyTorch 23.11 and later by @athitten :: PR: #9837 +- make torch_dist ckpt strategy as default by @dimapihtar :: PR: #9852 +- add rampup bs documentation by @dimapihtar :: PR: #9884 +- copy of #9576 by @dimapihtar :: PR: #9986 +- Support Nvidia Torch and Arch versions by @thomasdhc :: PR: #9897 +- Bug fix for pooler causing dist checkpointing exception by 
@shanmugamr1992 :: PR: #10008 + +
+ +#### Export + +
Changelog + +- Update nemo.export module for quantized models by @janekl :: PR: #9218 +- Add save option to the TRT-LLM export test script by @oyilmaz-nvidia :: PR: #9221 +- Add TRT-LLM params like max_num_tokens and opt_num_tokens by @oyilmaz-nvidia :: PR: #9210 +- TRT-LLM Export Code Cleanup by @oyilmaz-nvidia :: PR: #9270 +- Re-org export code by @oyilmaz-nvidia :: PR: #9353 +- Use TensorRT-LLM native parameter names in nemo.export module by @janekl :: PR: #9424 +- TRT-LLM 0.10 Update by @oyilmaz-nvidia :: PR: #9402 +- vLLM Export Support by @apanteleev :: PR: #9381 +- Add page context fmha option in TensorRTLLM export by @meatybobby :: PR: #9526 +- Test C++ runtime on demand in nemo_export.py to avoid possible OOMs by @janekl :: PR: #9544 +- Fix nemo export test by @oyilmaz-nvidia :: PR: #9547 +- Add tps and pps params to the export script by @oyilmaz-nvidia :: PR: #9558 +- Add Multimodal Exporter by @meatybobby :: PR: #9256 +- Set n_gpu to None in nemo export by @oyilmaz-nvidia :: PR: #9593 +- Inflight nemo model export support by @JimmyZhang12 :: PR: #9527 +- vLLM Export Improvements by @apanteleev :: PR: #9596 +- Akoumparouli/nemo ux mixtral export by @akoumpa :: PR: #9603 +- Change mixtral moe key name for trt-llm by @oyilmaz-nvidia :: PR: #9620 +- Fix the arguments of forward_for_export function in msdd_models by @tango4j :: PR: #9624 +- Improve error messaging during trt-llm export by @oyilmaz-nvidia :: PR: #9638 +- Cherry-pick megatron export fix from main by @borisfom :: PR: #9643 +- In framework export by @oyilmaz-nvidia :: PR: #9658 +- Add missing imports for torch dist ckpt in export by @oyilmaz-nvidia :: PR: #9826 + +
+ +#### Bugfixes + +
Changelog + +- use get with fallback when reading checkpoint_callback_params by @akoumpa :: PR: #9223 +- fix import by @akoumpa :: PR: #9240 +- Remove .nemo instead of renaming by @mikolajblaz :: PR: #9281 +- call set_expert_model_parallel_world_size instead of set_cpu_expert_m… by @akoumpa :: PR: #9275 +- Fix typos in Mixtral NeMo->HF and Starcoder2 NeMo->HF conversion scripts by @evellasques :: PR: #9325 +- Skip sequence_parallel allreduce when using Mcore DistOpt by @akoumpa :: PR: #9344 +- Add OpenAI format response to r2.0.0rc1 by @athitten :: PR: #9796 +- [NeMo UX] Support generating datasets using different train/valid/test distributions by @ashors1 :: PR: #9771 +- Add missing imports for torch dist ckpt in export by @oyilmaz-nvidia :: PR: #9826 + +
+ +#### General Improvements + +
Changelog + +- [Nemo CICD] run_cicd_for_release_branches_also by @pablo-garay :: PR: #9213 +- rename paths2audiofiles to audio by @github-actions[bot] :: PR: #9220 +- Fix ASR_Context_Biasing.ipynb contains FileNotFoundError by @github-actions[bot] :: PR: #9234 +- ci: Remove duplicated job by @ko3n1g :: PR: #9258 +- Fix document links by @yaoyu-33 :: PR: #9260 +- Pin transformers by @github-actions[bot] :: PR: #9273 +- Fix loading github raw images on notebook by @github-actions[bot] :: PR: #9283 +- Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward by @github-actions[bot] :: PR: #9278 +- Refactor Sequence Packing Script by @cuichenx :: PR: #9271 +- [Nemo-UX] Move code to collections + fix some small bugs by @marcromeyn :: PR: #9277 +- Fix typo in HF tutorial by @github-actions[bot] :: PR: #9304 +- Expand documentation for data parallelism and distributed optimizer by @timmoon10 :: PR: #9227 +- Install alerting by @ko3n1g :: PR: #9311 +- typos by @github-actions[bot] :: PR: #9315 +- FP8 feature documentation by @ksivaman :: PR: #9265 +- [Nemo CICD] Comment out flaky tests by @pablo-garay :: PR: #9333 +- Fixed typos in README.rst by @gdevakumar :: PR: #9322 +- Update README.rst to clarify installation via Conda by @SimonCW :: PR: #9323 +- [Nemo CICD] update flaky test by @pablo-garay :: PR: #9339 +- fix lora and ptuning and isort/black by @github-actions[bot] :: PR: #9295 +- Fix P-tuning for Llama based models by @github-actions[bot] :: PR: #9300 +- add large model stable training fix and contrastive loss update for variable seq by @github-actions[bot] :: PR: #9348 +- Guard cuda memory allocator update by @github-actions[bot] :: PR: #9313 +- [Nemo CICD] Remove unnecessary commented out code by @pablo-garay :: PR: #9364 +- Update Gemma conversion script by @yaoyu-33 :: PR: #9365 +- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) by @github-actions[bot] :: PR: #9371 +- Re-enable cuda graphs in training modes. by @github-actions[bot] :: PR: #9343 +- fix typo infer_seq_lenght -> infer_seq_length by @akoumpa :: PR: #9370 +- Make a backward compatibility for old MSDD configs in label models by @github-actions[bot] :: PR: #9378 +- Dgalvez/fix greedy batch strategy name r2.0.0rc0 by @github-actions[bot] :: PR: #9253 +- Update README.rst by @jgerh :: PR: #9393 +- Force diarizer to use CUDA if cuda is available and if device=None. 
by @github-actions[bot] :: PR: #9390 +- ci: Properly catch failed tests by introduction of workflow templates by @ko3n1g :: PR: #9324 +- Fix T5 G2P Input and Output Types by @github-actions[bot] :: PR: #9269 +- Huvu/rag pipeline citest by @huvunvidia :: PR: #9384 +- Fix circular import for MM dataprep notebook by @github-actions[bot] :: PR: #9292 +- add check if num layers is divisible by pp size by @github-actions[bot] :: PR: #9298 +- [Nemo CICD] timeouts fix by @pablo-garay :: PR: #9407 +- [NeMo-UX] Removing un-used ModelConfig class by @marcromeyn :: PR: #9389 +- Add tutorial for Llama-3-8B lora training and deployment by @shashank3959 :: PR: #9359 +- [NeMo-UX] Removing default_path from ModelConnector by @marcromeyn :: PR: #9401 +- Fix README by @ericharper :: PR: #9415 +- [SD] Fix SD CUDA Graph Failure by @alpha0422 :: PR: #9319 +- [NeMo-UX] Adding file-lock to Connector by @marcromeyn :: PR: #9400 +- Add Dev Container Bug Report by @pablo-garay :: PR: #9430 +- Akoumparouli/profiling docs by @akoumpa :: PR: #9420 +- ci: Enrich notifications by @ko3n1g :: PR: #9412 +- Fix failing RIR unit test with lhotse 1.24+ by @pzelasko :: PR: #9444 +- [NeMo-UX] Adding support for mcore distributed optimizer by @marcromeyn :: PR: #9435 +- Use ModelOpt build_tensorrt_llm for building engines for qnemo checkpoints by @janekl :: PR: #9452 +- ci(notifications): Fix extraction of last 2K chars by @ko3n1g :: PR: #9450 +- Update readme with mlperf news by @ericharper :: PR: #9457 +- [NeMo-UX] Add nsys callback by @ashors1 :: PR: #9461 +- [NeMo UX] Introducing optimizer module by @marcromeyn :: PR: #9454 +- Fix minor import bug in deploy module by @oyilmaz-nvidia :: PR: #9463 +- ci(notifications): Fetch all jobs by @ko3n1g :: PR: #9465 +- Update build_dataset.py by @stevehuang52 :: PR: #9467 +- bionemo: bn2/add pipelineparallel dtype by @skothenhill-nv :: PR: #9475 +- [NeMo-UX] Integrate experiment manager features with NeMo-UX APIs by @ashors1 :: PR: #9460 +- Add python_requires by @galv :: PR: #9431 +- [NeMo-UX] Fixing imports of NeMoLogging, AutoResume & ModelCheckpoint by @marcromeyn :: PR: #9476 +- Modelopt Refactor for SDXL Quantization by @suiyoubi :: PR: #9279 +- [NeMo-UX] Fixing defaults in llm.train & Mistral7BModel by @marcromeyn :: PR: #9486 +- In framework deploy using deploy script by @oyilmaz-nvidia :: PR: #9468 +- [NeMo-UX] Integrate tokenizer import into model.import_ckpt by @marcromeyn :: PR: #9485 +- append to file by @malay-nagda :: PR: #9483 +- [NeMo-UX] Fix bug in import_ckpt by @marcromeyn :: PR: #9492 +- Add nemotron news by @ericharper :: PR: #9510 +- Add CICD test for Stable Diffusion by @michal2409 :: PR: #9464 +- Akoumparouli/nemo ux mixtral by @akoumpa :: PR: #9446 +- [NeMo-UX] Llama and Gemma by @cuichenx :: PR: #9528 +- [NeMo-UX] minor logging bug fixes by @ashors1 :: PR: #9529 +- Update neva conversion script from and to HF by @yaoyu-33 :: PR: #9296 +- [Nemo-UX] IO fixes by @marcromeyn :: PR: #9512 +- Fix lhotse tests for v1.24.2 by @pzelasko :: PR: #9546 +- [Nemo CICD] Make GPU Unit Tests non-optional by @pablo-garay :: PR: #9551 +- Add Python AIStore SDK to container and bump min Lhotse version by @pzelasko :: PR: #9537 +- [NeMo-UX] Fix tokenizer IO by @marcromeyn :: PR: #9555 +- [NeMo UX] Move mistral_7b.py to mistral.py by @akoumpa :: PR: #9545 +- ci: Do not attempt to send slack on fork by @ko3n1g :: PR: #9556 +- Fix SDXL incorrect name in Docs by @suiyoubi :: PR: #9534 +- Bump PTL version by @athitten :: PR: #9557 +- [Resiliency] Straggler detection by @jbieniusiewi :: 
PR: #9473 +- [NeMo-UX] Switch to torch_dist as default distributed checkpointing backend by @ashors1 :: PR: #9541 +- [NeMo-UX] Checkpointing bug fixes by @ashors1 :: PR: #9562 +- Expose MCore path_to_cache option by @maanug-nv :: PR: #9570 +- [NeMo-UX] Fix Trainer serialization by @marcromeyn :: PR: #9571 +- Update click version requirement by @thomasdhc :: PR: #9580 +- [Fault tolerance] Heartbeat detection by @maanug-nv :: PR: #9352 +- [Nemo-UX] Add fabric-API for manual forward-pass by @marcromeyn :: PR: #9577 +- [Nemo-UX] Add SDK-factories to llm-collection by @marcromeyn :: PR: #9589 +- [NeMo-UX] Some improvements to NeMoLogger by @marcromeyn :: PR: #9591 +- Set no_sync_func & grad_sync_fucn by @akoumpa :: PR: #9601 +- [NeMo-UX] Fix nemo logger when trainer has no loggers by @ashors1 :: PR: #9607 +- Fix the dictionary format returned by the `scheduler` method by @sararb :: PR: #9609 +- [NeMo-UX] Dataloading enhancements and bug fixes by @ashors1 :: PR: #9595 +- Fix serialization of AutoResume by @sararb :: PR: #9616 +- Jsonl support by @adityavavre :: PR: #9611 +- Akoumparouli/mistral import instruct chat template fix by @akoumpa :: PR: #9567 +- Remove .cuda calls, use device isntead by @akoumpa :: PR: #9602 +- fix converter defautl args by @akoumpa :: PR: #9565 +- fix: remove non_blocking from PTL's .cuda call by @akoumpa :: PR: #9618 +- NeVA Minor Fixes by @yaoyu-33 :: PR: #9608 +- [NeMo-UX] fix pretrianing data sizes and weights by @cuichenx :: PR: #9627 +- [NeMo-UX] async checkpointing support by @ashors1 :: PR: #9466 +- Change default parallel_save to False by @mikolajblaz :: PR: #9632 +- Add REST API to deploy module by @athitten :: PR: #9539 +- ci: Timeout per step, not job by @ko3n1g :: PR: #9635 +- [NeMo-UX] Fix when optimizers are setup for PEFT by @marcromeyn :: PR: #9619 +- [NeMo-UX] Fix pipeline parallel bug by @ashors1 :: PR: #9637 +- Fixing import error fior llama-index (RAG pipeline) by @pablo-garay :: PR: #9662 +- llama CI fix by @rohitrango :: PR: #9663 +- [NeMo-UX] Make 'load_directly_on_device' configurable by @ashors1 :: PR: #9657 +- [Nemo-UX] Including all trainable-params in a PEFT-checkpoint by @marcromeyn :: PR: #9650 +- [NeMo-UX] Fix imports so local configuration of runs works again by @marcromeyn :: PR: #9690 +- Set TE flag in legacy -> mcore conversion script by @terrykong :: PR: #9722 +- Update starthere docs text by @erastorgueva-nv :: PR: #9724 +- TorchAudio installation workaround for incorrect `PYTORCH_VERSION` variable by @artbataev :: PR: #9736 +- [NeMo-UX] Match nemo 1's default behavior for drop_last and pad_samples_to_global_batch_size by @ashors1 :: PR: #9707 +- add a bit more for timeout (#9702) by @pablo-garay :: PR: #9754 +- Fix missing parallelisms by @maanug-nv :: PR: #9725 +- update branch by @nithinraok :: PR: #9764 +- Fix data preprocessing script by @cuichenx :: PR: #9759 +- vLLM 0.5.1 update by @apanteleev :: PR: #9779 +- upper bound hf-hub by @akoumpa :: PR: #9805 +- Fix few issues and docs for neva and clip in r2.0.0rc1 by @yaoyu-33 :: PR: #9681 +- add dummy vision and text transformer config (assumed mcore to be false) by @rohitrango :: PR: #9699 +- fix lita bugs by @Slyne :: PR: #9810 +- [NeMo-UX] Log `val_loss` by @ashors1 :: PR: #9814 +- [NeMo-UX] Fix some dataloading bugs by @ashors1 :: PR: #9807 +- [NeMo-UX] Adding recipes by @marcromeyn :: PR: #9720 +- [NeMo-UX] Set async_save from strategy rather than ModelCheckpoint by @ashors1 :: PR: #9800 +- Fix hf hub for 0.24+ by @titu1994 :: PR: #9806 +- [NeMo-UX] Fix a minor bug with 
async checkpointing by @ashors1 :: PR: #9856 +- [NeMo-UX] make progress bar easier to parse by @ashors1 :: PR: #9877 +- Docs: add "Nemo Fundamentals" page by @erastorgueva-nv :: PR: #9835 +- Create __init__.py by @stevehuang52 :: PR: #9892 +- [NeMo-UX] Fixes to make PreemptionCallback work by @hemildesai :: PR: #9830 +- Fix Docker build. Make Dockerfile consistent with CI by @artbataev :: PR: #9784 +- Multimodal data prep notebook fix by @cuichenx :: PR: #9910 +- [NeMo-UX] Add distributed checkpointing unit tests by @ashors1 :: PR: #9794 +- r2.0.0rc1 fix for dist checkpoint loading by @yaoyu-33 :: PR: #9854 +- [NeMo-UX] Rename sdk references to NeMo Run by @hemildesai :: PR: #9872 +- [NeMo-UX] Fix some serialization bugs by @ashors1 :: PR: #9868 +- add mixtral neva tutorial (moe + token fusion + siglip) by @paul-gibbons :: PR: #9926 +- [NeMo-UX] Add more NeMo Logger tests by @ashors1 :: PR: #9795 +- Akoumparouli/mixtral fixes for r2.0.0rc1 by @akoumpa :: PR: #9911 +- R2.0.0rc1 clip fix by @Slyne :: PR: #9871 +- [NeMo-UX] Add missing docstrings and update some defaults by @ashors1 :: PR: #9895 +- Add REST service requirements.txt by @oyilmaz-nvidia :: PR: #9923 +- add bert latest fix by @JRD971000 :: PR: #9921 +- remove empy reconfigure_limit_batches by @akoumpa :: PR: #9934 +- fix mem by @terrykong :: PR: #9964 +- Run a sample query for a quantized model conditionally by @janekl :: PR: #9965 +- Add pydantic-settings by @oyilmaz-nvidia :: PR: #9961 +- Resiliency features update by @jbieniusiewi :: PR: #9714 +- [NeMo-UX] Wrap task config save in a try/except by @ashors1 :: PR: #9956 +- [NeMo-UX] Update default PTL logging `save_dir` by @ashors1 :: PR: #9954 +- Fix lita tutorial by @Slyne :: PR: #9980 +- Add deploy and REST API support to NeMo 2.0 by @athitten :: PR: #9834 +- ci: Allow changelog manual (#10156) by @ko3n1g :: PR: #10157 +- docs: Add changelog by @ko3n1g :: PR: #10155 +- add manifest file by @ko3n1g :: PR: #10161 + +
+ +## NVIDIA Neural Modules 2.0.0rc0 + +### Highlights + +#### LLM and MM + +##### Models + +- Megatron Core RETRO + - Pre-training + - Zero-shot Evaluation + +- Pretraining, conversion, evaluation, SFT, and PEFT for: + - Mixtral 8X22B + - Llama 3 + - SpaceGemma + +- Embedding Models Fine Tuning + - Mistral + - BERT + +- BERT models + - Context Parallel + - Distributed checkpoint + +- Video capabilities with NeVa + +##### Performance + +- Distributed Checkpointing + - Torch native backend + - Parallel read/write + - Async write + +- Multimodal LLM (LLAVA/NeVA) + - Pipeline Parallelism support + - Sequence packing support + +##### Export + +- Integration of Export & Deploy Modules into NeMo Framework container + - Upgrade to TRT-LLM 0.9 + +#### Speech (ASR & TTS) + +##### Models + +- AED Multi Task Models (Canary) - Multi-Task Multi-Lingual Speech Recognition / Speech Translation model +- Multimodal Domain - Speech LLM supporting SALM Model +- Parakeet-tdt_ctc-1.1b Model - RTFx of > 1500 (can transcribe 1500 seconds of audio in 1 second) +- Audio Codec 16kHz Small - NeMo Neural Audio Codec for discretizing speech for use in LLMs + - mel_codec_22khz_medium + - mel_codec_44khz_medium + +##### Perf Improvements + +- Transcribe() upgrade - Enables one line transcribe with files, tensors, data loaders +- Frame looping algorithm for RNNT faster decoding - Improves Real Time Factor (RTF) by 2-3x +- Cuda Graphs + Label-Looping algorithm for RNN-T and TDT Decoding - Transducer Greedy decoding at over 1500x RTFx, on par with CTC Non-Autoregressive models +- Semi Sorted Batching support - External User contribution that speeds up training by 15-30%. + +##### Customization + +- Context biasing for CTC word stamping - Improve accuracy for custom vocabulary and pronunciation + - Longform Inference + - Longform inference support for AED models +- Transcription of multi-channel audio for AED models + +##### Misc + +- Upgraded webdataset - Speech and LLM / Multimodal unified container + +### Detailed Changelogs + +#### ASR + +
Changelog + +- Enable using hybrid asr models in CTC Segmentation tool by @erastorgueva-nv :: PR: #8828 +- TDT confidence fix by @GNroy :: PR: #8982 +- Fix union type annotations for autodoc+mock-import rendering by @pzelasko :: PR: #8956 +- NeMo dev doc restructure by @yaoyu-33 :: PR: #8896 +- Improved random seed configuration for Lhotse dataloaders with docs by @pzelasko :: PR: #9001 +- Fix #8948, allow preprocessor to be stream captured to a cuda graph when doing per_feature normalization by @galv :: PR: #8964 +- [ASR] Support for transcription of multi-channel audio for AED models by @anteju :: PR: #9007 +- Add ASR latest news by @titu1994 :: PR: #9073 +- Fix docs errors and most warnings by @erastorgueva-nv :: PR: #9006 +- PyTorch CUDA allocator optimization for dynamic batch shape dataloading in ASR by @pzelasko :: PR: #9061 +- RNN-T and TDT inference: use CUDA graphs by default by @artbataev :: PR: #8972 +- Fix #8891 by supported GPU-side batched CTC Greedy Decoding by @galv :: PR: #9100 +- Update branch for notebooks and ci in release by @ericharper :: PR: #9189 +- Enable CUDA graphs by default only for transcription by @artbataev :: PR: #9196 +- rename paths2audiofiles to audio by @nithinraok :: PR: #9209 +- Fix ASR_Context_Biasing.ipynb contains FileNotFoundError by @andrusenkoau :: PR: #9233 +- Cherrypick: Support dataloader as input to `audio` for transcription (#9201) by @titu1994 :: PR: #9235 +- Update Online_Offline_Microphone_VAD_Demo.ipynb by @stevehuang52 :: PR: #9252 +- Dgalvez/fix greedy batch strategy name r2.0.0rc0 by @galv :: PR: #9243 +- Accept None as an argument to decoder_lengths in GreedyBatchedCTCInfer::forward by @galv :: PR: #9246 +- Fix loading github raw images on notebook by @nithinraok :: PR: #9282 +- typos by @nithinraok :: PR: #9314 +- Re-enable cuda graphs in training modes. by @galv :: PR: #9338 +- add large model stable training fix and contrastive loss update for variable seq by @nithinraok :: PR: #9259 +- Fix conv1d package in r2.0.0rc0 by @pablo-garay :: PR: #9369 +- Fix GreedyBatchedCTCInfer regression from GreedyCTCInfer. (#9347) by @titu1994 :: PR: #9350 +- Make a backward compatibility for old MSDD configs in label models by @tango4j :: PR: #9377 +- Force diarizer to use CUDA if cuda is available and if device=None. by @tango4j :: PR: #9380 + +
+ +#### TTS + +
Changelog + +- [TTS] Add tutorial for training audio codecs by @rlangman :: PR: #8723 +- Update radtts.py by @blisc :: PR: #9097 +- [Nemo CICD] RADTTS test optional by @pablo-garay :: PR: #9112 +- Remove Radtts CI test by @blisc :: PR: #9144 +- Fix T5 G2P Input and Output Types by @blisc :: PR: #9224 + +
+ +#### LLM and MM + +
Changelog + +- Rachitg/dpa by @rachitgarg91 :: PR: #8911 +- Remove precision args in trainer due to PTL update by @yaoyu-33 :: PR: #8908 +- Huvu/mcore retro by @huvunvidia :: PR: #8861 +- fsdp tp > 1 bug fix by @dimapihtar :: PR: #8947 +- Fix memory leak at loss func by @minitu :: PR: #8868 +- change the condition for get qkv tensor from linear_qkv output in mcoremixin by @HuiyingLi :: PR: #8965 +- Add safety checks for 'data' key in MegatronGPTModel cfg by @HuiyingLi :: PR: #8991 +- [NeMo-UX] Adding MegatronParallel by @cuichenx :: PR: #8987 +- Skip top_p computations when set to 1.0 by @odelalleau :: PR: #8905 +- Gemma bug by @cuichenx :: PR: #8962 +- [NeMo-UX] Adding megatron strategy by @marcromeyn :: PR: #8995 +- Quantized checkpoint support in export and deploy modules by @janekl :: PR: #8859 +- add geglu to mlp swap by @JRD971000 :: PR: #8999 +- add timeout for new_group by @acphile :: PR: #8998 +- Zero-shot evaluation pipeline for mcore RETRO by @huvunvidia :: PR: #8941 +- Added fusion for squared relu by @sanandaraj5597 :: PR: #8963 +- Developer Documents for mcore RETRO by @huvunvidia :: PR: #9026 +- [NeMo-UX] Adding GPTModel & MockDataModule by @marcromeyn :: PR: #9011 +- Adding unit test for mcore RETRO model by @huvunvidia :: PR: #9022 +- docs and simplification of cmd args by @arendu :: PR: #8979 +- [NeMo-UX] Add checkpoint-io to MegatronStrategy by @marcromeyn :: PR: #9057 +- Enable Sequence Packing and Pipeline Parallel in NeVA by @yaoyu-33 :: PR: #8957 +- Mingyuanm/add back fp8 support to sd by @Victor49152 :: PR: #9070 +- unfused lora by @arendu :: PR: #9004 +- Handle case where num_query_groups is set to null for LoRA config setup by @vysarge :: PR: #9075 +- Alit/griffin by @JRD971000 :: PR: #9021 +- Implement DistributedCheckpointIO by @mikolajblaz :: PR: #9016 +- Video Neva Pretraining + Inference Implementation by @paul-gibbons :: PR: #9095 +- HF to .nemo for Mixtral-8x22B-instruct by @akoumpa :: PR: #9060 +- mcore ds updates by @dimapihtar :: PR: #8951 +- Alit/griffin perf by @JRD971000 :: PR: #9107 +- Add assert for max_steps to be positive in MegatronGPTSFTModel by @athitten :: PR: #9110 +- Extend sequence length padding for GPT SFT to account for context parallel by @vysarge :: PR: #8869 +- Update gpt dataset config parameter for mock by @thomasdhc :: PR: #9118 +- Add Mcore DistributedDataParallel and distributed optimizer into Nemo by @gdengk :: PR: #9034 +- Revert "Add assert for max_steps to be positive in MegatronGPTSFTMode… by @pablo-garay :: PR: #9128 +- scripts to convert HF lora to nemo by @arendu :: PR: #9102 +- Prevent duplicated checkpoints by @mikolajblaz :: PR: #9015 +- add TN/ITN link in speech tools list by @erastorgueva-nv :: PR: #9142 +- Cleanup deprecated files and temporary changes by @cuichenx :: PR: #9088 +- Use DP+CP groups as the FSDP sharding domain by @erhoo82 :: PR: #9145 +- CUDA memory profile by @erhoo82 :: PR: #9096 +- Fix missing func for T5 model by @gdengk :: PR: #9141 +- Add knob for load_directly_on_device by @mikolajblaz :: PR: #9125 +- Revert rope fusion defaults by @cuichenx :: PR: #9238 +- Update nemo.export module for quantized models by @janekl :: PR: #9250 +- Fix circular import for MM dataprep notebook by @cuichenx :: PR: #9287 +- neva media_type + text generation default fix by @paul-gibbons :: PR: #9257 +- fix lora and ptuning and isort/black by @oyilmaz-nvidia :: PR: #9290 +- add check if num layers is divisible by pp size by @dimapihtar :: PR: #9208 +- Fix P-tuning for Llama based models by @apanteleev :: PR: #9297 +- 
add deprecation warnings by @pablo-garay :: PR: #9266 +- move pooler under post_process by @dimapihtar :: PR: #9328 +- add deprecation note for nmt by @dimapihtar :: PR: #9342 +- Fix incorrect checkpoint removal logic (#9192) by @mikolajblaz :: PR: #9204 +- fix fp16 precision issue by @dimapihtar :: PR: #9376 +- Fix module.training for Neva in FusedAttn backward which causes nan by @yaoyu-33 :: PR: #8877 + +
+ +#### Export + +
Changelog + +- Updates for TRT-LLM 0.9 by @oyilmaz-nvidia :: PR: #8873 +- Mingyuanm/sdxl export by @Victor49152 :: PR: #8926 +- Avoid unpacking NeMo checkpoints before exporting to TRT-LLM by @apanteleev :: PR: #8866 +- Update gemma for trt-llm 0.9 by @oyilmaz-nvidia :: PR: #8974 +- TRT-LLM export P-tuning related fixes by @apanteleev :: PR: #8863 + +
+ +#### General Improvements + +
Changelog + +- Update package info by @ericharper :: PR: #8793 +- [Nemo CICD] Update mcore 4.13.24 by @pablo-garay :: PR: #8917 +- Akoumparouli/low mem mixtral ckpt converter by @akoumpa :: PR: #8895 +- Adding RETRO tests to Action Tests (cicd-main.yml) by @huvunvidia :: PR: #8942 +- Akoumparouli/fix sd train 2 by @akoumpa :: PR: #8883 +- Update te install for jenkins by @ericharper :: PR: #8954 +- [Nemo CICD] Add last job depending on others for blocking check by @pablo-garay :: PR: #8959 +- Minor quantization pipeline updates by @janekl :: PR: #8924 +- Fix External CLIP Converter by @yaoyu-33 :: PR: #8960 +- PP support in LoRA merge script by @cuichenx :: PR: #8934 +- Update PR template by @ericharper :: PR: #8978 +- Update Latest News by @shashank3959 :: PR: #8837 +- Fix incorrect link to latest news in README by @shashank3959 :: PR: #8985 +- Update dependency install for LLM and MM by @ericharper :: PR: #8990 +- Temporarily remove mcore dep by @ericharper :: PR: #9010 +- [Nemo CICD] further specialize runners for more parallelism by @pablo-garay :: PR: #9036 +- Update mm dataprep notebook based on feedback by @cuichenx :: PR: #9029 +- Fix import in lora merge script by @cuichenx :: PR: #9032 +- [Nemo CICD] Run when labeled:Run CICD by @pablo-garay :: PR: #9044 +- [Nemo CICD] Add tag/label for 1-gpu runner by @pablo-garay :: PR: #9046 +- [Nemo CICD] checkout v4 by @pablo-garay :: PR: #9048 +- [Nemo CICD] Remove temp test change by @pablo-garay :: PR: #9049 +- remove in-place addition for dreambooth train with text encoder by @Victor49152 :: PR: #8825 +- Mingyuanm/sdxl quantization notebook by @Victor49152 :: PR: #9042 +- [Nemo CICD] Trigger on comment issued by @pablo-garay :: PR: #9062 +- zarr ckpt to torch_dist ckpt converter by @dimapihtar :: PR: #8842 +- Restore PTQ tests for Llama2 (reopened) by @janekl :: PR: #9064 +- add clip H config by @JRD971000 :: PR: #9082 +- [NeMo-UX] Add mixed-precision plugin by @marcromeyn :: PR: #9065 +- Comment baichuan test and update pr template by @ericharper :: PR: #9085 +- Add safe extraction of nemo tar files by @athitten :: PR: #8976 +- Improved `shard_id` parsing in `LazyNemoTarredIterator`, enables AIS dataloading by @pzelasko :: PR: #9077 +- [NeMo-UX] Add mistral-7b model by @marcromeyn :: PR: #9066 +- Llama3 Conversion Script Update by @suiyoubi :: PR: #9089 +- dehardcode test string by @JimmyZhang12 :: PR: #8865 +- [Nemo CICD] Try trigger cicd run on comment by @pablo-garay :: PR: #9111 +- Lhotse dataloading: RIR augmentation and nemo/tarred input support for RIR and noise aug by @pzelasko :: PR: #9109 +- mixtral evaluation PR by @Slyne :: PR: #8989 +- [Nemo CICD] Revert: run GHA cicd on comment by @pablo-garay :: PR: #9119 +- [Nemo CICD] Comment out flaky test: running too long by @pablo-garay :: PR: #9123 +- [Nemo CICD] Add timeout to unit tests by @pablo-garay :: PR: #9132 +- [Nemo CICD] Indicate optional test in name (prefix) by @pablo-garay :: PR: #9139 +- video neva null image+video folder path fix by @paul-gibbons :: PR: #9116 +- [NeMo-UX] Add data module by @cuichenx :: PR: #9133 +- NeMo Inference Requirements by @oyilmaz-nvidia :: PR: #9093 +- Remove debug print by @maanug-nv :: PR: #9074 +- Remove legacy CI by @pablo-garay :: PR: #9149 +- Update support for push_to_hf_hub() by @titu1994 :: PR: #9159 +- [Nemo CICD] comment out flaky PTQ tests by @pablo-garay :: PR: #9160 +- Update branch by @ericharper :: PR: #9211 +- dist adam transpose fix by @dimapihtar :: PR: #9239 +- [Nemo CICD] Increase time limit for Speech_Checkpoints_tests 
(#9186) by @pablo-garay :: PR: #9247 +- Pin transformers by @ericharper :: PR: #9261 +- Fix typo in HF tutorial by @titu1994 :: PR: #9302 + +
+ +## NVIDIA Neural Modules 1.23.0 + +### Highlights + +#### Models + +##### Nvidia Starcoder 2 - 15B + +- Announcement - https://developer.nvidia.com/blog/unlock-your-llm-coding-potential-with-starcoder2/ +- AI Foundation Model Inference - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/starcoder2-15b +- https://huggingface.co/bigcode/starcoder2-15b + +##### NeMo Canary +Announcement - https://nvidia.github.io/NeMo/blogs/2024/2024-02-canary/ + +- https://huggingface.co/nvidia/canary-1b + +#### NeMo LLM + +- Falcon +- Code Llama +- StarCoder +- GPT perf improvements +- Context parallelism +- Mistral +- Mixtral (without expert parallelism) +- Mcore GPT Dataset integration + +#### NeMo MM +- CLIP +- Stable Diffusion (supporting LoRA) +- Imagen +- ControlNet (for SD) +- Instruct pix2pix (for SD) +- LLAVA +- NeVA +- DreamFusion++ +- NSFW filtering + +#### NeMo ASR + +- Lhotse Dataloading support #7880 +- Canary: Multi task multi lingual ASR #8242 +- LongForm Audio for Diarization #7737 +- Faster algorithm for RNN-T Greedy #7926 +- Cache-Aware streaming notebook #8296 + +#### NeMo TTS + +#### NeMo Vision + +#### Known Issues + +##### ASR + +###### RNNT WER calculation when fused batch size > 1 during validation / test step() + +Previously, the RNNT metric was stateful while the CTC one was not ([r1.22.0](https://github.com/NVIDIA/NeMo/blob/r1.22.0/nemo/collections/asr/metrics/rnnt_wer_bpe.py#L419-L420), [r1.23.0](https://github.com/NVIDIA/NeMo/blob/r1.23.0/nemo/collections/asr/metrics/wer.py#L333)) + +Therefore this calculation in the RNNT joint for fused operation worked properly. However with the unification of metrics in r1.23.0, a bug was introduced where only the last sub-batch of metrics calculates the scores and does not accumulate. This is patched via https://github.com/NVIDIA/NeMo/pull/8587 and will be fixed in the next release. + +**Workaround**: Explicitly disable fused batch size during inference using the following command + +```python +from omegaconf import open_dict +model = ... +decoding_cfg = model.cfg.decoding +with open_dict(decoding_cfg): + decoding_cfg.fused_batch_size = -1 +model.change_decoding_strategy(decoding_cfg) +``` + +Note: This bug does not affect scores calculated via model.transcribe() (since it does not calculate metrics during inference, just text), or using the `transcribe_speech.py` or `speech_to_text_eval.py` in `examples/asr`. + +###### Two failing unit tests due to a change in expected results, caused by lhotse version update + +#### Container + +For additional information regarding NeMo containers, please visit: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo + +`docker pull nvcr.io/nvidia/nemo:24.01.speech` + +#### ASR + +
Changelog + +- Update link to yaml file in ASR_with_Transducers.ipynb by @Faith-Nchifor :: PR: #8014 +- Use convert_hf_dataset_to_nemo by @karpnv :: PR: #8017 +- Update asr_language_modeling.rst: Add a missing word by @martin0258 :: PR: #8007 +- spelling mistake by @orena1 :: PR: #7903 +- update asr eval by @stevehuang52 :: PR: #8045 +- fix noise aug by @stevehuang52 :: PR: #8057 +- Various fixes for typos and urls by @titu1994 :: PR: #8066 +- [Fix] Increase length check tolerance to prevent test failing by @anteju :: PR: #8067 +- Add text metrics to asr eval by @stevehuang52 :: PR: #8087 +- fix device setting to allow using accelerator cpu by @orena1 :: PR: #8084 +- .ctm in data simulator annotator compliant with RT-09 specification by @popcornell :: PR: #8004 +- Fix AST eval by @stevehuang52 :: PR: #8112 +- fix: numba.*_num_threads resets torch num_threads #8141 by @itzsimpl :: PR: #8145 +- Update dependencies by @titu1994 :: PR: #8156 +- NeMo + Lhotse integration by @pzelasko :: PR: #7880 +- Speedup RNN-T greedy decoding by @artbataev :: PR: #7926 +- [docker] Install k2 before NeMo for faster image rebuilding by @pzelasko :: PR: #8204 +- [docs] Add --force_codec to tarred dataset creation examples by @pzelasko :: PR: #8227 +- Temporarily use the previous RNN-T decoding algorithm as default by @artbataev :: PR: #8226 +- Make TDT inference not require duration params by @hainan-xv :: PR: #8207 +- Cache Aware Streaming tutorial notebook by @erastorgueva-nv :: PR: #8296 +- fix path location and branch by @nithinraok :: PR: #8304 +- Attention encoder-decoder models for multiple speech-to-text tasks … by @titu1994 :: PR: #8324 +- Remove asr webapp by @titu1994 :: PR: #8347 +- remove _target_ at model level in aed model config [ASR] by @krishnacpuvvada :: PR: #8351 +- Add change_vocabulary and save_tokenizers() support to Multitask ASR models by @titu1994 :: PR: #8357 +- Change default beam size by @titu1994 :: PR: #8371 +- adding jenkins test for speech_to_text_aed model by @krishnacpuvvada :: PR: #8368 +- Add Finetuning tutorial with HF Datasets by @nithinraok :: PR: #8356 +- wer fix by @tbartley94 :: PR: #8404 +- add ensemble decoding fix by @nithinraok :: PR: #8427 +- Update k2 by @artbataev :: PR: #8492 + +
+ +#### TTS + +
Changelog + +- [TTS] Scale sampler steps by number of devices by @rlangman :: PR: #7947 +- Add All Multimodal Source Code Part 2: Text to image, x to nerf by @yaoyu-33 :: PR: #7970 +- [TTS] Add period discriminator and feature matching loss to codec recipe by @rlangman :: PR: #7884 +- Added VectorQuantizer base class by @anteju :: PR: #8011 + +
+ +#### LLMs + +
Changelog + +- Add interface to set NCCL options of each process group by @erhoo82 :: PR: #7923 +- Support O2 training of PEFT and SFT by @cuichenx :: PR: #7971 +- [NLP] Access scaler only in FP16 case by @janekl :: PR: #7916 +- [NLP] Minor improvements in Llama conversion script by @janekl :: PR: #7978 +- [NLP] Use helpers from utils_funcs.py in Llama conversion by @janekl :: PR: #7979 +- [NLP] Remove replace_sampler_ddp (deprecated in Trainer) by @janekl :: PR: #7981 +- Reworked MegatronPretrainingRandomBatchSampler to correctly handle epochs > 1 by @trias702 :: PR: #7920 +- Remove deprecated arguments from TE's TransformerLayer by @jbaczek :: PR: #7917 +- Add All Multimodal Source Code by @yaoyu-33 :: PR: #7791 +- First draft of mcore bert model in NeMo by @shanmugamr1992 :: PR: #7814 +- Support Falcon Variants (7B/40B/180B) in Mcore NeMo by @xuanzic :: PR: #7666 +- FSDP + Tensor Parallelism by @erhoo82 :: PR: #7897 +- Packed Sequence by @cuichenx :: PR: #7945 +- Adding method back that was removed accidentally by @ericharper :: PR: #8038 +- [NLP] ArtifactItem with init=True to make it debuggable by @janekl :: PR: #7980 +- SFT patch: (1) enable sequence parallelism and (2) enable profile by @erhoo82 :: PR: #7963 +- migration to PTL 2.0 for spellmapper model by @bene-ges :: PR: #7924 +- Change the megatron config lr scheduler default and fix to change partitions script by @shan18 :: PR: #8094 +- (1) Add SHARP interface to M-CORE, (2) use send/recv to send train loss to the first rank instead of b-cast by @erhoo82 :: PR: #7793 +- Reconfigure limit_val_batches only for int by @athitten :: PR: #8099 +- Fixing wrapper and moving it to base class by @shanmugamr1992 :: PR: #8055 +- fix gated_linear_unit bug by @Agoniii :: PR: #8042 +- Fix Adapter for MCore models by @cuichenx :: PR: #8124 +- add war fix for sync issues by @gshennvm :: PR: #8130 +- Improve PEFT UX by @cuichenx :: PR: #8131 +- Enhance flexibility by passing callbacks as method argument by @michal2409 :: PR: #8015 +- context parallelism by @xrennvidia :: PR: #7739 +- Make pipelined TP comm overlap available with mcore by @erhoo82 :: PR: #8005 +- remove deprecated scripts by @arendu :: PR: #8138 +- adding OnlineSampleMapping by @arendu :: PR: #8137 +- Add distopt support for FP8 params and BF16 optimizer state by @timmoon10 :: PR: #7909 +- Revert adding OnlineSampleMapping by @pablo-garay :: PR: #8164 +- Token count and sequence length logging for MegatronGPTSFTModel by @vysarge :: PR: #8136 +- Use latest apex internal API by @jbaczek :: PR: #8129 +- tune specific params in the base model by @arendu :: PR: #7745 +- Virtual pipeline parallel support for MegatronGPTSFTModel by @vysarge :: PR: #7964 +- removed deprecated peft model by @arendu :: PR: #8183 +- remove more deprecated files by @arendu :: PR: #8169 +- Pre-generate cu_seqlens argmin and max_seqlen to remove host-to-device sync by @erhoo82 :: PR: #8108 +- Add the interface to use SHARP to FSDP strategy by @erhoo82 :: PR: #8202 +- Multimodal required NLP base model changes by @yaoyu-33 :: PR: #8188 +- [NLP] Improve and unify loading state_dict for community models by @janekl :: PR: #7977 +- Rename Finetuning Scripts by @cuichenx :: PR: #8201 +- Final multimodal PR with our recent developments on MM side by @yaoyu-33 :: PR: #8127 +- Add include_text parameter to SFT dataloaders by @Kipok :: PR: #8198 +- Add random_seed argument to generate by @Kipok :: PR: #8162 +- Added support for neptune logger by @harishankar-gopalan :: PR: #8210 +- Pre-compute max_seqlen and 
cu_seqlens_argmin in all model-parallel cases by @erhoo82 :: PR: #8222 +- Use PackedSeqParams in accordance with changes in Megatron-LM by @cuichenx :: PR: #8205 +- Fix to peft & virtual pipeline parallel unsupported check by @vysarge :: PR: #8216 +- Fixed the tp overlap switch by @sanandaraj5597 :: PR: #8195 +- add knobs for rope/swiglu fusion by @lhb8125 :: PR: #8184 +- Added sample cpu_offloading switch to YAML by @sanandaraj5597 :: PR: #8148 +- Syncing random seed between ranks in generate by @Kipok :: PR: #8230 +- add first_val_step to mcore scheduler by @JimmyZhang12 :: PR: #8150 +- Correct padding for SFT input data to account for sequence parallel + TE's fp8 op dimension requirements by @vysarge :: PR: #8240 +- Mistral 7b conversion script by @akoumpa :: PR: #8052 +- switch to mcore dataset [with FIM support] by @dimapihtar :: PR: #8149 +- Mixtral to NeMo conversion script. by @akoumpa :: PR: #8155 +- fixes to accomendate mcore changes by @HuiyingLi :: PR: #8261 +- Allow MegatronPretrainingRandomSampler to do multi-epoch training by @trias702 :: PR: #8239 +- Add dist ckpt support for regular optimizers by @mikolajblaz :: PR: #7749 +- add deallocate pipeline output optimization by @JimmyZhang12 :: PR: #8279 +- Fix memory leak caused by context parallelism hanging references by omegaconf by @JimmyZhang12 :: PR: #8299 +- distributed fused adam + rampup bs support by @dimapihtar :: PR: #8302 +- Update PEFT Doc by @cuichenx :: PR: #8262 +- Converter script fixes for mixtral/mistral by @akoumpa :: PR: #8272 +- Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 by @erhoo82 :: PR: #8334 +- Enable megatron core loggers for GPT pretraining by @ashbhandare :: PR: #8354 +- mcore ds fix by @dimapihtar :: PR: #8283 +- release updates by @dimapihtar :: PR: #8378 +- Mcore customization doc by @HuiyingLi :: PR: #8298 +- updated link to pubmed by @nithinraok :: PR: #8402 +- mcore customization doc minor fix by @HuiyingLi :: PR: #8421 +- Fixing mcore bert for TP, PP and SP by @shanmugamr1992 :: PR: #8336 +- Add settings to suppress bf16 compile errors in CI on V100 by @athitten :: PR: #8481 +- MoE parameter passing by @akoumpa :: PR: #8255 +- Add fp8 support for SD/Update notebook paths by @Victor49152 :: PR: #8489 + +
+ +#### NeMo Tools + +
Changelog + +- SDE bugfix log by @Jorjeous :: PR: #8430 + +
+ +#### General Improvements + +
Changelog + +- Add news section to README by @ericharper :: PR: #7984 +- Fixing conversion script to work for code llama by @shanmugamr1992 :: PR: #7997 +- Fix crash when converting to mcore a model using rotary embeddings by @odelalleau :: PR: #7998 +- Added a procedure for Windows users, README by @Jorjeous :: PR: #7942 +- Update manifest.py to speedup loading tarred datasets by @stevehuang52 :: PR: #7900 +- [Fix] Fixed name of a test by @anteju :: PR: #7986 +- Fix lora merge script by @cuichenx :: PR: #8113 +- Support transcoding audio formats when saving tarred datasets (FLAC, OPUS) by @pzelasko :: PR: #8102 +- README edit to change Apple Silicon install instructions (to fix a break introduced by pytorch 2) by @stephenmcconnachie :: PR: #8122 +- Fixes NVIDIA/apex installation to not erroneously install the pkg by @terrykong :: PR: #8126 +- Graphviz fix by @GNroy :: PR: #7843 +- Update README.rst by @fayejf :: PR: #8154 +- Fix TP>1 issue for conversion script by @cuichenx :: PR: #8144 +- Support torch jit script by @artbataev :: PR: #8027 +- NeMo Multimodal Docs and Tests Initial PR by @yaoyu-33 :: PR: #8028 +- Remove left-over prints in NeMo+Lhotse code by @pzelasko :: PR: #8180 +- Upgrade to DLFW PyTorch 23.12 by @ericharper :: PR: #8163 +- Add Lhotse support for key in NeMo manifests by @pzelasko :: PR: #8197 +- Fix CPU Initialization and TP>1 for LoRA Merge Script by @cuichenx :: PR: #8199 +- Add support in Neural Typecheck to disable semantic checks by @titu1994 :: PR: #8212 +- Pin lhotse=1.19.2 in r1.23.0 by @pzelasko :: PR: #8303 +- Multimodal r1.23.0 bug fix by @yaoyu-33 :: PR: #8315 +- MCore dataset compatibility for tokenizers by @vysarge :: PR: #8390 +- Update NFA video download link by @erastorgueva-nv :: PR: #8406 +- Update MM Dataprep Tutorial by @cuichenx :: PR: #8410 +- Fix dreambooth data sampler issue by @yaoyu-33 :: PR: #8400 +- Fix a bug in CTM line processing function for multi-speaker data simulations by @tango4j :: PR: #8416 +- Akoumparouli/mistral bugfix by @akoumpa :: PR: #8353 +- pin to 0.5.0 by @ericharper :: PR: #8465 +- Update NeMo Multimodal Requirements by @yaoyu-33 :: PR: #8515 +- Fix link in multimodal dataprep tutorial by @cuichenx :: PR: #8517 + +
diff --git a/Dockerfile.ci b/Dockerfile.ci index 964fd419ccf5..38b82a288a2b 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -33,8 +33,8 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea -ARG MODELOPT_VERSION=0.13.0 -ARG MCORE_TAG=2bbe55be32e2d478c4b2ce575af1cccb8fc3d9b9 +ARG MODELOPT_VERSION=0.15.0 +ARG MCORE_TAG=2fd6e2b74efca73a1f2d27b89bb5419384b4d3bf ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ diff --git a/README.md b/README.md index cb2a357fd7ed..9b019d3ac175 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,38 @@ # **NVIDIA NeMo Framework** ## Latest News +
- Large Language Models and Multimodal + Large Language Models and Multimodal Models +
+ + + New Llama 3.1 Support + (2024-07-23) + + The NeMo Framework now supports training and customizing the Llama 3.1 collection of LLMs from Meta. +

+
+ + + Accelerate your Generative AI Distributed Training Workloads with the NVIDIA NeMo Framework on Amazon EKS + (2024-07-16) + + NVIDIA NeMo Framework now runs distributed training workloads on an Amazon Elastic Kubernetes Service (Amazon EKS) cluster. For step-by-step instructions on creating an EKS cluster and running distributed training workloads with NeMo, see the GitHub repository here. +

+
+
+ + + NVIDIA NeMo Accelerates LLM Innovation with Hybrid State Space Model Support + (2024/06/17) + + NVIDIA NeMo and Megatron Core now support pre-training and fine-tuning of state space models (SSMs). NeMo also supports training models based on the Griffin architecture as described by Google DeepMind. +

+
+
NVIDIA releases 340B base, instruct, and reward models pretrained on a total of 9T tokens. @@ -46,45 +74,6 @@ The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework.

-
- - - Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso - (2024/03/06) - - Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. - The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. - Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference. -

-
-
- - - New NVIDIA NeMo Framework Features and NVIDIA H200 - (2023/12/06) - - NVIDIA NeMo Framework now includes several optimizations and enhancements, - including: - 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, - 2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, - 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and - 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs. -

- - H200-NeMo-performance -

-
-
- - - NVIDIA now powers training for Amazon Titan Foundation models - (2023/11/28) - - NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). - The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. - The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. -

-
@@ -604,6 +593,53 @@ to the `gh-pages-src` branch of this repository. For detailed information, please consult the README located at the [gh-pages-src branch](https://github.com/NVIDIA/NeMo/tree/gh-pages-src#readme). +## Blogs + + +
+ Large Language Models and Multimodal Models +
+ + + Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso + (2024/03/06) + + Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. + The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. + Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference. +

+
+
+ + + New NVIDIA NeMo Framework Features and NVIDIA H200 + (2023/12/06) + + NVIDIA NeMo Framework now includes several optimizations and enhancements, + including: + 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, + 2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, + 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and + 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs. +

+ + H200-NeMo-performance +

+
+
+ + + NVIDIA now powers training for Amazon Titan Foundation models + (2023/11/28) + + NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). + The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. + The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. +

+
+
+ + ## Licenses - [NeMo GitHub Apache 2.0 diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index a6e9cbe96c63..2c0657d1c6ce 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -803,21 +803,153 @@ The following script may be used: .. code-block:: bash $ python scripts/speech_recognition/estimate_duration_bins.py -b 30 manifest.json + + # The script's output: Use the following options in your config: num_buckets=30 bucket_duration_bins=[1.78,2.34,2.69,... -For multi-dataset setups, one may provide multiple manifests and even their weights: +For multi-dataset setups, one may provide a dataset config directly: + +.. code-block:: bash + + $ python scripts/speech_recognition/estimate_duration_bins.py -b 30 input_cfg.yaml + + # The script's output: + Use the following options in your config: + num_buckets=30 + bucket_duration_bins=[1.91,3.02,3.56,... + + +It's also possible to manually specify the list of data manifests (optionally together with weights): .. code-block:: bash $ python scripts/speech_recognition/estimate_duration_bins.py -b 30 [[manifest.json,0.7],[other.json,0.3]] + + # The script's output: Use the following options in your config: num_buckets=30 bucket_duration_bins=[1.91,3.02,3.56,... +2D bucketing +~~~~~~~~~~~~ + +To achieve maximum training efficiency for some classes of models it is necessary to stratify the sampling +both on the input sequence lengths and the output sequence lengths. +One such example are attention encoder-decoder models, where the overall GPU memory usage can be factorized +into two main components: input-sequence-length bound (encoder activations) and output-sequence-length bound +(decoder activations). +Classical bucketing techniques only stratify on the input sequence length (e.g. duration in speech), +which leverages encoder effectively but leads to excessive padding on on decoder's side. + +To amend this we support a 2D bucketing technique which estimates the buckets in two stages. +The first stage is identical to 1D bucketing, i.e. we determine the input-sequence bucket bins so that +every bin holds roughly an equal duration of audio. +In the second stage, we use a tokenizer and optionally a prompt formatter (for prompted models) to +estimate the total number of tokens in each duration bin, and sub-divide it into several sub-buckets, +where each sub-bucket again holds roughly an equal number of tokens. + +To run 2D bucketing with 30 buckets sub-divided into 5 sub-buckets each (150 buckets total), use the following script: + +.. code-block:: bash + + $ python scripts/speech_recognition/estimate_duration_bins_2d.py \ + --tokenizer path/to/tokenizer.model \ + --buckets 30 \ + --sub-buckets 5 \ + input_cfg.yaml + + # The script's output: + Use the following options in your config: + num_buckets=30 + bucket_duration_bins=[[1.91,10],[1.91,17],[1.91,25],... + max_duration=... + max_tps=... + + +Note that the output in ``bucket_duration_bins`` is a nested list, where every bin specifies +the maximum duration and the maximum number of tokens that go into the bucket. +Passing this option to Lhotse dataloader will automatically enable 2D bucketing. +Note the presence of ``max_duration`` and ``max_tps`` (token-per-second) options: +these need to be included in dataloader's configuration to ensure we can use the buckets correctly at runtime +in case of outliers. +In general, if you change your data in training, it is highly advisable to re-estimate the duration bins. 
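The options printed by the estimation scripts are meant to be placed in the dataloader section of your training configuration, typically under ``model.train_ds``. Below is a minimal, illustrative sketch of a 2D-bucketing setup; the ``use_lhotse``, ``use_bucketing``, and ``batch_duration`` option names are assumed from the Lhotse dataloading options described earlier on this page, and every value shown is a placeholder to be replaced with the output of ``estimate_duration_bins_2d.py``:

.. code-block:: yaml

    # Illustrative sketch only: substitute the bins and limits printed by
    # estimate_duration_bins_2d.py for your own data.
    train_ds:
      use_lhotse: true         # assumed option name: enables the Lhotse dataloader
      use_bucketing: true      # assumed option name: enables the bucketing sampler
      batch_duration: 360      # total seconds of audio per batch; tune for your GPU
      num_buckets: 30
      # Nested [max_duration, max_tokens] bins enable 2D bucketing automatically.
      bucket_duration_bins: [[1.91, 10], [1.91, 17], [1.91, 25]]  # truncated for brevity
      max_duration: 40.0       # filter out utterances longer than the largest bucket
      max_tps: 12.0            # filter out token-per-second outliers

If you also run the OOMptimizer described later on this page, its output additionally provides a ``bucket_batch_size`` list intended to accompany the same ``bucket_duration_bins`` in this section of the configuration.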
+ +Note that reasonable values for tokens-per-second rarely exceed 12tps with reasonably good tokenizers. +If you find your dataset's TPS is much higher than that, you may have some bad data outliers. +In that case you may specify ``--max_tps`` option to discard those both in bin estimation and dataloading. + +We also support aggregate tokenizers for 2D bucketing estimation: + +.. code-block:: bash + + $ python scripts/speech_recognition/estimate_duration_bins_2d.py \ + --tokenizer path/to/en/tokenizer.model path/to/pl/tokenizer1.model \ + --langs en pl \ + --buckets 30 \ + --sub-buckets 5 \ + input_cfg.yaml + +To estimate 2D buckets for a prompted model such as Canary-1B, provide prompt format name and an example prompt. +For Canary-1B, we'll also provide the special tokens tokenizer. Example: + +.. code-block:: bash + + $ python scripts/speech_recognition/estimate_duration_bins_2d.py \ + --prompt-format canary \ + --prompt "[{'role':'user','slots':{'source_lang':'en','target_lang':'de','task':'ast','pnc':'yes'}}]" \ + --tokenizer path/to/spl_tokens/tokenizer.model path/to/en/tokenizer.model path/to/de/tokenizer1.model \ + --langs spl_tokens en de \ + --buckets 30 \ + --sub-buckets 5 \ + input_cfg.yaml + +Pushing GPU utilization to the limits with bucketing and OOMptimizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The default approach of specifying a ``batch_duration``, ``bucket_duration_bins`` and ``quadratic_duration`` +is quite flexible, but is not maximally efficient. We observed that in practice it often leads to under-utilization +of GPU memory and compute for most buckets (especially those with shorter durations). +While it is impossible to estimate GPU memory usage up-front, we can determine it empirically with a bit of search. + +OOMptimizer is an approach that given a NeMo model, optimizer, and a list of buckets (1D or 2D) +estimates the maximum possible batch size to use for each bucket. +It performs a binary search over batch sizes that succeed or lead to CUDA OOM until convergence. +We find that the resulting bucketing batch size profiles enable full GPU utilization in training, +while it only takes a couple of minutes to complete the search. + +In order to run OOMptimizer, you only need the bucketing bins (from previous sections) and a model configuration: + +.. 
code-block:: bash + + $ python scripts/speech_recognition/oomptimizer.py \ + --config-path fast-conformer_aed.yaml \ + --module-name nemo.collections.asr.models.EncDecMultiTaskModel \ + --buckets '[[3.975,30],[3.975,48],[4.97,37],[4.97,60],[5.851,42],[5.851,71],[6.563,46],[6.563,79],[7.32,49],[7.32,88],[8.19,54],[8.19,99],[8.88,61],[8.88,107],[9.75,66],[9.75,117],[10.55,72],[10.55,127],[11.21,76],[11.21,135],[11.87,79],[11.87,143],[12.54,82],[12.54,151],[13.08,87],[13.08,157],[13.62,91],[13.62,164],[14.16,93],[14.16,170],[14.7,96],[14.7,177],[15.19,99],[15.19,183],[15.67,101],[15.67,189],[16.13,103],[16.13,194],[16.66,105],[16.66,200],[17.2,108],[17.2,207],[17.73,111],[17.73,213],[18.2,114],[18.2,219],[18.69,117],[18.69,225],[19.15,120],[19.15,230],[19.62,123],[19.62,236],[20.264,122],[20.264,244],[32.547,173],[32.547,391],[36.587,227],[36.587,440],[40.0,253],[40.0,480]]' + + # The script's output: + + The final profile is: + bucket_duration_bins=[[3.975,30],[3.975,48],[4.97,37],[4.97,60],[5.851,42],[5.851,71],[6.563,46],[6.563,79],[7.32,49],[7.32,88],[8.19,54],[8.19,99],[8.88,61],[8.88,107],[9.75,66],[9.75,117],[10.55,72],[10.55,127],[11.21,76],[11.21,135],[11.87,79],[11.87,143],[12.54,82],[12.54,151],[13.08,87],[13.08,157],[13.62,91],[13.62,164],[14.16,93],[14.16,170],[14.7,96],[14.7,177],[15.19,99],[15.19,183],[15.67,101],[15.67,189],[16.13,103],[16.13,194],[16.66,105],[16.66,200],[17.2,108],[17.2,207],[17.73,111],[17.73,213],[18.2,114],[18.2,219],[18.69,117],[18.69,225],[19.15,120],[19.15,230],[19.62,123],[19.62,236],[20.264,122],[20.264,244],[32.547,173],[32.547,391],[36.587,227],[36.587,440],[40.0,253],[40.0,480]] + bucket_batch_size=[352,308,280,245,245,206,206,180,186,163,168,142,151,132,136,119,126,106,116,98,110,92,104,88,99,83,94,79,90,76,86,72,86,72,81,68,80,65,78,63,74,60,72,58,70,58,68,54,66,52,65,52,62,50,37,28,31,24,28,21] + max_tps=12.0 + max_duration=40.0 + +Use the resulting options in your training configuration (typically under namespace ``model.train_ds``) to apply the profile. + +It's also possible to run OOMptimizer using a pretrained model's name and bucket bins corresponding +to your fine-tuning data: + + $ python scripts/speech_recognition/oomptimizer.py \ + --pretrained-name nvidia/canary-1b \ + --buckets '[2.0,3.1,5.6,6.6,...]' + +Note that your training script can perform some additional actions using GPU RAM that cannot be anticipated by the OOMptimizer. +By default, we let the script use up to 90% of GPU's RAM for this estimation to account for that. +In the unlikely case you run into an OutOfMemoryError during training, you can try re-estimating the profile with the option ``--memory-fraction 0.75`` (or another value) that will further cap OOMptimizer's available GPU RAM. Seeds and randomness ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/asr/speech_intent_slot/api.rst b/docs/source/asr/speech_intent_slot/api.rst index d45f24f807f6..4a45715f78f7 100644 --- a/docs/source/asr/speech_intent_slot/api.rst +++ b/docs/source/asr/speech_intent_slot/api.rst @@ -15,10 +15,10 @@ Mixins .. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin :show-inheritance: :members: - :no-index: + :noindex: .. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin :show-inheritance: :members: - :no-index: + :noindex: diff --git a/docs/source/asr/ssl/api.rst b/docs/source/asr/ssl/api.rst index 8e6f83986032..77614e9ad5e3 100644 --- a/docs/source/asr/ssl/api.rst +++ b/docs/source/asr/ssl/api.rst @@ -15,12 +15,12 @@ Mixins .. 
autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin :show-inheritance: :members: - :no-index: + :noindex: .. autoclass:: nemo.core.classes.mixins.access_mixins.AccessMixin :show-inheritance: :members: - :no-index: + :noindex: diff --git a/docs/source/checkpoints/intro.rst b/docs/source/checkpoints/intro.rst index 7c7154d64015..37d3bd7051f9 100644 --- a/docs/source/checkpoints/intro.rst +++ b/docs/source/checkpoints/intro.rst @@ -4,8 +4,8 @@ Checkpoints In this section, we present key functionalities of NVIDIA NeMo related to checkpoint management. -Understanding Checkpoint Formats --------------------------------- +Checkpoint Formats +------------------ A ``.nemo`` checkpoint is fundamentally a tar file that bundles the model configurations (specified inside a YAML file), model weights (inside a ``.ckpt`` file), and other artifacts like tokenizer models or vocabulary files. This consolidated design streamlines sharing, loading, tuning, evaluating, and inference. @@ -43,7 +43,7 @@ The following example shows the contents of a quantized model intended to be ser └── tokenizer_config.yaml Community Checkpoint Converter ------------------------------ +------------------------------ We provide easy-to-use tools that enable users to convert community checkpoints into the NeMo format. These tools facilitate various operations, including resuming training, Supervised Fine-Tuning (SFT), Parameter-Efficient Fine-Tuning (PEFT), and deployment. For detailed instructions and guidelines, please refer to our documentation. We offer comprehensive guides to assist both end users and developers: diff --git a/docs/source/collections.rst b/docs/source/collections.rst index 0198ef250ce3..2f04d1557628 100644 --- a/docs/source/collections.rst +++ b/docs/source/collections.rst @@ -25,7 +25,7 @@ Documentation for the individual collections multimodal/vlm/intro multimodal/text2img/intro multimodal/nerf/intro - mumtimoda/speech_llm/intro + multimodal/speech_llm/intro .. toctree:: :maxdepth: 1 diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst index 8922c72d63eb..dee215ba0ed8 100644 --- a/docs/source/core/adapters/api.rst +++ b/docs/source/core/adapters/api.rst @@ -9,7 +9,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -18,7 +18,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -30,7 +30,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: ----- @@ -38,7 +38,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: ----- @@ -51,7 +51,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -60,7 +60,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -69,4 +69,4 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst index d8bed1b23a75..d4b38bc147b2 100644 --- a/docs/source/core/adapters/components.rst +++ b/docs/source/core/adapters/components.rst @@ -28,7 +28,7 @@ Adapter modules represent the functional form of the adapter. 
We discuss an exam :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: ----- @@ -36,7 +36,7 @@ Adapter modules represent the functional form of the adapter. We discuss an exam :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: Insertion Form - Module Adapters @@ -72,7 +72,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- @@ -81,7 +81,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names - :no-index: + :noindex: ----- diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 3c1a496993bd..6bdd18559902 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -4,7 +4,7 @@ NeMo Models Basics ------ -NeMo models contain everything needed to train and reproduce Conversational AI models: +NeMo models contain everything needed to train and reproduce conversational AI models: - neural network architectures - datasets/data loaders @@ -35,7 +35,7 @@ As an example, we can instantiate QuartzNet with the following: model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En") -To see all available pretrained models for a specific NeMo model, use the ``list_available_models()`` method. +To see all available pretrained models for a specific NeMo model, use the ``list_available_models()`` method: .. code-block:: Python @@ -52,7 +52,7 @@ Training NeMo leverages `PyTorch Lightning `__ for model training. PyTorch Lightning lets NeMo decouple the conversational AI code from the PyTorch training code. This means that NeMo users can focus on their domain (ASR, NLP, TTS) and -build complex AI applications without having to rewrite boiler plate code for PyTorch training. +build complex AI applications without having to rewrite boilerplate code for PyTorch training. When using PyTorch Lightning, NeMo users can automatically train with: @@ -168,7 +168,7 @@ While validation logic can be found in ``validation_step``: return {'val_loss': val_loss, 'tp': tp, 'fn': fn, 'fp': fp} -PyTorch Lightning then handles all of the boiler plate code needed for training. Virtually any aspect of training can be customized +PyTorch Lightning then handles all of the boilerplate code needed for training. Virtually any aspect of training can be customized via PyTorch Lightning `hooks `_, `Plugins `_, `callbacks `_, or by overriding `methods `_. @@ -239,8 +239,8 @@ Every NeMo example YAML has the same underlying configuration structure: - exp_manager - model -Model configuration always contain ``train_ds``, ``validation_ds``, ``test_ds``, and ``optim``. Model architectures vary across -domains, therefore, refer to the ASR, NLP, and TTS Collections documentation for more detailed information on Model architecture configuration. +The model configuration always contains ``train_ds``, ``validation_ds``, ``test_ds``, and ``optim``. Model architectures, however, can vary across domains. +Refer to the documentation of specific collections (LLM, ASR etc.) for detailed information on model architecture configuration. A NeMo configuration file should look similar to the following: @@ -288,15 +288,11 @@ A NeMo configuration file should look similar to the following: decoder: ... 
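+As an illustrative sketch (the file path below is hypothetical and the exact keys depend on the model),
+such a configuration can be loaded and inspected programmatically with OmegaConf, the configuration
+library that Hydra builds on:
+
+.. code-block:: python
+
+    from omegaconf import OmegaConf
+
+    # Load a NeMo-style YAML configuration (hypothetical path).
+    cfg = OmegaConf.load("examples/asr/conf/config.yaml")
+
+    # The top-level sections mirror the structure shown above.
+    print(OmegaConf.to_yaml(cfg.model.optim))
+
+    # Values can be overridden programmatically before they are passed to a model or trainer.
+    cfg.model.train_ds.batch_size = 16
+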
-More specific details about configuration files for each collection can be found on the following pages: - -:ref:`NeMo ASR Configuration Files` - CLI ~~~ With NeMo and Hydra, every aspect of model training can be modified from the command-line. This is extremely helpful for running lots -of experiments on compute clusters or for quickly testing parameters while developing. +of experiments on compute clusters or for quickly testing parameters during development. All NeMo `examples `_ come with instructions on how to run the training/inference script from the command-line (see `here `__ @@ -374,7 +370,7 @@ be instantiated and modified like any Python `Dataclass YAML > Dataclass +.. note:: Configuration with Hydra always has the following precedence CLI > YAML > Dataclass. .. _optimization-label: @@ -382,7 +378,7 @@ Optimization ------------ Optimizers and learning rate schedules are configurable across all NeMo models and have their own namespace. Here is a sample YAML -configuration for a Novograd optimizer with Cosine Annealing learning rate schedule. +configuration for a Novograd optimizer with a Cosine Annealing learning rate schedule. .. code-block:: yaml @@ -408,7 +404,7 @@ configuration for a Novograd optimizer with Cosine Annealing learning rate sched warmup_ratio: null min_lr: 1e-9: -.. note:: `NeMo Examples `_ has optimizer and scheduler configurations for every NeMo model. +.. note:: `NeMo Examples `_ has optimizer and scheduler configurations for every NeMo model. Optimizers can be configured from the CLI as well: @@ -596,7 +592,7 @@ as shown below we can update this config prior to restoring the model. Register Artifacts ------------------ -Conversational AI models can be complicated to restore as more information is needed than just the checkpoint weights in order to use the model. +Restoring conversational AI models can be complicated because it requires more than just the checkpoint weights; additional information is also needed to use the model. NeMo models can save additional artifacts in the .nemo file by calling ``.register_artifact``. When restoring NeMo models using ``.restore_from`` or ``.from_pretrained``, any artifacts that were registered will be available automatically. @@ -643,7 +639,7 @@ Push to Hugging Face Hub NeMo models can be pushed to the `Hugging Face Hub `_ with the :meth:`~nemo.core.classes.mixins.hf_io_mixin.HuggingFaceFileIO.push_to_hf_hub` method. This method performs the same actions as ``save_to()`` and then uploads the model to the HuggingFace Hub. It offers an additional ``pack_nemo_file`` argument that allows the user to upload the entire NeMo file or just the ``.nemo`` file. This is useful for large language models that have a massive number of parameters, and a single NeMo file could exceed the max upload size of Hugging Face Hub. -Upload a model to the hub +Upload a model to the Hub ~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -688,15 +684,15 @@ Use a Custom Model Card Template for the Hub Nested NeMo Models ------------------ -In some cases, it may be helpful to use NeMo models inside other NeMo models. For example, we can incorporate language models into ASR models to use in a decoding process to improve accuracy or use hybrid ASR-TTS models to generate audio from the text on the fly to train or finetune the ASR model. +In some cases, it may be helpful to use NeMo models inside other NeMo models. 
For example, we can incorporate language models into ASR models to use in a decoding process to improve accuracy or use hybrid ASR-TTS models to generate audio from the text on the fly to train or fine-tune the ASR model. -There are 3 ways to instantiate child models inside parent models: +There are three ways to instantiate child models inside parent models: - use subconfig directly - use the ``.nemo`` checkpoint path to load the child model - use a pretrained NeMo model -To register a child model, use the ``register_nemo_submodule`` method of the parent model. This method will add the child model to a provided model attribute and, in the serialization process, will handle child artifacts correctly and store the child model config in the parent model config in ``config_field``. +To register a child model, use the ``register_nemo_submodule`` method of the parent model. This method will add the child model to a specified model attribute. During serialization, it will correctly handle child artifacts and store the child model’s configuration in the parent model’s ``config_field``. .. code-block:: python @@ -746,30 +742,38 @@ To register a child model, use the ``register_nemo_submodule`` method of the par Profiling --------- -NeMo offers users two options for profiling: Nsys & CUDA memory profiling. These two options allow users +NeMo offers users two options for profiling: Nsys and CUDA memory profiling. These two options allow users to debug performance issues as well as memory issues such as memory leaks. To enable Nsys profiling, add the following options to the model config: -nsys_profile: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes -Finally, the model training script with: +.. code-block:: yaml + + nsys_profile: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + +Finally, run the model training script with: + +.. code-block:: bash + + nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... -nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... See more options at `nsight user guide `_. To enable CUDA memory profiling, add the following options to the model config: -memory_profile: - enabled: True - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - rank: 0 # Global rank ID to profile - output_path: None # Path to store the profile output file +.. code-block:: yaml + + memory_profile: + enabled: True + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + rank: 0 # Global rank ID to profile + output_path: None # Path to store the profile output file -And invoke your NeMo script without any changes in the invocation command. +Then invoke your NeMo script without any changes in the invocation command. 
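+As a minimal sketch (the script path and the placement of the profiling keys under the model config
+are assumptions and may differ for your setup), the same profiling options can also be toggled from
+the command line through Hydra overrides:
+
+.. code-block:: bash
+
+    # Hypothetical example: enable CUDA memory profiling on global rank 0 for a single training step.
+    python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
+        model.memory_profile.enabled=True \
+        model.memory_profile.start_step=10 \
+        model.memory_profile.end_step=10 \
+        model.memory_profile.rank=0 \
+        model.memory_profile.output_path=/results/mem_profile
+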
diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst index ce5f7a9cb087..50ff94bfcb80 100644 --- a/docs/source/core/exp_manager.rst +++ b/docs/source/core/exp_manager.rst @@ -4,16 +4,16 @@ Experiment Manager ================== -NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, Weights and Biases, DLLogger and MLFlow logging. The +The NeMo Framework Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, Weights and Biases, DLLogger and MLFlow logging. The Experiment Manager is included by default in all NeMo example scripts. -To use the experiment manager simply call :class:`~nemo.utils.exp_manager.exp_manager` and pass in the PyTorch Lightning ``Trainer``. +To use the Experiment Manager, call :class:`~nemo.utils.exp_manager.exp_manager` and pass in the PyTorch Lightning ``Trainer``. .. code-block:: python exp_dir = exp_manager(trainer, cfg.get("exp_manager", None)) -And is configurable via YAML with Hydra. +The Experiment Manager is configurable using YAML with Hydra. .. code-block:: bash @@ -23,7 +23,7 @@ And is configurable via YAML with Hydra. create_tensorboard_logger: True create_checkpoint_callback: True -Optionally, launch TensorBoard to view the training results in ``./nemo_experiments`` (by default). +Optionally, launch TensorBoard to view the training results in ``exp_dir``, which by default is set to ``./nemo_experiments``. .. code-block:: bash @@ -33,7 +33,7 @@ Optionally, launch TensorBoard to view the training results in ``./nemo_experime If ``create_checkpoint_callback`` is set to ``True``, then NeMo automatically creates checkpoints during training using PyTorch Lightning's `ModelCheckpoint `_. -We can configure the ``ModelCheckpoint`` via YAML or CLI. +We can configure the ``ModelCheckpoint`` via YAML or CLI: .. code-block:: yaml @@ -51,9 +51,8 @@ We can configure the ``ModelCheckpoint`` via YAML or CLI. Resume Training --------------- -We can auto-resume training as well by configuring the ``exp_manager``. Being able to auto-resume is important when doing long training -runs that are premptible or may be shut down before the training procedure has completed. To auto-resume training, set the following -via YAML or CLI: +To auto-resume training, configure the ``exp_manager``. This feature is important for long training runs that might be interrupted or +shut down before the procedure has completed. To auto-resume training, set the following parameters via YAML or CLI: .. code-block:: yaml @@ -73,7 +72,7 @@ via YAML or CLI: Experiment Loggers ------------------ -Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow, DLLogger, ClearML and NeptuneLogger. To use these loggers, simply set the following +Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow, DLLogger, ClearML and NeptuneLogger. To use these loggers, set the following via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`. @@ -179,7 +178,7 @@ Exponential Moving Average .. _exp_manager_ema-label: NeMo supports using exponential moving average (EMA) for model parameters. This can be useful for improving model generalization -and stability. To use EMA, simply set the following via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`. +and stability. To use EMA, set the following parameters via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`. .. code-block:: yaml @@ -193,186 +192,149 @@ and stability. 
To use EMA, simply set the following via YAML or :class:`~nemo.ut every_n_steps: 1 # How often to update EMA weights validate_original_weights: False # Whether to use original weights for validation calculation or EMA weights -Support for Preemption ----------------------- +.. Support for Preemption + ---------------------- -.. _exp_manager_preemption_support-label: + .. _exp_manager_preemption_support-label: -NeMo adds support for a callback upon preemption while running the models on clusters. The callback takes care of saving the current state of training via the ``.ckpt`` -file followed by a graceful exit from the run. The checkpoint saved upon preemption has the ``*last.ckpt`` suffix and replaces the previously saved last checkpoints. -This feature is useful to increase utilization on clusters. -The ``PreemptionCallback`` is enabled by default. To disable it simply add ``create_preemption_callback: False`` under exp_manager in the config YAML file. + NeMo adds support for a callback upon preemption while running the models on clusters. The callback takes care of saving the current state of training via the ``.ckpt`` + file followed by a graceful exit from the run. The checkpoint saved upon preemption has the ``*last.ckpt`` suffix and replaces the previously saved last checkpoints. + This feature is useful to increase utilization on clusters. + The ``PreemptionCallback`` is enabled by default. To disable it, add ``create_preemption_callback: False`` under exp_manager in the config YAML file. -Stragglers Detection ----------------------- + Stragglers Detection + ---------------------- -.. _exp_manager_straggler_det_support-label: + .. _exp_manager_straggler_det_support-label: -.. note:: - Stragglers Detection feature is included in the optional NeMo resiliency package. + .. note:: + Stragglers Detection feature is included in the optional NeMo resiliency package. -Distributed training can be affected by stragglers, which are slow workers that slow down the overall training process. -NeMo provides a straggler detection feature that can identify slower GPUs. + Distributed training can be affected by stragglers, which are workers that slow down the overall training process. + NeMo provides a straggler detection feature that can identify slower GPUs. -This feature is implemented in the ``StragglerDetectionCallback``, which is disabled by default. + This feature is implemented in the ``StragglerDetectionCallback``, which is disabled by default. -The callback computes normalized GPU performance scores, which are scalar values ranging from 0.0 (worst) to 1.0 (best). -A performance score can be interpreted as the ratio of current performance to reference performance. + The callback computes normalized GPU performance scores, which are scalar values ranging from 0.0 (worst) to 1.0 (best). + A performance score can be interpreted as the ratio of current performance to reference performance. -There are two types of performance scores provided by the callback: - - Relative GPU performance score: The best-performing GPU in the workload is used as a reference. - - Individual GPU performance score: The best historical performance of the GPU is used as a reference. + There are two types of performance scores provided by the callback: + * Relative GPU performance score: The best-performing GPU in the workload is used as a reference. + * Individual GPU performance score: The best historical performance of the GPU is used as a reference. 
-Examples: - - If the relative performance score is 0.5, it means that a GPU is twice slower than the fastest GPU. - - If the individual performance score is 0.5, it means that a GPU is twice slower than its best observed performance. + Examples: + * If the relative performance score is 0.5, it means that a GPU is twice slower than the fastest GPU. + * If the individual performance score is 0.5, it means that a GPU is twice slower than its best observed performance. -If a GPU performance score drops below the specified threshold, it is identified as a straggler. + If a GPU performance score drops below the specified threshold, it is identified as a straggler. -To enable straggler detection, add ``create_straggler_detection_callback: True`` under exp_manager in the config YAML file. -You might also want to adjust the callback parameters: + To enable straggler detection, add ``create_straggler_detection_callback: True`` under exp_manager in the config YAML file. + You might also want to adjust the callback parameters: -.. code-block:: yaml - - exp_manager: - ... - create_straggler_detection_callback: True - straggler_detection_callback_params: - report_time_interval: 300 # Interval [seconds] of the straggler check - calc_relative_gpu_perf: True # Calculate relative GPU performance - calc_individual_gpu_perf: True # Calculate individual GPU performance - num_gpu_perf_scores_to_log: 5 # Log 5 best and 5 worst GPU performance scores, even if no stragglers are detected - gpu_relative_perf_threshold: 0.7 # Threshold for relative GPU performance scores - gpu_individual_perf_threshold: 0.7 # Threshold for individual GPU performance scores - stop_if_detected: True # Terminate the workload if stragglers are detected - -Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes). + .. code-block:: yaml -.. _exp_manager_straggler_det_support-label: + exp_manager: + ... + create_straggler_detection_callback: True + straggler_detection_callback_params: + report_time_interval: 300 # Interval [seconds] of the straggler check + calc_relative_gpu_perf: True # Calculate relative GPU performance + calc_individual_gpu_perf: True # Calculate individual GPU performance + num_gpu_perf_scores_to_log: 5 # Log 5 best and 5 worst GPU performance scores, even if no stragglers are detected + gpu_relative_perf_threshold: 0.7 # Threshold for relative GPU performance scores + gpu_individual_perf_threshold: 0.7 # Threshold for individual GPU performance scores + stop_if_detected: True # Terminate the workload if stragglers are detected -.. note:: - Stragglers Detection feature is included in the optional NeMo resiliency package. + Straggler detection may require inter-rank synchronization and should be performed at regular intervals, such as every few minutes. -Distributed training can be affected by stragglers, which are slow workers that slow down the overall training process. -NeMo provides a straggler detection feature that can identify slower GPUs. -This feature is implemented in the ``StragglerDetectionCallback``, which is disabled by default. +.. Fault Tolerance + --------------- -The callback computes normalized GPU performance scores, which are scalar values ranging from 0.0 (worst) to 1.0 (best). -A performance score can be interpreted as the ratio of current performance to reference performance. + .. 
_exp_manager_fault_tolerance_support-label: -There are two types of performance scores provided by the callback: - - Relative GPU performance score: The best-performing GPU in the workload is used as a reference. - - Individual GPU performance score: The best historical performance of the GPU is used as a reference. + .. note:: + Fault Tolerance feature is included in the optional NeMo resiliency package. -Examples: - - If the relative performance score is 0.5, it means that a GPU is twice slower than the fastest GPU. - - If the individual performance score is 0.5, it means that a GPU is twice slower than its best observed performance. + When training Deep Neural Network (DNN models), faults may occur, hindering the progress of the entire training process. + This is particularly common in distributed, multi-node training scenarios, with many nodes and GPUs involved. -If a GPU performance score drops below the specified threshold, it is identified as a straggler. + NeMo incorporates a fault tolerance mechanism to detect training halts. + In response, it can terminate a hung workload and, if requested, restart it from the last checkpoint. -To enable straggler detection, add ``create_straggler_detection_callback: True`` under exp_manager in the config YAML file. -You might also want to adjust the callback parameters: + Fault tolerance ("FT") relies on a special launcher (``ft_launcher``), which is a modified ``torchrun``. + The FT launcher runs background processes called rank monitors. **You need to use ft_launcher to start + your workload if you are using FT**. I.e., `NeMo-Framework-Launcher `_ + can be used to generate SLURM batch scripts with FT support. -.. code-block:: yaml + Each training process (rank) sends `heartbeats` to its monitor during training and validation steps. + If a rank monitor stops receiving `heartbeats`, a training failure is detected. - exp_manager: - ... - create_straggler_detection_callback: True - straggler_detection_callback_params: - report_time_interval: 300 # Interval [seconds] of the straggler check - calc_relative_gpu_perf: True # Calculate relative GPU performance - calc_individual_gpu_perf: True # Calculate individual GPU performance - num_gpu_perf_scores_to_log: 5 # Log 5 best and 5 worst GPU performance scores, even if no stragglers are detected - gpu_relative_perf_threshold: 0.7 # Threshold for relative GPU performance scores - gpu_individual_perf_threshold: 0.7 # Threshold for individual GPU performance scores - stop_if_detected: True # Terminate the workload if stragglers are detected - -Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes). - -Fault Tolerance ---------------- + Fault detection is implemented in the ``FaultToleranceCallback`` and is disabled by default. + To enable it, add a ``create_fault_tolerance_callback: True`` option under ``exp_manager`` in the + config YAML file. Additionally, you can customize FT parameters by adding ``fault_tolerance`` section: -.. _exp_manager_fault_tolerance_support-label: + .. code-block:: yaml -.. note:: - Fault Tolerance feature is included in the optional NeMo resiliency package. + exp_manager: + ... 
+ create_fault_tolerance_callback: True + fault_tolerance: + initial_rank_heartbeat_timeout: 600 # wait for 10 minutes for the initial heartbeat + rank_heartbeat_timeout: 300 # wait for 5 minutes for subsequent heartbeats + calculate_timeouts: True # estimate more accurate timeouts based on observed intervals -When training DNN models, faults may occur, hindering the progress of the entire training process. -This is particularly common in distributed, multi-node training scenarios, with many nodes and GPUs involved. + Timeouts for fault detection need to be adjusted for a given workload: + * ``initial_rank_heartbeat_timeout`` should be long enough to allow for workload initialization. + * ``rank_heartbeat_timeout`` should be at least as long as the longest possible interval between steps. -NeMo incorporates a fault tolerance mechanism to detect training halts. -In response, it can terminate a hung workload and, if requested, restart it from the last checkpoint. + **Importantly, `heartbeats` are not sent during checkpoint loading and saving**, so time for + checkpointing related operations should be taken into account. -Fault tolerance ("FT") relies on a special launcher (``ft_launcher``), which is a modified ``torchrun``. -The FT launcher runs background processes called rank monitors. **You need to use ft_launcher to start -your workload if you are using FT**. I.e., `NeMo-Framework-Launcher `_ -can be used to generate SLURM batch scripts with FT support. + If ``calculate_timeouts: True``, timeouts will be automatically estimated based on observed intervals. + Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated + at the end of a training run when checkpoint loading and saving were observed.** Hence, in a multi-part + training started from scratch, estimated timeouts won't be available during the initial two runs. + Estimated timeouts are stored in a separate JSON file. -Each training process (rank) sends `heartbeats` to its monitor during training and validation steps. -If a rank monitor stops receiving `heartbeats`, a training failure is detected. + ``max_subsequent_job_failures`` allows for the automatic continuation of training on a SLURM cluster. + This feature requires SLURM job to be scheduled with ``NeMo-Framework-Launcher``. If ``max_subsequent_job_failures`` + value is `>0` continuation job is prescheduled. It will continue the work until ``max_subsequent_job_failures`` + subsequent jobs failed (SLURM job exit code is `!= 0`) or the training is completed successfully + ("end of training" marker file is produced by the ``FaultToleranceCallback``, i.e. due to iters or time limit reached). -Fault detection is implemented in the ``FaultToleranceCallback`` and is disabled by default. -To enable it, add a ``create_fault_tolerance_callback: True`` option under ``exp_manager`` in the -config YAML file. Additionally, you can customize FT parameters by adding ``fault_tolerance`` section: - -.. code-block:: yaml - - exp_manager: - ... - create_fault_tolerance_callback: True - fault_tolerance: - initial_rank_heartbeat_timeout: 600 # wait for 10 minutes for the initial heartbeat - rank_heartbeat_timeout: 300 # wait for 5 minutes for subsequent heartbeats - calculate_timeouts: True # estimate more accurate timeouts based on observed intervals - -Timeouts for fault detection need to be adjusted for a given workload: - * ``initial_rank_heartbeat_timeout`` should be long enough to allow for workload initialization. 
- * ``rank_heartbeat_timeout`` should be at least as long as the longest possible interval between steps. - -**Importantly, `heartbeats` are not sent during checkpoint loading and saving**, so time for -checkpointing related operations should be taken into account. - -If ``calculate_timeouts: True`` timeouts will be automatically estimated based on observed intervals. -Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated after -checkpoint loading and saving was observed**. For example, in multi-part training started from scratch, -estimated timeouts won't be available during the first run. Estimated timeouts are stored in the checkpoint. - -``max_subsequent_job_failures`` allows for the automatic continuation of training on a SLURM cluster. -This feature requires SLURM job to be scheduled with ``NeMo-Framework-Launcher``. If ``max_subsequent_job_failures`` -value is `>0` continuation job is prescheduled. It will continue the work until ``max_subsequent_job_failures`` -subsequent jobs failed (SLURM job exit code is `!= 0`) or the training is completed successfully -("end of training" marker file is produced by the ``FaultToleranceCallback``, i.e. due to iters or time limit reached). - -All FT configuration items summary: - * ``workload_check_interval`` (float, default=5.0) Periodic workload check interval [seconds] in the workload monitor. - * ``initial_rank_heartbeat_timeout`` (Optional[float], default=60.0 * 60.0) Timeout for the first heartbeat from a rank. - * ``rank_heartbeat_timeout`` (Optional[float], default=45.0 * 60.0) Timeout for subsequent heartbeats from a rank. - * ``calculate_timeouts`` (bool, default=True) Try to calculate ``rank_heartbeat_timeout`` and ``initial_rank_heartbeat_timeout`` - based on the observed heartbeat intervals. - * ``rank_termination_signal`` (signal.Signals, default=signal.SIGKILL) Signal used to terminate the rank when failure is detected. - * ``log_level`` (str, default='INFO') Log level for the FT client and server(rank monitor). - * ``max_rank_restarts`` (int, default=0) Used by FT launcher. Max number of restarts for a rank. - If ``>0`` ranks will be restarted on existing nodes in case of a failure. - * ``max_subsequent_job_failures`` (int, default=0) Used by FT launcher. How many subsequent job failures are allowed until stopping autoresuming. - ``0`` means do not autoresume. - * ``additional_ft_launcher_args`` (str, default='') Additional FT launcher params (for advanced use). + All FT configuration items summary: + * ``workload_check_interval`` (float, default=5.0) Periodic workload check interval [seconds] in the workload monitor. + * ``initial_rank_heartbeat_timeout`` (Optional[float], default=60.0 * 60.0) Timeout [seconds] for the first heartbeat from a rank. + * ``rank_heartbeat_timeout`` (Optional[float], default=45.0 * 60.0) Timeout [seconds] for subsequent heartbeats from a rank. + * ``calculate_timeouts`` (bool, default=True) Try to calculate ``rank_heartbeat_timeout`` and ``initial_rank_heartbeat_timeout`` + based on the observed heartbeat intervals. + * ``safety_factor``: (float, default=5.0) When calculating the timeouts, multiply the maximum observed heartbeat interval + by this factor to obtain the timeout estimate. Can be made smaller for stable environments and larger for unstable ones. + * ``rank_termination_signal`` (signal.Signals, default=signal.SIGKILL) Signal used to terminate the rank when failure is detected. 
+ * ``log_level`` (str, default='INFO') Log level for the FT client and server(rank monitor). + * ``max_rank_restarts`` (int, default=0) Used by FT launcher. Max number of restarts for a rank. + If ``>0`` ranks will be restarted on existing nodes in case of a failure. + * ``max_subsequent_job_failures`` (int, default=0) Used by FT launcher. How many subsequent job failures are allowed until stopping autoresuming. + ``0`` means do not auto-resume. + * ``additional_ft_launcher_args`` (str, default='') Additional FT launcher params (for advanced use). .. _nemo_multirun-label: + Hydra Multi-Run with NeMo ------------------------- -When training neural networks, it is common to perform hyper parameter search in order to improve the performance of a model -on some validation data. However, it can be tedious to manually prepare a grid of experiments and management of all checkpoints -and their metrics. In order to simplify such tasks, NeMo integrates with `Hydra Multi-Run support `_ in order to provide a unified way to run a set of experiments all -from the config. +When training neural networks, it is common to perform a hyperparameter search to improve the model’s performance on validation data. +However, manually preparing a grid of experiments and managing all checkpoints and their metrics can be tedious. +To simplify these tasks, NeMo integrates with `Hydra Multi-Run support `_, +providing a unified way to run a set of experiments directly from the configuration. There are certain limitations to this framework, which we list below: * All experiments are assumed to be run on a single GPU, and multi GPU for single run (model parallel models are not supported as of now). -* NeMo Multi-Run supports only grid search over a set of hyper-parameters, but we will eventually add support for advanced hyper parameter search strategies. -* **NeMo Multi-Run only supports running on one or more GPUs** and will not work if no GPU devices are present. +* NeMo Multi-Run currently supports only grid search over a set of hyperparameters. Support for advanced hyperparameter search strategies will be added in the future. +* **NeMo Multi-Run requires one or more GPUs** to function and will not work without GPU devices. Config Setup ~~~~~~~~~~~~ @@ -443,10 +405,10 @@ name as shown below - resume_ignore_no_checkpoint: true -Running a Multi-Run config -~~~~~~~~~~~~~~~~~~~~~~~~~~ +Run a NeMo Multi-Run Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once the config has been updated, we can now run it just like any normal Hydra script -- with one special flag (``-m``) ! +Once the config has been updated, we can now run it just like any normal Hydra script, with one special flag (``-m``). .. code-block:: bash @@ -455,21 +417,24 @@ Once the config has been updated, we can now run it just like any normal Hydra s ... Tips and Tricks -~~~~~~~~~~~~~~~ +--------------- -* Preserving disk space for large number of experiments +This section provides recommendations for using the Experiment Manager. -Some models may have a large number of parameters, and it may be very expensive to save a large number of checkpoints on -physical storage drives. For example, if you use Adam optimizer, each PyTorch Lightning ".ckpt" file will actually be 3x the -size of just the model parameters - per ckpt file ! This can be exhorbitant if you have multiple runs. 
+Preserving disk space for a large number of experiments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In the above config, we explicitly set ``save_top_k: 1`` and ``always_save_nemo: True`` - what this does is limit the number of -ckpt files to just 1, and also save a NeMo file (which will contain just the model parameters without optimizer state) and -can be restored immediately for further work. +Some models may have a large number of parameters, making it very expensive to save numerous checkpoints on physical storage drives. +For example, if you use the Adam optimizer, each PyTorch Lightning ".ckpt" file will be three times the size of just the model +parameters. This can become exorbitant if you have multiple runs. -We can further reduce the storage space by utilizing some utility functions of NeMo to automatically delete either -ckpt or NeMo files after a training run has finished. This is sufficient in case you are collecting results in some experiment -tracking tool and can simply rerun the best config after the search is finished. +In the above configuration, we explicitly set ``save_top_k: 1`` and ``always_save_nemo: True``. This limits the number of ".ckpt" +files to just one and also saves a NeMo file, which contains only the model parameters without the optimizer state. +This NeMo file can be restored immediately for further work. + +We can further save storage space by using NeMo's utility functions to automatically delete either ".ckpt" or NeMo files +after a training run has finished. This is sufficient if you are collecting results in an experiment tracking tool and can +simply rerun the best configuration after the search is completed. .. code-block:: python @@ -490,24 +455,26 @@ tracking tool and can simply rerun the best config after the search is finished. clean_exp_ckpt(exp_log_dir, remove_ckpt=True, remove_nemo=False) -* Debugging Multi-Run Scripts +Debugging Multi-Run Scripts +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When running Hydra scripts, you may encounter configuration issues that crash the program. In NeMo Multi-Run, a crash in +any single run will not crash the entire program. Instead, we will note the error and proceed to the next job. Once all +jobs are completed, we will raise the errors in the order they occurred, crashing the program with the first error’s stack trace. -When running hydra scripts, you may sometimes face config issues which crash the program. In NeMo Multi-Run, a crash in -any one run will **not** crash the entire program, we will simply take note of it and move onto the next job. Once all -jobs are completed, we then raise the error in the order that it occurred (it will crash the program with the first error's -stack trace). -In order to debug Muti-Run, we suggest to comment out the full hyper parameter config set inside ``sweep.params`` -and instead run just a single experiment with the config - which would immediately raise the error. +To debug NeMo Multi-Run, we recommend commenting out the entire hyperparameter configuration set inside ``sweep.params``. +Instead, run a single experiment with the configuration, which will immediately raise the error. -* Experiment name cannot be parsed by Hydra +Experiment name cannot be parsed by Hydra +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Sometimes our hyper parameters include PyTorch Lightning ``trainer`` arguments - such as number of steps, number of epochs -whether to use gradient accumulation or not etc. 
When we attempt to add these as keys to the expriment manager's ``name``, +Sometimes our hyperparameters include PyTorch Lightning ``trainer`` arguments, such as the number of steps, number of epochs, +and whether to use gradient accumulation. When we attempt to add these as keys to the experiment manager's ``name``, Hydra may complain that ``trainer.xyz`` cannot be resolved. -A simple solution is to finalize the hydra config before you call ``exp_manager()`` as follows - +A simple solution is to finalize the Hydra config before you call ``exp_manager()`` as follows: .. code-block:: python @@ -532,4 +499,4 @@ ExpManagerConfig :show-inheritance: :members: :member-order: bysource - :no-index: + :noindex: diff --git a/docs/source/core/neural_types.rst b/docs/source/core/neural_types.rst index ec7d94336c05..989cc8d998f4 100644 --- a/docs/source/core/neural_types.rst +++ b/docs/source/core/neural_types.rst @@ -24,7 +24,7 @@ Types are implemented in ``nemo.core.neural_types.NeuralType`` class. When you i are expected to include both *axes* information and *element type* information. .. autoclass:: nemo.core.neural_types.NeuralType - :no-index: + :noindex: Type Comparison Results ----------------------- @@ -32,7 +32,7 @@ Type Comparison Results When comparing two neural types, the following comparison results are generated. .. autoclass:: nemo.core.neural_types.NeuralTypeComparisonResult - :no-index: + :noindex: Examples -------- @@ -115,7 +115,7 @@ Custom element types It is possible to create user-defined element types to express the semantics of elements in your tensors. To do so, the user will need to inherit and implement abstract methods of the ``nemo.core.neural_types.elements.ElementType`` class .. autoclass:: nemo.core.neural_types.elements.ElementType - :no-index: + :noindex: Note that element types can be parametrized. Consider this example where it distinguishes between audio sampled at 8Khz and 16Khz. diff --git a/docs/source/features/mixed_precision.rst b/docs/source/features/mixed_precision.rst index 7e1e8c2f05fc..b1ec196c567e 100644 --- a/docs/source/features/mixed_precision.rst +++ b/docs/source/features/mixed_precision.rst @@ -3,15 +3,15 @@ Mixed Precision Training ------------------------ -Mixed precision training significantly enhances computational efficiency by conducting operations in low-precision format, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo now supports FP16, BF16, and FP8 (via Transformer Engine) across most models. Further details will be provided shortly. +Mixed precision training significantly enhances computational efficiency by conducting operations in low-precision format, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo Framework now supports FP16, BF16, and FP8 via Transformer Engine (TE) across most models. Half-precision Training ======================= -NeMo supports half-precision (FP16 and BF16) computation training via Megatron Core and the distributed optimizer. +NeMo Framework supports half-precision FP16 and BF16 computation training via Megatron Core and the distributed optimizer. This training recipe uses half-precision in all layer computation keeping the model states (optimizer states and master parameters) in single-precision. 
-To avoid repeated data type casting at each layer computation, Megatron Core keeps a separate copy of half-precision parameters that is updated after each optimizer.step. +To avoid repeated data type casting at each layer computation, Megatron Core keeps a separate copy of half-precision parameters that is updated after each optimizer step. Half-precision training is enabled when setting ``precision`` to either of ``fp16-mixed`` or ``bf16-mixed`` along with ``megatron_amp_O2=true``. The parameter gradients are computed in the same half-precision, and the precision of gradient reduce-scatter across data-parallel GPUs can be set by ``optim.grad_sync_dtype``. @@ -19,13 +19,10 @@ The parameter gradients are computed in the same half-precision, and the precisi FP8 Training ============ -Overview -^^^^^^^^ - -NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. NeMo uses the NVIDIA `TransformerEngine `_ (TE) in order to leverage speedups from FP8. The following table summarizes the FP8 related arguments that can be configured in NeMo (`example config setting `_). For a more detailed overview, refer to the TE `documentation `_, specifically the FP8 `format `_ and `recipe `_. +NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. NeMo Framework uses the NVIDIA `TransformerEngine `_ (TE) to leverage speedups from FP8. The following table summarizes the FP8 related arguments that can be configured in NeMo (`example config setting `_). For a more detailed overview, refer to the TE `documentation `_, specifically the FP8 `format `_ and `recipe `_. .. list-table:: FP8 arguments - :widths: 25 25 50 + :widths: 10 20 :header-rows: 1 * - Argument @@ -33,7 +30,7 @@ NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point * - transformer_engine - TE and related functionality can be enabled by setting this boolean argument to True. If this argument is not set to True, all subsequent arguments will be ignored. * - fp8 - - Enables FP8 training. For transformer networks, the QKV, projection, FC1, and FC2 matrix multiplications are executed using the 4th generation H100 tensor cores with FP8 support. + - Enables FP8 training. For transformer networks, the QKV, projection, FC1, and FC2 matrix multiplications are executed using the fourth-generation NVIDIA H100 Tensor Cores with FP8 support. * - fp8_e4m3 - Training recipe format for FP8. Activations, weights, and gradient tensors use the E4M3 format. * - fp8_hybrid @@ -47,12 +44,12 @@ NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point * - reduce_amax - Indicates whether or not to perform an allreduce on the amax (absolute max) values for the FP8 tensors. Since the amax is directly used to compute the scaling factor for FP8 tensors, setting this argument ensures that the scaling factors for a tensor remain synchronized across devices in multi-GPU training configurations. * - fp8_params - - Indicates whether or not to store module level parameters in FP8. Enabling this option can lead to reduced memory consumption. It eliminates the need to store a copy of weights in higher precision (> half) for cases where these weights are externally maintained, such as master parameters in the optimizer. For more information, refer to the `fp8_model_init `_ API in TE. + - Indicates whether to store module-level parameters in FP8. 
Enabling this option can reduce memory consumption by eliminating the need to store a copy of weights in higher precision for cases where these weights are externally maintained, such as master parameters in the optimizer. For more information, refer to the `fp8_model_init `_ API in TE. Resources ^^^^^^^^^ -- `TE documentation `_ +- `Transformer Engine documentation `_ - `Intro to FP8, floating point formats, and mixed precision training `_ -- `Performance optimizations `_ that are natively supported in NeMo by enabling FP8 training with TE -- `TE installation `_ +- `Performance optimizations `_ that are natively supported in NeMo Framework by enabling FP8 training with TE +- `Transformer Engine installation `_ diff --git a/docs/source/features/moe.rst b/docs/source/features/moe.rst new file mode 100644 index 000000000000..4457043777c6 --- /dev/null +++ b/docs/source/features/moe.rst @@ -0,0 +1,75 @@ +Mixture of Experts +================== + +Overview +-------- + +NeMo Framework supports Mixture of Experts (MoE) in the feedforward block of the transformer layer. + +MoE is a machine learning technique where multiple specialized models (experts, +usually multi-layer perceptrons) are combined to solve a complex task. Each expert +focuses on a specific subtask or domain, while a gating network dynamically activates +the most appropriate expert based on the current input. + + +Use MoE +------- + +To use MoE in the NeMo Framework, adjust the ``num_moe_experts`` parameter in the model configuration: + +1. Set ``num_moe_experts`` to `8` to leverage 8 experts in the MoE module. + + .. code-block:: yaml + + num_moe_experts: 8 # Set MoE to use 8 experts + +2. Set ``moe_router_topk`` to the number of experts you want activated. For example, if you want to process each input with two experts: + + .. code-block:: yaml + + moe_router_topk: 2 # Processes each token using 2 experts. + +Configure MoE-specific Loss Functions +------------------------------------- + +In addition, NeMo provides options to configure MoE-specific loss function. +To balance token distribution across experts: + +1. Set ``moe_router_load_balancing_type`` to specify the load balancing method: + + .. code-block:: yaml + + moe_router_load_balancing_type: aux_loss # to use the auxilary loss, other options include "sinkhorn". + +2. Set ``moe_aux_loss_coeff`` to specify the weight of the auxilary loss. The auxiliary loss is added to encourage distributing tokens equally among all experts. Values in the 1e-2 range are a good start, as follows: + + .. code-block:: yaml + + moe_aux_loss_coeff: 1e-2 # set the aux-loss weight to 1e-2 + +3. Set ``moe_z_loss_coeff`` to specify the weight of the z-loss. A starting value of 1e-3 is recommended, as follows: + + .. code-block:: yaml + + moe_z_loss_coeff: 1e-3 + +Other options include: + +1. ``moe_input_jitter_eps`` adds noise to the input tensor by applying jitter with a specified epsilon value. + +2. ``moe_token_dropping`` enables selectively dropping and padding tokens for each expert to achieve + a specified capacity, similar to GShard, Switch-Transformer, and DeepSpeed-MoE. Briefly, if the number + of tokens routed to an expert exceeds its capacity, then the exceeding tokens are dropped. Note that this is + currently unsupported so should remain False. + +3. ``moe_token_dispatcher_type`` specifies the token dispatcher type, options include 'allgather' and 'alltoall'. + +4. ``moe_per_layer_logging`` enables per-layer logging for MoE, currently support aux-loss and z-loss. + +5. 
``moe_expert_capacity_factor`` the capacity factor determines the maximum number of tokens that can be routed to each expert in any MoE layer. None means no token will be dropped. The default is None. + +6. ``moe_pad_expert_input_to_capacity`` if True, pads the input for each expert to match the expert capacity length. It is effective only after the moe_expert_capacity_factor is set. The default setting is False. + +7. ``moe_token_drop_policy`` the policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. The default value is "probs". + +8. ``moe_layer_recompute`` if True, checkpointing moe_layer to save activation memory. The default is False. diff --git a/docs/source/features/activation_recomputation.rst b/docs/source/features/optimizations/activation_recomputation.rst similarity index 86% rename from docs/source/features/activation_recomputation.rst rename to docs/source/features/optimizations/activation_recomputation.rst index 5151c2aac48f..67de4401a4bc 100644 --- a/docs/source/features/activation_recomputation.rst +++ b/docs/source/features/optimizations/activation_recomputation.rst @@ -35,4 +35,18 @@ This is because the input sizes of softmax, dropout, and qkv dot-product attenti However, their recomputation cost is relatively smaller than the other linear projection layers that are linear with the hidden size square. Self-attention recomputation is hard-enabled when using FlashAttention, which is supported in Transformer Engine. -Also, a user can use the self-attention recomputation without FlashAttention by setting ``activations_checkpoint_granularity=selective``. \ No newline at end of file +Also, a user can use the self-attention recomputation without FlashAttention by setting ``activations_checkpoint_granularity=selective``. + +Scheme of full and selective checkpointing granularity: + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-activation-recomputation-exampe-2.jpg + :align: center + :alt: activation-recomputation-example-2 + :scale: 50% + +Scheme of uniform and block checkpointing method (full checkpointing granularity): + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/asset-post-activation-recomputation-exampe-1.jpg + :align: center + :alt: activation-recomputation-example-1 + :scale: 50% \ No newline at end of file diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/optimizations/attention_optimizations.rst similarity index 51% rename from docs/source/features/memory_optimizations.rst rename to docs/source/features/optimizations/attention_optimizations.rst index bedc8e775f09..d5ffe3c6fae8 100644 --- a/docs/source/features/memory_optimizations.rst +++ b/docs/source/features/optimizations/attention_optimizations.rst @@ -1,83 +1,5 @@ -Memory Optimizations -==================== - -Parallelism ------------ -Refer to :doc:`Parallelism <./parallelisms>`. - - -Mixture of Experts ------------------- - -Overview -^^^^^^^^ - -NeMo supports Mixture of Experts (MoE) in the transformer layer for NLP models. - -MoE is a machine learning technique where multiple specialized models (experts, -usually multi-layer perceptrons) are combined to solve a complex task. Each expert -focuses on a specific subtask or domain, while a gating network dynamically activates -the most appropriate expert based on the current input. 
- - -To use MoE in the NeMo Framework, adjust the ``num_moe_experts`` parameter in the model configuration: - -1. Set ``num_moe_experts`` to `8` to leverage 8 experts in the MoE module. - - .. code-block:: yaml - - num_moe_experts: 8 # Set MoE to use 8 experts - -2. Set ``moe_router_topk`` to the number of experts you want activated. For example, if you want to process each input with two experts: - - .. code-block:: yaml - - moe_router_topk: 2 # Processes each token using 2 experts. - -In addition, NeMo provides options to configure MoE-specific loss function. -To balance token distribution across experts: - -1. Set ``moe_router_load_balancing_type`` to specify the load balancing method: - - .. code-block:: yaml - - moe_router_load_balancing_type: aux_loss # to use the auxilary loss, other options include "sinkhorn". - -2. Set ``moe_aux_loss_coeff`` to specify the weight of the auxilary loss. Values in the 1e-2 range are a good start, as follows: - - .. code-block:: yaml - - moe_aux_loss_coeff: 1e-2 # set the aux-loss weight to 1e-2 - -3. Set ``moe_z_loss_coeff`` to specify the weight of the z-loss. A starting value of 1e-3 is recommended, as follows: - - .. code-block:: yaml - - moe_z_loss_coeff: 1e-3 - -Other options include: - -1. ``moe_input_jitter_eps`` adds noise to the input tensor by applying jitter with a specified epsilon value. - -2. ``moe_token_dropping`` enables selectively dropping and padding tokens for each expert to achieve - a specified capacity. - -3. ``moe_token_dropping`` specifies the token dispatcher type, options include 'allgather' and 'alltoall'. - -4. ``moe_per_layer_logging`` enables per-layer logging for MoE, currently support aux-loss and z-loss. - -5. ``moe_expert_capacity_factor`` the capacity factor for each expert, None means no token will be dropped. The default is None. - -6. ``moe_pad_expert_input_to_capacity`` if True, pads the input for each expert to match the expert capacity length, effective only after the moe_expert_capacity_factor is set. The default setting is False. - -7. ``moe_token_drop_policy`` the policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. Default value is "probs". - -8. ``moe_layer_recompute`` if True, checkpointing moe_layer to save activation memory, default is False. - - - - - +Attention Optimizations +======================= Flash Attention --------------- @@ -106,26 +28,6 @@ To disable Tri Dao flash attention, set the environment variable ``NVTE_FLASH_AT For more details on the Dot Product Attention backends supported in Transformer Engine, please refer to the source code at `Transformer Engine's Attention Mechanism `_. -Activation Recomputation ------------------------- - -Overview -^^^^^^^^ - -Full Activation Recomputation -""""""""""""""""""""""""""""" -The full activation recomputation method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. - -Partial Activation Recomputation -"""""""""""""""""""""""""""""""" -The partial activation recomputation method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. 
- -Selective Activation Recomputation -"""""""""""""""""""""""""""""""""" -The selective activation recomputation method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. - -Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198. - Multi-query Attention (MQA) and Grouped-query Attention (GQA) ------------------------------------------------------------- @@ -179,24 +81,3 @@ Implement MQA or GQA NeMo's support for GQA and MQA is enabled through the integration of Megatron Core's Attention mechanism. The underlying implementation details can be explored within the Attention class of Megatron Core, which provides the functional backbone for these advanced attention methods. To understand the specific modifications and implementations of MQA and GQA, refer to the source code in the Attention class: Check implementation details from Attention Class in Megatron Core Repo: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/attention.py#L49. - - -CPU Offloading --------------- - -Overview -^^^^^^^^ - -CPU Offloading in NeMo is a feature that reduces the peak memory usage of the GPU by offloading activations and inactive weights to CPU storage. NeMo supports offloading at the transformer layer level, allowing users to specify the number of transformer layers in their language model that require CPU offloading. During the forward pass, NeMo offloads activations at the optimal time and reloads them as needed during the backward pass. - -Features -^^^^^^^^ -. Supports training models with long sequence lengths by managing activation memory efficiently. -. Enables high batch sizes per GPU by offloading activation memory. -. Overlaps computation with data transfers (Host2Device and Device2Host) during offloading and reloading. - -Usage -^^^^^ -. Set cpu_offloading to True to enable CPU offloading. -. Set cpu_offloading_num_layers to a value between 0 and the total number of layers in the model minus one. -. Set cpu_offloading_activations and cpu_offloading_weights based on your needs to offload activations only, weights only, or both. diff --git a/docs/source/features/communication_overlap.rst b/docs/source/features/optimizations/communication_overlap.rst similarity index 97% rename from docs/source/features/communication_overlap.rst rename to docs/source/features/optimizations/communication_overlap.rst index 605b2ba3d221..0ff93fe80604 100644 --- a/docs/source/features/communication_overlap.rst +++ b/docs/source/features/optimizations/communication_overlap.rst @@ -1,5 +1,5 @@ Communication Overlap -==================== +===================== Data-parallel Communication Overlap ----------------------------------- @@ -26,7 +26,8 @@ The TP communication and computation are chunked and the chunks are overlapped i In the pipelined overlap, the activation (gradient) tensor all-gather is replaced with multiple steps of input P2P ring exchanges, and reduce-scatter is replaced with multiple steps of GEMM output P2P ring exchanges followed by a reduction of the received outputs. In case of the reduce-scatter overlap, NeMo also provides the option to pipeline-overlap using chunks of reduce-scatter, which exposes one reduce-scatter chunk. -.. 
image:: ../nlp/nemo_megatron/images/tp_comm_overlap.png + +.. image:: ../../nlp/nemo_megatron/images/tp_comm_overlap.png :align: center :width: 600px :alt: Tensor-parallel communication overlap @@ -44,7 +45,7 @@ This increasing PP communication overhead and it cancels off the reduced the pip NeMo supports the overlap of the PP communications with non-dependant computations in the 1F1B stage (the body of pipelining, where 1X forward and 1X backward micro-batch executions are interleaved). The PP communications in pipeline fill and flush are still exposed. -.. image:: ../nlp/nemo_megatron/images/pp_comm_overlap.png +.. image:: ../../nlp/nemo_megatron/images/pp_comm_overlap.png :align: center :width: 600px :alt: Pipeline-parallel communication overlap in 1F1B pipelining phase diff --git a/docs/source/features/optimizations/cpu_offloading.rst b/docs/source/features/optimizations/cpu_offloading.rst new file mode 100644 index 000000000000..cf9d8951bf93 --- /dev/null +++ b/docs/source/features/optimizations/cpu_offloading.rst @@ -0,0 +1,19 @@ +CPU Offloading +============== + +Overview +-------- + +CPU Offloading in NeMo is a feature that reduces the peak memory usage of the GPU by offloading activations and inactive weights to CPU storage. NeMo supports offloading at the transformer layer level, allowing users to specify the number of transformer layers in their language model that require CPU offloading. During the forward pass, NeMo offloads activations at the optimal time and reloads them as needed during the backward pass. + +Features +-------- +- Supports training models with long sequence lengths by managing activation memory efficiently. +- Enables high batch sizes per GPU by offloading activation memory. +- Overlaps computation with data transfers (Host2Device and Device2Host) during offloading and reloading. + +Usage +----- +- Set cpu_offloading to True to enable CPU offloading. +- Set cpu_offloading_num_layers to a value between 0 and the total number of layers in the model minus one. +- Set cpu_offloading_activations and cpu_offloading_weights based on your needs to offload activations only, weights only, or both. diff --git a/docs/source/features/optimizations/index.rst b/docs/source/features/optimizations/index.rst new file mode 100644 index 000000000000..60f4428f9299 --- /dev/null +++ b/docs/source/features/optimizations/index.rst @@ -0,0 +1,12 @@ +Optimizations +============= + +.. toctree:: + :maxdepth: 1 + + ./attention_optimizations + ./sequence_packing + ./activation_recomputation + ./communication_overlap + ./cpu_offloading + diff --git a/docs/source/features/throughput_optimizations.rst b/docs/source/features/optimizations/sequence_packing.rst similarity index 96% rename from docs/source/features/throughput_optimizations.rst rename to docs/source/features/optimizations/sequence_packing.rst index dfd8b6cf9310..69e45f1e6a12 100644 --- a/docs/source/features/throughput_optimizations.rst +++ b/docs/source/features/optimizations/sequence_packing.rst @@ -1,5 +1,5 @@ -Throughput Optimizations -======================== +Sequence Packing +================ Sequence Packing for SFT/PEFT ----------------------------- @@ -140,11 +140,6 @@ please refer to the documentation below :doc:`../multimodal/mllm/sequence_packing` -Communication Overlap ---------------------- -NeMo leverages Megatron-Core's optimizations to enhance bandwidth utilization and effectively overlap computation with communication. Additional details will be provided soon. - - .. rubric:: Footnotes .. 
[#f1] Experiments were performed on Llama 7B with Dolly dataset. Actual performance improvement depends on dataset
diff --git a/docs/source/features/parallelisms.rst b/docs/source/features/parallelisms.rst
index efc77e3d7115..c14f94eac6a0 100644
--- a/docs/source/features/parallelisms.rst
+++ b/docs/source/features/parallelisms.rst
@@ -3,7 +3,7 @@
Parallelisms
============

-NeMo Megatron supports various data- and model-parallel deep learning workload deployment methods (which can be mixed together arbitrarily).
+NeMo Megatron supports various data-parallel and model-parallel deep learning workload deployment methods, which can be mixed together arbitrarily.

Data Parallelism
----------------
@@ -31,17 +31,17 @@ It shards the optimizer states and the high-precision master parameters across d
At the parameter optimizer step, each data-parallel GPU updates its shard of parameters. Since each GPU needs its own gradient shard, the distributed optimizer conducts reduce-scatter of the parameter gradients instead of all-reduce of them. Then, the updated parameter shards are all-gathered across data-parallel GPUs.
-This approach significantly reduces the memory need of large scale LLM training.
+This approach significantly reduces the memory need of large-scale LLM training.
Also, when the precision of the gradient is higher than the parameter precision, the split execution of gradient reduce-scatter and parameter all-gather can reduce the total communication volume.
This split collective execution increases the total computation to overlap with the communication, which improves the overlap opportunity.

Enable Data Parallelism
~~~~~~~~~~~~~~~~~~~~~~~

-In NeMo, DDP is the default parallel deployment method.
-This means that the total number of GPUs corresponds to the size of the DP group and training a LLM with model parallelism decreases the size of the DP group.
+In NeMo Framework, DDP is the default parallel deployment method.
+This means that the total number of GPUs corresponds to the size of the DP group, and training an LLM with model parallelism decreases the size of the DP group.

-Currently, NeMo supports optimizer distribution only for Adam optimizer.
+Currently, NeMo Framework supports optimizer distribution only for the Adam optimizer.

To enable the distributed adam optimizer, set ``model.optim.name=distributed_fused_adam`` in the model configuration.
It can be configured with the following options:
@@ -63,27 +63,28 @@ See the keyword arguments in `Apex DistributedFusedAdam `_ (default) or a custom implementation (if custom multi-precision training is enabled with ``megatron_amp_O2``).

-The distributed optimizer in NeMo is built on top of
+The distributed optimizer in NeMo Framework is built on top of
`DistributedFusedAdam `_ from Apex.

-Fully-Shared Data Parallelism (FSDP)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Fully-Sharded Data Parallelism
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-NeMo supports Fully-Sharded Data Parallelism (FSDP) that shards parameter gradients and low-precision parameters for computation on top of the model states that Distributed optimizer shards (optimizer states and high-precision parameters).
-Since FSDP shards the entire model states, it ensures linear model state memory saving with increasing DP size.
-FSDP can be preferred for the LLM training with unbalanced workload between pipeline stages (or Transformer layers) or with a large vocabulary size, where pipelining would cause huge computation bubbles due to the workload imbalance.
-Also, FSDP unloads the effort to search the performance-optimal mappings with 3D parallelism (TP/PP/DP) because it has a single parallelization domain. +NeMo Framework supports Fully-Sharded Data Parallelism (FSDP), which shards parameter gradients and low-precision parameters for computation. This is in addition to the model states that the distributed optimizer shards, including optimizer states and high-precision parameters. +Since FSDP shards the entire model states, it ensures linear model state memory savings with increasing DP size. +FSDP is preferred for LLM training with unbalanced workloads between pipeline stages (or Transformer layers) or with a large vocabulary size, where pipelining would cause significant computation bubbles due to workload imbalance. +Additionally, FSDP eliminates the need to search for performance-optimal mappings with 3D parallelism (TP/PP/DP) because it operates within a single parallelization domain. -NeMo uses `pytorch's FSDP interface `_ to shard LLM model states, which flattens the parameters of each Transformer layer and partitions across datap-parallel GPUs. -FSDP introduces collectives across data-parallel GPUs; all-gather of the parameters for computation and reduce-scatter of parameter gradients. -The parameter all-gather occurs in both network forward- and back-propagation phases. The gradient reduce-scatter happens only in the back-propagation. -These FSDP communications are overlapped with Transformer layer computations. + +NeMo Framework uses `PyTorch's FSDP interface `_ to shard LLM model states, flattening the parameters of each transformer layer and partitioning them across data-parallel GPUs. +FSDP introduces collective operations across data-parallel GPUs, including all-gather for parameter computation and reduce-scatter for parameter gradients. +The all-gather operation occurs during both the network forward and back-propagation phases, while the gradient reduce-scatter operation happens only during back-propagation. +These FSDP communications are overlapped with transformer layer computations. Setting ``fsdp=true`` enables FSDP. The mixed precision recipe can be set by ``precision`` knob, which determines both the computation and communication precisions. @@ -93,15 +94,15 @@ Also, one can use ``grad_reduce_dtype`` to override the gradient reduction preci Model Parallelism ----------------- -Model parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need of per-GPU memory. -NeMo supports various model-parallel methods, which can be mixed to maximize LLM training performance. +Model Parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need of per-GPU memory. +NeMo Framework supports various model-parallel methods, which can be mixed to maximize LLM training performance. Tensor Parallelism ^^^^^^^^^^^^^^^^^^ Tensor Parallelism (TP) is a model-parallel partitioning method that distributes the parameter tensor of an individual layer across GPUs. -On top of reducing the model state memory usage, it also saves the activation memory as per-GPU tensor sizes shrinks. -However, the reduced per-GPU tensor lowers per-GPU-kernel workload sizes that increases CPU overhead. +In addition to reducing model state memory usage, it also saves activation memory as the per-GPU tensor sizes shrink. +However, the reduced per-GPU tensor size increases CPU overhead due to smaller per-GPU kernel workloads. .. 
image:: ../nlp/nemo_megatron/images/tp.gif
   :align: center
@@ -111,9 +112,7 @@ However, the reduced per-GPU tensor lowers per-GPU-kernel workload sizes that in

Enable Tensor Parallelism
~~~~~~~~~~~~~~~~~~~~~~~~~

-To enable TP in the NeMo framework, configure the ``tensor_model_parallel_size`` parameter in the model configuration. This parameter determines the number of GPUs among which the model's tensors are partitioned.
-
-**For Tensor Parallelism**:
+To enable TP in the NeMo Framework, configure the ``tensor_model_parallel_size`` parameter in the model configuration. This parameter determines the number of GPUs among which the model's tensors are partitioned.

Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer model parallelism.

@@ -121,24 +120,24 @@

   tensor_model_parallel_size: 1 # Example to enable Tensor Parallelism

-The configuration file can be adjusted here: `NeMo Megatron GPT Config `_.
+The configuration file can be adjusted here: `NeMo Megatron GPT Config `__.

Implement Tensor Parallelism
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-NeMo integrates Tensor Parallelism through the implementation from Megatron Core. To understand how TP is activated within transformer blocks, refer to the code in the following repository: `Megatron-LM Transformer Block `_.
+NeMo Framework integrates TP through the implementation from Megatron Core. To understand how TP is activated within transformer blocks, refer to the code in the following repository: `Megatron-LM Transformer Block `__.

For detailed API usage and additional configurations, consult the `Megatron Core Developer Guide `_.

FSDP with Tensor Parallelism
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-NeMo supports FSDP along with tensor parallelism. This is done by restricting the model state sharding to the data-parallel domain.
-Using FSDP with tensor parallelism can be helpful when the model doesn't have sufficient parallelism to deploy on a large scale training system with the data-parallel mapping. For example, running a model with the global batch size of 1024 on 2048 GPUs.
-Also, tensor parallelism enables FSDP feasibility by reducing the model state size and the activation size per GPU, thus lower the FSDP communication overhead and the activation memory overhead.
+NeMo Framework supports FSDP along with TP. This is done by restricting the model state sharding to the data-parallel domain.
+Using FSDP with TP can be helpful when the model doesn't have sufficient parallelism to deploy on a large-scale training system with the data-parallel mapping. For example, running a model with the global batch size of 1024 on 2048 GPUs.
+Also, TP enables FSDP feasibility by reducing the model state size and the activation size per GPU, thus lowering the FSDP communication overhead and the activation memory overhead.

Using both FSDP and TP works by enabling FSDP (``fsdp=true``) and setting ``tensor_model_parallel_size > 1``.
-The user should unset ``CUDA_DEVICE_MAX_CONNECTIONS`` environment variable to enable that sets the number of GPU kernel queue to overlap of the FSDP communication with computation kernels.
+Unset the ``CUDA_DEVICE_MAX_CONNECTIONS`` environment variable to set the number of GPU kernel queues, allowing the overlap of FSDP communication with computation kernels.
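Putting these pieces together, a minimal sketch of the relevant model-configuration entries might look like the following. The values are illustrative only; the knobs themselves (``fsdp``, ``precision``, ``tensor_model_parallel_size``) are the ones described above.

.. code-block:: yaml

    fsdp: true                      # shard model states across the data-parallel domain
    precision: bf16                 # sets both the computation and communication precision
    tensor_model_parallel_size: 4   # partition each layer's tensors across 4 GPUs

Before launching, leave ``CUDA_DEVICE_MAX_CONNECTIONS`` unset so that the FSDP communication can overlap with the computation kernels, as noted above.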
Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ @@ -154,9 +153,7 @@ Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segm Enable Pipeline Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To utilize PP in the NeMo framework, you need to set the ``pipeline_model_parallel_size`` parameter in the model's configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. - -**For Pipeline Parallelism**: +To utilize Pipeline Parallelism (PP) in NeMo Framework, set the ``pipeline_model_parallel_size`` parameter in the model's configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable inter-layer model parallelism. @@ -164,7 +161,7 @@ Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable int pipeline_model_parallel_size: 1 # Example to enable Pipeline Parallelism -Adjust the configuration accordingly here: `NeMo Megatron GPT Config `_. +Adjust the configuration accordingly here: `NeMo Megatron GPT Config `__. Interleaved Pipeline Parallel Schedule ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -180,7 +177,7 @@ For more insights into this approach, see our detailed blog: `Scaling Language M Implement Pipeline Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The NeMo implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. +The NeMo Framework implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. @@ -197,31 +194,31 @@ Unlike other model-parallel techniques, EP is applied to only the expert layers Enable Expert Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~ -To enable EP, set ``model.expert_model_parallel_size`` to the desired expert parallel size. For example, if the model has six experts (``model.num_moe_experts=6``), then setting ``model.expert_model_parallel_size=3`` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. +To enable EP, set ``model.expert_model_parallel_size`` to the expert parallel size you want. For example, if the model has six experts (``model.num_moe_experts=6``), then setting ``model.expert_model_parallel_size=3`` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. .. code-block:: yaml expert_model_parallel_size: 3 # Set EP to 3 -For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `_. +For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `__. Implement Expert Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The NeMo implementation of Expert Parallelism uses functionality from Megatron Core. Please consult the `Megatron Core MoE layer `_ for more MoE implementation details. +The NeMo Framework implementation of EP uses functionality from Megatron Core. Please consult the `Megatron Core MoE layer `_ for more MoE implementation details. 
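As a quick sketch of the example above (six experts spread across three expert-parallel GPUs), the two settings can be combined in the model configuration as follows; the values are illustrative only.

.. code-block:: yaml

    num_moe_experts: 6             # the MoE layers use 6 experts in total
    expert_model_parallel_size: 3  # each GPU then hosts 6 / 3 = 2 experts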
Activation Partitioning
-----------------------

In LLM training, a large memory space is needed to store the input activations of the network layers.
-NeMo provides effective activation distribution methods, which is critical in training LLM with a large sequence length or large per-GPU micro-batch size.
+NeMo Framework provides effective activation distribution methods, which are critical when training LLMs with a large sequence length or a large per-GPU micro-batch size.

Sequence Parallelism
^^^^^^^^^^^^^^^^^^^^

-Sequence Parallelism extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency.
+Sequence Parallelism (SP) extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency.

.. image:: ../nlp/nemo_megatron/images/sp.gif
   :align: center
@@ -231,31 +228,29 @@ Sequence Parallelism extends tensor-level model parallelism by distributing comp

Enable Sequence Parallelism
~~~~~~~~~~~~~~~~~~~~~~~~~~~

-To utilize Sequence Parallelism in NeMo, set the ``sequence_parallel`` parameter to ``True`` in the model's configuration. Note that this feature is effective only when the tensor parallel size (``tensor_model_parallel_size``) is greater than ``1``.
+To utilize SP in NeMo Framework, set the ``sequence_parallel`` parameter to ``True`` in the model's configuration. Note that this feature is effective only when the tensor parallel size (``tensor_model_parallel_size``) is greater than ``1``.

.. code-block:: yaml

    sequence_parallel: True  # Enable Sequence Parallelism

-For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `_.
+For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `__.

Implement Sequence Parallelism
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The NeMo implementation of Sequence Parallelism utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_.
+The NeMo Framework implementation of SP utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_.

Context Parallelism
^^^^^^^^^^^^^^^^^^^

Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs, partitioning the input tensors in the sequence dimension.
-Unlike Sequence Parallelism (SP) that partitions the activations of specific layers, CP divides the activations of all layers.
+Unlike SP, which partitions the activations of specific layers, CP divides the activations of all layers.

Enable Context Parallelism
~~~~~~~~~~~~~~~~~~~~~~~~~~

-To activate CP in the NeMo framework, set the ``context_parallel_size`` parameter in the model configuration. This parameter specifies the number of GPUs among which the model's sequence activations are distributed.
- -**For Context Parallelism**: +To activate CP in the NeMo Framework, set the ``context_parallel_size`` parameter in the model configuration. This parameter specifies the number of GPUs among which the model's sequence activations are distributed. Set ``context_parallel_size`` to a value greater than ``1`` to enable sequence-wide model parallelism. @@ -268,7 +263,7 @@ The configuration can be found and modified here: `NeMo Megatron Core Context Co Implement Context Parallelism ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -NeMo leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. +NeMo Framework leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. Visit our source code for more insights into the implementation: - `Megatron Core wrappers for Transformer Engine `_ diff --git a/docs/source/index.rst b/docs/source/index.rst index 9a1086cae5ae..d015796096fc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,6 +13,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build - Activation Recomputation - Positional Embeddings and Positional Interpolation - Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) with `TensorRT Model Optimizer `_ +- Knowledge Distillation-based training with `TensorRT Model Optimizer `_ - Sequence Packing `NVIDIA NeMo Framework `_ has separate collections for: @@ -54,8 +55,8 @@ For more information, browse the developer docs for your area of interest in the features/mixed_precision features/parallelisms - features/memory_optimizations - features/throughput_optimizations + features/moe + features/optimizations/index .. toctree:: :maxdepth: 1 diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst index 7a9fe2822d07..2ba9978b7640 100644 --- a/docs/source/multimodal/api.rst +++ b/docs/source/multimodal/api.rst @@ -8,7 +8,7 @@ Model Classes :show-inheritance: :no-members: :members: __init__, configure_optimizers - :no-index: + :noindex: .. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion diff --git a/docs/source/multimodal/mllm/datasets.rst b/docs/source/multimodal/mllm/datasets.rst index 2f2000124e4d..60ee00b7f7f0 100644 --- a/docs/source/multimodal/mllm/datasets.rst +++ b/docs/source/multimodal/mllm/datasets.rst @@ -1,12 +1,12 @@ Multimodal Language Model Datasets ================================== -The NeMo multimodal language model supports the conversation data format, drawing inspiration from and designed based on `LLaVA `_. Sample datasets can be explored at `LLaVA's data documentation `_. 
+The NeMo Framework multimodal language model supports the conversation data format, drawing inspiration from and designed based on `LLaVA `_. Sample datasets can be explored at `LLaVA's data documentation `_. -Preparing the Training Dataset ------------------------------- +Prepare the Training Dataset +---------------------------- -The NeVA model training encompasses two phases: pretraining and finetuning. Each phase mandates a unique dataset. +The NeVA model training encompasses two phases: pretraining and fine-tuning. Each phase mandates a unique dataset. For **pretraining**, utilize the *LAION/CC/SBU BLIP-Caption Concept-balanced 558K* dataset. Access this dataset via `LLaVA's GitHub `_. After procuring the dataset, extract it to: @@ -14,13 +14,13 @@ For **pretraining**, utilize the *LAION/CC/SBU BLIP-Caption Concept-balanced 558 /path/to/neva/datasets/LLaVA-Pretrain-LCS-558K/blip_laion_cc_sbu_558k.json -Acquire the image data from `HuggingFace `_ and extract to: +Acquire the image data from `Hugging Face `__ and extract to: .. code-block:: bash /path/to/neva/datasets/LLaVA-Pretrain-LCS-558K/images -For **fine-tuning**, deploy the *LLaVA-Instruct-150K* dataset. This is also available on `LLaVA's GitHub `_. You can download the prompts from `HuggingFace `_: +For **fine-tuning**, deploy the *LLaVA-Instruct-150K* dataset. This is also available on `LLaVA's GitHub `_. You can download the prompts from `HuggingFace `__: .. code-block:: bash @@ -32,19 +32,19 @@ Image data for this phase can be obtained from the `COCO Dataset `_. After downloading the desired HuggingFace checkpoint, extract and store it on your local system to prep for pretraining. +Support is available for both the 7B and 13B chat models. Both can be downloaded from `LLaVA's Model Zoo `__. After downloading the checkpoint you want from Hugging Face, extract and store it on your local system to prepare for pretraining. To convert the LLaMA-2 checkpoints to NeMo's format, follow these steps: -1. Adjust the default yaml file at `megatron_llama_config.yaml `_. Ensure ``model.mcore_gpt`` and ``model.transformer_engine`` are set to `False` before the checkpoint conversion. +1. Adjust the default YAML file at `megatron_llama_config.yaml `__. Ensure ``model.mcore_gpt`` and ``model.transformer_engine`` are set to `False` before the checkpoint conversion. 2. For the 7B chat model, use this conversion command: @@ -56,7 +56,7 @@ To convert the LLaMA-2 checkpoints to NeMo's format, follow these steps: For the 13B model, adjust the paths in the `--in-file` and `--out-file` parameters accordingly. -3. Execute the subsequent command to divide the checkpoint for tensor model parallel sizes of 4 or 8. It's advisable to use TP=4 for the 7B model and TP=8 for the 13B model to ensure both pretraining and finetuning operate without memory complications. +3. Execute the subsequent command to divide the checkpoint for tensor model parallel sizes of 4 or 8. It's advisable to use TP=4 for the 7B model and TP=8 for the 13B model to ensure both pretraining and fine-tuning operate without memory complications. .. code-block:: bash @@ -73,10 +73,10 @@ For the 13B model, adjust the paths in the `--in-file` and `--out-file` paramete --model_class="nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel" \ --tokenizer_model_path=/tokenizer.model -Tokenizer Configuration -^^^^^^^^^^^^^^^^^^^^^^^ +Configure Tokenizer +^^^^^^^^^^^^^^^^^^^ -For NeVA training, integrating special tokens into the tokenizer is vital. 
After obtaining the 7B/13B model from Huggingface, also procure the corresponding tokenizer model. Referring to the 7B-chat model: +For NeVA training, it is vital that you integrate special tokens into the tokenizer. After obtaining the 7B/13B model from Hugging Face, you need to procure the corresponding tokenizer model. Referring to the 7B-chat model: 1. Download the `tokenizer.model `_ to: @@ -84,7 +84,7 @@ For NeVA training, integrating special tokens into the tokenizer is vital. After /path/to/neva/tokenizers/tokenizer.model -2. Executing the next script necessitates the NeMo dependency. It's more convenient to run the script within the NeMo container. +2. Step 3 requires NeMo Framework to be installed. For quick setup, we recommend running it within the NeMo Framework container. 3. Employ the command below to infuse special tokens into the tokenizer: diff --git a/docs/source/multimodal/mllm/sequence_packing.rst b/docs/source/multimodal/mllm/sequence_packing.rst index b061ee1d89c6..c5587e3f7173 100644 --- a/docs/source/multimodal/mllm/sequence_packing.rst +++ b/docs/source/multimodal/mllm/sequence_packing.rst @@ -103,15 +103,13 @@ To train with packed sequences, modify four items in the SFT/PEFT config file. .. code-block:: bash - ++model.data.data_prefix=/lustre/fsw/coreai_dlalgo_genai/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset - ++model.data.crop_size=[224,224] ++model.data.packed_sequence=True 2. Use the new dataset file instead of the original JSONL file and ensure the crop sizes are correctly specified since images are now cached: .. code-block:: bash - ++model.data.data_prefix=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset + ++model.data.data_path=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset ++model.data.crop_size=[336,336] 4. Adjust batch sizes: diff --git a/docs/source/multimodal/speech_llm/api.rst b/docs/source/multimodal/speech_llm/api.rst index 142190fd411d..c2415f29c720 100644 --- a/docs/source/multimodal/speech_llm/api.rst +++ b/docs/source/multimodal/speech_llm/api.rst @@ -8,7 +8,7 @@ Model Classes :show-inheritance: :no-members: :members: __init__, configure_optimizers - :no-index: + :noindex: .. autoclass:: nemo.collections.multimodal.speech_llm.models.modular_models.ModularAudioGPTModel diff --git a/docs/source/multimodal/speech_llm/configs.rst b/docs/source/multimodal/speech_llm/configs.rst index 5edd169eed25..b48a99612049 100644 --- a/docs/source/multimodal/speech_llm/configs.rst +++ b/docs/source/multimodal/speech_llm/configs.rst @@ -1,9 +1,9 @@ Common Configuration Files ========================== -This section provides a detailed overview of the NeMo configuration file setup specific to models within the NeMo SpeechLLM collection. For foundational knowledge about setting up and executing experiments common to all NeMo models, such as the Experiment Manager and PyTorch Lightning trainer parameters, refer to the :doc:`core <../../core/core>` documentation. +This section provides a detailed overview of the NeMo Framework configuration file setup, specifically for models within the NeMo Speech-augmented Large Language Models (SpeechLLM) collection. For foundational knowledge on setting up and executing experiments common to all NeMo Framework models, including the Experiment Manager and PyTorch Lightning trainer parameters, refer to the :doc:`core <../../core/core>` documentation. 
-Within the configuration files of the NeMo SpeechLLMs, details concerning dataset(s), augmentation, optimization parameters, and model architectural specifications are central. This page explores each of these aspects. +The configuration files for NeMo SpeechLLMs focus on key details such as datasets, augmentation, optimization parameters, and model architectural specifications. This page explores each of these aspects. Discover exemplary configuration files for all SpeechLLMs in the `config directory of the examples `_. @@ -11,9 +11,9 @@ Discover exemplary configuration files for all SpeechLLMs in the `config directo Dataset Configuration --------------------- -The dataset configuration is based on the NeMo ASR data configuration and the NLP data configuration +The dataset configuration is based on the NeMo ASR data configuration and the NLP data configuration. -The configuration file allows setting any initialization parameter accepted by the Dataset class used in the experiment. For a comprehensive list of Datasets and their parameters, visit the `Datasets <./api.html#Datasets>`__ section of the API. +The configuration file enables you to set any initialization parameter accepted by the Dataset class used in the experiment. For a comprehensive list of datasets and their parameters, refer to the Datasets section of the :doc:`API <./api>`. A typical training configuration is as follows: @@ -55,9 +55,9 @@ A typical training configuration is as follows: audio_locator: null -Key parameters include: +The key configuration parameters include: -- ``manifest_filepath``: The path to the dataset in JSON lines format, where each line in the file is a python dictionary. This can either be a single file or a list of files. +- ``manifest_filepath``: The path to the dataset in JSON lines format, where each line in the file is a Python dictionary. This can either be a single file or a list of files. - ``global_batch_size``: The global batch size that takes consideration of gradient accumulation, data parallelism. - ``micro_batch_size``: The micro batch size that fits on each GPU. - ``shuffle``: Whether to shuffle the dataset. @@ -115,7 +115,7 @@ For a detailed list of arguments, refer to the `Pytorch Lightning Trainer `. Model Configurations -------------------- Each configuration file should detail the model architecture used for the experiment. -The parameters commonly shared across most multimodal language models include: +The following table shows the parameters commonly shared across most multimodal language models. +------------------------------------------+--------------+---------------------------------------------------------------------------------------+ | **Parameter** | **Datatype** | **Description** | @@ -185,13 +187,13 @@ The parameters commonly shared across most multimodal language models include: | :code:`seed` | int | seed used in training | +------------------------------------------+--------------+---------------------------------------------------------------------------------------+ -SALM -~~~~ +Speech-Augmented Language Model (SALM) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For model-specific configurations, refer to `the examples `_. +For information about SALM model-specific configurations, refer to `the examples `__. -BESTOW -~~~~~~ +BESt features from TwO Worlds (BESTOW) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For model-specific configurations, refer to `the examples `_. +For information about BESTOW model-specific configurations, refer to `the examples `__. 
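To tie the key dataset parameters described above together, a stripped-down entry could look like the sketch below. The exact nesting and the remaining fields follow the example configs linked above; the path and batch sizes here are placeholders.

.. code-block:: yaml

    train_ds:
      manifest_filepath: /path/to/train_manifest.json  # a single file or a list of files
      global_batch_size: 16  # accounts for gradient accumulation and data parallelism
      micro_batch_size: 2    # what fits on a single GPU
      shuffle: true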
diff --git a/docs/source/multimodal/speech_llm/intro.rst b/docs/source/multimodal/speech_llm/intro.rst index 55ea13d7d411..1f73ed9ed249 100644 --- a/docs/source/multimodal/speech_llm/intro.rst +++ b/docs/source/multimodal/speech_llm/intro.rst @@ -1,41 +1,42 @@ Speech-agumented Large Language Models (SpeechLLM) ================================================== -The endeavor to extend Language Models (LLMs) with the ability to understand speech and audio inputs, detailed examples can be found in the `SpeechLLM example `_.. +SpeechLLM is a multi-modal Large Language Model (LLM) designed to understand and process speech and audio inputs. Detailed information can be found in the `SpeechLLM examples README `_. .. toctree:: :maxdepth: 1 + datasets configs api -In general, there're three main components of a modular SpeechLLM: +In general, there are three main components of a modular SpeechLLM: + - An audio encoder that processes the input audio and produces a sequence of audio embeddings. -- A modality adapter that processes the audio embeddings and produces a sequence of embeddings in the same latent space as the token embeddings of a pretrained large language model (LLM). -- A pretrained large language model (LLM) that processes embeddings from the modality adapter as well as token embeddings of input prompt, and produces the text output. The audio embeddings and text token embeddings are concatenated in time dimension before going into the LLM. +- A modality adapter that processes the audio embeddings and produces a sequence of embeddings in the same latent space as the token embeddings of a pretrained LLM. +- A pretrained LLM that processes embeddings from the modality adapter and token embeddings from the input prompt, then produces the text output. The audio embeddings and text token embeddings are concatenated in time dimension before going into the LLM. - The LLM produces text outputs based on the concatenated input audio and text embedding. Model Architecture ^^^^^^^^^^^^^^^^^^ -One way to incorporate speech into LLM is to concatenate speech features with the token embeddings of the input text prompt before being fed into the LLM. In this way, the LLM can have direct access to the speech information when generating the output text. - .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/salm.png - :align: center - :alt: SALM model - :scale: 50% - +One way to incorporate speech into an LLM is to concatenate speech features with the token embeddings of the input text prompt before feeding them into the LLM. In this way, the LLM can have direct access to the speech information when generating the output text. The `Speech-Augmented Language Model `__ (SALM) follows this approach. - -Another way is to use cross-attention mechanism, by using text embeddings to attend to speech embeddings to extract task-specific information from the speech embeddings. In order to minimize the computational cost of cross-attention, we add a cross-attention module only before the LLM. - - .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/bestow.png - :align: center - :alt: BESTOW model - :scale: 50% +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/salm.png + :align: center + :alt: SALM model + :scale: 50% +Another approach is to use a cross-attention mechanism, where text embeddings attend to speech embeddings to extract task-specific information. To minimize the computational cost, we add a cross-attention module only before the LLM. 
The `BESt features from TwO Worlds `__ (BESTOW) model follows this approach.
+
+.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/bestow.png
+   :align: center
+   :alt: BESTOW model
+   :scale: 50%
+
+NeMo Framework contains `example scripts `__ for training and evaluating both SALM and BESTOW models. A pre-trained `checkpoint `__ for SALM is also available.
diff --git a/docs/source/nlp/distillation.rst b/docs/source/nlp/distillation.rst
new file mode 100644
index 000000000000..22b2f3dd8a1c
--- /dev/null
+++ b/docs/source/nlp/distillation.rst
@@ -0,0 +1,58 @@
+.. _megatron_distillation:
+
+Distillation
+==========================
+
+Knowledge Distillation (KD)
+--------------------------------
+
+KD involves using information from an existing trained model to train a second (usually smaller, faster) model, thereby "distilling" knowledge from one to the other.
+
+Distillation has two primary benefits: faster convergence and higher end accuracy than traditional training.
+
+In NeMo, distillation is enabled by the `NVIDIA TensorRT Model Optimizer (ModelOpt) `_ library, which optimizes deep-learning models for inference on GPUs.
+
+The logits-distillation process consists of the following steps:
+
+1. Loading both student and teacher model checkpoints (they must support the same parallelism strategy, if any).
+2. Training until convergence, where forward passes are run on both models (and backward passes only on the student), with a distillation loss computed between the logits.
+3. Saving the final student model.
+
+
+Example
+^^^^^^^
+The example below shows how to run the distillation script for Llama models.
+
+The script must be launched with the number of processes equal to the tensor parallel size. This is achieved with the ``torchrun`` command below:
+
+.. code-block:: bash
+
+    STUDENT_CKPT="path/to/student.nemo" # can also be None (will use default architecture found in examples/nlp/language_modeling/conf/megatron_llama_distill.yaml)
+    TEACHER_CKPT="path/to/teacher.nemo"
+    TOKENIZER="path/to/tokenizer.model"
+    DATA_PATHS="[1.0,path/to/tokenized/data]"
+    FINAL_SAVE_FILE="final_checkpoint.nemo"
+    TP=4
+
+    NPROC=$TP
+    launch_config="torchrun --nproc_per_node=$NPROC"
+
+    ${launch_config} examples/nlp/language_modeling/megatron_gpt_distillation.py \
+        model.restore_from_path=$STUDENT_CKPT \
+        model.kd_teacher_restore_from_path=$TEACHER_CKPT \
+        model.tensor_model_parallel_size=$TP \
+        model.tokenizer.model=$TOKENIZER \
+        model.data.data_prefix=$DATA_PATHS \
+        model.nemo_path=$FINAL_SAVE_FILE \
+        trainer.precision=bf16 \
+        trainer.devices=$NPROC
+
+For large models, the command can be used in a multi-node setting. For example, this can be done with `NeMo Framework Launcher `_ using Slurm.
+
+
+Limitations
+^^^^^^^^^^^
+* Only Megatron Core-based GPT models are supported
+* Only logit-pair distillation is supported for now
+* Pipeline parallelism not yet supported
+* FSDP strategy not yet supported
diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md
new file mode 100644
index 000000000000..c5bdda7b040d
--- /dev/null
+++ b/docs/source/performance/performance_summary.md
@@ -0,0 +1,42 @@
+
+# Performance Benchmarks
+
+## Large Language Models
+
+### Pretraining
+
+- The results in the table below show pre-training performance for various tasks at FP8 precision.
+ - Container: [NeMo24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 + +| Model | #-GPUs | GBS | MBS | Sequence Length| TP | PP | CP | VP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to train in days (10T tokens, 1K GPUs)*** | +| ----- | ------ | --- | --- | ---------------| -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ | +| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 22521 | 736 | ***5*** | +| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5851 | 750 | ***19*** | +| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 726 | 782 | **156** | +| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 782 | [842](https://mlcommons.org/benchmarks/training/) | **145** | +| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16847 | 776 | ***7*** | +| LLAMA2-13B | 16 | 128 | 1 | 4096 | 1 | 4 | 1 | 10 | 8646 | 754 | ***13*** | +| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1707 | 759 | ***66*** | +| Nemotron-8B | 64 | 256 | 4 | 4096 | 2 | 1 | 1 | 1 | 12701 | 653 | ***9*** | +| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4256 | 554 | ***27*** | +| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 322 | 678 | ***351*** | +| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 12036 | 697 | ***9*** | +| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1533 | 738 | ***74*** | + +### Finetuning + +- The results in the table below show finetuning performance of LLAMA2 models with SFT (supervised fine-tuning), and LoRA (Low-rank adaptors) at FP8 precision. + - Container: [NeMo24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) + - System: DGX-H100 +- For fine-tuning, we use `SQuAD-v1.1 `__ dataset, and the inputs are packed to 4096 tokens. + + +| Model | Task | #-GPUs | GBS | MBS | Packed Sequence Length | TP | PP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to finetune in mins (10M tokens)*** | +| ----- | ---- | --- | --- | --- | --------------- | -- | -- | ------------------ | ----------------------- | -------------------------------------------------- | +| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 17120 | 682 | ***1.2*** | +| LLAMA2-13B | SFT | 8 | 32 | 1 | 4096 | 1 | 4 | 9741 | 754 | ***2.1*** | +| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1833 | 756 | ***5.7*** | +| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 25206 | 673 | ***0.8*** | +| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14161 | 733 | ***1.5*** | +| LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2557 | 705 | ***8.1*** | diff --git a/docs/source/starthere/fundamentals.rst b/docs/source/starthere/fundamentals.rst index 6413cb9d376a..e3014e0f5a03 100644 --- a/docs/source/starthere/fundamentals.rst +++ b/docs/source/starthere/fundamentals.rst @@ -190,10 +190,10 @@ NeMo Inference Scripts The examples scripts directory also contains many inference scripts such as `transcribe_speech.py `_. These inference scripts typically differ in structure from training scripts, as they include additional utilities for file I/O (reading and saving files). While inference scripts still use configurations (configs), they don’t require the ``trainer`` and ``model`` sections. Additionally, the default configs for inference scripts are usually specified as dataclasses rather than separate files. You can also modify elements via the command line. 
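For example, a typical run of ``transcribe_speech.py`` overrides a few of those dataclass fields directly on the command line. The field names used here (``model_path``, ``dataset_manifest``, ``output_filename``) should be checked against the script's config dataclass, and the paths are placeholders.

.. code-block:: bash

    python examples/asr/transcribe_speech.py \
        model_path=/path/to/model.nemo \
        dataset_manifest=/path/to/test_manifest.json \
        output_filename=/path/to/predictions.json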
-Specifying training data +Specifying Training Data ------------------------ -NeMo will handle creation of data loaders for you, as long as you put your data into the expected input format. You may also need to train a tokenizer before starting training. To learn more about data formats, see :doc:`LLM <../nlp/nemo_megatron/gpt/gpt_training>`, :doc:`Multimodal <../multimodal/mllm/datasets>`, :ref:`Speech AI `, and :doc:`Vision models <../vision/datasets>`. +NeMo will handle the creation of data loaders for you, as long as you put your data into the expected input format. You may also need to train a tokenizer before starting training. To learn more about data formats, see :doc:`LLM <../nlp/nemo_megatron/gpt/gpt_training>`, :doc:`Multimodal <../multimodal/mllm/datasets>`, :ref:`Speech AI `, and :doc:`Vision models <../vision/datasets>`. Model Checkpoints @@ -209,7 +209,7 @@ The NeMo team also releases pretrained models which you can browse on `NGC `_ or :doc:`tutorials <./tutorials>` +* Explore examples or tutorials: dive into NeMo by exploring our `examples `_ or :doc:`tutorials <./tutorials>` -* Domain-Specific Documentation: +* Domain-specific documentation: - * For Large Language Models (LLMs), checkout out the :doc:`LLM <../nlp/nemo_megatron/intro>` documentation. + * For Large Language Models (LLMs), checkout the :doc:`LLM <../nlp/nemo_megatron/intro>` documentation. * For Multimodal tasks, refer to the :doc:`Multimodal <../multimodal/mllm/intro>` documentation. * If you’re interested in Automatic Speech Recognition (ASR), explore the :doc:`ASR <../asr/intro>` documentation. * For Text-to-Speech (TTS), find details in the :doc:`TTS <../tts/intro>` documentation. - * Lastly, for Vision Models, consult the :doc:`Vision Models <../vision/intro>` documentation. + * For Vision Models, consult the :doc:`Vision Models <../vision/intro>` documentation. * `NeMo Primer `__: This tutorial provides a hands-on introduction to NeMo, PyTorch Lightning, and OmegaConf. It covers how to use, modify, save, and restore NeMo models. * `NeMo Models `__: In this tutorial, you'll learn the fundamentals of creating NeMo models. -* NeMo Core Documentation: Explore the :doc:`NeMo Core <../core/core>` documentation for NeMo, which explains the inner workings of the framework. +* NeMo Core Documentation: Explore the :doc:`NeMo Core <../core/core>` documentation for NeMo, which explains the inner workings of NeMo Framework. diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst index 6060726d5ba8..435e3e24350e 100644 --- a/docs/source/starthere/intro.rst +++ b/docs/source/starthere/intro.rst @@ -102,7 +102,7 @@ This final step involves installing the TensorRT Model Optimizer package. .. code-block:: bash - pip install nvidia-modelopt[torch]~=0.13.0 --extra-index-url https://pypi.nvidia.com + pip install nvidia-modelopt[torch]~=0.15.0 --extra-index-url https://pypi.nvidia.com .. 
code-block:: bash diff --git a/examples/asr/asr_ctc/speech_to_text_ctc.py b/examples/asr/asr_ctc/speech_to_text_ctc.py index a39a0eab078a..87b1b11633f7 100644 --- a/examples/asr/asr_ctc/speech_to_text_ctc.py +++ b/examples/asr/asr_ctc/speech_to_text_ctc.py @@ -75,13 +75,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf", config_name="config") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py b/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py index 5f36f3b0382f..b4e3be5f650a 100644 --- a/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py +++ b/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py @@ -71,13 +71,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf/citrinet/", config_name="config_bpe") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py index 2de150c71328..796005a8fcee 100644 --- a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py +++ b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py @@ -65,6 +65,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner( @@ -73,7 +74,7 @@ def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecHybridRNNTCTCBPEModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py index 532e2c9ed0be..423e005d8f02 100644 --- a/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py +++ b/examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_char.py @@ -76,13 +76,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf/conformer/hybrid_transducer_ctc/", config_name="conformer_hybrid_transducer_ctc") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecHybridRNNTCTCModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_transducer/speech_to_text_rnnt.py 
b/examples/asr/asr_transducer/speech_to_text_rnnt.py index bc75a0189dd0..5b4f1e8a985d 100644 --- a/examples/asr/asr_transducer/speech_to_text_rnnt.py +++ b/examples/asr/asr_transducer/speech_to_text_rnnt.py @@ -74,13 +74,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="experimental/contextnet_rnnt", config_name="config_rnnt") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecRNNTModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py index 339f65aa1eb6..1fffea55686f 100644 --- a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py +++ b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py @@ -66,13 +66,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="experimental/contextnet_rnnt", config_name="config_rnnt_bpe") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecRNNTBPEModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py index 946202364c53..b435d418fda2 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py @@ -56,6 +56,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="examples/asr/conf/conformer", config_name="conformer_transducer_bpe") @@ -67,7 +68,7 @@ def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') OmegaConf.resolve(cfg) - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = ASRWithTTSModel.from_asr_config( diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py index 5ded1ff3dfa3..99bc41ba966b 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py @@ -52,6 +52,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="examples/asr/asr_tts", config_name="hybrid_asr_tts") @@ -59,7 +60,7 @@ def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') OmegaConf.resolve(cfg) - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = ASRWithTTSModel(cfg.model, trainer=trainer) diff --git a/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml 
b/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml index 415172b33bb9..6808f4941916 100644 --- a/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml +++ b/examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml @@ -6,10 +6,6 @@ init_from_nemo_model: null # path to nemo model model: sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. - log_prediction: true # enables logging sample predictions in the output during training - rnnt_reduction: 'mean_volume' - skip_nan_grad: false train_ds: manifest_filepath: ??? diff --git a/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml b/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml index b8d84d197012..172d09ccd60b 100644 --- a/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml +++ b/examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml @@ -7,10 +7,6 @@ init_from_pretrained_model: null # name of pretrained NeMo model, e.g., `stt_en model: sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. - log_prediction: true # enables logging sample predictions in the output during training - rnnt_reduction: 'mean_volume' - skip_nan_grad: false # configs for huggingface load_dataset function data_path: "librispeech_asr" diff --git a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml index 77260e515a90..3d1a8c8bdf47 100644 --- a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml +++ b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml @@ -248,10 +248,12 @@ trainer: max_steps: 100000 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto - strategy: ddp + strategy: + _target_: pytorch_lightning.strategies.DDPStrategy + gradient_as_bucket_view: true accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: bf16-mixed # Should be set to bf16-mixed/16-mixed for O1 and O2 to enable the AMP. log_every_n_steps: 100 # Interval of logging. 
enable_progress_bar: True num_sanity_val_steps: 2 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it @@ -259,6 +261,7 @@ trainer: sync_batchnorm: true enable_checkpointing: False # Provided by exp_manager logger: false # Provided by exp_manager + use_distributed_sampler: false # Lhotse has its own distributed sampler exp_manager: exp_dir: null diff --git a/examples/asr/speech_multitask/speech_to_text_aed.py b/examples/asr/speech_multitask/speech_to_text_aed.py index b0e5333249f4..0c13e5289d86 100644 --- a/examples/asr/speech_multitask/speech_to_text_aed.py +++ b/examples/asr/speech_multitask/speech_to_text_aed.py @@ -57,13 +57,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging, model_utils from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf/speech_multitask/", config_name="fast-conformer_aed") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) # Check for spl tokens to create spl_tokenizer. diff --git a/examples/asr/speech_pretraining/speech_pre_training.py b/examples/asr/speech_pretraining/speech_pre_training.py index a7200a19a92b..cec9444096c3 100644 --- a/examples/asr/speech_pretraining/speech_pre_training.py +++ b/examples/asr/speech_pretraining/speech_pre_training.py @@ -20,7 +20,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager - +from nemo.utils.trainer_utils import resolve_trainer_cfg """ # Example of unsupervised pre-training of a model @@ -54,7 +54,7 @@ def main(cfg): logging.info(f"Hydra config: {OmegaConf.to_yaml(cfg)}") - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = SpeechEncDecSelfSupervisedModel(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/speech_to_text_eval.py b/examples/asr/speech_to_text_eval.py index 7b59ffe3fbfc..0024783d2362 100644 --- a/examples/asr/speech_to_text_eval.py +++ b/examples/asr/speech_to_text_eval.py @@ -64,7 +64,7 @@ import json import os -from dataclasses import dataclass, is_dataclass +from dataclasses import dataclass, field, is_dataclass from typing import Optional import torch @@ -99,8 +99,13 @@ class EvaluationConfig(transcribe_speech.TranscriptionConfig): only_score_manifest: bool = False scores_per_sample: bool = False - text_processing: Optional[TextProcessingConfig] = TextProcessingConfig( - punctuation_marks=".,?", separate_punctuation=False, do_lowercase=False, rm_punctuation=False, + text_processing: Optional[TextProcessingConfig] = field( + default_factory=lambda: TextProcessingConfig( + punctuation_marks=".,?", + separate_punctuation=False, + do_lowercase=False, + rm_punctuation=False, + ) ) diff --git a/examples/asr/speech_to_text_finetune.py b/examples/asr/speech_to_text_finetune.py index dbdefef34682..148b11d8b70f 100644 --- a/examples/asr/speech_to_text_finetune.py +++ b/examples/asr/speech_to_text_finetune.py @@ -19,7 +19,11 @@ 1) `init_from_nemo_model` or 2) `init_from_pretrained_model` in the configuration. -To update the model architecture in conjunction with other modifications, it is advisable to use the primary 'speech_to_text_rnnt/ctc_*.py' script. 
+**************************************************************************************** +This script is mainly intended for changing the dataset, optim, spec_augment, vocabulary/tokenizer of the model. +To update the model architecture in conjunction with other modifications, +it is advisable to use the primary 'speech_to_text_rnnt/ctc_*.py' script. +**************************************************************************************** Note: To create a single script for all model types, we currently only support two types of initializations: @@ -58,6 +62,7 @@ from nemo.utils import logging, model_utils from nemo.utils.exp_manager import exp_manager from nemo.utils.get_rank import is_global_rank_zero +from nemo.utils.trainer_utils import resolve_trainer_cfg def get_base_model(trainer, cfg): @@ -135,7 +140,7 @@ def check_vocabulary(asr_model, cfg): def update_tokenizer(asr_model, tokenizer_dir, tokenizer_type): """ - Updates the tokenizer of the model and also reinitializes the decoder if the vocabulary size + Updates the tokenizer of the model and also reinitializes the decoder if the vocabulary size of the new tokenizer differs from that of the loaded model. Args: asr_model: ASRModel instance @@ -189,7 +194,7 @@ def setup_dataloaders(asr_model, cfg): def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) if hasattr(cfg, 'init_from_ptl_ckpt') and cfg.init_from_ptl_ckpt is not None: diff --git a/examples/asr/speech_translation/speech_to_text_transformer.py b/examples/asr/speech_translation/speech_to_text_transformer.py index 56b600e0b4e0..ac4dc4334164 100644 --- a/examples/asr/speech_translation/speech_to_text_transformer.py +++ b/examples/asr/speech_translation/speech_to_text_transformer.py @@ -47,13 +47,14 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg @hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index d54ee34c18cd..7f5bc0659150 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -116,7 +116,7 @@ class ModelChangeConfig: # Sub-config for changes specific to the Conformer Encoder - conformer: ConformerChangeConfig = ConformerChangeConfig() + conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig) @dataclass @@ -164,14 +164,14 @@ class TranscriptionConfig: overwrite_transcripts: bool = True # Decoding strategy for CTC models - ctc_decoding: CTCDecodingConfig = CTCDecodingConfig() + ctc_decoding: CTCDecodingConfig = field(default_factory=CTCDecodingConfig) # Decoding strategy for RNNT models # enable CUDA graphs for transcription - rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1) + rnnt_decoding: RNNTDecodingConfig = field(default_factory=lambda: RNNTDecodingConfig(fused_batch_size=-1)) # Decoding strategy for AED models - multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig() + 
multitask_decoding: MultiTaskDecodingConfig = field(default_factory=MultiTaskDecodingConfig) # Prompt slots for prompted models, e.g. Canary-1B. Examples of acceptable prompt inputs: # Implicit single-turn assuming default role='user' (works with Canary-1B) # +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes @@ -187,7 +187,7 @@ class TranscriptionConfig: att_context_size: Optional[list] = None # Use this for model-specific changes before transcription - model_change: ModelChangeConfig = ModelChangeConfig() + model_change: ModelChangeConfig = field(default_factory=ModelChangeConfig) # Config for word / character error rate calculation calculate_wer: bool = True diff --git a/examples/audio/audio_to_audio_train.py b/examples/audio/audio_to_audio_train.py index 2dc91036234f..b197d2084144 100644 --- a/examples/audio/audio_to_audio_train.py +++ b/examples/audio/audio_to_audio_train.py @@ -35,6 +35,7 @@ from nemo.collections.audio.models.enhancement import ( EncMaskDecAudioToAudioModel, PredictiveAudioToAudioModel, + SchroedingerBridgeAudioToAudioModel, ScoreBasedGenerativeAudioToAudioModel, ) from nemo.core.config import hydra_runner @@ -48,6 +49,7 @@ class ModelType(str, Enum): MaskBased = 'mask_based' Predictive = 'predictive' ScoreBased = 'score_based' + SchroedingerBridge = 'schroedinger_bridge' def get_model_class(model_type: ModelType): @@ -58,6 +60,8 @@ def get_model_class(model_type: ModelType): return PredictiveAudioToAudioModel elif model_type == ModelType.ScoreBased: return ScoreBasedGenerativeAudioToAudioModel + elif model_type == ModelType.SchroedingerBridge: + return SchroedingerBridgeAudioToAudioModel else: raise ValueError(f'Unknown model type: {model_type}') diff --git a/examples/audio/conf/schroedinger_bridge.yaml b/examples/audio/conf/schroedinger_bridge.yaml new file mode 100644 index 000000000000..8751b91afaee --- /dev/null +++ b/examples/audio/conf/schroedinger_bridge.yaml @@ -0,0 +1,164 @@ +name: schroedinger_bridge + +model: + type: schroedinger_bridge + sample_rate: 16000 + skip_nan_grad: false + num_outputs: 1 + normalize_input: true + max_utts_evaluation_metrics: 50 # metric calculation needs full inference and is slow, so we limit to first few files + + train_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + audio_duration: 2.04 # 256 frames + random_offset: true + normalize_input: ${model.normalize_input} + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? 
+ input_key: noisy_filepath + target_key: clean_filepath + normalize_input: false # load data as is for validation, the model will normalize it for inference + batch_size: 4 + shuffle: false + num_workers: 4 + pin_memory: true + + encoder: + _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram + fft_length: 510 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.audio.parts.submodules.ncsnpp.SpectrogramNoiseConditionalScoreNetworkPlusPlus + in_channels: 2 # concatenation of single-channel perturbed and noisy + out_channels: 1 # single-channel estimate + conditioned_on_time: true + num_res_blocks: 3 # increased number of res blocks + pad_time_to: 64 # pad to 64 frames for the time dimension + pad_dimension_to: 0 # no padding in the frequency dimension + + estimator_output: data_prediction + + noise_schedule: + _target_: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBNoiseScheduleVE + k: 2.6 + c: 0.4 + time_min: 1e-4 + time_max: 1.0 + num_steps: 1000 # num steps for the forward process + + sampler: + _target_: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBSampler + time_min: 1e-4 + time_max: 1.0 + num_steps: 50 # num steps for the reverse process + + # Loss in the encoded domain + loss_encoded: + _target_: nemo.collections.audio.losses.MSELoss + ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time) + + # Loss in the time domain + loss_time: + _target_: nemo.collections.audio.losses.MAELoss + loss_time_weight: 0.001 + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + estoi: # output ESTOI + _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility + fs: ${model.sample_rate} + extended: true + pesq: # output PESQ + _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality + fs: ${model.sample_rate} + mode: wb + + optim: + name: adam + lr: 1e-4 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: null + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. + enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 5 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: false # use original weights for validation calculation? 
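The `ema` block in `exp_manager` maintains an exponential moving average of the model weights alongside the raw parameters; with `validate_original_weights: false`, validation runs on the averaged copy. As a reminder of what the `decay` knob controls, here is the standard EMA update rule in a minimal, hand-written form (not NeMo's EMA callback):

```python
import torch


@torch.no_grad()
def ema_update(ema_params, model_params, decay: float = 0.999):
    # Standard EMA rule applied after each optimizer step:
    #   shadow <- decay * shadow + (1 - decay) * current
    for ema_p, p in zip(ema_params, model_params):
        ema_p.mul_(decay).add_(p, alpha=1.0 - decay)
```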
+ + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_pesq + mode: max + save_top_k: 5 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py index 6cf7a8499122..e28fb4e69627 100644 --- a/examples/audio/process_audio.py +++ b/examples/audio/process_audio.py @@ -16,7 +16,7 @@ import glob import json import os -from dataclasses import dataclass, is_dataclass +from dataclasses import dataclass, field, is_dataclass from pathlib import Path from typing import List, Optional @@ -96,6 +96,10 @@ class ProcessConfig: # Override model config override_config_path: Optional[str] = None # path to a yaml config that will override the internal config file + # Override sampler config + # For example, to set number of steps, use `++sampler.num_samples=42` + sampler: dict = field(default_factory=dict) + # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA # device anyway, and do inference on CPU only if CUDA device is not found. # If `cuda` is a negative number, inference will be on CPU only. 
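Several config dataclasses touched above (the `text_processing` default in `EvaluationConfig`, the decoding configs in `TranscriptionConfig`, and the new `sampler` dict in `ProcessConfig`) move their defaults into `field(default_factory=...)`. Python rejects mutable defaults on dataclass fields, and since 3.11 the check extends to any unhashable default, which includes instances of ordinary (non-frozen) dataclasses such as `CTCDecodingConfig`; a factory also avoids sharing one default instance across all configs. A minimal illustration:

```python
from dataclasses import dataclass, field

# A mutable default written directly in the class body is rejected, e.g.:
#     @dataclass
#     class BadConfig:
#         options: dict = {}   # ValueError: mutable default <class 'dict'> ... use default_factory
#
# default_factory builds a fresh value for every instance instead:
@dataclass
class GoodConfig:
    options: dict = field(default_factory=dict)


a, b = GoodConfig(), GoodConfig()
a.options["num_steps"] = 10
assert b.options == {}  # no state shared between instances
```

For `process_audio.py`, the empty-dict default means nothing is overridden unless the user passes something explicit on the command line, e.g. `++sampler.num_steps=10` to shorten the reverse process of the sampler configured in the YAML above.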
@@ -155,6 +159,22 @@ def main(cfg: ProcessConfig) -> ProcessConfig: audio_to_audio_model.set_trainer(trainer) audio_to_audio_model = audio_to_audio_model.eval() + # override sampler + if cfg.sampler is not None: + logging.info('Overriding sampler with %s', cfg.sampler) + + if hasattr(audio_to_audio_model, 'sampler'): + for key, value in cfg.sampler.items(): + if not hasattr(audio_to_audio_model.sampler, key): + raise RuntimeError(f'Model sampler does not have attribute {key}') + logging.debug('Try to set model.sampler.%s to %s', key, value) + setattr(audio_to_audio_model.sampler, key, value) + if getattr(audio_to_audio_model.sampler, key) != value: + raise RuntimeError(f'Failed to set model sampler attribute {key} to {value}') + logging.info('model.sampler.%s was set to %s', key, value) + else: + raise RuntimeError('Model does not have a sampler') + if cfg.audio_dir is not None: filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True)) else: diff --git a/examples/llm/megatron_gpt_pretraining.py b/examples/llm/megatron_gpt_pretraining.py index 180561d03bac..cfdb6a6acb4b 100644 --- a/examples/llm/megatron_gpt_pretraining.py +++ b/examples/llm/megatron_gpt_pretraining.py @@ -25,6 +25,12 @@ def get_args(): parser.add_argument('--vocab-path', type=str, help="Path to vocab file") parser.add_argument('--merges-path', type=str, help="Path to merges file") parser.add_argument('--index-mapping-dir', type=str, help="directory to write index mappings to") + parser.add_argument( + '--no-masked-softmax-fusion', + action='store_false', + help='Disable fusion of softmax.', + dest='masked_softmax_fusion', + ) return parser.parse_args() @@ -59,13 +65,13 @@ def get_args(): attention_dropout=0.1, layernorm_epsilon=1e-5, make_vocab_size_divisible_by=128, + masked_softmax_fusion=args.masked_softmax_fusion, ) model = llm.GPTModel(gpt_config, tokenizer=data.tokenizer) strategy = nl.MegatronStrategy() checkpoint_callback = ModelCheckpoint( every_n_train_steps=5000, enable_nemo_ckpt_io=False, - async_save=False, ) callbacks = [checkpoint_callback] @@ -93,7 +99,7 @@ def get_args(): callbacks=callbacks, log_every_n_steps=1, limit_val_batches=2, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed", amp_O2=False), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), ) nemo_logger = NeMoLogger( diff --git a/examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml index 591f528810fc..2e20fe0be272 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/lita_config.yaml @@ -70,7 +70,7 @@ model: llm: from_pretrained: null #path to nemo checkpoint freeze: False - model_type: llama_2 # `nvgpt` or `llama_2` supported + model_type: llama_2 # `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` supported vision_encoder: from_pretrained: "Lin-Chen/ShareGPT4V-13B_Pretrained_vit-large336-l12" # huggingface path or name from_hf: True diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index 9ec6e51bb004..89e61a8b917c 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -38,7 +38,7 @@ exp_manager: save_top_k: 10 mode: min always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - save_nemo_on_train_end: False # not recommended when 
training large models on clusters with short time limits + save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} ema: @@ -60,6 +60,7 @@ model: tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism + context_parallel_size: 1 # kqv model parallelism virtual_pipeline_model_parallel_size: null # interleaved pipeline restore_from_path: null # used in fine-tuning @@ -69,7 +70,7 @@ model: llm: from_pretrained: null # path to nemo checkpoint freeze: True - model_type: llama_2 # `nvgpt` or `llama_2` supported + model_type: llama_2 # `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` supported vision_encoder: from_pretrained: "" # path or name from_hf: True @@ -185,7 +186,22 @@ model: packed_sequence: False num_workers: 8 dataloader_type: cyclic - data_path: + data_path: + # This configuration can either be a single string pointing to a data path, or a list of data paths for data blending. + # When using a blendable dataset, be aware of the following: + # - The sampling of data across datasets depends on both the relative sizes of the datasets and the concat_sampling_probabilities. + # - For example, if there are two datasets with lengths of 100 and 10, and the sampling probabilities are set to 0.5 for each, + # then 55 samples would be taken from the dataset of length 100 and 55 from the dataset of length 10 (with repetition). + # - This means not all data might be seen in one epoch, and smaller datasets may need to be repeated to match the number of samples. + # Please adjust your concat_sampling_probabilities accordingly to ensure balanced and effective training. + + # - /path/to/json + # - /path/to/json + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + concat_sampling_probabilities: null + # - 0.5 + # - 0.5 lazy_preprocess: True is_multimodal: True media_type: image # currently supported: image diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_mixtral_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_mixtral_config.yaml new file mode 100644 index 000000000000..6e3fb19cdab6 --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_mixtral_config.yaml @@ -0,0 +1,220 @@ +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
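Circling back to the blendable-dataset comment added to `neva_config.yaml` above: a quick, illustrative check of its 100/10 example, assuming samples are drawn in proportion to `concat_sampling_probabilities` over the combined dataset length (this is just the arithmetic behind the comment, not NeMo code):

```python
# Two datasets of length 100 and 10 with concat_sampling_probabilities = [0.5, 0.5].
lengths = [100, 10]
probs = [0.5, 0.5]

total = sum(lengths)                             # 110 samples drawn per pass
per_dataset = [round(p * total) for p in probs]  # [55, 55]
print(per_dataset)  # the short dataset repeats; the long one is only partially seen
```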
+ max_steps: 4650 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + check_val_every_n_epoch: null + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: nemo_neva + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits + filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + ema: + enable: False + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +model: + precision: ${trainer.precision} + + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + + # Batch size guideline for different types of dataset + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 1 # will use more micro batches to reach global batch size + + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + expert_model_parallel_size: 1 + context_parallel_size: 1 + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + restore_from_path: null # used in fine-tuning + + # Multimodal configs + mm_cfg: + llm: + from_pretrained: null + freeze: True + model_type: mistral # `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` supported + vision_encoder: + from_pretrained: 'google/siglip-so400m-patch14-384' # path or name + from_hf: True + patch_dim: 14 + crop_size: [384, 384] + hidden_size: 1152 # could be found from model but tricky in code + vision_select_layer: -2 # default to the last layer + class_token_length: 0 + freeze: True + pretrain_mm_mlp_adapter: null # path to pretrained mm adapter + mm_mlp_adapter_type: mlp_downsample + use_im_start_end: False + + + # LLM configs + # use GPTModel from megatron.core + mcore_gpt: True + + moe_grouped_gemm: False + moe_token_dispatcher_type: alltoall + moe_aux_loss_coeff: 0.01 + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: 32768 + position_embedding_type: rope + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. 
+ kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. + normalization: rmsnorm # Type of normalization layers + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. + activation: 'fast-swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. + transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + rotary_base: 1000000.0 + moe_router_topk: 2 + num_moe_experts: 8 + attention_type: 'multihead' # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: False # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. + num_query_groups: 8 # Number of query groups for group query attention. If None, normal attention is used. + use_flash_attention: True + + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: True + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. + + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. 
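The Mixtral-style knobs earlier in this config (`num_moe_experts: 8`, `moe_router_topk: 2`, `moe_aux_loss_coeff: 0.01`) are consumed by Megatron-Core's MoE layers. The snippet below is only a schematic of top-k token routing, to make the two routing knobs concrete; it is not the Megatron dispatcher:

```python
import torch
import torch.nn.functional as F


def topk_route(hidden, router_weight, topk: int = 2):
    # hidden: [tokens, hidden_size], router_weight: [hidden_size, num_experts]
    logits = hidden @ router_weight                          # [tokens, num_experts]
    probs = F.softmax(logits, dim=-1)
    gate_vals, expert_ids = probs.topk(topk, dim=-1)         # each token picks its top-2 of 8 experts
    gate_vals = gate_vals / gate_vals.sum(-1, keepdim=True)  # renormalize the selected gates
    return expert_ids, gate_vals  # used to dispatch tokens and mix the experts' outputs
```

`moe_aux_loss_coeff` then weights a load-balancing auxiliary loss on top of this routing, and `moe_token_dispatcher_type: alltoall` selects how the routed tokens are exchanged across expert-parallel ranks.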
+ openai_gelu: False + bias_activation_fusion: False + megatron_legacy: False + + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + # Megatron O2-style half-precision + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters + async_grad_allreduce: False + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce + + # miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + tokenizer: + library: 'sentencepiece' + type: null + model: null + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + + data: + num_workers: 1 + dataloader_type: cyclic + data_path: null + lazy_preprocess: True + is_multimodal: True + media_type: image + sep_image_conv_front: False + conv_template: mistral + image_folder: null + image_aspect_ratio: 'square' + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [ 0 ] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 1e-3 + weight_decay: 0. 
+ betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 70 + constant_steps: 0 + min_lr: 2e-5 diff --git a/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml index 8341ff857202..10d9230fc78e 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml @@ -70,7 +70,7 @@ model: llm: from_pretrained: #path to nemo checkpoint freeze: True - model_type: llama_2 # `nvgpt` or `llama_2` supported + model_type: llama_2 # `v1`, `nvgpt`, `llama_2`, `llama_3` and `mistral` supported vision_encoder: from_pretrained: "" # path or name from_hf: True diff --git a/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py b/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py old mode 100644 new mode 100755 index 60f882fa9821..b670d171fd1d --- a/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py +++ b/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py @@ -271,6 +271,7 @@ def main(): logging.info(f"Output directory: {output_dir}") prefix_path = f"{output_dir}/packed_seq_dataset" + os.makedirs(prefix_path, exist_ok=True) # Original Datasets to Sequence Lengths Files builders = {} for item_dict in tqdm(train_dl, desc="Building indexed datasets"): diff --git a/examples/multimodal/speech_llm/README.md b/examples/multimodal/speech_llm/README.md index b6a9c7486331..3d7e37d05828 100644 --- a/examples/multimodal/speech_llm/README.md +++ b/examples/multimodal/speech_llm/README.md @@ -1,9 +1,6 @@ # Modular SpeechLLM -This directory contains example scripts to train and evaluate modular SpeechLLM (e.g, SALM[1], etc). - -## Requirements -You will need to install this specific branch of NeMo, or use the provided Dockerfile in the root directory of this repository to build a Docker image with all the necessary dependencies. +This directory contains example scripts to train and evaluate modular SpeechLLM (e.g., SALM[1], BESTOW[2], etc.). ## Architecture @@ -186,4 +183,6 @@ If you have a local `.nemo` file, you can use `model.restore_from_path=/path/to/ ## Reference -[1] Chen, Z.\*, Huang, H.\*, Andrusenko, A., Hrinchuk, O., Puvvada, K.C., Li, J., Ghosh, S., Balam, J. and Ginsburg, B., 2023. SALM: Speech-augmented Language Model with In-context Learning for Speech Recognition and Translation. ICASSP'24. \ No newline at end of file +[1] Chen, Z.\*, Huang, H.\*, Andrusenko, A., Hrinchuk, O., Puvvada, K.C., Li, J., Ghosh, S., Balam, J. and Ginsburg, B., 2023. SALM: Speech-augmented Language Model with In-context Learning for Speech Recognition and Translation. ICASSP'24. + +[2] Chen, Z., Huang, H., Hrinchuk, O., Puvvada, K.C., Koluguri, N.R., Zelasko, P., Balam, J. and Ginsburg, B., 2024. BESTOW: Efficient and Streamable Speech Language Model with the Best of Two Worlds in GPT and T5. ArXiv, abs/2406.19954.
\ No newline at end of file diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd2_train.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd2_train.yaml index b725b15f1ab2..d7b5ed717fc4 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/conf/sd2_train.yaml +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd2_train.yaml @@ -153,6 +153,7 @@ model: resume_from_checkpoint: null # manually set the checkpoint file to load from apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + ddp_overlap: False # True for using PyTorch DDP overlap. optim: name: fused_adam diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py index 981e83ec95c4..de66db1725c4 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py @@ -26,10 +26,10 @@ def model_cfg_modifier(model_cfg): model_cfg.precision = cfg.trainer.precision model_cfg.ckpt_path = None model_cfg.inductor = False - model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt" - model_cfg.unet_config.from_NeMo = True - model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt" - model_cfg.first_stage_config.from_NeMo = True + # model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt" + # model_cfg.unet_config.from_NeMo = True + # model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt" + # model_cfg.first_stage_config.from_NeMo = True model_cfg.first_stage_config._target_ = 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper' # model_cfg.fsdp = True diff --git a/examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml b/examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml new file mode 100644 index 000000000000..374223c20daf --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml @@ -0,0 +1,225 @@ +name: megatron_gemma2 +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_llama + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + mcore_gpt: True + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 4 # limited by GPU memory + global_batch_size: 8 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + # model architecture + encoder_seq_length: 8192 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 42 # 9b: 18 | 27b: 46 + hidden_size: 3584 # 9b: 3584 | 27b: 4608 + ffn_hidden_size: 28672 # Transformer FFN hidden size. Usually 4 * hidden_size. | 9b: 28672 | 27b: 72728 + num_attention_heads: 16 # 9b: 16 | 27b: 32 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: 256 # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null | 9b: 256 | 27b: 128 + apply_embedding_scaling: True # scale sqrt(hidden_size) + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. + normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_zero_centered_gamma: True + layernorm_epsilon: 1e-6 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. + activation: 'fast-geglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. 
+ transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + openai_gelu: True # Use OpenAI's GELU instead of the default GeLU + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope'] + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: 'multihead' # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: True # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: 8 # Number of query groups for group query attention. If None, normal attention is used. | 9b: 8 | 27b: 16 + mcore_customization_config: + query_pre_attn_scalar: 224 # Custom scale factor (normally sqrt dim) in SDPA 9b: 224 | 27b: 144 + attn_logit_softcapping: 50.0 # Prevents attention outputs from growing excessively by scaling them to a fixed range + final_logit_softcapping: 30.0 # Prevents final logits from growing excessively by scaling them to a fixed range + + tokenizer: + library: 'sentencepiece' + type: null + model: ??? # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + + # Mixed precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + + + # Miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. 
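The `mcore_customization_config` block above carries the Gemma 2 specific tweaks: `query_pre_attn_scalar: 224` replaces the usual head dimension in the 1/sqrt(d) attention scaling (for the 9B numbers above, 224 = hidden_size / num_attention_heads = 3584 / 16, whereas the actual head dim, `kv_channels`, is 256), and the two softcapping values squash attention and final logits with a tanh. A small stand-alone illustration of the softcap, following the published Gemma 2 formulation rather than any particular NeMo/Megatron code path:

```python
import torch


def softcap(logits: torch.Tensor, cap: float) -> torch.Tensor:
    # Smoothly bounds values to (-cap, cap): cap * tanh(logits / cap)
    return cap * torch.tanh(logits / cap)


scores = torch.randn(4, 4) * 100
print(softcap(scores, 50.0))  # attention logits, per attn_logit_softcapping
print(softcap(scores, 30.0))  # final vocabulary logits, per final_logit_softcapping
```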
+ apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive, which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of the micro-batches in the window checkpoint all Transformer layers. The size of the window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpointing at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 checkpoint 3 layers less than + # stage 0 and stage 2 checkpoint 6 layers less than stage 0, and so on. This is possible because later pipeline stages + # use less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of the activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+ sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + data: + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + # data_prefix: ??? + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 85609c2dd9b0..95cb1dcf48ec 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -117,6 +117,7 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. 
This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + scale_positional_embedding: False # Apply scaling for RoPE frequencies ## Reset learning rate schedule. # 1. reset_lr=True, reset_lr_steps=False. When pre-training an existing checkpoint "from scratch" on a different dataset. diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml index c70719f51210..f603ebb58eb7 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml @@ -44,4 +44,5 @@ export: inference_pipeline_parallel: 1 # Default using 1 PP for inference dtype: ${trainer.precision} # Default precision data type save_path: llama2-7b-${quantization.algorithm}.qnemo # Path where the quantized model will be saved - compress: false # Wheter save_path should be a tarball or a directory + compress: false # Whether save_path should be a tarball or a directory + sample_output: true # Whether to run a sample prompt before saving diff --git a/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml b/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml new file mode 100644 index 000000000000..052de2ff7d48 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_llama_distill.yaml @@ -0,0 +1,222 @@ +name: megatron_llama_distill + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: "megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}" + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + restore_from_path: null # overwrite if starting from pretrained + restore_from_ckpt: null # overwrite if resuming training + kd_teacher_restore_from_path: null # overwrite always! (path to teacher weights) + nemo_path: null # overwrite always! 
(path to save final model) + + mcore_gpt: True + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 8 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 8 # 7b: 32 | 13b: 40 | 70b: 80 + hidden_size: 4096 # 7b: 4096 | 13b: 5120 | 70b: 8192 + ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size. | 7b: 11008 | 13b: 13824 | 70b: 28672 + num_attention_heads: 32 # 7b: 32 | 13b: 40 | 70b: 64 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. + attention_dropout: 0.0 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: "rmsnorm" # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: False # Whether to use bias terms in all weight matrices. + activation: "fast-swiglu" # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. + transformer_block_type: "pre_ln" # Options ['pre_ln', 'post_ln', 'normformer'] + openai_gelu: False # Use OpenAI's GELU instead of the default GeLU + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + position_embedding_type: "rope" # Position embedding type. Options ['learned_absolute', 'rope'] + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: "multihead" # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: False # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: 32 # Number of query groups for group query attention. If None, normal attention is used. 
| 7b: 32 | 13b: 40 | 70b: 8 + + tokenizer: + library: "sentencepiece" + type: null + model: ??? # /path/to/tokenizer.model + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + + # Mixed precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: False # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + # Miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. 
+ # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + + data: + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + data_prefix: ??? 
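+    # '???' is OmegaConf's mandatory-value marker: data_prefix must be overridden (e.g. from the CLI) before training starts.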
+ index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 98,1,1 + seq_length: 4096 + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 1e-5 diff --git a/examples/nlp/language_modeling/megatron_gpt_distillation.py b/examples/nlp/language_modeling/megatron_gpt_distillation.py new file mode 100644 index 000000000000..b3ecdcfc5522 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_gpt_distillation.py @@ -0,0 +1,576 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
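+# Example launch (hypothetical paths, for illustration only; see
+# conf/megatron_llama_distill.yaml for the full set of options). restore_from_path
+# may be omitted to train the student from randomly initialized weights.
+#
+#   python examples/nlp/language_modeling/megatron_gpt_distillation.py \
+#       model.kd_teacher_restore_from_path=<teacher>.nemo \
+#       model.restore_from_path=<student>.nemo \
+#       model.nemo_path=<distilled_student>.nemo \
+#       model.tokenizer.model=<tokenizer>.model \
+#       model.data.data_prefix=[1.0,<dataset>_text_document]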
+ +import copy +from pathlib import Path +from typing import Any, Dict + +import modelopt.torch.distill as mtd +import modelopt.torch.opt as mto +import torch.multiprocessing as mp +from omegaconf import DictConfig, OmegaConf, open_dict +from pytorch_lightning.trainer.trainer import Trainer + +try: + from megatron.core import parallel_state, tensor_parallel + from megatron.core.models.gpt import GPTModel as MCoreGPTModel + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module +except ImportError: + raise AssertionError("ModelOpt only supports Megatron-Core.") +import types +from abc import ABCMeta +from importlib.metadata import version +from typing import Tuple + +import torch +import torch.nn.functional as F +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.parallel_state import get_tensor_model_parallel_group +from megatron.core.transformer import TransformerConfig +from pkg_resources import packaging +from torch import Tensor +from torch.nn.modules.loss import _Loss + +from nemo.collections.common.parts.utils import extend_instance +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import ( + EmbeddingScalingMixin, + MegatronGPTModel, + get_specs, +) +from nemo.collections.nlp.modules.common.megatron.module import Float16Module +from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager +from nemo.utils.model_utils import load_config, unwrap_model + +mp.set_start_method("spawn", force=True) + +# Model config fields which affect the structure of the model. +# These will be the values taken from the teacher's config file, +# while the rest remain the same as student's. +MODEL_ARCHITECHTURE_KEYS = [ + "encoder_seq_length", + "max_position_embeddings", + "num_layers", + "hidden_size", + "ffn_hidden_size", + "num_attention_heads", + "init_method_std", + "use_scaled_init_method", + "hidden_dropout", + "attention_dropout", + "ffn_dropout", + "kv_channels", + "apply_query_key_layer_scaling", + "normalization", + "layernorm_epsilon", + "do_layer_norm_weight_decay", + "make_vocab_size_divisible_by", + "pre_process", + "post_process", + "persist_layer_norm", + "bias", + "activation", + "headscale", + "transformer_block_type", + "openai_gelu", + "normalize_attention_scores", + "position_embedding_type", + "rotary_percentage", + "attention_type", + "share_embeddings_and_output_weights", + "overlap_p2p_comm", + "batch_p2p_comm", + "num_query_groups", + "seq_len_interpolation_factor", + "rotary_base", + "scale_positional_embedding", +] + + +class DistillationMegatronGPTModel(MegatronGPTModel): + """ModelOpt Distillation-enabled subclass of `MegatronGPTModel`.""" + + def __init__(self, cfg: DictConfig, trainer: Trainer): + """ + Constructor. + + Args: + cfg: Model configuration. + trainer: Nemo trainer instance. + """ + logging.info("Distillation: Enabled.") + assert cfg.kd_teacher_restore_from_path is not None, "Path to teacher weights must be provided." + assert cfg.pipeline_model_parallel_size == 1, "Distillation mode does not yet support Pipeline Parallel." 
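+        # Note: the teacher network is not built here; it is attached later in
+        # model_provider_func() via ModelOpt's "kd_loss" conversion (see _teacher_provider below).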
+ + super().__init__(cfg, trainer) + + logging.info("\n\n************** Final model configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + def model_provider_func(self, pre_process, post_process): + """Model depends on pipeline paralellism.""" + if not self.mcore_gpt: + raise AssertionError("ModelOpt Distillation only supports MCore model edition.") + + model = MCoreGPTModel( + config=self.transformer_config, + transformer_layer_spec=get_specs( + self.spec_name, + self.transformer_config, + self.transformer_engine, + self.cfg.get('hyena', None), + ), + vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), + max_sequence_length=self.cfg.get('encoder_seq_length', 512), + pre_process=pre_process, + post_process=post_process, + parallel_output=True, + share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), + position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), + rotary_percent=self.cfg.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), + rotary_base=self.cfg.get('rotary_base', 10000), + ) + if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage(): + extend_instance(model.embedding, EmbeddingScalingMixin) + + # [ModelOpt] Distillation mode. + distill_cfg = load_distillation_config(self.transformer_config) + # Intialize DistillationModel. + kd_config = { + "teacher_model": (_teacher_provider, [self.cfg, copy.deepcopy(self.trainer)], {}), + "criterion": distill_cfg["criterion"], + "loss_balancer": distill_cfg["loss_balancer"], + } + model = mtd.convert(model, mode=[("kd_loss", kd_config)]) + + # Additional tweaks needed for MCore/Nemo. + adjust_distillation_model_for_mcore(model, distill_cfg) + + return model + + def get_forward_output_and_loss_func(self, validation_step=False, tuning=False): + def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): + + # Get data batch + batch = self.get_batch(dataloader_iter, tuning) + + # Transfer needed data to GPU + required_keys = set() + max_seqlen = batch['max_seqlen'].squeeze() if 'max_seqlen' in batch else None + cu_seqlens_argmin = batch['cu_seqlens_argmin'] if 'cu_seqlens_argmin' in batch else None + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + required_keys.update(batch.keys()) + else: + required_keys.add('attention_mask') + if 'cu_seqlens' in batch: + required_keys.add('cu_seqlens') + if parallel_state.is_pipeline_first_stage(): + required_keys.update(('tokens', 'position_ids')) + if parallel_state.is_pipeline_last_stage(): + required_keys.update(('labels', 'loss_mask')) + if self.get_attention_mask_from_fusion and 'attention_mask' in required_keys: + required_keys.remove('attention_mask') + batch = { + key: val.cuda(non_blocking=True) if key in required_keys and isinstance(val, torch.Tensor) else None + for key, val in batch.items() + } + + # slice batch along sequence dimension for context parallelism + batch = self.get_batch_on_this_context_parallel_rank(batch) + + # Model forward pass + forward_args = { + 'input_ids': batch['tokens'], + 'position_ids': batch['position_ids'], + 'attention_mask': None if self.get_attention_mask_from_fusion else batch['attention_mask'], + 'labels': batch['labels'] if 'labels' in batch else None, + 'loss_mask': batch['loss_mask'], + } + + # TODO: @eharper can we add this to mcore? 
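+            # loss_mask is not passed into the model forward; it is applied in loss_func() below.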
+ forward_args.pop('loss_mask') + + if 'cu_seqlens' in batch: # packed sequence from GPTSFTPackedDataset + # these args are passed eventually into TEDotProductAttention.forward() + cu_seqlens = batch['cu_seqlens'].squeeze() # remove batch size dimension (mbs=1) + # remove -1 "paddings" added in collate_fn + if cu_seqlens_argmin is not None: + cu_seqlens = cu_seqlens[: cu_seqlens_argmin.item()] + else: + cu_seqlens = cu_seqlens[: torch.argmin(cu_seqlens)] + + try: + from megatron.core.packed_seq_params import PackedSeqParams + except (ImportError, ModuleNotFoundError) as e: + mcore_version = packaging.version.Version(version('megatron-core')) + logging.error( + f"megatron-core v{mcore_version} does not support training with packed sequence. " + "Please use megatron-core >= 0.5.0, or set model.data.train_ds.packed_sequence=False" + ) + raise e + + forward_args['packed_seq_params'] = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + qkv_format='thd', + ) + + output_tensor = model(**forward_args) + + def loss_func(output_tensor): + if validation_step: + loss_for_ub = self.loss_func( + batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor, validation_step=True + ) + else: + # [ModelOpt] KD Loss for a micro-batch (ub) + unwrapped_model = unwrap_model(model, (Float16Module, MCoreFloat16Module)) + loss_for_ub = unwrapped_model.compute_kd_loss( + loss_reduction_fn=lambda x: self.loss_func( + batch['loss_mask'], batch['num_valid_tokens_in_ub'], x + ) + ) + cp_size = parallel_state.get_context_parallel_world_size() + if validation_step and not self.validation_drop_last: + num_valid_tokens_in_ub = batch['num_valid_tokens_in_ub'] + if loss_for_ub.isnan(): + assert batch['loss_mask'].count_nonzero() == 0, 'Got NaN loss with non-empty input' + loss_sum_for_ub = torch.zeros_like(loss_for_ub) + num_valid_tokens_in_ub = 0 + else: + if self.sample_weight == 'constant': + num_valid_tokens_in_ub = 1 + loss_sum_for_ub = num_valid_tokens_in_ub * loss_for_ub + + loss_sum_and_ub_size_all_gpu = torch.cat( + [ + loss_sum_for_ub.clone().detach().view(1), + torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(), + ] + ) + # Could potentially reduce num_valid_samples_in_microbatch and use that to aggregate instead of len(self._validation_ds) + torch.distributed.all_reduce( + loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group() + ) + return loss_for_ub * cp_size, {'loss_sum_and_ub_size': loss_sum_and_ub_size_all_gpu} + else: + reduced_loss = average_losses_across_data_parallel_group([loss_for_ub]) + return loss_for_ub * cp_size, {'avg': reduced_loss} + + return output_tensor, loss_func + + return fwd_output_and_loss_func + + def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor, validation_step=False): + loss = super().loss_func(loss_mask, num_valid_tokens_in_ub, output_tensor) + if not validation_step and self.cfg.tensor_model_parallel_size > 1: + # [ModelOpt] KD loss requires extra all-reduce to ensure same values across MP-TP partitions. + loss = torch.sum(tensor_parallel.gather_from_tensor_model_parallel_region(loss.reshape(1))) + return loss + + def configure_optimizers(self): + with self.model.hide_teacher_model(): + return super().configure_optimizers() + + +######################################################## + + +def load_distillation_config(student_cfg: TransformerConfig) -> Dict[str, Any]: + """Create a default distillation config for MCore GPT Models. 
+ + Args: + student_cfg: Model config for student model. + """ + logit_pair = ("output_layer", "output_layer") # logit module names for MCoreGPTModel + tp_enabled = student_cfg.tensor_model_parallel_size > 1 + + cfg = { + "criterion": {tuple(logit_pair): LogitsKLLoss(tensor_parallel=tp_enabled)}, + "loss_balancer": None, + "skip_lm_loss": True, + } + return cfg + + +class BaseLoss(_Loss, metaclass=ABCMeta): + """Abstract base class for Megatron distillation losses.""" + + def __init__(self, tensor_parallel: bool = False): + """Constructor. + + Args: + tensor_parallel: Whether tensor parallelism is enabled or not. + """ + super().__init__() + self._tensor_parallel = tensor_parallel + + def pre_forward(self, predictions: Tensor, targets: Tensor) -> Tuple[Tensor, Tensor]: + """Performs projection of student tensor to match teacher's size if necessary.""" + if isinstance(predictions, tuple): + # `ColumnParallelLinear` returns bias too + predictions, targets = predictions[0], targets[0] + + targets = targets.detach() + + return predictions, targets + + def post_forward(self, loss: Tensor) -> Tensor: + """Reshapes tensor from [s, b] to [b, s] for upcoming loss masking.""" + loss = loss.transpose(0, 1).contiguous() + return loss + + +class LogitsKLLoss(BaseLoss): + """Calculates KL-Divergence loss between two logits tensors without reducing the sequence dim.""" + + def __init__( + self, + tensor_parallel: bool = False, + temperature: float = 1.0, + reverse: bool = False, + ): + """ + Constructor. + + Args: + tensor_parallel: Whether tensor parallelism is enabled or not. + temperature: Divide tensors by this value prior to calculating loss. + reverse: Whether to reverse the loss as KLD(teacher, student) instead of KLD(student, teacher) + """ + super().__init__(tensor_parallel) + self._temperature = temperature + self._reverse = reverse + + def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: + """ + Forward function. + + Args: + predictions: Student model tensors (size [s, b, h]) + targets: Teacher model tensors (size [s, b, h]) + + Returns: + KLD loss of tensors (size [b, s]) + """ + predictions, targets = self.pre_forward(predictions, targets) + + # Division by temp should happen prior to finding max for both student and teacher. + # Currently we don't use temperature in any of ours runs (temp=1.0) + output_teacher = targets.float() / self._temperature + output_student = predictions.float() / self._temperature + + # Compute local softmax, and the reweight to compute global softmax. + if self._tensor_parallel: + + # Maximum value along vocab dimension across all GPUs. + teacher_logits_max, _ = torch.max(output_teacher, dim=-1) + torch.distributed.all_reduce( + teacher_logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_tensor_model_parallel_group(), + ) + output_teacher = output_teacher - teacher_logits_max.unsqueeze(dim=-1) + + denom_teacher = torch.sum(torch.exp(output_teacher), dim=-1) + # We can't use `gather_from_tensor_model_parallel_region` here since it discards + # gradients from other ranks - we need to all_reduce the gradients as well. + denom_teacher = all_reduce_autograd(denom_teacher, group=get_tensor_model_parallel_group()) + + # Maximum value along vocab dimension across all GPUs. 
+ student_logits_max, _ = torch.max(output_student, dim=-1) + torch.distributed.all_reduce( + student_logits_max, + op=torch.distributed.ReduceOp.MAX, + group=get_tensor_model_parallel_group(), + ) + output_student = output_student - student_logits_max.unsqueeze(dim=-1).detach() + + denom_student = torch.sum(torch.exp(output_student), dim=-1) + denom_student = all_reduce_autograd(denom_student, group=get_tensor_model_parallel_group()) + + slen, bsz, sharded_vocab_size = output_student.shape + student_log_prob = output_student - torch.log(denom_student).view(slen, bsz, 1).expand( + slen, bsz, sharded_vocab_size + ) + teacher_log_prob = output_teacher - torch.log(denom_teacher).view(slen, bsz, 1).expand( + slen, bsz, sharded_vocab_size + ) + + if self._reverse: + loss = torch.sum( + F.kl_div(teacher_log_prob, student_log_prob, reduction="none", log_target=True), + dim=-1, + ) + else: + loss = torch.sum( + F.kl_div(student_log_prob, teacher_log_prob, reduction="none", log_target=True), + dim=-1, + ) + + else: + if self._reverse: + loss = torch.sum( + F.kl_div( + F.log_softmax(output_teacher, dim=-1), F.softmax(output_student, dim=-1), reduction="none" + ), + dim=-1, + ) + else: + loss = torch.sum( + F.kl_div( + F.log_softmax(output_student, dim=-1), F.softmax(output_teacher, dim=-1), reduction="none" + ), + dim=-1, + ) + + return self.post_forward(loss) + + +class _AllReduce(torch.autograd.Function): + """Implementation from old PyTorch `torch.distributed.nn.parallel`.""" + + @staticmethod + def forward(ctx, op, group, tensor): + ctx.group, ctx.op = group, op + tensor = tensor.clone() + torch.distributed.all_reduce(tensor, op=op, group=group) + return tensor + + @staticmethod + def backward(ctx, grad_output): + return (None, None, _AllReduce.apply(ctx.op, ctx.group, grad_output)) + + +def all_reduce_autograd(tensor, op=torch.distributed.ReduceOp.SUM, group=torch.distributed.group.WORLD): + return _AllReduce.apply(op, group, tensor) + + +def adjust_distillation_model_for_mcore(model: mtd.DistillationModel, distill_cfg: Dict[str, Any]): + """Extra modifcations to ``mtd.DistillationModel`` requried for Megatron-Core.""" + + # HACK: Get rid of ModelOpt Distillation state + mto.ModeloptStateManager(model)._state.pop() + + # HACK: Hide teacher during `sharded_state_dict` method. + def _sharded_state_dict(self, *args, **kwargs) -> ShardedStateDict: + with self.hide_teacher_model(): + return self._sharded_state_dict(*args, **kwargs) + + model._sharded_state_dict = model.sharded_state_dict + model.sharded_state_dict = types.MethodType(_sharded_state_dict, model) + + # HACK: Skip `lm_loss` bypassing it when training if not needed for backprop. 
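+    # With skip_lm_loss=True, the KD criterion computed in compute_kd_loss() is the only training
+    # objective, so the standard LM loss is replaced by zeros during training to avoid wasted compute.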
+ def _compute_language_model_loss(self, labels, logits) -> Tensor: + if self.training: + return torch.zeros_like(labels) + return self._compute_language_model_loss(labels, logits) + + if distill_cfg["skip_lm_loss"]: + model._compute_language_model_loss = model.compute_language_model_loss + model.compute_language_model_loss = types.MethodType(_compute_language_model_loss, model) + + +######################################################## + + +def _teacher_provider(cfg: DictConfig, trainer: Trainer) -> MCoreGPTModel: + """Teacher model factory (must be a non-local function to pickle).""" + logging.info("Distillation: Loading teacher weights...") + teacher_model_cfg = _merge_model_arch_fields(cfg, cfg.kd_teacher_restore_from_path) + + model = MegatronGPTModel.restore_from( + cfg.kd_teacher_restore_from_path, + override_config_path=teacher_model_cfg, + trainer=trainer, + ) + teacher_model_module_list = model.get_model_module_list() + logging.info("Distillation: ... teacher weights loaded.") + return teacher_model_module_list[0] + + +def _merge_model_arch_fields(cfg: DictConfig, model_load_path: str) -> DictConfig: + """Overwrite model-architecture fields of a config with a checkpoint's.""" + model_cfg = load_config(model_load_path) + model_arch_keys = [k for k in MODEL_ARCHITECHTURE_KEYS if k in model_cfg] + model_arch_cfg = OmegaConf.masked_copy(model_cfg, model_arch_keys) + with open_dict(cfg): + cfg = OmegaConf.merge(cfg, model_arch_cfg) + # Add tokenizer from model if not provided + if OmegaConf.is_missing(cfg.tokenizer, "model"): + cfg.tokenizer = model_cfg.tokenizer + return cfg + + +######################################################## + + +@hydra_runner(config_path="conf", config_name="megatron_llama_distill") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + trainer = MegatronTrainerBuilder(cfg).create_trainer() + exp_manager(trainer, cfg.exp_manager) + + with open_dict(cfg): + cfg.model.name = "modelopt" # Convert TE layernorm spec to unfused format + # HACK: Checkpoint-loading process hangs/loops if this isn't present here for some reason. + cfg.model.target = "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel" + + # Continual training + if cfg.model.get("restore_from_path") is not None: + # Option 1: Restore only the model weights from a .nemo file + logging.info(f"Continual training: loading weights from {cfg.model.restore_from_path}") + + # Merge model config's architecture fields with the one from the checkpoint + cfg.model = _merge_model_arch_fields(cfg.model, cfg.model.restore_from_path) + + model = DistillationMegatronGPTModel.restore_from( + restore_path=cfg.model.restore_from_path, + override_config_path=cfg.model, + trainer=trainer, + save_restore_connector=NLPSaveRestoreConnector(), + ) + logging.info("... weights loaded.") + elif cfg.model.get("restore_from_ckpt") is not None: + # Option 2: Restore both model weights and optimizer states from a PTL checkpoint + logging.info(f"Continual training: loading weights and optimizer states from {cfg.model.restore_from_ckpt}") + trainer.ckpt_path = Path(cfg.model.restore_from_ckpt) + model = DistillationMegatronGPTModel(cfg.model, trainer) + logging.info("... weights and optimizer states loaded.") + + # Start new pretraining or resume from a checkpoint if it exists + else: + logging.info("Instantiating new model ...") + model = DistillationMegatronGPTModel(cfg.model, trainer) + logging.info("... 
model instantiated.") + + trainer.fit(model) + + if cfg.model.nemo_path: + model.save_to(cfg.model.nemo_path) + else: + logging.warning("Skipping saving final model as no `model.nemo_path` provided.") + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 422319a382c8..1cc3d0aae27d 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -44,9 +44,12 @@ def main(cfg) -> None: if cfg.model.get("restore_from_path") is not None: # Option 1: Restore only the model weights from a .nemo file logging.info(f"Continual training: loading weights from {cfg.model.restore_from_path}") + from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel + + model_cfg = MegatronGPTSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) model = MegatronGPTModel.restore_from( restore_path=cfg.model.restore_from_path, - override_config_path=cfg.model, + override_config_path=model_cfg, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(), ) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 6517b62010b4..06551f46486c 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -158,6 +158,7 @@ model: index_mapping_dir: null # Path to a directory to write index mapping files. prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + global_sample_mapping: False # Whether to shuffle the replicated data all together, or shuffle the dataset within each epoch validation_ds: file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. names: null # Names of the corresponding datasets used to log metrics. @@ -181,6 +182,7 @@ model: prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + global_sample_mapping: False # Whether to shuffle the replicated data all together, or shuffle the dataset within each epoch metric: name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. @@ -208,6 +210,7 @@ model: prompt_template: ${model.data.train_ds.prompt_template} tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + global_sample_mapping: False # Whether to shuffle the replicated data all together, or shuffle the dataset within each epoch metric: name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] average: null # Average the metric over the dataset. 
Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. diff --git a/nemo/README.md b/nemo/README.md index 869ce2f50031..a6025e77822a 100644 --- a/nemo/README.md +++ b/nemo/README.md @@ -10,3 +10,7 @@ NeMo (**Ne**ural **Mo**dules) is a toolkit for creating AI applications built ar * Vision - collection of modules and models for building computer vision networks * Multimodal - collection of modules and models for building multimodal networks * Audio - collection of modules and models for building audio processing networks + +**Performance** + +Performance benchmarks for pre-training and fine-tuning of various models can be found [here](../docs/source/performance/performance_summary.md) diff --git a/nemo/collections/asr/data/audio_to_text_lhotse.py b/nemo/collections/asr/data/audio_to_text_lhotse.py index 21cecedeadbb..576ea8234c87 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse.py @@ -50,7 +50,19 @@ def __init__(self, tokenizer): def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: audio, audio_lens, cuts = self.load_audio(cuts) - tokens = [torch.as_tensor(self.tokenizer(c.supervisions[0].text, c.supervisions[0].language)) for c in cuts] + tokens = [ + torch.as_tensor( + sum( + ( + # Supervisions may come pre-tokenized from the dataloader. + s.tokens if hasattr(s, "tokens") else self.tokenizer(s.text, s.language) + for s in c.supervisions + ), + start=[], + ) + ) + for c in cuts + ] token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) tokens = collate_vectors(tokens, padding_value=0) return audio, audio_lens, tokens, token_lens diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py index 4779e3677b05..f7c6b6adff7f 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py @@ -11,18 +11,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Sequence +from dataclasses import dataclass +from typing import Callable, Union import torch.utils.data from lhotse import CutSet -from lhotse.cut import MixedCut, MonoCut from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors -from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper -from nemo.collections.common.prompts.canary import CanaryPromptFormatter -from nemo.collections.common.tokenizers import CanaryTokenizer, TokenizerSpec -from nemo.collections.common.tokenizers.canary_tokenizer import CANARY_SPECIAL_TOKENIZER +from nemo.collections.common.tokenizers import TokenizerSpec + + +@dataclass +class PromptedAudioToTextMiniBatch: + audio: torch.Tensor + audio_lens: torch.Tensor + transcript: torch.Tensor + transcript_lens: torch.Tensor + prompt: torch.Tensor + prompt_lens: torch.Tensor + prompted_transcript: torch.Tensor + prompted_transcript_lens: torch.Tensor + + def get_decoder_inputs_outputs(self) -> tuple[torch.Tensor, torch.Tensor]: + """ + Returns the inputs and outputs of transformer decoder for training. + The input is ``prompted_transcript`` (minus last token), + and the output is ``prompted_transcript`` (minus first token). 
+ """ + return self.prompted_transcript[:, :-1], self.prompted_transcript[:, 1:] class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset): @@ -45,143 +62,49 @@ class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset): def __init__( self, tokenizer: TokenizerSpec, - prompt_format_fn: Callable[[CutSet, TokenizerWrapper, bool], Sequence[Sequence[int]]], - inference: bool = False, + prompt_format_fn: Callable[ + [CutSet, TokenizerSpec], tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]] + ], ): super().__init__() - self.tokenizer = TokenizerWrapper(tokenizer) + self.tokenizer = tokenizer self.load_audio = AudioSamples(fault_tolerant=True) - self.padding_value = self.tokenizer._tokenizer.pad_id + self.padding_value = self.tokenizer.pad self.prompt_format_fn = prompt_format_fn - self.inference = inference - def __getitem__(self, cuts: CutSet) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch: audio, audio_lens, cuts = self.load_audio(cuts) - prompts_with_answers, prompts = self.prompt_format_fn(cuts, self.tokenizer, inference=self.inference) - - prompts_with_answers = [torch.as_tensor(t) for t in prompts_with_answers] - prompts_with_answers_lens = torch.tensor([t.size(0) for t in prompts_with_answers], dtype=torch.long) - prompts_with_answers = collate_vectors(prompts_with_answers, padding_value=self.padding_value) - - if self.inference: - prompts = [torch.as_tensor(t) for t in prompts] - prompts_lens = torch.tensor([t.size(0) for t in prompts], dtype=torch.long) - prompts = collate_vectors(prompts, padding_value=self.padding_value) - else: - prompts = None - prompts_lens = None - - return audio, audio_lens, prompts_with_answers, prompts_with_answers_lens, prompts, prompts_lens - - -# Mapping from a string name to a known prompt formatter function. -PROMPT_FORMAT_FNS = {} - - -def registered_prompt_format_fn(prompt_fn: Callable[[CutSet, TokenizerWrapper, bool], Sequence[Sequence[int]]]): - """ - Decorator for registering prompt functions under a name. - - Example:: - - >>> @registered_prompt_format_fn - ... def my_prompt(cuts, tokenizer): - ... pass - ... - ... prompt_fn = get_prompt_format_fn("my_prompt") - """ - global PROMPT_FORMAT_FNS - - PROMPT_FORMAT_FNS[prompt_fn.__name__] = prompt_fn - return prompt_fn - - -def get_prompt_format_fn(name: str) -> Callable[[CutSet, TokenizerWrapper, bool], Sequence[Sequence[int]]]: - if name not in PROMPT_FORMAT_FNS: - raise ValueError( - f"Unknown prompt format function name: {name} " f"(must be one of: {list(PROMPT_FORMAT_FNS.keys())}" - ) - return PROMPT_FORMAT_FNS[name] - - -@registered_prompt_format_fn -def canary( - cuts: CutSet, tokenizer: TokenizerWrapper, inference: bool = False -) -> tuple[list[torch.Tensor], list[torch.Tensor]]: - """ - Prepend and append control tokens to the token sequence as per Canary format. - - We use the following special tokens: - * <|startoftranscript|> - * <|transcribe|> - * <|translate|> - * <|nopnc|> - * <|pnc|> - * <|endoftext|> - * <|LANG|> - for each supported language. - * <|nospeech|> - - The prompt format syntax is as follows: - - <|startoftranscript|> [ <|nospeech|> | <|LANG|> [ <|transcribe|> | <|translate|> ] <|LANG|> [ <|pnc|> | <|nopnc|> ] TEXT <|endoftext|> ] - - Where expression ``[ a | b ]`` denotes expression ``a`` or expression ``b``, and can be nested. 
- Note that ``<|LANG|>`` appears twice: the first occurrence is for the "source" language - (i.e., spoken language in the recording) and the second occurrence is for the "target" language - (i.e., the language in which we are going to output the text). - """ - - assert isinstance( - tokenizer._tokenizer, CanaryTokenizer - ), "To use 'canary' prompt format, you must use the CanaryTokenizer." - formatter = CanaryPromptFormatter(tokenizer._tokenizer) - - prompts_with_answers, prompts = [], [] - for cut in cuts: - if isinstance(cut, MixedCut): - cut = cut._first_non_padding_cut - if not isinstance(cut, MonoCut): - raise TypeError( - f"Expected input audio to have a single channel (required MonoCut/MixedCut, but we received: {cut=})" + # Fast-path: the tokenization and prompt formatting was already done before sampling. + attrs = ("tokenized_prompt", "tokenized_transcript", "tokenized_prompted_transcript") + pre_formatted = all(hasattr(c, a) for c in cuts for a in attrs) + if pre_formatted: + prompts_with_answers, prompts, answers = zip( + *((c.tokenized_prompted_transcript, c.tokenized_prompt, c.tokenized_transcript) for c in cuts) ) - - # first, validate the utterance - expected_slots = set(formatter.get_slots("user")) - missing_keys = expected_slots - set(cut.custom) - if "task" in missing_keys and "taskname" in cut.custom: - # Compatibility with "old" Canary manifest format. - # For compatbility with inference options, this slot is now called "task". - cut.custom["task"] = cut.custom["taskname"] - missing_keys.remove("task") - if missing_keys: - raise RuntimeError( - f"We found cut with ID {cut.id} that is missing the following keys: {missing_keys}" - f"Please ensure that every utterance in the input manifests contains these keys." - ) - - encoded = formatter.encode_dialog( - turns=[ - dict( - role="user", - slots={ - **{slot: cut.custom[slot] for slot in expected_slots}, - formatter.PROMPT_LANGUAGE_SLOT: CANARY_SPECIAL_TOKENIZER, - }, - ), - dict( - role="assistant", - slots={ - "text": ' '.join(s.text for s in cut.supervisions), - formatter.PROMPT_LANGUAGE_SLOT: cut.supervisions[0].language, - }, - ), - ] + else: + prompts_with_answers, prompts, answers = self.prompt_format_fn(cuts, self.tokenizer) + + transcript, transcript_lens = self._collate_tokens(answers) + prompts_with_answers, prompts_with_answers_lens = self._collate_tokens(prompts_with_answers) + prompts, prompt_lens = self._collate_tokens(prompts) + + return PromptedAudioToTextMiniBatch( + audio=audio, + audio_lens=audio_lens, + transcript=transcript, + transcript_lens=transcript_lens, + prompt=prompts, + prompt_lens=prompt_lens, + prompted_transcript=prompts_with_answers, + prompted_transcript_lens=prompts_with_answers_lens, ) - prompts_with_answers.append(encoded["input_ids"]) - prompts.append(encoded["context_ids"]) - return prompts_with_answers, prompts + def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]: + tokens = [torch.as_tensor(t) for t in tokens] + token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) + tokens = collate_vectors(tokens, padding_value=self.padding_value) + return tokens, token_lens class ProbablyIncorrectLanguageKeyError(RuntimeError): diff --git a/nemo/collections/asr/data/data_simulation.py b/nemo/collections/asr/data/data_simulation.py index 5ee2ad19b951..369a95d9ee9a 100644 --- a/nemo/collections/asr/data/data_simulation.py +++ b/nemo/collections/asr/data/data_simulation.py @@ -1122,7 +1122,7 @@ def _generate_session( 
alignments=self._alignments, session_name=filename, speaker_id=speaker_ids[speaker_turn], - start=int(start / self._params.data_simulator.sr), + start=float(start / self._params.data_simulator.sr), ) self.annotator.annote_lists['ctm'].extend(new_ctm_entries) diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 5ad91e75a867..d2d2213be6e6 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -27,7 +27,7 @@ from nemo.collections.asr.data.audio_to_text_lhotse_prompted import ( PromptedAudioToTextLhotseDataset, - get_prompt_format_fn, + PromptedAudioToTextMiniBatch, ) from nemo.collections.asr.metrics import BLEU, WER from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel @@ -47,6 +47,7 @@ from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init from nemo.collections.common.parts.preprocessing.manifest import get_full_path +from nemo.collections.common.prompts.fn import get_prompt_format_fn from nemo.collections.common.prompts.formatter import PromptFormatter from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( @@ -498,7 +499,7 @@ def transcribe( return super().transcribe(audio=audio, override_config=trcfg) - def _setup_dataloader_from_config(self, config: Optional[Dict], inference: bool = False): + def _setup_dataloader_from_config(self, config: Optional[Dict]): assert config.get("use_lhotse", False), ( "Multi-task model only supports dataloading with Lhotse. " "Please set config.{train,validation,test}_ds.use_lhotse=True" @@ -510,8 +511,8 @@ def _setup_dataloader_from_config(self, config: Optional[Dict], inference: bool dataset=PromptedAudioToTextLhotseDataset( tokenizer=self.tokenizer, prompt_format_fn=get_prompt_format_fn(self.prompt_format), - inference=inference, ), + tokenizer=self.tokenizer, ) def setup_training_data(self, train_data_config: Optional[DictConfig]): @@ -554,7 +555,7 @@ def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict # preserve config self._update_dataset_config(dataset_name='validation', config=val_data_config) - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config, inference=True) + self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): """ @@ -570,7 +571,7 @@ def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): # preserve config self._update_dataset_config(dataset_name='test', config=test_data_config) - self._test_dl = self._setup_dataloader_from_config(config=test_data_config, inference=True) + self._test_dl = self._setup_dataloader_from_config(config=test_data_config) @property def input_types(self) -> Optional[Dict[str, NeuralType]]: @@ -664,41 +665,46 @@ def forward( return transf_log_probs, encoded_len, enc_states, enc_mask # PTL-specific methods - def training_step(self, batch, batch_nb): + def training_step(self, batch: PromptedAudioToTextMiniBatch, batch_nb): if batch is None: return torch.tensor([0.0]) - # During training prompt and prompt_len are null, ignore. 
- signal, signal_len, transcript, transcript_len, prompt, prompt_len = batch - input_ids, labels = transcript[:, :-1], transcript[:, 1:] + input_ids, labels = batch.get_decoder_inputs_outputs() transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( - input_signal=signal, - input_signal_length=signal_len, + input_signal=batch.audio, + input_signal_length=batch.audio_lens, transcript=input_ids, - transcript_length=transcript_len, + transcript_length=batch.prompted_transcript_lens, ) audio_loss = self.loss(log_probs=transf_log_probs, labels=labels) + num_frames = batch.audio_lens.sum() + num_tokens = batch.prompted_transcript_lens.sum() + tot_frames = batch.audio.numel() + tot_tokens = batch.prompted_transcript.numel() tensorboard_logs = { 'train_loss': audio_loss, 'learning_rate': self._optimizer.param_groups[0]['lr'], + 'batch_size': batch.audio.shape[0], + 'num_frames': num_frames, + 'num_tokens': num_tokens, + 'input_to_padding_ratio': num_frames / tot_frames, + 'output_to_padding_ratio': num_tokens / tot_tokens, } return {'loss': audio_loss, 'log': tensorboard_logs} - def validation_pass(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): - # During inference, dataloader passes pure prompt without transcript text. - signal, signal_len, transcript, transcript_len, prompt, prompt_len = batch - input_ids, labels = transcript[:, :-1], transcript[:, 1:] + def validation_pass(self, batch: PromptedAudioToTextMiniBatch, batch_idx, dataloader_idx=0, eval_mode="val"): + input_ids, labels = batch.get_decoder_inputs_outputs() transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( - input_signal=signal, - input_signal_length=signal_len, + input_signal=batch.audio, + input_signal_length=batch.audio_lens, transcript=input_ids, - transcript_length=transcript_len, + transcript_length=batch.prompted_transcript_lens, ) transf_loss = self.loss(log_probs=transf_log_probs, labels=labels) @@ -710,10 +716,10 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): self.wer.update( predictions=enc_states, predictions_lengths=encoded_len, - targets=transcript, - targets_lengths=transcript_len, + targets=batch.transcript, + targets_lengths=batch.transcript_lens, predictions_mask=enc_mask, - input_ids=prompt, + input_ids=batch.prompt, ) wer, wer_num, wer_denom = self.wer.compute() output_dict.update({"val_wer": wer, "val_wer_num": wer_num, "val_wer_denom": wer_denom}) @@ -722,10 +728,10 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): self.bleu.update( predictions=enc_states, predictions_lengths=encoded_len, - targets=transcript, - targets_lengths=transcript_len, + targets=batch.transcript, + targets_lengths=batch.transcript_lens, predictions_mask=enc_mask, - input_ids=prompt, + input_ids=batch.prompt, ) bleu_metrics = self.bleu.compute(prefix=f"{eval_mode}_") output_dict.update(bleu_metrics) @@ -823,7 +829,9 @@ def _transcribe_input_manifest_processing( return super()._transcribe_input_manifest_processing(audio_files, temp_dir, trcfg) - def _transcribe_forward(self, batch: Any, trcfg: MultiTaskTranscriptionConfig): + def _transcribe_forward( + self, batch: PromptedAudioToTextMiniBatch | tuple[torch.Tensor, ...], trcfg: MultiTaskTranscriptionConfig + ) -> dict: """ Internal function to perform the model's custom forward pass to return outputs that are processed by `_transcribe_output_processing()`. 
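A note on the shifted-input convention used by `training_step` and `validation_pass` above (via `PromptedAudioToTextMiniBatch.get_decoder_inputs_outputs()`): the decoder is teacher-forced on the prompted transcript minus its last token and trained to predict the same sequence minus its first token. A minimal, self-contained sketch with toy tensors (illustrative only, not NeMo code):

    import torch

    # Toy prompted transcript: prompt tokens followed by answer tokens, batch size 1.
    prompted_transcript = torch.tensor([[101, 7, 8, 9, 102]])  # shape [B, T]

    # Teacher forcing: the decoder input drops the last token ...
    decoder_inputs = prompted_transcript[:, :-1]  # [[101, 7, 8, 9]]
    # ... and the training labels drop the first token, so position i predicts token i+1.
    labels = prompted_transcript[:, 1:]           # [[7, 8, 9, 102]]

    assert decoder_inputs.shape == labels.shape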
@@ -836,13 +844,25 @@ def _transcribe_forward(self, batch: Any, trcfg: MultiTaskTranscriptionConfig): Returns: The model's outputs that are processed by `_transcribe_output_processing()`. """ - log_probs, encoded_len, enc_states, enc_mask = self.forward( - input_signal=batch[0], input_signal_length=batch[1] - ) - if len(batch) == 6: - # Prompt provided by the dataloader. - decoder_input_ids = batch[4] + if isinstance(batch, PromptedAudioToTextMiniBatch): + # Handling regular Canary DataLoader + audio = batch.audio + audio_lens = batch.audio_lens + decoder_input_ids = batch.prompt else: + # Handling TensorDataset / external DataLoader + audio, audio_lens = batch[0], batch[1] + if len(batch) == 6: + # Prompt provided by the user. + decoder_input_ids = batch[4] + else: + # Prompt to be built dynamically. + decoder_input_ids = None + batch_size = audio.shape[0] + + log_probs, encoded_len, enc_states, enc_mask = self.forward(input_signal=audio, input_signal_length=audio_lens) + + if decoder_input_ids is None: # The dataloader provided only audio + audio_lens, so we # are constructing the prompt dynamically using TranscribeConfig. @@ -877,17 +897,17 @@ def _transcribe_forward(self, batch: Any, trcfg: MultiTaskTranscriptionConfig): decoder_input_ids = ( self.prompt.encode_dialog(turns=turns)["context_ids"] .unsqueeze(0) - .repeat(batch[0].shape[0], 1) + .repeat(batch_size, 1) .to(trcfg._internal.device) ) - output = dict( + + return dict( log_probs=log_probs, encoded_lengths=encoded_len, encoder_states=enc_states, encoder_mask=enc_mask, decoder_input_ids=decoder_input_ids, ) - return output def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionConfig) -> GenericTranscriptionType: """ @@ -954,7 +974,7 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo 'channel_selector': config.get('channel_selector', None), } - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config), inference=True) + temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) return temporary_datalayer def _transcribe_on_end(self, trcfg: MultiTaskTranscriptionConfig): @@ -989,13 +1009,10 @@ def _may_be_make_dict_and_fix_paths(self, json_items, manifest_path, trcfg: Mult entry = { 'audio_filepath': item, 'duration': 100000, - trcfg.text_field: 'nothing', } elif isinstance(item, dict): entry = item entry['audio_filepath'] = get_full_path(entry['audio_filepath'], manifest_file=manifest_path) - if trcfg.text_field not in entry: - entry[trcfg.text_field] = 'nothing' else: raise ValueError(f"Expected str or dict, got {type(item)}") default_turn = [t for t in trcfg.prompt if t["role"] == "user"] @@ -1017,40 +1034,69 @@ def get_transcribe_config(cls) -> MultiTaskTranscriptionConfig: """ return MultiTaskTranscriptionConfig() - def predict_step(self, batch, batch_idx=0, dataloader_idx=0, has_processed_signal=False): - signal, signal_len, _, _, prompt, prompt_len = batch - - processed_signal = None - processed_signal_length = None + def predict_step( + self, batch: PromptedAudioToTextMiniBatch, batch_idx=0, dataloader_idx=0, has_processed_signal=False + ): if has_processed_signal: - processed_signal = signal - processed_signal_length = signal_len + processed_signal = batch.audio + processed_signal_length = batch.audio_lens signal = None signal_len = None + else: + processed_signal = None + processed_signal_length = None + signal = batch.audio + signal_len = batch.audio_lens - transf_log_probs, encoded_len, enc_states, enc_mask 
= self.forward( + _, _, enc_states, enc_mask = self.forward( input_signal=signal, input_signal_length=signal_len, processed_signal=processed_signal, processed_signal_length=processed_signal_length, - transcript=prompt, - transcript_length=prompt_len, ) text = self.decoding.decode_predictions_tensor( encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, - decoder_input_ids=prompt, + decoder_input_ids=batch.prompt, return_hypotheses=False, )[0] - text = [self.decoding.strip_special_tokens(t) for t in text] return text @property def adapter_module_names(self) -> List[str]: return ['', 'encoder', 'transf_encoder', 'transf_decoder'] + @property + def oomptimizer_schema(self) -> dict: + """ + Return a typing schema for optimal batch size calibration for various + sequence lengths using OOMptimizer. + """ + return { + "cls": PromptedAudioToTextMiniBatch, + "inputs": [ + {"name": "audio", "type": NeuralType(("B", "T"), AudioSignal()), "seq_length": "input"}, + {"name": "audio_lens", "type": NeuralType(("B",), LengthsType()), "seq_length": "input"}, + { + "name": "prompted_transcript", + "type": NeuralType(("B", "T"), LabelsType()), + "seq_length": "output", + "vocab_size": self.tokenizer.vocab_size, + }, + { + "name": "prompted_transcript_lens", + "type": NeuralType(("B",), LengthsType()), + "seq_length": "output", + }, + {"name": "transcript", "type": "dummy"}, + {"name": "transcript_lens", "type": "dummy"}, + {"name": "prompt", "type": "dummy"}, + {"name": "prompt_lens", "type": "dummy"}, + ], + } + def parse_multitask_prompt(prompt: dict | None) -> list[dict]: if prompt is None or not prompt: diff --git a/nemo/collections/asr/models/asr_model.py b/nemo/collections/asr/models/asr_model.py index 24e300aff112..a54a37fe5371 100644 --- a/nemo/collections/asr/models/asr_model.py +++ b/nemo/collections/asr/models/asr_model.py @@ -22,6 +22,7 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.exportable import Exportable from nemo.core.classes.mixins import AccessMixin +from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType from nemo.core.utils.neural_type_utils import get_io_names from nemo.utils import logging, model_utils from nemo.utils.cast_utils import cast_all @@ -218,6 +219,26 @@ def on_predict_epoch_start(self) -> None: """ WithOptionalCudaGraphs.enable_cuda_graphs_recursive(self, attribute_path="decoding.decoding") + @property + def oomptimizer_schema(self) -> dict: + """ + Return a typing schema for optimal batch size calibration for various + sequence lengths using OOMptimizer. 
+ """ + return { + "cls": tuple, + "inputs": [ + {"type": NeuralType(("B", "T"), AudioSignal()), "seq_length": "input"}, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "input"}, + { + "type": NeuralType(("B", "T"), LabelsType()), + "seq_length": "output", + "vocab_size": self.tokenizer.vocab_size, + }, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "output"}, + ], + } + class ExportableEncDecModel(Exportable): """ diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index f861a971f5ea..2e313ce3c928 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -100,6 +100,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): global_rank=self.global_rank, world_size=self.world_size, dataset=LhotseSpeechToTextBpeDataset(tokenizer=self.tokenizer), + tokenizer=self.tokenizer, ) dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( @@ -324,7 +325,10 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig): decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer,) + self.decoding = CTCBPEDecoding( + decoding_cfg=decoding_cfg, + tokenizer=self.tokenizer, + ) self.wer = WER( decoding=self.decoding, diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 39375f08e139..089c34d98884 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -96,7 +96,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.cfg.decoding = self.set_decoding_type_according_to_loss(self.cfg.decoding) # Setup decoding object self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=self.cfg.decoding, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) # Setup wer object @@ -139,7 +142,10 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): config, global_rank=self.global_rank, world_size=self.world_size, - dataset=LhotseSpeechToTextBpeDataset(tokenizer=self.tokenizer,), + dataset=LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + ), + tokenizer=self.tokenizer, ) dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( @@ -322,7 +328,10 @@ def change_vocabulary( decoding_cfg = self.set_decoding_type_according_to_loss(decoding_cfg) self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=decoding_cfg, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) self.wer = WER( @@ -429,7 +438,10 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type decoding_cfg = self.set_decoding_type_according_to_loss(decoding_cfg) self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=decoding_cfg, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) self.wer = WER( diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index bb4e7f718a8e..48e857af4033 100644 --- 
a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -317,7 +317,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.cfg.decoding = self.set_decoding_type_according_to_loss(self.cfg.decoding) # Setup decoding object self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=self.cfg.decoding, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) # Setup wer object @@ -415,7 +418,10 @@ def change_vocabulary( decoding_cfg = self.set_decoding_type_according_to_loss(decoding_cfg) self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=decoding_cfg, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) self.wer = WER( @@ -465,7 +471,10 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig): decoding_cfg = self.set_decoding_type_according_to_loss(decoding_cfg) self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, + decoding_cfg=decoding_cfg, + decoder=self.decoder, + joint=self.joint, + tokenizer=self.tokenizer, ) self.wer = WER( @@ -497,7 +506,10 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): config, global_rank=self.global_rank, world_size=self.world_size, - dataset=LhotseSpeechToTextBpeDataset(tokenizer=self.tokenizer,), + dataset=LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + ), + tokenizer=self.tokenizer, ) dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 9970b4970236..089186e142bf 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -230,6 +230,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): dataset=LhotseSpeechToTextBpeDataset( tokenizer=self.tokenizer, ), + tokenizer=self.tokenizer, ) dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( diff --git a/nemo/collections/asr/modules/transformer/transformer_generators.py b/nemo/collections/asr/modules/transformer/transformer_generators.py index 1a38e7fa4b6c..e6775a48f635 100--- a/nemo/collections/asr/modules/transformer/transformer_generators.py +++ b/nemo/collections/asr/modules/transformer/transformer_generators.py @@ -15,7 +15,9 @@ from contextlib import contextmanager import torch +from torch.distributions import Categorical +from nemo.collections.asr.parts.submodules.token_classifier import TokenClassifier from nemo.collections.common.parts import NEG_INF, mask_padded_tokens __all__ = [ @@ -30,12 +32,13 @@ class GreedySequenceGenerator: """ Greedy sequence generator based on the decoder followed by log_softmax. + Optionally supports temperature sampling with ``n_samples`` and ``temperature`` options.
Args: embedding: nn.Module, transforms input_ids into vector embeddings decoder: nn.Module, takes embeddings and produces hidden_states - log_softmax: nn.Module, takes hidden_states and produces log_probs - which correspond to probability distribution of tokens (ids) + classifier: nn.Module, takes hidden_states and produces + logits or log-probability distribution of tokens (ids) pad: index of padding token in the vocabulary bos: index of beginning of sequence token in the vocabulary eos: index of end of sequence token in the vocabulary @@ -45,28 +48,35 @@ class GreedySequenceGenerator: source sequences plus max_delta_length batch_size: size of the batch of generated sequences if neither source nor target starting sequences are provided + n_samples: number of sequences to generate (requires ``temperature`` to be set) + temperature: temperature for temperature sampling. Even with ``n_samples`` set to 1, + enabling temperature will sample hypotheses instead of returning the best ones. """ def __init__( self, embedding, decoder, - log_softmax, + classifier: TokenClassifier, pad=0, bos=1, eos=2, max_sequence_length=512, max_delta_length=20, batch_size=1, + n_samples=1, + temperature=None, ): super().__init__() self.embedding = embedding self.decoder = decoder - self.log_softmax = log_softmax + self.classifier = classifier self.pad, self.bos, self.eos = pad, bos, eos self.max_seq_length = max_sequence_length self.max_delta_len = max_delta_length self.batch_size = batch_size + self.n_samples = n_samples + self.temperature = temperature def _one_step_forward( self, @@ -75,6 +85,7 @@ def _one_step_forward( encoder_input_mask=None, decoder_mems_list=None, pos=0, + return_scores: bool = True, ): """ One step of autoregressive output generation. @@ -107,8 +118,9 @@ def _one_step_forward( decoder_mems_list = self.decoder.forward( decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True ) - log_probs = self.log_softmax.forward(hidden_states=decoder_mems_list[-1][:, -1:]) - return log_probs, decoder_mems_list + with self.classifier.with_log_softmax_enabled(return_scores) as clf: + logits = clf.forward(hidden_states=decoder_mems_list[-1][:, -1:]) + return logits, decoder_mems_list def _prepare_for_search(self, decoder_input_ids=None, encoder_hidden_states=None): """ @@ -145,30 +157,57 @@ def _forward( self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False ): assert not return_beam_scores + is_sampling = self.temperature is not None and self.n_samples > 1 + tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) + if is_sampling: + tgt = torch.repeat_interleave(tgt, self.n_samples, dim=0) + encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, self.n_samples, dim=0) + encoder_input_mask = torch.repeat_interleave(encoder_input_mask, self.n_samples, dim=0) + orig_batch_size = batch_size + batch_size = batch_size * self.n_samples # pad profile tracks sequences ending with token to replace # everything after with token decoder_parameter = next(self.decoder.parameters()) - pad_profile = torch.zeros(batch_size, 1).long().to(decoder_parameter.device) + pad_profile = torch.zeros(batch_size).long().to(decoder_parameter.device) decoder_mems_list = None for i in range(max_generation_length): - log_probs, decoder_mems_list = self._one_step_forward( - tgt[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i + if i == 0: + input_ids = tgt + else: + input_ids 
= tgt[:, -1:] + + logits, decoder_mems_list = self._one_step_forward( + input_ids, + encoder_hidden_states, + encoder_input_mask, + decoder_mems_list, + i, + return_scores=return_beam_scores, ) - next_tokens = torch.argmax(log_probs[:, -1], dim=-1, keepdim=True) + if self.temperature is None: # Greedy decoding + next_tokens = torch.argmax(logits[:, -1], dim=-1) + else: # Temperature sampling + next_tokens = Categorical(logits=logits[:, -1] / self.temperature).sample() + next_tokens = self.pad * pad_profile + next_tokens * (1 - pad_profile) pad_profile = torch.max(pad_profile, (next_tokens == self.eos).long()) - tgt = torch.cat((tgt, next_tokens), dim=-1) + tgt = torch.cat((tgt, next_tokens.unsqueeze(1)), dim=-1) # abort generation if all sequences end with if pad_profile.sum() == batch_size: break - return tgt + samples = None + if is_sampling: + samples = list(tgt.view(orig_batch_size, self.n_samples, -1)) + tgt = tgt[:: self.n_samples] + + return tgt, samples def __call__( self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False @@ -195,9 +234,9 @@ def freeze(self) -> None: for param in self.decoder.parameters(): param.requires_grad = False self.decoder.eval() - for param in self.log_softmax.parameters(): + for param in self.classifier.parameters(): param.requires_grad = False - self.log_softmax.eval() + self.classifier.eval() def unfreeze(self) -> None: """Unfreeze weights of embedding, decoder, and classification layers.""" @@ -207,14 +246,14 @@ def unfreeze(self) -> None: for param in self.decoder.parameters(): param.requires_grad = True self.decoder.train() - for param in self.log_softmax.parameters(): + for param in self.classifier.parameters(): param.requires_grad = True - self.log_softmax.train() + self.classifier.train() @contextmanager def as_frozen(self): """ - Context manager which temporarily freezes embedding, decoder, and log_softmax modules, + Context manager which temporarily freezes embedding, decoder, and classifier modules, yields control and finally unfreezes the modules. """ self.freeze() @@ -252,9 +291,15 @@ def _one_step_forward( encoder_input_mask=None, decoder_mems_list=None, pos=0, + return_scores: bool = True, ): log_probs, decoder_mems_list = super()._one_step_forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos + decoder_input_ids, + encoder_hidden_states, + encoder_input_mask, + decoder_mems_list, + pos, + return_scores=return_scores, ) batch_size, seq_len, vocab_size = log_probs.size() diff --git a/nemo/collections/asr/parts/mixins/transcription.py b/nemo/collections/asr/parts/mixins/transcription.py index 2105097d0aff..84dd4aed9bce 100644 --- a/nemo/collections/asr/parts/mixins/transcription.py +++ b/nemo/collections/asr/parts/mixins/transcription.py @@ -17,7 +17,7 @@ import tempfile from abc import ABC, abstractmethod from collections.abc import Iterable -from dataclasses import dataclass +from dataclasses import dataclass, fields, is_dataclass from functools import partial from typing import Any, Dict, List, Optional, Tuple, Union @@ -69,13 +69,18 @@ class TranscribeConfig: def move_to_device(batch, device, non_blocking=False): """ Recursively move all tensors in `batch` to `device`. + Supports tensors, lists, tuples, dictionaries, and dataclasses. 
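A short usage sketch of the dataclass branch added to `move_to_device` above (the `ToyBatch` dataclass is made up for illustration; real callers pass e.g. `PromptedAudioToTextMiniBatch`):

from dataclasses import dataclass

import torch

@dataclass
class ToyBatch:  # hypothetical stand-in for a NeMo mini-batch dataclass
    audio: torch.Tensor
    audio_lens: torch.Tensor
    meta: dict

batch = ToyBatch(
    audio=torch.zeros(2, 16000),
    audio_lens=torch.tensor([16000, 8000]),
    meta={"ids": torch.arange(2)},
)
# The dataclass is rebuilt field by field; tensors nested inside dicts/lists are moved too.
moved = move_to_device(batch, device="cpu")
assert isinstance(moved, ToyBatch) and moved.audio.device.type == "cpu"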
""" if isinstance(batch, torch.Tensor): return batch.to(device, non_blocking=non_blocking) elif isinstance(batch, (list, tuple)): - return [move_to_device(x, device, non_blocking) for x in batch] + return type(batch)(move_to_device(x, device, non_blocking) for x in batch) elif isinstance(batch, dict): return {k: move_to_device(v, device, non_blocking) for k, v in batch.items()} + elif is_dataclass(batch): + return type(batch)( + **{field.name: move_to_device(getattr(batch, field.name), device, non_blocking) for field in fields(batch)} + ) else: return batch # do nothing if not supported type diff --git a/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py b/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py index ab3938eebe35..e181772b7f18 100644 --- a/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_beam_decoding.py @@ -14,7 +14,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from typing import List, Optional, Union import torch @@ -212,7 +212,7 @@ def forward( return (packed_result,) - def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: torch.Tensor | None) -> None: + def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: Union[torch.Tensor, None]) -> None: """ For each hypothesis in the mini-batch: * Remove the decoder input ids (prompt) from the predictions @@ -231,9 +231,12 @@ def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: hyp.y_sequence = hyp.y_sequence[prefix.shape[0] :] for hyp in packed_result: ids = hyp.y_sequence + ids_len = ids.shape[0] pos = -1 while ids[pos] == self.pad or ids[pos] == self.eos: pos -= 1 + if ids_len + pos == -1: + break # empty sequence if pos < -1: hyp.y_sequence = ids[: pos + 1] diff --git a/nemo/collections/asr/parts/submodules/multitask_decoding.py b/nemo/collections/asr/parts/submodules/multitask_decoding.py index e2ed2ca5c4bf..715ee7168037 100644 --- a/nemo/collections/asr/parts/submodules/multitask_decoding.py +++ b/nemo/collections/asr/parts/submodules/multitask_decoding.py @@ -25,6 +25,10 @@ AEDBeamInferConfig, TransformerAEDBeamInfer, ) +from nemo.collections.asr.parts.submodules.multitask_greedy_decoding import ( + AEDGreedyInferConfig, + TransformerAEDGreedyInfer, +) from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses from nemo.collections.common.tokenizers.aggregate_tokenizer import AggregateTokenizer from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -60,11 +64,9 @@ class AbstractMultiTaskDecoding(ABC): The config may further contain the following sub-dictionaries: "greedy": - max_symbols: int, describing the maximum number of target tokens to decode per - timestep during greedy decoding. Setting to larger values allows longer sentences - to be decoded, at the cost of increased execution time. - preserve_frame_confidence: Same as above, overrides above value. - confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg. + temperature: None (disabled) or float, specifying this enables temperature sampling instead of greedy decoding. + max_generation_delta: int = -1 # -1 means up to the max length of the decoder + preserve_alignments: bool = False (unsupported) "beam": beam_size: int, defining the beam size for beam search. Must be >= 1. 
@@ -103,34 +105,47 @@ def __init__( self.preserve_alignments = self.cfg.get('preserve_alignments', None) self.compute_langs = self.cfg.get('compute_langs', False) self.compute_hypothesis_token_set = self.cfg.get('compute_hypothesis_token_set', False) + self.transformer_decoder = transformer_decoder + self.log_softmax_module = log_softmax_module + self.tokenizer = tokenizer + + self.change_strategy(self.cfg.strategy) + def change_strategy(self, strategy: str) -> "AbstractMultiTaskDecoding": possible_strategies = ['greedy', 'greedy_batch', 'beam'] - if self.cfg.strategy not in possible_strategies: - raise ValueError(f"Decoding strategy must be one of {possible_strategies}") + if strategy not in possible_strategies: + raise ValueError(f"Decoding strategy must be one of {possible_strategies}" f"but was provided {strategy}") # Update preserve alignments if self.preserve_alignments is None: - if self.cfg.strategy in ['greedy', 'greedy_batch']: + if strategy in ['greedy', 'greedy_batch']: self.preserve_alignments = self.cfg.greedy.get('preserve_alignments', False) - elif self.cfg.strategy in ['beam']: + elif strategy in ['beam']: self.preserve_alignments = self.cfg.beam.get('preserve_alignments', False) - if self.cfg.strategy == 'greedy' or self.cfg.strategy == 'greedy_batch': + if strategy in ['greedy', 'greedy_batch']: - # self.decoding = None - raise NotImplementedError("Greedy decoding is not implemented yet.") + self.decoding = TransformerAEDGreedyInfer( + transformer_decoder=self.transformer_decoder, + log_softmax_module=self.log_softmax_module, + tokenizer=self.tokenizer, + max_generation_delta=self.cfg.greedy.get('max_generation_delta', -1), + preserve_alignments=self.preserve_alignments, + temperature=self.cfg.greedy.temperature, + n_samples=self.cfg.greedy.n_samples, + ) - elif self.cfg.strategy == 'beam': + elif strategy == 'beam': self.decoding = TransformerAEDBeamInfer( - transformer_decoder=transformer_decoder, - log_softmax_module=log_softmax_module, - tokenizer=tokenizer, + transformer_decoder=self.transformer_decoder, + log_softmax_module=self.log_softmax_module, + tokenizer=self.tokenizer, search_type=self.cfg.beam.get('search_type', 'default'), beam_size=self.cfg.beam.beam_size, length_penalty=self.cfg.beam.get('length_penalty', 0.0), - max_generation_delta=self.cfg.beam.get('max_generation_delta', 50), + max_generation_delta=self.cfg.beam.get('max_generation_delta', -1), return_best_hypothesis=self.cfg.beam.get('return_best_hypothesis', True), preserve_alignments=self.preserve_alignments, ) @@ -139,7 +154,7 @@ def __init__( raise ValueError( f"Incorrect decoding strategy provided. 
Must be one of {possible_strategies}\n" - f"but was provided {self.cfg.strategy}" + f"but was provided {strategy}" ) def decode_predictions_tensor( @@ -465,9 +480,7 @@ class MultiTaskDecodingConfig: compute_langs: bool = False # greedy decoding config - # greedy: rnnt_greedy_decoding.GreedyBatchedRNNTInferConfig = field( - # default_factory=rnnt_greedy_decoding.GreedyBatchedRNNTInferConfig - # ) + greedy: AEDGreedyInferConfig = field(default_factory=AEDGreedyInferConfig) # beam decoding config beam: AEDBeamInferConfig = field(default_factory=lambda: AEDBeamInferConfig(beam_size=1)) diff --git a/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py b/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py new file mode 100644 index 000000000000..b38c02574d5b --- /dev/null +++ b/nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py @@ -0,0 +1,242 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List, Optional, Union + +import torch + +from nemo.collections.asr.modules.transformer import GreedySequenceGenerator +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses +from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec +from nemo.core import Typing, typecheck +from nemo.core.neural_types import ChannelType, HypothesisType, LabelsType, MaskType, NeuralType +from nemo.utils import logging + + +def pack_hypotheses( + hypotheses: List[Hypothesis], beam_hypotheses: torch.Tensor, scores: List[Optional[float]] +) -> List[Hypothesis]: + + for idx, hyp in enumerate(hypotheses): # type: Hypothesis + if scores[idx] is not None: + hyp.score = scores[idx] + + hypi = beam_hypotheses[idx] + if torch.is_tensor(hypi): + hyp.y_sequence = hypi.long() + else: + hyp.y_sequence = torch.tensor(hypi, dtype=torch.long) + + if hyp.dec_state is not None: + hyp.dec_state = _states_to_device(hyp.dec_state) + + return hypotheses + + +def _states_to_device(dec_state, device='cpu'): + if torch.is_tensor(dec_state): + dec_state = dec_state.to(device) + + elif isinstance(dec_state, (list, tuple)): + dec_state = tuple(_states_to_device(dec_i, device) for dec_i in dec_state) + + return dec_state + + +class AEDGreedyInfer(ABC): + def __init__( + self, + transformer_decoder: torch.nn.Module, + log_softmax_module: torch.nn.Module, + tokenizer: TokenizerSpec, + search_type: str = 'default', + preserve_alignments: bool = False, + ): + super().__init__() + + self.transformer_decoder = transformer_decoder + self.log_softmax_module = log_softmax_module + self.tokenizer = tokenizer + self.search_type = search_type + + self.preserve_alignments = preserve_alignments + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + @abstractmethod + def forward( + self, + encoder_hidden_states: torch.Tensor, + encoder_input_mask: torch.Tensor, + decoder_input_ids: 
Optional[torch.Tensor] = None, + partial_hypotheses: Optional[List[Hypothesis]] = None, + ): + raise NotImplementedError() + + def set_decoding_type(self, decoding_type: str): + self.decoding_type = decoding_type + + +class TransformerAEDGreedyInfer(AEDGreedyInfer, Typing): + """ + A greedy decoder engine for AED Transformer models with support for temperature sampling. + """ + + @property + def input_types(self): + """Returns definitions of module input ports.""" + # Input can be of dimention - + # ('B', 'T', 'D') [Log probs] or ('B', 'T') [Labels] + + return { + "encoder_hidden_states": NeuralType(tuple(('B', 'T', 'D')), ChannelType()), + "encoder_input_mask": NeuralType(tuple(('B', 'T')), MaskType()), + "decoder_input_ids": NeuralType(('B', 'T'), LabelsType()), + "partial_hypotheses": NeuralType(optional=True), + } + + @property + def output_types(self): + """Returns definitions of module output ports.""" + return {"predictions": [NeuralType(elements_type=HypothesisType())]} + + def __init__( + self, + transformer_decoder: torch.nn.Module, + log_softmax_module: torch.nn.Module, + tokenizer: TokenizerSpec, + temperature: float | None = None, + max_generation_delta: int = 50, + preserve_alignments: bool = False, + n_samples: int = 1, + ): + super().__init__( + transformer_decoder=transformer_decoder, + log_softmax_module=log_softmax_module, + tokenizer=tokenizer, + preserve_alignments=preserve_alignments, + ) + self.temperature = temperature + self.n_samples = n_samples + self.bos = tokenizer.bos + self.pad = tokenizer.pad + self.eos = tokenizer.eos + self.greedy_search = GreedySequenceGenerator( + embedding=transformer_decoder.embedding, + decoder=transformer_decoder.decoder, + classifier=log_softmax_module, + max_sequence_length=transformer_decoder.max_sequence_length, + bos=self.bos, + pad=self.pad, + eos=self.eos, + max_delta_length=max_generation_delta, + temperature=self.temperature, + n_samples=n_samples, + ) + + self.preserve_alignments = preserve_alignments + if self.preserve_alignments: + logging.info( + "Preservation of alignments was requested but {} does not implement it.".format( + self.__class__.__name__ + ) + ) + + @typecheck() + def forward( + self, + encoder_hidden_states: torch.Tensor, + encoder_input_mask: torch.Tensor, + decoder_input_ids: Optional[torch.Tensor] = None, + partial_hypotheses: Optional[List[Hypothesis]] = None, + ): + """Returns a list of hypotheses given an input batch of the encoder hidden embedding. + Output token is generated auto-repressively. + + Args: + decoder_output: A tensor of size (batch, timesteps, features) or (batch, timesteps) (each timestep is a label). + decoder_lengths: list of int representing the length of each sequence + output sequence. + + Returns: + packed list containing batch number of sentences (Hypotheses). 
+ """ + with torch.inference_mode(): + best_hypo, topk_hypotheses = self.greedy_search( + encoder_hidden_states=encoder_hidden_states, + encoder_input_mask=encoder_input_mask, + decoder_input_ids=decoder_input_ids, + ) + + if topk_hypotheses is not None: + topk_hypotheses = [x.detach().cpu() for x in topk_hypotheses] # each item is [beam, seq_len] + beam_scores = [[None] * self.n_samples for _ in topk_hypotheses] # each item is [beam,] + packed_result = [] + for i in range(len(topk_hypotheses)): + # Pack results into Hypotheses + hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestep=[]) for _ in range(self.n_samples)] + self.format_hypotheses(hypotheses, decoder_input_ids) + packed_result.append( + NBestHypotheses(pack_hypotheses(hypotheses, topk_hypotheses[i], beam_scores[i])) + ) + else: + beam_scores = [None for _ in range(len(best_hypo))] + best_hypo = best_hypo.cpu() + hypotheses = [ + Hypothesis(score=0.0, y_sequence=[], timestep=[]) for _ in range(encoder_hidden_states.shape[0]) + ] + # Pack results into Hypotheses + packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores) + self.format_hypotheses(packed_result, decoder_input_ids) + + return (packed_result,) + + def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids: Union[torch.Tensor, None]) -> None: + """ + For each hypothesis in the mini-batch: + * Remove the decoder input ids (prompt) from the predictions + * Remove BOS, EOS, and PAD ids from the predictions. + Modifies results in-place. + """ + if decoder_input_ids is not None: + assert ( + len(packed_result) == decoder_input_ids.shape[0] + ), f"Mismatching number of examples {len(packed_result)=} {decoder_input_ids.shape[0]=}" + decoder_input_ids = decoder_input_ids.detach().cpu() + for hyp, prefix in zip(packed_result, decoder_input_ids): + assert ( + hyp.y_sequence[: prefix.shape[0]] == prefix + ).all(), f"The decoder input IDs were not found at the beginning of prediction: {hyp.y_sequence=} {prefix=})" + hyp.y_sequence = hyp.y_sequence[prefix.shape[0] :] + for hyp in packed_result: + ids = hyp.y_sequence + ids_len = ids.shape[0] + pos = -1 + while ids[pos] == self.pad or ids[pos] == self.eos: + pos -= 1 + if ids_len + pos == -1: + break # empty sequence + if pos < -1: + hyp.y_sequence = ids[: pos + 1] + + +@dataclass +class AEDGreedyInferConfig: + temperature: float | None = None + max_generation_delta: int = -1 # -1 means up to the max length of the decoder + preserve_alignments: bool = False + n_samples: int = 1 diff --git a/nemo/collections/asr/parts/submodules/token_classifier.py b/nemo/collections/asr/parts/submodules/token_classifier.py index 4061d19d9015..cc435308fcae 100644 --- a/nemo/collections/asr/parts/submodules/token_classifier.py +++ b/nemo/collections/asr/parts/submodules/token_classifier.py @@ -11,16 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +from contextlib import contextmanager from dataclasses import dataclass from typing import Dict, Optional +import torch from torch import nn as nn from nemo.collections.asr.parts.submodules.classifier import Classifier from nemo.collections.common.parts import MultiLayerPerceptron from nemo.core.classes import typecheck -from nemo.core.neural_types import LogitsType, LogprobsType, NeuralType +from nemo.core.neural_types import ChannelType, FloatType, LogitsType, LogprobsType, NeuralType __all__ = ['BertPretrainingTokenClassifier', 'TokenClassifier'] @@ -42,11 +43,17 @@ class TokenClassifier(Classifier): """ @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: + def input_types(self) -> Dict[str, NeuralType]: + return { + "hidden_states": NeuralType(('B', 'T', 'D'), ChannelType()), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: """ Returns definitions of module output ports. """ - if not self.log_softmax: + if not self.mlp.log_softmax: return {"logits": NeuralType(('B', 'T', 'C'), LogitsType())} else: return {"log_probs": NeuralType(('B', 'T', 'C'), LogprobsType())} @@ -61,7 +68,6 @@ def __init__( dropout: float = 0.0, use_transformer_init: bool = True, ) -> None: - """ Initializes the Token Classifier module. @@ -75,14 +81,24 @@ def __init__( use_transformer_init: whether to initialize the weights of the classifier head with the same approach used in Transformer """ super().__init__(hidden_size=hidden_size, dropout=dropout) - self.log_softmax = log_softmax self.mlp = MultiLayerPerceptron( hidden_size, num_classes, num_layers=num_layers, activation=activation, log_softmax=log_softmax ) self.post_init(use_transformer_init=use_transformer_init) + @property + def log_softmax(self) -> bool: + return self.mlp.log_softmax + + @contextmanager + def with_log_softmax_enabled(self, value: bool) -> "TokenClassifier": + prev = self.mlp.log_softmax + self.mlp.log_softmax = value + yield self + self.mlp.log_softmax = prev + @typecheck() - def forward(self, hidden_states): + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ Performs the forward step of the module. Args: @@ -100,12 +116,18 @@ class BertPretrainingTokenClassifier(Classifier): A module to perform token level classification tasks for Bert pretraining. """ + @property + def input_types(self) -> Dict[str, NeuralType]: + return { + "hidden_states": NeuralType(('B', 'T', 'D'), ChannelType()), + } + @property def output_types(self) -> Optional[Dict[str, NeuralType]]: """ Returns definitions of module output ports. """ - if not self.log_softmax: + if not self.mlp.log_softmax: return {"logits": NeuralType(('B', 'T', 'C'), LogitsType())} else: return {"log_probs": NeuralType(('B', 'T', 'C'), LogprobsType())} @@ -120,7 +142,6 @@ def __init__( dropout: float = 0.0, use_transformer_init: bool = True, ) -> None: - """ Initializes the Token Classifier module. 
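A minimal sketch of the new `with_log_softmax_enabled` context manager (shapes and sizes are illustrative): it temporarily switches the classifier head between log-probabilities and raw logits, which is what the greedy generator above relies on for temperature sampling.

import torch

from nemo.collections.asr.parts.submodules.token_classifier import TokenClassifier

clf = TokenClassifier(hidden_size=16, num_classes=32, log_softmax=True)
hidden = torch.randn(2, 5, 16)  # (B, T, D)

log_probs = clf(hidden_states=hidden)  # normalized scores (log-probabilities)
with clf.with_log_softmax_enabled(False) as c:
    logits = c(hidden_states=hidden)  # raw logits, e.g. for temperature sampling
assert torch.allclose(torch.log_softmax(logits, dim=-1), log_probs, atol=1e-5)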
@@ -135,8 +156,6 @@ def __init__( """ super().__init__(hidden_size=hidden_size, dropout=dropout) - self.log_softmax = log_softmax - if activation not in ACT2FN: raise ValueError(f'activation "{activation}" not found') self.dense = nn.Linear(hidden_size, hidden_size) @@ -147,8 +166,19 @@ def __init__( ) self.post_init(use_transformer_init=use_transformer_init) + @property + def log_softmax(self) -> bool: + return self.mlp.log_softmax + + @contextmanager + def with_log_softmax_enabled(self, value: bool) -> "TokenClassifier": + prev = self.mlp.log_softmax + self.mlp.log_softmax = value + yield self + self.mlp.log_softmax = prev + @typecheck() - def forward(self, hidden_states): + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ Performs the forward step of the module. Args: diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index bae2c9ffdc67..415096a0c9d5 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -21,6 +21,7 @@ from omegaconf import OmegaConf from torch.utils.data import DataLoader +from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextMiniBatch from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder from nemo.collections.asr.parts.preprocessing.features import normalize_batch @@ -1643,7 +1644,16 @@ def _get_batch_preds(self, keep_logits=False): tokens = self.input_tokens.to(device).repeat(feat_signal.size(0), 1) tokens_len = torch.tensor([tokens.size(1)] * tokens.size(0), device=device).long() - batch_input = (feat_signal, feat_signal_len, None, None, tokens, tokens_len) + batch_input = PromptedAudioToTextMiniBatch( + audio=feat_signal, + audio_lens=feat_signal_len, + transcript=None, + transcript_lens=None, + prompt=tokens, + prompt_lens=tokens_len, + prompted_transcript=None, + prompted_transcript_lens=None, + ) predictions = self.asr_model.predict_step(batch_input, has_processed_signal=True) self.all_preds.extend(predictions) del predictions diff --git a/nemo/collections/audio/losses/__init__.py b/nemo/collections/audio/losses/__init__.py index b2968b7b1ad0..f4a1a42ff20b 100644 --- a/nemo/collections/audio/losses/__init__.py +++ b/nemo/collections/audio/losses/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.collections.audio.losses.audio import MSELoss, SDRLoss +from nemo.collections.audio.losses.audio import MAELoss, MSELoss, SDRLoss diff --git a/nemo/collections/audio/losses/audio.py b/nemo/collections/audio/losses/audio.py index 635b02c5d1fe..ce6b82875e6b 100644 --- a/nemo/collections/audio/losses/audio.py +++ b/nemo/collections/audio/losses/audio.py @@ -584,3 +584,168 @@ def forward( mse = self.reduce(mse) return mse + + +def calculate_mae_batch( + estimate: torch.Tensor, + target: torch.Tensor, + input_length: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """Calculate mean absolute error (MAE) per channel. 
+ + MAE = ||estimate - target||_1 / input_length + + Args: + estimate: estimated signal, shape (B, C, T) or (B, C, D, T) + target: target signal, shape (B, C, T) or (B, C, D, T) + input_length: Optional, length of valid samples, shape (B,) + mask: Optional, temporal mask, same shape as signals + + Returns: + MAE for each channel, shape (B, C) + """ + assert ( + estimate.shape == target.shape + ), f'Estimate shape ({estimate.shape}) not matching target shape ({target.shape})' + + if input_length is not None: + if mask is not None: + raise RuntimeError( + 'Argument `input_length` is mutually exclusive with `mask`. Both cannot be used at the same time.' + ) + + # Construct a binary mask + mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True) + mask = mask.expand_as(estimate) + + # error + err = estimate - target + + # dimensions for averaging + if estimate.ndim == 3: + # average across time + dim = -1 + elif estimate.ndim == 4: + # average across time and features + dim = (-2, -1) + else: + raise RuntimeError(f'Unexpected dimension of the input: {estimate.shape}') + + # calculate masked mean + mse = calculate_mean(torch.abs(err), mask=mask, dim=dim) + + return mse + + +class MAELoss(Loss, Typing): + """ + Computes the mean absolute error (MAE) loss with weighted average across channels. + + Args: + weight: weight for loss of each output channel, used for averaging the loss across channels. Defaults to `None` (averaging). + reduction: batch reduction. Defaults to `mean` over the batch. + ndim: Number of dimensions for the input signal + """ + + def __init__( + self, + weight: Optional[List[float]] = None, + reduction: str = 'mean', + ndim: int = 3, + ): + super().__init__() + + # weight buffer + if weight is not None: + if any([w <= 0 for w in weight]): + raise ValueError(f'Weight must be positive! Current value: {weight}') + elif not np.isclose(sum(weight), 1, atol=1e-6): + raise ValueError(f'Weight should add to one, current weight: {weight}') + weight = torch.tensor(weight).reshape(1, -1) + logging.info(f'Channel weight set to %s', weight) + self.register_buffer('weight', weight) + self.weight: Optional[Tensor] + + # Batch reduction + self.reduction = reduction + if reduction == 'mean': + self.reduce = torch.mean + else: + raise ValueError(f'Unexpected reduction mode {reduction}.') + + # Input dimension + self.ndim = ndim + + if self.ndim == 3: + # Time-domain input + self.signal_shape = ('B', 'C', 'T') + elif self.ndim == 4: + # Spectral-domain input + self.signal_shape = ('B', 'C', 'D', 'T') + else: + raise ValueError(f'Unexpected input dimension: {self.ndim}') + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tweight: %s', self.weight) + logging.debug('\treduction: %s', self.reduction) + logging.debug('\tndim: %s', self.ndim) + logging.debug('\tsignal_shape: %s', self.signal_shape) + + @property + def input_types(self): + """Input types definitions for MAELoss.""" + return { + "estimate": NeuralType(self.signal_shape, VoidType()), + "target": NeuralType(self.signal_shape, VoidType()), + "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "mask": NeuralType(self.signal_shape, MaskType(), optional=True), + } + + @property + def output_types(self): + """Output types definitions for MAELoss. 
+ loss: + NeuralType(None) + """ + return {"loss": NeuralType(elements_type=LossType())} + + @typecheck() + def forward( + self, + estimate: torch.Tensor, + target: torch.Tensor, + input_length: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """For input batch of multi-channel signals, calculate MAE between estimate and target for each channel, + perform averaging across channels (weighting optional), and apply reduction across the batch. + + Args: + estimate: Estimate of the target signal + target: Target signal + input_length: Length of each example in the batch + mask: Mask for each signal + + Returns: + Scalar loss. + """ + mae = calculate_mae_batch( + estimate=estimate, + target=target, + input_length=input_length, + mask=mask, + ) + + # channel averaging + if self.weight is None: + mae = torch.mean(mae, dim=1) + else: + # weighting across channels + mae = mae * self.weight + mae = torch.sum(mae, dim=1) + + # reduction + mae = self.reduce(mae) + + return mae diff --git a/nemo/collections/audio/models/audio_to_audio.py b/nemo/collections/audio/models/audio_to_audio.py index b12f9ce73cbe..ef9ce648f1a2 100644 --- a/nemo/collections/audio/models/audio_to_audio.py +++ b/nemo/collections/audio/models/audio_to_audio.py @@ -46,7 +46,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): def _setup_loss(self): """Setup loss for this model.""" - self.loss = AudioToAudioModel.from_config_dict(self._cfg.loss) + if 'loss' in self._cfg: + self.loss = AudioToAudioModel.from_config_dict(self._cfg.loss) + else: + logging.warning('No loss function is defined in the config.') + self.loss = None def _get_num_dataloaders(self, tag: str = 'val'): if tag == 'val': diff --git a/nemo/collections/audio/models/enhancement.py b/nemo/collections/audio/models/enhancement.py index f60553704183..e7fbc9023117 100644 --- a/nemo/collections/audio/models/enhancement.py +++ b/nemo/collections/audio/models/enhancement.py @@ -25,7 +25,12 @@ from nemo.core.neural_types import AudioSignal, LengthsType, LossType, NeuralType from nemo.utils import logging -__all__ = ['EncMaskDecAudioToAudioModel', 'ScoreBasedGenerativeAudioToAudioModel', 'PredictiveAudioToAudioModel'] +__all__ = [ + 'EncMaskDecAudioToAudioModel', + 'ScoreBasedGenerativeAudioToAudioModel', + 'PredictiveAudioToAudioModel', + 'SchroedingerBridgeAudioToAudioModel', +] class EncMaskDecAudioToAudioModel(AudioToAudioModel): @@ -433,14 +438,13 @@ def forward(self, input_signal, input_length=None): - decoder to transform the sampler output into the time domain Args: - input_signal: Tensor that represents a batch of raw audio signals, - of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as + input_signal: Tensor that represents a batch of time-domain audio signals, + of shape [B, C, T]. T here represents timesteps, with 1 second of audio represented as `self.sample_rate` number of floating point values. - input_signal_length: Vector of length B, that contains the individual lengths of the audio - sequences. + input_signal_length: Vector of length B, contains the individual lengths of the audio sequences. Returns: - Output signal `output` in the time domain and the length of the output signal `output_length`. + Output `output_signal` in the time domain and the length of the output signal `output_length`. 
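A small numeric sketch of the masked MAE computed by `calculate_mae_batch` above, written in plain PyTorch (no NeMo utilities) for illustration:

import torch

# Two mono signals (B=2, C=1, T=4); the second example has only 2 valid samples.
estimate = torch.tensor([[[1.0, 2.0, 3.0, 4.0]], [[1.0, 1.0, 9.0, 9.0]]])
target = torch.tensor([[[1.0, 1.0, 1.0, 1.0]], [[0.0, 0.0, 0.0, 0.0]]])
input_length = torch.tensor([4, 2])

# Build a (B, C, T) validity mask and average |estimate - target| over valid samples only,
# i.e. MAE = ||estimate - target||_1 / input_length per channel.
mask = (torch.arange(4)[None, None, :] < input_length[:, None, None]).float()
err = (estimate - target).abs() * mask
mae = err.sum(dim=-1) / input_length[:, None]
print(mae)  # tensor([[1.5000], [1.0000]])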
""" batch_length = input_signal.size(-1) @@ -612,3 +616,353 @@ def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) return {f'{tag}_loss': loss} + + +class SchroedingerBridgeAudioToAudioModel(AudioToAudioModel): + """This models is using a Schrödinger Bridge process to generate + an encoded representation of the enhanced signal. + + The model consists of the following blocks: + - encoder: transforms input audio signal into an encoded representation (analysis transform) + - estimator: neural model, estimates the coefficients for the SB process + - noise_schedule: defines the path between the clean and noisy signals + - sampler: sampler for the reverse process, estimates coefficients of the target signal + - decoder: transforms sampler output into the time domain (synthesis transform) + + References: + Schrödinger Bridge for Generative Speech Enhancement, https://arxiv.org/abs/2407.16074 + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + super().__init__(cfg=cfg, trainer=trainer) + self.sample_rate = self._cfg.sample_rate + + # Setup processing modules + self.encoder = self.from_config_dict(self._cfg.encoder) + self.decoder = self.from_config_dict(self._cfg.decoder) + + # Neural estimator + self.estimator = self.from_config_dict(self._cfg.estimator) + self.estimator_output = self._cfg.estimator_output + + # Noise schedule + self.noise_schedule = self.from_config_dict(self._cfg.noise_schedule) + + # Sampler + self.sampler = hydra.utils.instantiate( + self._cfg.sampler, + noise_schedule=self.noise_schedule, + estimator=self.estimator, + estimator_output=self.estimator_output, + ) + + # Normalization + self.normalize_input = self._cfg.get('normalize_input', False) + + # Metric evaluation + self.max_utts_evaluation_metrics = self._cfg.get('max_utts_evaluation_metrics') + + if self.max_utts_evaluation_metrics is not None: + logging.warning( + 'Metrics will be evaluated on first %d examples of the evaluation datasets.', + self.max_utts_evaluation_metrics, + ) + + # Loss in the encoded domain + if 'loss_encoded' in self._cfg: + self.loss_encoded = self.from_config_dict(self._cfg.loss_encoded) + self.loss_encoded_weight = self._cfg.get('loss_encoded_weight', 1.0) + else: + self.loss_encoded = None + self.loss_encoded_weight = 0.0 + + # Loss in the time domain + if 'loss_time' in self._cfg: + self.loss_time = self.from_config_dict(self._cfg.loss_time) + self.loss_time_weight = self._cfg.get('loss_time_weight', 1.0) + else: + self.loss_time = None + self.loss_time_weight = 0.0 + + if self.loss is not None and (self.loss_encoded is not None or self.loss_time is not None): + raise ValueError('Either ``loss`` or ``loss_encoded`` and ``loss_time`` should be defined, not both.') + + # Term added to the denominator to improve numerical stability + self.eps = self._cfg.get('eps', 1e-8) + + # Setup optional optimization flags + self.setup_optimization_flags() + + logging.debug('Initialized %s', self.__class__.__name__) + logging.debug('\testimator_output: %s', self.estimator_output) + logging.debug('\tnormalize_input: %s', self.normalize_input) + logging.debug('\tloss: %s', self.loss) + logging.debug('\tloss_encoded: %s', self.loss_encoded) + logging.debug('\tloss_encoded_weight: %s', self.loss_encoded_weight) + logging.debug('\tloss_time: %s', self.loss_time) + logging.debug('\tloss_time_weight: %s', self.loss_time_weight) + logging.debug('\teps: %s', self.eps) + + @property 
+ def input_types(self) -> Dict[str, NeuralType]: + # time-domain input + return { + "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)), + "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + # time-domain output + return { + "output_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)), + "output_length": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @typecheck() + @torch.inference_mode() + def forward(self, input_signal, input_length=None): + """Forward pass of the model. + + Forward pass of the model consists of the following steps + - encoder to obtain the encoded representation of the input signal + - sampler to generate the estimated coefficients of the target signal + - decoder to transform the estimated output into the time domain + + Args: + input_signal: Tensor that represents a batch of time-domain audio signals, + of shape [B, C, T]. T here represents timesteps, with 1 second of audio represented as + `self.sample_rate` number of floating point values. + input_signal_length: Vector of length B, contains the individual lengths of the audio sequences. + + Returns: + Output `output_signal` in the time domain and the length of the output signal `output_length`. + """ + batch_length = input_signal.size(-1) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + + # Encoder + encoded, encoded_length = self.encoder(input=input_signal, input_length=input_length) + + # Sampler + generated, generated_length = self.sampler( + prior_mean=encoded, estimator_condition=encoded, state_length=encoded_length + ) + + # Decoder + output, output_length = self.decoder(input=generated, input_length=generated_length) + + if self.normalize_input: + # rescale to the original scale + output = output * norm_scale + + # Trim or pad the estimated signal to match input length + output = self.match_batch_length(input=output, batch_length=batch_length) + + return output, output_length + + @typecheck( + input_types={ + "target_signal": NeuralType(('B', 'C', 'T'), AudioSignal()), + "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal()), + "input_length": NeuralType(tuple('B'), LengthsType()), + }, + output_types={ + "loss": NeuralType(None, LossType()), + "loss_encoded": NeuralType(None, LossType()), + "loss_time": NeuralType(None, LossType()), + }, + ) + def _step(self, target_signal, input_signal, input_length=None): + """Randomly generate time step for each example in the batch, run neural estimator + to estimate the target and calculate the loss. 
+ """ + batch_size = target_signal.size(0) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + # scale the target signal + target_signal = target_signal / (norm_scale + self.eps) + + # Apply encoder to both target and the input + # For example, if the encoder is STFT, then _enc is the complex-valued STFT of the corresponding signal + input_enc, input_enc_len = self.encoder(input=input_signal, input_length=input_length) + target_enc, _ = self.encoder(input=target_signal, input_length=input_length) + + # Generate random time steps + process_time = self.noise_schedule.generate_time(size=batch_size, device=input_enc.device) + + # Prepare necessary info from the noise schedule + alpha_t, alpha_bar_t, alpha_t_max = self.noise_schedule.get_alphas(time=process_time) + sigma_t, sigma_bar_t, sigma_t_max = self.noise_schedule.get_sigmas(time=process_time) + + # Marginal distribution + weight_target = alpha_t * sigma_bar_t**2 / (sigma_t_max**2 + self.eps) + weight_input = alpha_bar_t * sigma_t**2 / (sigma_t_max**2 + self.eps) + # view weights as [B, C, D, T] + weight_target = weight_target.view(-1, 1, 1, 1) + weight_input = weight_input.view(-1, 1, 1, 1) + # mean + mean_x = weight_target * target_enc + weight_input * input_enc + # standard deviation + std_x = alpha_t * sigma_bar_t * sigma_t / (sigma_t_max + self.eps) + # view as [B, C, D, T] + std_x = std_x.view(-1, 1, 1, 1) + + # Generate a random sample from a standard normal distribution + z_norm = torch.randn_like(input_enc) + + # Generate a random sample from the marginal distribution + x_t = mean_x + std_x * z_norm + + # Estimator is conditioned on the generated sample and the original input (prior) + estimator_input = torch.cat([x_t, input_enc], dim=-3) + + # Neural estimator + # Estimator input is the same data type as the encoder output + # For example, if the encoder is STFT, then the estimator input and output are complex-valued coefficients + estimate, estimate_len = self.estimator( + input=estimator_input, input_length=input_enc_len, condition=process_time + ) + + # Prepare output target and calculate loss + if self.estimator_output == 'data_prediction': + if self.loss is not None: + # Single loss in the encoded domain + loss = self.loss(estimate=estimate, target=target_enc, input_length=estimate_len) + loss_encoded = loss_time = None + else: + # Weighted loss between encoded and time domain + loss = 0.0 + + # Loss in the encoded domain + if self.loss_encoded is not None: + # Loss between the estimate and the target in the encoded domain + loss_encoded = self.loss_encoded(estimate=estimate, target=target_enc, input_length=estimate_len) + # Weighting + loss += self.loss_encoded_weight * loss_encoded + else: + loss_encoded = None + + # Loss in the time domain + if self.loss_time is not None: + # Convert the estimate to the time domain + with typecheck.disable_checks(): + # Note: stimate is FloatType, decoder requires SpectrogramType + estimate_signal, _ = self.decoder(input=estimate, input_length=estimate_len) + + # Match estimate length + batch_length = input_signal.size(-1) + estimate_signal = self.match_batch_length(input=estimate_signal, batch_length=batch_length) + + # Loss between the estimate and the target in the time domain + loss_time = self.loss_time( + estimate=estimate_signal, target=target_signal, input_length=input_length + ) + # Weighting + loss += 
self.loss_time_weight * loss_time + else: + loss_time = None + else: + raise NotImplementedError(f'Output type {self.estimator_output} is not implemented') + + return loss, loss_encoded, loss_time + + # PTL-specific methods + def training_step(self, batch, batch_idx): + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + # Calculate the loss + loss, loss_encoded, loss_time = self._step( + target_signal=target_signal, input_signal=input_signal, input_length=input_length + ) + + # Logs + self.log('train_loss', loss) + self.log('learning_rate', self._optimizer.param_groups[0]['lr']) + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + if loss_encoded is not None: + self.log('train_loss_encoded', loss_encoded) + + if loss_time is not None: + self.log('train_loss_time', loss_time) + + return loss + + def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + # Calculate loss + loss, *_ = self._step(target_signal=target_signal, input_signal=input_signal, input_length=input_length) + + # Update metrics + update_metrics = False + if self.max_utts_evaluation_metrics is None: + # Always update if max is not configured + update_metrics = True + # Number of examples to process + num_examples = input_signal.size(0) # batch size + else: + # Check how many examples have been used for metric calculation + first_metric_name = next(iter(self.metrics[tag][dataloader_idx])) + num_examples_evaluated = self.metrics[tag][dataloader_idx][first_metric_name].num_examples + # Update metrics if some examples were not processed + update_metrics = num_examples_evaluated < self.max_utts_evaluation_metrics + # Number of examples to process + num_examples = min(self.max_utts_evaluation_metrics - num_examples_evaluated, input_signal.size(0)) + + if update_metrics: + # Generate output signal + output_signal, _ = self.forward( + input_signal=input_signal[:num_examples, ...], input_length=input_length[:num_examples] + ) + + # Update metrics + if hasattr(self, 'metrics') and tag in self.metrics: + # Update metrics for this (tag, dataloader_idx) + for name, metric in self.metrics[tag][dataloader_idx].items(): + metric.update( + preds=output_signal, + target=target_signal[:num_examples, ...], + input_length=input_length[:num_examples], + ) + + # Log global step + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return {f'{tag}_loss': loss} diff --git a/nemo/collections/audio/parts/submodules/diffusion.py 
b/nemo/collections/audio/parts/submodules/diffusion.py index c8b3e803e373..2c9e08fc30fd 100644 --- a/nemo/collections/audio/parts/submodules/diffusion.py +++ b/nemo/collections/audio/parts/submodules/diffusion.py @@ -18,7 +18,7 @@ import numpy as np import torch -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor +from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.core.classes import NeuralModule, typecheck from nemo.core.neural_types import FloatType, LengthsType, NeuralType, SpectrogramType, VoidType from nemo.utils import logging diff --git a/nemo/collections/audio/parts/submodules/ncsnpp.py b/nemo/collections/audio/parts/submodules/ncsnpp.py index adbeccc0dc02..543e29fc7847 100644 --- a/nemo/collections/audio/parts/submodules/ncsnpp.py +++ b/nemo/collections/audio/parts/submodules/ncsnpp.py @@ -20,8 +20,7 @@ import torch import torch.nn.functional as F -from nemo.collections.common.parts.utils import activation_registry -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor +from nemo.collections.common.parts.utils import activation_registry, mask_sequence_tensor from nemo.core.classes import NeuralModule, typecheck from nemo.core.neural_types import FloatType, LengthsType, NeuralType, SpectrogramType, VoidType from nemo.utils import logging diff --git a/nemo/collections/audio/parts/submodules/schroedinger_bridge.py b/nemo/collections/audio/parts/submodules/schroedinger_bridge.py new file mode 100644 index 000000000000..07bfc2f88011 --- /dev/null +++ b/nemo/collections/audio/parts/submodules/schroedinger_bridge.py @@ -0,0 +1,607 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import math +from abc import ABC, abstractmethod +from typing import Optional + +import torch + +from nemo.collections.common.parts.utils import mask_sequence_tensor +from nemo.core.classes import NeuralModule, typecheck +from nemo.core.neural_types import LengthsType, NeuralType, SpectrogramType +from nemo.utils import logging + + +class SBNoiseSchedule(NeuralModule, ABC): + """Noise schedule for the Schrödinger Bridge + + Args: + time_min: minimum time for the process + time_max: maximum time for the process + num_steps: number of steps for the process + eps: small regularization + + References: + Schrödinger Bridge for Generative Speech Enhancement, https://arxiv.org/abs/2407.16074 + """ + + def __init__( + self, + time_min: float = 0.0, + time_max: float = 1.0, + num_steps: int = 100, + eps: float = 1e-8, + ): + super().__init__() + + # min and max time + if time_min < 0: + raise ValueError(f'time_min should be non-negative, current value {time_min}') + + if time_max <= time_min: + raise ValueError(f'time_max should be larger than time_min, current max {time_max} and min {time_min}') + + self.time_min = time_min + self.time_max = time_max + + if num_steps <= 0: + raise ValueError(f'Expected num_steps > 0, got {num_steps}') + + self.num_steps = num_steps + + if eps <= 0: + raise ValueError(f'Expected eps > 0, got {eps}') + + self.eps = eps + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\ttime_min: %s', self.time_min) + logging.debug('\ttime_max: %s', self.time_max) + logging.debug('\tnum_steps: %s', self.num_steps) + logging.debug('\teps: %s', self.eps) + + @property + def dt(self) -> float: + """Time step for the process.""" + return self.time_max / self.num_steps + + @property + def time_delta(self) -> float: + """Time range for the process.""" + return self.time_max - self.time_min + + def generate_time(self, size: int, device: torch.device) -> torch.Tensor: + """Generate random time steps in the valid range.""" + time = torch.rand(size, device=device) * self.time_delta + self.time_min + return time + + @property + def alpha_t_max(self): + """Return alpha_t at t_max.""" + t_max = torch.tensor([self.time_max]) + return self.alpha(t_max) + + @property + def sigma_t_max(self): + """Return sigma_t at t_max.""" + t_max = torch.tensor([self.time_max]) + return self.sigma(t_max) + + @abstractmethod + def f(self, time: torch.Tensor) -> torch.Tensor: + """Drift scaling f(t). + + Args: + time: tensor with time steps + + Returns: + Tensor the same size as time, representing drift scaling. + """ + pass + + @abstractmethod + def g(self, time: torch.Tensor) -> torch.Tensor: + """Diffusion scaling g(t). + + Args: + time: tensor with time steps + + Returns: + Tensor the same size as time, representing diffusion scaling. + """ + pass + + @abstractmethod + def alpha(self, time: torch.Tensor) -> torch.Tensor: + """Return alpha for SB noise schedule. + + alpha_t = exp( int_0^s f(s) ds ) + + Args: + time: tensor with time steps + + Returns: + Tensor the same size as time, representing alpha for each time. + """ + pass + + def alpha_bar_from_alpha(self, alpha: torch.Tensor) -> (torch.Tensor, torch.Tensor): + """Return alpha_bar for SB. + + alpha_bar = alpha_t / alpha_t_max + + Args: + alpha: tensor with alpha values + + Returns: + Tensors the same size as alpha, representing alpha_bar and alpha_t_max.
+ """ + alpha_t_max = self.alpha(torch.tensor([self.time_max], device=alpha.device)) + alpha_bar = alpha / (alpha_t_max + self.eps) + return alpha_bar, alpha_t_max + + def get_alphas(self, time: torch.Tensor) -> (torch.Tensor, torch.Tensor, torch.Tensor): + """Return alpha, alpha_bar and alpha_t_max for SB. + + Args: + time: tensor with time steps + + Returns: + Tuple of tensors with alpha, alpha_bar and alpha_t_max. + """ + alpha = self.alpha(time) + alpha_bar, alpha_t_max = self.alpha_bar_from_alpha(alpha) + return alpha, alpha_bar, alpha_t_max + + @abstractmethod + def sigma(self, time: torch.Tensor) -> torch.Tensor: + """Return sigma_t for SB. + + sigma_t^2 = int_0^s g^2(s) / alpha_s^2 ds + + Args: + time: tensor with time steps + + Returns: + Tensor the same size as time, representing sigma for each time. + """ + pass + + def sigma_bar_from_sigma(self, sigma: torch.Tensor) -> (torch.Tensor, torch.Tensor): + """Return sigma_bar_t for SB. + + sigma_bar_t^2 = sigma_t_max^2 - sigma_t^2 + + Args: + sigma: tensor with sigma values + + Returns: + Tensors the same size as sigma, representing sigma_bar and sigma_t_max. + """ + sigma_t_max = self.sigma(torch.tensor([self.time_max], device=sigma.device)) + sigma_bar_sq = sigma_t_max**2 - sigma**2 + return torch.sqrt(sigma_bar_sq + self.eps), sigma_t_max + + def get_sigmas(self, time: torch.Tensor) -> (torch.Tensor, torch.Tensor, torch.Tensor): + """Return sigma, sigma_bar and sigma_t_max for SB. + + Args: + time: tensor with time steps + + Returns: + Tuple of tensors with sigma, sigma_bar and sigma_t_max. + """ + sigma = self.sigma(time) + sigma_bar, sigma_t_max = self.sigma_bar_from_sigma(sigma) + return sigma, sigma_bar, sigma_t_max + + @abstractmethod + def copy(self): + """Return a copy of the noise schedule.""" + pass + + def __repr__(self): + desc = f'{self.__class__.__name__}(time_min={self.time_min}, time_max={self.time_max}, num_steps={self.num_steps})' + desc += f'\n\tdt: {self.dt}' + desc += f'\n\ttime_delta: {self.time_delta}' + return desc + + +class SBNoiseScheduleVE(SBNoiseSchedule): + """Variance exploding noise schedule for the Schrödinger Bridge. 
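+
+    The schedule is defined by (cf. the implementation below)
+
+        f(t) = 0,    g(t) = sqrt(c) * k^t,    alpha_t = 1,
+        sigma_t^2 = c * (k^(2t) - 1) / (2 * log(k))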
+
+    Args:
+        k: defines the base for the exponential diffusion coefficient
+        c: scaling for the diffusion coefficient
+        time_min: minimum time for the process
+        time_max: maximum time for the process
+        num_steps: number of steps for the process
+        eps: small regularization
+
+    References:
+        Schrödinger Bridge for Generative Speech Enhancement, https://arxiv.org/abs/2407.16074
+    """
+
+    def __init__(
+        self,
+        k: float,
+        c: float,
+        time_min: float = 0.0,
+        time_max: float = 1.0,
+        num_steps: int = 100,
+        eps: float = 1e-8,
+    ):
+        super().__init__(time_min=time_min, time_max=time_max, num_steps=num_steps, eps=eps)
+
+        # Shape parameters
+        if k <= 1:
+            raise ValueError(f'Expected k > 1, got {k}')
+
+        if c <= 0:
+            raise ValueError(f'Expected c > 0, got {c}')
+
+        self.c = c
+        self.k = k
+
+        logging.debug('Initialized %s with', self.__class__.__name__)
+        logging.debug('\tk:         %s', self.k)
+        logging.debug('\tc:         %s', self.c)
+        logging.debug('\ttime_min:  %s', self.time_min)
+        logging.debug('\ttime_max:  %s', self.time_max)
+        logging.debug('\tnum_steps: %s', self.num_steps)
+        logging.debug('\teps:       %s', self.eps)
+
+    def f(self, time: torch.Tensor) -> torch.Tensor:
+        return torch.zeros_like(time)
+
+    def g(self, time: torch.Tensor) -> torch.Tensor:
+        # g(t) = sqrt(c) * k^t; `c` is a Python float, so use math.sqrt and the `time` argument
+        return math.sqrt(self.c) * self.k**time
+
+    def alpha(self, time: torch.Tensor) -> torch.Tensor:
+        return torch.ones_like(time)
+
+    def sigma(self, time: torch.Tensor) -> torch.Tensor:
+        sigma_sq = self.c * (self.k ** (2 * time) - 1) / (2 * math.log(self.k) + self.eps)
+        return torch.sqrt(sigma_sq)
+
+    def copy(self):
+        return SBNoiseScheduleVE(
+            k=self.k,
+            c=self.c,
+            time_min=self.time_min,
+            time_max=self.time_max,
+            num_steps=self.num_steps,
+            eps=self.eps,
+        )
+
+    def __repr__(self):
+        desc = super().__repr__()
+        desc += f'\n\tk: {self.k}'
+        desc += f'\n\tc: {self.c}'
+        return desc
+
+
+class SBNoiseScheduleVP(SBNoiseSchedule):
+    """Variance preserving noise schedule for the Schrödinger Bridge.
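+
+    The schedule is defined by (cf. the implementation below)
+
+        f(t) = -(beta_0 + t * (beta_1 - beta_0)) / 2,    g(t)^2 = c * (beta_0 + t * (beta_1 - beta_0)),
+        alpha_t = exp(-(beta_0 * t + (beta_1 - beta_0) * t^2 / 2) / 2),
+        sigma_t^2 = c * (exp(beta_0 * t + (beta_1 - beta_0) * t^2 / 2) - 1)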
+ + Args: + beta_0: defines the lower bound for diffusion coefficient + beta_1: defines upper bound for diffusion coefficient + c: scaling for the diffusion coefficient + time_min: minimum time for the process + time_max: maximum time for the process + num_steps: number of steps for the process + eps: small regularization + """ + + def __init__( + self, + beta_0: float, + beta_1: float, + c: float = 1.0, + time_min: float = 0.0, + time_max: float = 1.0, + num_steps: int = 100, + eps: float = 1e-8, + ): + super().__init__(time_min=time_min, time_max=time_max, num_steps=num_steps, eps=eps) + + # Shape parameters + if beta_0 < 0: + raise ValueError(f'Expected beta_0 >= 0, got {beta_0}') + + if beta_1 < 0: + raise ValueError(f'Expected beta_1 >= 0, got {beta_1}') + + if beta_0 >= beta_1: + raise ValueError(f'Expected beta_0 < beta_1, got beta_0={beta_0} and beta_1={beta_1}') + + if c <= 0: + raise ValueError(f'Expected c > 0, got {c}') + + self.beta_0 = beta_0 + self.beta_1 = beta_1 + self.c = c + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tbeta_0: %s', self.beta_0) + logging.debug('\tbeta_1: %s', self.beta_1) + logging.debug('\tc: %s', self.c) + logging.debug('\ttime_min: %s', self.time_min) + logging.debug('\ttime_max: %s', self.time_max) + logging.debug('\tnum_steps: %s', self.num_steps) + logging.debug('\teps: %s', self.eps) + + def f(self, time: torch.Tensor) -> torch.Tensor: + return -0.5 * (self.beta_0 + time * (self.beta_1 - self.beta_0)) + + def g(self, time: torch.Tensor) -> torch.Tensor: + g_sq = self.c * (self.beta_0 + time * (self.beta_1 - self.beta_0)) + return torch.sqrt(g_sq) + + def alpha(self, time: torch.Tensor) -> torch.Tensor: + tmp = self.beta_0 * time + (self.beta_1 - self.beta_0) / 2 * time**2 + return torch.exp(-0.5 * tmp) + + def sigma(self, time: torch.Tensor) -> torch.Tensor: + sigma_sq = self.beta_0 * time + (self.beta_1 - self.beta_0) / 2 * time**2 + sigma_sq = torch.exp(sigma_sq) - 1 + sigma_sq = self.c * sigma_sq + return torch.sqrt(sigma_sq) + + def copy(self): + return SBNoiseScheduleVP( + beta_0=self.beta_0, + beta_1=self.beta_1, + c=self.c, + time_min=self.time_min, + time_max=self.time_max, + num_steps=self.num_steps, + eps=self.eps, + ) + + def __repr__(self): + desc = super().__repr__() + desc += f'\n\tbeta_0: {self.beta_0}' + desc += f'\n\tbeta_1: {self.beta_1}' + desc += f'\n\tc: {self.c}' + return desc + + +class SBSampler(NeuralModule): + """Schrödinger Bridge sampler. + + Args: + noise_schedule: noise schedule for the bridge + estimator: neural estimator + estimator_output: defines the output of the estimator, e.g., data_prediction + estimator_time: time for conditioning the estimator, e.g., 'current' + or 'previous'. Default is 'previous'. 
+ process: defines the process, e.g., sde or ode + time_max: maximum time for the process + time_min: minimum time for the process + num_steps: number of steps for the process + eps: small regularization to prevent division by zero + + References: + Schrödinger Bridge for Generative Speech Enhancement, https://arxiv.org/abs/2407.16074 + Schrodinger Bridges Beat Diffusion Models on Text-to-Speech Synthesis, https://arxiv.org/abs/2312.03491 + """ + + def __init__( + self, + noise_schedule: SBNoiseSchedule, + estimator: NeuralModule, # neural estimator + estimator_output: str, + estimator_time: str = 'previous', # time for the estimator + process: str = 'sde', + time_max: Optional[float] = None, + time_min: Optional[float] = None, + num_steps: int = 50, + eps: float = 1e-8, + ): + super().__init__() + # Create a copy of the noise schedule + self.noise_schedule = noise_schedule.copy() + + # Update sampling parameters + if time_max is not None: + self.noise_schedule.time_max = time_max + logging.info('noise_schedule.time_max set to: %s', self.noise_schedule.time_max) + + if time_min is not None: + self.noise_schedule.time_min = time_min + logging.info('noise_schedule.time_min set to: %s', self.noise_schedule.time_min) + + self.noise_schedule.num_steps = num_steps + logging.info('noise_schedule.num_steps set to: %s', self.noise_schedule.num_steps) + + # Estimator + self.estimator = estimator + self.estimator_output = estimator_output + self.estimator_time = estimator_time + + # Sampling process + self.process = process + + # Small regularization + if eps <= 0: + raise ValueError(f'Expected eps > 0, got {eps}') + self.eps = eps + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\testimator_output: %s', self.estimator_output) + logging.debug('\testimator_time: %s', self.estimator_time) + logging.debug('\tprocess: %s', self.process) + logging.debug('\ttime_min: %s', self.time_min) + logging.debug('\ttime_max: %s', self.time_max) + logging.debug('\tnum_steps: %s', self.num_steps) + logging.debug('\teps: %s', self.eps) + + @property + def time_max(self): + return self.noise_schedule.time_max + + @time_max.setter + def time_max(self, value: float): + self.noise_schedule.time_max = value + logging.debug('noise_schedule.time_max set to: %s', self.noise_schedule.time_max) + + @property + def time_min(self): + return self.noise_schedule.time_min + + @time_min.setter + def time_min(self, value: float): + self.noise_schedule.time_min = value + logging.debug('noise_schedule.time_min set to: %s', self.noise_schedule.time_min) + + @property + def num_steps(self): + return self.noise_schedule.num_steps + + @num_steps.setter + def num_steps(self, value: int): + self.noise_schedule.num_steps = value + logging.debug('noise_schedule.num_steps set to: %s', self.noise_schedule.num_steps) + + @property + def process(self): + return self._process + + @process.setter + def process(self, value: str): + if value not in ['sde', 'ode']: + raise ValueError(f'Unexpected process: {value}') + self._process = value + logging.info('process set to: %s', self._process) + + @property + def estimator_time(self): + return self._estimator_time + + @estimator_time.setter + def estimator_time(self, value: str): + if value not in ['current', 'previous']: + raise ValueError(f'Unexpected estimator time: {value}') + self._estimator_time = value + logging.info('estimator time set to: %s', self._estimator_time) + + @typecheck( + input_types={ + "prior_mean": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + 
"estimator_condition": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType(), optional=True), + "state_length": NeuralType(tuple('B'), LengthsType(), optional=True), + }, + output_types={ + "sample": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "state_length": NeuralType(tuple('B'), LengthsType(), optional=True), + }, + ) + @torch.inference_mode() + def forward( + self, prior_mean: torch.Tensor, estimator_condition: torch.Tensor, state_length: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Takes prior mean and generates a sample.""" + # SB starts from the prior mean + state = prior_mean + + if state_length is not None: + state = mask_sequence_tensor(state, state_length) + + # Time steps for sampling + time_steps = torch.linspace(self.time_max, self.time_min, self.num_steps + 1, device=state.device) + + # Initial values + time_prev = time_steps[0] * torch.ones(state.shape[0], device=state.device) + alpha_prev, _, alpha_t_max = self.noise_schedule.get_alphas(time_prev) + sigma_prev, sigma_bar_prev, sigma_t_max = self.noise_schedule.get_sigmas(time_prev) + + # Sampling + # Sample at the initial time step (`self.time_max`) is exactly the prior_mean. + # We do not need to estimate it, but we need to pass it to the next time step. + # We iterate through the following time steps to generate the sample at the final time (`self.time_min`). + for t in time_steps[1:]: + + # Prepare time steps for the whole batch + time = t * torch.ones(state.shape[0], device=state.device) + + # Prepare input for estimator, concatenate conditioning along the channel dimension + estimator_input = state if estimator_condition is None else torch.cat([state, estimator_condition], dim=1) + estimator_time = time if self.estimator_time == 'current' else time_prev + + # Estimator + if self.estimator_output == 'data_prediction': + current_estimate, _ = self.estimator( + input=estimator_input, input_length=state_length, condition=estimator_time + ) + else: + raise NotImplementedError(f'Unexpected estimator output: {self.estimator_output}') + + # Get noise schedule for current time + alpha_t, alpha_bar_t, _ = self.noise_schedule.get_alphas(time) + sigma_t, sigma_bar_t, _ = self.noise_schedule.get_sigmas(time) + + if self.process == 'sde': + # Calculate scaling for the first-order discretization from the paper + weight_prev = alpha_t * sigma_t**2 / (alpha_prev * sigma_prev**2 + self.eps) + tmp = 1 - sigma_t**2 / (sigma_prev**2 + self.eps) + weight_estimate = alpha_t * tmp + weight_z = alpha_t * sigma_t * torch.sqrt(tmp) + + # View as [B, C, D, T] + weight_prev = weight_prev.view(-1, 1, 1, 1) + weight_estimate = weight_estimate.view(-1, 1, 1, 1) + weight_z = weight_z.view(-1, 1, 1, 1) + + # Random sample + z_norm = torch.randn_like(state) + + # Update state: weighted sum of previous state, current estimate and noise + state = weight_prev * state + weight_estimate * current_estimate + weight_z * z_norm + elif self.process == 'ode': + # Calculate scaling for the first-order discretization from the paper + weight_prev = alpha_t * sigma_t * sigma_bar_t / (alpha_prev * sigma_prev * sigma_bar_prev + self.eps) + weight_estimate = ( + alpha_t + / (sigma_t_max**2 + self.eps) + * (sigma_bar_t**2 - sigma_bar_prev * sigma_t * sigma_bar_t / (sigma_prev + self.eps)) + ) + weight_prior_mean = ( + alpha_t + / (alpha_t_max * sigma_t_max**2 + self.eps) + * (sigma_t**2 - sigma_prev * sigma_t * sigma_bar_t / (sigma_bar_prev + self.eps)) + ) + + # View as [B, C, D, T] + weight_prev = weight_prev.view(-1, 1, 1, 1) + weight_estimate = 
weight_estimate.view(-1, 1, 1, 1) + weight_prior_mean = weight_prior_mean.view(-1, 1, 1, 1) + + # Update state: weighted sum of previous state, current estimate and prior + state = weight_prev * state + weight_estimate * current_estimate + weight_prior_mean * prior_mean + else: + raise RuntimeError(f'Unexpected process: {self.process}') + + # Save previous values + time_prev = time + alpha_prev = alpha_t + sigma_prev = sigma_t + sigma_bar_prev = sigma_bar_t + + # Final output + if state_length is not None: + state = mask_sequence_tensor(state, state_length) + + return state, state_length diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py index 112f130f004f..7e7fdbc95a61 100644 --- a/nemo/collections/common/data/lhotse/cutset.py +++ b/nemo/collections/common/data/lhotse/cutset.py @@ -128,6 +128,7 @@ def read_dataset_config(config) -> tuple[CutSet, bool]: "text_field": config.text_field, "lang_field": config.lang_field, "metadata_only": config.metadata_only, + "force_finite": config.force_finite, "max_open_streams": config.max_open_streams, } input_cfg = config.input_cfg @@ -244,7 +245,7 @@ def parse_and_combine_datasets( weights=weights if weights else None, max_open_streams=propagate_attrs["max_open_streams"], seed=propagate_attrs["shard_seed"], - metadata_only=propagate_attrs["metadata_only"], + force_finite=propagate_attrs["force_finite"] or propagate_attrs["metadata_only"], ) else: (cuts,) = cuts @@ -269,6 +270,7 @@ def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: # This is mostly useful for unit testing or debugging. shard_seed = config.shard_seed metadata_only = config.metadata_only + force_finite = config.force_finite if config.get("cuts_path") is not None: warnings.warn("Note: lhotse.cuts_path will be ignored because lhotse.shar_path was provided.") if isinstance(config.shar_path, (str, Path)): @@ -276,7 +278,7 @@ def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: cuts = CutSet.from_shar( **_resolve_shar_inputs(config.shar_path, metadata_only), shuffle_shards=True, seed=shard_seed ) - if not metadata_only: + if not metadata_only and not force_finite: cuts = cuts.repeat() else: # Multiple datasets in Lhotse Shar format: we will dynamically multiplex them @@ -313,7 +315,7 @@ def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: weights=weights, max_open_streams=config.max_open_streams, seed=config.shard_seed, - metadata_only=metadata_only, + force_finite=force_finite, ) else: # Regular Lhotse manifest points to individual audio files (like native NeMo manifest). @@ -393,6 +395,7 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: # and other data statistics. 
notar_kwargs = {"metadata_only": config.metadata_only} metadata_only = config.metadata_only + force_finite = config.force_finite if isinstance(config.manifest_filepath, (str, Path)): logging.info(f"Initializing Lhotse CutSet from a single NeMo manifest (tarred): '{config.manifest_filepath}'") if is_tarred and not metadata_only: @@ -402,7 +405,9 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: tar_paths=config.tarred_audio_filepaths, **common_kwargs, ) - ).repeat() + ) + if not force_finite: + cuts = cuts.repeat() else: cuts = CutSet(LazyNeMoIterator(config.manifest_filepath, **notar_kwargs, **common_kwargs)) else: @@ -468,7 +473,7 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: weights=weights, max_open_streams=config.max_open_streams, seed=config.shard_seed, - metadata_only=metadata_only, + force_finite=force_finite or metadata_only, ) return cuts @@ -478,7 +483,7 @@ def mux( weights: list[int | float], max_open_streams: int | None = None, seed: str | int = "trng", - metadata_only: bool = False, + force_finite: bool = False, ) -> CutSet: """ Helper function to call the right multiplexing method flavour in lhotse. @@ -486,10 +491,10 @@ def mux( it will select a more appropriate multiplexing strategy. """ if max_open_streams is not None: - assert not metadata_only, "max_open_streams and metadata_only options are not compatible" + assert not force_finite, "max_open_streams and metadata_only/force_finite options are not compatible" cuts = CutSet.infinite_mux(*cutsets, weights=weights, seed=seed, max_open_streams=max_open_streams) else: - if not metadata_only: + if not force_finite: cutsets = [cs.repeat() for cs in cutsets] cuts = CutSet.mux(*cutsets, weights=weights, seed=seed) return cuts diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 5533b50922f8..15c55a88c232 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import bisect import os import random import warnings from dataclasses import dataclass from functools import partial -from typing import Any, Optional, TypeVar, Union +from typing import Any, Optional, Sequence, TypeVar, Union import numpy as np import torch @@ -33,11 +34,13 @@ ) from lhotse.dataset.dataloading import resolve_seed from lhotse.dataset.sampling.base import SamplingConstraint, TimeConstraint, TokenConstraint +from lhotse.dataset.sampling.dynamic_bucketing import FixedBucketBatchSizeConstraint from lhotse.lazy import LazyFlattener from lhotse.utils import fastcopy, fix_random_seed from omegaconf import DictConfig, OmegaConf from nemo.collections.common.data.lhotse.cutset import guess_parse_cutset, read_cutset_from_config +from nemo.collections.common.prompts.fn import get_prompt_format_fn from nemo.utils import logging @@ -67,10 +70,12 @@ class LhotseDataLoadingConfig: quadratic_duration: float | None = None # c. Lhotse bucketing. use_bucketing: bool = False + bucket_batch_size: list[int] | None = None num_buckets: int = 30 num_cuts_for_bins_estimate: int = 10000 - bucket_duration_bins: list[float] | None = None + bucket_duration_bins: Any = None # list[float] | list[list[float]] | None = None bucket_buffer_size: int = 10000 + concurrent_bucketing: bool = True # fetches data in a background thread # d. 
Other Lhotse sampling options. shuffle_buffer_size: int | None = 10000 drop_last: bool = False @@ -79,6 +84,8 @@ class LhotseDataLoadingConfig: cuda_expandable_segments: bool = True # 2.1 Multimodal sampling override options + pretokenize: bool = True # should we apply tokenizer before data sampling + prompt_format: str | None = None # when provided, we'll apply the prompt in addition to the tokenizer use_multimodal_sampling: bool = False token_equivalent_duration: float | None = None batch_tokens: int | None = None @@ -93,6 +100,8 @@ class LhotseDataLoadingConfig: num_workers: int = 0 pin_memory: bool = False channel_selector: int | str | None = None + min_tps: int = -1 # allowed tokens per second + max_tps: float = float("inf") # 4. Optional Lhotse data augmentation. # a. On-the-fly noise/audio mixing. @@ -132,6 +141,11 @@ class LhotseDataLoadingConfig: # Enables iteration of NeMo non-tarred manifests that don't have a "sampling_rate" key without performing any I/O. # Note that this will not allow actual dataloading; it's only for manifest iteration as Lhotse objects. metadata_only: bool = False + # Forces the resulting CutSet to be finite, so that the iteration will end after a full single epoch. + # Do not turn this on unless you're sure that you know what you're doing. + # In most cases (such as regular multi-GPU training) it will result in a deadlock due to + # a different number of steps on different DDP ranks. + force_finite: bool = False def get_lhotse_dataloader_from_config( @@ -155,11 +169,9 @@ def get_lhotse_dataloader_from_config( For an example, see: :class:`nemo.collections.asr.data.audio_to_text_lhotse.LhotseSpeechToTextBpeDataset`, which is constructed from just a tokenizer and essentially loads and collates audio and tokenizes the transcript. - The ``tokenizer`` is used when text-only datasets are included in dataloading. - In these cases we will tokenize ``TextExample``s before sampling mini-batches so that - we can account for their number of tokens. - Note: this behaviour might eventually be extended to audio datasets too. - + The ``tokenizer`` is used both for audio and text datasets for on-the-fly tokenization. + This allows us to stratify the bucketing by the count of input/output tokens (depending on modality). + If "prompt_format" is additionally provided in the config, we will also apply a prompt formatter. Note that ``tokenizer`` can be any tokenizer type (e.g. both SentencePiece and Aggregate tokenizers work). """ logging.info("We will be using a Lhotse DataLoader.") @@ -186,16 +198,26 @@ def get_lhotse_dataloader_from_config( # Expands cuts if multiple translations are provided. cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text, apply_fn=None))) - if config.use_multimodal_sampling: - assert ( - tokenizer is not None - ), "You must pass a tokenizer to `get_lhotse_dataloader_from_config` in order to read text-only datasets (enabled via use_multimodal_dataloading)" + if tokenizer is not None and config.pretokenize: from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper - if not isinstance(tokenizer, TokenizerWrapper): - tokenizer = TokenizerWrapper(tokenizer) - # Note this code can also pre-tokenize the text in cuts, but for now we disable it with apply_fn. - cuts = cuts.map(partial(tokenize, tokenizer=tokenizer), apply_fn=is_text) + if not is_tarred: + logging.warning( + "You are using a non-tarred dataset and requested tokenization during data sampling (pretokenize=True). 
" + "This will cause the tokenization to happen in the main (GPU) process, possibly impacting the training speed " + "if your tokenizer is very large. If the impact is noticable, set pretokenize=False in dataloader config. " + "(note: that will disable token-per-second filtering and 2D bucketing features)" + ) + + if config.prompt_format is not None: + cuts = cuts.map( + partial(tokenize_with_prompt, tokenizer=tokenizer, prompt_format=config.prompt_format), apply_fn=None + ) + else: + if not isinstance(tokenizer, TokenizerWrapper): + tokenizer = TokenizerWrapper(tokenizer) + cuts = cuts.map(partial(tokenize, tokenizer=tokenizer), apply_fn=None) + cuts = cuts.filter(TokenPerSecondFilter(config.min_tps, config.max_tps)) # 2. Optional augmentations. # 2.a. Noise mixing. @@ -238,19 +260,39 @@ def get_lhotse_dataloader_from_config( # We can filter after the augmentations because they are applied only when calling load_audio(). cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration)) + bucket_duration_bins = determine_bucket_duration_bins(config) if config.use_multimodal_sampling: - constraint = MultimodalSamplingConstraint( - token_equivalent_duration=config.token_equivalent_duration, - batch_size=config.batch_size, - batch_tokens=config.batch_tokens, - quadratic_factor=config.quadratic_factor, - ) + if config.bucket_batch_size is not None: + assert ( + bucket_duration_bins is not None + ), "Cannot use bucket_batch_size option if bucket_duration_bins are not provided." + constraint = MultimodalFixedBucketBatchSizeConstraint2D( + max_seq_len_buckets=bucket_duration_bins, + batch_sizes=config.bucket_batch_size, + token_equivalent_duration=config.token_equivalent_duration, + ) + else: + constraint = MultimodalSamplingConstraint( + token_equivalent_duration=config.token_equivalent_duration, + batch_size=config.batch_size, + batch_tokens=config.batch_tokens, + quadratic_factor=config.quadratic_factor, + ) else: - constraint = TimeConstraint( - max_cuts=config.batch_size, - max_duration=config.batch_duration, - quadratic_duration=config.quadratic_duration, - ) + if config.bucket_batch_size is not None: + assert ( + bucket_duration_bins is not None + ), "Cannot use bucket_batch_size option if bucket_duration_bins are not provided." + constraint = FixedBucketBatchSizeConstraint2D( + max_seq_len_buckets=bucket_duration_bins, + batch_sizes=config.bucket_batch_size, + ) + else: + constraint = TimeConstraint( + max_cuts=config.batch_size, + max_duration=config.batch_duration, + quadratic_duration=config.quadratic_duration, + ) # 3. The sampler. if config.use_bucketing: @@ -274,6 +316,7 @@ def get_lhotse_dataloader_from_config( duration_bins=determine_bucket_duration_bins(config), num_cuts_for_bins_estimate=config.num_cuts_for_bins_estimate, buffer_size=config.bucket_buffer_size, + concurrent=config.concurrent_bucketing, rank=0 if is_tarred else global_rank, world_size=1 if is_tarred else world_size, ) @@ -356,7 +399,12 @@ def get_lhotse_dataloader_from_config( def determine_bucket_duration_bins(config): if config.bucket_duration_bins is not None: # Bucket duration bins are provided: just use them. - return config.bucket_duration_bins + ans = OmegaConf.to_container(config.bucket_duration_bins) + if isinstance(ans[0], Sequence): + # 2D bucketing. Ensure we're using tuples for correct behaviour of '<' operator + # between the bucket bin tuples and the output of measure_length. + ans = [tuple(item) for item in ans] + return ans # Bucket duration bins are not set. 
if config.use_multimodal_sampling: # For multimodal sampling it's currently impossible to define a linspace over durations @@ -449,6 +497,61 @@ def measure_length(self, example: Any) -> float: raise RuntimeError(f"Unsupported example type: {type(example)}") +@dataclass +class FixedBucketBatchSizeConstraint2D(FixedBucketBatchSizeConstraint): + @property + def bucketing_2d_enabled(self) -> bool: + return isinstance(self.max_seq_len_buckets[0], Sequence) and len(self.max_seq_len_buckets[0]) == 2 + + def measure_length(self, example: Any) -> tuple[float, float]: + if self.bucketing_2d_enabled: + return example.duration, _measure_tokens(example) + else: + return example.duration + + def select_bucket(self, buckets: Any, example: Any = None, example_len: Any = None) -> int: + if not self.bucketing_2d_enabled: + return super().select_bucket(buckets=buckets, example=example, example_len=example_len) + if example_len is None: + example_len = self.measure_length(example) + bucket_idx = bisect.bisect_right(buckets, example_len) + # For 2D bucketing we have to refine the initially found bucket_idx, as bisect + # looks primarily at the first index of a tuple (i.e. duration). + # For example, with buckets [(1, 1), (1, 2), (2, 2), (2, 4)] and example (1.5, 3) + # bisect would allocate it to bucket_idx=2 instead of bucket_idx=3. + # To refine, we'll try to push the example to as many buckets to the right as possible, + # as long as they have the same dim0 length (e.g. audio duration) and the example's dim1 + # is smaller than the bin's dim1 (e.g., output token sequence length). + bin_dim0, bin_dim1 = self.max_seq_len_buckets[bucket_idx] + num_buckets = len(self.max_seq_len_buckets) + while ( + (next_idx := bucket_idx + 1) < num_buckets # There is a next bucket + and (bin := self.max_seq_len_buckets[next_idx])[0] == bin_dim0 # The next bucket has the same 1st dim. + # The example's 2nd dim is between that of the current and the next bucket; or, + # the next bucket's 2nd dim is still smaller than example. + and (bin_dim1 < example_len[1] <= bin[1] or bin[1] < example_len[1]) + ): + bucket_idx = next_idx + bin_dim0, bin_dim1 = self.max_seq_len_buckets[bucket_idx] + return bucket_idx + + +@dataclass +class MultimodalFixedBucketBatchSizeConstraint2D(FixedBucketBatchSizeConstraint2D): + token_equivalent_duration: float | None = None + + def measure_length(self, example: Any) -> float: + assert not self.bucketing_2d_enabled, "2D bucketing for multimodal sampling is not yet supported." + if hasattr(example, "num_tokens"): + return example.num_tokens + if isinstance(example, Cut): + assert ( + self.token_equivalent_duration is not None + ), "Cannot use MultimodalFixedBucketBatchSizeConstraint with speech data when token_equivalent_duration was not specified." 
+ return example.duration / self.token_equivalent_duration + raise RuntimeError(f"Unsupported example type: {type(example)}") + + def is_text(example) -> bool: return isinstance(example, (TextExample, TextPairExample)) @@ -459,7 +562,8 @@ def is_text(example) -> bool: def tokenize(example: Example, tokenizer) -> Example: if isinstance(example, Cut): for s in example.supervisions: - s.tokens = np.asarray(tokenizer(s.text, s.language)) + if s.text is not None: + s.tokens = np.asarray(tokenizer(s.text, s.language)) elif isinstance(example, TextExample): example.tokens = np.asarray(tokenizer(example.text, example.language)) elif isinstance(example, TextPairExample): @@ -470,6 +574,23 @@ def tokenize(example: Example, tokenizer) -> Example: return example +def tokenize_with_prompt(example: Example, tokenizer, prompt_format: str) -> Example: + # TODO(pzelasko): This mechanism makes it possible to measure the actual output sequence length + # for prompted models such as AED MultiTask (Canary), which includes the transcript and the prompt. + # We intend to extend it for text modality in follow-up work. + if isinstance(example, Cut): + prompt_format_fn = get_prompt_format_fn(prompt_format) + (tokenized_prompted_transcript,), (tokenized_prompt,), (tokenized_transcript,) = prompt_format_fn( + CutSet([example]), tokenizer + ) + example.tokenized_prompted_transcript = tokenized_prompted_transcript + example.tokenized_prompt = tokenized_prompt + example.tokenized_transcript = tokenized_transcript + else: + raise RuntimeError(f"Currently we only support tokenization + prompting during sampling for audio modality.") + return example + + # The helper callables below exist to avoid passing lambdas into lhotse CutSet map/filter methods. # Lambdas are not serializable across processes by pickle. # Note: lhotse offers LHOTSE_DILL_ENABLED=1 and ``lhotse.lazy.set_dill_enabled(True)`` @@ -490,6 +611,41 @@ def __call__(self, example) -> bool: return True # does not apply to text etc. +class TokenPerSecondFilter: + """ + Callable, returns ``True`` if a cut's num_tokens (sum of len(tokens) for each supervision) + is in range [tps_min, tps_max] and ``False`` otherwise. + """ + + def __init__(self, tps_min: float, tps_max: float) -> None: + assert tps_min <= tps_max + self.tps_min = tps_min + self.tps_max = tps_max + self.enabled = tps_min > 0 or tps_max < float("inf") + + def __call__(self, example) -> bool: + if not isinstance(example, Cut) or not self.enabled: + return True # pass-through for non-audio examples. + tps = _measure_tps(example) + return self.tps_min <= tps <= self.tps_max + + +def _measure_tokens(cut: Cut) -> int: + if hasattr(cut, "tokenized_prompted_transcript"): + return len(cut.tokenized_prompted_transcript) # tokenized with prompt formatter + supervisions_with_tokens = [s for s in cut.supervisions if hasattr(s, "tokens")] + assert len(supervisions_with_tokens) > 0, ( + "Cannot measure tokens-per-second with untokenized supervisions. " + "Did you forget to provide the tokenizer argument to get_lhotse_dataloader_from_config() method?" 
+ ) + return sum(len(s.tokens) for s in supervisions_with_tokens) + + +def _measure_tps(cut: Cut) -> float: + num_tokens = _measure_tokens(cut) + return num_tokens / cut.duration + + def _normalize_loudness(cuts: CutSet, db_norm: float) -> CutSet: return cuts.normalize_loudness(target=db_norm, mix_first=False) diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index bbe44df96526..2a4b71a18880 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -25,11 +25,12 @@ import soundfile from cytoolz import groupby from lhotse import AudioSource, Recording, SupervisionSegment +from lhotse.audio.backend import LibsndfileBackend from lhotse.cut import Cut from lhotse.dataset.dataloading import resolve_seed from lhotse.lazy import LazyIteratorChain, LazyJsonlIterator from lhotse.serialization import open_best -from lhotse.utils import compute_num_samples +from lhotse.utils import compute_num_samples, ifnone from nemo.collections.common.parts.preprocessing.manifest import get_full_path @@ -329,9 +330,21 @@ def __iter__(self) -> Generator[Cut, None, None]: # Propagate the random seed extra_fields = [ExtraField.from_dict({"seed": seed, **field_cfg}) for field_cfg in self.extra_fields or ()] + # Handle NeMo tarred manifests with offsets. + # They have multiple JSONL entries where audio paths end with '-sub1', '-sub2', etc. for each offset. + offset_pattern = re.compile(r'^(?P.+)(?P-sub\d+)(?P\.\w+)?$') + for sid in shard_ids: manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0] - shard_manifest = {data["audio_filepath"]: data for data in self.shard_id_to_manifest[sid]} + + def basename(d: dict) -> str: + return ( + m.group("stem") + ifnone(m.group("ext"), "") + if (m := offset_pattern.match(k := d["audio_filepath"])) is not None + else k + ) + + shard_manifest: dict[str, list[dict]] = groupby(basename, self.shard_id_to_manifest[sid]) tar_path = self.shard_id_to_tar_path[sid] with tarfile.open(fileobj=open_best(tar_path, mode="rb"), mode="r|*") as tar: for tar_info in tar: @@ -339,7 +352,6 @@ def __iter__(self) -> Generator[Cut, None, None]: f"Mismatched entry between JSON manifest ('{manifest_path}') and tar file ('{tar_path}'). " f"Cannot locate JSON entry for tar file '{tar_info.name}'" ) - data = shard_manifest[tar_info.name] raw_audio = tar.extractfile(tar_info).read() # Note: Lhotse has a Recording.from_bytes() utility that we won't use here because # the profiling indicated significant overhead in torchaudio ffmpeg integration @@ -353,21 +365,31 @@ def __iter__(self) -> Generator[Cut, None, None]: num_samples=meta.frames, duration=meta.duration, ) - cut = recording.to_cut() - cut.supervisions.append( - SupervisionSegment( - id=cut.id, - recording_id=cut.recording_id, - start=0, - duration=cut.duration, - text=data.get(self.text_field), - language=data.get(self.lang_field), + cuts_for_recording = [] + for data in sorted(shard_manifest[tar_info.name], key=lambda d: d["audio_filepath"]): + # Cut the recording into corresponding segment and discard audio data outside the segment. 
+ cut = make_cut_with_subset_inmemory_recording( + recording, offset=data.get("offset", 0.0), duration=data.get("duration") ) - ) - cut.custom = _to_custom_attr_dict(data) - for extra_field in extra_fields: - extra_field.attach_to(cut) - yield cut + cut.supervisions.append( + SupervisionSegment( + id=cut.id, + recording_id=cut.recording_id, + start=0, + duration=cut.duration, + text=data.get(self.text_field), + language=data.get(self.lang_field), + ) + ) + cut.custom = _to_custom_attr_dict(data) + cut.manifest_origin = manifest_path + cut.tar_origin = tar_path + for extra_field in extra_fields: + extra_field.attach_to(cut) + cuts_for_recording.append(cut) + del recording # free the memory - helps with very large audio files + del raw_audio + yield from cuts_for_recording def __len__(self) -> int: return len(self.source) @@ -376,6 +398,45 @@ def __add__(self, other): return LazyIteratorChain(self, other) +def make_cut_with_subset_inmemory_recording( + recording: Recording, offset: float = 0.0, duration: float | None = None +) -> Cut: + """ + This method is built specifically to optimize CPU memory usage during dataloading + when reading tarfiles containing very long recordings (1h+). + Normally each cut would hold a reference to the long in-memory recording and load + the necessary subset of audio (there wouldn't be a separate copy of the long recording for each cut). + This is fairly efficient already, but we don't actually need to hold the unused full recording in memory. + Instead, we re-create each cut so that it only holds a reference to the subset of recording necessary. + This allows us to discard unused data which would otherwise be held in memory as part of sampling buffering. + """ + + # Fast path: no offset and (almost) matching duration (within 200ms; leeway for different audio codec behavior). + cut = recording.to_cut() + if offset == 0.0 and duration is None or abs(duration - recording.duration) < 0.2: + return cut + + # Otherwise, apply the memory optimization. 
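+    # The block below (1) truncates the cut to the requested [offset, offset + duration] window,
+    # (2) loads only that audio segment, (3) re-encodes it as an in-memory WAV via LibsndfileBackend,
+    # and (4) wraps it in a fresh Recording so the full-length original audio can be garbage collected.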
+ cut = cut.truncate(offset=offset, duration=duration, preserve_id=True) + audiobytes = BytesIO() + LibsndfileBackend().save_audio(audiobytes, cut.load_audio(), sampling_rate=cut.sampling_rate, format="wav") + audiobytes.seek(0) + new_recording = Recording( + id=recording.id, + sampling_rate=recording.sampling_rate, + num_samples=cut.num_samples, + duration=cut.duration, + sources=[ + AudioSource( + type="memory", + channels=recording.channel_ids, + source=audiobytes.getvalue(), + ) + ], + ) + return new_recording.to_cut() + + class ExtraField: TYPE = None SUPPORTED_TYPES = {} diff --git a/nemo/collections/common/metrics/__init__.py b/nemo/collections/common/metrics/__init__.py index 9e21d93816a9..81f1a181beae 100644 --- a/nemo/collections/common/metrics/__init__.py +++ b/nemo/collections/common/metrics/__init__.py @@ -19,4 +19,5 @@ MetricStringToTorchMetric, TextMetricsSet, ) +from nemo.collections.common.metrics.perf_metrics import FLOPsMeasurementCallback from nemo.collections.common.metrics.perplexity import Perplexity diff --git a/nemo/collections/common/metrics/perf_metrics.py b/nemo/collections/common/metrics/perf_metrics.py new file mode 100644 index 000000000000..5722f52d0e7c --- /dev/null +++ b/nemo/collections/common/metrics/perf_metrics.py @@ -0,0 +1,233 @@ +from typing import Any, Dict, List, Optional + +import numpy as np +from pytorch_lightning.callbacks import Callback + +from nemo.collections.common.parts.perf_metrics_utils import LLM_VOCAB_SIZE_MAP, read_tb_log +from nemo.utils import logging + +__all__ = ["FLOPsMeasurementCallback"] + + +class FLOPsMeasurementCallback(Callback): + """ + Calculate FLOPs per second after last train step for a given job run. + + Args: + model_config (Dict[str, Any]): params for running the experiment/job. + Expects a nested dictionary with parent keys + 1. run- for assessing model name (Eg. 'gpt3', 'llama2', etc.) from sub-key 'name'. + 'name' usually has value like- train_gpt3_5b_*, which is matched to model name 'gpt3'. + 2. exp_manager- for accessing 'explicit_log_dir'. tensorboard log file is stored here, + used for accessing step time needed for calculating TFLOPs per sec per GPU + 3. trainer- for accessing 'num_nodes' and 'devices' needed for calculating + TFLOPs per sec per GPU + 4. model- Hyperparams for the model. Specifically- global batch size, sequence length, + hidden size, ffn hidden size, num_layers, num_attention_heads, num_query_groups, + moe_router_topk. (list might increase with new models as required) + log_dir (Optional[str]): Directory with tenbsorboard log file. If present, will overrride + 'explicit_log_dir' in model_config. Defaults to None. + model_name (Optional[str]): If present, will override 'name' under 'run' in model_config. + Defaults to None. 
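+
+    Example (illustrative only; the nested keys mirror the description above and the values
+    are placeholders rather than a recommended configuration):
+
+        callback = FLOPsMeasurementCallback(
+            model_config={
+                "run": {"name": "train_gpt3_5b_bf16"},
+                "exp_manager": {"explicit_log_dir": "/results/logs"},
+                "trainer": {"num_nodes": 1, "devices": 8},
+                "model": {
+                    "global_batch_size": 128,
+                    "encoder_seq_length": 2048,
+                    "hidden_size": 4096,
+                    "num_layers": 24,
+                    "ffn_hidden_size": 16384,
+                    "num_attention_heads": 32,
+                },
+            },
+        )
+        trainer = Trainer(..., callbacks=[callback])  # attach to the PyTorch Lightning Trainer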
+ """ + + higher_is_better = True + + def __init__( + self, + model_config: Dict[str, Any], + log_dir: Optional[str] = None, + model_name: Optional[str] = None, + ): + self.cfg = model_config + + self.run_cfg = self.cfg.get('run', {}) + self.exp_cfg = self.cfg.get('exp_manager', {}) + self.train_cfg = self.cfg.get('trainer', {}) + self.model_cfg = self.cfg.get('model', {}) + + # use config params only when NOT provided explicitly + self.model = self.run_cfg.get('name', "") if model_name is None else model_name + self.log_dir = self.exp_cfg.get('explicit_log_dir', None) if log_dir is None else log_dir + + self.num_nodes = self.train_cfg.get('num_nodes', None) + self.num_gpus_per_node = self.train_cfg.get('devices', None) + + self.gbs = self.model_cfg.get('global_batch_size', None) + self.enc_seq_len = self.model_cfg.get('encoder_seq_length', None) + self.hs = self.model_cfg.get('hidden_size', None) + self.layers = self.model_cfg.get('num_layers', None) + self.ffn_hs = self.model_cfg.get('ffn_hidden_size', None) + self.attention_heads = self.model_cfg.get('num_attention_heads', None) + self.moe_router_topk = self.model_cfg.get('moe_router_topk', None) + + # this handles both- 1. key is present, value is None; 2. key is absent + self.query_groups = self.model_cfg.get('num_query_groups', None) + if self.query_groups is None: + self.query_groups = self.attention_heads + + self.model = self.model.lower() if self.model is not None else self.model + + def on_train_end(self, trainer, pl_module): + """ + PyTorch Lightning callback hook to calculate TFLOPs per sec per GPU after training + """ + tflops_per_sec_per_gpu = -1 + + try: + if "peft" in self.cfg["model"]: + raise NotImplementedError("FLOPs measurement not supported for finetuning jobs") + + step_time_list = read_tb_log(self.log_dir, "train_step_timing in s") + tflops_per_sec_per_gpu = self.eval_tflops_per_sec_per_gpu(step_time_list) + except Exception as exc: + logging.error(f"Failed to calculate TFLOPs per sec per GPU.\n{exc}") + + logging.info(f"TFLOPs per sec per GPU={tflops_per_sec_per_gpu:.2f}") + pl_module.logger.experiment.add_scalar("tflops_per_sec_per_gpu", tflops_per_sec_per_gpu) + + def eval_tflops_per_sec_per_gpu(self, train_step_time: List | float | int) -> float: + """ + Args: + train_step_time (Any[List, float, int]): Train step time (in seconds). 
+ Step time will be less stable for initial steps (~10 steps)- less + accurate measurement + Use average step time over several steps for higher accuracy + Returns: + (float): Model TFLOPs per sec per gpu + """ + total_flops, flops_per_gpu = self.eval_model_flops() + + if not isinstance(train_step_time, list): + train_step_time = [train_step_time] + # efficient mean computation if num train steps is very large + step_time_arr = np.array(train_step_time) + train_step_time = np.mean(step_time_arr[len(step_time_arr) // 2 :]) + + return flops_per_gpu / (1e12 * train_step_time) + + def eval_model_flops(self): + """ + Calculate model FLOPs for a given model + """ + + model_flops_map = { + "gpt3": self._gpt3, + "llama2": self._llama2, + "llama3": self._llama3, + "nemotron": self._nemotron, + "mixtral": self._mixtral, + "bert": self._bert, + } + + if self.model is not None: + model_matches = [model for model in model_flops_map if model in self.model] + self.model = model_matches[0] if len(model_matches) > 0 else self.model + if self.model not in model_flops_map: + logging.info(f"FLOPs measurement supported for {list(model_flops_map.keys())}") + raise KeyError(f"Failed to extract valid model name from or missing FLOPs calculations for {self.model}") + + total_flops = model_flops_map[self.model]() + flops_per_gpu = total_flops / (self.num_nodes * self.num_gpus_per_node) + + return total_flops, flops_per_gpu + + def _gpt3(self): + """Model FLOPs for GPT3 family""" + + vocab_size = LLM_VOCAB_SIZE_MAP["gpt3"] + + return ( + 24 * self.gbs * self.enc_seq_len * self.hs * self.hs + + 4 * self.gbs * self.enc_seq_len * self.enc_seq_len * self.hs + ) * (3 * self.layers) + (6 * self.gbs * self.enc_seq_len * self.hs * vocab_size) + + def _llama2(self): + """Model FLOPs for llama2 family""" + vocab_size = LLM_VOCAB_SIZE_MAP["llama2"] + + return ( + self.gbs + * self.enc_seq_len + * self.layers + * self.hs + * self.hs + * ( + 12 + + (12 * self.query_groups / self.attention_heads) + + (18 * self.ffn_hs / self.hs) + + (12 * self.enc_seq_len / self.hs) + + (6 * vocab_size / (self.layers * self.hs)) + ) + ) + + def _llama3(self): + """Model FLOPs for llama3 family""" + vocab_size = LLM_VOCAB_SIZE_MAP["llama3"] + + return ( + self.gbs + * self.enc_seq_len + * self.layers + * self.hs + * self.hs + * ( + 12 + + (12 * self.query_groups / self.attention_heads) + + (18 * self.ffn_hs / self.hs) + + (12 * self.enc_seq_len / self.hs) + + (6 * vocab_size / (self.layers * self.hs)) + ) + ) + + def _nemotron(self): + """Model FLOPs for nemotron family""" + vocab_size = LLM_VOCAB_SIZE_MAP["nemotron"] + + return ( + self.gbs + * self.enc_seq_len + * self.layers + * self.hs + * self.hs + * ( + 12 + + (12 * self.query_groups / self.attention_heads) + + (12 * self.ffn_hs / self.hs) + + (12 * self.enc_seq_len / self.hs) + + (6 * vocab_size / (self.layers * self.hs)) + ) + ) + + def _mixtral(self): + """Model FLOPs for mixtral family""" + vocab_size = LLM_VOCAB_SIZE_MAP["mixtral"] + + return ( + self.gbs + * self.enc_seq_len + * self.layers + * self.hs + * self.hs + * ( + 12 + + (12 * self.query_groups / self.attention_heads) + + (18 * self.moe_router_topk * self.ffn_hs / self.hs) + + (12 * self.enc_seq_len / self.hs) + + (6 * vocab_size / (self.layers * self.hs)) + ) + ) + + def _bert(self): + """Model FLOPs for BERT family""" + vocab_size = LLM_VOCAB_SIZE_MAP["bert"] + + return ( + 72 + * self.gbs + * self.layers + * self.enc_seq_len + * self.hs + * self.hs + * (1 + (self.enc_seq_len / (6 * self.hs)) + (vocab_size / (12 * self.hs * 
self.layers))) + ) diff --git a/nemo/collections/common/parts/perf_metrics_utils.py b/nemo/collections/common/parts/perf_metrics_utils.py new file mode 100644 index 000000000000..41273797e035 --- /dev/null +++ b/nemo/collections/common/parts/perf_metrics_utils.py @@ -0,0 +1,46 @@ +import glob +import os +from typing import List + +from tensorboard.backend.event_processing import event_accumulator + +from nemo.utils import logging + +LLM_VOCAB_SIZE_MAP = { + "gpt3": 51200, + "llama2": 32000, + "llama3": 128256, + "nemotron": 256000, + "bert": 29000, + "mixtral": 32000, +} + + +def read_tb_log(path: str, summary_name: str) -> List: + """ + Reads a TensorBoard Events file from the input path, and returns the + summary specified. + + Args: + path: str, path to the dir where the events file is located. + summary_name: str, name of the summary to read from the TB logs. + Returns: + summary_list: list, the values in the read summary list, formatted as a list. + """ + + files = glob.glob(f"{path}/events*tfevents*") + files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) + if len(files) == 0 or not os.path.isfile(files[0]): + raise FileNotFoundError(f"Missing TensorBoard log file.") + + events_file = files[0] + try: + ea = event_accumulator.EventAccumulator(events_file) + ea.Reload() + summary = ea.Scalars(summary_name) + summary_list = [round(x.value, 2) for x in summary] + logging.info(f"{summary_name}: {summary_list}") + except KeyError: + raise KeyError(f"{summary_name} not found in {events_file}") + + return summary_list diff --git a/nemo/collections/common/parts/utils.py b/nemo/collections/common/parts/utils.py index c22c433bdfdf..75783815548a 100644 --- a/nemo/collections/common/parts/utils.py +++ b/nemo/collections/common/parts/utils.py @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import math import os from typing import Iterable, List +logger = logging.getLogger(__name__) + +import einops +import torch import torch.nn as nn __all__ = ['if_exist', '_compute_softmax', 'flatten'] @@ -105,3 +110,52 @@ def extend_instance(obj, mixin): obj.__class__ = type( base_cls_name, (mixin, base_cls), {} ) # mixin needs to go first for our forward() logic to work + + +def apply_rope_scaling(freqs): + # Apply scaling for RoPE frequencies + logger.info("apply rope scaling ...") + # Values obtained from grid search + scale_factor = 8 + low_freq_factor = 1 + high_freq_factor = 4 + old_context_len = 8192 # original llama3 length + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scale_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + + +def mask_sequence_tensor(tensor: torch.Tensor, lengths: torch.Tensor): + """ + For tensors containing sequences, zero out out-of-bound elements given lengths of every element in the batch. 
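+
+    Example (shapes only; values are illustrative):
+
+        x = torch.randn(2, 80, 10)              # (B, D, L)
+        lengths = torch.tensor([10, 6])
+        x = mask_sequence_tensor(x, lengths)    # frames 6..9 of the second batch element are zeroed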
+ + tensor: tensor of shape (B, L), (B, D, L) or (B, D1, D2, L), + lengths: LongTensor of shape (B,) + """ + batch_size, *_, max_lengths = tensor.shape + + if len(tensor.shape) == 2: + mask = torch.ones(batch_size, max_lengths).cumsum(dim=-1).type_as(lengths) + mask = mask <= einops.rearrange(lengths, 'B -> B 1') + elif len(tensor.shape) == 3: + mask = torch.ones(batch_size, 1, max_lengths).cumsum(dim=-1).type_as(lengths) + mask = mask <= einops.rearrange(lengths, 'B -> B 1 1') + elif len(tensor.shape) == 4: + mask = torch.ones(batch_size, 1, 1, max_lengths).cumsum(dim=-1).type_as(lengths) + mask = mask <= einops.rearrange(lengths, 'B -> B 1 1 1') + else: + raise ValueError('Can only mask tensors of shape B x L, B x D x L and B x D1 x D2 x L') + + return tensor * mask diff --git a/nemo/collections/common/prompts/__init__.py b/nemo/collections/common/prompts/__init__.py index e69de29bb2d1..1acd1d5c60ce 100644 --- a/nemo/collections/common/prompts/__init__.py +++ b/nemo/collections/common/prompts/__init__.py @@ -0,0 +1,11 @@ +from nemo.collections.common.prompts.canary import CanaryPromptFormatter +from nemo.collections.common.prompts.fn import get_prompt_format_fn, registered_prompt_format_fn +from nemo.collections.common.prompts.formatter import PromptFormatter +from nemo.collections.common.prompts.gemma import GemmaPromptFormatter +from nemo.collections.common.prompts.llama import Llama2PromptFormatter, Llama3PromptFormatter +from nemo.collections.common.prompts.mistral import MistralPromptFormatter +from nemo.collections.common.prompts.phi2 import ( + Phi2ChatPromptFormatter, + Phi2CodePromptFormatter, + Phi2QAPromptFormatter, +) diff --git a/nemo/collections/common/prompts/canary.py b/nemo/collections/common/prompts/canary.py index f2b1e58c3bb2..0eb3296bcff9 100644 --- a/nemo/collections/common/prompts/canary.py +++ b/nemo/collections/common/prompts/canary.py @@ -1,4 +1,13 @@ +from typing import Any + +import torch +from lhotse import CutSet, MonoCut +from lhotse.cut import MixedCut +from lhotse.utils import ifnone + +from nemo.collections.common.prompts.fn import registered_prompt_format_fn from nemo.collections.common.prompts.formatter import Modality, PromptFormatter +from nemo.collections.common.tokenizers import TokenizerSpec from nemo.collections.common.tokenizers.canary_tokenizer import ( CANARY_BOS, CANARY_EOS, @@ -33,6 +42,11 @@ class CanaryPromptFormatter(PromptFormatter): }, } + def _validate_slot_values(self, expected: dict[str, Modality], received: dict[str, Any]) -> None: + if "taskname" in received and "task" not in received: + received["task"] = received.pop("taskname") + return super()._validate_slot_values(expected=expected, received=received) + def encode_turn(self, prompt_template: str, expected_slots: dict, slot_values: dict) -> list[int]: # This method handles a level of indirection for Canary. # It maps values provided in trcfg to the actual special tokens @@ -78,3 +92,90 @@ def map_manifest_values_to_special_tokens(slot_values: dict[str, str]) -> dict[s slot_values[PromptFormatter.PROMPT_LANGUAGE_SLOT] = CANARY_SPECIAL_TOKENIZER return slot_values + + +@registered_prompt_format_fn +def canary( + cuts: CutSet, tokenizer: TokenizerSpec +) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]: + """ + Prepend and append control tokens to the token sequence as per Canary format. 
+ + We use the following special tokens: + * <|startoftranscript|> + * <|transcribe|> + * <|translate|> + * <|nopnc|> + * <|pnc|> + * <|endoftext|> + * <|LANG|> - for each supported language. + * <|nospeech|> + + The prompt format syntax is as follows: + + <|startoftranscript|> [ <|nospeech|> | <|LANG|> [ <|transcribe|> | <|translate|> ] <|LANG|> [ <|pnc|> | <|nopnc|> ] TEXT <|endoftext|> ] + + Where expression ``[ a | b ]`` denotes expression ``a`` or expression ``b``, and can be nested. + Note that ``<|LANG|>`` appears twice: the first occurrence is for the "source" language + (i.e., spoken language in the recording) and the second occurrence is for the "target" language + (i.e., the language in which we are going to output the text). + """ + formatter = CanaryPromptFormatter(tokenizer) + + prompts_with_answers, prompts, answers = [], [], [] + for cut in cuts: + if isinstance(cut, MixedCut): + cut = cut._first_non_padding_cut + if not isinstance(cut, MonoCut): + raise TypeError( + f"Expected input audio to have a single channel (required MonoCut/MixedCut, but we received: {cut=})" + ) + + # first, validate the utterance + expected_slots = set(formatter.get_slots("user")) + missing_keys = expected_slots - set(cut.custom) + if "task" in missing_keys and "taskname" in cut.custom: + # Compatibility with "old" Canary manifest format. + # For compatbility with inference options, this slot is now called "task". + cut.custom["task"] = cut.custom["taskname"] + missing_keys.remove("task") + if missing_keys: + raise RuntimeError( + f"We found cut with ID {cut.id} that is missing the following keys: {missing_keys}" + f"Please ensure that every utterance in the input manifests contains these keys." + ) + + turns = [ + dict( + role="user", + slots={ + **{slot: cut.custom[slot] for slot in expected_slots}, + formatter.PROMPT_LANGUAGE_SLOT: CANARY_SPECIAL_TOKENIZER, + }, + ) + ] + # If data has no transcript, create empty response with only. + text = ' '.join(s.text for s in cut.supervisions if s.text is not None) + turns.append( + dict( + role="assistant", + slots={ + "text": text, + formatter.PROMPT_LANGUAGE_SLOT: ifnone( + cut.supervisions[0].language, cut.custom.get("target_lang") + ), + }, + ), + ) + encoded = formatter.encode_dialog(turns) + prompts_with_answers.append(encoded["input_ids"]) + prompts.append(encoded["context_ids"]) + if "answer_ids" in encoded: + assert ( + encoded["answer_ids"][-1].item() == formatter.tokenizer.eos + ), f"Expected the last token in answer_ids to be EOS, but we got {encoded['answer_ids']=}" + answers.append(encoded["answer_ids"][:-1]) # Strip Canary's EOS + else: + answers.append([]) + + return prompts_with_answers, prompts, answers diff --git a/nemo/collections/common/prompts/fn.py b/nemo/collections/common/prompts/fn.py new file mode 100644 index 000000000000..ce7d2fc8a69a --- /dev/null +++ b/nemo/collections/common/prompts/fn.py @@ -0,0 +1,38 @@ +from typing import Callable, Sequence + +import torch +from lhotse import CutSet + +from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + +PROMPT_FORMAT_FNS = {} + + +def registered_prompt_format_fn( + prompt_fn: Callable[[CutSet, TokenizerSpec], tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]] +): + """ + Decorator for registering prompt functions under a name. + + Example:: + + >>> @registered_prompt_format_fn + ... def my_prompt(cuts, tokenizer): + ... pass + ... + ... 
prompt_fn = get_prompt_format_fn("my_prompt") + """ + global PROMPT_FORMAT_FNS + + PROMPT_FORMAT_FNS[prompt_fn.__name__] = prompt_fn + return prompt_fn + + +def get_prompt_format_fn( + name: str, +) -> Callable[[CutSet, TokenizerSpec], tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]]: + if name not in PROMPT_FORMAT_FNS: + raise ValueError( + f"Unknown prompt format function name: {name} " f"(must be one of: {list(PROMPT_FORMAT_FNS.keys())}" + ) + return PROMPT_FORMAT_FNS[name] diff --git a/nemo/collections/common/tokenizers/canary_tokenizer.py b/nemo/collections/common/tokenizers/canary_tokenizer.py index 6adcdd8cf734..881657d81516 100644 --- a/nemo/collections/common/tokenizers/canary_tokenizer.py +++ b/nemo/collections/common/tokenizers/canary_tokenizer.py @@ -71,7 +71,7 @@ def text_to_ids(self, text, lang_id) -> list[int]: return self._tokenize_special_prompt(text) if text.endswith(CANARY_EOS): return super().text_to_ids(text[: -len(CANARY_EOS)], lang_id) + [self.eos_id] - return super().text_to_ids(text[-len(CANARY_EOS) :], lang_id) + return super().text_to_ids(text, lang_id) def _tokenize_special_prompt(self, text: str) -> list[int]: """ diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 83c0a3af48c0..7b2b38e50bc3 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -15,6 +15,13 @@ ) from nemo.collections.llm.gpt.data.api import dolly, mock, squad from nemo.collections.llm.gpt.model import ( + Baichuan2Config, + Baichuan2Config7B, + Baichuan2Model, + ChatGLM2Config6B, + ChatGLM3Config6B, + ChatGLMConfig, + ChatGLMModel, CodeGemmaConfig2B, CodeGemmaConfig7B, CodeLlamaConfig7B, @@ -37,29 +44,21 @@ MaskedTokenLossReduction, MistralConfig7B, MistralModel, + MixtralConfig8x3B, MixtralConfig8x7B, + MixtralConfig8x22B, MixtralModel, gpt_data_step, gpt_forward_step, ) -from nemo.collections.llm.gpt.model.api import ( - code_gemma_2b, - code_gemma_7b, - code_llama_7b, - code_llama_13b, - code_llama_34b, - code_llama_70b, - gemma, - gemma_2b, - gemma_7b, - llama2_7b, - llama2_13b, - llama2_70b, - llama3_8b, - llama3_70b, - mistral, - mixtral, -) +from nemo.collections.llm.recipes import * # noqa +from nemo.utils import logging + +try: + from nemo.collections.llm.api import deploy +except ImportError as error: + deploy = None + logging.warning(f"The deploy module could not be imported: {error}") __all__ = [ "MockDataModule", @@ -70,7 +69,9 @@ "MaskedTokenLossReduction", "MistralConfig7B", "MistralModel", + "MixtralConfig8x3B", "MixtralConfig8x7B", + "MixtralConfig8x22B", "MixtralModel", "LlamaConfig", "Llama2Config7B", @@ -89,6 +90,13 @@ "CodeGemmaConfig2B", "CodeGemmaConfig7B", "GemmaModel", + "Baichuan2Config", + "Baichuan2Config7B", + "Baichuan2Model", + "ChatGLMConfig", + "ChatGLM2Config6B", + "ChatGLM3Config6B", + "ChatGLMModel", "PreTrainingDataModule", "FineTuningDataModule", "SquadDataModule", @@ -103,21 +111,9 @@ "mock", "squad", "dolly", - "mistral", - "mixtral", - "llama2_7b", - "llama3_8b", - "llama2_13b", - "llama2_70b", - "llama3_70b", - "code_llama_7b", - "code_llama_13b", - "code_llama_34b", - "code_llama_70b", - "gemma", - "gemma_2b", - "gemma_7b", - "code_gemma_2b", - "code_gemma_7b", "peft", ] + +# add 'deploy' to __all__ if it was successfully imported +if deploy is not None: + __all__.append("deploy") diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 0bb8f5fa46af..46d94d26b03b 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -1,3 
+1,5 @@ +import json +import os from copy import deepcopy from pathlib import Path from typing import Any, Callable, Optional, Union @@ -6,10 +8,24 @@ from typing_extensions import Annotated from nemo.collections.llm.utils import Config, task +from nemo.deploy import DeployPyTriton from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging +trt_llm_supported = True +try: + from nemo.export.tensorrt_llm import TensorRTLLM +except ImportError as error: + logging.warning(f"TensorRTLLM could not be imported from nemo.export: {error}") + trt_llm_supported = False + +uvicorn_supported = True +try: + import uvicorn +except ImportError as error: + logging.warning(f"uvicorn could not be imported: {error}") + uvicorn_supported = False TokenizerType = Any @@ -225,6 +241,157 @@ def validate( return app_state.exp_dir +def get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, +): + if triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = triton_model_repository + + if nemo_checkpoint is None and triton_model_repository is None: + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." + ) + + if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." + ) + + if nemo_checkpoint is not None and model_type is None: + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") + + if not trt_llm_supported: + raise ValueError("TensorRT-LLM engine is not supported in this environment.") + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + load_model=(nemo_checkpoint is None), + ) + + if nemo_checkpoint is not None: + try: + logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=nemo_checkpoint, + model_type=model_type, + n_gpus=num_gpus, + tensor_parallelism_size=tensor_parallelism_size, + pipeline_parallelism_size=pipeline_parallelism_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + dtype=dtype, + ) + except Exception as error: + raise RuntimeError("An error has occurred during the model export. 
Error message: " + str(error)) + + return trt_llm_exporter + + +def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response): + args_dict = { + "triton_service_ip": triton_http_address, + "triton_service_port": triton_port, + "triton_request_timeout": triton_request_timeout, + "openai_format_response": openai_format_response, + } + with open("nemo/deploy/service/config.json", "w") as f: + json.dump(args_dict, f) + + +@task(namespace="llm") +def deploy( + nemo_checkpoint: Path = None, + model_type: str = "llama", + triton_model_name: str = "xxx", + triton_model_version: Optional[int] = 1, + triton_port: int = 8080, + triton_http_address: str = "0.0.0.0", + triton_request_timeout: int = 60, + triton_model_repository: Path = None, + num_gpus: int = 1, + tensor_parallelism_size: int = 1, + pipeline_parallelism_size: int = 1, + dtype: str = "bfloat16", + max_input_len: int = 256, + max_output_len: int = 256, + max_batch_size: int = 8, + start_rest_service: bool = False, + rest_service_http_address: str = "0.0.0.0", + rest_service_port: int = 8000, + openai_format_response: bool = False, +): + if start_rest_service: + if triton_port == rest_service_port: + logging.error("REST service port and Triton server port cannot use the same port.") + return + # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py + store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) + + triton_deployable = get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + ) + + try: + nm = DeployPyTriton( + model=triton_deployable, + triton_model_name=triton_model_name, + triton_model_version=triton_model_version, + max_batch_size=max_batch_size, + port=triton_port, + address=triton_http_address, + ) + + logging.info("Triton deploy function will be called.") + nm.deploy() + except Exception as error: + logging.error("Error message has occurred during deploy function. Error message: " + str(error)) + return + + try: + logging.info("Model serving on Triton is will be started.") + if start_rest_service and uvicorn_supported: + try: + logging.info("REST service will be started.") + uvicorn.run( + 'nemo.deploy.service.rest_model_api:app', + host=rest_service_http_address, + port=rest_service_port, + reload=True, + ) + except Exception as error: + logging.error("Error message has occurred during REST service start. Error message: " + str(error)) + nm.serve() + except Exception as error: + logging.error("Error message has occurred during deploy function. 
Error message: " + str(error)) + return + + logging.info("Model serving will be stopped.") + nm.stop() + + @task(name="import", namespace="llm") def import_ckpt( model: pl.LightningModule, @@ -289,7 +456,7 @@ def _setup( task_config=getattr(train, "__io__", None), ) if resume is not None: - resume.setup(model, trainer) + resume.setup(trainer, model) if optim: optim.connect(model) diff --git a/nemo/collections/llm/gpt/data/core.py b/nemo/collections/llm/gpt/data/core.py index 8d99583016a4..6f8fe237e10a 100644 --- a/nemo/collections/llm/gpt/data/core.py +++ b/nemo/collections/llm/gpt/data/core.py @@ -32,6 +32,7 @@ def create_sft_dataset( truncation_method: str = 'right', memmap_workers: int = 2, hf_dataset: bool = False, + global_sample_mapping: bool = False, **kwargs, ) -> "GPTSFTDataset": from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset @@ -42,6 +43,7 @@ def create_sft_dataset( max_seq_length=seq_length, memmap_workers=memmap_workers, hf_dataset=hf_dataset, + global_sample_mapping=global_sample_mapping, add_bos=add_bos, add_eos=add_eos, add_sep=add_sep, diff --git a/nemo/collections/llm/gpt/data/dolly.py b/nemo/collections/llm/gpt/data/dolly.py index 9632a142eb35..7ed17e460e0f 100644 --- a/nemo/collections/llm/gpt/data/dolly.py +++ b/nemo/collections/llm/gpt/data/dolly.py @@ -7,13 +7,14 @@ from nemo.collections.llm.gpt.data.core import get_dataset_root from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.lightning.io.mixin import IOMixin from nemo.utils import logging if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec -class DollyDataModule(FineTuningDataModule): +class DollyDataModule(FineTuningDataModule, IOMixin): """A data module for fine-tuning on the Dolly dataset. This class inherits from the `FineTuningDataModule` class and is specifically designed for fine-tuning models on the diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 1be5c41e4919..33a21990e8f7 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -1,3 +1,4 @@ +import math from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, List, Optional, Union @@ -32,6 +33,7 @@ class FineTuningDataModule(pl.LightningDataModule): num_workers (int, optional): The number of worker processes for data loading. Defaults to 8. pin_memory (bool, optional): Whether to pin memory during data loading for faster GPU training. Defaults to True. persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs. Defaults to False. + max_train_steps (int, optional): Maximum number of steps to train. 
Used to calculate samples mapping for the mmap dataset """ def __init__( @@ -60,18 +62,40 @@ def __init__( self.num_workers = num_workers self.pin_memory = pin_memory self.persistent_workers = persistent_workers + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.rampup_batch_size = rampup_batch_size + self.data_sampler = None + self.max_train_samples = None + + def setup(self, stage: str): self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, - micro_batch_size=micro_batch_size, - global_batch_size=global_batch_size, - rampup_batch_size=rampup_batch_size, + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + rampup_batch_size=self.rampup_batch_size, + dataloader_type="batch", ) + # Follows the calculation in nemo.collections.nlp.data.language_modeling.megatron. + # base_dataset_utils.get_datasets_weights_and_num_samples + self.max_train_samples = int(math.ceil(self.global_batch_size * self.trainer.max_steps * 1.005)) + def train_dataloader(self) -> DataLoader: - return self._create_dataloader(self._create_dataset(str(self.train_path))) + return self._create_dataloader( + self._create_dataset( + str(self.train_path), + max_num_samples=self.max_train_samples, + ) + ) def val_dataloader(self) -> DataLoader: - return self._create_dataloader(self._create_dataset(str(self.validation_path))) + return self._create_dataloader( + self._create_dataset( + str(self.validation_path), + is_test=True, + ), + ) def test_dataloader(self) -> DataLoader: return self._create_dataloader( @@ -85,7 +109,12 @@ def test_dataloader(self) -> DataLoader: @lru_cache def _create_dataset(self, path, **kwargs): return create_sft_dataset( - path, tokenizer=self.tokenizer, seq_length=self.seq_length, memmap_workers=self.memmap_workers, **kwargs + path, + tokenizer=self.tokenizer, + seq_length=self.seq_length, + memmap_workers=self.memmap_workers, + seed=self.seed, + **kwargs, ) def _create_dataloader(self, dataset, **kwargs) -> DataLoader: diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index 2aef1c7ecd24..b2d9b5ba8cca 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -1,3 +1,4 @@ +import logging import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional @@ -5,8 +6,9 @@ import pytorch_lightning as pl from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch.utils import data -from nemo.lightning.data import WrappedDataLoader +from nemo.lightning.data import WrappedDataLoader +from nemo.lightning.io.mixin import IOMixin from nemo.lightning.pytorch.plugins import MegatronDataSampler if TYPE_CHECKING: @@ -15,7 +17,45 @@ from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -class PreTrainingDataModule(pl.LightningDataModule): +class PreTrainingDataModule(pl.LightningDataModule, IOMixin): + """PyTorch Lightning-compatible data module for pre-training + GPT-style models. + Args: + paths (Path | List | Dict[str, List]): Paths of the data distributions. Can be either a + single path, a list of paths, or a dictionary. If a single path or a list of paths, + the given paths will be used to generate the train, validation and test datasets. If + providing a list of paths, the format can be either (1) a list of paths, e.g. 
+ ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], + or (2) a flattened, zipped list of weights and paths, e.g. + ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] + If a dictionary is provided, it is expected to have the following form: + { + 'train': , + 'validation': , + 'test': + } + where each value is either a path or a list of paths as described above. + In this case, each split will be generated using the given paths. + Note that if limit_val_batches <= 1, we generate the entire validaton dataset, so + weights should not be provided for the validation split. + seq_length (int): Sequence length. + tokenizer (Optional["TokenizerSpec"]): An instance of a TokenizerSpec object. + micro_batch_size (int): Batch size per GPU. + global_batch_size (int): Global batch size. + rampup_batch_size (Optional[List[int]]): Rampup batch size, should be in format of + [start_global_batch_size, batch_size_increment, ramup_samples]. + num_workers (int): See ``torch.utils.data.DataLoader`` documentation. + pin_memory (bool): See ``torch.utils.data.DataLoader`` documentation. + persistent_workers (bool): See ``torch.utils.data.DataLoader`` documentation. + reset_position_ids (bool): Option to reset the position IDs in the dataset at an interval. + reset_attention_mask (bool): Option to reset the attention mask from the dataset. + eod_mask_loss (int): Option to enable the EOD mask loss. + seed (int): Seed for generating the GPT dataset. + split (str): A string of 3 comma-separated integers denoting how much of the distribution + to allocate to train, validation, and test sets, respectively. Unused if ``paths`` is a dict. + index_mapping_dir (Optional[str]): Path to a directory to write index mapping files. + """ + def __init__( self, paths: Path | List | Dict[str, List], @@ -24,9 +64,6 @@ def __init__( micro_batch_size: int = 4, global_batch_size: int = 8, rampup_batch_size: Optional[List[int]] = None, - num_train_samples: int = 10_000, - num_val_samples: int = 10_000, - num_test_samples: int = 10_000, num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, @@ -64,9 +101,6 @@ def __init__( self.build_kwargs = build_kwargs self.seq_length = seq_length self.tokenizer = tokenizer - self.num_train_samples = num_train_samples - self.num_val_samples = num_val_samples - self.num_test_samples = num_test_samples self.num_workers = num_workers self.pin_memory = pin_memory self.persistent_workers = persistent_workers @@ -156,6 +190,7 @@ def test_dataloader(self) -> EVAL_DATALOADERS: def _create_dataloader(self, dataset, mode, **kwargs) -> WrappedDataLoader: self.init_global_step = self.trainer.global_step + self.data_sampler.init_global_step = self.init_global_step dataloader = WrappedDataLoader( mode=mode, dataset=dataset, @@ -199,18 +234,21 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None: state_dict: the datamodule state returned by ``state_dict``. 
""" - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + try: + from megatron.core.num_microbatches_calculator import update_num_microbatches + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import update_num_microbatches consumed_samples = state_dict['consumed_samples'] self.data_sampler.init_consumed_samples = consumed_samples self.data_sampler.prev_consumed_samples = consumed_samples - num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR # noqa: SLF001 - num_microbatch_calculator.update( + update_num_microbatches( consumed_samples=consumed_samples, consistency_check=False, ) - current_global_batch_size = num_microbatch_calculator.current_global_batch_size self.data_sampler.if_first_step = 1 def reconfigure_limit_batches(self): @@ -224,7 +262,12 @@ def _reconfigure_limit_batches(self, limit_batches, dataloader, mode): Reconfigure trainer.limit_val_batches for pretraining """ # Override limit_batches in terms of num microbatches and so there are limit_batches//num_micro_batches num of global batches - from megatron.core.num_microbatches_calculator import get_num_microbatches + try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches if isinstance(limit_batches, int): limit_batches *= get_num_microbatches() diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index 77d48da98a0e..11104fe3cab2 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -6,13 +6,14 @@ from nemo.collections.llm.gpt.data.core import get_dataset_root from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule +from nemo.lightning.io.mixin import IOMixin from nemo.utils import logging if TYPE_CHECKING: from nemo.collections.common.tokenizers import TokenizerSpec -class SquadDataModule(FineTuningDataModule): +class SquadDataModule(FineTuningDataModule, IOMixin): """A data module for fine-tuning on the Squad dataset. 
This class inherits from the `FineTuningDataModule` class and is specifically designed for fine-tuning models on the @@ -124,3 +125,6 @@ def _preprocess_and_split_data( shutil.rmtree(p) elif '.jsonl' not in str(p.name): p.unlink() + + def reconfigure_limit_batches(self): + return diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 4391a41293ee..e2d940e02d32 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -1,3 +1,4 @@ +from nemo.collections.llm.gpt.model.baichuan import Baichuan2Config, Baichuan2Config7B, Baichuan2Model from nemo.collections.llm.gpt.model.base import ( GPTConfig, GPTModel, @@ -7,6 +8,7 @@ local_layer_spec, transformer_engine_layer_spec, ) +from nemo.collections.llm.gpt.model.chatglm import ChatGLM2Config6B, ChatGLM3Config6B, ChatGLMConfig, ChatGLMModel from nemo.collections.llm.gpt.model.gemma import ( CodeGemmaConfig2B, CodeGemmaConfig7B, @@ -29,13 +31,19 @@ LlamaModel, ) from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel +from nemo.collections.llm.gpt.model.mixtral import ( + MixtralConfig8x3B, + MixtralConfig8x7B, + MixtralConfig8x22B, + MixtralModel, +) __all__ = [ "GPTConfig", "GPTModel", "MistralConfig7B", "MistralModel", + "MixtralConfig8x3B", "MixtralConfig8x7B", "MixtralModel", "LlamaConfig", @@ -55,6 +63,13 @@ "CodeGemmaConfig7B", "GemmaModel", "LlamaModel", + "Baichuan2Config", + "Baichuan2Config7B", + "Baichuan2Model", + "ChatGLMConfig", + "ChatGLM2Config6B", + "ChatGLM3Config6B", + "ChatGLMModel", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", diff --git a/nemo/collections/llm/gpt/model/api.py b/nemo/collections/llm/gpt/model/api.py deleted file mode 100644 index 7c8cbf4d02e6..000000000000 --- a/nemo/collections/llm/gpt/model/api.py +++ /dev/null @@ -1,125 +0,0 @@ -import pytorch_lightning as pl - -from nemo.collections.llm.gpt.model.gemma import ( - CodeGemmaConfig2B, - CodeGemmaConfig7B, - GemmaConfig, - GemmaConfig2B, - GemmaConfig7B, - GemmaModel, -) -from nemo.collections.llm.gpt.model.llama import ( - CodeLlamaConfig7B, - CodeLlamaConfig13B, - CodeLlamaConfig34B, - CodeLlamaConfig70B, - Llama2Config7B, - Llama2Config13B, - Llama2Config70B, - Llama3Config8B, - Llama3Config70B, - LlamaModel, -) -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel -from nemo.collections.llm.utils import factory - - -@factory -def mistral() -> pl.LightningModule: - return MistralModel(MistralConfig7B()) - - -@factory -def mixtral() -> pl.LightningModule: - return MixtralModel(MixtralConfig8x7B()) - - -@factory -def llama2_7b() -> pl.LightningModule: - return LlamaModel(Llama2Config7B()) - - -@factory -def llama3_8b() -> pl.LightningModule: - return LlamaModel(Llama3Config8B()) - - -@factory -def llama2_13b() -> pl.LightningModule: - return LlamaModel(Llama2Config13B()) - - -@factory -def llama2_70b() -> pl.LightningModule: - return LlamaModel(Llama2Config70B()) - - -@factory -def llama3_70b() -> pl.LightningModule: - return LlamaModel(Llama3Config70B()) - - -@factory -def code_llama_7b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig7B()) - - -@factory -def code_llama_13b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig13B()) - - -@factory -def code_llama_34b() -> pl.LightningModule: - return 
LlamaModel(CodeLlamaConfig34B()) - - -@factory -def code_llama_70b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig70B()) - - -@factory -def gemma() -> pl.LightningModule: - return GemmaModel(GemmaConfig()) - - -@factory -def gemma_2b() -> pl.LightningModule: - return GemmaModel(GemmaConfig2B()) - - -@factory -def gemma_7b() -> pl.LightningModule: - return GemmaModel(GemmaConfig7B()) - - -@factory -def code_gemma_2b() -> pl.LightningModule: - return GemmaModel(CodeGemmaConfig2B()) - - -@factory -def code_gemma_7b() -> pl.LightningModule: - return GemmaModel(CodeGemmaConfig7B()) - - -__all__ = [ - "mistral", - "mixtral", - "llama2_7b", - "llama3_8b", - "llama2_13b", - "llama2_70b", - "llama3_70b", - "code_llama_7b", - "code_llama_13b", - "code_llama_34b", - "code_llama_70b", - "gemma", - "gemma_2b", - "gemma_7b", - "code_gemma_2b", - "code_gemma_7b", -] diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py new file mode 100644 index 000000000000..382a90547caa --- /dev/null +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -0,0 +1,266 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Annotated, Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown + +if TYPE_CHECKING: + from transformers import AutoConfig, AutoModelForCausalLM + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +@dataclass +class Baichuan2Config(GPTConfig): + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + add_bias_linear: bool = False + seq_length: int = 4096 + init_method_std: int = 0.02 + layernorm_epsilon: float = 1e-6 + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + + +@dataclass +class Baichuan2Config7B(Baichuan2Config): + num_layers: int = 32 + hidden_size: int = 4096 + num_attention_heads: int = 32 + num_query_groups: int = 32 + ffn_hidden_size: int = 11008 + position_embedding_type: str = "rope" + + +class Baichuan2Model(GPTModel): + def __init__( + self, + config: Annotated[Optional[Baichuan2Config], Config[Baichuan2Config]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__( + config or Baichuan2Config(), optim=optim, tokenizer=tokenizer, model_transform=model_transform + ) + + +@io.model_importer(Baichuan2Model, "hf") +class HFBaichuan2Importer(io.ModelConnector["AutoModelForCausalLM", Baichuan2Model]): + def init(self) -> Baichuan2Model: + return Baichuan2Model(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import AutoModelForCausalLM + + source = AutoModelForCausalLM.from_pretrained(str(self), trust_remote_code=True) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Baichuan model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + mapping = { + 
"model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + "lm_head.weight": "output_layer.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(str(self), trust_remote_code=True) + + @property + def config(self) -> Baichuan2Config: + from transformers import AutoConfig as HFAutoConfig + + source = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = Baichuan2Config( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + gated_linear_unit=True, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + share_embeddings_and_output_weights=False, + position_embedding_type="rope" if source.num_hidden_layers == 32 else "alibi", + ) + + return output + + +@io.model_exporter(Baichuan2Model, "hf") +class HFBaichuan2Exporter(io.ModelConnector[Baichuan2Model, "AutoModelForCausalLM"]): + def init(self) -> "AutoModelForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) + + @property + def tokenizer(self): + return io.load_context(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "AutoConfig": + source: Baichuan2Config = io.load_context(str(self)).model.config + + return AutoConfig( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + 
rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key="model.layers.*.self_attn.W_pack.weight", + target_key="decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_qkv(ctx: io.TransformCTX, qkv_weights): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + qkv_weights = qkv_weights.unflatten(0, (3, hidden_size)) + old_tensor_shape = qkv_weights[0].size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + q = qkv_weights[0].squeeze().view(*new_q_tensor_shape) + k = qkv_weights[1].squeeze().view(*new_kv_tensor_shape) + v = qkv_weights[2].squeeze().view(*new_kv_tensor_shape) + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) + for i in range(num_query_groups): + qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) + qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :])) + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + return qkv_weights + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key="model.layers.*.self_attn.W_pack.weight", +) +def _export_qkv(ctx: io.TransformCTX, qkv_weights): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size]) + + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + return torch.cat( + [ + qkv_weights[q_slice].reshape(-1, hidden_size), + qkv_weights[k_slice].reshape(-1, hidden_size), + qkv_weights[v_slice].reshape(-1, hidden_size), + ] + ) + + +@io.state_transform( + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0).float() + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj + + +__all__ = [ + "Baichuan2Config", + "Baichuan2Config7B", + "Baichuan2Model", +] diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 0e4fabe020af..2badfa2b1915 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ 
b/nemo/collections/llm/gpt/model/base.py @@ -105,6 +105,9 @@ class GPTConfig(TransformerConfig, io.IOMixin): rotary_percent: float = 1.0 seq_len_interpolation_factor: Optional[float] = None seq_length: int = 1024 + attention_softmax_in_fp32: bool = False + masked_softmax_fusion: bool = True + deallocate_pipeline_outputs = True # TODO: Move this to better places? get_attention_mask_from_fusion: bool = False @@ -160,6 +163,8 @@ def __init__( self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True)) self.optim.connect(self) # This will bind the `configure_optimizers` method self.model_transform = model_transform + self._training_loss_reduction = None + self._validation_loss_reduction = None def configure_model(self) -> None: if not hasattr(self, "module"): @@ -200,11 +205,19 @@ def validation_step(self, batch, batch_idx=None) -> torch.Tensor: return self.forward_step(batch) + @property def training_loss_reduction(self) -> MaskedTokenLossReduction: - return MaskedTokenLossReduction() + if not self._training_loss_reduction: + self._training_loss_reduction = MaskedTokenLossReduction() + return self._training_loss_reduction + + @property def validation_loss_reduction(self) -> MaskedTokenLossReduction: - return MaskedTokenLossReduction(validation_step=True) + if not self._validation_loss_reduction: + self._validation_loss_reduction = MaskedTokenLossReduction(validation_step=True) + + return self._validation_loss_reduction def get_batch_on_this_context_parallel_rank(batch): diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py new file mode 100644 index 000000000000..10a3497070c4 --- /dev/null +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -0,0 +1,325 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Annotated, Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown + +if TYPE_CHECKING: + from transformers import AutoConfig, AutoModelForCausalLM + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +@dataclass +class ChatGLMConfig(GPTConfig): + num_layers: int = 28 + hidden_size: int = 4096 + ffn_hidden_size: int = 13696 + num_attention_heads: int = 32 + num_query_groups: int = 2 + init_method_std: float = 0.02 + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + normalization: str = "RMSNorm" + add_bias_linear: bool = False + add_qkv_bias: bool = True + rotary_percent: float = 0.5 + rotary_interleaved: bool = True + activation_func: Callable = F.silu + gated_linear_unit: bool = True + position_embedding_type: str = "rope" + share_embeddings_and_output_weights: bool = False + make_vocab_size_divisible_by: int = 65024 # override vocab size + + +@dataclass +class ChatGLM2Config6B(ChatGLMConfig): + seq_length: int = 32768 + + +@dataclass +class ChatGLM3Config6B(ChatGLMConfig): + seq_length: int = 8192 + + +class ChatGLMModel(GPTModel): + def __init__( + self, + config: Annotated[Optional[ChatGLMConfig], Config[ChatGLMConfig]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + 
super().__init__(config or ChatGLMConfig(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) + + +@io.model_importer(ChatGLMModel, "hf") +class HFChatGLMImporter(io.ModelConnector["AutoModelForCausalLM", ChatGLMModel]): + def init(self) -> ChatGLMModel: + return ChatGLMModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import AutoModelForCausalLM + + source = AutoModelForCausalLM.from_pretrained(str(self), trust_remote_code=True) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted ChatGLM model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + mapping = { + "transformer.embedding.word_embeddings.weight": "embedding.word_embeddings.weight", + "transformer.encoder.layers.*.self_attention.dense.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "transformer.encoder.layers.*.mlp.dense_h_to_4h.weight": "decoder.layers.*.mlp.linear_fc1.weight", + "transformer.encoder.layers.*.mlp.dense_4h_to_h.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "transformer.encoder.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "transformer.encoder.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "transformer.encoder.final_layernorm.weight": "decoder.final_layernorm.weight", + "transformer.output_layer.weight": "output_layer.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv_weight, _import_qkv_bias]) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(str(self), trust_remote_code=True) + + @property + def config(self) -> ChatGLMConfig: + from transformers import AutoConfig as HFAutoConfig + + source = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True) + output = ChatGLMConfig( + num_layers=source.num_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + seq_length=source.seq_length, + num_query_groups=source.multi_query_group_num, + make_vocab_size_divisible_by=source.padded_vocab_size, + ) + + return output + + +@io.model_exporter(ChatGLMModel, "hf") +class HFChatGLMExporter(io.ModelConnector[ChatGLMModel, "AutoModelForCausalLM"]): + def init(self) -> "AutoModelForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "transformer.embedding.word_embeddings.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "transformer.encoder.layers.*.self_attention.dense.weight", + "decoder.layers.*.mlp.linear_fc1.weight": "transformer.encoder.layers.*.mlp.dense_h_to_4h.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "transformer.encoder.layers.*.mlp.dense_4h_to_h.weight", + 
"decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "transformer.encoder.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "transformer.encoder.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "transformer.encoder.final_layernorm.weight", + "output_layer.weight": "transformer.output_layer.weight", + } + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=[ + _export_qkv_weight, + _export_qkv_bias, + ], + ) + + @property + def tokenizer(self): + return io.load_context(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "AutoConfig": + source: ChatGLMConfig = io.load_context(str(self)).model.config + + return AutoConfig( + num_layers=source.num_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + seq_length=source.seq_length, + multi_query_group_num=source.num_query_groups, + padded_vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key="transformer.encoder.layers.*.self_attention.query_key_value.weight", + target_key="decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_qkv_weight(ctx: io.TransformCTX, hf_qkv_weights): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = hf_qkv_weights.size() + new_q_tensor_shape = (head_num, head_size, old_tensor_shape[1]) + new_kv_tensor_shape = (num_query_groups, head_size, old_tensor_shape[1]) + q, k, v = hf_qkv_weights.split( + [head_num * head_size, num_query_groups * head_size, num_query_groups * head_size], dim=0 + ) + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights = torch.empty((0, head_size, old_tensor_shape[1])) + for i in range(num_query_groups): + qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) + qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :])) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +@io.state_transform( + source_key="transformer.encoder.layers.*.self_attention.query_key_value.bias", + target_key="decoder.layers.*.self_attention.linear_qkv.bias", +) +def _import_qkv_bias(ctx: io.TransformCTX, hf_qkv_bias): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + new_q_tensor_shape = (head_num, head_size) + new_kv_tensor_shape = (num_query_groups, head_size) + q, k, v = hf_qkv_bias.split( + [head_num * head_size, num_query_groups * head_size, num_query_groups * head_size], dim=0 + ) + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + qkv_bias = torch.empty((0, head_size)) + for i in range(num_query_groups): + qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) + qkv_bias = 
torch.cat((qkv_bias, k[i : i + 1, :])) + qkv_bias = torch.cat((qkv_bias, v[i : i + 1, :])) + qkv_bias = qkv_bias.reshape( + [ + head_size * (head_num + 2 * num_query_groups), + ] + ) + return qkv_bias + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key="transformer.encoder.layers.*.self_attention.query_key_value.weight", +) +def _export_qkv_weight(ctx: io.TransformCTX, qkv_weights): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size]) + + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_weight = qkv_weights[q_slice].reshape(-1, hidden_size) + k_weight = qkv_weights[k_slice].reshape(-1, hidden_size) + v_weight = qkv_weights[v_slice].reshape(-1, hidden_size) + return torch.cat((q_weight, k_weight, v_weight), dim=0) + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.bias", + target_key="transformer.encoder.layers.*.self_attention.query_key_value.bias", +) +def _export_qkv_bias(ctx: io.TransformCTX, qkv_bias): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size]) + + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_bias = qkv_bias[q_slice].reshape( + -1, + ) + k_bias = qkv_bias[k_slice].reshape( + -1, + ) + v_bias = qkv_bias[v_slice].reshape( + -1, + ) + return torch.cat((q_bias, k_bias, v_bias)) + + +__all__ = [ + "ChatGLMConfig", + "ChatGLM2Config6B", + "ChatGLM3Config6B", + "ChatGLMModel", +] diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 6493bb0dfad7..7d45b76e6034 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -30,6 +30,8 @@ class GemmaConfig(GPTConfig): add_bias_linear: bool = False seq_length: int = 8192 kv_channels: int = 256 + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 share_embeddings_and_output_weights: bool = True # Note: different behavior compared to Legacy NeMo # Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script @@ -73,6 +75,13 @@ def __init__( ): super().__init__(config or GemmaConfig(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) + def configure_model(self): + from 
nemo.collections.common.parts.utils import extend_instance + from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import EmbeddingScalingMixin + + super().configure_model() + extend_instance(self.module.embedding, EmbeddingScalingMixin) + @io.model_importer(GemmaModel, "hf") class HFGemmaImporter(io.ModelConnector["GemmaForCausalLM", GemmaModel]): diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index c7add828b7f4..425170c07707 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -29,6 +29,9 @@ class LlamaConfig(GPTConfig): position_embedding_type: str = "rope" add_bias_linear: bool = False seq_length: int = 4096 + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False @dataclass @@ -59,15 +62,48 @@ class Llama2Config70B(LlamaConfig): @dataclass -class Llama3Config8B(Llama2Config7B): - seq_length: int = 8192 +class Llama3Config(GPTConfig): num_query_groups: int = 8 + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + normalization = "RMSNorm" + init_method_std: float = 0.01 + layernorm_epsilon: float = 1.0e-05 + add_bias_linear: bool = False + activation_func: Callable = F.silu + gated_linear_unit: bool = True + apply_query_key_layer_scaling: bool = True + # Fusions + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + apply_rope_fusion: bool = True + share_embeddings_and_output_weights: bool = False + position_embedding_type = "rope" + rotary_percent: float = 1.0 + + +@dataclass +class Llama3Config8B(Llama3Config): + rotary_base: int = 500_000 + seq_length: int = 8192 + num_layers: int = 32 + hidden_size: int = 4096 ffn_hidden_size: int = 14336 + num_attention_heads: int = 32 @dataclass -class Llama3Config70B(Llama2Config70B): +class Llama3Config70B(Llama3Config): + rotary_base: int = 500_000 seq_length: int = 8192 + num_layers: int = 80 + hidden_size: int = 8192 + ffn_hidden_size: int = 28672 + num_attention_heads: int = 64 + init_method_std: float = 0.008944 + make_vocab_size_divisible_by: int = 128 @dataclass diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index d1049cfe77ce..61a96917537c 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -35,12 +35,49 @@ class MistralConfig7B(GPTConfig): num_query_groups: int = 8 ffn_hidden_size: int = 14336 seq_length: int = 32768 + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False init_method_std: float = 0.02 layernorm_epsilon: float = 1e-5 window_size: List[int] = field(default_factory=lambda: [4096, 0]) +@dataclass +class MistralNeMo2407Config12B(MistralConfig7B): + """ + https://mistral.ai/news/mistral-nemo/ + """ + + num_layers: int = 40 + hidden_size: int = 5120 + kv_channels: int = 128 + seq_length: int = 4096 # but "max_position_embeddings": 1024000, + + window_size: List[int] = None + rotary_percent: float = 1.0 + rotary_base: float = 1000000.0 + + +@dataclass +class MistralNeMo2407Config123B(MistralConfig7B): + """ + https://mistral.ai/news/mistral-large-2407/ + """ + + num_layers: int = 88 + hidden_size: int = 12288 + ffn_hidden_size: int = 28672 + num_attention_heads: int = 96 + kv_channels: int = 128 + seq_length: int = 4096 # but "max_position_embeddings": 131072, + + window_size: List[int] = None + 
rotary_percent: float = 1.0 + rotary_base: float = 1000000.0 + + class MistralModel(GPTModel): def __init__( self, @@ -106,11 +143,15 @@ def make_vocab_size_divisible_by(mistral_vocab_size): base //= 2 return base + window_size = None + if getattr(source, 'sliding_window', None) is not None: + window_size = [source.sliding_window, 0] output = MistralConfig7B( seq_length=source.sliding_window, num_layers=source.num_hidden_layers, hidden_size=source.hidden_size, ffn_hidden_size=source.intermediate_size, + kv_channels=getattr(source, 'head_dim', source.hidden_size // source.num_attention_heads), num_attention_heads=source.num_attention_heads, # max_position_embeddings=source.max_position_embeddings, init_method_std=source.initializer_range, @@ -119,7 +160,7 @@ def make_vocab_size_divisible_by(mistral_vocab_size): rotary_base=source.rope_theta, gated_linear_unit=True, make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), - window_size=[source.sliding_window, 0], + window_size=window_size, share_embeddings_and_output_weights=False, ) @@ -172,7 +213,7 @@ def config(self) -> "MistralConfig": from transformers import MistralConfig as HfMistralConfig return HfMistralConfig( - sliding_window=source.window_size[0], + sliding_window=source.window_size[0] if source.window_size is not None else None, num_hidden_layers=source.num_layers, hidden_size=source.hidden_size, intermediate_size=source.ffn_hidden_size, @@ -183,6 +224,7 @@ def config(self) -> "MistralConfig": num_key_value_heads=source.num_query_groups, rope_theta=source.rotary_base, vocab_size=self.tokenizer.vocab_size, + head_dim=source.kv_channels, ) @@ -202,7 +244,7 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): heads_per_group = head_num // num_query_groups hidden_size = megatron_config.hidden_size head_num = megatron_config.num_attention_heads - head_size = hidden_size // head_num + head_size = megatron_config.kv_channels old_tensor_shape = q.size() new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] @@ -244,7 +286,7 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): heads_per_group = head_num // num_query_groups hidden_size = megatron_config.hidden_size head_num = megatron_config.num_attention_heads - head_size = hidden_size // head_num + head_size = megatron_config.kv_channels qkv_total_dim = head_num + 2 * num_query_groups linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index 6256b67515ee..7100b62c2aa6 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Callable, Optional +from typing import TYPE_CHECKING, Callable, Optional, Union import torch import torch.nn.functional as F @@ -18,10 +18,9 @@ @dataclass -class MixtralConfig8x7B(GPTConfig): +class MixtralConfig(GPTConfig): """ - Config for Mixtral-8x7B model - Official announcement: https://mistral.ai/news/mixtral-of-experts/ + Base config for Mixtral models. """ normalization: str = "RMSNorm" @@ -29,30 +28,84 @@ class MixtralConfig8x7B(GPTConfig): position_embedding_type: str = "rope" add_bias_linear: bool = False gated_linear_unit: bool = True - apply_query_key_layer_scaling: bool = False # TODO: Should this be True? 
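A brief sketch of the two fallbacks the Mistral importer above now applies, using a stand-in object in place of a real Hugging Face config; the numbers are illustrative only.

from types import SimpleNamespace

# Stand-in for an HF Mistral-style config: no sliding window, explicit head_dim of 128.
hf_cfg = SimpleNamespace(sliding_window=None, hidden_size=5120, num_attention_heads=32, head_dim=128)

# window_size stays None when the checkpoint does not use sliding-window attention.
window_size = [hf_cfg.sliding_window, 0] if getattr(hf_cfg, 'sliding_window', None) is not None else None

# kv_channels prefers an explicit head_dim and falls back to hidden_size // num_attention_heads.
kv_channels = getattr(hf_cfg, 'head_dim', hf_cfg.hidden_size // hf_cfg.num_attention_heads)

assert window_size is None and kv_channels == 128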
+ apply_query_key_layer_scaling: bool = False num_layers: int = 32 hidden_size: int = 4096 num_attention_heads: int = 32 num_query_groups: int = 8 ffn_hidden_size: int = 14336 - max_position_embeddings: int = 4096 # 32768 - seq_length: int = 4096 # 32768 + max_position_embeddings: int = 4096 + seq_length: int = 4096 + attention_dropout: float = 0.0 + hidden_dropout: float = 0.0 + share_embeddings_and_output_weights: bool = False + # MoE num_moe_experts: int = 8 moe_router_topk: int = 1 + moe_router_pre_softmax: bool = True init_method_std: float = 0.02 layernorm_epsilon: float = 1e-5 # rotary - rotary_percent: float = 0.5 - rotary_base: float = 10000 + rotary_percent: float = 1.0 + rotary_base: float = 1000000.0 + bf16: bool = True + params_dtype: torch.dtype = torch.bfloat16 + + +@dataclass +class MixtralConfig8x3B(MixtralConfig): + """ + NeMo's Mixtral-8x3B model variant + https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml + """ + + num_layers: int = 32 + hidden_size: int = 2560 + num_attention_heads: int = 32 + ffn_hidden_size: int = 8960 + max_position_embeddings: int = 4096 + seq_length: int = 4096 + + +@dataclass +class MixtralConfig8x7B(MixtralConfig): + """ + Config for Mixtral-8x7B model + Official announcement: https://mistral.ai/news/mixtral-of-experts/ + """ + + num_layers: int = 32 + hidden_size: int = 4096 + ffn_hidden_size: int = 14336 + max_position_embeddings: int = 4096 + seq_length: int = 4096 + + +@dataclass +class MixtralConfig8x22B(MixtralConfig): + """ + Config for Mixtral-8x22B model + Official announcement: https://mistral.ai/news/mixtral-8x22b/ + """ + + num_layers: int = 56 + hidden_size: int = 6144 + num_attention_heads: int = 48 + ffn_hidden_size: int = 16384 + max_position_embeddings: int = 4096 + seq_length: int = 4096 + # MoE + num_moe_experts: int = 8 + moe_router_topk: int = 2 class MixtralModel(GPTModel): def __init__( self, - config: Optional[MixtralConfig8x7B] = None, + config: Optional[Union[MixtralConfig8x7B, MixtralConfig8x22B]] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional["TokenizerSpec"] = None, model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, @@ -70,7 +123,7 @@ def init(self) -> MixtralModel: def apply(self, output_path: Path) -> Path: from transformers import MixtralForCausalLM - source = MixtralForCausalLM.from_pretrained(str(self)) + source = MixtralForCausalLM.from_pretrained(str(self), torch_dtype='auto', use_safetensors=True) target = self.init() trainer = self.nemo_setup(target) self.convert_state(source, target) @@ -104,16 +157,21 @@ def tokenizer(self) -> "AutoTokenizer": return AutoTokenizer(str(self)) @property - def config(self) -> MixtralConfig8x7B: + def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: from transformers import MixtralConfig as HfMixtralConfig config = HfMixtralConfig.from_pretrained(str(self)) - return MixtralConfig8x7B( + config_cls = MixtralConfig8x7B + if '8x22b' in str(self).lower(): + config_cls = MixtralConfig8x22B + return config_cls( + bf16=getattr(config, "torch_dtype", None) == torch.bfloat16, activation_func=F.silu, # network num_layers=config.num_hidden_layers, hidden_size=config.hidden_size, ffn_hidden_size=config.intermediate_size, + kv_channels=getattr(config, 'head_dim', config.hidden_size // config.num_attention_heads), max_position_embeddings=config.max_position_embeddings, # TODO seq_length=config.max_position_embeddings, # RoPE @@ -124,6 +182,7 @@ def config(self) ->
MixtralConfig8x7B: num_query_groups=config.num_key_value_heads, num_moe_experts=config.num_local_experts, moe_router_topk=config.num_experts_per_tok, + moe_router_pre_softmax=True, # norm normalization='RMSNorm', layernorm_epsilon=config.rms_norm_eps, @@ -132,6 +191,10 @@ def config(self) -> MixtralConfig8x7B: gated_linear_unit=True, # Vocab make_vocab_size_divisible_by=128, + # CPU init + use_cpu_initialization=True, + perform_initialization=False, + params_dtype=getattr(config, "torch_dtype", torch.bfloat16), ) @@ -151,7 +214,7 @@ def _import_qkv(ctx: io.TransformCTX, q, k, v): heads_per_group = head_num // num_query_groups hidden_size = megatron_config.hidden_size head_num = megatron_config.num_attention_heads - head_size = hidden_size // head_num + head_size = megatron_config.kv_channels old_tensor_shape = q.size() new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] @@ -232,7 +295,8 @@ def tokenizer(self): @property def config(self) -> "MixtralConfig": - source: MixtralConfig7B = io.load_ckpt(str(self)).model.config + # Either MixtralConfig8x7B or MixtralConfig8x22B + source: MixtralConfig8x7B = io.load_ckpt(str(self)).model.config from transformers import MixtralConfig as HfMixtralConfig @@ -255,6 +319,7 @@ def config(self) -> "MixtralConfig": initializer_range=source.init_method_std, # vocab vocab_size=self.tokenizer.vocab_size, + head_dim=source.kv_channels, ) @@ -274,7 +339,7 @@ def _export_qkv(ctx: io.TransformCTX, linear_qkv): heads_per_group = head_num // num_query_groups hidden_size = megatron_config.hidden_size head_num = megatron_config.num_attention_heads - head_size = hidden_size // head_num + head_size = megatron_config.kv_channels qkv_total_dim = head_num + 2 * num_query_groups linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 913144d1bf5f..71b60d5df59f 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -17,12 +17,26 @@ class to provide a specific implementation of the forward method. """ def forward(self, x): - linear_output, bias = self.to_wrap(x) - if isinstance(linear_output, tuple) and len(linear_output) == 2: - linear_output, layernorm_output = linear_output - adapter_output = self.adapter(layernorm_output) - else: - adapter_output = self.adapter(x) + linear_output = self.to_wrap(x) + assert isinstance( + linear_output, tuple + ), f"{self.to_wrap} should return a tuple but instead returns {linear_output}" + """ Four cases for the wrapped module's return values + 1. nothing: (out, None) + 2. return_bias: (out, bias) + 3. return_layernorm_output: ((out, ln_out), None) + 4. 
both: (out, bias, ln_out) + """ + if len(linear_output) == 2: + linear_output, bias = linear_output + if isinstance(linear_output, tuple) and len(linear_output) == 2: + linear_output, layernorm_output = linear_output + x = layernorm_output + elif len(linear_output) == 3: + linear_output, bias, layernorm_output = linear_output + x = layernorm_output + + adapter_output = self.adapter(x) return linear_output + adapter_output, bias @@ -72,6 +86,8 @@ class LoRA(PEFT): alpha: int = 32 dropout: float = 0.0 dropout_position: Literal['pre', 'post'] = 'post' + lora_A_init_method: str = "xavier" + lora_B_init_method: str = "zero" def transform(self, m: nn.Module, name=None, prefix=None): """ @@ -96,6 +112,11 @@ def transform(self, m: nn.Module, name=None, prefix=None): input_is_parallel = False in_features = m.in_features out_features = m.out_features * tp_size + # LoRA is applied after layernorm, so layernorm output must be returned + m.return_layernorm_output = True + # perf optimization for LoRA + SP + if m.config.sequence_parallel and not m.ub_overlap_ag: + m.return_layernorm_output_gathered = True else: # name in ['linear_proj', 'linear_fc2'] # Row Parallel Linear input_is_parallel = True @@ -110,8 +131,8 @@ def transform(self, m: nn.Module, name=None, prefix=None): activation='identity', norm_position=None, norm_type=None, - column_init_method="normal", - row_init_method="zero", + column_init_method=self.lora_A_init_method, + row_init_method=self.lora_B_init_method, gather_output=False, input_is_parallel=input_is_parallel, dropout=self.dropout, diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py new file mode 100644 index 000000000000..d9fb5cc61f38 --- /dev/null +++ b/nemo/collections/llm/recipes/__init__.py @@ -0,0 +1,14 @@ +from nemo.collections.llm.recipes import llama3_8b, llama3_8b_16k, llama3_8b_64k, llama3_70b, mistral +from nemo.collections.llm.recipes.log.default import default_log, default_resume +from nemo.collections.llm.recipes.optim import adam + +__all__ = [ + "llama3_8b", + "llama3_8b_16k", + "llama3_8b_64k", + "llama3_70b", + "mistral", + "adam", + "default_log", + "default_resume", +] diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py new file mode 100644 index 000000000000..4b99aef74a30 --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -0,0 +1,113 @@ +from typing import Callable, Optional + +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config70B, LlamaModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback + +NAME = "llama3_70b" + + +def model() -> Config[pl.LightningModule]: + return Config(LlamaModel, config=Config(Llama3Config70B)) + + +def trainer( + tensor_parallelism: int, + pipeline_parallelism: int, + pipeline_parallelism_type: Optional[torch.dtype], + 
virtual_pipeline_parallelism: Optional[int], + context_parallelism: int, + sequence_parallelism: bool, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[Config[Callback]]] = None, +) -> Config[nl.Trainer]: + strategy = Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), + ) + + trainer = Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + return Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=4, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], + ), + data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +def hf_resume() -> Config[nl.AutoResume]: + return Config(nl.AutoResume, import_path="hf://meta-llama/Meta-Llama-3.1-70B") + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune + ) + recipe.resume = hf_resume() + recipe.peft = Config(LoRA) + recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py new file mode 100644 index 000000000000..d70366f6c5ed --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -0,0 +1,113 @@ +from typing import Callable, Optional + +import pytorch_lightning as pl +import torch +from megatron.core.distributed import DistributedDataParallelConfig +from pytorch_lightning.callbacks.callback import Callback + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import 
distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Config, Partial +from nemo.utils.exp_manager import TimingCallback + +NAME = "llama3_8b" + + +def model() -> Config[pl.LightningModule]: + return Config(LlamaModel, config=Config(Llama3Config8B)) + + +def trainer( + tensor_parallelism: int, + pipeline_parallelism: int, + pipeline_parallelism_type: Optional[torch.dtype], + virtual_pipeline_parallelism: Optional[int], + context_parallelism: int, + sequence_parallelism: bool, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[Config[Callback]]] = None, +) -> Config[nl.Trainer]: + strategy = Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), + ) + + trainer = Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + return Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=1, + pipeline_parallelism=1, + pipeline_parallelism_type=None, + virtual_pipeline_parallelism=None, + context_parallelism=2, + sequence_parallelism=False, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[Config(TimingCallback)], + ), + data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + +def hf_resume() -> Config[nl.AutoResume]: + return Config(nl.AutoResume, import_path="hf://meta-llama/Meta-Llama-3.1-8B") + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune + ) + recipe.resume = hf_resume() + recipe.peft = Config(LoRA) + recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py new file mode 100644 index 000000000000..8bb2b636eba0 --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -0,0 +1,59 @@ +from typing import Callable + +import torch + +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.recipes import llama3_8b +from nemo.collections.llm.utils import Partial + +NAME = "llama3_8b_16k" + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: 
int, fn: Callable = pretrain +) -> Partial: + recipe = llama3_8b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = llama3_8b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_8b.model() + model.config.seq_length = 16384 + + recipe.model = model + recipe.trainer = trainer + + return recipe + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = llama3_8b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node + ) + + trainer = llama3_8b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=2, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_8b.model() + model.config.seq_length = 16384 + + recipe.model = model + recipe.trainer = trainer + + return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py new file mode 100644 index 000000000000..b42e1e53399e --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -0,0 +1,59 @@ +from typing import Callable + +import torch + +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.recipes import llama3_8b +from nemo.collections.llm.utils import Partial + +NAME = "llama3_8b_64k" + + +def pretrain_recipe( + name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain +) -> Partial: + recipe = llama3_8b.pretrain_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn + ) + + trainer = llama3_8b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=4, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_8b.model() + model.config.seq_length = 65536 + + recipe.model = model + recipe.trainer = trainer + + return recipe + + +def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: + recipe = llama3_8b.finetune_recipe( + name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node + ) + + trainer = llama3_8b.trainer( + tensor_parallelism=2, + pipeline_parallelism=4, + pipeline_parallelism_type=torch.bfloat16, + virtual_pipeline_parallelism=5, + context_parallelism=4, + sequence_parallelism=True, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + ) + model = llama3_8b.model() + model.config.seq_length = 65536 + + recipe.model = model + recipe.trainer = trainer + + return recipe diff --git a/nemo/collections/llm/recipes/log/__init__.py b/nemo/collections/llm/recipes/log/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py new file mode 100644 index 000000000000..dc18565a0e06 --- /dev/null +++ b/nemo/collections/llm/recipes/log/default.py @@ -0,0 +1,52 @@ +from typing import Optional + +from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger + +from nemo 
import lightning as nl +from nemo.collections.llm.utils import Config + + +def tensorboard_logger(name: str, save_dir: str = "tb_logs") -> Config[TensorBoardLogger]: + return Config(TensorBoardLogger, save_dir=save_dir, name=name) + + +def wandb_logger(project: str, name: str) -> Config[WandbLogger]: + return Config( + WandbLogger, + project=project, + name=name, + config={}, + ) + + +def default_log( + ckpt_dir: str, + name: str, + tensorboard_logger: Optional[Config[TensorBoardLogger]] = None, + wandb_logger: Optional[Config[WandbLogger]] = None, +) -> Config[nl.NeMoLogger]: + ckpt = Config( + nl.ModelCheckpoint, + save_best_model=False, + save_last=True, + save_top_k=10, + every_n_train_steps=200, + filename="{model_name}--{val_loss:.2f}-{step}-{consumed_samples}", + ) + + return Config( + nl.NeMoLogger, + ckpt=ckpt, + name=name, + tensorboard=tensorboard_logger, + wandb=wandb_logger, + dir=ckpt_dir, + ) + + +def default_resume() -> Config[nl.AutoResume]: + return Config( + nl.AutoResume, + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + ) diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral.py new file mode 100644 index 000000000000..061e82c9d9d2 --- /dev/null +++ b/nemo/collections/llm/recipes/mistral.py @@ -0,0 +1,61 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "mistral" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return MistralModel(MistralConfig7B()) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://mistralai/Mistral-7B-v0.3") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=distributed_fused_adam_with_cosine_annealing(), + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=distributed_fused_adam_with_cosine_annealing(), + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/mixtral_8x22b_4k.py b/nemo/collections/llm/recipes/mixtral_8x22b_4k.py new file mode 100644 index 000000000000..5a29cca38506 --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x22b_4k.py @@ -0,0 +1,64 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x22B, MixtralModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from 
nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "mixtral_8x22b_4k" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return MixtralModel(MixtralConfig8x22B(seq_length=4096)) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=8, + sequence_parallel=True, + ) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://mistralai/Mixtral-8x22B-v0.1") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=distributed_fused_adam_with_cosine_annealing(), + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=distributed_fused_adam_with_cosine_annealing(), + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_4k.py b/nemo/collections/llm/recipes/mixtral_8x7b_4k.py new file mode 100644 index 000000000000..5afa3cd072f6 --- /dev/null +++ b/nemo/collections/llm/recipes/mixtral_8x7b_4k.py @@ -0,0 +1,64 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "mixtral_8x7b_4k" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return MixtralModel(MixtralConfig8x7B(seq_length=4096)) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=8, + sequence_parallel=True, + ) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://mistralai/Mixtral-8x7B-v0.1") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=distributed_fused_adam_with_cosine_annealing(), + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=distributed_fused_adam_with_cosine_annealing(), + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/optim/__init__.py b/nemo/collections/llm/recipes/optim/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py new file mode 100644 index 000000000000..d46f7d5d36d6 --- /dev/null +++ 
b/nemo/collections/llm/recipes/optim/adam.py @@ -0,0 +1,33 @@ +from megatron.core.optimizer import OptimizerConfig + +from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, OptimizerModule + + +def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config[OptimizerModule]: + opt_cfg = Config( + OptimizerConfig, + optimizer="adam", + lr=max_lr, + weight_decay=0.1, + bf16=True, + adam_beta1=0.9, + adam_beta2=0.95, + adam_eps=1e-5, + use_distributed_optimizer=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ) + + sched = Config( + CosineAnnealingScheduler, + warmup_steps=2000, + constant_steps=0, + min_lr=0.1 * max_lr, + ) + + return Config( + MegatronOptimizerModule, + config=opt_cfg, + lr_scheduler=sched, + ) diff --git a/nemo/collections/llm/tokenizer.py b/nemo/collections/llm/tokenizer.py index 3943e24ba799..808642758ebe 100644 --- a/nemo/collections/llm/tokenizer.py +++ b/nemo/collections/llm/tokenizer.py @@ -9,8 +9,8 @@ track_io( AutoTokenizer, artifacts=[ - FileArtifact("vocab_file"), - FileArtifact("merges_file"), + FileArtifact("vocab_file", required=False), + FileArtifact("merges_file", required=False), ], ) __all__.append("AutoTokenizer") diff --git a/nemo/collections/llm/utils.py b/nemo/collections/llm/utils.py index b4382d0afd5f..5ff01a9b0a86 100644 --- a/nemo/collections/llm/utils.py +++ b/nemo/collections/llm/utils.py @@ -1,14 +1,19 @@ +import logging from typing import Any, Callable, Generic, TypeVar, Union, overload -T = TypeVar('T', bound=Callable[..., Any]) +T = TypeVar("T", bound=Callable[..., Any]) try: - import nemo_sdk as sdk + import nemo_run as run - Config = sdk.Config - Partial = sdk.Partial + Config = run.Config + Partial = run.Partial except ImportError: - _T = TypeVar('_T') + logging.warning( + "Trying to use Config or Partial, but NeMo-Run is not installed. Please install NeMo-Run before proceeding." + ) + + _T = TypeVar("_T") class Config(Generic[_T]): pass @@ -19,10 +24,10 @@ class Partial(Generic[_T]): def task(*args: Any, **kwargs: Any) -> Callable[[T], T]: try: - import nemo_sdk as sdk + import nemo_run as run - return sdk.task(*args, **kwargs) - except ImportError: + return run.task(*args, **kwargs) + except (ImportError, AttributeError): # Return a no-op function def noop_decorator(func: T) -> T: return func @@ -40,15 +45,14 @@ def factory(*args: Any, **kwargs: Any) -> Callable[[T], T]: ... 
def factory(*args: Any, **kwargs: Any) -> Union[Callable[[T], T], T]: try: - import nemo_sdk as sdk + import nemo_run as run - if not args and not kwargs: - # Used as @factory without arguments - return sdk.factory() + if not args: + return run.factory(**kwargs) else: # Used as @factory(*args, **kwargs) - return sdk.factory(*args, **kwargs) - except ImportError: + return run.factory(*args, **kwargs) + except (ImportError, AttributeError): # Return a no-op function def noop_decorator(func: T) -> T: return func diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 17cb6e6cf644..8102d179757e 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -1004,6 +1004,8 @@ def __len__(self): return len(self.list_data_dict) def __getitem__(self, i) -> Dict[str, torch.Tensor]: + if isinstance(i, np.integer): + i = int(i) sources = self.list_data_dict[i] if isinstance(i, int): sources = [sources] @@ -1190,7 +1192,6 @@ class NevaDataset(LazySupervisedDataset): """Dataset for supervised fine-tuning.""" def __init__(self, data_path: str, tokenizer, multimodal_cfg: dict, data_cfg: dict): - if data_path.endswith(".json"): super(NevaDataset, self).__init__(data_path, tokenizer, multimodal_cfg, data_cfg) @@ -1313,7 +1314,7 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: return batch -def make_supervised_data_module(tokenizer, image_processor, model_cfg) -> Dict: +def make_supervised_data_module(tokenizer, image_processor, model_cfg, each_file_from_path=None) -> Dict: """Make dataset and collator for supervised fine-tuning.""" data_cfg = model_cfg.data mm_cfg = model_cfg.mm_cfg @@ -1321,10 +1322,10 @@ def make_supervised_data_module(tokenizer, image_processor, model_cfg) -> Dict: if getattr(model_cfg, 'no_seqlen_plus_one_input_tokens', False): add_extra_token = 0 crop_size = mm_cfg.vision_encoder.get("crop_size", (224, 224)) - + data_path = each_file_from_path if each_file_from_path is not None else data_cfg.data_path train_dataset = NevaDataset( tokenizer=tokenizer, - data_path=data_cfg.data_path, + data_path=data_path, multimodal_cfg=dict( is_multimodal=data_cfg.is_multimodal, sep_image_conv_front=data_cfg.sep_image_conv_front, diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 938b9fbddc39..6218332c2bde 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -21,7 +21,7 @@ import torch import torch.nn.functional as F from einops import rearrange, reduce, repeat -from omegaconf.dictconfig import DictConfig +from omegaconf import DictConfig, ListConfig from pkg_resources import packaging from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPVisionModel, SiglipVisionModel @@ -38,6 +38,10 @@ MegatronCLIPModel, ) from nemo.collections.multimodal.parts.utils import create_image_processor, load_nemo_model_weights +from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( + get_datasets_weights_and_num_samples, +) +from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import MegatronPretrainingSampler from 
nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel, get_specs @@ -70,7 +74,6 @@ from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace from megatron.core.dist_checkpointing.mapping import LocalNonpersitentObject, ShardedObject from megatron.core.models.gpt import GPTModel as MCoreGPTModel - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint @@ -80,6 +83,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + def skip_fp8_load(x): if isinstance(x, ShardedObject) and 'fused_attention' in x.key and '_extra_state' in x.key: @@ -481,7 +491,7 @@ def create_vision_encoder_and_processor(self, mm_cfg): from transformers import AutoConfig config = AutoConfig.from_pretrained(mm_cfg.vision_encoder.from_pretrained) - if config.architectures[0] == "CLIPVisionModel": + if config.architectures[0] == "CLIPVisionModel" or config.architectures[0] == "CLIPModel": vision_encoder = CLIPVisionModel.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16, @@ -491,7 +501,7 @@ def create_vision_encoder_and_processor(self, mm_cfg): for param in vision_encoder.parameters(): param.requires_grad = False vision_encoder = vision_encoder.eval() - elif config.architectures[0] == "SiglipVisionModel": + elif config.architectures[0] == "SiglipVisionModel" or config.architectures[0] == "SiglipModel": vision_encoder = SiglipVisionModel.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16, @@ -1076,9 +1086,10 @@ def fwd_output_only_func(dataloader_iter, model): inference_max_sequence_len, ) = batch tokens = tokens.cuda() - attention_mask = attention_mask.cuda() position_ids = position_ids.cuda() - attention_mask = attention_mask[0:1] + if attention_mask != None: + attention_mask = attention_mask.cuda() + attention_mask = attention_mask[0:1] if media is not None: media = media.cuda() labels = None @@ -1235,15 +1246,132 @@ def setup(self, stage=None): if self.cfg.get('transformer_engine', False): self.setup_transformer_engine_tp_groups() + def build_train_valid_test_datasets_blend(self): + logging.info('Building Blending Neva datasets.') + + train_datasets = [] + valid_datasets = [] + + data_cfg = self.cfg.data + is_packed_sequence = data_cfg.get("packed_sequence", False) + + if is_packed_sequence: + assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence" + + # Check if concat_sampling_probabilities is properly set + if data_cfg.get('concat_sampling_probabilities') is None or not isinstance( + data_cfg.concat_sampling_probabilities, ListConfig + ): + raise ValueError( + "concat_sampling_probabilities must be a ListConfig with the same number of entries as data_path." + ) + + if len(data_cfg.concat_sampling_probabilities) != len(data_cfg.data_path): + raise ValueError( + f"concat_sampling_probabilities must be of the same size as number of files from data path. 
" + f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.data_path)}" + ) + + for each_file_from_path in data_cfg.data_path: + if is_packed_sequence: + train_dataset = NevaPackedSeqDatatset( + each_file_from_path, self.cfg.mm_cfg.vision_encoder.get("crop_size") + ) + valid_dataset = NevaPackedSeqDatatset( + each_file_from_path, self.cfg.mm_cfg.vision_encoder.get("crop_size") + ) + else: + ds_dict = make_supervised_data_module( + tokenizer=self.tokenizer, + image_processor=( + self.model.module.image_processor + if hasattr(self.model, "module") + else self.model.image_processor + ), + model_cfg=self.cfg, + each_file_from_path=each_file_from_path, + ) + train_dataset = ds_dict["train_dataset"] + valid_dataset = ds_dict["eval_dataset"] + + train_datasets.append(train_dataset) + valid_datasets.append(valid_dataset) + + # Create BlendableDataset for training + if self.trainer.max_steps is None or self.trainer.max_steps <= 0: + raise ValueError(f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}') + + num_train_samples = self.trainer.max_steps * data_cfg.global_batch_size + _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples( + data_prefix=[ + weight for pair in zip(data_cfg.concat_sampling_probabilities, data_cfg.data_path) for weight in pair + ], + num_samples=[num_train_samples], + ) + num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) + + logging.info(f"Number of train datasets: {len(train_datasets)}") + logging.info(f"Lengths of train datasets: {[len(ds) for ds in train_datasets]}") + logging.info(f"Number of train datasets after blending: {num_train_samples_after_blend}") + + if is_packed_sequence: + num_train_samples_after_blend = sum([len(ds) for ds in train_datasets]) + + self._train_ds = BlendableDataset( + datasets=train_datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend + ) + + self._validation_ds = BlendableDataset( + datasets=valid_datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend + ) + + logging.info(f'Length of train dataset: {len(self._train_ds)}') + logging.info(f'Length of validation dataset: {len(self._validation_ds)}') + + return self._train_ds, self._validation_ds + def build_train_valid_test_datasets(self): logging.info('Building Neva datasets.') + + if isinstance(self.cfg.data.data_path, (list, ListConfig)): + if len(self.cfg.data.data_path) > 1: + # Only consider data blending if there are multiple dataset paths + if self.cfg.data.get('concat_sampling_probabilities') is None: + logging.warning("No sampling probabilities provided. Defaulting to uniform sampling.") + self.cfg.data.concat_sampling_probabilities = [1 / len(self.cfg.data.data_path)] * len( + self.cfg.data.data_path + ) + else: + # Normalize the sampling probabilities if they don't sum to 1 + total = sum(self.cfg.data.concat_sampling_probabilities) + if total != 1: + logging.warning(f"Concat_sampling_probabilities sum to {total}. Normalizing to sum to 1.") + self.cfg.data.concat_sampling_probabilities = [ + prob / total for prob in self.cfg.data.concat_sampling_probabilities + ] + return self.build_train_valid_test_datasets_blend() + elif len(self.cfg.data.data_path) == 1: + if self.cfg.data.concat_sampling_probabilities is not None: + logging.warning( + "Using sampling probabilities with a single dataset has no effect. Defaulting to None and not using blend dataset." 
+ ) + self.cfg.data.concat_sampling_probabilities = None + self.cfg.data.data_path = self.cfg.data.data_path[0] + else: + raise ValueError("data_path must contain at least one valid path.") + elif isinstance(self.cfg.data.data_path, str): + pass + else: + raise TypeError("data_path must be a list of paths or a single string") + if self.cfg.data.get("packed_sequence", False): assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence" + self._train_ds = NevaPackedSeqDatatset( - self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size") + self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size") ) self._validation_ds = NevaPackedSeqDatatset( - self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size") + self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size") ) else: ds_dict = make_supervised_data_module( diff --git a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py index fc661d91ab61..3b795aa7618c 100644 --- a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py +++ b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py @@ -46,16 +46,14 @@ from nemo.utils import logging try: - from apex import amp - from apex.transformer.enums import AttnMaskType + from megatron.core.num_microbatches_calculator import get_num_microbatches - HAVE_APEX = True except (ImportError, ModuleNotFoundError): - HAVE_APEX = False + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -550,11 +548,18 @@ def load_from_unet(self, from_pretrained_unet, from_NeMo=True): print("Loading unet blocks from sd") state_dict = torch.load(from_pretrained_unet, map_location='cpu') - state_dict = state_dict['state_dict'] + if 'state_dict' in state_dict.keys(): + state_dict = state_dict['state_dict'] model_state_dict = self.state_dict() + model_state_keys = model_state_dict.keys() re_state_dict = {} for key_, value_ in state_dict.items(): + # check if key is a raw parameter + if key_ in model_state_keys: + re_state_dict[key_] = value_ + continue + # prune from model prefix if key_.startswith('model.model.diffusion_model'): re_state_dict[key_.replace('model.model.diffusion_model.', '')] = value_ if key_.startswith('model.diffusion_model'): @@ -625,11 +630,6 @@ def forward(self, x, hint, timesteps, context, **kwargs): class MegatronControlNet(MegatronBaseModel): def __init__(self, cfg: DictConfig, trainer: Trainer): - if not HAVE_APEX: - raise ImportError( - "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." - ) - if not HAVE_MEGATRON_CORE: raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
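The load_from_unet change in controlnet.py above accepts both raw UNet checkpoints and Lightning-style checkpoints that nest their weights under a 'state_dict' key, and it now keeps keys that already match the model's own parameter names before falling back to prefix stripping. A minimal standalone sketch of that normalization logic follows; the helper name normalize_unet_state_dict is illustrative only and not part of the NeMo API.

from typing import Dict, Iterable

import torch


def normalize_unet_state_dict(state_dict: Dict[str, torch.Tensor], model_keys: Iterable[str]) -> Dict[str, torch.Tensor]:
    # Unwrap checkpoints that store their weights under a top-level 'state_dict' key.
    if 'state_dict' in state_dict:
        state_dict = state_dict['state_dict']
    model_keys = set(model_keys)
    remapped = {}
    for key, value in state_dict.items():
        if key in model_keys:
            # Already a raw parameter name; keep it unchanged.
            remapped[key] = value
        elif key.startswith('model.model.diffusion_model.'):
            remapped[key.replace('model.model.diffusion_model.', '')] = value
        elif key.startswith('model.diffusion_model.'):
            remapped[key.replace('model.diffusion_model.', '')] = value
    return remapped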
diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py index 9db63c2abfce..24712ed30021 100644 --- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py +++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py @@ -44,7 +44,6 @@ try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -53,6 +52,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + def disabled_train(self, mode=True): """Overwrite model.train with this function to make sure train/eval mode diff --git a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py index 9dd52543f7bc..b7cf6d629d65 100644 --- a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py +++ b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py @@ -41,7 +41,6 @@ try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -49,6 +48,13 @@ except (ImportError, ModuleNotFoundError): HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + try: from apex.contrib.group_norm import GroupNorm diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py index f099c9d41837..77a8caa58b40 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/diffusion_engine.py @@ -52,17 +52,8 @@ from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin from nemo.utils import logging, model_utils -try: - from apex import amp - from apex.transformer.enums import AttnMaskType - - HAVE_APEX = True -except (ImportError, ModuleNotFoundError): - HAVE_APEX = False - try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -71,6 +62,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + UNCONDITIONAL_CONFIG = { "target": "sgm.modules.GeneralConditioner", "params": {"emb_models": []}, @@ -359,10 +357,6 @@ class MegatronDiffusionEngine(NLPAdapterModelMixin, MegatronBaseModel): 
"""Megatron DiffusionEngine Model.""" def __init__(self, cfg: DictConfig, trainer: Trainer): - if not HAVE_APEX: - raise ImportError( - "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." - ) if not HAVE_MEGATRON_CORE: raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 17599d4b0932..89b1d88819b8 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -75,17 +75,8 @@ from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin from nemo.utils import logging, model_utils -try: - from apex import amp - from apex.transformer.enums import AttnMaskType - - HAVE_APEX = True -except (ImportError, ModuleNotFoundError): - HAVE_APEX = False - try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -94,6 +85,14 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + + __conditioning_keys__ = {'concat': 'c_concat', 'crossattn': 'c_crossattn', 'adm': 'y'} @@ -1674,10 +1673,6 @@ class MegatronLatentDiffusion(NLPAdapterModelMixin, MegatronBaseModel): """Megatron LatentDiffusion Model.""" def __init__(self, cfg: DictConfig, trainer: Trainer): - if not HAVE_APEX: - raise ImportError( - "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." - ) if not HAVE_MEGATRON_CORE: raise ImportError( "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index db770ba057ac..b2d6a953a9ab 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -66,7 +66,6 @@ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.models.vision.clip_vit_model import CLIPViTModel - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules from megatron.core.extensions.transformer_engine import ( @@ -96,6 +95,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + try: import transformer_engine from transformer_engine.pytorch import module as te_module diff --git a/nemo/collections/multimodal/models/vision_language_foundation/megatron_nsfw_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/megatron_nsfw_clip_models.py index 7b127335d336..79c0f3910be0 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/megatron_nsfw_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/megatron_nsfw_clip_models.py @@ -20,7 +20,6 @@ import torch.nn as nn import torch.nn.functional as F from megatron.core import parallel_state -from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from omegaconf.dictconfig import DictConfig from pytorch_lightning.accelerators import CPUAccelerator @@ -40,6 +39,14 @@ from nemo.utils import logging +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + + class ContentFilteringModel(MegatronModule): """Clip based content filtering model for NSFW.""" diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index eb449c5406b9..b94624b33ba2 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -971,6 +971,8 @@ def __init__( ) logging.info(f"Missing keys: {missing_key}") logging.info(f"Unexpected keys: {unexpected_keys}") + else: + logging.info(f"There are no missing keys, model loaded properly!") if unet_precision == "fp16-mixed": # AMP O2 self.convert_to_fp16() @@ -1217,6 +1219,7 @@ def _state_key_mapping(self, state_dict: dict): def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False, from_NeMo=False): state_dict = self._strip_unet_key_prefix(state_dict) if not from_NeMo: 
+ logging.info("creating state key mapping from HF") state_dict = self._state_key_mapping(state_dict) state_dict = self._legacy_unet_ckpt_mapping(state_dict) diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index d564993f7806..ea8053398a88 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -303,41 +303,41 @@ def setup_trainer_and_model_for_inference( # Create the NLPSaveRestoreConnector object for model saving and restoring. save_restore_connector = NLPSaveRestoreConnector() + if cfg.model.restore_from_path is not None: + if cfg.model.restore_from_path.endswith(".nemo") or os.path.isdir(cfg.model.restore_from_path): + # Set the model_extracted_dir attribute if the restore path is a directory. + if os.path.isdir(cfg.model.restore_from_path): + save_restore_connector.model_extracted_dir = cfg.model.restore_from_path - if cfg.model.restore_from_path.endswith(".nemo") or os.path.isdir(cfg.model.restore_from_path): - # Set the model_extracted_dir attribute if the restore path is a directory. - if os.path.isdir(cfg.model.restore_from_path): - save_restore_connector.model_extracted_dir = cfg.model.restore_from_path - - # Restore the model configuration from the specified path and modify it for inference. - model_cfg = model_provider.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - save_restore_connector=save_restore_connector, - return_config=True, - ) - with open_dict(model_cfg): - model_cfg_modifier(model_cfg) # modify the configuration for inference + # Restore the model configuration from the specified path and modify it for inference. + model_cfg = model_provider.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + save_restore_connector=save_restore_connector, + return_config=True, + ) + with open_dict(model_cfg): + model_cfg_modifier(model_cfg) # modify the configuration for inference - # Restore the model from the specified path and configuration, and set it up for inference. - model = model_provider.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=model_cfg, - save_restore_connector=save_restore_connector, - strict=True, - ) + # Restore the model from the specified path and configuration, and set it up for inference. + model = model_provider.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + override_config_path=model_cfg, + save_restore_connector=save_restore_connector, + strict=True, + ) - elif cfg.model.restore_from_path.endswith(".ckpt"): - logging.warning( - "Loading from .ckpt checkpoint for inference is experimental! It doesn't support models with model parallelism!" - ) + elif cfg.model.restore_from_path.endswith(".ckpt"): + logging.warning( + "Loading from .ckpt checkpoint for inference is experimental! It doesn't support models with model parallelism!" 
+ ) - model = model_provider.load_from_checkpoint( - cfg.model.restore_from_path, - hparams_file=cfg.model.get("hparams_file"), - trainer=trainer, - ) + model = model_provider.load_from_checkpoint( + cfg.model.restore_from_path, + hparams_file=cfg.model.get("hparams_file"), + trainer=trainer, + ) else: # load a model from scratch @@ -489,18 +489,9 @@ def video_processor(maybe_video_path): else: frames = maybe_video_path - if neva_cfg.mm_cfg.vision_encoder.get("from_hf", False): - if ( - "siglip" in neva_cfg.mm_cfg.vision_encoder.from_pretrained - or "siglip" in neva_cfg.mm_cfg.vision_encoder.get("model_type", "") - ): - processor = SiglipImageProcessor.from_pretrained(neva_cfg.mm_cfg.vision_encoder.from_pretrained) - else: - # for clip and vit model - processor = CLIPImageProcessor.from_pretrained(neva_cfg.mm_cfg.vision_encoder.from_pretrained) - else: - processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") - + processor = ( + model.model.module.image_processor if hasattr(model.model, "module") else model.model.image_processor + ) # support single video inference if neva_cfg.data.image_aspect_ratio == 'keep': max_hw, min_hw = max(frames.size), min(frames.size) @@ -541,11 +532,11 @@ def create_image_processor(mm_cfg): from transformers import AutoConfig config = AutoConfig.from_pretrained(mm_cfg.vision_encoder.from_pretrained) - if config.architectures[0] == "CLIPVisionModel": + if config.architectures[0] == "CLIPVisionModel" or config.architectures[0] == "CLIPModel": image_processor = CLIPImageProcessor.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 ) - elif config.architectures[0] == "SiglipVisionModel": + elif config.architectures[0] == "SiglipVisionModel" or config.architectures[0] == "SiglipModel": image_processor = SiglipImageProcessor.from_pretrained( mm_cfg.vision_encoder.from_pretrained, torch_dtype=torch.bfloat16 ) diff --git a/nemo/collections/multimodal/speech_llm/data/audio_text_dataset.py b/nemo/collections/multimodal/speech_llm/data/audio_text_dataset.py index a433a5a6badf..ee194b74f993 100644 --- a/nemo/collections/multimodal/speech_llm/data/audio_text_dataset.py +++ b/nemo/collections/multimodal/speech_llm/data/audio_text_dataset.py @@ -198,21 +198,26 @@ class AudioTextDataset(TextProcessing, Dataset): """ Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). Each new line is a different sample. Example below: - {"audio_filepath": "1.wav", "duration": 1.12, "question": "what is the capital of France?", "answer": "Paris"} - {"audio_filepath": "2.wav", "duration": 2.15, "question": "what is the capital of Italy?", "answer": "Rome"} + + .. code-block:: json + + {"audio_filepath": "1.wav", "duration": 1.12, "question": "what is the capital of France?", "answer": "Paris"} + {"audio_filepath": "2.wav", "duration": 2.15, "question": "what is the capital of Italy?", "answer": "Rome"} + Args: manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. tokenizer: text tokenizer object sample_rate (int): Sample rate to resample loaded audio to int_values (bool): If true, load samples as 32-bit integers. Defauts to False. 
- augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded - audio + augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded audio max_duration: If audio exceeds this length, do not include in dataset min_duration: If audio is less than this length, do not include in dataset max_utts: Limit number of utterances trim: whether or not to trim silence. Defaults to False channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - --------- NLP SPECIFIC ARGS ------------- + + :note: below args are NLP-specific + max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example @@ -228,9 +233,16 @@ class AudioTextDataset(TextProcessing, Dataset): answer_only_loss: If True, will compute the loss only on the answer part of the input. If False, will compute the loss on the entire input. truncation_field: Field to use for truncation. (Options: "answer", "context"). Field to be used for truncation if the combined length exceeds the max sequence length. pad_to_max_length: Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - prompt_template: Prompt template to inject via an fstring. Formatted like Q: {input}\n\nA: {output} + prompt_template: Prompt template to inject via an fstring. Formatted like: + + .. code-block:: text + + Q: {input}\\n\\nA: {output} + end_string: Optional[str] = None, if not None, add this string to the end of the answer. - --------------- additional args for misc purposes ---------------- + + :note: below args are for miscellaneous purposes + context_file: Optional[Union[List[str], str]] = None, if provided, will use this file to load random questions from, if question is not in manifest. sample_alpha: Optional[float] = None, for SPE subword sampling audio_locator: Optional[str] = None, a special string to split the context into multiple audio segments. @@ -583,26 +595,30 @@ class TarredAudioTextDataset(TextProcessing, IterableDataset): pad_id (id): Token used to pad when collating samples in batches. If this is None, pads using 0s. Defaults to None. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! 
As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. + shard_strategy (str): Tarred dataset shard distribution strategy chosen as a + str value during ddp. + + - `scatter`: The default shard strategy applied by WebDataset, where each node gets + a unique set of shards, which are permanently pre-allocated and never changed at runtime. + - `replicate`: Optional shard strategy, where each node gets all of the set of shards + available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. + The benefit of replication is that it allows each node to sample data points from the entire + dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. + + :warning: Replicated strategy allows every node to sample the entire set of available tarfiles, + and therefore more than one node may sample the same tarfile, and even sample the same + data points! As such, there is no assured guarantee that all samples in the dataset will be + sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific + occasions (when the number of shards is not divisible with ``world_size``), will not sample + the entire dataset. For these reasons it is not advisable to use tarred datasets as validation + or test datasets. + shard_manifests (bool): Whether or not to try / shard manifests. Defaults to False. global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. - --------- NLP SPECIFIC ARGS ------------- + + :note: Below args are NLP-specific + max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example @@ -617,11 +633,19 @@ class TarredAudioTextDataset(TextProcessing, IterableDataset): answer_only_loss: If True, will compute the loss only on the answer part of the input. If False, will compute the loss on the entire input. truncation_field: Field to use for truncation. (Options: "answer", "context"). Field to be used for truncation if the combined length exceeds the max sequence length. pad_to_max_length: Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - prompt_template: Prompt template to inject via an fstring. Formatted like Q: {input}\n\nA: {output} + prompt_template: Prompt template to inject via an fstring. Formatted like: + + .. code-block:: text + + Q: {input}\\n\\nA: {output} + end_string: Optional[str] = None, if not None, add this string to the end of the answer. - --------------- additional args for misc purposes ---------------- + + :note: Below args are for miscellaneous purposes + context_file: Optional[Union[List[str], str]] = None, if provided, will use this file to load random questions from, if question is not in manifest. 
sample_alpha: Optional[float] = None, for SPE subword sampling + """ def __init__( diff --git a/nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py b/nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py index d3e70343d507..204a92e5b7ab 100644 --- a/nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py +++ b/nemo/collections/multimodal/speech_llm/data/lhotse_dataset.py @@ -1,3 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + import torch.utils.data from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors as collate_vectors_lhotse @@ -63,7 +79,7 @@ def __init__( self.context_key = context_key self.default_context_key = default_context_key - def __getitem__(self, cuts) -> dict[str, torch.Tensor | list[str] | dict]: + def __getitem__(self, cuts) -> dict[str, Union[torch.Tensor, list[str], dict]]: cuts = cuts.sort_by_duration() audio, audio_lens, cuts = self.load_audio(cuts) diff --git a/nemo/collections/multimodal/speech_llm/models/modular_models.py b/nemo/collections/multimodal/speech_llm/models/modular_models.py index 49df57f202fa..edabbfd82f87 100644 --- a/nemo/collections/multimodal/speech_llm/models/modular_models.py +++ b/nemo/collections/multimodal/speech_llm/models/modular_models.py @@ -55,13 +55,11 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.mixins import adapter_mixins from nemo.utils import AppState, logging, model_utils -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator from nemo.utils.model_utils import inject_model_parallel_rank try: from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.models.gpt import GPTModel as MCoreGPTModel - from megatron.core.num_microbatches_calculator import get_num_microbatches HAVE_MEGATRON_CORE = True @@ -69,6 +67,16 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches, reconfigure_num_microbatches_calculator + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + __all__ = ["ModularAudioGPTModel", "CrossAttendModularAudioGPTModel"] @@ -1191,7 +1199,7 @@ def predict_step(self, batch: dict, batch_idx: int, dataloader_idx: Optional[int response = generate(self, **inference_config) app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -1360,7 +1368,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg): app_state = AppState() 
self._restore_activation_checkpointing_args() if hasattr(self, "_train_ds"): - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=self.cfg.data.train_ds.global_batch_size, @@ -1370,7 +1378,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg): # When running `trainer.validate()`, the training dataset is not available. else: logging.warning('No training data found, reconfiguring microbatches based on validation batch sizes.') - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=data_cfg.global_batch_size, diff --git a/nemo/collections/multimodal/speech_llm/models/modular_t5_models.py b/nemo/collections/multimodal/speech_llm/models/modular_t5_models.py index f5263496b75e..fce31d031abd 100644 --- a/nemo/collections/multimodal/speech_llm/models/modular_t5_models.py +++ b/nemo/collections/multimodal/speech_llm/models/modular_t5_models.py @@ -50,11 +50,9 @@ from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes.mixins import adapter_mixins from nemo.utils import AppState, logging, model_utils -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator, get_micro_batch_size try: from megatron.core import parallel_state, tensor_parallel - from megatron.core.num_microbatches_calculator import get_current_global_batch_size, get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -62,6 +60,25 @@ except (ImportError, ModuleNotFoundError): HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + reconfigure_num_microbatches_calculator, + ) + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + ) + __all__ = ["ModularizedAudioT5Model", "DecoderTextPromptModularizedAudioT5Model"] @@ -805,7 +822,7 @@ def _reconfigure_and_process_inference_batch(self, batch, data_cfg): != data_cfg.global_batch_size // parallel_state.get_data_parallel_world_size() ): app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -815,7 +832,7 @@ def _reconfigure_and_process_inference_batch(self, batch, data_cfg): # NOTE: need to explicitly handle resetting for multi-validation else: app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=data_cfg.global_batch_size, @@ -1104,7 +1121,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg): app_state = AppState() # TODO(zhehuai): add _restore_sequence_parallelism_args after sync to HEAD if hasattr(self, "_train_ds"): - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, 
global_batch_size=self.cfg.data.train_ds.global_batch_size, @@ -1114,7 +1131,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg): # When running `trainer.validate()`, the training dataset is not available. else: logging.warning('No training data found, reconfiguring microbatches based on validation batch sizes.') - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=data_cfg.global_batch_size, diff --git a/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_utils.py b/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_utils.py index bb183d45ea2d..4399c4174dd3 100644 --- a/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_utils.py +++ b/nemo/collections/multimodal/speech_llm/modules/common/audio_text_generation_utils.py @@ -27,8 +27,7 @@ model_inference_strategy_dispatcher, ) from nemo.collections.nlp.modules.common.transformer.text_generation import OutputType -from nemo.utils import AppState -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator +from nemo.utils import AppState, logging try: from megatron.core import parallel_state, tensor_parallel @@ -39,6 +38,15 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + __all__ = [ "get_computeprob_response", "generate", @@ -512,7 +520,7 @@ def sample_sequence_batch( ): app_state = AppState() micro_batch_size = context_tokens.shape[0] - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=micro_batch_size, diff --git a/nemo/collections/multimodal/speech_llm/modules/perception_modules.py b/nemo/collections/multimodal/speech_llm/modules/perception_modules.py index 021ac1ff3dad..20c478825946 100644 --- a/nemo/collections/multimodal/speech_llm/modules/perception_modules.py +++ b/nemo/collections/multimodal/speech_llm/modules/perception_modules.py @@ -200,54 +200,56 @@ class MultiAudioPerceptionModule(NeuralModule, Exportable): """ Audio perception module that consists of multiple audio encoders and shared modality adapter. This module is experimental. An example perception cfg is: - ------------------- - perception: - modality_adapter: - _target_: nemo.collections.multimodal.speechllm.modules.PoolingMLPConnectors - hidden_dim: 512 - pooling: 'cat' - pooling_factor: 2 - num_layers: 4 - input_dim: -1 - output_dim: -1 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoders: - asr_model: - _target_: nemo.collections.asr.models.ASRModel - output_key: d_model - freeze: True - pretrained_model: stt_en_fastconformer_transducer_large - ssl_model: - _target_: nemo.collections.asr.models.SpeechEncDecSelfSupervisedModel - output_key: d_model - freeze: True - pretrained_model: ssl_en_conformer_large - use_multi_layer_feat: True - multi_layer_feat: - layer_idx_list: [0,16] + + .. 
code-block:: yaml + + perception: + modality_adapter: + _target_: nemo.collections.multimodal.speechllm.modules.PoolingMLPConnectors + hidden_dim: 512 + pooling: 'cat' + pooling_factor: 2 + num_layers: 4 + input_dim: -1 + output_dim: -1 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoders: + asr_model: + _target_: nemo.collections.asr.models.ASRModel + output_key: d_model + freeze: True + pretrained_model: stt_en_fastconformer_transducer_large + ssl_model: + _target_: nemo.collections.asr.models.SpeechEncDecSelfSupervisedModel + output_key: d_model + freeze: True + pretrained_model: ssl_en_conformer_large + use_multi_layer_feat: True + multi_layer_feat: + layer_idx_list: [0,16] + aggregator: + mode: "cat" + pooling: "avg" + rounding: "floor" + + speaker_model: + segment_length_in_secs: 0.4 + freeze: True + pretrained_model: titanet_large + + ref_model: asr_model aggregator: mode: "cat" - pooling: "avg" + pooling: "mean" rounding: "floor" - speaker_model: - segment_length_in_secs: 0.4 - freeze: True - pretrained_model: titanet_large - - ref_model: asr_model - aggregator: - mode: "cat" - pooling: "mean" - rounding: "floor" - ------------------- """ def __init__(self, cfg: DictConfig): @@ -441,9 +443,10 @@ def lens_to_mask(lens, max_length): class TransformerCrossAttention(NeuralModule, Exportable): """Transformer module for cross-attention between speech and text embeddings. The module allows optional projection from the input embeddings to a lower dimension before feeding them to the transformer. + Args: cfg: DictConfig, configuration object for the module which should include: - xattn: DictConfig, configuration object for the transformer decoder + xattn: DictConfig, configuration object for the transformer decoder """ def __init__(self, cfg: DictConfig, *args, **kwargs): diff --git a/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py index ae2b5fff6be1..39b64ae89865 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py @@ -25,7 +25,6 @@ class BlendableDataset(torch.utils.data.Dataset): def __init__(self, datasets, weights, size): - self.datasets = datasets num_datasets = len(datasets) assert num_datasets == len(weights) @@ -43,6 +42,7 @@ def __init__(self, datasets, weights, size): assert num_datasets < 255 self.dataset_index = np.zeros(self.size, dtype=np.uint8) self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) + app_state = AppState() try: if app_state.local_rank == 0: @@ -74,6 +74,13 @@ def __len__(self): def __getitem__(self, idx): dataset_idx = self.dataset_index[idx] sample_idx = self.dataset_sample_index[idx] + dataset_size = len(self.datasets[dataset_idx]) + # Ensure the sample index doesn't exceed the dataset size + if sample_idx >= dataset_size: + logging.warning(f"Index {sample_idx} out of bounds for dataset {dataset_idx}. 
Reusing existing examples.") + sample_idx = sample_idx % dataset_size + logging.warning(f"Reusing index {sample_idx} for dataset {dataset_idx}.") + return self.datasets[dataset_idx][sample_idx] def create_data_mmap(self): @@ -85,7 +92,7 @@ class MemoryEfficientBlendableDataset(torch.utils.data.Dataset): """ A BlendableDataset implementation that uses less memory than the original implementation. Indices are computed algorithmically instead of storing them in memory. - + To test call: MemoryEfficientBlendableDataset.test_index_blending() """ diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index caf8dbec6c7a..2e21c57dddd3 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -57,6 +57,7 @@ def __init__( tokens_to_generate: int = 0, memmap_workers: Optional[int] = None, hf_dataset: bool = False, + global_sample_mapping: bool = False, truncation_method: str = 'right', special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} is_test: bool = False, @@ -83,6 +84,7 @@ def __init__( index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset. prompt_template: Prompt template to inject via an fstring. Formatted like Q: {context_key}\n\nA: {label_key} hf_dataset: Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. + global_sample_mapping: Whether to shuffle all data together, or shuffle the dataset within each epoch truncation_method: Truncation from which position. Options: ['left', 'right'] special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} is_test: Whether this dataset is the test split. 
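The new bounds check in `BlendableDataset.__getitem__` above wraps an out-of-range sample index back into the dataset with a modulo instead of raising. A minimal, self-contained sketch of that wrap-around behaviour (the toy datasets and helper name here are illustrative only, not part of the patch):

.. code-block:: python

    import logging

    # Toy stand-ins for the blended datasets; sizes are arbitrary.
    datasets = [list(range(10)), list(range(3))]

    def get_blended_item(dataset_idx: int, sample_idx: int):
        """Mirror the guard added to BlendableDataset.__getitem__: wrap indices
        that exceed the chosen dataset's length instead of raising IndexError."""
        dataset_size = len(datasets[dataset_idx])
        if sample_idx >= dataset_size:
            logging.warning(f"Index {sample_idx} out of bounds for dataset {dataset_idx}. Reusing existing examples.")
            sample_idx = sample_idx % dataset_size
        return datasets[dataset_idx][sample_idx]

    print(get_blended_item(1, 7))  # 7 % 3 == 1, so this returns datasets[1][1]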
@@ -109,6 +111,7 @@ def __init__( self.tokens_to_generate = tokens_to_generate self.memmap_workers = memmap_workers self.hf_dataset = hf_dataset + self.global_sample_mapping = global_sample_mapping self.truncation_method = truncation_method self.is_test = is_test self.output_original_text = output_original_text @@ -176,7 +179,11 @@ def _maybe_validate_prompt_template(self): def _build_samples_mapping(self): if self.max_num_samples is not None: - osm = OnlineSampleMapping(dataset_size=len(self.indexed_dataset), num_samples=self.max_num_samples) + osm = ( + OnlineSampleMapping(dataset_size=len(self.indexed_dataset), num_samples=self.max_num_samples) + if not self.global_sample_mapping + else None + ) self.samples_mapping = get_samples_mapping( indexed_dataset=self.indexed_dataset, data_prefix=self.file_path, diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py index f3b77493e0df..48f3e5127a88 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py @@ -32,9 +32,17 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator from nemo.utils.decorators import deprecated_warning +try: + from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + __all__ = ['DialogueS2SGenerationModel'] @@ -230,7 +238,7 @@ def generate_candidates(self, input_ids, attn_masks, labels): generated_tokens = self.language_model.generate(**param_dict) elif self.cfg.library == 'megatron': - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=0, # This doesn't matter since it is only used for logging rampup_batch_size=None, global_batch_size=1, diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py index d4df93377db6..5e38b61938c9 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py @@ -17,16 +17,7 @@ import numpy as np -try: - from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_num_microbatches - from megatron.core.pipeline_parallel.schedules import get_forward_backward_func - from megatron.core.transformer.module import Float16Module as MCoreFloat16Module - HAVE_MEGATRON_CORE = True -except (ImportError, ModuleNotFoundError): - TransformerConfig = ApexGuardDefaults - HAVE_MEGATRON_CORE = False import torch from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec from omegaconf import DictConfig, OmegaConf, open_dict @@ -55,17 +46,27 @@ from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import logging + try: from megatron.core import parallel_state + from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from 
megatron.core.transformer.module import Float16Module as MCoreFloat16Module HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): - + TransformerConfig = ApexGuardDefaults ModelParallelConfig = ApexGuardDefaults HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + def listify(tensor): l_tensor = [] diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gemma2/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py new file mode 100644 index 000000000000..fd2472c5fe49 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_modules.py @@ -0,0 +1,280 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import math
+from typing import Callable, Optional
+
+import torch
+from megatron.core import parallel_state, tensor_parallel
+from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.tensor_parallel import ColumnParallelLinear
+from megatron.core.transformer import MegatronModule, TransformerConfig
+from megatron.core.transformer.custom_layers.transformer_engine import TENorm, TERowParallelLinear
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.utils import attention_mask_func
+from megatron.core.utils import divide
+from torch import Tensor
+
+
+def get_swa(seq_q, seq_kv, w):
+    """Create the equivalent attention mask for SWA in [seq_q, seq_kv] shape"""
+    m = torch.ones(seq_q, seq_kv, dtype=torch.bool, device="cuda")
+    mu = torch.triu(m, diagonal=seq_kv - seq_q - w[0])
+    ml = torch.tril(mu, diagonal=seq_kv - seq_q + w[1])
+    ml = ~ml
+    return ml
+
+
+def logit_softcapping(logits: torch.Tensor, scale: Optional[float]):
+    """Prevents logits from growing excessively by scaling them to a fixed range"""
+    if not scale:
+        return logits
+    return scale * torch.tanh(logits / scale)
+
+
+class Gemma2DotProductAttention(MegatronModule):
+    """
+    Region where selective activation recomputation is applied.
+    This region is memory intensive but less compute intensive which
+    makes activation checkpointing more efficient for LLMs (20B+).
+    See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+
+    We use the following notation:
+     h: hidden size
+     n: number of attention heads
+     p: number of tensor model parallel partitions
+     b: batch size
+     s: sequence length
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        attention_dropout: float = None,
+    ):
+        super().__init__(config=config)
+
+        self.config: TransformerConfig = config
+
+        assert (
+            self.config.context_parallel_size == 1
+        ), "Context parallelism is only supported by TEDotProductAttention!"
+
+        self.layer_number = max(1, layer_number)
+
+        self.window_size = None
+        if self.layer_number % 2 == 0:
+            self.window_size = config.window_size
+
+        self.attn_mask_type = attn_mask_type
+        self.attention_type = attention_type  # unused for now
+
+        projection_size = self.config.kv_channels * self.config.num_attention_heads
+
+        # Per attention head and per partition values.
+        world_size = parallel_state.get_tensor_model_parallel_world_size()
+        self.hidden_size_per_partition = divide(projection_size, world_size)
+        self.hidden_size_per_attention_head = divide(projection_size, config.num_attention_heads)
+        self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
+        self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size)
+
+        coeff = None
+        self.norm_factor = math.sqrt(config.query_pre_attn_scalar)
+
+        if self.config.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+
+        self.scale_mask_softmax = FusedScaleMaskSoftmax(
+            input_in_fp16=self.config.fp16,
+            input_in_bf16=self.config.bf16,
+            attn_mask_type=self.attn_mask_type,
+            scaled_masked_softmax_fusion=self.config.masked_softmax_fusion,
+            mask_func=attention_mask_func,
+            softmax_in_fp32=self.config.attention_softmax_in_fp32,
+            scale=coeff,
+        )
+
+        # Dropout. Note that for a single iteration, this layer will generate
+        # different outputs on different number of parallel partitions but
+        # on average it should not be partition dependent.
+        self.attention_dropout = torch.nn.Dropout(
+            self.config.attention_dropout if attention_dropout is None else attention_dropout
+        )
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        attention_mask: Tensor,
+        attn_mask_type: AttnMaskType = None,
+        packed_seq_params: PackedSeqParams = None,
+    ):
+        assert packed_seq_params is None, (
+            "Packed sequence is not supported by DotProductAttention. " "Please use TEDotProductAttention instead."
+        )
+
+        # ===================================
+        # Raw attention scores. [b, n/p, s, s]
+        # ===================================
+
+        # expand the key and value [sk, b, ng, hn] -> [sk, b, np, hn]
+        # This is a noop for normal attention where ng == np. When using group query attention this
+        # creates a view that has the keys and values virtually repeated along their dimension to
+        # match the number of queries.
+
+        # attn_mask_type is not used.
+        if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1:
+            key = key.repeat_interleave(
+                self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
+            )
+            value = value.repeat_interleave(
+                self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
+            )
+
+        # [b, np, sq, sk]
+        output_size = (
+            query.size(1),
+            query.size(2),
+            query.size(0),
+            key.size(0),
+        )
+
+        # [sq, b, np, hn] -> [sq, b * np, hn]
+        # This will be a simple view when doing normal attention, but in group query attention
+        # the key and value tensors are repeated to match the queries so you can't use simple strides
+        # to extract the queries.
+        query = query.reshape(output_size[2], output_size[0] * output_size[1], -1)
+        # [sk, b, np, hn] -> [sk, b * np, hn]
+        key = key.view(output_size[3], output_size[0] * output_size[1], -1)
+
+        # preallocating input tensor: [b * np, sq, sk]
+        matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor(
+            (output_size[0] * output_size[1], output_size[2], output_size[3]),
+            query.dtype,
+            "mpu",
+        )
+
+        # Raw attention scores. [b * np, sq, sk]
+        matmul_result = torch.baddbmm(
+            matmul_input_buffer,
+            query.transpose(0, 1),  # [b * np, sq, hn]
+            key.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+            beta=0.0,
+            alpha=(1.0 / self.norm_factor),
+        )
+        # Gemma 2 specific:
+        matmul_result = logit_softcapping(matmul_result, self.config.attn_logit_softcapping)
+
+        # change view to [b, np, sq, sk]
+        attention_scores = matmul_result.view(*output_size)
+
+        # ===========================
+        # Attention probs and dropout
+        # ===========================
+
+        # sliding window attention
+        if attention_mask is not None and self.window_size is not None:
+            attention_mask = get_swa(query.size(0), key.size(0), self.window_size)
+
+        # attention scores and attention mask [b, np, sq, sk]
+        attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+
+        if not self.config.sequence_parallel:
+            with tensor_parallel.get_cuda_rng_tracker().fork():
+                attention_probs = self.attention_dropout(attention_probs)
+        else:
+            attention_probs = self.attention_dropout(attention_probs)
+
+        # =========================
+        # Context layer.
[sq, b, hp] + # ========================= + + # value -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = ( + value.size(1), + value.size(2), + query.size(0), + value.size(3), + ) + + # change view [sk, b * np, hn] + value = value.view(value.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context = torch.bmm(attention_probs, value.transpose(0, 1)) + + # change view [b, np, sq, hn] + context = context.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context = context.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_shape = context.size()[:-2] + (self.hidden_size_per_partition,) + context = context.view(*new_context_shape) + return context + + +class TERowParallelLinearLayerNorm(TERowParallelLinear): + def __init__( + self, + input_size: int, + output_size: int, + *, + config: TransformerConfig, + init_method: Callable, + bias: bool, + input_is_parallel: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: str = None, + ): + super().__init__( + input_size, + output_size, + config=config, + init_method=init_method, + bias=bias, + input_is_parallel=input_is_parallel, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + self.post_layernorm = TENorm(config, output_size) + + def forward(self, x): + output, bias = super().forward(x) + return self.post_layernorm(output), bias + + +class Gemma2OutputLayer(ColumnParallelLinear): + def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): + output, bias = super().forward(input_, weight) + output = logit_softcapping(output, self.config.final_logit_softcapping) + return output, bias diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_spec.py new file mode 100644 index 000000000000..32b2535c1010 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/gemma2/gemma2_spec.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
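To make the new `gemma2_modules.py` above easier to follow, here is a small, runnable sketch of the two Gemma-2-specific mechanisms it wires into attention: tanh logit softcapping and the banded sliding-window mask produced by `get_swa`. This is an illustration only; the cap value and window below are made-up numbers, and the mask convention (`True` marks positions to mask out) follows the Megatron-style usage assumed here.

.. code-block:: python

    import torch

    def softcap(logits: torch.Tensor, cap: float) -> torch.Tensor:
        # Same idea as logit_softcapping above: squash values into (-cap, cap).
        return cap * torch.tanh(logits / cap)

    def sliding_window_mask(seq_q: int, seq_kv: int, window) -> torch.Tensor:
        # CPU re-implementation of get_swa: True marks positions outside the
        # band of window[0] past and window[1] future keys around each query.
        m = torch.ones(seq_q, seq_kv, dtype=torch.bool)
        upper = torch.triu(m, diagonal=seq_kv - seq_q - window[0])
        band = torch.tril(upper, diagonal=seq_kv - seq_q + window[1])
        return ~band

    logits = 100.0 * torch.randn(2, 4)
    print(softcap(logits, cap=50.0).abs().max().item() < 50.0)  # True: magnitudes stay below the cap
    print(sliding_window_mask(4, 4, (2, 0)).int())  # each query attends to itself and 2 previous keys

In the layer spec that follows, every second layer gets this banded mask via `window_size`, while softcapping is applied both to the attention scores and, through `Gemma2OutputLayer`, to the final logits.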
+ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.transformer import ModuleSpec, TransformerLayer, TransformerLayerSubmodules +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import TELayerNormColumnParallelLinear +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules + +from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_modules import ( + Gemma2DotProductAttention, + TERowParallelLinearLayerNorm, +) + + +def get_gemma2_layer_spec(): + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=Gemma2DotProductAttention, # use unfused SDPA for attn logit softcapping + linear_proj=TERowParallelLinearLayerNorm, # post attn RMSNorm + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinearLayerNorm, # post mlp RMSNorm + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index 3423f2603aeb..7459b9d1f95f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -15,8 +15,8 @@ from importlib.metadata import version from typing import Any, Callable, Optional +import packaging import torch -from pkg_resources import packaging from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults from nemo.collections.nlp.parts import utils_funcs diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py index aa7ae44a6484..6cf068b85ebc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults - try: from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index d0d239b21637..788e9bd059f6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -50,7 +50,6 @@ try: from megatron.core import ModelParallelConfig, parallel_state from megatron.core.distributed import DistributedDataParallel as McoreDDP - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal @@ -63,6 +62,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_current_global_batch_size, get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_current_global_batch_size, get_num_microbatches + try: from megatron.core import Timers @@ -402,7 +408,9 @@ def is_official_release_version(nvidia_torch_version): self.cfg.persist_layer_norm = False # NVFUSER available starting with 21.11 - if NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11): + if (NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11)) and ( + NVIDIA_TORCH_MAJOR < 23 or (NVIDIA_TORCH_MAJOR == 23 and NVIDIA_TORCH_MINOR < 11) + ): # NVFUSER torch._C._jit_set_profiling_executor(True) @@ -911,9 +919,7 @@ def compute_consumed_samples(self, steps_since_resume=0): app_state = AppState() if self.cfg.get('rampup_batch_size', None): - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR - - current_global_batch_size = getattr(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, 'current_global_batch_size', 1) + current_global_batch_size = get_current_global_batch_size() if get_current_global_batch_size() else 1 consumed_samples = self.prev_consumed_samples + self.if_first_step * current_global_batch_size else: consumed_samples = ( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index 0e03e8994dc2..2a356012c728 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -37,7 +37,6 @@ from nemo.collections.nlp.modules.common.transformer.text_generation import TextGeneration from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.utils import AppState, logging -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator from nemo.utils.decorators import deprecated_warning try: @@ -52,6 +51,16 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron 
num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + + __all__ = ['MegatronBasePromptLearningModel'] @@ -380,7 +389,7 @@ def _reconfigure_and_process_inference_batch(self, global_batch_size_per_gpu, gb if global_batch_size_per_gpu != gbs // parallel_state.get_data_parallel_world_size(): # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -390,7 +399,7 @@ def _reconfigure_and_process_inference_batch(self, global_batch_size_per_gpu, gb def _reconfigure_batch_sizes(self, gbs: int, mbs: int): app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=gbs, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 701d24d5b942..0eb5ea1c0048 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -58,7 +58,6 @@ try: from megatron.core import parallel_state from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig @@ -69,6 +68,13 @@ TransformerConfig = ApexGuardDefaults HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + class MegatronBertModel(MegatronBaseModel): """ @@ -1163,7 +1169,9 @@ def build_transformer_config(self) -> TransformerConfig: normalization = self.cfg.get('normalization', 'layernorm') - layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' + layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' or self.cfg.get( + "layernorm_zero_centered_gamma", False + ) if normalization == 'layernorm': normalization = 'LayerNorm' elif normalization == 'rmsnorm': diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 13e850d22dca..25e740b4027d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -22,15 +22,15 @@ from importlib.metadata import version from typing import Any, Dict, Iterator, List, Optional, Union +import packaging import torch from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig -from pkg_resources import packaging from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.loops.fetchers import 
_DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer -from nemo.collections.common.parts.utils import extend_instance +from nemo.collections.common.parts.utils import apply_rope_scaling, extend_instance from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( MegatronCorePretrainingSampler, MegatronPretrainingRandomSampler, @@ -77,6 +77,7 @@ from nemo.utils.te_utils import is_float8tensor try: + import megatron.core as core from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset @@ -93,7 +94,6 @@ get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, ) - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig @@ -112,6 +112,21 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import ( + get_current_global_batch_size, + get_num_microbatches, + update_num_microbatches, + ) + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_num_microbatches, + update_num_microbatches, + ) + try: import transformer_engine from transformer_engine.pytorch import module as te_module @@ -139,6 +154,8 @@ def mcore_supports_moe() -> bool: ## TODO: This function will not work if TE is not installed def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = None): + from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_spec import get_gemma2_layer_spec + # else cases for backwards compatibility with neva num_experts = transformer_config.num_moe_experts if transformer_config else None moe_grouped_gemm = transformer_config.moe_grouped_gemm if transformer_config else False @@ -152,6 +169,7 @@ def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = "": get_gpt_layer_local_spec(num_experts, moe_grouped_gemm), "te_gpt": get_gpt_layer_with_transformer_engine_spec(num_experts, moe_grouped_gemm), "megatron_falcon_gpt": get_falcon_layer_spec(), + "megatron_gemma2": get_gemma2_layer_spec(), "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(transformer_config), "modelopt": get_gpt_layer_modelopt_spec(num_experts), "te_gpt_hyena": get_gpt_layer_with_te_and_hyena_spec(hyena_cfg), @@ -161,6 +179,17 @@ def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = return name_spec_dict[spec_name] +def mcore_model_customize(cfg, model): + if cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage(): + extend_instance(model.embedding, EmbeddingScalingMixin) + if cfg.get('scale_positional_embedding', False): + model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq) + if cfg.get("mcore_customization_config", {}).get("final_logit_softcapping", 0): + from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_modules import Gemma2OutputLayer + + extend_instance(model.output_layer, Gemma2OutputLayer) + + class 
EmbeddingScalingMixin(torch.nn.Module): """ A mixin class for scaling embeddings in Megatron GPT. @@ -415,6 +444,7 @@ def get_inference_config(self): def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" if self.mcore_gpt: + model = MCoreGPTModel( config=self.transformer_config, transformer_layer_spec=get_specs( @@ -434,8 +464,7 @@ def model_provider_func(self, pre_process, post_process): seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), rotary_base=self.cfg.get('rotary_base', 10000), ) - if self.cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage(): - extend_instance(model.embedding, EmbeddingScalingMixin) + mcore_model_customize(self.cfg, model) else: assert self.cfg.get('num_query_groups', None) is None or self.cfg.get( 'num_query_groups', None @@ -531,6 +560,7 @@ def setup_mcore_distributed_parallel(self): # mcore bucket_size is based on num of parameters, therefore not # using bucket_cap_mb to configure bucket_size here bucket_size=self.cfg.optim.get('ddp_bucket_size', None), + average_in_collective=self.cfg.optim.get('average_in_collective', True), ) self.model = [ McoreDDP( @@ -616,11 +646,7 @@ def make_parameter_bucket(module: torch.nn.Module) -> List[torch.nn.Parameter]: if self.cfg.get('virtual_pipeline_model_parallel_size', None) is not None: # Initialize a bucket for each virtual pipeline stage for module in self.model: - if isinstance(module, (Float16Module, MCoreFloat16Module)): - module = module.module - stage_bucket = [] - layers = module.decoder.layers if self.mcore_gpt else module.language_model.encoder.layers - buckets.extend(make_parameter_bucket(layer) for layer in layers) + buckets.append(make_parameter_bucket(module)) else: # Initialize a bucket for each Transformer layer modules = self.model if isinstance(self.model, list) else [self.model] @@ -780,10 +806,7 @@ def training_step(self, dataloader_iter): self.if_init_step = False if self.rampup_batch_size: - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR - - num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR - current_global_batch_size = num_microbatch_calculator.current_global_batch_size + current_global_batch_size = get_current_global_batch_size() # do validation and save the checkpoint when gbs is changed if self.prev_global_batch_size != current_global_batch_size and self.prev_global_batch_size: self.trainer.should_stop = True @@ -1678,10 +1701,7 @@ def setup(self, stage=None): self.init_global_step = self.trainer.global_step if self.rampup_batch_size: - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR - - num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR - num_microbatch_calculator.update(self.init_consumed_samples, consistency_check=False) + update_num_microbatches(self.init_consumed_samples, consistency_check=False) self.prev_consumed_samples = self.init_consumed_samples if stage == 'predict': @@ -2077,7 +2097,9 @@ def build_transformer_config(self) -> TransformerConfig: ) normalization = self.cfg.get('normalization', 'layernorm').lower() - layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' + layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' or self.cfg.get( + "layernorm_zero_centered_gamma", False + ) if normalization == 'layernorm': normalization = 'LayerNorm' elif normalization == 'rmsnorm': diff --git 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 9590c535a86d..78f671142c1b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -44,23 +44,27 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPSaveRestoreConnector from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging -from nemo.utils.apex_utils import get_micro_batch_size from nemo.utils.decorators import deprecated_warning try: from megatron.core import InferenceParams, ModelParallelConfig, parallel_state, tensor_parallel from megatron.core.enums import ModelType - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True - except (ImportError, ModuleNotFoundError): ModelParallelConfig = ApexGuardDefaults HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_micro_batch_size, get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches + __all__ = ['MegatronGPTPromptLearningModel'] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 9ab17189ca64..9c2372ef38ca 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -37,11 +37,10 @@ from nemo.collections.nlp.parts.mixins.nlp_adapter_mixins import NLPAdapterModelMixin from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator, get_micro_batch_size try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_current_global_batch_size, get_num_microbatches + from megatron.core.distributed import finalize_model_grads from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -50,6 +49,25 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + reconfigure_num_microbatches_calculator, + ) + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + ) + __all__ = ['MegatronGPTSFTModel'] @@ -284,6 +302,7 @@ def _build_dataset(self, data_cfg, is_train=True): prompt_template=data_cfg.get('prompt_template', None), ceil_to_power_2=data_cfg.get('ceil_to_power_2', False), get_attention_mask_from_fusion=data_cfg.get('get_attention_mask_from_fusion', False), + global_sample_mapping=data_cfg.get('global_sample_mapping', False), 
virtual_tokens=self.virtual_tokens, tokens_to_generate=data_cfg.get( 'tokens_to_generate', 0 @@ -360,11 +379,27 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): ) grad_sync_func = self.reduce_overlap_gradients param_sync_func = self.sync_overlap_parameters + elif not forward_only and self.use_mcore_dist_optim: + if self.cfg.optim.get("overlap_grad_sync", False): + no_sync_func = [model_chunk.no_sync for model_chunk in self.model] + no_sync_func = no_sync_func[0] if len(self.model) == 1 else no_sync_func + + if self.cfg.optim.get("delay_grad_reduce", True): + grad_sync_func = [model_chunk.start_grad_sync for model_chunk in self.model] + grad_sync_func = grad_sync_func[0] if len(self.model) == 1 else grad_sync_func + if self.cfg.optim.get("overlap_param_sync", False) and self.cfg.optim.get("delay_param_gather", False): + param_sync_func = [ + lambda x, model_index=model_index: self._optimizer.finish_param_sync(model_index, x) + for model_index in range(len(self.model)) + ] + param_sync_func = param_sync_func[0] if len(self.model) == 1 else param_sync_func for module in self.get_model_module_list(): module.config.no_sync_func = no_sync_func module.config.grad_sync_func = grad_sync_func module.config.param_sync_func = param_sync_func + if self.use_mcore_dist_optim: + module.config.finalize_model_grads_func = finalize_model_grads fwd_bwd_function = get_forward_backward_func() @@ -630,7 +665,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg): app_state = AppState() self._restore_activation_checkpointing_args() if hasattr(self, "_train_ds"): - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=self.cfg.data.train_ds.global_batch_size, @@ -640,7 +675,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg): # When running `trainer.validate()`, the training dataset is not available. 
else: logging.warning('No training data found, reconfiguring microbatches based on validation batch sizes.') - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=data_cfg.global_batch_size, @@ -676,7 +711,7 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] response = generate(self, **inference_config) app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -770,7 +805,7 @@ def _reconfigure_and_process_inference_batch(self, batch, data_cfg): != data_cfg.global_batch_size // parallel_state.get_data_parallel_world_size() ): app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -780,7 +815,7 @@ def _reconfigure_and_process_inference_batch(self, batch, data_cfg): # NOTE: need to explicitly handle resetting for multi-validation else: app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=data_cfg.global_batch_size, @@ -864,7 +899,7 @@ def setup_eval_dataloader(self, datasets, data_cfg): def on_validation_epoch_start(self): self._reset_activation_checkpointing_args() app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=self.cfg.data.validation_ds.global_batch_size, @@ -876,7 +911,7 @@ def on_validation_epoch_start(self): def on_test_epoch_start(self): self._reset_activation_checkpointing_args() app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=self.cfg.data.test_ds.global_batch_size, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 2488751f808e..7b92b9e25d69 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -46,7 +46,6 @@ ) from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator, get_micro_batch_size try: from megatron.core import parallel_state, tensor_parallel @@ -58,7 +57,6 @@ get_t5_encoder_with_local_block_spec, get_t5_encoder_with_transformer_engine_block_spec, ) - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig @@ -69,6 +67,20 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import ( + get_micro_batch_size, + get_num_microbatches, + reconfigure_num_microbatches_calculator, + ) + +except (ImportError, ModuleNotFoundError): + 
logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches + __all__ = ["MegatronLMEncoderDecoderModel"] @@ -1222,7 +1234,7 @@ def dummy(): # Reconfigure microbatch sizes here because on model restore, this will contain the micro/global batch configuration used while training. if reconfigure_microbatch: - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=0, # This doesn't matter since it is only used for logging rampup_batch_size=None, global_batch_size=1, @@ -1243,7 +1255,7 @@ def dummy(): # Reconfigure microbatch calculator here to set num microbatches to 1 while decoding since its not clear how to decode with "grad acc". # reconfigure back to how things were before encode if reconfigure_microbatch: - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -1313,7 +1325,7 @@ def dummy(): # Reset microbatch calculator to what it was before decoding. if reconfigure_microbatch: - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -1389,7 +1401,7 @@ def dummy(): self.trainer.strategy.setup_environment() # Reconfigure microbatch sizes here because on model restore, this will contain the micro/global batch configuration used while training. - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=0, # This doesn't matter since it is only used for logging rampup_batch_size=None, global_batch_size=1, @@ -1417,7 +1429,7 @@ def dummy(): # Reconfigure microbatch calculator here to set num microbatches to 1 while decoding since its not clear how to decode with "grad acc". # reconfigure back to how things were before decode # TODO: Check if the user is trying to do gradient acc and maybe throw error - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -1513,7 +1525,7 @@ def dummy(): # reconfigure batch size since the tensor have been augmented with beam size global_batch_per_gpu = token_ids.shape[0] tensor_shape[1] = global_batch_per_gpu - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -1604,7 +1616,7 @@ def dummy(): ) # Reset microbatch calculator to what it was before decoding. - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -1659,7 +1671,7 @@ def complete(self, request: Dict): app_state = AppState() # The complete method only works with global batch = micro batch size = data parallel size = 1. 
- _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=1, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py index 3eb78d34b3f4..9061f430e722 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -74,7 +74,6 @@ from megatron.core.models.retro.decoder_spec import get_retro_decoder_block_spec from megatron.core.models.retro.utils import get_config_path as get_retro_config_path from megatron.core.models.retro.utils import get_gpt_data_dir as get_retro_data_dir - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig @@ -92,6 +91,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + try: import transformer_engine from transformer_engine.pytorch import module as te_module @@ -425,7 +431,7 @@ def build_retro_config(self) -> RetroConfig: # Validate Transformer Engine version. from importlib.metadata import version - from pkg_resources import packaging + import packaging te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.3"): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t0_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t0_model.py index 82bd84c8ada8..cee1b11a160b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t0_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t0_model.py @@ -25,7 +25,6 @@ from nemo.collections.nlp.data.language_modeling.t0_dataset import T0Dataset from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel from nemo.utils import AppState, logging -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator try: from megatron.core import parallel_state @@ -36,6 +35,15 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + __all__ = ['MegatronT0Model'] @@ -153,7 +161,7 @@ def _reconfigure_and_process_inference_batch(self, batch): # This should happen only on the last batch of the validation/test dataset with drop_last=False. 
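Several files in this diff also drop the deprecated `from pkg_resources import packaging` in favour of the standalone `packaging` distribution. A minimal sketch of the Transformer Engine version gate used in `build_retro_config` above, assuming `transformer-engine` is installed (otherwise `importlib.metadata.version` raises `PackageNotFoundError`); the helper name is illustrative.

```python
from importlib.metadata import version

from packaging.version import Version


def te_at_least(minimum: str) -> bool:
    """Return True if the installed transformer-engine meets the minimum version."""
    return Version(version("transformer-engine")) >= Version(minimum)


# The RETRO config path above keys optional behaviour off TE >= 1.3.
use_new_retro_path = te_at_least("1.3")
```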
if global_batch_per_gpu != self.cfg.data.validation_ds.global_batch_size: app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_per_gpu * parallel_state.get_data_parallel_world_size(), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py index 0773e4abe811..1f54cb87428e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py @@ -34,12 +34,10 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator, get_micro_batch_size try: from megatron.core import parallel_state from megatron.core.enums import ModelType - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -49,6 +47,14 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_micro_batch_size, get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches + + __all__ = ['MegatronT5PromptLearningModel'] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py index e71ed4964c29..c70f44925d33 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py @@ -30,11 +30,9 @@ from nemo.collections.nlp.parts.mixins.nlp_adapter_mixins import NLPAdapterModelMixin from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator, get_micro_batch_size try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_current_global_batch_size, get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -43,6 +41,25 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + reconfigure_num_microbatches_calculator, + ) + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + ) + __all__ = ['MegatronT5SFTModel'] @@ -162,7 +179,7 @@ def setup(self, stage=None): def on_validation_epoch_start(self): app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, 
rampup_batch_size=None, global_batch_size=self.cfg.data.validation_ds.global_batch_size, @@ -173,7 +190,7 @@ def on_validation_epoch_start(self): def on_test_epoch_start(self): app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=self.cfg.data.test_ds.global_batch_size, @@ -256,7 +273,7 @@ def _reconfigure_and_process_inference_batch(self, batch, ds_config): != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size() ): app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), @@ -266,7 +283,7 @@ def _reconfigure_and_process_inference_batch(self, batch, ds_config): # NOTE: need to explicitly handle resetting for multi-validation else: app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=ds_config.global_batch_size, @@ -548,7 +565,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg): app_state = AppState() if hasattr(self, "_train_ds"): - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=self.cfg.data.train_ds.global_batch_size, @@ -558,7 +575,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg): # When running `trainer.validate()`, the training dataset is not available. else: logging.warning('No training data found, reconfiguring microbatches based on validation batch sizes.') - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=data_cfg.global_batch_size, diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 6a76f88cd229..4461b417f311 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -51,12 +51,9 @@ from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes import Exportable from nemo.utils import AppState, logging, timers -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator, get_micro_batch_size try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_num_microbatches - from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -64,6 +61,20 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import ( + get_micro_batch_size, + get_num_microbatches, + reconfigure_num_microbatches_calculator, + ) + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches + __all__ = ["MegatronNMTModel"] @@ -322,7 +333,7 @@ def eval_step(self, dataloader_iter): # Eval step requires text datasets so we need to reconfigure MBS on each batch. 
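The eval paths above handle a short final batch (`drop_last=False`) by reading the actual per-GPU batch size off the batch itself and scaling by the data-parallel world size before reconfiguring the calculator. A small, self-contained sketch of that size computation; the tensor shape and world size are illustrative, not taken from the code above.

```python
import torch


def effective_global_batch(batch_tokens: torch.Tensor, data_parallel_world_size: int) -> int:
    """Derive the global batch size the calculator should be re-pointed at for this step."""
    per_gpu = batch_tokens.size(0)  # actual per-GPU batch, possibly smaller on the last step
    return per_gpu * data_parallel_world_size


# e.g. a final batch of 3 sequences instead of the configured 8, with 4 data-parallel ranks
tokens = torch.zeros(3, 128, dtype=torch.long)
assert effective_global_batch(tokens, data_parallel_world_size=4) == 12
```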
app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=batch['text_enc'].size(0) * parallel_state.get_data_parallel_world_size(), @@ -543,7 +554,7 @@ def eval_epoch_end(self, outputs, mode): app_state = AppState() if hasattr(self, "_train_ds"): - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=self._cfg.train_ds.global_batch_size, @@ -814,7 +825,7 @@ def list_available_models(self): def on_validation_epoch_start(self): app_state = AppState() - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=parallel_state.get_data_parallel_world_size(), diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 7167eefda637..4f9f04527038 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -253,7 +253,7 @@ def __init__( if self._sequence_parallel and not input_is_parallel: from importlib.metadata import version - from pkg_resources import packaging + import packaging te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.5.0dev") and ( diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py b/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py index a834b9a3fb49..4a180234e3cf 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/qlora.py @@ -15,9 +15,9 @@ from importlib.metadata import version from typing import TYPE_CHECKING, Dict, Optional +import packaging import torch import torch.nn.functional as F -from pkg_resources import packaging from torch import Tensor, nn from nemo.collections.nlp.parts.peft_config import LORA_CONFIG_TO_MCORE_MAP, get_target_modules diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index a82c56c38092..d8fac724e63c 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -16,10 +16,6 @@ import numpy as np import torch -from megatron.core.num_microbatches_calculator import ( - ConstantNumMicroBatchesCalculator, - init_num_microbatches_calculator, -) from nemo.utils import AppState, logging @@ -47,10 +43,38 @@ set_virtual_pipeline_model_parallel_rank, ) + HAVE_MEGATRON_CORE = True + except (ImportError, ModuleNotFoundError): HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import ( + ConstantNumMicroBatchesCalculator, + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = True + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + ) + from 
apex.transformer.pipeline_parallel.utils import ( + setup_microbatch_calculator as init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = False + + try: from apex.transformer.parallel_state import set_virtual_pipeline_model_parallel_world_size @@ -139,25 +163,46 @@ def initialize_model_parallel_for_nemo( if global_batch_size and micro_batch_size is not None: # TODO: add rampup_batch_size here when we have it implemented - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR - - if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: - init_num_microbatches_calculator( - rank=global_rank, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - data_parallel_size=app_state.data_parallel_size, - rampup_batch_size=rampup_batch_size, - ) + if MCORE_MB_CALCULATOR: + from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") else: - if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): - assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_global_batch_size == global_batch_size - assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.micro_batch_size == micro_batch_size - assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.num_micro_batches == global_batch_size // ( - micro_batch_size * app_state.data_parallel_size + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, ) else: - raise Exception("Microbatch calculator already initialized.") + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") app_state._is_megatron_initialized = True diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 7ef6ec2d91e9..e803a622f75d 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -18,10 +18,10 @@ from importlib.metadata import version from typing import Any, Callable, Optional +import packaging import torch import torch.nn as nn from einops import rearrange -from pkg_resources import packaging from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters 
import ( @@ -1525,7 +1525,12 @@ def forward( It indicates if the current step in the forward pass is the first in a gradient accumulation cycle. If set, FP8 weights are cached and some minor optimizations are applied to fuse_wgrad_accumulation """ - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + try: + from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR num_micro_batches = getattr(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, 'num_micro_batches', 1) diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 9d05dc5cdba2..8b9d7cf712c4 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -26,9 +26,9 @@ from nemo.collections.nlp.modules.common.lm_utils import pad_batch from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.utils import logging try: - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.module import Float16Module as MCoreFloat16Module @@ -39,6 +39,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + # the text representation of eos_id, it applies for all tokenizers END_OF_SEQ = '<|endoftext|>' diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 87e88b61c211..a5215b12bfae 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -37,8 +37,7 @@ from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.collections.nlp.modules.common.text_generation_strategy import model_inference_strategy_dispatcher from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, OutputType, SamplingParam -from nemo.utils import AppState -from nemo.utils.apex_utils import _reconfigure_microbatch_calculator +from nemo.utils import AppState, logging try: from megatron.core import parallel_state, tensor_parallel @@ -49,6 +48,15 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import ( + _reconfigure_microbatch_calculator as reconfigure_num_microbatches_calculator, + ) + __all__ = [ "get_default_sampling_params", "get_default_length_params", @@ -897,7 +905,7 @@ def sample_sequence_batch( app_state = AppState() micro_batch_size = 
context_tokens.shape[0] - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=micro_batch_size, @@ -1089,7 +1097,7 @@ def tab_sample_sequence_batch( ): app_state = AppState() micro_batch_size = context_tokens.shape[0] - _reconfigure_microbatch_calculator( + reconfigure_num_microbatches_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=micro_batch_size, diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index f4276fd1b8f9..b2c85cde4e98 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -21,6 +21,7 @@ from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from nemo.collections.common.metrics.perf_metrics import FLOPsMeasurementCallback from nemo.collections.nlp.parts.nlp_overrides import ( CustomProgressBar, FSDPMixedPrecisionPlugin, @@ -173,6 +174,10 @@ def _callbacks(self, callbacks: Optional[list]) -> list: if self.cfg.get('exp_manager', {}).get('checkpoint_callback_params', {}).get('async_save', False): callbacks.append(AsyncFinalizerCallback()) + + if self.cfg.get('exp_manager', {}).get('log_tflops_per_sec_per_gpu', True): + callbacks.append(FLOPsMeasurementCallback(self.cfg)) + return callbacks def create_trainer(self, callbacks=None) -> Trainer: diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 1d1e3d458563..b00b2ac28c3b 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -104,7 +104,6 @@ optim_state_to_sharding_state, ) from megatron.core.dist_checkpointing.strategies import tensorstore - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.tensor_parallel.layers import param_is_not_tensor_parallel_duplicate from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_layer import TransformerLayer as MCoreTransformerLayer @@ -116,6 +115,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + try: from modelopt.torch.opt.plugins import restore_sharded_modelopt_state, save_sharded_modelopt_state diff --git a/nemo/collections/tts/losses/audio_codec_loss.py b/nemo/collections/tts/losses/audio_codec_loss.py index 4454c46291a7..6db3e30595c6 100644 --- a/nemo/collections/tts/losses/audio_codec_loss.py +++ b/nemo/collections/tts/losses/audio_codec_loss.py @@ -19,7 +19,8 @@ from einops import rearrange from nemo.collections.asr.parts.preprocessing.features import FilterbankFeatures -from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths, mask_sequence_tensor +from nemo.collections.common.parts.utils import mask_sequence_tensor +from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths from nemo.core.classes import Loss, typecheck from nemo.core.neural_types import ( AudioSignal, @@ -312,7 +313,7 @@ def forward(self, audio_real, audio_gen, audio_len): # [B, 1] ref_pred = torch.sum(pred * target, dim=-1, keepdim=True) - 
ref_target = torch.sum(target ** 2, dim=-1, keepdim=True) + ref_target = torch.sum(target**2, dim=-1, keepdim=True) alpha = (ref_pred + self.epsilon) / (ref_target + self.epsilon) # [B, T] @@ -320,8 +321,8 @@ def forward(self, audio_real, audio_gen, audio_len): distortion = target_scaled - pred # [B] - target_scaled_power = torch.sum(target_scaled ** 2, dim=-1) - distortion_power = torch.sum(distortion ** 2, dim=-1) + target_scaled_power = torch.sum(target_scaled**2, dim=-1) + distortion_power = torch.sum(distortion**2, dim=-1) ratio = (target_scaled_power + self.epsilon) / (distortion_power + self.epsilon) si_sdr = 10 * torch.log10(ratio) @@ -505,7 +506,7 @@ def forward(self, disc_scores_real, disc_scores_gen): loss = 0.0 for disc_score_real, disc_score_gen in zip(disc_scores_real, disc_scores_gen): loss_real = torch.mean((1 - disc_score_real) ** 2) - loss_gen = torch.mean(disc_score_gen ** 2) + loss_gen = torch.mean(disc_score_gen**2) loss += (loss_real + loss_gen) / 2 loss /= len(disc_scores_real) diff --git a/nemo/collections/tts/losses/spectrogram_enhancer_losses.py b/nemo/collections/tts/losses/spectrogram_enhancer_losses.py index a77f42692b11..ff62fe80e9db 100644 --- a/nemo/collections/tts/losses/spectrogram_enhancer_losses.py +++ b/nemo/collections/tts/losses/spectrogram_enhancer_losses.py @@ -41,7 +41,7 @@ from einops import rearrange from torch.autograd import grad as torch_grad -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor +from nemo.collections.common.parts.utils import mask_sequence_tensor class GradientPenaltyLoss(torch.nn.Module): diff --git a/nemo/collections/tts/models/spectrogram_enhancer.py b/nemo/collections/tts/models/spectrogram_enhancer.py index 7115360e7125..65934d9a10ce 100644 --- a/nemo/collections/tts/models/spectrogram_enhancer.py +++ b/nemo/collections/tts/models/spectrogram_enhancer.py @@ -48,13 +48,14 @@ from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from torch.utils.tensorboard.writer import SummaryWriter +from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.collections.tts.losses.spectrogram_enhancer_losses import ( ConsistencyLoss, GeneratorLoss, GradientPenaltyLoss, HingeLoss, ) -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor, to_device_recursive +from nemo.collections.tts.parts.utils.helpers import to_device_recursive from nemo.core import Exportable, ModelPT, PretrainedModelInfo, typecheck from nemo.core.neural_types import LengthsType, MelSpectrogramType, NeuralType from nemo.core.neural_types.elements import BoolType @@ -128,7 +129,12 @@ def pad_spectrograms(self, spectrograms): } ) def forward( - self, *, input_spectrograms: torch.Tensor, lengths: torch.Tensor, mixing: bool = False, normalize: bool = True, + self, + *, + input_spectrograms: torch.Tensor, + lengths: torch.Tensor, + mixing: bool = False, + normalize: bool = True, ): """ Generator forward pass. Noise inputs will be generated. 
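The TTS losses and enhancer modules above now import `mask_sequence_tensor` from `nemo.collections.common.parts.utils`; the old copy in `tts/parts/utils/helpers.py` is removed later in this diff. A usage sketch under the assumption that the relocated function keeps the documented semantics (zero out positions beyond each sequence's length).

```python
import torch

# New canonical import location after this refactor.
from nemo.collections.common.parts.utils import mask_sequence_tensor

batch, dim, max_len = 2, 4, 6
x = torch.randn(batch, dim, max_len)
lengths = torch.tensor([6, 3])

masked = mask_sequence_tensor(x, lengths)

# Positions past each sequence's length are zeroed; in-bounds positions are untouched.
assert torch.equal(masked[1, :, 3:], torch.zeros(dim, 3))
assert torch.equal(masked[0], x[0])
```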
@@ -263,7 +269,10 @@ def training_step(self, batch, batch_idx, optimizer_idx): return g_loss + c_loss def configure_optimizers(self): - generator_opt = instantiate(self._cfg.generator_opt, params=self.generator.parameters(),) + generator_opt = instantiate( + self._cfg.generator_opt, + params=self.generator.parameters(), + ) discriminator_opt = instantiate(self._cfg.discriminator_opt, params=self.discriminator.parameters()) return [discriminator_opt, generator_opt], [] diff --git a/nemo/collections/tts/modules/audio_codec_modules.py b/nemo/collections/tts/modules/audio_codec_modules.py index 96029d9bd105..e9ed34732c36 100644 --- a/nemo/collections/tts/modules/audio_codec_modules.py +++ b/nemo/collections/tts/modules/audio_codec_modules.py @@ -23,7 +23,7 @@ from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor from nemo.collections.asr.parts.utils.activations import Snake -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor +from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.core.classes.common import typecheck from nemo.core.classes.module import NeuralModule from nemo.core.neural_types.elements import ( @@ -399,7 +399,9 @@ def encode(self, inputs: torch.Tensor, input_len: torch.Tensor) -> torch.Tensor: "indices": NeuralType(('D', 'B', 'T'), Index()), "input_len": NeuralType(tuple('B'), LengthsType()), }, - output_types={"dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()),}, + output_types={ + "dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()), + }, ) @abstractmethod def decode(self, indices: torch.Tensor, input_len: torch.Tensor) -> torch.Tensor: @@ -489,8 +491,7 @@ def round(inputs: torch.Tensor, input_len: torch.Tensor) -> torch.Tensor: return inputs + (inputs_rounded - inputs).detach() def compress(self, inputs: torch.Tensor, input_len: torch.Tensor) -> torch.Tensor: - """Apply compression to the input, to limit to values. - """ + """Apply compression to the input, to limit to values.""" output_scale = (self.num_levels - 1) / 2 # scale down a bit to avoid rounding issues output_scale = output_scale * (1 - self.eps) @@ -520,20 +521,17 @@ def inputs_to_codes(self, inputs: torch.Tensor, input_len: torch.Tensor) -> torc return codes def codes_to_nonnegative(self, codes: torch.Tensor) -> torch.Tensor: - """Convert values centered arouund zero to nonnegative values. - """ + """Convert values centered arouund zero to nonnegative values.""" scale = offset = self.num_levels // 2 return scale * codes + offset def nonnegative_to_codes(self, codes_nonnegative: torch.Tensor) -> torch.Tensor: - """Convert nonnegative values to values centered arouund zero. - """ + """Convert nonnegative values to values centered arouund zero.""" scale = offset = self.num_levels // 2 return (codes_nonnegative - offset) / scale def codes_to_indices(self, codes: torch.Tensor) -> torch.Tensor: - """Converts a code vector to a single index. - """ + """Converts a code vector to a single index.""" if codes.size(1) != self.dim: raise RuntimeError( f'Input code dimension {codes.size(1)} not matching the expected dimension {self.dim}, input codes shape {codes.shape}' @@ -575,8 +573,7 @@ def forward( output_types={"indices": NeuralType(('D', 'B', 'T'), Index())}, ) def encode(self, inputs: torch.Tensor, input_len: Optional[torch.Tensor] = None) -> torch.Tensor: - """Convert a continuous code vector to a single index. 
- """ + """Convert a continuous code vector to a single index.""" _, indices = self(inputs=inputs, input_len=input_len) return indices @@ -585,11 +582,12 @@ def encode(self, inputs: torch.Tensor, input_len: Optional[torch.Tensor] = None) "indices": NeuralType(('D', 'B', 'T'), Index()), "input_len": NeuralType(tuple('B'), LengthsType(), optional=True), }, - output_types={"dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()),}, + output_types={ + "dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()), + }, ) def decode(self, indices: torch.Tensor, input_len: Optional[torch.Tensor] = None) -> torch.Tensor: - """Convert a single index to a continuous code vector. - """ + """Convert a single index to a continuous code vector.""" if indices.size(0) > 1: # codebook dimension used for compatibility with RVQ raise ValueError( @@ -642,8 +640,7 @@ def __init__(self, num_groups: int, num_levels_per_group: List[int], **kwargs): @property def codebook_dim(self): - """Input vector dimension. - """ + """Input vector dimension.""" return self.codebook_dim_per_group * self.num_groups @property @@ -654,12 +651,11 @@ def codebook_size_per_group(self): @property def codebook_size(self): """Returns the size of the implicit codebook.""" - return self.codebook_size_per_group ** self.num_groups + return self.codebook_size_per_group**self.num_groups @typecheck() def forward(self, inputs, input_len): - """Quantize each group separately, then concatenate the results. - """ + """Quantize each group separately, then concatenate the results.""" inputs_grouped = inputs.chunk(self.num_groups, dim=1) dequantized, indices = [], [] @@ -685,8 +681,7 @@ def forward(self, inputs, input_len): output_types={"indices": NeuralType(('D', 'B', 'T'), Index())}, ) def encode(self, inputs: torch.Tensor, input_len: torch.Tensor) -> torch.Tensor: - """Input is split into groups, each group is encoded separately, then the results are concatenated. - """ + """Input is split into groups, each group is encoded separately, then the results are concatenated.""" inputs_grouped = inputs.chunk(self.num_groups, dim=1) indices = [] @@ -704,11 +699,12 @@ def encode(self, inputs: torch.Tensor, input_len: torch.Tensor) -> torch.Tensor: "indices": NeuralType(('D', 'B', 'T'), Index()), "input_len": NeuralType(tuple('B'), LengthsType()), }, - output_types={"dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()),}, + output_types={ + "dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()), + }, ) def decode(self, indices: torch.Tensor, input_len: torch.Tensor) -> torch.Tensor: - """Input indices are split into groups, each group is decoded separately, then the results are concatenated. 
- """ + """Input indices are split into groups, each group is decoded separately, then the results are concatenated.""" indices_grouped = indices.chunk(self.num_groups, dim=0) dequantized = [] diff --git a/nemo/collections/tts/modules/encodec_modules.py b/nemo/collections/tts/modules/encodec_modules.py index e93c7c799550..e9a1556ab700 100644 --- a/nemo/collections/tts/modules/encodec_modules.py +++ b/nemo/collections/tts/modules/encodec_modules.py @@ -43,6 +43,7 @@ from einops import rearrange, repeat from torch import Tensor +from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.collections.tts.losses.audio_codec_loss import MaskedMSELoss from nemo.collections.tts.modules.audio_codec_modules import ( CodecActivation, @@ -53,7 +54,6 @@ get_down_sample_padding, ) from nemo.collections.tts.parts.utils.distributed import broadcast_tensors -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor from nemo.core.classes.common import typecheck from nemo.core.classes.module import NeuralModule from nemo.core.neural_types.elements import AudioSignal, EncodedRepresentation, Index, LengthsType, LossType, VoidType @@ -266,7 +266,10 @@ def __init__( out_channels = in_channels // 2 kernel_size = 2 * up_sample_rate up_sample_conv = ConvTranspose1dNorm( - in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=up_sample_rate, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=up_sample_rate, ) in_channels = out_channels self.up_sample_conv_layers.append(up_sample_conv) @@ -681,7 +684,10 @@ def encode(self, inputs, input_len): return indices @typecheck( - input_types={"indices": NeuralType(('B', 'T'), Index()), "input_len": NeuralType(tuple('B'), LengthsType()),}, + input_types={ + "indices": NeuralType(('B', 'T'), Index()), + "input_len": NeuralType(tuple('B'), LengthsType()), + }, output_types={"dequantized": NeuralType(('B', 'T', 'D'), EncodedRepresentation())}, ) def decode(self, indices, input_len): @@ -801,7 +807,9 @@ def encode(self, inputs: Tensor, input_len: Tensor) -> Tensor: "indices": NeuralType(('D', 'B', 'T'), Index()), "input_len": NeuralType(tuple('B'), LengthsType()), }, - output_types={"dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()),}, + output_types={ + "dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()), + }, ) def decode(self, indices: Tensor, input_len: Tensor) -> Tensor: # [B, T, D] @@ -852,8 +860,7 @@ def __init__(self, num_codebooks: int, num_groups: int, codebook_dim: int, **kwa @property def num_codebooks_per_group(self): - """Number of codebooks for each group. - """ + """Number of codebooks for each group.""" if self.num_codebooks % self.num_groups != 0: raise ValueError( f'num_codebooks ({self.num_codebooks}) must be divisible by num_groups ({self.num_groups})' @@ -863,8 +870,7 @@ def num_codebooks_per_group(self): @property def codebook_dim_per_group(self): - """Input vector dimension for each group. - """ + """Input vector dimension for each group.""" if self.codebook_dim % self.num_groups != 0: raise ValueError(f'codebook_dim ({self.codebook_dim}) must be divisible by num_groups ({self.num_groups})') @@ -881,8 +887,7 @@ def output_types(self): @typecheck() def forward(self, inputs, input_len): - """Quantize each group separately, then concatenate the results. 
- """ + """Quantize each group separately, then concatenate the results.""" inputs_grouped = inputs.chunk(self.num_groups, dim=1) dequantized, indices = [], [] @@ -910,8 +915,7 @@ def forward(self, inputs, input_len): output_types={"indices": NeuralType(('D', 'B', 'T'), Index())}, ) def encode(self, inputs: Tensor, input_len: Tensor) -> Tensor: - """Input is split into groups, each group is encoded separately, then the results are concatenated. - """ + """Input is split into groups, each group is encoded separately, then the results are concatenated.""" inputs_grouped = inputs.chunk(self.num_groups, dim=1) indices = [] @@ -929,11 +933,12 @@ def encode(self, inputs: Tensor, input_len: Tensor) -> Tensor: "indices": NeuralType(('D', 'B', 'T'), Index()), "input_len": NeuralType(tuple('B'), LengthsType()), }, - output_types={"dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()),}, + output_types={ + "dequantized": NeuralType(('B', 'D', 'T'), EncodedRepresentation()), + }, ) def decode(self, indices: Tensor, input_len: Tensor) -> Tensor: - """Input indices are split into groups, each group is decoded separately, then the results are concatenated. - """ + """Input indices are split into groups, each group is decoded separately, then the results are concatenated.""" indices_grouped = indices.chunk(self.num_groups, dim=0) dequantized = [] diff --git a/nemo/collections/tts/modules/spectrogram_enhancer.py b/nemo/collections/tts/modules/spectrogram_enhancer.py index 2cc88264a7d2..20866363d869 100644 --- a/nemo/collections/tts/modules/spectrogram_enhancer.py +++ b/nemo/collections/tts/modules/spectrogram_enhancer.py @@ -46,7 +46,7 @@ from einops import rearrange from kornia.filters import filter2d -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor +from nemo.collections.common.parts.utils import mask_sequence_tensor class Blur(torch.nn.Module): @@ -99,7 +99,10 @@ def __init__(self, latent_dim, input_channel, upsample, channels=3): self.conv = Conv2DModulated(input_channel, out_filters, 1, demod=False) self.upsample = ( - torch.nn.Sequential(torch.nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False), Blur(),) + torch.nn.Sequential( + torch.nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False), + Blur(), + ) if upsample else None ) @@ -125,7 +128,15 @@ class Conv2DModulated(torch.nn.Module): """ def __init__( - self, in_chan, out_chan, kernel, demod=True, stride=1, dilation=1, eps=1e-8, **kwargs, + self, + in_chan, + out_chan, + kernel, + demod=True, + stride=1, + dilation=1, + eps=1e-8, + **kwargs, ): super().__init__() self.filters = out_chan @@ -148,7 +159,7 @@ def forward(self, x, y): weights = w2 * (w1 + 1) if self.demod: - d = torch.rsqrt((weights ** 2).sum(dim=(2, 3, 4), keepdim=True) + self.eps) + d = torch.rsqrt((weights**2).sum(dim=(2, 3, 4), keepdim=True) + self.eps) weights = weights * d x = x.reshape(1, -1, h, w) @@ -165,7 +176,13 @@ def forward(self, x, y): class GeneratorBlock(torch.nn.Module): def __init__( - self, latent_dim, input_channels, filters, upsample=True, upsample_rgb=True, channels=1, + self, + latent_dim, + input_channels, + filters, + upsample=True, + upsample_rgb=True, + channels=1, ): super().__init__() self.upsample = torch.nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) if upsample else None @@ -257,7 +274,12 @@ def __init__( not_last = ind != (self.num_layers - 1) block = GeneratorBlock( - latent_dim, in_chan, out_chan, upsample=not_first, upsample_rgb=not_last, channels=channels, + 
latent_dim, + in_chan, + out_chan, + upsample=not_first, + upsample_rgb=not_last, + channels=channels, ) self.blocks.append(block) @@ -315,14 +337,18 @@ def forward(self, condition: torch.Tensor, lengths: torch.Tensor, ws: List[torch class Discriminator(torch.nn.Module): def __init__( - self, n_bands, network_capacity=16, channels=1, fmap_max=512, + self, + n_bands, + network_capacity=16, + channels=1, + fmap_max=512, ): super().__init__() num_layers = int(log2(n_bands) - 1) num_init_filters = channels blocks = [] - filters = [num_init_filters] + [(network_capacity * 4) * (2 ** i) for i in range(num_layers + 1)] + filters = [num_init_filters] + [(network_capacity * 4) * (2**i) for i in range(num_layers + 1)] set_fmap_max = partial(min, fmap_max) filters = list(map(set_fmap_max, filters)) diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index 08d31390107b..a4c65f9ed0e5 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -123,23 +123,26 @@ def binarize_attention(attn, in_len, out_len): def binarize_attention_parallel(attn, in_lens, out_lens): """For training purposes only. Binarizes attention with MAS. - These will no longer receive a gradient. + These will no longer receive a gradient. - Args: - attn: B x 1 x max_mel_len x max_text_len - """ + Args: + attn: B x 1 x max_mel_len x max_text_len + """ with torch.no_grad(): log_attn_cpu = torch.log(attn.data).cpu().numpy() attn_out = b_mas(log_attn_cpu, in_lens.cpu().numpy(), out_lens.cpu().numpy(), width=1) return torch.from_numpy(attn_out).to(attn.device) -def get_mask_from_lengths(lengths: Optional[torch.Tensor] = None, x: Optional[torch.Tensor] = None,) -> torch.Tensor: +def get_mask_from_lengths( + lengths: Optional[torch.Tensor] = None, + x: Optional[torch.Tensor] = None, +) -> torch.Tensor: """Constructs binary mask from a 1D torch tensor of input lengths Args: lengths: Optional[torch.tensor] (torch.tensor): 1D tensor with lengths - x: Optional[torch.tensor] = tensor to be used on, last dimension is for mask + x: Optional[torch.tensor] = tensor to be used on, last dimension is for mask Returns: mask (torch.tensor): num_sequences x max_length binary tensor """ @@ -168,7 +171,7 @@ def sort_tensor( context: tensor sorted by lens along dimension dim lens_sorted: lens tensor, sorted ids_sorted: reorder ids to be used to restore original order - + """ lens_sorted, ids_sorted = torch.sort(lens, descending=descending) context = torch.index_select(context, dim, ids_sorted) @@ -177,13 +180,13 @@ def sort_tensor( def unsort_tensor(ordered: torch.Tensor, indices: torch.Tensor, dim: Optional[int] = 0) -> torch.Tensor: """Reverses the result of sort_tensor function: - o, _, ids = sort_tensor(x,l) + o, _, ids = sort_tensor(x,l) assert unsort_tensor(o,ids) == x Args: ordered: context tensor, sorted by lengths indices: torch.tensor: 1D tensor with 're-order' indices returned by sort_tensor Returns: - ordered tensor in original order (before calling sort_tensor) + ordered tensor in original order (before calling sort_tensor) """ return torch.index_select(ordered, dim, indices.argsort(0)) @@ -294,7 +297,7 @@ def log_audio_to_tb( log_mel = spect.data.cpu().numpy().T mel = np.exp(log_mel) magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale - audio = griffin_lim(magnitude.T ** griffin_lim_power) + audio = griffin_lim(magnitude.T**griffin_lim_power) swriter.add_audio(name, audio / max(np.abs(audio)), step, sample_rate=sr) @@ -317,10 +320,16 @@ 
def tacotron2_log_to_tb_func( _, spec_target, mel_postnet, gate, gate_target, alignments = tensors if log_images and step % log_images_freq == 0: swriter.add_image( - f"{tag}_alignment", plot_alignment_to_numpy(alignments[0].data.cpu().numpy().T), step, dataformats="HWC", + f"{tag}_alignment", + plot_alignment_to_numpy(alignments[0].data.cpu().numpy().T), + step, + dataformats="HWC", ) swriter.add_image( - f"{tag}_mel_target", plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()), step, dataformats="HWC", + f"{tag}_mel_target", + plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()), + step, + dataformats="HWC", ) swriter.add_image( f"{tag}_mel_predicted", @@ -330,7 +339,10 @@ def tacotron2_log_to_tb_func( ) swriter.add_image( f"{tag}_gate", - plot_gate_outputs_to_numpy(gate_target[0].data.cpu().numpy(), torch.sigmoid(gate[0]).data.cpu().numpy(),), + plot_gate_outputs_to_numpy( + gate_target[0].data.cpu().numpy(), + torch.sigmoid(gate[0]).data.cpu().numpy(), + ), step, dataformats="HWC", ) @@ -340,13 +352,13 @@ def tacotron2_log_to_tb_func( log_mel = mel_postnet[0].data.cpu().numpy().T mel = np.exp(log_mel) magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale - audio = griffin_lim(magnitude.T ** griffin_lim_power) + audio = griffin_lim(magnitude.T**griffin_lim_power) swriter.add_audio(f"audio/{tag}_predicted", audio / max(np.abs(audio)), step, sample_rate=sr) log_mel = spec_target[0].data.cpu().numpy().T mel = np.exp(log_mel) magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale - audio = griffin_lim(magnitude.T ** griffin_lim_power) + audio = griffin_lim(magnitude.T**griffin_lim_power) swriter.add_audio(f"audio/{tag}_target", audio / max(np.abs(audio)), step, sample_rate=sr) @@ -373,16 +385,26 @@ def tacotron2_log_to_wandb_func( specs = [] gates = [] alignments += [ - wandb.Image(plot_alignment_to_numpy(alignments[0].data.cpu().numpy().T), caption=f"{tag}_alignment",) + wandb.Image( + plot_alignment_to_numpy(alignments[0].data.cpu().numpy().T), + caption=f"{tag}_alignment", + ) ] alignments += [ - wandb.Image(plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()), caption=f"{tag}_mel_target",), - wandb.Image(plot_spectrogram_to_numpy(mel_postnet[0].data.cpu().numpy()), caption=f"{tag}_mel_predicted",), + wandb.Image( + plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()), + caption=f"{tag}_mel_target", + ), + wandb.Image( + plot_spectrogram_to_numpy(mel_postnet[0].data.cpu().numpy()), + caption=f"{tag}_mel_predicted", + ), ] gates += [ wandb.Image( plot_gate_outputs_to_numpy( - gate_target[0].data.cpu().numpy(), torch.sigmoid(gate[0]).data.cpu().numpy(), + gate_target[0].data.cpu().numpy(), + torch.sigmoid(gate[0]).data.cpu().numpy(), ), caption=f"{tag}_gate", ) @@ -396,16 +418,24 @@ def tacotron2_log_to_wandb_func( log_mel = mel_postnet[0].data.cpu().numpy().T mel = np.exp(log_mel) magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale - audio_pred = griffin_lim(magnitude.T ** griffin_lim_power) + audio_pred = griffin_lim(magnitude.T**griffin_lim_power) log_mel = spec_target[0].data.cpu().numpy().T mel = np.exp(log_mel) magnitude = np.dot(mel, filterbank) * griffin_lim_mag_scale - audio_true = griffin_lim(magnitude.T ** griffin_lim_power) + audio_true = griffin_lim(magnitude.T**griffin_lim_power) audios += [ - wandb.Audio(audio_true / max(np.abs(audio_true)), caption=f"{tag}_wav_target", sample_rate=sr,), - wandb.Audio(audio_pred / max(np.abs(audio_pred)), caption=f"{tag}_wav_predicted", sample_rate=sr,), + wandb.Audio( + audio_true / 
max(np.abs(audio_true)), + caption=f"{tag}_wav_target", + sample_rate=sr, + ), + wandb.Audio( + audio_pred / max(np.abs(audio_pred)), + caption=f"{tag}_wav_predicted", + sample_rate=sr, + ), ] swriter.log({"audios": audios}) @@ -505,10 +535,22 @@ def create_plot(data, x_axis, y_axis, output_filepath=None): def plot_gate_outputs_to_numpy(gate_targets, gate_outputs): fig, ax = plt.subplots(figsize=(12, 3)) ax.scatter( - range(len(gate_targets)), gate_targets, alpha=0.5, color='green', marker='+', s=1, label='target', + range(len(gate_targets)), + gate_targets, + alpha=0.5, + color='green', + marker='+', + s=1, + label='target', ) ax.scatter( - range(len(gate_outputs)), gate_outputs, alpha=0.5, color='red', marker='.', s=1, label='predicted', + range(len(gate_outputs)), + gate_outputs, + alpha=0.5, + color='red', + marker='.', + s=1, + label='predicted', ) plt.xlabel("Frames (Green target, Red predicted)") @@ -530,24 +572,40 @@ def save_figure_to_numpy(fig): @rank_zero_only def waveglow_log_to_tb_func( - swriter, tensors, step, tag="train", n_fft=1024, hop_length=256, window="hann", mel_fb=None, + swriter, + tensors, + step, + tag="train", + n_fft=1024, + hop_length=256, + window="hann", + mel_fb=None, ): _, audio_pred, spec_target, mel_length = tensors mel_length = mel_length[0] spec_target = spec_target[0].data.cpu().numpy()[:, :mel_length] swriter.add_image( - f"{tag}_mel_target", plot_spectrogram_to_numpy(spec_target), step, dataformats="HWC", + f"{tag}_mel_target", + plot_spectrogram_to_numpy(spec_target), + step, + dataformats="HWC", ) if mel_fb is not None: mag, _ = librosa.core.magphase( librosa.core.stft( - np.nan_to_num(audio_pred[0].cpu().detach().numpy()), n_fft=n_fft, hop_length=hop_length, window=window, + np.nan_to_num(audio_pred[0].cpu().detach().numpy()), + n_fft=n_fft, + hop_length=hop_length, + window=window, ) ) mel_pred = np.matmul(mel_fb.cpu().numpy(), mag).squeeze() log_mel_pred = np.log(np.clip(mel_pred, a_min=1e-5, a_max=None)) swriter.add_image( - f"{tag}_mel_predicted", plot_spectrogram_to_numpy(log_mel_pred[:, :mel_length]), step, dataformats="HWC", + f"{tag}_mel_predicted", + plot_spectrogram_to_numpy(log_mel_pred[:, :mel_length]), + step, + dataformats="HWC", ) @@ -560,7 +618,12 @@ def remove(conv_list): def regulate_len( - durations, enc_out, pace=1.0, mel_max_len=None, group_size=1, dur_lens: torch.tensor = None, + durations, + enc_out, + pace=1.0, + mel_max_len=None, + group_size=1, + dur_lens: torch.tensor = None, ): """A function that takes predicted durations per encoded token, and repeats enc_out according to the duration. NOTE: durations.shape[1] == enc_out.shape[1] @@ -724,30 +787,6 @@ def to_device_recursive(e, device: torch.device): return e -def mask_sequence_tensor(tensor: torch.Tensor, lengths: torch.Tensor): - """ - For tensors containing sequences, zero out out-of-bound elements given lengths of every element in the batch. 
- - tensor: tensor of shape (B, D, L) or (B, D1, D2, L), - lengths: LongTensor of shape (B,) - """ - batch_size, *_, max_lengths = tensor.shape - - if len(tensor.shape) == 2: - mask = torch.ones(batch_size, max_lengths).cumsum(dim=-1).type_as(lengths) - mask = mask <= rearrange(lengths, "b -> b 1") - elif len(tensor.shape) == 3: - mask = torch.ones(batch_size, 1, max_lengths).cumsum(dim=-1).type_as(lengths) - mask = mask <= rearrange(lengths, "b -> b 1 1") - elif len(tensor.shape) == 4: - mask = torch.ones(batch_size, 1, 1, max_lengths).cumsum(dim=-1).type_as(lengths) - mask = mask <= rearrange(lengths, "b -> b 1 1 1") - else: - raise ValueError("Can only mask tensors of shape B x D x L and B x D1 x D2 x L") - - return tensor * mask - - @torch.jit.script def batch_from_ragged( text: torch.Tensor, @@ -786,13 +825,16 @@ def batch_from_ragged( def sample_tts_input( - export_config, device, max_batch=1, max_dim=127, + export_config, + device, + max_batch=1, + max_dim=127, ): """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ + Generates input examples for tracing etc. + Returns: + A tuple of input examples. + """ sz = (max_batch * max_dim,) if export_config["enable_ragged_batches"] else (max_batch, max_dim) inp = torch.randint(*export_config["emb_range"], sz, device=device, dtype=torch.int64) pitch = torch.randn(sz, device=device, dtype=torch.float32) * 0.5 diff --git a/nemo/collections/vision/models/megatron_vit_classification_models.py b/nemo/collections/vision/models/megatron_vit_classification_models.py index 3417b04299dc..5cffdd6d12a3 100644 --- a/nemo/collections/vision/models/megatron_vit_classification_models.py +++ b/nemo/collections/vision/models/megatron_vit_classification_models.py @@ -42,7 +42,6 @@ try: from megatron.core import parallel_state - from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -51,6 +50,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + class VitClassificationModel(MegatronModule): """Vision Transformer Model.""" diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py index 03afec176325..8ee3fa1c05e7 100644 --- a/nemo/export/multimodal/build.py +++ b/nemo/export/multimodal/build.py @@ -208,7 +208,10 @@ def forward(self, images): return vision_x encoder = AutoModel.from_pretrained( - vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True + vision_config["from_pretrained"], + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation='eager', ) vision_encoder = encoder.vision_model hf_config = encoder.config @@ -326,7 +329,10 @@ def forward(self, images): return vision_x encoder = AutoModel.from_pretrained( - vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True + vision_config["from_pretrained"], + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation='eager', ) vision_encoder = encoder.vision_model hf_config = encoder.config diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index e645ed8971c3..590cf50c804c 100644 --- a/nemo/export/quantize/quantizer.py +++ 
b/nemo/export/quantize/quantizer.py @@ -225,7 +225,8 @@ def export(self, model: MegatronGPTModel): assert self.export_config is not None, "Export config is not set" torch_dtype = torch_dtype_from_precision(self.export_config.dtype) - self._sample_output(model) + if self.export_config.get("sample_output", True): + self._sample_output(model) if model.cfg.megatron_amp_O2: model.model = unwrap_model(model.model, Float16Module) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 08f1e4fe74e6..3c73da1c0731 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import json import logging import os @@ -23,9 +24,11 @@ from typing import List, Optional import numpy as np +import safetensors import tensorrt_llm import torch import wrapt +from tensorrt_llm._utils import numpy_to_torch from nemo.deploy import ITritonDeployable from nemo.export.tarutils import TarPath, unpack_tarball @@ -38,26 +41,20 @@ is_nemo_file, load_nemo_model, ) +from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm +from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer +from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, load_distributed, refit -LOGGER = logging.getLogger("NeMo") - -use_model_opt = True -try: - from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm - from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer - from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint -except Exception as e: - LOGGER.warning(f"Cannot import the Model Optimizer, it will not be available. {type(e).__name__}: {e}") - use_model_opt = False - use_deploy = True try: from nemo.deploy.utils import cast_output, str_ndarray2list except Exception as e: use_deploy = False +LOGGER = logging.getLogger("NeMo") + @wrapt.decorator def noop_decorator(func): @@ -143,16 +140,16 @@ def export( nemo_checkpoint_path: str, model_type: Optional[str] = None, delete_existing_files: bool = True, - n_gpus: int = None, + n_gpus: Optional[int] = None, tensor_parallelism_size: int = 1, pipeline_parallelism_size: int = 1, - gpus_per_node: int = None, + gpus_per_node: Optional[int] = None, max_input_len: int = 256, max_output_len: int = 256, max_input_token: Optional[int] = None, max_output_token: Optional[int] = None, max_batch_size: int = 8, - max_prompt_embedding_table_size=None, + max_prompt_embedding_table_size: Optional[int] = None, use_parallel_embedding: bool = False, use_embedding_sharing: bool = False, paged_kv_cache: bool = True, @@ -164,9 +161,9 @@ def export( use_lora_plugin: str = None, lora_target_modules: List[str] = None, max_lora_rank: int = 64, - max_num_tokens: int = None, - opt_num_tokens: int = None, - max_seq_len: int = None, + max_num_tokens: Optional[int] = None, + opt_num_tokens: Optional[int] = None, + max_seq_len: Optional[int] = None, multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", @@ -255,16 +252,22 @@ def export( ) max_output_len = max_output_token + if max_seq_len is None: + max_seq_len = max_input_len + max_output_len + + if max_batch_size < 4: + warnings.warn( + "TensorRT LLM may hit a runtime issue with batch size is smaller than 4 on some models." 
+ " Force set to 4", + stacklevel=2, + ) + max_batch_size = 4 + if tensorrt_llm.mpi_rank() == 0: tmp_dir = tempfile.TemporaryDirectory() nemo_export_dir = Path(tmp_dir.name) - is_qnemo_ckpt = False - if use_model_opt: - if is_qnemo_checkpoint(nemo_checkpoint_path): - is_qnemo_ckpt = True - - if is_qnemo_ckpt: + if is_qnemo_checkpoint(nemo_checkpoint_path): if os.path.isdir(nemo_checkpoint_path): nemo_export_dir = nemo_checkpoint_path else: @@ -277,6 +280,7 @@ def export( engine_dir=self.model_dir, max_input_len=max_input_len, max_output_len=max_output_len, + max_seq_len=max_seq_len, max_batch_size=max_batch_size, max_prompt_embedding_table_size=max_prompt_embedding_table_size, tensor_parallel_size=tensor_parallelism_size, @@ -290,6 +294,7 @@ def export( max_lora_rank=max_lora_rank, max_num_tokens=max_num_tokens, opt_num_tokens=opt_num_tokens, + multiple_profiles=multiple_profiles, ) else: if model_type is None: @@ -365,6 +370,84 @@ def export( if load_model: self._load() + def convert_to_safe_tensors( + self, + nemo_checkpoint_path: str, + model_type: Optional[str] = None, + delete_existing_files: bool = True, + tensor_parallelism_size: int = 1, + pipeline_parallelism_size: int = 1, + gpus_per_node: int = None, + use_parallel_embedding: bool = False, + use_embedding_sharing: bool = False, + dtype: str = "bfloat16", + ): + gpus_per_node = tensor_parallelism_size if gpus_per_node is None else gpus_per_node + + if Path(self.model_dir).exists(): + if delete_existing_files and len(os.listdir(self.model_dir)) > 0: + for files in os.listdir(self.model_dir): + path = os.path.join(self.model_dir, files) + try: + shutil.rmtree(path) + except OSError: + os.remove(path) + + if len(os.listdir(self.model_dir)) > 0: + raise Exception("Couldn't delete all files.") + elif len(os.listdir(self.model_dir)) > 0: + raise Exception("There are files in this folder. 
Try setting delete_existing_files=True.") + else: + Path(self.model_dir).mkdir(parents=True, exist_ok=True) + + if model_type == "gpt" or model_type == "starcoder": + model_type = "gptnext" + + if model_type == "mixtral": + model_type = "llama" + + if tensorrt_llm.mpi_rank() == 0: + tmp_dir = tempfile.TemporaryDirectory() + nemo_export_dir = Path(tmp_dir.name) + + model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) + weights_dicts, model_configs = model_to_trtllm_ckpt( + model=model, + nemo_model_config=model_configs, + nemo_export_dir=nemo_export_dir, + decoder_type=model_type, + dtype=dtype, + tensor_parallel_size=tensor_parallelism_size, + pipeline_parallel_size=pipeline_parallelism_size, + gpus_per_node=gpus_per_node, + use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, + ) + + for weight_dict, model_config in zip(weights_dicts, model_configs): + rank = model_config.mapping.tp_rank + for k, v in weight_dict.items(): + weight_dict[k] = numpy_to_torch(v) + + safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors')) + + model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json')) + + tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") + if os.path.exists(tokenizer_path): + shutil.copy(tokenizer_path, self.model_dir) + else: + self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer')) + + nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml") + if os.path.exists(nemo_model_config): + shutil.copy(nemo_model_config, self.model_dir) + + tmp_dir.cleanup() + + if tensorrt_llm.mpi_world_size() > 1: + tensorrt_llm.mpi_barrier() + def build( self, model, @@ -437,6 +520,8 @@ def refit(self, model, model_config): tokenizer_vocab_size=self.tokenizer.vocab_size, ) load_distributed(self.model_dir, self.mp_rank, self.gpus_per_node) + gc.collect() + torch.cuda.empty_cache() refit(weights_dict) def forward( diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py index 0345f979b8c2..db8a66308047 100644 --- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py +++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py @@ -59,7 +59,7 @@ def get_layer_prefix(layer_names, is_mcore): if 'self_attention' in layer_name: transformer_layer_prefix = layer_name.split('layers')[0] break - assert transformer_layer_prefix is not None, "Cannot extract transformer layer prefix from {layer_name}" + assert transformer_layer_prefix is not None, f"Cannot extract transformer layer prefix from {layer_name}" if is_mcore: model_prefix = transformer_layer_prefix.split('decoder')[0] else: diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 1d473f497f51..1b711b5edbf3 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -25,7 +25,9 @@ import torch import yaml import zarr -from torch.distributed.checkpoint import FileSystemReader +from tensorrt_llm._utils import np_bfloat16 +from torch.distributed.checkpoint import FileSystemReader, TensorStorageMetadata +from torch.distributed.checkpoint.state_dict_loader import load_state_dict from transformers import AutoTokenizer, PreTrainedTokenizer from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer @@ -56,9 +58,11 @@ class TarFileSystemReader(FileSystemReader): """ def 
__init__(self, path: Union[Path, TarPath]) -> None: - """No call to super().__init__ because it expects pure Path.""" - self.path = path - self.storage_data = dict() + """Makes sure that super().__init__ gets a pure path as expected.""" + super_path = str(path) if isinstance(path, TarPath) else path + super().__init__(super_path) + if isinstance(path, TarPath): + self.path = path # overwrites path set in super().__init__ call def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch_tensor=True): @@ -244,9 +248,7 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat tokenizer_config["model"] = os.path.join(nemo_export_dir, "tokenizer.model") tokenizer = build_tokenizer(tokenizer_config) else: - raise Exception( - "Not a supported nemo file format. " "Only distributed mcore nemo checkpoints are support." - ) + raise Exception("Not a supported NeMo file format: only distributed MCore NeMo checkpoints are supported.") finally: if isinstance(nemo_dir, TarPath): nemo_dir.tarobject.close() diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index 630330381e56..921c6535a57a 100644 --- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. - import glob import os +import subprocess import warnings from typing import List, Optional -from modelopt.deploy.llm import build_tensorrt_llm +from tensorrt_llm.models import PretrainedConfig from nemo.export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME @@ -28,50 +28,97 @@ def qnemo_to_tensorrt_llm( engine_dir: str, max_input_len: int, max_output_len: int, + max_seq_len: Optional[int], max_batch_size: int, max_prompt_embedding_table_size: int, - tensor_parallel_size: int = None, - pipeline_parallel_size: int = None, + tensor_parallel_size: Optional[int] = None, + pipeline_parallel_size: Optional[int] = None, use_parallel_embedding: bool = False, paged_kv_cache: bool = True, remove_input_padding: bool = True, enable_multi_block_mode: bool = False, - use_lora_plugin: str = None, + use_lora_plugin: Optional[str] = None, lora_target_modules: Optional[List[str]] = None, max_lora_rank: int = 64, - max_num_tokens: int = None, - opt_num_tokens: int = None, + max_num_tokens: Optional[int] = None, + opt_num_tokens: Optional[int] = None, + max_beam_width: int = 1, + multiple_profiles: bool = False, ): - """Build TensorRT-LLM engine with ModelOpt build_tensorrt_llm function.""" + """Build TensorRT-LLM engine with trtllm-build command in a subprocess.""" assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}" warnings.warn( - "Note that setting tensor_parallel_size and pipeline_parallel_size parameters" - " for quantized models should be done on calibration step with nemo.export.quantize module." + "Note that setting tensor_parallel_size, pipeline_parallel_size and use_parallel_embedding " + " parameters for quantized models is done on calibration step with nemo.export.quantize module." 
" These parameters are ignored when building and running TensorRT-LLM engine below.", UserWarning, stacklevel=3, ) - warnings.warn( - "Also use_parallel_embedding, paged_kv_cache, remove_input_padding, enable_multi_block_mode, max_num_tokens" - " and opt_num_tokens parameters are set by ModelOpt build_tensorrt_llm function in the optimal way and are" - " ignored on engine build step.", - UserWarning, - stacklevel=3, - ) - num_build_workers = len(glob.glob(os.path.join(nemo_checkpoint_path, WEIGHTS_NAME.format("*")))) assert num_build_workers, f"No TensorRT-LLM weight files found in {nemo_checkpoint_path}" - build_tensorrt_llm( - pretrained_config=os.path.join(nemo_checkpoint_path, CONFIG_NAME), - engine_dir=engine_dir, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - max_beam_width=1, - num_build_workers=num_build_workers, - enable_sparsity=False, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - ) + config = PretrainedConfig.from_json_file(os.path.join(nemo_checkpoint_path, CONFIG_NAME)) + + log_level = "warning" + + quant_algo = config.quantization.quant_algo + + use_fused_mlp = quant_algo in [ + "FP8", + None, + ] and config.hidden_act in ["silu", "swiglu", "fast-swiglu", "gelu", "geglu"] + + use_qdq = quant_algo in ["FP8", "W8A8_SQ_PER_CHANNEL"] + + builder_opt = 4 if "RecurrentGemma" not in config.architecture else 0 + + speculative_decoding_mode = "medusa" if "Medusa" in config.architecture else None + + build_cmd = "trtllm-build " + build_cmd += f"--checkpoint_dir {nemo_checkpoint_path} " + build_cmd += f"--log_level {log_level} " + build_cmd += f"--output_dir {engine_dir} " + build_cmd += f"--workers {num_build_workers} " + build_cmd += f"--max_batch_size {max_batch_size} " + build_cmd += f"--max_input_len {max_input_len} " + build_cmd += f"--max_output_len {max_output_len} " + build_cmd += f"--max_beam_width {max_beam_width} " + build_cmd += f"--tp_size {config.mapping.tp_size} " + build_cmd += f"--pp_size {config.mapping.pp_size} " + build_cmd += f"--max_prompt_embedding_table_size {max_prompt_embedding_table_size} " + build_cmd += f"--builder_opt {builder_opt} " + build_cmd += f"--gpt_attention_plugin {config.dtype} " + build_cmd += f"--nccl_plugin {config.dtype} " + build_cmd += f"--paged_kv_cache {'enable' if paged_kv_cache else 'disable'} " + build_cmd += f"--remove_input_padding {'enable' if remove_input_padding else 'disable'} " + build_cmd += f"--multi_block_mode {'enable' if enable_multi_block_mode else 'disable'} " + build_cmd += f"--multiple_profiles {'enable' if multiple_profiles else 'disable'} " + + if use_fused_mlp: + build_cmd += "--use_fused_mlp " if "RecurrentGemma" not in config.architecture else "" + + if not use_qdq: + build_cmd += f"--gemm_plugin {config.dtype} " + + if max_seq_len: + build_cmd += f"--max_seq_len {max_seq_len} " + + if max_num_tokens is not None: + build_cmd += f"--max_num_tokens {max_num_tokens} " + else: + build_cmd += f"--max_num_tokens {max_batch_size * max_input_len} " + + if opt_num_tokens is not None: + build_cmd += f"--opt_num_tokens {opt_num_tokens} " + + if speculative_decoding_mode: + build_cmd += f"--speculative_decoding_mode {speculative_decoding_mode} " + + build_cmd = build_cmd.replace("--", "\\\n --") # Separate parameters line by line + + print("trtllm-build command:") + print(build_cmd) + + subprocess.run(build_cmd, shell=True, check=True) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 
d04698c318bf..1544fdf032d8 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -76,9 +76,6 @@ def build_and_save_engine( plugin_config.use_paged_context_fmha = paged_context_fmha plugin_config.multiple_profiles = multiple_profiles - if max_seq_len is None: - max_seq_len = max_input_len + max_output_len - max_num_tokens, opt_num_tokens = check_max_num_tokens( max_num_tokens=max_num_tokens, opt_num_tokens=opt_num_tokens, diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py index 24c3772b902b..de06ea830e07 100644 --- a/nemo/export/vllm_exporter.py +++ b/nemo/export/vllm_exporter.py @@ -244,6 +244,7 @@ def export( speculative_config=None, decoding_config=None, observability_config=None, + prompt_adapter_config=None, executor_class=executor_class, log_stats=log_stats, ) diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index d647fe1b69ea..9b4aaa8d0330 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -127,13 +127,12 @@ def set_model_parallel_attributes(model, parallelism): # Right now mcore sub-classes ModelParellelConfig, we should remove that # Given Lightning's structure it would be better if parallelism is a different object # Since then it can be passed to the Strategy - - from megatron.core.model_parallel_config import ModelParallelConfig + # Note: Importing nemo.lightning.pytorch.strategies creates an import cycle. from megatron.core.transformer.transformer_config import TransformerConfig - assert isinstance( - parallelism, ModelParallelConfig - ), f"Expected parallelism config to be of type ModelParallelConfig, but got {type(parallelism)}" + assert ( + type(parallelism).__name__ == 'ParallelismConfig' + ), f"Expected parallelism config to be of type ParallelismConfig, but got {type(parallelism)}" has_mcore_config = isinstance(getattr(model, "config", None), TransformerConfig) if has_mcore_config and hasattr(model, "configure_model"): config: TransformerConfig = model.config @@ -141,6 +140,8 @@ def set_model_parallel_attributes(model, parallelism): if not hasattr(config, attr_name): continue setattr(config, attr_name, getattr(parallelism, attr_name)) + if hasattr(config, "__io__"): + setattr(config.__io__, attr_name, getattr(parallelism, attr_name)) return config @@ -524,3 +525,33 @@ def load_model_state_dict(megatron_parallel, checkpoint: Mapping[str, Any], stri _state_dict[key] = value module.load_state_dict(_state_dict, strict=strict) + + +def _sync_from_last_pipeline_stage(value: torch.Tensor, broadcast: bool = False): + """ + When pipeline parallelism is enabled, casts a tensor defined on the last pipeline stage to other ranks. + + Args: + value (torch.Tensor): A tensor to be casted from the final pipeline stage of a pipeline parallelism group (e.g. loss). + Note that this tensor should already be defined on the target rank(s) to fill with received data. + broadcast (bool): When True, broadcasts value from the final pipeline stage rank to all ranks in its group. + When False, only rank zero receives value from the final pipeline stage rank in its group. + This mode exists to avoid slow one-to-many communication when not necessary. Defaults to False. 
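As a rough usage sketch of the helper documented above (an illustration, not code from this change): every rank allocates the destination tensor first, and the value produced on the last pipeline stage is then propagated, here to all ranks in the group. It assumes torch.distributed and Megatron's parallel state are already initialized.

import torch

# every rank pre-allocates the tensor to be filled
reduced_loss = torch.tensor(0.0, device=torch.cuda.current_device())
# broadcast=True sends the last-stage value to every rank in the pipeline group
_sync_from_last_pipeline_stage(reduced_loss, broadcast=True)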
+ """ + from megatron.core import parallel_state + + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + src_rank = parallel_state.get_pipeline_model_parallel_last_rank() + + if not broadcast: + pp_ranks = torch.distributed.get_process_group_ranks(parallel_state.get_pipeline_model_parallel_group()) + if torch.distributed.get_rank() == src_rank and 0 in pp_ranks: + torch.distributed.send(value, 0) + elif torch.distributed.get_rank() == 0: + torch.distributed.recv(value, src_rank) + else: + torch.distributed.broadcast( + value, + src_rank, + group=parallel_state.get_pipeline_model_parallel_group(), + ) diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index a07f504f1009..59acdec6f8e2 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -47,36 +47,73 @@ def setup_microbatch_calculator( from nemo.lightning._strategy_lib import NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE from nemo.utils import AppState + try: + from megatron.core.num_microbatches_calculator import ( + ConstantNumMicroBatchesCalculator, + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = True + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator + from apex.transformer.pipeline_parallel.utils import ( + get_current_global_batch_size, + get_micro_batch_size, + get_num_microbatches, + ) + from apex.transformer.pipeline_parallel.utils import ( + setup_microbatch_calculator as init_num_microbatches_calculator, + ) + + MCORE_MB_CALCULATOR = False + app_state = AppState() if os.environ.get(NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, "false").lower() == "true": init_global_rank = app_state.global_rank else: init_global_rank = global_rank - - from megatron.core.num_microbatches_calculator import ( - _GLOBAL_NUM_MICROBATCHES_CALCULATOR, - ConstantNumMicroBatchesCalculator, - init_num_microbatches_calculator, - ) - - if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: - init_num_microbatches_calculator( - rank=init_global_rank, - global_batch_size=global_batch_size, - micro_batch_size=micro_batch_size, - data_parallel_size=app_state.data_parallel_size, - rampup_batch_size=rampup_batch_size, - ) + if MCORE_MB_CALCULATOR: + from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=init_global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // (micro_batch_size * app_state.data_parallel_size) + else: + raise Exception("Microbatch calculator already initialized.") else: - if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): - assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_global_batch_size == global_batch_size - assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.micro_batch_size == micro_batch_size - assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.num_micro_batches == global_batch_size // ( 
- micro_batch_size * app_state.data_parallel_size + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + init_num_microbatches_calculator( + rank=init_global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, ) else: - raise Exception("Microbatch calculator already initialized.") + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator): + assert get_current_global_batch_size() == global_batch_size + assert get_micro_batch_size() == micro_batch_size + assert get_num_microbatches() == global_batch_size // (micro_batch_size * app_state.data_parallel_size) + else: + raise Exception("Microbatch calculator already initialized.") def add_megatron_sampler( @@ -85,14 +122,45 @@ def add_megatron_sampler( global_batch_size: int, rampup_batch_size: Optional[List[int]] = None, consumed_samples: int = 0, - dataloader_type: Literal["single", "cyclic"] = "single", + dataloader_type: Literal["single", "cyclic", "batch"] = "single", drop_last: bool = True, pad_samples_to_global_batch_size: bool = False, # data_sharding: bool = False ) -> DataLoader: + """ + This function takes an existing PyTorch `DataLoader` and configures it to use a Megatron sampler. + The Megatron sampler is responsible for splitting the data into batches + during training with Megatron. + + Args: + dataloader (DataLoader): The original PyTorch DataLoader to wrap. + micro_batch_size (int): The size of each micro-batch. + global_batch_size (int): The effective size of the training batch across all data parallel devices. + rampup_batch_size (Optional[List[int]]): A list of target batch sizes for a gradual + rampup schedule during training (optional). + consumed_samples (int, optional): The number of samples consumed before + starting this iteration (defaults to 0). + dataloader_type (Literal["single", "cyclic", "batch"], optional): The type of + Megatron sampler to use. Valid options are: + - "single": Uses `MegatronPretrainingSampler` for single pass data sampling. + - "cyclic": Uses `MegatronPretrainingRandomSampler` for cyclic data sampling. + - "batch": Uses `MegatronPretrainingBatchSampler` for batch sampling. This is the option to + use for fine-tuning workloads, where sequence lengths are variable between samples. + Sampling the entire global batch together ensures that sequences in a global batch are + padded to the same lengths. + Defaults to "single". + drop_last (bool, optional): Whether to drop the last incomplete batch + (defaults to True). + pad_samples_to_global_batch_size (bool, optional): Whether to pad the last incomplete + batch to the `global_batch_size` (defaults to False, only applies when + `drop_last` is False). + + Returns: + DataLoader: A new DataLoader instance with the configured Megatron sampler. 
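As a hedged usage sketch for the "batch" option described above (the dataset name and batch sizes are illustrative, and Megatron's parallel state is assumed to be initialized):

from torch.utils.data import DataLoader

base_loader = DataLoader(finetuning_dataset)  # `finetuning_dataset` is a hypothetical map-style dataset
loader = add_megatron_sampler(
    base_loader,
    micro_batch_size=1,
    global_batch_size=8,
    dataloader_type="batch",  # sample the whole global batch together so padding stays consistent
)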
+ """ + from megatron.core import parallel_state - ## TODO: expose drop_last and pad_samples_to_global_batch_size args if dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataloader.dataset), @@ -115,6 +183,21 @@ def add_megatron_sampler( drop_last=drop_last, # data_sharding=data_sharding ) + elif dataloader_type == 'batch': + from nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers import ( + MegatronPretrainingBatchSampler, + ) + + batch_sampler = MegatronPretrainingBatchSampler( + total_samples=len(dataloader.dataset), + consumed_samples=consumed_samples, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + data_parallel_rank=parallel_state.get_data_parallel_rank(), + data_parallel_size=parallel_state.get_data_parallel_world_size(), + drop_last=drop_last, + pad_samples_to_global_batch_size=not drop_last, + ) else: raise Exception(f'{dataloader_type} dataloader type is not supported.') @@ -153,8 +236,6 @@ def __init__( # Sanity checks. if total_samples <= 0: raise RuntimeError(f"no sample to consume: {total_samples}") - if consumed_samples >= total_samples: - raise RuntimeError(f"no samples left to consume: {consumed_samples}, {total_samples}") if micro_batch_size <= 0: raise RuntimeError(f"micro_batch_size size must be greater than 0, but {micro_batch_size}") if data_parallel_size <= 0: @@ -209,6 +290,32 @@ def __iter__(self): ... class MegatronPretrainingSampler(BaseMegatronSampler): + def __init__( + self, + total_samples: int, + consumed_samples: int, + micro_batch_size: int, + data_parallel_rank: int, + data_parallel_size: int, + drop_last: bool = True, + global_batch_size: Optional[int] = None, + rampup_batch_size: Optional[list] = None, + pad_samples_to_global_batch_size: Optional[bool] = False, + ): + super().__init__( + total_samples=total_samples, + consumed_samples=consumed_samples, + micro_batch_size=micro_batch_size, + data_parallel_rank=data_parallel_rank, + data_parallel_size=data_parallel_size, + drop_last=drop_last, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + pad_samples_to_global_batch_size=pad_samples_to_global_batch_size, + ) + if consumed_samples >= total_samples: + raise RuntimeError(f"no samples left to consume: {consumed_samples}, {total_samples}") + def get_start_end_idx(self): start_idx = self.data_parallel_rank * self.micro_batch_size end_idx = start_idx + self.micro_batch_size diff --git a/nemo/lightning/fabric/plugins.py b/nemo/lightning/fabric/plugins.py index 79e1455cb33f..dba103abf2a4 100644 --- a/nemo/lightning/fabric/plugins.py +++ b/nemo/lightning/fabric/plugins.py @@ -124,6 +124,4 @@ def forward_context(self) -> Generator[None, None, None]: def _convert_megatron_mixed_precision(plugin: MegatronMixedPrecision) -> FabricMegatronMixedPrecision: return FabricMegatronMixedPrecision( precision=plugin.precision, - device=plugin.device, - scaler=plugin.scaler, ) diff --git a/nemo/lightning/fabric/strategies.py b/nemo/lightning/fabric/strategies.py index a662386a9119..a183c434dc52 100644 --- a/nemo/lightning/fabric/strategies.py +++ b/nemo/lightning/fabric/strategies.py @@ -23,7 +23,6 @@ from lightning_fabric.plugins.precision import Precision from lightning_fabric.strategies import DDPStrategy from lightning_fabric.strategies.strategy import _validate_keys_for_strict_loading -from lightning_fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 from lightning_fabric.utilities.types import _PATH, _Stateful from 
megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.loops.fetchers import _DataFetcher @@ -208,7 +207,7 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManag precision_init_ctx = self.precision.module_init_context() module_sharded_ctx = self.megatron_context() stack = ExitStack() - if _TORCH_GREATER_EQUAL_2_1 and empty_init: + if empty_init: # Materialization happens in `setup`. When modules get wrapped by FSDP, the sequence of operations is: # 1) materialize module 2) call `reset_parameters()` 3) shard the module. # These operations are applied to each submodule 'bottom up' in the module hierarchy. @@ -327,9 +326,9 @@ def checkpoint_io(self) -> CheckpointIO: @property def parallelism(self): - from megatron.core.model_parallel_config import ModelParallelConfig + from nemo.lightning.pytorch.strategies import ParallelismConfig - return ModelParallelConfig( + return ParallelismConfig( tensor_model_parallel_size=self.tensor_model_parallel_size, pipeline_model_parallel_size=self.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=self.virtual_pipeline_model_parallel_size, diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index cc594b562cff..4d31f020c44a 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -1,11 +1,13 @@ +import json from pathlib import Path +from pydoc import locate from typing import Any, Callable, Optional, Type, TypeVar import fiddle as fdl import pytorch_lightning as pl from fiddle._src.experimental import serialization -from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector +from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector, track_io from nemo.lightning.io.pl import TrainerContext CkptType = TypeVar("CkptType") @@ -41,6 +43,14 @@ def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: if not _path.is_file(): raise FileNotFoundError(f"No such file: '{_path}'") + ## add IO functionality to custom objects present in the json file + with open(_path) as f: + j = json.load(f) + for obj, val in j["objects"].items(): + clss = ".".join([val["type"]["module"], val["type"]["name"]]) + if not serialization.find_node_traverser(locate(clss)): + track_io(locate(clss)) + with open(_path, "rb") as f: config = serialization.load_json(f.read()) diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index 4025634ebe28..9119b2474b17 100644 --- a/nemo/lightning/io/artifact/base.py +++ b/nemo/lightning/io/artifact/base.py @@ -6,8 +6,9 @@ class Artifact(ABC, Generic[ValueT]): - def __init__(self, attr: str): + def __init__(self, attr: str, required: bool = True): self.attr = attr + self.required = required @abstractmethod def dump(self, value: ValueT, path: Path) -> ValueT: diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 63614d934285..69368599682e 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -2,7 +2,7 @@ import logging import os import shutil -from pathlib import Path, PosixPath, WindowsPath +from pathlib import Path, PosixPath, PurePath, WindowsPath from typing import Generic, Optional, Tuple, TypeVar import pytorch_lightning as pl @@ -91,6 +91,14 @@ def __call__(self, output_path: Optional[Path] = None, overwrite: bool = False) logging.error(f"An error occurred: {e}") raise + finally: + # Delete the lock file if it exists + if lock_path.exists(): + try: + os.remove(lock_path) + except OSError as e: + logging.warning(f"Failed 
to remove lock file {lock_path}: {e}") + return _output_path def local_path(self, base_path: Optional[Path] = None) -> Path: @@ -152,19 +160,26 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = return _trainer - def nemo_save(self, output_path: Path, trainer: pl.Trainer) -> None: + def nemo_save(self, output_path: Path, trainer: pl.Trainer, dump_io: bool = True) -> None: """ Saves the model's state to the specified path using the trainer's current strategy. Args: output_path (Path): The path where the model checkpoint will be saved. trainer (pl.Trainer): The trainer with the strategy to save the model. + dump_io (bool): If True, the IO configuration will be saved to the output path. """ trainer.strategy._setup_optimizers = False trainer.strategy._init_model_parallel = False trainer.strategy.setup(trainer) trainer.save_checkpoint(output_path) + from nemo.lightning.io.pl import TrainerContext + from nemo.utils.get_rank import is_global_rank_zero + + if is_global_rank_zero() and dump_io: + TrainerContext.from_trainer(trainer).io_dump(output_path) + def nemo_load( self, path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True ) -> Tuple[pl.LightningModule, pl.Trainer]: @@ -212,6 +227,10 @@ def local_path(self, base_path: Optional[Path] = None) -> Path: _base = Path(NEMO_MODELS_CACHE) + # If the useu supplied `hf:///path/to/downloaded/my-model/` + # then extract the last dir-name (i.e. my-model) and append it to _base + if str(self).startswith('/'): + return _base / PurePath((str(self))).name return _base / str(self).replace("://", "/") def on_import_ckpt(self, model: pl.LightningModule): diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index dfc78c30a929..d0d4d0243ff7 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -293,9 +293,12 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa """ connector = self._get_connector(path) ckpt_path: Path = connector.local_path(base_path=base_path) - ckpt_path = connector(ckpt_path, overwrite=overwrite) + # If already in multiproc environment (e.g. due to torchrun invocation) run only on RANK = 0 + from nemo.utils.get_rank import is_global_rank_zero - connector.on_import_ckpt(self) + if is_global_rank_zero(): + ckpt_path = connector(ckpt_path, overwrite=overwrite) + connector.on_import_ckpt(self) return ckpt_path @@ -429,13 +432,24 @@ def _io_init(self, **kwargs) -> fdl.Config[Self]: ------- fdl.Config[Self]: The initialized configuration object. """ - return fdl.Config(type(self), **kwargs) + try: + return fdl.Config(type(self), **kwargs) + except Exception as e: + error_msg = ( + f"Error creating fdl.Config for {type(self).__name__}: {str(e)}\n" + f"Arguments that caused the error: {kwargs}\n" + f"This may be due to unsupported argument types or nested configurations." 
+ ) + raise RuntimeError(error_msg) from e def _io_wrap_init(cls): """Wraps the __init__ method of a class to add IO functionality.""" original_init = cls.__init__ + if getattr(cls, "__wrapped_init__", False): + return cls + @functools.wraps(original_init) def wrapped_init(self, *args, **kwargs): if hasattr(self, "io_transform_args"): @@ -450,6 +464,7 @@ def wrapped_init(self, *args, **kwargs): original_init(self, *args, **kwargs) cls.__init__ = wrapped_init + cls.__wrapped_init__ = True return cls @@ -499,6 +514,10 @@ def _io_path_elements_fn(x): def _artifact_transform(cfg: fdl.Config, output_path: Path): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): current_val = getattr(cfg, artifact.attr) + if current_val is None: + if artifact.required: + raise ValueError(f"Artifact '{artifact.attr}' is required but not provided") + continue new_val = artifact.dump(current_val, output_path) setattr(cfg, artifact.attr, new_val) diff --git a/nemo/lightning/io/state.py b/nemo/lightning/io/state.py index b69fed9d0f4f..9fd81a960358 100644 --- a/nemo/lightning/io/state.py +++ b/nemo/lightning/io/state.py @@ -4,6 +4,7 @@ from typing import Any, Callable, Dict, Generic, List, Optional, Tuple, TypeVar, Union, overload import numpy as np +import torch from torch import nn SourceModuleT = TypeVar("SourceModuleT", bound=nn.Module) @@ -19,11 +20,12 @@ class TransformCTX: target_state: dict +@torch.no_grad def apply_transforms( source: nn.Module, target: TargetModuleT, mapping: Dict[str, str], - transforms: Optional[List[Callable[[TransformCTX], TransformCTX]]] = None, + transforms: Optional[List[Callable[[TransformCTX], TransformCTX]]] = [], ) -> TargetModuleT: """ Applies a series of transformations to adapt the state dictionary of a source module to @@ -101,9 +103,8 @@ def scale_weights(ctx): for key, val in mapping.items(): ctx = StateDictTransform(key, val)(ctx) - if transforms: - for transform in transforms: - ctx = transform(ctx) + for transform in transforms: + ctx = transform(ctx) _params: Dict[str, nn.Parameter] = {} for name, param in _target.named_parameters(): @@ -144,9 +145,9 @@ def scale_weights(ctx): _module.register_buffer(_key, val) - keys = [name for name in list(target_state.keys()) if not name.endswith("_extra_state")] + keys = list(filter(lambda x: x is not None and not x.endswith("_extra_state"), target_state.keys())) if len(keys) != 0: - raise RuntimeError(f"Additional keys: {target_state.keys()} in checkpoint but not in model.") + raise RuntimeError(f"Additional keys: {keys} in checkpoint but not in model.") # TODO: Is this correct? 
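The mapping keys consumed by StateDictTransform above use "*" wildcards. The small standalone example below (illustrative key names, not code from this change) shows how such a pattern behaves once each "*" becomes a regex capture group, which is the substitution _match_keys performs below.

import re

pattern = "decoder.layers.*.self_attention.*.weight"
regex = re.compile("^" + pattern.replace("*", "(.*)") + "$")
match = regex.match("decoder.layers.3.self_attention.linear_qkv.weight")
print(match.groups())  # ('3', 'linear_qkv')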
# for key in target.state_dict(): @@ -165,7 +166,7 @@ def scale_weights(ctx): def _default_transform(inp): - return inp.float() + return inp class StateDictTransform(Generic[F]): @@ -324,7 +325,7 @@ def _match_keys(keys: List[str], pattern: str) -> np.ndarray: regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$") wildcard_matches = [[] for _ in range(pattern.count("*"))] - for key in keys: + for key in filter(lambda x: x is not None, keys): match = regex_pattern.match(key) if match: for i, group in enumerate(match.groups()): @@ -342,7 +343,7 @@ def _match_keys(keys: List[str], pattern: str) -> np.ndarray: output_array = np.empty(shape, dtype=object) # Populate the array with the keys, now that we have the correct shape and ordering - for key in keys: + for key in filter(lambda x: x is not None, keys): match = regex_pattern.match(key) if match: # Convert match groups to indices based on their position in wildcard_matches diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 74db25af64bf..56146498b539 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -116,6 +116,12 @@ class MegatronParallel(nn.ModuleList, Generic[ModelT]): forward pass of a model. loss_reduction (Optional[Callable[[nn.Module], MegatronLossReduction]]): An optional function that defines how the loss is reduced. + vp_size (Optional[int]): Virtual pipeline parallel size. + ddp_config (Optional[DistributedDataParallelConfig]): An instance of Megatron core's + DistributedDataParallelConfig which controls the Megatron DDP configuration. + cpu (bool): Whether model should reside on CPU. + convert_module_fn (Optional[Callable[[ModelT], nn.Module]]): An optional function to + apply to the model parameters after initialization. Examples -------- @@ -224,6 +230,20 @@ def forward( _num_microbatches: int = num_microbatches or self.infer_num_microbatches(data) pipeline = self.pipeline + + use_global_batch_sampler = self.trainer.datamodule.data_sampler.dataloader_type == 'batch' + if use_global_batch_sampler: + from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split + + # The current way of using a batch sampler + split to micro iterator results in + # extraneous padding, and is only implemented to ensure bit-exactness with NeMo 1. + # This part in NeMo 1 was written when megatron fwd_bwd_function did not support unequal + # sequence lengths, but it does now. Hence this part should be revisited in the future. 
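As a rough standalone illustration of the comment above (an assumption about the data layout, not NeMo's get_iterator_k_split itself), splitting one global batch into k micro-batches for a dict of tensors with a leading batch dimension can look like this:

import torch

def split_into_microbatches(batch: dict, k: int):
    # chunk every tensor along the batch dimension and re-zip into k smaller dicts
    chunked = {name: torch.chunk(tensor, k, dim=0) for name, tensor in batch.items()}
    return [{name: chunked[name][i] for name in batch} for i in range(k)]

global_batch = {"tokens": torch.zeros(8, 128, dtype=torch.long), "loss_mask": torch.ones(8, 128)}
micro_batches = split_into_microbatches(global_batch, k=4)  # 4 micro-batches of 2 samples each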
+ batch = next(data) + if isinstance(batch, tuple) and len(batch) == 3: + batch = batch[0] + data = get_iterator_k_split(batch, _num_microbatches, True) + data_iterator: List[Iterator[DataT]] = self.to_data_iterator_list(data) context = self._build_context({**locals()}) @@ -266,17 +286,11 @@ def forward( self.callbacks.event("on_megatron_reduce_microbatches_end", **context) else: # we're not on the last pipeline stage so no losses - if forward_only: - loss_mean = cast(torch.Tensor, []) - else: - loss_mean = torch.tensor(0.0, device=torch.cuda.current_device()) + loss_mean = torch.tensor(0.0, device=torch.cuda.current_device()) self.callbacks.event("on_megatron_log_step_end", **context) self.callbacks.event("on_megatron_step_end", **context) - if loss_mean == []: - loss_mean = None - return loss_mean def wrapped_forward_step( @@ -628,6 +642,11 @@ def __getattr__(self, item: Any) -> Any: class _ModuleStepFunction: + """ + This class acts as a bridge between Megatron core's lower-level functional API and PTL's object-oriented API, + making it possible to use PTL-compatible functions in Megatron core. + """ + def __init__(self, name: str, is_property: bool = False, includes_self: bool = False): self.name = name self.is_property = is_property @@ -656,7 +675,9 @@ def wrapped(self, *args): def getattr_proxy(self, item: Any) -> Any: try: return super(self.__class__, self).__getattr__(item) - except AttributeError: + except AttributeError as e: + if item == 'module': ## this is a hacky WAR and may cause misleading error messages + raise e try: return getattr(self.module, item) except AttributeError: diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 5ed783fdbefe..5ba2c39f9cff 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -7,7 +7,6 @@ import lightning_fabric as fl import pytorch_lightning as pl -from fiddle._src.experimental import serialization from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint as PTLModelCheckpoint from pytorch_lightning.loggers import Logger, TensorBoardLogger, WandbLogger @@ -30,7 +29,12 @@ class NeMoLogger(IOMixin): log_local_rank_0_only (bool): Log only on local rank 0. log_global_rank_0_only (bool): Log only on global rank 0. files_to_copy (Optional[List[str]]): List of files to copy to log directory. - update_logger_directory (bool): Whether to update logger directory. + update_logger_directory (bool): Whether to update logger directory to write to `exp_dir`. + If True, the `save_dir` passed to the logger will be treated as a relative path and + the logger will be reconfigured to write to `exp_dir / save_dir`. This ensures that + all output from an experiment is written to a common directory. If False, the logger's + save_dir will not be overwritten. This argument applies only to TensorBoardLogger and + WandbLogger instances. ckpt (Optional[ModelCheckpoint]): Model checkpoint callback. 
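A hedged configuration sketch for the update_logger_directory behaviour described above, assuming the constructor mirrors the attributes referenced in this class (dir, name, update_logger_directory, tensorboard); the paths and names are illustrative:

from pytorch_lightning.loggers import TensorBoardLogger

nemo_logger = NeMoLogger(
    dir="/results",                                     # experiments root (illustrative path)
    name="my_run",
    update_logger_directory=True,                       # rewrite the logger's save_dir under dir/name/version
    tensorboard=TensorBoardLogger(save_dir="tb_logs"),  # treated as relative to the experiment directory
)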
""" @@ -73,30 +77,46 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = logging.rank = self.global_rank if self.explicit_log_dir and isinstance(trainer, pl.Trainer): # If explicit log_dir was passed, short circuit - return check_explicit_log_dir(trainer, self.explicit_log_dir, self.dir, self.name, self.version) - - # Default dir to ./nemo_experiments if None was passed - _dir = self.dir - if self.dir is None: - _dir = str(Path.cwd() / 'nemo_experiments') - - if not self.name: - self.name = "default" - - version = self.version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None) - if is_global_rank_zero(): - if self.use_datetime_version: - version = time.strftime('%Y-%m-%d_%H-%M-%S') - if resume_if_exists: - logging.warning( - "No version folders would be created under the log folder as 'resume_if_exists' is enabled." - ) - version = None - if version: - if is_global_rank_zero(): - os.environ[NEMO_ENV_VARNAME_VERSION] = version + if trainer.logger is not None and not self.update_logger_directory: + logging.warning( + f"nemo logger received explicit_log_dir: {self.explicit_log_dir} and the pytorch lightning trainer " + f"that was passed to nemo_logger container a logger, but update_logger_directory is False. This means " + f"that the trainer's logger directory may not match with the explicit_log_dir." + ) + if self.dir or self.version: + logging.error( + f"nemo logger received explicit_log_dir: {self.explicit_log_dir} and at least one of dir: {self.dir}, " + f"or version: {self.version}. Please note that dir, name, and version will be ignored." + ) + if is_global_rank_zero() and Path(self.explicit_log_dir).exists(): + logging.warning(f"NeMoLogger is logging to {self.explicit_log_dir}, but it already exists.") + log_dir, _dir, self.name, version = Path(self.explicit_log_dir), str(self.explicit_log_dir), "", "" + + else: + # Default dir to ./nemo_experiments if None was passed + _dir = self.dir + if self.dir is None: + _dir = str(Path.cwd() / 'nemo_experiments') + + if not self.name: + self.name = "default" + + version = self.version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None) + if not version: + if resume_if_exists: + logging.warning( + "No version folders would be created under the log folder as 'resume_if_exists' is enabled." + ) + version = None + elif is_global_rank_zero(): + if self.use_datetime_version: + version = time.strftime('%Y-%m-%d_%H-%M-%S') + if version: + if is_global_rank_zero(): + os.environ[NEMO_ENV_VARNAME_VERSION] = version + + log_dir = Path(_dir) / Path(str(self.name)) / Path("" if version is None else str(version)) - log_dir = Path(_dir) / Path(str(self.name)) / Path("" if version is None else str(version)) # update app_state with log_dir, exp_dir, etc app_state = AppState() app_state.log_dir = log_dir @@ -124,25 +144,29 @@ def _setup_trainer_loggers(self, trainer, dir, version): loggers = [self.tensorboard, self.wandb, *self.extra_loggers] loggers = [logger for logger in loggers if logger is not None] - if self.update_logger_directory and self.wandb: - self.wandb._save_dir = dir - self.wandb._wandb_init["dir"] = dir - self.wandb._wandb_init["name"] = self.name - self.wandb._name = self.name - if loggers: if trainer.logger is not None and not self.tensorboard: loggers = [trainer.logger] + loggers trainer._logger_connector.configure_logger(loggers) - if trainer.logger is not None: - trainer.logger._version = version or "" - if self.update_logger_directory: - logging.warning( - f'"update_logger_directory" is True. 
Overwriting logger "save_dir" to {dir} and "name" to {self.name}' - ) - trainer.logger._root_dir = dir - trainer.logger._name = self.name + if self.update_logger_directory: + for logger in trainer.loggers: + if isinstance(logger, TensorBoardLogger): + logger._version = version or "" + logger._root_dir = Path(dir) / logger.save_dir + trainer.logger._name = self.name + logging.warning( + f'"update_logger_directory" is True. Overwriting tensorboard logger "save_dir" to {logger._root_dir}' + ) + elif isinstance(logger, WandbLogger): + logger._id = version or "" + logger._save_dir = Path(dir) / logger.save_dir + logger._wandb_init["dir"] = Path(dir) / logger.save_dir + logger._wandb_init["name"] = self.name + logger._name = self.name + logging.warning( + f'"update_logger_directory" is True. Overwriting wandb logger "save_dir" to {logger._save_dir}' + ) def _setup_trainer_model_checkpoint(self, trainer, log_dir, ckpt=None): if ckpt: @@ -187,10 +211,15 @@ def _setup_trainer_model_checkpoint(self, trainer, log_dir, ckpt=None): ModelCheckpoint.CHECKPOINT_NAME_LAST = callback.filename + '-last' def _handle_task_config(self, task_config, log_dir): - task_config.save_config_img(log_dir / "task.png") - task_json = serialization.dump_json(task_config) - with open(log_dir / "task.json", "w") as f: - f.write(task_json) + try: + from fiddle._src.experimental import serialization + + task_config.save_config_img(log_dir / "task.png") + task_json = serialization.dump_json(task_config) + with open(log_dir / "task.json", "w") as f: + f.write(task_json) + except Exception as e: + logging.warning(f'Saving task config failed: {e}. Skipping saving') def _setup_file_logging(self, log_dir): """Set up file logging based on rank settings.""" diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index ee0e777d739e..00637c9d57d4 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -3,7 +3,16 @@ from nemo.lightning.pytorch.callbacks.nsys import NsysCallback from nemo.lightning.pytorch.callbacks.peft import PEFT from nemo.lightning.pytorch.callbacks.preemption import PreemptionCallback -from nemo.lightning.pytorch.callbacks.progress import MegatronProgressBar +from nemo.lightning.pytorch.callbacks.progress_bar import MegatronProgressBar +from nemo.lightning.pytorch.callbacks.progress_printer import ProgressPrinter -__all__ = ["ModelCheckpoint", "ModelTransform", "PEFT", "NsysCallback", "MegatronProgressBar", "PreemptionCallback"] +__all__ = [ + "ModelCheckpoint", + "ModelTransform", + "PEFT", + "NsysCallback", + "MegatronProgressBar", + "ProgressPrinter", + "PreemptionCallback", +] diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index ed8ac25185f3..db48ded0d10d 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -26,14 +26,33 @@ from pytorch_lightning.callbacks.model_checkpoint import _is_local_file_protocol from pytorch_lightning.utilities import rank_zero_info -from nemo.lightning.io.mixin import IOMixin from nemo.lightning.io.pl import TrainerContext from nemo.utils import logging from nemo.utils.app_state import AppState from nemo.utils.model_utils import ckpt_to_dir -class ModelCheckpoint(PTLModelCheckpoint, IOMixin): +class ModelCheckpoint(PTLModelCheckpoint): + """Light wrapper around Lightning's ModelCheckpoint to force a saved checkpoint on train_end. 
+ Adds support for asyncronous checkpointing and provides some additional logic to clean up invalid checkpoints + Args: + monitor: Metric to monitor when saving top-k checkpoints. + verbose: Verbosity mode. + save_last: When ``True``, saves a `*-last` copy whenever a checkpoint file gets saved. + save_top_k: When ``True``, saves the top-k checkpoints according to ``monitor``. + save_weights_only: if ``True``, then only the model's weights will be saved. + mode: One of {min, max}. Whether the objective is to minimize or maximize the monitored quantity. + every_n_epochs: Number of epochs between checkpoints. + every_n_train_steps: Number of train steps between checkpoints. + train_time_interval: After each interval, monitor checkpoints. Not to be used with + ``every_n_epochs`` or ``every_n_train_steps``. + save_best_model: When ``True``, reloads and saves the best checkpoint. + save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch + enable_nemo_ckpt_io: Whether to dump the current model model state, including the + config file, to allow for reproducibility of experiments. + async_save: Whether to enable asynchronous checkpointing. + try_restore_best_ckpt: Whether to restore the best model path. + """ UNFINISHED_CHECKPOINT_SUFFIX = "-unfinished" @@ -51,14 +70,12 @@ def __init__( save_best_model: bool = False, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation enable_nemo_ckpt_io: bool = True, - async_save: bool = False, try_restore_best_ckpt: bool = True, **kwargs, ): self.save_best_model = save_best_model self.previous_best_path = "" self.enable_nemo_ckpt_io = enable_nemo_ckpt_io - self.async_save = async_save # Checkpoints which removal is deferred until async save is done. # Each element of `deferred_ckpts_to_remove` is a growing list # that `self._remove_checkpoint` adds to. 
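A hedged usage sketch of the wrapper documented above; the monitored metric matches the reduced_train_loss key referenced later in this file, and the cadence values are illustrative:

checkpoint_callback = ModelCheckpoint(
    monitor="reduced_train_loss",
    save_top_k=2,
    every_n_train_steps=100,
    save_on_train_epoch_end=False,
    enable_nemo_ckpt_io=True,   # also dump the TrainerContext config next to the saved weights
)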
Once `self._save_checkpoint` @@ -166,7 +183,7 @@ def nemo_topk_check_previous_run(self): if index != len(self.monitor): match = re.search('[A-z]', checkpoint[index:]) if match: - value = checkpoint[index : index + match.start() - 1] # -1 due to separator hypen + value = checkpoint[index : index + match.start() - 1] # -1 due to separator hyphen self.best_k_models[checkpoint] = float(value) if len(self.best_k_models) < 1: return # No saved checkpoints yet @@ -221,7 +238,7 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None: super().load_state_dict(state_dict) self._remove_invalid_entries_from_topk() - def setup(self, *args, **kwargs) -> None: + def setup(self, trainer, *args, **kwargs) -> None: from nemo.utils.get_rank import is_global_rank_zero if is_global_rank_zero(): @@ -230,7 +247,9 @@ def setup(self, *args, **kwargs) -> None: # Ensure that all ranks continue with unfinished checkpoints removed if torch.distributed.is_initialized(): torch.distributed.barrier() - super().setup(*args, **kwargs) + + self.async_save = getattr(trainer.strategy, "async_save", False) + super().setup(trainer, *args, **kwargs) def on_save_checkpoint(self, trainer, pl_module, checkpoint): output = super().on_save_checkpoint(trainer, pl_module, checkpoint) @@ -374,12 +393,29 @@ def file_exists(self, filepath: str, trainer: "pytorch_lightning.Trainer", check exists = self._fs.exists(filepath) or (check_dist_ckpt and self._fs.exists(ckpt_to_dir(filepath))) return trainer.strategy.broadcast(exists) + def _monitor_candidates(self, trainer: "pl.Trainer") -> Dict[str, torch.Tensor]: + """Broadcast loss from last pipeline stage.""" + monitor_candidates = super()._monitor_candidates(trainer) + + from nemo.lightning._strategy_lib import _sync_from_last_pipeline_stage + + keys = re.findall(r"[\{](.*?)[:\}]", self.filename) + for loss_name in ['reduced_train_loss']: + if loss_name in keys or loss_name == self.monitor: + if loss_name not in monitor_candidates: + monitor_candidates[loss_name] = torch.tensor(0.0, device=torch.cuda.current_device()) + _sync_from_last_pipeline_stage(monitor_candidates[loss_name], broadcast=True) + + return monitor_candidates + def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None: # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed. # if anything goes wrong during checkpointing, we should be able to detect that data is incomplete. 
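The filename-template parsing used by _monitor_candidates above can be checked in isolation; the template string here is illustrative, while the regular expression is the one from the code:

import re

filename = "megatron--{reduced_train_loss:.2f}-{step}"
print(re.findall(r"[\{](.*?)[:\}]", filename))  # ['reduced_train_loss', 'step']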
self.set_checkpoint_unfinished_marker(filepath, barrier_after=True) ema_callback = self._ema_callback(trainer) + self._last_global_step_saved = trainer.global_step + if ema_callback is not None: if self.async_save: raise ValueError('async_save with EMA not supported') @@ -410,6 +446,12 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) else: storage_options = None trainer.save_checkpoint(filepath, self.save_weights_only, storage_options=storage_options) + + ## NOTE: saving context happens synchronously always + from nemo.utils.get_rank import is_global_rank_zero + + if self.enable_nemo_ckpt_io and is_global_rank_zero(): + TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath)) if self.async_save: logging.info(f'Scheduled async checkpoint save for {filepath}') else: @@ -422,14 +464,8 @@ def _get_finalize_save_checkpoint_callback( def _cb(): logging.debug(f'Finalize callback called for step {global_step}, filepath {filepath}') - self._last_global_step_saved = global_step self._last_checkpoint_saved = filepath - from nemo.utils.get_rank import is_global_rank_zero - - if self.enable_nemo_ckpt_io and is_global_rank_zero(): - TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath)) - # notify loggers if trainer.is_global_zero: for logger in trainer.loggers: diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py index 7949f9efd28e..8a07566f92c3 100644 --- a/nemo/lightning/pytorch/callbacks/model_transform.py +++ b/nemo/lightning/pytorch/callbacks/model_transform.py @@ -4,11 +4,10 @@ import pytorch_lightning as pl from torch import nn -from nemo.lightning.io.mixin import IOMixin from nemo.utils import logging -class ModelTransform(pl.Callback, IOMixin): +class ModelTransform(pl.Callback): """ A PyTorch Lightning callback that applies a model transformation function at the start of fitting or validation. @@ -72,6 +71,11 @@ def _maybe_apply_transform(self, trainer): def apply_transform(self, trainer): self.model_transform(trainer.model) + from pytorch_lightning.utilities import model_summary + + logging.info( + f"After applying model_transform:\n" f"{model_summary.summarize(trainer.lightning_module, max_depth=1)}" + ) @property def _needs_to_call(self) -> bool: diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py index d24d7fd974be..70ebf943b333 100644 --- a/nemo/lightning/pytorch/callbacks/nsys.py +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -3,12 +3,11 @@ import torch from pytorch_lightning.callbacks.callback import Callback -from nemo.lightning.io.mixin import IOMixin from nemo.utils import logging from nemo.utils.get_rank import get_rank -class NsysCallback(Callback, IOMixin): +class NsysCallback(Callback): """ A PyTorch Lightning callback for NVIDIA Nsight Systems (Nsys) profiling. @@ -36,15 +35,15 @@ def __init__( ranks: List[int] = [0], gen_shape: bool = False, ): - assert type(start_step) == int, f'Nsys start_step must be of type int. Found: {type(start_step)}' + assert type(start_step) is int, f'Nsys start_step must be of type int. Found: {type(start_step)}' self._nsys_profile_start_step = start_step - assert type(end_step) == int, f'Nsys end_step must be of type int. Found: {type(start_step)}' + assert type(end_step) is int, f'Nsys end_step must be of type int. 
Found: {type(end_step)}' self._nsys_profile_end_step = end_step assert ( self._nsys_profile_end_step >= self._nsys_profile_start_step - ), f'Nsys end_step must be greater than or equal to nsys start_step' + ), 'Nsys end_step must be greater than or equal to nsys start_step' self._nsys_profile_ranks = ranks self._nsys_profile_gen_shape = gen_shape @@ -61,8 +60,9 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt """ device = trainer.strategy.root_device + current_step = trainer.strategy.current_epoch_step if device.type == 'cuda': - if batch_idx == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: + if current_step == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: logging.info("====== Start nsys profiling ======") torch.cuda.cudart().cudaProfilerStart() if self._nsys_profile_gen_shape: @@ -77,8 +77,9 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) """ device = trainer.strategy.root_device + current_step = trainer.strategy.current_epoch_step if device.type == 'cuda': - if batch_idx == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: + if current_step == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: logging.info("====== End nsys profiling ======") torch.cuda.cudart().cudaProfilerStop() torch.autograd.profiler.emit_nvtx().__exit__(None, None, None) diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 869882671096..c7983af26752 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -113,13 +113,13 @@ def on_save_checkpoint( # Filter out non-trainable parameters trainable_params = set(name for name, param in pl_module.named_parameters() if param.requires_grad) filtered_state_dict = {} - for name, value in checkpoint['state_dict'].items(): + for name, value in trainer.strategy.megatron_parallel.sharded_state_dict().items(): if name in trainable_params: filtered_state_dict[name] = value elif self.adapter_key_filter(name): # Include all adapter-related parameters filtered_state_dict[name] = value - checkpoint['state_dict'] = filtered_state_dict + checkpoint['sharded_state_dict'] = filtered_state_dict def adapter_key_filter(self, key: str) -> bool: return ".adapter." in key or key.endswith(".adapters") diff --git a/nemo/lightning/pytorch/callbacks/preemption.py b/nemo/lightning/pytorch/callbacks/preemption.py index 7f1dd94256d2..69ac378ed698 100644 --- a/nemo/lightning/pytorch/callbacks/preemption.py +++ b/nemo/lightning/pytorch/callbacks/preemption.py @@ -14,16 +14,19 @@ import contextlib import signal +import sys from typing import Optional import torch from pytorch_lightning.callbacks import Callback from pytorch_lightning.trainer.trainer import Trainer +from nemo.lightning.io.mixin import IOMixin from nemo.utils import logging +from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO -class PreemptionCallback(Callback): +class PreemptionCallback(Callback, IOMixin): """ PreemptionCallback checks for preemption during training at the end of every step. Upon preemption, it signals the trainer to stop gracefully.
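The nsys.py hunks above key profiling off `trainer.strategy.current_epoch_step` (the global-batch step) rather than the raw microbatch index, so the capture window stays stable under gradient accumulation. A minimal usage sketch, illustrative only and not part of this diff; it assumes `NsysCallback` is importable from the module path shown above and that the active strategy exposes `current_epoch_step` (e.g. the updated `MegatronStrategy` later in this PR):

```python
import pytorch_lightning as pl

from nemo.lightning.pytorch.callbacks.nsys import NsysCallback  # module path as in this diff
from nemo.lightning.pytorch.strategies import MegatronStrategy  # exposes current_epoch_step

nsys_cb = NsysCallback(
    start_step=10,   # begin capture at global step 10
    end_step=12,     # stop capture at global step 12 (must be >= start_step)
    ranks=[0],       # profile only rank 0
    gen_shape=True,  # record tensor shapes in NVTX ranges
)

# MegatronStrategy is default-constructed here purely for illustration.
trainer = pl.Trainer(strategy=MegatronStrategy(), callbacks=[nsys_cb], max_steps=20)
# The job itself must run under Nsight Systems with the CUDA profiler API as capture range, e.g.:
#   nsys profile -t cuda,nvtx --capture-range=cudaProfilerApi --capture-range-end=stop python train.py
```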
@@ -61,13 +64,15 @@ def on_train_end(self, trainer: Trainer, pl_module) -> None: def on_train_batch_end(self, trainer: Trainer, pl_module, outputs, batch, batch_idx: int) -> None: if self.interrupted: - logging.info("Preemption detected, signaling trainer to stop") - trainer.should_stop = True - - def on_exception(self, trainer: Trainer, pl_module, exception: BaseException) -> None: - if isinstance(exception, PreemptionException): - logging.info("Handling PreemptionException") + logging.info("Preemption detected, saving checkpoint and exiting") trainer.should_stop = True + if trainer.checkpoint_callback: + monitor_candidates = trainer.checkpoint_callback._monitor_candidates(trainer) + trainer.checkpoint_callback._save_last_checkpoint(trainer, monitor_candidates) + if isinstance(trainer.strategy.checkpoint_io, AsyncFinalizableCheckpointIO): + logging.info("Async checkpointing detected, waiting for it to complete") + trainer.strategy.checkpoint_io.maybe_finalize_save_checkpoint(blocking=True) + sys.exit(0) @contextlib.contextmanager def _preemption_handler(self): @@ -81,7 +86,6 @@ def _preemption_handler(self): def master_handler(signum, frame): logging.info(f"Received signal {signum}, initiating graceful stop") self._interrupted = True - raise PreemptionException("Preemption signal received") def ignoring_handler(signum, frame): logging.debug(f"Received signal {signum} on non-master rank, ignoring") @@ -109,7 +113,3 @@ def interrupted(self) -> bool: interrupted = torch.tensor(self._interrupted, device=torch.cuda.current_device(), dtype=torch.int32) torch.distributed.broadcast(interrupted, 0) return bool(interrupted.item()) - - -class PreemptionException(Exception): - """Custom exception for preemption events.""" diff --git a/nemo/lightning/pytorch/callbacks/progress.py b/nemo/lightning/pytorch/callbacks/progress_bar.py similarity index 82% rename from nemo/lightning/pytorch/callbacks/progress.py rename to nemo/lightning/pytorch/callbacks/progress_bar.py index 17178618852f..d6acb02ae377 100644 --- a/nemo/lightning/pytorch/callbacks/progress.py +++ b/nemo/lightning/pytorch/callbacks/progress_bar.py @@ -8,15 +8,6 @@ class MegatronProgressBar(TQDMProgressBar): for megatron models. """ - def get_current_epoch_step(self, trainer) -> int: - """ - Get the value of step within an epoch. - """ - return max( - trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed, - trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed, - ) - def init_train_tqdm(self): """ Override bar_format to not have 's/it'. @@ -41,10 +32,10 @@ def on_train_batch_end(self, trainer, pl_module, *_, **__): """ Override parent class on_train_batch_end to update progress bar per global batch instead of per microbatch. 
""" - n = self.get_current_epoch_step(trainer) + n = trainer.strategy.current_epoch_step if self._should_update(n, self.train_progress_bar.total): _update_n(self.train_progress_bar, n) - self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) + self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module), refresh=False) def calculate_data_parallel_groups() -> int: diff --git a/nemo/lightning/pytorch/callbacks/progress_printer.py b/nemo/lightning/pytorch/callbacks/progress_printer.py new file mode 100644 index 000000000000..8ddc97a6ddd6 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/progress_printer.py @@ -0,0 +1,189 @@ +from collections import defaultdict +from typing import Any + +from megatron.core.num_microbatches_calculator import get_num_microbatches +from pytorch_lightning.callbacks.progress import ProgressBar +from pytorch_lightning.utilities.types import STEP_OUTPUT +from typing_extensions import override + + +class ProgressPrinter(ProgressBar): + """ + Callback for logging progress in Megatron. Prints status in terms of global batches rather than microbatches. + Recommended over MegatronProgressBar for non-interactive settings + + Args: + log_interval (int): determines how frequently (in steps) to print the progress. + skip_accumulate_metrics (list[str]): for all metrics in this list, value logged will + simply reflect the latest value rather than averaging over the log interval. + exclude_metrics (list[str]): any metrics to exclude from logging. + """ + + def __init__( + self, + log_interval: int = 1, + skip_accumulate_metrics: list[str] = ["global_step"], + exclude_metrics: list[str] = ["v_num"], + ): + self._train_description = "Training" + self._validation_description = "Validation" + self._test_description = "Testing" + self._log_interval = int(log_interval) + # most recent "global_step" will be logged + # rather than averaging over last log_interval steps + self.skip_accumulate_metrics = skip_accumulate_metrics + self.exclude_metrics = exclude_metrics + self.total_metrics_dict = defaultdict(lambda: 0.0) + self._is_disabled = log_interval <= 0 + + super().__init__() + + def format_string(self, prefix, metrics): + log_string = prefix + for metric, val in metrics.items(): + if isinstance(val, (float)) and val.is_integer(): + val = int(val) + log_string += f' | {metric}: {val}' + else: + log_string += f' | {metric}: {val:.4}' + return log_string + + def disable(self): + self._is_disabled = True + + def enable(self): + self._is_disabled = False + + @property + def is_disabled(self) -> bool: + return self._is_disabled + + @property + def average_metrics_dict(self): + average_dict = {} + for key in self.total_metrics_dict: + if key in self.skip_accumulate_metrics or not isinstance(self.total_metrics_dict[key], (int, float)): + average_dict[key] = self.total_metrics_dict[key] + else: + average_dict[key] = self.total_metrics_dict[key] / self.log_interval + return average_dict + + @property + def train_description(self): + return self._train_description + + @property + def validation_description(self): + return self._validation_description + + @property + def test_description(self): + return self._test_description + + @property + def log_interval(self): + return self._log_interval + + @log_interval.setter + def log_interval(self, val): + self._log_interval = val + + @override + def on_sanity_check_start(self, *_: Any) -> None: + self._validation_description = "Sanity checking " + self.validation_description + + @override + def 
on_sanity_check_end(self, *_: Any) -> None: + self._validation_description = "Validation" + + @override + def on_train_epoch_start(self, trainer, *_): + if trainer.max_steps > 0: + # while resuming from a ckpt use trainer.max_steps as the total for progress bar as trainer.num_training_batches + # is truncated to max_steps - step being resumed at + self.total = trainer.max_steps + else: + self.total = trainer.num_training_batches + + ## TODO(ashors): handle nan losses + @override + def on_train_batch_end(self, trainer, pl_module, *_, **__): + if self.is_disabled: + return + n = trainer.strategy.current_epoch_step + metrics = self.get_metrics(trainer, pl_module) + for key in metrics: + if key in self.exclude_metrics: + continue + if key in self.skip_accumulate_metrics or not isinstance(metrics[key], (int, float)): + self.total_metrics_dict[key] = metrics[key] + else: + self.total_metrics_dict[key] += metrics[key] + + if self.should_log(n): + prefix = self.train_description + f" epoch {trainer.current_epoch}, iteration {n-1}/{self.total-1}" + log_string = self.format_string(prefix, self.average_metrics_dict) + print(log_string) + + self.total_metrics_dict = defaultdict(lambda: 0.0) + + @override + def on_validation_batch_start( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + batch: Any, + batch_idx: int, + dataloader_idx: int = 0, + ) -> None: + if not self.has_dataloader_changed(dataloader_idx): + return + self.total_validation_steps = int(self.total_val_batches_current_dataloader / get_num_microbatches()) + + @override + def on_validation_batch_end( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + outputs: STEP_OUTPUT, + batch: Any, + batch_idx: int, + dataloader_idx: int = 0, + ) -> None: + if self.is_disabled: + return + n = (batch_idx + 1) / get_num_microbatches() + if self.should_log(n): + print(self.validation_description + f": iteration {int(n)}/{self.total_validation_steps}") + + @override + def on_test_batch_start( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + batch: Any, + batch_idx: int, + dataloader_idx: int = 0, + ) -> None: + if not self.has_dataloader_changed(dataloader_idx): + return + self.total_test_steps = int(self.total_test_batches_current_dataloader / get_num_microbatches()) + + @override + def on_test_batch_end( + self, + trainer: "pl.Trainer", + pl_module: "pl.LightningModule", + outputs: STEP_OUTPUT, + batch: Any, + batch_idx: int, + dataloader_idx: int = 0, + ) -> None: + if self.is_disabled: + return + n = int((batch_idx + 1) / get_num_microbatches()) + if self.should_log(n): + print(self.test_description + f": iteration {n}/{self.total_test_steps}") + + def should_log(self, n): + return n % self.log_interval == 0 diff --git a/nemo/lightning/pytorch/optim/base.py b/nemo/lightning/pytorch/optim/base.py index 8e857a156649..ef7f9d96843d 100644 --- a/nemo/lightning/pytorch/optim/base.py +++ b/nemo/lightning/pytorch/optim/base.py @@ -149,10 +149,10 @@ def optimizers(self, model) -> List[Optimizer]: """ raise NotImplementedError("The optimizers method should be implemented by subclasses.") - def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None: + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx) -> None: if self._optimizers is not None: lr = self._optimizers[0].param_groups[0]['lr'] - pl_module.log('lr', lr, rank_zero_only=True, batch_size=1) + pl_module.log('lr', lr, batch_size=1, prog_bar=True) def __call__(self, model: L.LightningModule, 
megatron_parallel=None) -> OptimizerLRScheduler: """Calls the setup and optimizers methods. diff --git a/nemo/lightning/pytorch/optim/lr_scheduler.py b/nemo/lightning/pytorch/optim/lr_scheduler.py index 9374328190a6..4e865443b8fc 100644 --- a/nemo/lightning/pytorch/optim/lr_scheduler.py +++ b/nemo/lightning/pytorch/optim/lr_scheduler.py @@ -25,7 +25,7 @@ def __init__( warmup_ratio: Optional[float] = None, max_steps: int = 10, min_lr: float = 0.0, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -68,7 +68,7 @@ def __init__( hold_ratio: Optional[float] = None, max_steps: int = 10, min_lr: float = 0.0, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -111,7 +111,7 @@ def __init__( self, max_steps: int = 10, min_lr: float = 1e-5, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -142,7 +142,7 @@ def __init__( self, max_steps: int = 10, min_lr: float = 0.0, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -176,7 +176,7 @@ def __init__( warmup_ratio: Optional[float] = None, max_steps: int = 10, min_lr: float = 0.0, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -218,7 +218,7 @@ def __init__( max_steps: int = 10, decay_rate: float = 0.5, min_lr: float = 0.0, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -252,7 +252,7 @@ def __init__( self, max_steps: int = 10, min_lr: float = 0.0, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -283,7 +283,7 @@ def __init__( self, max_steps: int = 10, min_lr: float = 0.0, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -314,7 +314,7 @@ def __init__( self, max_steps: int = 10, min_lr: float = 0.0, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -347,7 +347,7 @@ def __init__( min_lr: float = 0.0, power: float = 1.0, cycle: bool = False, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -384,7 +384,7 @@ def __init__( min_lr: float = 0.0, power: float = 1.0, cycle: bool = False, - interval: str = "epoch", + interval: str = "step", frequency: int = 1, monitor: str = "val_loss", ): @@ -415,13 +415,13 @@ def scheduler(self, model, optimizer): class CosineAnnealingScheduler(LRSchedulerModule): def __init__( self, - max_steps=10, - warmup_steps=750, - constant_steps=80000, - min_lr=int(6e-5), - interval="epoch", - frequency=1, - monitor="val_loss", + max_steps: int = 10, + warmup_steps: int = 750, + constant_steps: int = 80000, + min_lr: float = 6e-5, + interval: str = "step", + frequency: int = 1, + monitor: str = "val_loss", ): super().__init__() self.max_steps = max_steps diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 9b2b317223ce..13a0caa98f0c 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -1,3 +1,4 @@ +import logging from typing import Any, Dict, List, Literal, Optional import pytorch_lightning as pl @@ -22,8 +23,9 @@ def __init__( micro_batch_size: int = 4, global_batch_size: int = 8, rampup_batch_size: Optional[List[int]] = None, - dataloader_type: Literal["single", 
"cyclic"] = "single", + dataloader_type: Literal["single", "cyclic", "batch"] = "single", init_consumed_samples: int = 0, + init_global_step: int = 0, ): self.seq_len = seq_len self.micro_batch_size = micro_batch_size @@ -34,6 +36,7 @@ def __init__( self.prev_consumed_samples = self.init_consumed_samples self.if_first_step = 0 self.prev_global_batch_size = None + self.init_global_step = init_global_step def setup(self, global_rank: int) -> None: from nemo.lightning.data import setup_microbatch_calculator @@ -61,12 +64,8 @@ def compute_consumed_samples(self, steps_since_resume=0) -> int: return 0 app_state = AppState() - if self.rampup_batch_size is not None: - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR - - current_global_batch_size = getattr(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, "current_global_batch_size", 1) - consumed_samples = self.prev_consumed_samples + self.if_first_step * current_global_batch_size + consumed_samples = self.prev_consumed_samples + self.if_first_step * self.current_global_batch_size else: consumed_samples = ( self.init_consumed_samples @@ -86,38 +85,33 @@ def on_megatron_step_start(self, trainer: pl.Trainer) -> None: trainer.should_stop = True def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + try: + from megatron.core.num_microbatches_calculator import update_num_microbatches - if self.rampup_batch_size is None: - return + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import update_num_microbatches self.prev_global_batch_size = self.current_global_batch_size - # TODO: Add consumed samples consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_global_step) - pl_module.log( 'consumed_samples', consumed_samples, prog_bar=True, - rank_zero_only=True, batch_size=1, ) self.prev_consumed_samples = consumed_samples - num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR # noqa: SLF001 - - num_microbatch_calculator.update( + update_num_microbatches( consumed_samples=consumed_samples, consistency_check=False, ) - current_global_batch_size = num_microbatch_calculator.current_global_batch_size pl_module.log( "global_batch_size", - current_global_batch_size, + self.current_global_batch_size, prog_bar=True, - rank_zero_only=True, batch_size=1, ) self.if_first_step = 1 @@ -132,15 +126,27 @@ def megatron_data_kwargs(self) -> Dict[str, Any]: @property def num_microbatches(self) -> int: - from megatron.core.num_microbatches_calculator import get_num_microbatches + try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches return get_num_microbatches() @property def current_global_batch_size(self) -> int: - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + try: + from megatron.core.num_microbatches_calculator import get_current_global_batch_size + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_current_global_batch_size - 
num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR # noqa: SLF001 - current_global_batch_size = num_microbatch_calculator.current_global_batch_size + if get_current_global_batch_size(): + current_global_batch_size = get_current_global_batch_size() + else: + current_global_batch_size = 1 return current_global_batch_size diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index 5e43e09c0420..79394cc4bbb1 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -13,47 +13,114 @@ # limitations under the License. from contextlib import contextmanager +from dataclasses import dataclass, fields from typing import Any, Callable, Generator, List, Literal, Tuple, TypeVar, Union import pytorch_lightning as pl import torch -from pytorch_lightning.plugins.precision import MixedPrecision +from pytorch_lightning.plugins.precision import Precision from torch.nn import Module from torch.optim import Optimizer -from nemo.lightning._strategy_lib import GradScaler +from nemo.utils import logging AnyT = TypeVar("AnyT") -class MegatronMixedPrecision(MixedPrecision): +def get_optim_config(optimizer: Optimizer): + try: + return optimizer.mcore_optimizer.config + except: + raise ValueError("Failed to extract optimizer config from module.") + + +@dataclass +class DtypeConfig: + fp32: bool = False + fp16: bool = False + bf16: bool = False + params_dtype: torch.dtype = None + pipeline_dtype: torch.dtype = None + autocast_dtype: torch.dtype = None + autocast_enabled: bool = False + grad_reduce_in_fp32: bool = True + # fp8 related + fp8: str = None + fp8_margin: int = 0 + fp8_interval: int = 1 + fp8_amax_history_len: int = 1 + fp8_amax_compute_algo: str = "most_recent" + fp8_wgrad: bool = True + fp8_dot_product_attention: bool = False + fp8_multi_head_attention: bool = False + # FP16 Loss scaling + loss_scale: float = (None,) + initial_loss_scale: float = (None,) + min_loss_scale: float = (None,) + loss_scale_window: float = (None,) + hysteresis: float = (None,) + + +class MegatronMixedPrecision(Precision): def __init__( self, - precision: Literal["16-mixed", "bf16-mixed"], - amp_O2: bool = False, - device="cuda", + precision: Literal["16-mixed", "bf16-mixed", "32"], + params_dtype: torch.dtype = None, + pipeline_dtype: torch.dtype = None, + autocast_dtype: torch.dtype = None, + autocast_enabled: bool = False, + grad_reduce_in_fp32: bool = True, + # fp8 related, + fp8: str = None, + fp8_margin: int = 0, + fp8_interval: int = 1, + fp8_amax_history_len: int = 1, + fp8_amax_compute_algo: str = "most_recent", + fp8_wgrad: bool = True, + fp8_dot_product_attention: bool = False, + fp8_multi_head_attention: bool = False, + fp16_loss_scale: float = None, + fp16_initial_loss_scale: float = 4294967296, + fp16_min_loss_scale: float = 1.0, + fp16_loss_scale_window: int = 1000, + fp16_hysteresis: int = 2, ) -> None: - if precision == "bf16-mixed": - scaler = None - else: - scaler = GradScaler(init_scale=2**32, growth_interval=1000, hysteresis=2) - - super().__init__(precision, device, scaler) - self.amp_O2 = amp_O2 - - def connect( - self, model: Module, optimizers: List[Optimizer], lr_schedulers: List[Any] - ) -> Tuple[Module, List[Optimizer], List[Any]]: - """Connects this plugin to the accelerator and the training process.""" - from nemo.core.optim import MainParamsOptimizerWrapper - - if not optimizers or not self.amp_O2 or isinstance(optimizers[0], MainParamsOptimizerWrapper): - return 
model, optimizers, lr_schedulers - - _optimizers = [*optimizers] - _optimizers[0] = self.convert_optimizer(_optimizers[0]) - return model, _optimizers, lr_schedulers + if isinstance(precision, int): + precision = str(precision) + + dtype = torch.bfloat16 if precision in ['bf16', 'bf16-mixed'] else torch.float32 + self.dtype_config = DtypeConfig( + fp32=precision in ['fp32', '32'], + fp16=precision in ['fp16', 'fp16-mixed', '16', '16-mixed'], + bf16=precision in ['bf16', 'bf16-mixed'], + params_dtype=params_dtype or torch.float32, + pipeline_dtype=pipeline_dtype or dtype, + autocast_dtype=autocast_dtype or dtype, + autocast_enabled=autocast_enabled, + grad_reduce_in_fp32=grad_reduce_in_fp32, + fp8=fp8, + fp8_margin=fp8_margin, + fp8_interval=fp8_interval, + fp8_amax_history_len=fp8_amax_history_len, + fp8_amax_compute_algo=fp8_amax_compute_algo, + fp8_wgrad=fp8_wgrad, + fp8_dot_product_attention=fp8_dot_product_attention, + fp8_multi_head_attention=fp8_multi_head_attention, + # fp16 loss scale + loss_scale=fp16_loss_scale, + initial_loss_scale=fp16_initial_loss_scale, + min_loss_scale=fp16_min_loss_scale, + loss_scale_window=fp16_loss_scale_window, + hysteresis=fp16_hysteresis, + ) + super().__init__() + if self.dtype_config.fp16: + self.precision = "16-mixed" + elif self.dtype_config.bf16: + self.precision = "bf16-mixed" + else: + self.precision = "32-true" def convert_module(self, module: Module) -> Module: """Convert the module parameters to the precision type this plugin handles. @@ -64,15 +131,15 @@ def convert_module(self, module: Module) -> Module: from megatron.core.transformer.module import Float16Module from megatron.core.utils import get_model_config - if self.precision in ["16-mixed", "bf16-mixed"]: + if self.dtype_config.fp16 or self.dtype_config.bf16: + # Patch config options config = get_model_config(module.module) - config.fp16 = self.precision == "16-mixed" - config.bf16 = self.precision == "bf16-mixed" - if isinstance(module.module, Float16Module): - new_float16_module = Float16Module(config, module.module.module) - module.module = new_float16_module - else: + config.fp16 = self.dtype_config.fp16 + config.bf16 = self.dtype_config.bf16 + if hasattr(module, 'module'): module.module = Float16Module(config, module.module) + else: + module = Float16Module(config, module) return module @@ -82,16 +149,10 @@ def convert_optimizer(self, optimizer: Optimizer) -> Optimizer: This is optional and depends on the precision limitations during optimization. """ - from nemo.core.optim import MainParamsOptimizerWrapper - - if isinstance(optimizer, MainParamsOptimizerWrapper) or not self.amp_O2: - return optimizer - - return MainParamsOptimizerWrapper( - optimizer, - fp32_grad_accum=True, - contiguous_grad_bucket=True, - ) + optim_config = get_optim_config(optimizer) + assert optim_config.bf16 == self.dtype_config.bf16, "BF16 enabled on model but not on optimizer" + assert optim_config.fp16 == self.dtype_config.fp16, "FP16 enabled on model but not on optimizer" + return optimizer def convert_input(self, data: AnyT) -> AnyT: """Convert model inputs (forward) to the floating point precision type of this plugin. 
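For reference, a hedged sketch of constructing the reworked plugin with the arguments introduced in this hunk; the Trainer wiring is illustrative boilerplate, and the FP8 comment names a value assumed from Transformer Engine conventions, not something this diff prescribes:

```python
import pytorch_lightning as pl
import torch

from nemo.lightning.pytorch.plugins.mixed_precision import MegatronMixedPrecision

# BF16 mixed precision: FP32 master params and FP32 grad reduction, BF16 pipeline/autocast dtypes.
precision = MegatronMixedPrecision(
    precision="bf16-mixed",
    params_dtype=torch.float32,
    pipeline_dtype=torch.bfloat16,
    autocast_dtype=torch.bfloat16,
    grad_reduce_in_fp32=True,
    fp8=None,  # e.g. "hybrid" would populate the FP8 fields of DtypeConfig (assumed value)
)

# Handed to the Trainer like any Lightning precision plugin.
trainer = pl.Trainer(plugins=[precision], max_steps=100)
```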
@@ -111,42 +172,6 @@ def convert_output(self, data: AnyT) -> AnyT: """ return data - def optimizer_step( - self, - optimizer: torch.optim.Optimizer, - model: Union["pl.LightningModule", torch.nn.Module], - closure: Callable[[], Any], - **kwargs: Any, - ) -> None: - from nemo.core.optim import MainParamsOptimizerWrapper - - if not self.amp_O2 and not isinstance(optimizer, MainParamsOptimizerWrapper): - return super().optimizer_step(optimizer, model, closure, **kwargs) - - if self.scaler is None: - assert optimizer.fp32_grad_accumulation, "BF16 uses FP32 grad accumulation" - _ = closure() - self._after_closure(model, optimizer) - return optimizer.step(**kwargs) - - assert not optimizer.fp32_grad_accumulation, "FP16 uses FP16 grad accumulation" - closure_result = closure() - - # TODO: Add an option for merged all-reduce - - # cast fp16 grads to fp32 and copy to main grads, which are used for unscale and param update - optimizer.copy_model_grads_to_main_grads() - # `unscale` after the closure is executed but before the `on_before_optimizer_step` hook. - # unscale main (fp32) gradients - self.scaler.unscale_(optimizer) - self._after_closure(model, optimizer) - skipped_backward = closure_result is None - # in manual optimization, the closure does not return a value - if not isinstance(model, pl.LightningModule) or not model.automatic_optimization or not skipped_backward: - # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found - self.scaler.step(optimizer, **kwargs) - self.scaler.update() - @contextmanager def forward_context(self) -> Generator[None, None, None]: """No explicit precision casting. Inputs are supposed to be manually casted.""" @@ -156,4 +181,19 @@ def forward_context(self) -> Generator[None, None, None]: pass +def update_config_with_dtype_overrides(dtype_config, config): + if hasattr(config, "__io__"): + config.__io__ = update_config_with_dtype_overrides(dtype_config, config.__io__) + for field in fields(dtype_config): + if not hasattr(config, field.name): + continue + # If we overwrote a value, throw a warning. 
+ old_val = getattr(config, field.name) + new_val = getattr(dtype_config, field.name) + if old_val != new_val: + setattr(config, field.name, new_val) + logging.warning(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}") + return config + + __all__ = ["MegatronMixedPrecision"] diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 2219324f6b67..668b088a4864 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -4,7 +4,8 @@ import os import shutil from collections import OrderedDict -from contextlib import ExitStack +from contextlib import ExitStack, contextmanager +from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, ContextManager, Dict, List, Literal, Mapping, Optional, TypeVar, Union, cast @@ -33,7 +34,7 @@ from nemo.lightning import _strategy_lib, io from nemo.lightning.io.pl import MegatronCheckpointIO from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction -from nemo.lightning.pytorch.callbacks import MegatronProgressBar, ModelTransform +from nemo.lightning.pytorch.callbacks import MegatronProgressBar, ModelTransform, ProgressPrinter from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO, AsyncFinalizerCallback if TYPE_CHECKING: @@ -45,6 +46,18 @@ DDPLiteral = Literal["megatron", "pytorch"] +@dataclass +class ParallelismConfig: + tensor_model_parallel_size: int + pipeline_model_parallel_size: int + virtual_pipeline_model_parallel_size: int + context_parallel_size: int + sequence_parallel: bool + expert_model_parallel_size: int + moe_extended_tp: bool + pipeline_dtype: torch.dtype + + class MegatronStrategy(DDPStrategy, io.IOMixin): """Megatron plugin for Pytorch Lightning. @@ -71,12 +84,39 @@ class MegatronStrategy(DDPStrategy, io.IOMixin): cluster_environment: Cluster environment for distributed training. Defaults to None. checkpoint_io: Checkpoint I/O handler. Defaults to None. find_unused_parameters (bool): Find unused parameters in DDP. Defaults to False. - enable_nemo_ckpt_io (bool): Enable NeMo checkpoint I/O. Defaults to True. ckpt_type (TrainerCkptProtocol): Checkpoint type. Defaults to TrainerCheckpoint. - ckpt_include_optimizer (bool): Include optimizer state in checkpoint. Defaults to False. + ckpt_include_optimizer (bool): Include optimizer state in checkpoint. Defaults to True. ddp (Union[DDPLiteral, DistributedDataParallelConfig]): DDP configuration. Defaults to "megatron". lazy_init (bool): Use lazy initialization for model parallel parameters. Defaults to False. pipeline_dtype (Optional[torch.dtype]): Data type for pipeline parallelism. Defaults to None. + save_ckpt_format (str): Distributed checkpoint format to use for checkpoint saving. Should be one of + 'torch_dist' or 'zarr'. Defaults to 'torch_dist'. + ckpt_async_save (bool): Whether to save checkpoints asynchronously to reduce checkpointing overhead. + Defaults to False. + ckpt_torch_dist_multiproc (int): Number of extra processes per rank used during ckpt save + with PyTorch distributed format. Defaults to None. + ckpt_assume_constant_structure (bool): Allows caching some computation across checkpoint saves. + Set to True only if the state dict structure doesn't change within a single job. + ckpt_parallel_save (bool): If true, each worker will write its own part of the dist checkpoint. + Defaults to True. 
+ ckpt_parallel_save_within_dp (bool): If true, save will be parallelized only within a DP group + (whole world otherwise), which might slightly reduce the save overhead. Defaults to False. + ckpt_parallel_load (bool): If true, each worker will load part of the dist checkpoint + and exchange with NCCL. Might use some extra GPU memory. Defaults to False. + ckpt_parallel_save_optim (bool): Parallel save/load of a DistributedOptimizer. 'True' + allows performant save and reshardable checkpoints. Set to 'False' only in order to minimize + the number of checkpoint files. + ckpt_load_directly_on_device (bool): if True, loads the weights directly on GPU. + Has effect only for `zarr` based checkpoints (PyT Distributed always loads on device). + Defaults to True. + setup_optimizers (bool): Whether to call the trainer's setup_optimizers function to perform any + necessary conversions of optimizer parameters and move optimizer parameters to the correct device. + Defaults to True. + init_model_parallel (bool): Whether to initialize the model parallel groups. Defaults to True. + replace_progress_bar (bool): Whether to replace the TQDM progress bar with a megatron-style logger + that prints the metrics to stdout. Suitable for non-interactive settings. + progress_interval (int): How frequently to print progress to stdout. Only used when + replace_progress_bar is True. **kwargs: Additional keyword arguments. Note: @@ -100,20 +140,23 @@ def __init__( cluster_environment=None, # TODO: Add type-hint checkpoint_io=None, # TODO: Add type-hint find_unused_parameters: bool = False, - ckpt_include_optimizer: bool = False, + ckpt_include_optimizer: bool = True, ddp: Union[DDPLiteral, DistributedDataParallelConfig] = "megatron", lazy_init: bool = False, pipeline_dtype: Optional[torch.dtype] = None, - save_ckpt_format='torch_dist', - ckpt_torch_dist_multiproc=None, ## TODO(ashors): put elsewhere? - ckpt_assume_constant_structure=False, - ckpt_parallel_save=True, - ckpt_parallel_save_within_dp=False, - ckpt_parallel_load=False, - ckpt_parallel_save_optim=True, - ckpt_load_directly_on_device=True, + save_ckpt_format: str = 'torch_dist', + ckpt_async_save: bool = False, + ckpt_torch_dist_multiproc: int = None, ## TODO(ashors): put elsewhere? 
+ ckpt_assume_constant_structure: bool = False, + ckpt_parallel_save: bool = True, + ckpt_parallel_save_within_dp: bool = False, + ckpt_parallel_load: bool = False, + ckpt_parallel_save_optim: bool = True, + ckpt_load_directly_on_device: bool = True, setup_optimizers: bool = True, init_model_parallel: bool = True, + replace_progress_bar: bool = True, + progress_interval: int = 1, **kwargs, ) -> None: super().__init__( @@ -142,6 +185,7 @@ def __init__( self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0))) self.save_ckpt_format = save_ckpt_format + self.async_save = ckpt_async_save self.torch_dist_multiproc = ckpt_torch_dist_multiproc self.assume_constant_structure = ckpt_assume_constant_structure self.parallel_save = ckpt_parallel_save @@ -150,6 +194,9 @@ def __init__( self.parallel_save_optim = ckpt_parallel_save_optim self.load_directly_on_device = ckpt_load_directly_on_device + self.replace_progress_bar = replace_progress_bar + self.progress_interval = progress_interval + self._ddp = ddp if ddp == "megatron": self.ddp_config = DistributedDataParallelConfig(check_for_nan_in_grad=True) @@ -172,6 +219,12 @@ def connect(self, model: pl.LightningModule) -> None: if _maybe_mcore_config: self._mcore_config = _maybe_mcore_config + dtype_config = getattr(self._precision_plugin, 'dtype_config', None) + if dtype_config: + from nemo.lightning.pytorch.plugins.mixed_precision import update_config_with_dtype_overrides + + model.config = update_config_with_dtype_overrides(dtype_config, model.config) + has_optim = getattr(model, "optim", None) if has_optim: opt_config = getattr(model.optim, "config", None) @@ -181,6 +234,10 @@ def connect(self, model: pl.LightningModule) -> None: raise ValueError("PyTorch DDP is not enabled for mcore optimizer") ddp_config = cast(DistributedDataParallelConfig, self.ddp_config) + if dtype_config: + model.optim.config = update_config_with_dtype_overrides(dtype_config, model.optim.config) + self.ddp_config = update_config_with_dtype_overrides(dtype_config, self.ddp_config) + if mcore_opt_config.use_distributed_optimizer != ddp_config.use_distributed_optimizer: from nemo.utils import logging @@ -207,7 +264,8 @@ def setup(self, trainer: pl.Trainer) -> None: if not self.data_sampler and hasattr(datamodule, "data_sampler"): self.data_sampler = datamodule.data_sampler self.data_sampler.setup(self.cluster_environment.global_rank()) - datamodule.reconfigure_limit_batches() + if hasattr(datamodule, "reconfigure_limit_batches"): + datamodule.reconfigure_limit_batches() if self.data_sampler: self.data_sampler.connect(trainer) @@ -253,6 +311,16 @@ def setup(self, trainer: pl.Trainer) -> None: assert self.model is not None _sync_module_states(self.model) + ## add AsyncFinalizerCallback if using async + if self.async_save: + have_async_callback = False + for callback in self.trainer.callbacks: + if isinstance(callback, AsyncFinalizerCallback): + have_async_callback = True + break + if not have_async_callback: + self.trainer.callbacks.append(AsyncFinalizerCallback()) + @override def setup_distributed(self) -> None: self._setup_parallel_ranks() @@ -391,7 +459,6 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP 'global_step', self.trainer.global_step, prog_bar=True, - rank_zero_only=True, batch_size=1, ) @@ -407,31 +474,22 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP "peak_memory_usage", max_memory_reserved, prog_bar=True, - rank_zero_only=True, batch_size=1, ) self.lightning_module.log( 
"memory_allocated", memory_allocated, prog_bar=True, - rank_zero_only=True, batch_size=1, ) if self.log_train_loss: - from megatron.core import parallel_state - - from nemo.collections.nlp.parts.utils_funcs import get_last_rank - - # When using pipeline parallelism, loss is calculated only in the last pipeline stage and - # it should be casted to other pipeline stages for logging. - # we can avoid this broadcast by updating the PTL log function to accept specific ranks - if parallel_state.get_pipeline_model_parallel_world_size() > 1: - if torch.distributed.get_rank() == get_last_rank(): - torch.distributed.send(out, 0) - elif torch.distributed.get_rank() == 0: - torch.distributed.recv(out, get_last_rank()) - self.lightning_module.log('reduced_train_loss', out, prog_bar=True, rank_zero_only=True, batch_size=1) + # p2p now, broadcast later at ckpt + _strategy_lib._sync_from_last_pipeline_stage(out, broadcast=False) + if torch.distributed.get_rank() == 0: + self.lightning_module.log( + 'reduced_train_loss', out, prog_bar=True, rank_zero_only=True, batch_size=1 + ) return out @@ -443,7 +501,24 @@ def validation_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OU with self.precision_plugin.val_step_context(): # TODO: Do we need this? out = self.model(dataloader_iter, forward_only=True, *args, **kwargs) - self.lightning_module.log('val_loss', out, rank_zero_only=True, batch_size=1) + + from megatron.core import parallel_state + + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + if pp_size > 1: + # ranks that are not final pp stage have 0 for loss, and out will be mean-reduced over pp + # groups (due to sync_dist), which divides val_loss by pp_size. so we multiply by pp_size to cancel out + self.lightning_module.log( + 'val_loss', + out * pp_size, + prog_bar=True, + sync_dist=True, + sync_dist_group=parallel_state.get_pipeline_model_parallel_group(), + on_epoch=True, + ) + else: + self.lightning_module.log('val_loss', out, prog_bar=True, on_epoch=True) + return out @override @@ -497,9 +572,16 @@ def _fix_progress_bar(self, trainer: pl.Trainer) -> None: if callback.__class__ == TQDMProgressBar: contains_progress = True if not contains_megatron_progress and contains_progress: - for callback in callbacks: + for i, callback in enumerate(callbacks): if isinstance(callback, TQDMProgressBar): - callback.__class__ = MegatronProgressBar + if self.replace_progress_bar: + printer = ProgressPrinter(log_interval=self.progress_interval) + printer._trainer = trainer + if not trainer.is_global_zero: + printer.disable() + callbacks[i] = printer + else: + callback.__class__ = MegatronProgressBar break def optimizer_sharded_state_dict(self, is_loading=False): @@ -527,7 +609,9 @@ def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: checkpoint["state_dict"] = OrderedDict([]) # remove device state_dict - checkpoint["sharded_state_dict"] = self.megatron_parallel.sharded_state_dict() + # retrieve `sharded_state_dict` if it has not already been configured in `on_save_checkpoint` + if "sharded_state_dict" not in checkpoint: + checkpoint["sharded_state_dict"] = self.megatron_parallel.sharded_state_dict() if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_include_optimizer: checkpoint["optimizer"] = [self.optimizer_sharded_state_dict()] @@ -572,16 +656,16 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr assert self.megatron_parallel is not None 
_strategy_lib.load_model_state_dict(self.megatron_parallel, checkpoint, strict=strict) + for opt in self.optimizers: + opt.reload_model_params() @property @override def checkpoint_io(self) -> CheckpointIO: if self._checkpoint_io is None: - checkpoint_callback = self.trainer.checkpoint_callback - async_save = getattr(checkpoint_callback, "async_save", False) self._checkpoint_io = MegatronCheckpointIO( save_ckpt_format=self.save_ckpt_format, - async_save=async_save, + async_save=self.async_save, torch_dist_multiproc=self.torch_dist_multiproc, assume_constant_structure=self.assume_constant_structure, parallel_save=self.parallel_save, @@ -589,15 +673,8 @@ def checkpoint_io(self) -> CheckpointIO: parallel_load=self.parallel_load, load_directly_on_device=self.load_directly_on_device, ) - if async_save: + if self.async_save: self._checkpoint_io = AsyncFinalizableCheckpointIO(self._checkpoint_io) - have_async_callback = False - for callback in self.trainer.callbacks: - if isinstance(callback, AsyncFinalizerCallback): - have_async_callback = True - break - if not have_async_callback: - self.trainer.callbacks.append(AsyncFinalizerCallback()) elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): self._checkpoint_io.checkpoint_io = MegatronCheckpointIO() @@ -607,6 +684,16 @@ def checkpoint_io(self) -> CheckpointIO: def checkpoint_io(self, io: CheckpointIO) -> None: self._checkpoint_io = io + @property + def current_epoch_step(self) -> int: + """ + Get the value of step within an epoch. + """ + return max( + self.trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed, + self.trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed, + ) + def _get_data_step(self, step_type: str) -> Optional[_ModuleStepFunction]: for fn_name in [f"{step_type}_data_step", "data_step"]: if hasattr(self.lightning_module, fn_name): @@ -669,10 +756,8 @@ def restore_checkpoint_after_setup(self) -> bool: return True @property - def parallelism(self): - from megatron.core.model_parallel_config import ModelParallelConfig - - return ModelParallelConfig( + def parallelism(self) -> ParallelismConfig: + return ParallelismConfig( tensor_model_parallel_size=self.tensor_model_parallel_size, pipeline_model_parallel_size=self.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=self.virtual_pipeline_model_parallel_size, @@ -683,6 +768,14 @@ def parallelism(self): pipeline_dtype=self.pipeline_dtype, ) + @contextmanager + @override + def tensor_init_context(self, empty_init: Optional[bool] = None): + # Materializaton happens in `setup()` + # @akoumparouli: using Parent's tensor_init_context causes mcore + # parameters to be initialized on GPU instead of (assumed) CPU. + yield + def ckpt_to_dir(filepath: Union[str, Path]) -> Path: """PTL considers checkpoints as .ckpt files. 
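The strategy changes above consolidate the checkpoint-format, async-save, and progress-reporting knobs on `MegatronStrategy` itself. A rough usage sketch restricted to options visible in this diff; the Trainer call is assumed boilerplate rather than part of the change:

```python
import pytorch_lightning as pl

from nemo.lightning.pytorch.strategies import MegatronStrategy

strategy = MegatronStrategy(
    ckpt_include_optimizer=True,    # now defaults to True
    save_ckpt_format="torch_dist",  # or "zarr"
    ckpt_async_save=True,           # setup() appends an AsyncFinalizerCallback automatically
    ckpt_parallel_save=True,        # each worker writes its own part of the dist checkpoint
    replace_progress_bar=True,      # swap the TQDM bar for the stdout ProgressPrinter
    progress_interval=10,           # print accumulated metrics every 10 global steps
)

trainer = pl.Trainer(strategy=strategy, accelerator="gpu", devices=2, max_steps=100)
```

With `ckpt_async_save=True`, the checkpoint callback no longer takes its own `async_save` argument; it reads the flag from the strategy in `setup()`, as shown in the callback changes earlier in this diff.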
diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py index 8b453832d56e..81f4d12bd3fb 100644 --- a/nemo/lightning/pytorch/trainer.py +++ b/nemo/lightning/pytorch/trainer.py @@ -10,14 +10,24 @@ class Trainer(pl.Trainer, IOMixin): + + def add_io(self, obj): + """Recurse to the leaves of a container and add io functionality to non-serializable leaves""" + if isinstance(obj, (dict, list)): + if isinstance(obj, dict): + obj = obj.values() + for item in obj: + self.add_io(item) + else: + if not serialization.find_node_traverser(type(obj)): + track_io(type(obj)) + return + def io_init(self, **kwargs) -> fdl.Config[Self]: # Each argument of the trainer can be stateful so we copy them cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items()} - for val in cfg_kwargs.values(): - if not serialization.find_node_traverser(type(val)): - track_io(type(val)) - + self.add_io(cfg_kwargs) return fdl.Config(type(self), **cfg_kwargs) def to_fabric(self, callbacks=None, loggers=None) -> Fabric: diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index fc2e21eb37fd..ca87628d699e 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -19,10 +19,10 @@ class Resume(IOMixin): - def nemo_path(self, model) -> Optional[Path]: - raise NotImplementedError + def nemo_path(self, model=None) -> Optional[Path]: + """Returns the checkpoint to resume from.""" - def setup(self, model, trainer: Union[pl.Trainer, fl.Fabric]): + def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): if isinstance(trainer, fl.Fabric): raise NotImplementedError("Fabric is not supported yet.") @@ -52,10 +52,11 @@ def __init__( path (str): Can be used to specify a path to a specific checkpoint file to load from. This will override any checkpoint found when resume_if_exists is True. Defaults to None - dirpath (str): Path to save the checkpoints to. Defaults to /checkpoints + dirpath (str): Path to the checkpointing directory to restore from. Defaults to /checkpoints import_path (str): Path to specify if importing a checkpoint from HF or another non-NeMo checkpoint format. If import_path is provided, other arguments are unused. + adapter_path (str): Path to any adapter checkpoints. resume_if_exists (bool): Whether this experiment is resuming from a previous run. If True, it sets trainer._checkpoint_connector._ckpt_path so that the trainer should auto-resume. exp_manager will move files under log_dir to log_dir/run_{int}. @@ -139,7 +140,11 @@ def nemo_path(self, model=None) -> Optional[Path]: checkpoint = last_checkpoints[0] checkpoint = uninject_model_parallel_rank(checkpoint) else: - raise ValueError(f"Multiple checkpoints {last_checkpoints} that matches *last.ckpt.") + # Select the checkpoint with the latest modified time + checkpoint = sorted(last_checkpoints, key=lambda pth: pth.lstat().st_mtime, reverse=True)[0] + logging.warning( + f"Multiple checkpoints {last_checkpoints} matches *last.ckpt. Selecting one with the latest modified time." 
+ ) else: checkpoint = last_checkpoints[0] diff --git a/nemo/utils/apex_utils.py b/nemo/utils/apex_utils.py deleted file mode 100644 index b3b57a175287..000000000000 --- a/nemo/utils/apex_utils.py +++ /dev/null @@ -1,25 +0,0 @@ -import warnings -from typing import List, Optional - -import torch - - -def _reconfigure_microbatch_calculator( - rank: int, - rampup_batch_size: Optional[List[int]], - global_batch_size: int, - micro_batch_size: int, - data_parallel_size: int, -) -> None: - - import megatron.core.num_microbatches_calculator as mb_calculator - - mb_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = mb_calculator.build_num_microbatches_calculator( - rank, rampup_batch_size, global_batch_size, micro_batch_size, data_parallel_size - ) - - -def get_micro_batch_size(): - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR - - return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.micro_batch_size diff --git a/nemo/utils/callbacks/cuda_graph.py b/nemo/utils/callbacks/cuda_graph.py index ec0650a90e7d..c78196934108 100644 --- a/nemo/utils/callbacks/cuda_graph.py +++ b/nemo/utils/callbacks/cuda_graph.py @@ -139,25 +139,6 @@ def to_tensor(self, value, name): return value -def register_key(self, key, meta, value): - # PyTorch Lightning creates all metrics on GPU, but creating the metric on - # its input device is prefered. - # Refer to: https://github.com/Lightning-AI/pytorch-lightning/blob/2.0.7/src/lightning/pytorch/trainer/connectors/logger_connector/result.py#L409 - metric = _ResultMetric(meta, isinstance(value, torch.Tensor)) - device = value.device if isinstance(value, torch.Tensor) else self.device - metric = metric.to(device) - self[key] = metric - - -def update_metrics(self, key, value, batch_size): - # PyTorch Lightning always move all metrics to GPU, but moving the metric to - # its input device is prefered. 
- result_metric = self[key] - device = value.device if isinstance(value, torch.Tensor) else self.device - result_metric.forward(value.to(device), batch_size) - result_metric.has_reset = False - - def get_optimizer_step(state): def optimizer_step( self, @@ -374,10 +355,6 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") # Use smart metrics to avoid syncs LightningModule.__orig_to_tensor__ = LightningModule._LightningModule__to_tensor LightningModule._LightningModule__to_tensor = to_tensor - _ResultCollection.__orig_register_key__ = _ResultCollection.register_key - _ResultCollection.register_key = register_key - _ResultCollection.__orig_update_metrics__ = _ResultCollection.update_metrics - _ResultCollection.update_metrics = update_metrics # Save model outputs to static buffer for PL states reconstruct pl_module.__orig_training_step__ = pl_module.training_step @@ -409,10 +386,6 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - LightningModule._LightningModule__to_tensor = LightningModule.__orig_to_tensor__ del LightningModule.__orig_to_tensor__ - _ResultCollection.register_key = _ResultCollection.__orig_register_key__ - del _ResultCollection.__orig_register_key__ - _ResultCollection.update_metrics = _ResultCollection.__orig_update_metrics__ - del _ResultCollection.__orig_update_metrics__ pl_module.training_step = pl_module.__orig_training_step__ del pl_module.__orig_training_step__ diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py index 9348779051bb..437c8b0c5887 100644 --- a/nemo/utils/callbacks/dist_ckpt_io.py +++ b/nemo/utils/callbacks/dist_ckpt_io.py @@ -323,6 +323,10 @@ def load_checkpoint( logging.info(f'Using {sharded_strategy} dist-ckpt load strategy.') if isinstance(strict, bool): + # For backward-compatibility reasons and a bug in MCore (strict check not applied to factories) + # we must apply a simple strict check here. + if not strict: + sharded_state_dict = self.adjust_non_strict_load(path, sharded_state_dict) strict = StrictHandling.ASSUME_OK_UNEXPECTED if strict else StrictHandling.LOG_ALL if self.load_strictness is not None: # Overwrites function argument @@ -331,6 +335,8 @@ def load_checkpoint( # Default behavior strict = StrictHandling.ASSUME_OK_UNEXPECTED + logging.debug(f'Dist ckpt load strictness: {strict}') + return dist_checkpointing.load( sharded_state_dict=sharded_state_dict, checkpoint_dir=path, @@ -339,6 +345,30 @@ def load_checkpoint( strict=strict, ) + def adjust_non_strict_load(self, path: _PATH, sharded_state_dict: Dict[str, Any]): + ckpt_sharded_metadata = dist_checkpointing.load_tensors_metadata(path) + loaded_keys = [] + missing_keys = [] + unexpected_keys = [] + + def should_remove_missing_sharded_base(x: Any): + if isinstance(x, ShardedBase): + if x.key in ckpt_sharded_metadata: + loaded_keys.append(x.key) + return False + else: + unexpected_keys.append(x.key) + return True + return False + + _, sharded_state_dict = extract_matching_values(sharded_state_dict, should_remove_missing_sharded_base) + logging.info(f'The following keys are not in the checkpoint and will not be loaded: {unexpected_keys}') + + # TODO: compute missing_keys by: + # 1. all_gather_object of loaded_keys + # 2. missing_keys = ckpt_sharded_metadata.keys() - loaded_keys + return sharded_state_dict + @_debug_time('DistributedCheckpointIO.remove_checkpoint') def remove_checkpoint(self, path: _PATH) -> None: """Remove a distributed checkpoint. 
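On the non-strict load path added above, sharded entries whose keys are absent from the checkpoint are filtered out (and reported) before `dist_checkpointing.load` runs. A small illustrative sketch of inspecting the same metadata that `adjust_non_strict_load` consults; the checkpoint path is a placeholder:

```python
from megatron.core import dist_checkpointing

ckpt_dir = "/results/checkpoints/step=1000-last"  # placeholder: a torch_dist/zarr checkpoint dir

# load_tensors_metadata returns the sharded-tensor metadata saved with the checkpoint;
# any requested key missing from it is exactly what the non-strict branch above drops.
sharded_metadata = dist_checkpointing.load_tensors_metadata(ckpt_dir)

for key in sorted(sharded_metadata):
    print(key)
```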
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index f4bfb8ec95c4..ca18b22c00bc 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -165,6 +165,7 @@ class FaultToleranceParams: initial_rank_heartbeat_timeout: Optional[float] = 60.0 * 60.0 rank_heartbeat_timeout: Optional[float] = 45.0 * 60.0 calculate_timeouts: bool = True + safety_factor: float = 5.0 rank_termination_signal: signal.Signals = signal.SIGKILL log_level: str = 'INFO' max_rank_restarts: int = 0 @@ -229,6 +230,8 @@ class ExpManagerConfig: # Fault tolrance create_fault_tolerance_callback: Optional[bool] = False fault_tolerance: Optional[FaultToleranceParams] = field(default_factory=FaultToleranceParams) + # logs TFLOPs per sec per gpu + log_tflops_per_sec_per_gpu: Optional[bool] = True class TimingCallback(Callback): @@ -558,7 +561,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo if HAVE_STRAGGLER_DET: logging.info("Enabling straggler detection...") straggler_det_args_dict = dict(cfg.straggler_detection_params) - straggler_det_callback = StragglerDetectionCallback(**straggler_det_args_dict, logger=logging) + straggler_det_callback = StragglerDetectionCallback(**straggler_det_args_dict) trainer.callbacks.append(straggler_det_callback) else: raise ValueError( @@ -573,6 +576,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo # here we only need to know if the autoresume is enabled. ft_use_autoresume = ft_params.max_subsequent_job_failures > 0 fault_tol_callback = FaultToleranceCallback( + exp_dir=Path(log_dir).parent, # log_dir is "/results/" autoresume=ft_use_autoresume, calculate_timeouts=ft_params.calculate_timeouts, simulated_fault_params=ft_params.simulated_fault, @@ -583,6 +587,11 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo 'FaultToleranceCallback was enabled with create_fault_tolerance_callback, but fault_tolerance package is not installed.' ) + if cfg.log_tflops_per_sec_per_gpu: + logging.info( + "TFLOPs per sec per GPU will be calculated, conditioned on supported models. Defaults to -1 upon failure." 
+ ) + if is_global_rank_zero(): # Move files_to_copy to folder and add git information if present if cfg.files_to_copy: diff --git a/nemo/utils/trainer_utils.py b/nemo/utils/trainer_utils.py new file mode 100644 index 000000000000..790ccb819069 --- /dev/null +++ b/nemo/utils/trainer_utils.py @@ -0,0 +1,20 @@ +from typing import Mapping + +_HAS_HYDRA = True + +try: + import hydra + from omegaconf import DictConfig, OmegaConf +except ModuleNotFoundError: + DictConfig = Mapping + OmegaConf = None + _HAS_HYDRA = False + + +def resolve_trainer_cfg(trainer_cfg: DictConfig) -> DictConfig: + trainer_cfg = OmegaConf.to_container(trainer_cfg, resolve=True) + if not _HAS_HYDRA: + return trainer_cfg + if (strategy := trainer_cfg.get("strategy", None)) is not None and isinstance(strategy, Mapping): + trainer_cfg["strategy"] = hydra.utils.instantiate(strategy) + return trainer_cfg diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index 7745f5326047..3da063dacf1b 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -5,7 +5,7 @@ g2p_en jiwer kaldi-python-io kaldiio -lhotse>=1.24.2 +lhotse>=1.26.0 librosa>=0.10.0 marshmallow packaging diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index 1fdce2c160d9..b7e6119fd7b7 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -6,7 +6,7 @@ einops_exts imageio kornia nerfacc>=0.5.3 -open_clip_torch +open_clip_torch==2.24.0 PyMCubes taming-transformers torchdiffeq diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt index 2a7a8972beaf..414e05078680 100644 --- a/requirements/requirements_vllm.txt +++ b/requirements/requirements_vllm.txt @@ -1 +1 @@ -vllm==0.5.1 +vllm==0.5.3.post1 diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py index a81fd33f47a2..14baca53f165 100644 --- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py @@ -15,12 +15,11 @@ """ Example to run this conversion script: ``` - python convert_bert_hf_to_nemo.py \ - --input_name_or_path "thenlper/gte-large" \ + python /opt/NeMo/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py \ + --input_name_or_path /path/to/hf/checkpoints/folder \ --output_path /path/to/output/nemo/file.nemo \ --mcore True \ - --post_process False \ - --precision 32 + --precision bf16 ``` """ @@ -37,7 +36,10 @@ def adjust_nemo_config(model_config, ref_config, mcore_bert=True): - model_config.tokenizer["type"] = "intfloat/e5-large-unsupervised" # ref_config["_input_name_or_path"] + model_config.tokenizer["type"] = ref_config["_name_or_path"] + model_config.tokenizer["library"] = "huggingface" + model_config.tokenizer["use_fast"] = True + model_config["max_position_embeddings"] = ref_config['max_position_embeddings'] model_config["num_layers"] = ref_config["num_hidden_layers"] model_config["hidden_size"] = ref_config["hidden_size"] model_config["ffn_hidden_size"] = ref_config["intermediate_size"] @@ -67,7 +69,7 @@ def get_args(): "--post_process", type=bool, default=False, required=False, help="Whether to have the postprocessing modules" ) parser.add_argument( - "--precision", type=str, default="32", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" ) args = 
parser.parse_args() @@ -86,7 +88,12 @@ def convert(args): model = MegatronBertModel(nemo_config.model, trainer) if not args.post_process: - model.model.lm_head, model.model.encoder.final_layernorm, model.model.binary_head, model.model.output_layer = ( + ( + model.model.module.lm_head, + model.model.module.encoder.final_layernorm, + model.model.module.binary_head, + model.model.module.output_layer, + ) = ( None, None, None, @@ -263,6 +270,16 @@ def convert(args): else: nemo_state_dict['model.language_model.embedding.word_embeddings.weight'] = padded_embedding + modified_dict = {} + for key, value in nemo_state_dict.items(): + if key.startswith('model.'): + new_key = 'model.module.' + key[len('model.') :] + modified_dict[new_key] = value + else: + modified_dict[key] = value + + nemo_state_dict = modified_dict + model.load_state_dict(nemo_state_dict, strict=True) dtype = torch_dtype_from_precision(args.precision) model = model.to(dtype=dtype) @@ -271,5 +288,6 @@ def convert(args): if __name__ == '__main__': + os.environ['NVTE_FLASH_ATTN'] = '0' # Bert doesn't support FLASH_ATTN args = get_args() convert(args) diff --git a/scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py new file mode 100644 index 000000000000..fb296cf25c68 --- /dev/null +++ b/scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py @@ -0,0 +1,311 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Requires HF transformers updated to v4.42 to support Gemma 2 Models + + huggingface-cli login + >>> from huggingface_hub import snapshot_download + >>> snapshot_download(repo_id="google/gemma-2-9b", local_dir="/path/to/gemma2/checkpoints/hf/9b") + + python3 /opt/NeMo/scripts/checkpoint_converters/convert_gemma2_hf_to_nemo.py \ + --input_name_or_path /path/to/gemma2/checkpoints/hf/9b \ + --output_path /path/to/gemma2-9b.nemo \ + --tokenizer_path /path/to/gemma2/checkpoints/hf/9b/tokenizer.model + [--cpu] + +If you encounter a torch.cuda.OutOfMemoryError, try converting on CPU with --cpu. 
+""" + +import os +from argparse import ArgumentParser + +import torch + +from megatron.core import parallel_state +from omegaconf import OmegaConf +from transformers import AutoModelForCausalLM, AutoTokenizer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + + +def create_rename_keys(num_hidden_layers): + rename_keys = [] + for i in range(num_hidden_layers): + # Attention layers + rename_keys.extend( + [ + ( + f"model.layers.{i}.self_attn.o_proj.weight", + f"model.decoder.layers.{i}.self_attention.linear_proj.weight", + ), + ( + f"model.layers.{i}.self_attn.q_proj.weight", + f"model.decoder.layers.{i}.self_attention.linear_q.weight", + ), + ( + f"model.layers.{i}.self_attn.k_proj.weight", + f"model.decoder.layers.{i}.self_attention.linear_k.weight", + ), + ( + f"model.layers.{i}.self_attn.v_proj.weight", + f"model.decoder.layers.{i}.self_attention.linear_v.weight", + ), + # MLP and LayerNorm + (f"model.layers.{i}.mlp.gate_proj.weight", f"model.decoder.layers.{i}.mlp.linear_fc1_gate.weight"), + (f"model.layers.{i}.mlp.up_proj.weight", f"model.decoder.layers.{i}.mlp.linear_fc1_proj.weight"), + (f"model.layers.{i}.mlp.down_proj.weight", f"model.decoder.layers.{i}.mlp.linear_fc2.weight"), + ( + f"model.layers.{i}.input_layernorm.weight", + f"model.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_weight", + ), + ( + f"model.layers.{i}.pre_feedforward_layernorm.weight", + f"model.decoder.layers.{i}.mlp.linear_fc1.layer_norm_weight", + ), + ( + f"model.layers.{i}.post_attention_layernorm.weight", + f"model.decoder.layers.{i}.self_attention.linear_proj.post_layernorm.weight", + ), + ( + f"model.layers.{i}.post_feedforward_layernorm.weight", + f"model.decoder.layers.{i}.mlp.linear_fc2.post_layernorm.weight", + ), + ] + ) + + # Non layer dependent keys + rename_keys.extend( + [ + ("model.embed_tokens.weight", "model.embedding.word_embeddings.weight"), + ("model.norm.weight", "model.decoder.final_layernorm.weight"), + ] + ) + + return rename_keys + + +def rename_model_keys(model_state_dict, rename_keys): + """ + Rename keys in the model's state dictionary based on the provided mappings. + + Parameters: + model_state_dict (dict): The state dictionary of the model. + rename_keys (list): A list of tuples with the mapping (old_key, new_key). + + Returns: + dict: A new state dictionary with updated key names. + """ + + # Create a new state dictionary with updated key names + new_state_dict = {} + + # Track keys from the original state dict to ensure all are processed + remaining_keys = set(model_state_dict.keys()) + + # Iterate over the rename mappings + for old_key, new_key in rename_keys: + if old_key in model_state_dict: + # Rename the key and remove it from the tracking set + new_state_dict[new_key] = model_state_dict[old_key] + remaining_keys.remove(old_key) + + # Check if any keys were not converted from old to new + for old_key in remaining_keys: + print(f"Warning: Key '{old_key}' was not converted.") + + return new_state_dict + + +def adjust_tensor_shapes(model, nemo_state_dict): + """ + Adapt tensor shapes in the state dictionary to ensure compatibility with a different model structure. 
+ + Parameters: + nemo_state_dict (dict): The state dictionary of the model. + + Returns: + dict: The updated state dictionary with modified tensor shapes for compatibility. + """ + model_config = model.cfg + num_query_groups = model_config["num_query_groups"] + head_num = model_config["num_attention_heads"] + hidden_size = model_config["hidden_size"] + head_size = model_config["kv_channels"] + heads_per_group = head_num // num_query_groups + + # Note: For 'key' and 'value' weight and biases, NeMo uses a consolidated tensor 'query_key_value'. + for key_ in list(nemo_state_dict.keys()): + if 'mlp.linear_fc1_gate.weight' in key_: + key_gate = key_ + key_proj = key_.replace('mlp.linear_fc1_gate.weight', 'mlp.linear_fc1_proj.weight') + new_key = key_.replace('mlp.linear_fc1_gate.weight', 'mlp.linear_fc1.weight') + gate_weight = nemo_state_dict[key_gate] + proj_weight = nemo_state_dict[key_proj] + nemo_state_dict[new_key] = torch.cat((gate_weight, proj_weight)) + if 'layernorm.weight' in key_ or 'layer_norm_weight' in key_: + nemo_state_dict[key_] = nemo_state_dict[key_] + if 'self_attention.linear_q.weight' in key_: + key_q = key_ + key_k = key_.replace('linear_q', 'linear_k') + key_v = key_.replace('linear_q', 'linear_v') + key_qkv = key_.replace('linear_q', 'linear_qkv') + + # [(head_num + 2 * num_query_groups) * head_size, hidden_size] + # -> [head_num, head_size, hidden_size], 2 * [num_query_groups, head_size, hidden_size] + q_weight, k_weight, v_weight = nemo_state_dict[key_q], nemo_state_dict[key_k], nemo_state_dict[key_v] + q_weight = q_weight.reshape(head_num, head_size, hidden_size) + k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size) + v_weight = v_weight.reshape(num_query_groups, head_size, hidden_size) + + qkv_weight = torch.empty((0, head_size, hidden_size), device=q_weight.device) + for i in range(num_query_groups): + qkv_weight = torch.cat((qkv_weight, q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weight = torch.cat((qkv_weight, k_weight[i : i + 1, :, :])) + qkv_weight = torch.cat((qkv_weight, v_weight[i : i + 1, :, :])) + qkv_weight = qkv_weight.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + nemo_state_dict[key_qkv] = qkv_weight + del nemo_state_dict[key_q], nemo_state_dict[key_k], nemo_state_dict[key_v] + + return nemo_state_dict + + +def adjust_nemo_config(model_config, ref_config): + model_config["encoder_seq_length"] = ref_config["max_position_embeddings"] + model_config["num_layers"] = ref_config["num_hidden_layers"] + model_config["ffn_hidden_size"] = ref_config["intermediate_size"] + model_config["hidden_size"] = ref_config["hidden_size"] + model_config["num_attention_heads"] = ref_config["num_attention_heads"] + model_config["num_query_groups"] = ref_config["num_key_value_heads"] + model_config["kv_channels"] = ref_config["head_dim"] + model_config["layernorm_epsilon"] = ref_config["rms_norm_eps"] + model_config["window_size"] = (ref_config["sliding_window_size"], 0) + model_config["layernorm_zero_centered_gamma"] = True + model_config["name"] = 'megatron_gemma2' + model_config['mcore_customization_config'] = { + "attn_logit_softcapping": ref_config["attn_logit_softcapping"], + "final_logit_softcapping": ref_config["final_logit_softcapping"], + "query_pre_attn_scalar": ref_config["query_pre_attn_scalar"], + } + return model_config + + +def get_args(): + parser = ArgumentParser() + parser.add_argument("--input_name_or_path", type=str) + parser.add_argument("--tokenizer_path", type=str) + 
parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join( + os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_gemma2_config.yaml' + ), + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, help="Path to output .nemo file.") + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weight saved" + ) + parser.add_argument("--run_verification", action="store_true") + parser.add_argument("--cpu", action="store_true") + + args = parser.parse_args() + return args + + +def verify(nemo_model, hf_tokenizer, hf_model): + # Verifications + input_texts = [ + 'query: how much protein should a female eat', + ] + logging.info(f"Running verifications {input_texts} ...") + + # Tokenize the input texts + hf_tokenizer.pad_token = hf_tokenizer.eos_token + batch_dict = hf_tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt') + batch_dict_cuda = {k: v.cuda() for k, v in batch_dict.items()} + hf_model = hf_model.cuda().eval() + nemo_model = nemo_model.eval() + + hf_outputs = hf_model(**batch_dict_cuda, output_hidden_states=True) + + parallel_state._set_global_memory_buffer() + ids = batch_dict_cuda['input_ids'] + + id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids.cpu()] + + masks_and_position_ids = [ + get_ltor_masks_and_position_ids(id_tensor, hf_tokenizer.eos_token, False, False, False) + for id_tensor in id_tensors + ] + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + outputs = nemo_model( + tokens=tokens.cuda(), text_position_ids=pos_ids.cuda(), attention_mask=attn_mask.cuda(), labels=None + ) + + hf_next_token = hf_outputs.logits[0, -1].argmax() + next_token = outputs.squeeze()[-1].argmax() + + logging.info(f"HF predicted next token is: '{hf_tokenizer._convert_id_to_token(hf_next_token)}'.") + logging.info(f"NeMo predicted next token is: '{hf_tokenizer._convert_id_to_token(next_token)}'.") + assert ( + hf_next_token == next_token + ), f'prediction mismatch: {hf_tokenizer.decode(hf_next_token)} != {hf_tokenizer.decode(next_token)}' + + +def convert(args): + logging.info(f"Loading checkpoint from HF Gemma 2: `{args.input_name_or_path}`") + hf_tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path) + hf_model = AutoModelForCausalLM.from_pretrained(args.input_name_or_path) + logging.info("HF Model loading done.") + + nemo_config = OmegaConf.load(args.hparams_file) + nemo_config.model = adjust_nemo_config(nemo_config.model, hf_model.config.__dict__) + nemo_config.model.tokenizer["model"] = args.tokenizer_path + + nemo_config.trainer["precision"] = args.precision + if args.cpu: + nemo_config.model['use_cpu_initialization'] = True + nemo_config.trainer['accelerator'] = 'cpu' + trainer = MegatronTrainerBuilder(nemo_config).create_trainer() + model = MegatronGPTModel(nemo_config.model, trainer) + + rename_keys = create_rename_keys(nemo_config.model.num_layers) + old_state_dict = hf_model.state_dict() + new_state_dict = rename_model_keys(model_state_dict=old_state_dict, rename_keys=rename_keys) + + nemo_state_dict = adjust_tensor_shapes(model, new_state_dict) + model.load_state_dict(nemo_state_dict, 
strict=False) + + if args.run_verification and not args.cpu: + logging.info(f'=' * 100) + verify(model, hf_tokenizer, hf_model) + logging.info(f'=' * 100) + + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) + model.cfg.use_cpu_initialization = False + model.save_to(args.output_path) + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) diff --git a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py index 1f8c69b5b240..35039f8d02e9 100644 --- a/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py +++ b/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py @@ -62,7 +62,7 @@ def get_args(): help="Path to output mcore weights file (ends in .nemo).", ) parser.add_argument( - "--cpu-only", + "--cpu_only", action="store_true", help="Load model in cpu only. Useful if the model cannot fit in GPU memory, " "but this option makes the conversion script significantly slower.", @@ -73,7 +73,7 @@ def get_args(): help="Run conversion again and overwrite output file when the output file already exists", ) parser.add_argument( - "--ignore-if-missing", + "--ignore_if_missing", default="rotary_pos_emb.inv_freq", help="comma-separated list of state_dict keys that are known to be missing in mcore and can be safely ignored", ) @@ -158,8 +158,8 @@ def build_key_mapping(nemo_cfg): for wb in ('weight', 'bias') if has_layernorm_bias else ('weight',): mcore_to_nemo_mapping.update( { - f"{mcore_prefix}.{i}.self_attention.linear_qkv.layer_norm_{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}", - f"{mcore_prefix}.{i}.mlp.linear_fc1.layer_norm_{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}", + f"{mcore_prefix}.{i}.input_layernorm.{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}", + f"{mcore_prefix}.{i}.pre_mlp_layernorm.{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}", } ) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py index e1dc00c77439..4eb8cb6330ca 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py @@ -18,6 +18,8 @@ python convert_llama_hf_to_nemo.py \ --input_name_or_path \ --output_path + --precision bf16 \ + --llama31 True """ import os @@ -44,7 +46,11 @@ def get_args(): parser = ArgumentParser() parser.add_argument( - "--input_name_or_path", type=str, default=None, required=True, help="Path to Huggingface LLaMA checkpoints", + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to Huggingface LLaMA checkpoints", ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") parser.add_argument( @@ -56,6 +62,13 @@ def get_args(): required=False, help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) + parser.add_argument( + "--llama31", + type=bool, + default=True, + required=False, + help="Whether the model is from LLaMa 3.1 family. 
LLaMa 3.1 enables scaling for RoPE frequencies.", + ) parser.add_argument("--precision", type=str, default="16", help="Model precision") args = parser.parse_args() return args @@ -92,7 +105,10 @@ def load_config(args, llama_config): nemo_config.tokenizer = tokenizer_dict if llama_config['rope_scaling'] is not None: - if llama_config['rope_scaling']['type'] == 'linear': + rope_type = llama_config['rope_scaling'].get('rope_type') + if rope_type is None: + rope_type = llama_config['rope_scaling'].get('type') + if rope_type in ('linear', 'llama3'): nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor'] else: raise ValueError("Only linear rope scaling type is supported now") @@ -103,6 +119,7 @@ def load_config(args, llama_config): while llama_config['vocab_size'] % base != 0: base //= 2 nemo_config.make_vocab_size_divisible_by = base + nemo_config.scale_positional_embedding = args.llama31 return nemo_config @@ -139,7 +156,7 @@ def convert(args): scaler = None if precision in [16, '16', '16-mixed']: scaler = GradScaler( - init_scale=nemo_config.get('native_amp_init_scale', 2 ** 32), + init_scale=nemo_config.get('native_amp_init_scale', 2**32), growth_interval=nemo_config.get('native_amp_growth_interval', 1000), hysteresis=nemo_config.get('hysteresis', 2), ) @@ -154,6 +171,7 @@ def convert(args): plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) nemo_config.precision = precision + nemo_config.micro_batch_size = 1 print(f"nemo_config: {nemo_config}") # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together. @@ -291,12 +309,22 @@ def convert(args): # We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path if 'tokenizer_model' not in hf_config: - if hf_config['num_hidden_layers'] == 32: - model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B') - elif hf_config['num_hidden_layers'] == 80: - model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B') + if args.llama31: + if hf_config['num_hidden_layers'] == 32: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-70B') + elif hf_config['num_hidden_layers'] == 126: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') # 405B tokenizer is the same as 8B + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") else: - logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") + if hf_config['num_hidden_layers'] == 32: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B') + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") # cast to target precision and disable cpu init dtype = torch_dtype_from_precision(precision) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py new file mode 100644 index 000000000000..f395e34765d0 --- /dev/null +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_load.py @@ -0,0 +1,286 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
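For reference, the RoPE-scaling handling added to the Llama converter above accepts both the newer `rope_type` field and the legacy `type` field of the HF `rope_scaling` config, and treats `linear` and `llama3` scaling alike when deriving `seq_len_interpolation_factor`. A hedged, stand-alone sketch of that normalization (dict layout follows the HF `config.json` convention; no NeMo imports):

```python
from typing import Optional

def resolve_rope_scaling(hf_config: dict) -> Optional[float]:
    """Return the RoPE interpolation factor, or None when no scaling is configured.

    Reads 'rope_type' with a fallback to the older 'type' key and only accepts
    'linear' and 'llama3', matching the check in the converter above.
    """
    scaling = hf_config.get("rope_scaling")
    if scaling is None:
        return None
    rope_type = scaling.get("rope_type") or scaling.get("type")
    if rope_type not in ("linear", "llama3"):
        raise ValueError(f"Only linear/llama3 rope scaling is supported, got: {rope_type}")
    return scaling["factor"]

# Llama 3.1 style snippet (illustrative values).
print(resolve_rope_scaling({"rope_scaling": {"rope_type": "llama3", "factor": 8.0}}))  # 8.0
```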
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. + Example to run this conversion script: + python convert_llama_hf_to_nemo.py \ + --input_name_or_path \ + --input_state_dict \ + --output_path \ + --precision bf16 + --llama31 True +""" + +import os +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer +from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper, torch_dtype_from_precision +from nemo.utils import logging + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to Huggingface LLaMA checkpoints", + ) + parser.add_argument( + "--input_state_dict", + type=str, + default=None, + required=True, + help="Path to Huggingface LLaMA checkpoints", + ) + + parser.add_argument( + "--llama31", + type=bool, + default=True, + required=False, + help="Apply scaling for RoPE frequencies", + ) + + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join( + os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml' + ), + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--precision", type=str, default="16", help="Model precision") + args = parser.parse_args() + return args + + +def load_config(args, llama_config): + nemo_config = OmegaConf.load(args.hparams_file).model + + if llama_config.get('rope_theta', None): + nemo_config['rotary_base'] = llama_config['rope_theta'] + nemo_config.encoder_seq_length = llama_config['max_position_embeddings'] + nemo_config.num_layers = int(llama_config['num_hidden_layers']) + nemo_config.hidden_size = llama_config['hidden_size'] + nemo_config.ffn_hidden_size = llama_config['intermediate_size'] + nemo_config.num_attention_heads = llama_config['num_attention_heads'] + nemo_config.max_position_embeddings = llama_config['max_position_embeddings'] + nemo_config.init_method_std = llama_config['initializer_range'] + nemo_config.layernorm_epsilon = llama_config['rms_norm_eps'] + if 'num_key_value_heads' in llama_config: + nemo_config.num_query_groups = llama_config['num_key_value_heads'] + nemo_config.use_cpu_initialization = True + nemo_config.activation = 'fast-swiglu' + nemo_config.megatron_amp_O2 = True # True + nemo_config.scale_positional_embedding = args.llama31 + + # Tokenizer config + if 'tokenizer_model' in llama_config: + nemo_config.tokenizer.model = llama_config['tokenizer_model'] + else: + # Llama3 uses converted TikToken Tokenizer + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + } + nemo_config.tokenizer = tokenizer_dict + + if llama_config['rope_scaling'] is not None: + if llama_config['rope_scaling']['type'] == 'linear': + nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor'] + else: + raise ValueError("Only linear rope scaling type is supported now") + if llama_config['rope_theta'] is not None: + nemo_config['rotary_base'] = llama_config['rope_theta'] + + base = 128 + while llama_config['vocab_size'] % base != 0: + base //= 2 + nemo_config.make_vocab_size_divisible_by = base + + return nemo_config + + +def convert(args): + logging.info(f"loading checkpoint {args.input_name_or_path}") + import torch + + model = LlamaForCausalLM.from_pretrained( + args.input_name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True + ) + hf_config = vars(model.config) + if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'): + tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path) + hf_config['tokenizer_model'] = str(tokenizer.vocab_file) + else: + tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path) + print(f"hf_config: {hf_config}") + print("named parameters:") + for name, param in model.named_parameters(): + print(f"- {name}") + + nemo_config = load_config(args, hf_config) + + if args.precision in ["32", "16"]: + precision = int(float(args.precision)) + elif args.precision in ["bf16", "bf16-mixed"]: + if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + precision = args.precision + else: + logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") + precision = args.precision[2:] # prune bf in string + else: + precision = args.precision + + plugins = [] + if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=nemo_config.get('native_amp_init_scale', 2**32), + growth_interval=nemo_config.get('native_amp_growth_interval', 1000), + hysteresis=nemo_config.get('hysteresis', 2), + ) + # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + + if nemo_config.get('megatron_amp_O2', False): + print('HALF PRECISION') + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + + nemo_config.precision = precision + nemo_config.micro_batch_size = 1 + print(f"nemo_config: {nemo_config}") + + # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together. + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) + + hidden_size = hf_config["hidden_size"] + head_num = hf_config["num_attention_heads"] + head_size = hidden_size // head_num + num_layers = hf_config["num_hidden_layers"] + + mcore_gpt = nemo_config.mcore_gpt + + assert mcore_gpt == nemo_config.get( + 'transformer_engine', False + ), "mcore_gpt transformer_engine must be enabled (or disabled) together." + + param_to_weights = lambda param: param.float() + + print('start init model') + del model + import time + + st = time.perf_counter() + model = MegatronGPTModel(nemo_config, trainer) + print(f'Model init took {time.perf_counter() - st} sec') + from functools import reduce + from glob import glob + + weights = glob(f'{args.input_state_dict}/*.pt') + st = time.perf_counter() + for weight_file in sorted(weights): + filename = os.path.basename(weight_file) + str_list = filename.split('.') + weight_type = str_list[-2] + str_name = '.'.join(str_list[1:-1]) + print(f'-- Assign weight_type={weight_type} to {str_name}') + if nemo_config.get('megatron_amp_O2', False): + current = reduce(getattr, [model, 'model', 'module'] + str_list[:-2]) + else: + current = reduce(getattr, [model, 'model'] + str_list[:-2]) + load = torch.load(weight_file) + if nemo_config.get('megatron_amp_O2', False): + if precision == 'bf16': + target_precision = torch.bfloat16 + elif precision == 16: + target_precision = torch.float16 + load = load.to(target_precision) + + if weight_type == 'weight': + assert current.weight.shape == load.shape + assert current.weight.dtype == load.dtype + current.weight = torch.nn.Parameter(load) + assert current.weight.norm() == load.norm() + elif weight_type == 'layer_norm_weight': + assert current.layer_norm_weight.dtype == load.dtype + assert current.layer_norm_weight.shape == load.shape + current.layer_norm_weight = torch.nn.Parameter(load) + assert current.layer_norm_weight.norm() == load.norm() + else: + raise ValueError(f'Unsupported weight type = {weight_type}') + del load + + print(f'Finish loading model in {time.perf_counter() - st} sec. 
Start to save model') + st = time.perf_counter() + print(f'Model save took {time.perf_counter() - st} sec.') + + model._save_restore_connector = NLPSaveRestoreConnector() + + # We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path + if 'tokenizer_model' not in hf_config: + if args.llama31: + if hf_config['num_hidden_layers'] == 32: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-70B') + elif hf_config['num_hidden_layers'] == 126: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3.1-8B') # 405B tokenizer is the same as 8B + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") + else: + if hf_config['num_hidden_layers'] == 32: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B') + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") + + # cast to target precision and disable cpu init + dtype = torch_dtype_from_precision(precision) + model = model.to(dtype=dtype) + model.cfg.use_cpu_initialization = False + + model.save_to(args.output_path) + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py new file mode 100644 index 000000000000..940a9df5f9a8 --- /dev/null +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo_save_dict.py @@ -0,0 +1,321 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Conversion script to convert Huggingface LLaMA checkpoints into nemo checkpoint. 
+ Example to run this conversion script: + python convert_llama_hf_to_nemo.py \ + --input_name_or_path \ + --output_path + --precision bf16 + --apply_rope_scaling True +""" + +import os +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer +from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper, torch_dtype_from_precision +from nemo.utils import logging + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to Huggingface LLaMA checkpoints", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output to dict dir") + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join( + os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml' + ), + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument( + "--apply_rope_scaling", + type=bool, + default=True, + required=False, + help="Apply scaling for RoPE frequencies", + ) + parser.add_argument("--precision", type=str, default="16", help="Model precision") + args = parser.parse_args() + return args + + +def load_config(args, llama_config): + nemo_config = OmegaConf.load(args.hparams_file).model + + if llama_config.get('rope_theta', None): + nemo_config['rotary_base'] = llama_config['rope_theta'] + nemo_config.encoder_seq_length = llama_config['max_position_embeddings'] + nemo_config.num_layers = int(llama_config['num_hidden_layers']) + nemo_config.hidden_size = llama_config['hidden_size'] + nemo_config.ffn_hidden_size = llama_config['intermediate_size'] + nemo_config.num_attention_heads = llama_config['num_attention_heads'] + nemo_config.max_position_embeddings = llama_config['max_position_embeddings'] + nemo_config.init_method_std = llama_config['initializer_range'] + nemo_config.layernorm_epsilon = llama_config['rms_norm_eps'] + if 'num_key_value_heads' in llama_config: + nemo_config.num_query_groups = llama_config['num_key_value_heads'] + nemo_config.use_cpu_initialization = True + nemo_config.activation = 'fast-swiglu' + nemo_config.megatron_amp_O2 = True + + # Tokenizer config + if 'tokenizer_model' in llama_config: + nemo_config.tokenizer.model = llama_config['tokenizer_model'] + else: + # Llama3 uses converted TikToken Tokenizer + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + } + nemo_config.tokenizer = tokenizer_dict + + if llama_config['rope_scaling'] is not None: + if llama_config['rope_scaling']['type'] == 'linear': + nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor'] + else: + raise ValueError("Only linear rope scaling type is supported now") + if llama_config['rope_theta'] is not None: + nemo_config['rotary_base'] = llama_config['rope_theta'] + + base = 128 + while 
llama_config['vocab_size'] % base != 0: + base //= 2 + nemo_config.make_vocab_size_divisible_by = base + + return nemo_config + + +def convert(args): + logging.info(f"loading checkpoint {args.input_name_or_path}") + import torch + + model = LlamaForCausalLM.from_pretrained( + args.input_name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True + ) + hf_config = vars(model.config) + if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'): + tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path) + hf_config['tokenizer_model'] = str(tokenizer.vocab_file) + else: + tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path) + + print("named parameters:") + for name, param in model.named_parameters(): + print(f"- {name}") + + nemo_config = load_config(args, hf_config) + nemo_config.scale_positional_embedding = args.apply_rope_scaling + + if args.precision in ["32", "16"]: + precision = int(float(args.precision)) + elif args.precision in ["bf16", "bf16-mixed"]: + if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + precision = args.precision + else: + logging.warning("BF16 is not supported on this device. Using FP16 instead.") + precision = args.precision[2:] # prune bf in string + else: + precision = args.precision + + plugins = [] + if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=nemo_config.get('native_amp_init_scale', 2**32), + growth_interval=nemo_config.get('native_amp_growth_interval', 1000), + hysteresis=nemo_config.get('hysteresis', 2), + ) + # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + + if nemo_config.get('megatron_amp_O2', False): + print('HALF PRECISION') + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + + nemo_config.precision = precision + print(f"nemo_config: {nemo_config}") + + # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together. + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) + + hidden_size = hf_config["hidden_size"] + head_num = hf_config["num_attention_heads"] + head_size = hidden_size // head_num + num_layers = hf_config["num_hidden_layers"] + + mcore_gpt = nemo_config.mcore_gpt + + assert mcore_gpt == nemo_config.get( + 'transformer_engine', False + ), "mcore_gpt transformer_engine must be enabled (or disabled) together." 
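The per-layer loop below (like the Gemma 2 converter above) builds NeMo's fused `linear_qkv` weight by interleaving Q, K and V rows group by group, which is the layout grouped-query attention expects on the MCore side. A compact restatement of that interleaving, with toy shapes only:

```python
import torch

def interleave_qkv(q, k, v, head_num, num_query_groups, head_size, hidden_size):
    """Merge separate Q/K/V projection weights into one interleaved QKV tensor.

    Rows are ordered per query group as [q_0..q_{heads_per_group-1}, k_g, v_g],
    reproducing the repeated torch.cat pattern used by the converters.
    """
    heads_per_group = head_num // num_query_groups
    q = q.reshape(head_num, head_size, hidden_size)
    k = k.reshape(num_query_groups, head_size, hidden_size)
    v = v.reshape(num_query_groups, head_size, hidden_size)
    parts = []
    for g in range(num_query_groups):
        parts.append(q[g * heads_per_group : (g + 1) * heads_per_group])
        parts.append(k[g : g + 1])
        parts.append(v[g : g + 1])
    qkv = torch.cat(parts, dim=0)
    return qkv.reshape(head_size * (head_num + 2 * num_query_groups), hidden_size)

# Tiny shape check (toy sizes, not a real model).
h, g, d, hid = 8, 2, 4, 32
qkv = interleave_qkv(torch.randn(h * d, hid), torch.randn(g * d, hid),
                     torch.randn(g * d, hid), h, g, d, hid)
assert qkv.shape == (d * (h + 2 * g), hid)
```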
+ + param_to_weights = lambda param: param.float() + + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + + embed_weight = model.state_dict()[f'model.embed_tokens.weight'] + if mcore_gpt: + embed_weights_base_name = f'model.embedding.word_embeddings.weight' + else: + embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight' + checkpoint['state_dict'][embed_weights_base_name] = param_to_weights(embed_weight) + + # in hf, this is defined as register_buffer(..., persistent=False) so it won't be in the state dict + if f'model.layers.0.self_attn.rotary_emb.inv_freq' in model.state_dict(): + rotary_embed_weight = model.state_dict()[f'model.layers.0.self_attn.rotary_emb.inv_freq'] + if mcore_gpt: + rotary_embed_weight_base_name = f'model.rotary_pos_emb.inv_freq' + else: + rotary_embed_weight_base_name = f'model.language_model.rotary_pos_emb.inv_freq' + checkpoint['state_dict'][rotary_embed_weight_base_name] = param_to_weights(rotary_embed_weight) + + if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: + num_query_groups = head_num + else: + num_query_groups = nemo_config.num_query_groups + assert head_num % num_query_groups == 0, 'head_num must be divisible by num_query_groups' + if mcore_gpt: + assert nemo_config.activation.startswith('fast-'), 'mcore only supports fast version of gated linear unit.' + + for l in range(int(num_layers)): + print(f"converting layer {l}") + old_tensor_shape = model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight'].size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + q = model.state_dict()[f'model.layers.{l}.self_attn.q_proj.weight'].view(*new_q_tensor_shape) + k = model.state_dict()[f'model.layers.{l}.self_attn.k_proj.weight'].view(*new_kv_tensor_shape) + v = model.state_dict()[f'model.layers.{l}.self_attn.v_proj.weight'].view(*new_kv_tensor_shape) + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) + heads_per_group = head_num // num_query_groups + for i in range(num_query_groups): + qkv_weights = torch.cat((qkv_weights, q[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weights = torch.cat((qkv_weights, k[i : i + 1, :, :])) + qkv_weights = torch.cat((qkv_weights, v[i : i + 1, :, :])) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + if mcore_gpt: + qkv_weights_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.weight' + else: + qkv_weights_base_name = f'model.language_model.encoder.layers.{l}.self_attention.query_key_value.weight' + checkpoint['state_dict'][qkv_weights_base_name] = param_to_weights(qkv_weights) + + # attention dense + o_weight = model.state_dict()[f'model.layers.{l}.self_attn.o_proj.weight'] + if mcore_gpt: + o_weight_base_name = f'model.decoder.layers.{l}.self_attention.linear_proj.weight' + else: + o_weight_base_name = f'model.language_model.encoder.layers.{l}.self_attention.dense.weight' + checkpoint['state_dict'][o_weight_base_name] = param_to_weights(o_weight) + + # MLP + mlp_down_weight = model.state_dict()[f'model.layers.{l}.mlp.gate_proj.weight'] + mlp_gate_weight = model.state_dict()[f'model.layers.{l}.mlp.up_proj.weight'] + if mcore_gpt: + mlp_down_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.weight' + else: + mlp_down_base_name = f'model.language_model.encoder.layers.{l}.mlp.dense_h_to_4h.weight' + mlp_down_weight = torch.cat((mlp_down_weight, 
mlp_gate_weight), axis=0) + checkpoint['state_dict'][mlp_down_base_name] = param_to_weights(mlp_down_weight) + + mlp_up_weight = model.state_dict()[f'model.layers.{l}.mlp.down_proj.weight'] + if mcore_gpt: + mlp_up_base_name = f'model.decoder.layers.{l}.mlp.linear_fc2.weight' + else: + mlp_up_base_name = f'model.language_model.encoder.layers.{l}.mlp.dense_4h_to_h.weight' + checkpoint['state_dict'][mlp_up_base_name] = param_to_weights(mlp_up_weight) + + # LayerNorm + input_ln_weight = model.state_dict()[f'model.layers.{l}.input_layernorm.weight'] + if mcore_gpt: + input_ln_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight' + else: + input_ln_base_name = f'model.language_model.encoder.layers.{l}.input_layernorm.weight' + checkpoint['state_dict'][input_ln_base_name] = param_to_weights(input_ln_weight) + + post_attn_ln_weight = model.state_dict()[f'model.layers.{l}.post_attention_layernorm.weight'] + if mcore_gpt: + post_attn_ln_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight' + else: + post_attn_ln_base_name = f'model.language_model.encoder.layers.{l}.post_attention_layernorm.weight' + checkpoint['state_dict'][post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight) + + print(f"done layer {l}") + + final_ln_weight = model.state_dict()[f'model.norm.weight'] + if mcore_gpt: + final_ln_base_name = f'model.decoder.final_layernorm.weight' + else: + final_ln_base_name = f'model.language_model.encoder.final_layernorm.weight' + checkpoint['state_dict'][final_ln_base_name] = param_to_weights(final_ln_weight) + + output_layer_weight = model.state_dict()[f'lm_head.weight'] + if mcore_gpt: + output_layer_base_name = f'model.output_layer.weight' + else: + output_layer_base_name = f'model.language_model.output_layer.weight' + checkpoint['state_dict'][output_layer_base_name] = param_to_weights(output_layer_weight) + + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + + del model + import gc + + gc.collect() + + if nemo_config.get('megatron_amp_O2', False): + keys = list(checkpoint['state_dict'].keys()) + print('convert to O2') + for key in keys: + checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key) + + # os.mkdir(args.output_path, exist_ok=True) + for key in checkpoint['state_dict']: + print(f'Saving {key} in {checkpoint["state_dict"][key].dtype}..') + save_location = f'{args.output_path}/{key[13:]}.pt' + torch.save(checkpoint['state_dict'][key], save_location) + + +if __name__ == '__main__': + args = get_args() + convert(args) diff --git a/scripts/checkpoint_converters/convert_llava_nemo_to_hf.py b/scripts/checkpoint_converters/convert_llava_nemo_to_hf.py index 430a74567ec2..4681bac41a6f 100644 --- a/scripts/checkpoint_converters/convert_llava_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_llava_nemo_to_hf.py @@ -150,10 +150,13 @@ def reverse_adjust_tensor_shapes(model, hf_model, nemo_state_dict): dict: The updated state dictionary with original tensor shapes and structures. 
""" model_config = model.cfg - num_query_groups = model_config["num_query_groups"] head_num = model_config["num_attention_heads"] hidden_size = model_config["hidden_size"] head_size = model_config["kv_channels"] + if "num_query_groups" in model_config and model_config["num_query_groups"] is not None: + num_query_groups = model_config["num_query_groups"] + else: + num_query_groups = head_num if head_size is None: head_size = hidden_size // head_num heads_per_group = head_num // num_query_groups @@ -300,7 +303,7 @@ def convert(args): batch_dict = hf_tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt') batch_dict_cuda = {k: v.cuda() for k, v in batch_dict.items()} hf_model = hf_model.cuda().eval() - model = model.eval() + model = model.cuda().eval() hf_outputs = hf_model(**batch_dict_cuda, output_hidden_states=True) ids = batch_dict_cuda['input_ids'] @@ -315,7 +318,7 @@ def convert(args): attn_mask, _, pos_ids = attn_mask_and_pos_ids outputs = model( - tokens=tokens, text_position_ids=pos_ids.cuda(), attention_mask=attn_mask.cuda(), labels=None + tokens=tokens.cuda(), text_position_ids=pos_ids.cuda(), attention_mask=attn_mask.cuda(), labels=None ) hf_next_token = hf_outputs.logits[0, -1].argmax() diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py index 3a72661499bf..425a6c696120 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py @@ -25,6 +25,7 @@ import os from argparse import ArgumentParser from collections import OrderedDict +from pathlib import Path import torch import torch.nn @@ -55,11 +56,13 @@ def get_args(): ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") parser.add_argument("--precision", type=str, default="bf16", help="Model precision") + parser.add_argument('--low-ram', '--low-mem', action='store_true', dest='low_ram') + parser.add_argument('--tmp-dir', default='/tmp/mistral_ckpt_parts/') args = parser.parse_args() return args -def load_model(cls, checkpoint, strict, **kwargs): +def restore_model_from_checkpoint(cls, checkpoint, strict, **kwargs): try: if 'cfg' in kwargs: model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) @@ -67,7 +70,8 @@ def load_model(cls, checkpoint, strict, **kwargs): model = cls(cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY], **kwargs) for name, module in model.named_parameters(): if name in checkpoint['state_dict']: - module.data = checkpoint['state_dict'][name] + # cast to target precision and + module.data = checkpoint['state_dict'][name].to(dtype=module.data.dtype) checkpoint['state_dict'].pop(name) else: print(f"Unexpected key: {name} not in checkpoint but in model.") @@ -84,6 +88,9 @@ def load_model(cls, checkpoint, strict, **kwargs): # register the artifacts cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] + # assert os.path.exists( + # cfg.tokenizer.model + # ), f"Expected cfg.tokenizer.model {cfg.tokenizer.model} to be present" if cfg.tokenizer.model is not None: model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) if cfg.tokenizer.vocab_file is not None: @@ -95,18 +102,22 @@ def load_model(cls, checkpoint, strict, **kwargs): return model -def load_config(mistral_config, tokenizer_path): +def load_config(mistral_config, tokenizer, config_path): nemo_config = OmegaConf.load( os.path.join(os.path.dirname(__file__), 
'../../examples/nlp/language_modeling/conf/megatron_llama_config.yaml') ).model # akoumparouli: verify this. - nemo_config.encoder_seq_length = mistral_config['sliding_window'] + if mistral_config.get('sliding_window', None) is not None: + nemo_config.encoder_seq_length = mistral_config['sliding_window'] + else: + nemo_config.encoder_seq_length = mistral_config['max_position_embeddings'] nemo_config.num_layers = int(mistral_config['num_hidden_layers']) nemo_config.hidden_size = mistral_config['hidden_size'] nemo_config.ffn_hidden_size = mistral_config['intermediate_size'] nemo_config.num_attention_heads = mistral_config['num_attention_heads'] nemo_config.max_position_embeddings = mistral_config['max_position_embeddings'] - nemo_config.window_size = [mistral_config['sliding_window'], 0] + if mistral_config.get('sliding_window', None) is not None: + nemo_config.window_size = [mistral_config['sliding_window'], 0] nemo_config.init_method_std = mistral_config['initializer_range'] # RMSNorm's epsilon. nemo_config.layernorm_epsilon = mistral_config['rms_norm_eps'] @@ -118,7 +129,42 @@ def load_config(mistral_config, tokenizer_path): # Mistral uses SiLU, but it is the same as swish with beta = 1. nemo_config.activation = 'fast-swiglu' - nemo_config.tokenizer.model = tokenizer_path + # Tokenizer config + if hasattr(tokenizer, 'vocab_file'): + nemo_config.tokenizer.model = tokenizer.vocab_file + elif os.path.exists(os.path.join(config_path, 'tekken.json')): + # Load tekken.json, extract the 'vocab' field & write it to file. + vocab_path = os.path.join(config_path, 'tekken.json') + assert os.path.exists(vocab_path), f"Expected {vocab_path} to exist" + with open(vocab_path, 'rt') as fp: + tok_vocab = json.load(fp) + vocab_output_path = '/tmp/tekken.json' + if os.path.exists(vocab_output_path): + os.remove(vocab_output_path) + with open(vocab_output_path, 'wt') as fp: + json.dump(tok_vocab['vocab'], fp) + assert os.path.exists(vocab_output_path), f"Expected {vocab_output_path} to exist" + assert os.path.getsize(vocab_output_path) > 0, f"Expected {vocab_output_path} to be non-empty" + + tokenizer_dict = { + 'library': 'tiktoken', + 'type': 'tiktoken', + 'vocab_file': vocab_output_path, + 'model': None, + 'merge_file': None, + 'delimiter': None, + 'sentencepiece_legacy': False, + } + nemo_config.tokenizer = tokenizer_dict + else: + # Otherwise use HF + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + } + nemo_config.tokenizer = tokenizer_dict + # TODO(@akoumparouli): rope_scaling. 
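When the tokenizer branch above finds a `tekken.json` next to the Mistral checkpoint, it extracts that file's `vocab` field into a standalone vocab file and points a tiktoken-style tokenizer config at it. A minimal sketch of that extract-and-rewrite step (the output location here is illustrative; the converter above writes to /tmp):

```python
import json
import os
import tempfile

def extract_tekken_vocab(config_path: str) -> str:
    """Write the 'vocab' field of <config_path>/tekken.json to its own file.

    Returns the path of the extracted vocab file, mirroring what load_config
    does before building the tiktoken tokenizer dict.
    """
    src = os.path.join(config_path, "tekken.json")
    with open(src, "rt") as fp:
        tok = json.load(fp)
    out = os.path.join(tempfile.gettempdir(), "tekken_vocab.json")
    with open(out, "wt") as fp:
        json.dump(tok["vocab"], fp)
    assert os.path.getsize(out) > 0, f"Expected {out} to be non-empty"
    return out
```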
nemo_config['rotary_base'] = mistral_config['rope_theta'] @@ -130,38 +176,63 @@ def load_config(mistral_config, tokenizer_path): return nemo_config -def load_mistral_ckpt(in_dir): +class LazyStateDict: + def __init__(self, ckpt_index, root): + self.map = ckpt_index + self.root = root + + def __getitem__(self, key): + from safetensors import safe_open + + assert key in self.map, f'Got unknown key: {key}' + ckpt_part_path = os.path.join(self.root, self.map[key]) + assert os.path.exists(ckpt_part_path), f'Expected ckpt-part to exist {ckpt_part_path}' + with safe_open(ckpt_part_path, framework="pt", device="cpu") as fp: + return fp.get_tensor(key) + + +def load_mistral_ckpt(in_dir, load_model=True): params_file = os.path.join(in_dir, 'config.json') assert os.path.exists(params_file) with open(params_file, 'r') as fp: model_args = json.load(fp) - model = AutoModelForCausalLM.from_pretrained(in_dir) - ckpt = model.state_dict() + ckpt = None + if load_model: + # If it's in safetensors format, then use lazyloading + ckpt_parts_map_path = os.path.join(in_dir, 'model.safetensors.index.json') + if os.path.exists(ckpt_parts_map_path): + ckpt_parts_map = {} + with open(ckpt_parts_map_path, 'rt') as fp: + ckpt_parts_map = json.load(fp) + print('ckpt_parts_map= ', ckpt_parts_map) + ckpt = LazyStateDict(ckpt_parts_map['weight_map'], in_dir) + else: + model = AutoModelForCausalLM.from_pretrained(in_dir) + ckpt = model.state_dict() tokenizer = AutoTokenizer.from_pretrained(in_dir) assert tokenizer.vocab_size == model_args['vocab_size'] return model_args, ckpt, tokenizer -def convert(args): - logging.info(f"loading checkpoint {args.input_name_or_path}") - - model_args, ckpt, tokenizer = load_mistral_ckpt(args.input_name_or_path) - nemo_config = load_config(model_args, os.path.join(args.input_name_or_path, 'tokenizer.model')) - logging.info(f"loaded checkpoint {args.input_name_or_path}") - - if args.precision in ["32", "16"]: - precision = int(float(args.precision)) - elif args.precision in ["bf16", "bf16-mixed"]: +def parse_precision(precision): + if precision in ["32", "16"]: + return int(float(precision)) + elif precision in ["bf16", "bf16-mixed"]: if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): - precision = args.precision + return precision else: logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") - precision = args.precision[2:] # prune bf in string + return precision[2:] # prune bf in string else: - precision = args.precision + return precision + +def make_trainer(args, nemo_config): + model_args, ckpt, tokenizer = load_mistral_ckpt(args.input_name_or_path, load_model=False) + nemo_config = load_config(model_args, tokenizer, args.input_name_or_path) + precision = parse_precision(args.precision) plugins = [] if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: scaler = None @@ -191,13 +262,24 @@ def convert(args): dtype = torch.float32 # fallback nemo_config.precision = precision - logging.info(f"nemo_config: {nemo_config}") + print(f"nemo_config: {nemo_config}") trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) + return trainer, dtype + + +def convert(args): + logging.info(f"loading checkpoint {args.input_name_or_path}") + + model_args, ckpt, tokenizer = load_mistral_ckpt(args.input_name_or_path) + nemo_config = load_config(model_args, tokenizer, args.input_name_or_path) + logging.info(f"loaded checkpoint {args.input_name_or_path}") hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads - head_size = hidden_size // head_num + head_size = model_args.get('head_dim', hidden_size // head_num) + # Set this explictly because 2407 does not use hidden_size // num_attention_heads + nemo_config.kv_channels = head_size num_layers = nemo_config.num_layers mcore_gpt = nemo_config.mcore_gpt @@ -226,6 +308,10 @@ def convert(args): if mcore_gpt: assert nemo_config.activation.startswith('fast-'), 'mcore only supports fast version of gated linear unit.' + yield checkpoint + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + for l in range(int(num_layers)): print(f"converting layer {l}") old_tensor_shape = ckpt[f'model.layers.{l}.self_attn.q_proj.weight'].size() @@ -298,6 +384,9 @@ def convert(args): checkpoint['state_dict'][post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight) print(f"done layer {l}") + yield checkpoint + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() final_ln_weight = ckpt[f'model.norm.weight'] if mcore_gpt: @@ -314,36 +403,72 @@ def convert(args): checkpoint['state_dict'][output_layer_base_name] = param_to_weights(output_layer_weight) checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + yield checkpoint del ckpt + +def merge(a: dict, b: dict, path=[]): + is_dict = lambda x: isinstance(x, OrderedDict) or isinstance(x, dict) + for key in b: + if key in a: + if is_dict(a[key]) and is_dict(b[key]): + merge(a[key], b[key], path + [str(key)]) + elif a[key] != b[key]: + raise Exception('Value conflict: ' + '.'.join(path + [str(key)])) + else: + a[key] = b[key] + return a + + +def save_to_nemo(args, checkpoint): + + logging.info(f"loading checkpoint {args.input_name_or_path}") + model_args, ckpt, tokenizer = load_mistral_ckpt(args.input_name_or_path, load_model=False) + nemo_config = load_config(model_args, tokenizer, args.input_name_or_path) + + nemo_config.precision = parse_precision(args.precision) + nemo_config.megatron_amp_O2 = True + + hidden_size = nemo_config.hidden_size + head_num = nemo_config.num_attention_heads + head_size = model_args.get('head_dim', hidden_size // head_num) + # Set this explictly because 2407 does not use hidden_size // num_attention_heads + nemo_config.kv_channels = head_size + + trainer, dtype = make_trainer(args, nemo_config) + + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = 
nemo_config + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY].use_cpu_initialization = True + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY].perform_initialization = False + if nemo_config.get('megatron_amp_O2', False): keys = list(checkpoint['state_dict'].keys()) for key in keys: checkpoint['state_dict'][key.replace('model.', 'model.module.', 1)] = checkpoint['state_dict'].pop(key) - model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) + model = restore_model_from_checkpoint(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) model._save_restore_connector = NLPSaveRestoreConnector() - # cast to target precision and disable cpu init - model = model.to(dtype=dtype) + # disable cpu init model.cfg.use_cpu_initialization = False - + model.cfg.perform_initialization = True if getattr(tokenizer, 'chat_template', None) is not None: import hashlib - assert ( - hashlib.md5(tokenizer.chat_template.encode('utf-8')).hexdigest() == "0b629f783db54e02509999196956ff40" - ), "Got unkown chat template" - from omegaconf import OmegaConf, open_dict - - with open_dict(model.cfg): - model.cfg.tokenizer.chat_template = OmegaConf.create( - { - 'prefix': "{_bos_}", - 'roles': {'User': "[INST] {_content_} [/INST]", 'Assistant': "{_content_}{_eos_}"}, - } - ) + template_hash = hashlib.md5(tokenizer.chat_template.encode('utf-8')).hexdigest() + if template_hash != "0b629f783db54e02509999196956ff40": + logging.warning("Got unkown chat template") + else: + from omegaconf import OmegaConf, open_dict + + with open_dict(model.cfg): + model.cfg.tokenizer.chat_template = OmegaConf.create( + { + 'prefix': "{_bos_}", + 'roles': {'User': "[INST] {_content_} [/INST]", 'Assistant': "{_content_}{_eos_}"}, + } + ) model.save_to(args.output_path) logging.info(f'NeMo model saved to: {args.output_path}') @@ -351,4 +476,20 @@ def convert(args): if __name__ == '__main__': args = get_args() - convert(args) + if args.low_ram: + os.makedirs(args.tmp_dir, exist_ok=True) + + checkpoint = OrderedDict() + for i, ckpt_part in enumerate(convert(args)): + if args.low_ram: + torch.save(ckpt_part, f'{args.tmp_dir}/nemo_ckpt_part_{i}.pth') + else: + checkpoint = merge(checkpoint, ckpt_part) + + if args.low_ram: + print("Loading partial checkpoints") + for path in map(str, Path(args.tmp_dir).rglob("*.pth")): + print(f"Loading checkpoint: {path}") + checkpoint = merge(checkpoint, torch.load(path, mmap=True)) + + save_to_nemo(args, checkpoint) diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index 99d1795aea9c..c50267ef6b42 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -40,7 +40,7 @@ def get_args(): ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output HF checkpoint.") parser.add_argument('--hf_model_name', type=str, default="mistralai/Mistral-7B-v0.1", help="Name of HF checkpoint") - parser.add_argument("--precision", type=str, default="32", help="Model precision") + parser.add_argument("--precision", type=str, default="bf16", help="Model precision") args = parser.parse_args() return args @@ -48,7 +48,8 @@ def get_args(): def load_config(hf_model_name, nemo_config): hf_config = AutoConfig.from_pretrained(hf_model_name) # SWA; nemo_config.window_size is list [left-bound, right-bound] - hf_config.sliding_window = nemo_config.window_size[0] + if hasattr(nemo_config, 
'window_size'): + hf_config.sliding_window = nemo_config.window_size[0] hf_config.max_position_embeddings = nemo_config.encoder_seq_length hf_config.num_hidden_layers = nemo_config.num_layers hf_config.hidden_size = nemo_config.hidden_size @@ -83,6 +84,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: model_config.use_cpu_initialization = True else: map_location = None + model_config.perform_initialization = False if cpu_only: logging.info("******** Loading model on CPU. This will take a significant amount of time.") @@ -129,7 +131,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: num_layers = model.cfg.num_layers num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B - head_size = hidden_size // head_num + head_size = model.cfg.get('kv_channels', hidden_size // head_num) heads_per_group = head_num // num_query_groups qkv_total_dim = head_num + 2 * num_query_groups @@ -215,6 +217,14 @@ def convert(in_file, precision=None, cpu_only=True) -> None: if __name__ == '__main__': + import transformers + + type(transformers.__version__) + from packaging.version import Version + + if Version(transformers.__version__) < Version('4.44.0'): + logging.warning("You need to use transformers >= v4.44.0") + args = get_args() hf_state_dict, nemo_config, dtype = convert(args.input_name_or_path, args.precision) diff --git a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py index 1bf23224357f..36e4c0c2c3ea 100644 --- a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py @@ -17,7 +17,8 @@ Example to run this conversion script: python3 convert_mixtral_hf_to_nemo.py \ --input_name_or_path \ - --output_path + --output_path \ + --precision=bf16 """ import json @@ -132,6 +133,7 @@ def load_config(mixtral_config, tokenizer_path): assert nemo_config.num_moe_experts > 0, "num_experts must be greater than zero." nemo_config.moe_router_topk = int(mixtral_config['num_experts_per_tok']) assert nemo_config.moe_router_topk > 0, "moe_router_topk must be greater than zero." + nemo_config.moe_router_pre_softmax = True nemo_config.use_cpu_initialization = True # Mixtral uses SiLU, but it is the same as swish with beta = 1. nemo_config.activation = 'fast-swiglu' diff --git a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py new file mode 100644 index 000000000000..7a58573278af --- /dev/null +++ b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py @@ -0,0 +1,348 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
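The converters in this patch stop assuming head_size == hidden_size // num_attention_heads and instead honor an explicit head_dim / kv_channels value when one is present. A minimal sketch of why this matters; the numbers are the published Mistral-NeMo-12B (2407) hyper-parameters and are used here only as an illustration, they are not part of this patch:

    # Illustrative only: head_dim is not derivable from hidden_size for Mistral-NeMo 2407.
    hidden_size = 5120
    num_attention_heads = 32
    head_dim = 128  # 'head_dim' field in the HF config

    derived = hidden_size // num_attention_heads  # 160 -- would not match the checkpoint's QKV shapes
    head_size = head_dim if head_dim is not None else derived
    assert head_size == 128  # this is the value written to nemo_config.kv_channels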
+ +import json +import os +import shutil +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from pytorch_lightning import Trainer +from transformers import LlamaTokenizer, PreTrainedTokenizerFast +from transformers.convert_slow_tokenizer import LlamaConverter + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging + +""" +Script to convert a nemotron checkpoint in nemo (mcore path) into a HuggingFace checkpoint. +This script can be used to 1) generate only the HF weights, or 2) generate an entire HF model folder. + +1) Generate only HF weights from a nemo file: + + python convert_nemotron_nemo_to_hf.py \ + --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ + --output_path /path/to/pytorch_model.bin + +2) Generate the full HF model folder + + python convert_nemotron_nemo_to_hf.py \ + --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ + --hf_input_path /path/to/input_hf_folder \ + --hf_output_path /path/to/output_hf_folder \ + + Use the --cpu-only flag if the model cannot fit in the GPU (e.g. Nemotron4 340b). + However this option makes the conversion script significantly slower. +""" + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_name_or_path", + type=str, + default=None, + required=True, + help="Path to .nemo file or extracted folder", + ) + parser.add_argument("--output_path", type=str, default=None, required=False, help="Path to HF .bin file") + parser.add_argument( + "--hf_input_path", + type=str, + default=None, + help="A HF model path, " "e.g. a folder containing https://huggingface.co/nvidia/Minitron-8B-Base", + ) + parser.add_argument( + "--hf_output_path", + type=str, + default=None, + help="Output HF model path, " "with the same format as above but user's own weights", + ) + parser.add_argument( + "--precision", + type=str, + default=None, + help="Precision of output weights." + "Defaults to precision of the input nemo weights (model.cfg.trainer.precision)", + ) + parser.add_argument( + "--cpu-only", + action="store_true", + help="Load model in cpu only. 
Useful if the model cannot fit in GPU memory, " + "but this option makes the conversion script significantly slower.", + ) + args = parser.parse_args() + return args + + +def convert_hf_config(nemo_config, tokenizer, vocab_size, dtype, hf_output_path, hf_url="nvidia/Minitron-8B-Base"): + """ + Convert NeMo config to HF config + """ + NEMO_ACT2HF = { + "squared-relu": "relu2", + "fast-swiglu": "silu", + } + DTYPE2HF = { + torch.bfloat16: "bfloat16", + torch.float16: "float16", + torch.float32: "float32", + } + hf_config = { + "_name_or_path": hf_url, + "architectures": ["NemotronForCausalLM"], + "bos_token_id": tokenizer.bos_id, + "eos_token_id": tokenizer.eos_id, + "hidden_act": NEMO_ACT2HF[nemo_config.activation], + "hidden_size": nemo_config.hidden_size, + "initializer_range": nemo_config.init_method_std, + "intermediate_size": nemo_config.ffn_hidden_size, + "max_position_embeddings": nemo_config.max_position_embeddings, + "model_type": "nemotron", + "num_attention_heads": nemo_config.num_attention_heads, + "num_hidden_layers": nemo_config.num_layers, + "num_key_value_heads": nemo_config.get("num_query_groups", nemo_config.num_attention_heads), + "norm_eps": nemo_config.layernorm_epsilon, + "rope_theta": nemo_config.get("rotary_base", 10000), + "partial_rotary_factor": nemo_config.get("rotary_percentage", 1.0), + "tie_word_embeddings": False, + "torch_dtype": DTYPE2HF[dtype], + "transformers_version": "4.44.0", + "use_cache": True, + "vocab_size": vocab_size, + } + if nemo_config.get("kv_channels", None) is not None: + hf_config["head_dim"] = nemo_config.kv_channels + json.dump(hf_config, open(f"{hf_output_path}/config.json", "w"), indent=2) + + +def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None: + """ + Convert NeMo weights to HF weights + """ + dummy_trainer = Trainer(devices=1, accelerator="cpu", strategy=NLPDDPStrategy()) + model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True) + model_config.tensor_model_parallel_size = 1 + model_config.pipeline_model_parallel_size = 1 + model_config.sequence_parallel = False + model_config.transformer_engine = True + if cpu_only: + map_location = torch.device("cpu") + model_config.use_cpu_initialization = True + model_config.dist_ckpt_load_on_device = False + else: + map_location = None + + if cpu_only: + logging.info("******** Loading model on CPU. 
This will take a significant amount of time.") + + model = MegatronGPTModel.restore_from( + input_nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location + ) + + vocab_size = model.padded_vocab_size + + if precision is None: + precision = model.cfg.precision + if precision in [32, "32"]: + dtype = torch.float32 + elif precision in [16, "16", "16-mixed"]: + dtype = torch.float16 + elif precision in ["bf16", "bf16-mixed"]: + dtype = torch.bfloat16 + else: + logging.warning(f"Precision string {precision} is not recognized, falling back to fp32") + dtype = torch.float32 # fallback + logging.info(f"Using precision {dtype}") + + def param_to_weights(param): + return param.to(dtype) + + checkpoint = OrderedDict() + + hidden_size = model.cfg.hidden_size + head_num = model.cfg.num_attention_heads + num_layers = model.cfg.num_layers + ffn_hidden_size = model.cfg.ffn_hidden_size + num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B + if num_query_groups is None: + num_query_groups = head_num + heads_per_group = head_num // num_query_groups + qkv_total_dim = head_num + 2 * num_query_groups + + # Embedding + embed_weight = model.state_dict()["model.embedding.word_embeddings.weight"] + embed_weights_base_name = "model.embed_tokens.weight" + checkpoint[embed_weights_base_name] = param_to_weights(embed_weight) + + for l in range(int(num_layers)): + print(f"converting layer {l}") + + qkv_weights = model.state_dict()[f"model.decoder.layers.{l}.self_attention.linear_qkv.weight"] + qkv_weights = qkv_weights.reshape([qkv_total_dim, -1, hidden_size]) + + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + ## Example of slices + ## (without GQA): num_query_groups = head_num = 32, + ## q_slice = [0, 3, 6, 9 , ... 90, 93] + ## k_slice = [1, 4, 7, 10, ... 91, 94] + ## v_slice = [2, 5, 8, 11, ... 92, 95] + ## (with GQA): num_query_groups = 8, head_num = 64 + ## q_slice = [0, 1, .. 6, 7, 10, 11, .. 16, 17, 20, 21, .. 67, 70, ... 76, 77] + ## k_slice = [8, 18, 28, ... 68, 78] + ## v_slice = [9, 19, 29, ... 69, 79] + + q_weights_base_name = f"model.layers.{l}.self_attn.q_proj.weight" + k_weights_base_name = f"model.layers.{l}.self_attn.k_proj.weight" + v_weights_base_name = f"model.layers.{l}.self_attn.v_proj.weight" + + checkpoint[q_weights_base_name] = param_to_weights(qkv_weights[q_slice].reshape(-1, hidden_size)) + checkpoint[k_weights_base_name] = param_to_weights(qkv_weights[k_slice].reshape(-1, hidden_size)) + checkpoint[v_weights_base_name] = param_to_weights(qkv_weights[v_slice].reshape(-1, hidden_size)) + + # attention dense + o_weight = model.state_dict()[f"model.decoder.layers.{l}.self_attention.linear_proj.weight"] + o_weight_base_name = f"model.layers.{l}.self_attn.o_proj.weight" + checkpoint[o_weight_base_name] = param_to_weights(o_weight) + + # mlp + mlp_weights = model.state_dict()[f"model.decoder.layers.{l}.mlp.linear_fc1.weight"] + mlp_up_proj_weight = model.state_dict()[f"model.decoder.layers.{l}.mlp.linear_fc2.weight"] + + if mlp_weights.shape[0] != mlp_up_proj_weight.shape[1]: + # Has projection (used for swi-glu) + logging.warning( + "Gated projection layers detected in NeMo checkpoint. Currently Nemotron HF does not support gated MLP." 
+ ) + assert mlp_weights.shape[0] == 2 * mlp_up_proj_weight.shape[1] + + mlp_down_proj_weight = mlp_weights[:ffn_hidden_size, :] + mlp_gate_proj_weight = mlp_weights[ffn_hidden_size:, :] + + mlp_down_proj_base_name = f"model.layers.{l}.mlp.gate_proj.weight" + mlp_gate_proj_base_name = f"model.layers.{l}.mlp.up_proj.weight" + + checkpoint[mlp_down_proj_base_name] = param_to_weights(mlp_down_proj_weight) + checkpoint[mlp_gate_proj_base_name] = param_to_weights(mlp_gate_proj_weight) + else: + mlp_down_proj_weight = mlp_weights + mlp_down_proj_base_name = f"model.layers.{l}.mlp.up_proj.weight" + checkpoint[mlp_down_proj_base_name] = param_to_weights(mlp_down_proj_weight) + + mlp_up_proj_base_name = f"model.layers.{l}.mlp.down_proj.weight" + checkpoint[mlp_up_proj_base_name] = param_to_weights(mlp_up_proj_weight) + + # layernorm + input_ln_weight = model.state_dict()[f"model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight"] + input_ln_base_name = f"model.layers.{l}.input_layernorm.weight" + checkpoint[input_ln_base_name] = param_to_weights(input_ln_weight) + if ( + model.state_dict().get(f"model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_bias", None) + is not None + ): + input_ln_bias = model.state_dict()[f"model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_bias"] + input_ln_bias_name = f"model.layers.{l}.input_layernorm.bias" + checkpoint[input_ln_bias_name] = param_to_weights(input_ln_bias) + + post_attn_ln_weight = model.state_dict()[f"model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight"] + post_attn_ln_base_name = f"model.layers.{l}.post_attention_layernorm.weight" + checkpoint[post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight) + if model.state_dict().get(f"model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_bias", None) is not None: + post_attn_ln_bias = model.state_dict()[f"model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_bias"] + post_attn_ln_bias_name = f"model.layers.{l}.post_attention_layernorm.bias" + checkpoint[post_attn_ln_bias_name] = param_to_weights(post_attn_ln_bias) + + print(f"done layer {l}") + + final_ln_weight = model.state_dict()["model.decoder.final_layernorm.weight"] + final_ln_base_name = "model.norm.weight" + checkpoint[final_ln_base_name] = param_to_weights(final_ln_weight) + if model.state_dict().get("model.decoder.final_layernorm.bias", None) is not None: + final_ln_bias = model.state_dict()["model.decoder.final_layernorm.bias"] + final_ln_bias_name = "model.norm.bias" + checkpoint[final_ln_bias_name] = param_to_weights(final_ln_bias) + + output_layer_weight = model.state_dict()["model.output_layer.weight"] + output_layer_base_name = "lm_head.weight" + checkpoint[output_layer_base_name] = param_to_weights(output_layer_weight) + + os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) + torch.save(checkpoint, output_hf_file) + logging.info(f"Weights saved to {output_hf_file}") + + return model_config, model.tokenizer, dtype, vocab_size + + +def extract_nemotron_tokenizer(nemo_file, model_config, output_hf_path, nemo_tokenizer): + tokenizer_cfg = model_config.tokenizer + if tokenizer_cfg.library == "sentencepiece": + # For sentencepiece tokenizer, we are wrapping with HF's LlamaTokenizer + # and convert it to a PreTrainedTokenizerFast + tokenizer_fn = tokenizer_cfg.model[5:] + output_tokenizer = f"{output_hf_path}/tokenizer.model" + if nemo_file.endswith(".nemo"): + import tarfile + + archive = tarfile.open(nemo_file, "r") + tokenizer_filename = "./" + tokenizer_fn # exclude 'nemo:' prefix + 
archive.extract(tokenizer_filename, output_hf_path) + archive.close() + os.rename(f"{output_hf_path}/{tokenizer_fn}", output_tokenizer) + elif os.path.isdir(nemo_file): + shutil.copy(f"{nemo_file}/{tokenizer_fn}", output_tokenizer) + # We use LlamaTokenizer for sentencepiece based tokenizer + tokenizer = LlamaTokenizer.from_pretrained(output_hf_path, legacy=False) + # Convert the LlamaTokenizer to a PreTrainedTokenizerFast instance + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=LlamaConverter(tokenizer).converted(), + model_input_names=["input_ids", "attention_mask"], + bos_token="", + eos_token="", + ) + tokenizer.save_pretrained(output_hf_path) + logging.info(f"Setencepiece tokenizer has been saved to {output_tokenizer}") + elif isinstance(nemo_tokenizer, AutoTokenizer): + nemo_tokenizer.tokenizer.save_pretrained(output_hf_path) + logging.info(f"HF AutoTokenizer has been saved to {output_hf_path}") + else: + raise ValueError(f"Unsupported tokenizer type: library: {tokenizer_cfg.library}, type: {tokenizer_cfg.type}") + + +if __name__ == "__main__": + args = get_args() + if not args.hf_output_path: + assert args.output_path is not None, "Need to provide either output_path or hf_output_path" + else: + args.output_path = f"{args.hf_output_path}/pytorch_model.bin" + logging.info(f"weight will be saved to {args.output_path}") + + nemo_config, nemo_tokenizer, dtype, vocab_size = convert( + args.input_name_or_path, args.output_path, precision=args.precision, cpu_only=args.cpu_only + ) + if args.hf_input_path and args.hf_output_path: + convert_hf_config(nemo_config, nemo_tokenizer, vocab_size, dtype, args.hf_output_path, args.hf_input_path) + extract_nemotron_tokenizer(args.input_name_or_path, nemo_config, args.hf_output_path, nemo_tokenizer) + else: + logging.info("`hf_input_path` and/or `hf_output_path` not provided, not generating full HF model.") + logging.info(f".bin file is saved to {args.output_path}") diff --git a/scripts/checkpoint_converters/convert_stablediffusion_hf_to_nemo.py b/scripts/checkpoint_converters/convert_stablediffusion_hf_to_nemo.py index 67bc975708d0..ff10dab4bc90 100644 --- a/scripts/checkpoint_converters/convert_stablediffusion_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_stablediffusion_hf_to_nemo.py @@ -13,13 +13,14 @@ # limitations under the License. r""" -Conversion script to convert HuggingFace Starcoder2 checkpoints into nemo checkpoint. +Conversion script to convert HuggingFace StableDiffusion checkpoints into nemo checkpoint. Example to run this conversion script: python convert_hf_starcoder2_to_nemo.py \ --input_name_or_path \ - --output_path + --output_path --model """ +import os from argparse import ArgumentParser import numpy as np @@ -29,8 +30,6 @@ from nemo.utils import logging -intkey = lambda x: int(x) - def filter_keys(rule, dict): keys = list(dict.keys()) @@ -95,7 +94,7 @@ def __getitem__(self, name: str): return None # either more than 1 match (error) or exactly 1 (success) if np.sum(p_flag) > 1: - print(f"error: multiple matches of key {name} with {keys}") + logging.warning(f"warning: multiple matches of key {name} with {keys}") else: i = np.where(p_flag)[0][0] n = numdots(keys[i]) @@ -130,14 +129,9 @@ def get_args(): return args -def make_tiny_config(config): - '''dial down the config file to make things tractable''' - # TODO - return config - - def load_hf_ckpt(in_dir, args): ckpt = {} + assert os.path.isdir(in_dir), "Currently supports only directories with a safetensor file in it." 
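    # Reads every tensor of the single-file "diffusion_pytorch_model.safetensors" checkpoint in
    # `in_dir` into a plain dict; sharded checkpoints (model.safetensors.index.json) are not
    # handled by this converter.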
with safetensors.safe_open(in_dir + "/diffusion_pytorch_model.safetensors", framework="pt") as f: for k in f.keys(): ckpt[k] = f.get_tensor(k) @@ -161,9 +155,9 @@ def sanity_check(hf_tree, hf_unet, nemo_unet): # check if i'm introducing new keys for hfk, nk in hf_to_nemo_mapping(hf_tree).items(): if nk not in nemo_unet.keys(): - print(nk) + logging.info(nk) if hfk not in hf_unet.keys(): - print(hfk) + logging.info(hfk) def convert_input_keys(hf_tree: SegTree): @@ -174,7 +168,7 @@ def convert_input_keys(hf_tree: SegTree): # start counting blocks from now on nemo_inp_blk = 1 down_blocks = hf_tree['down_blocks'] - down_blocks_keys = sorted(list(down_blocks.nodes.keys()), key=intkey) + down_blocks_keys = sorted(list(down_blocks.nodes.keys()), key=int) for downblockid in down_blocks_keys: block = down_blocks[str(downblockid)] # compute number of resnets, attentions, downsamplers in this block @@ -183,14 +177,14 @@ def convert_input_keys(hf_tree: SegTree): downsamplers = block.nodes.get('downsamplers', SegTree()) if len(attentions) == 0: # no attentions, this is a DownBlock2d - for resid in sorted(list(resnets.nodes.keys()), key=intkey): + for resid in sorted(list(resnets.nodes.keys()), key=int): resid = str(resid) resnets[resid].convert_name = f"input_blocks.{nemo_inp_blk}.0" map_resnet_block(resnets[resid]) nemo_inp_blk += 1 elif len(attentions) == len(resnets): # there are attention blocks here -- each resnet+attention becomes a block - for resid in sorted(list(resnets.nodes.keys()), key=intkey): + for resid in sorted(list(resnets.nodes.keys()), key=int): resid = str(resid) resnets[resid].convert_name = f"input_blocks.{nemo_inp_blk}.0" map_resnet_block(resnets[resid]) @@ -199,7 +193,6 @@ def convert_input_keys(hf_tree: SegTree): nemo_inp_blk += 1 else: logging.warning("number of attention blocks is not the same as resnets - whats going on?") - # if there is a downsampler, then also append it if len(downsamplers) > 0: for k in downsamplers.nodes.keys(): @@ -217,10 +210,9 @@ def clean_convert_names(tree): def map_attention_block(att_tree: SegTree): '''this HF tree can either be an AttentionBlock or a DualAttention block currently assumed AttentionBlock - ''' - # TODO (rohit): Add check for dual attention block + # TODO(@rohitrango): Add check for dual attention block, but this works for both SD and SDXL def check_att_type(tree): return "att_block" @@ -237,7 +229,7 @@ def check_att_type(tree): dup_convert_name_recursive(tblock['norm1'], 'attn1.norm') dup_convert_name_recursive(tblock['norm2'], 'attn2.norm') dup_convert_name_recursive(tblock['norm3'], 'ff.net.0') - # map ff module + # map ff tblock['ff'].convert_name = "ff" tblock['ff.net'].convert_name = 'net' dup_convert_name_recursive(tblock['ff.net.0'], '1') @@ -272,12 +264,16 @@ def hf_to_nemo_mapping(tree: SegTree): def convert_cond_keys(tree: SegTree): # map all conditioning keys - tree['add_embedding'].convert_name = 'label_emb.0' - dup_convert_name_recursive(tree['add_embedding.linear_1'], '0') - dup_convert_name_recursive(tree['add_embedding.linear_2'], '2') - tree['time_embedding'].convert_name = 'time_embed' - dup_convert_name_recursive(tree['time_embedding.linear_1'], '0') - dup_convert_name_recursive(tree['time_embedding.linear_2'], '2') + if tree.nodes.get("add_embedding"): + logging.info("Add embedding found...") + tree['add_embedding'].convert_name = 'label_emb.0' + dup_convert_name_recursive(tree['add_embedding.linear_1'], '0') + dup_convert_name_recursive(tree['add_embedding.linear_2'], '2') + if 
tree.nodes.get("time_embedding"): + logging.info("Time embedding found...") + tree['time_embedding'].convert_name = 'time_embed' + dup_convert_name_recursive(tree['time_embedding.linear_1'], '0') + dup_convert_name_recursive(tree['time_embedding.linear_2'], '2') def convert_middle_keys(tree: SegTree): @@ -298,7 +294,7 @@ def convert_output_keys(hf_tree: SegTree): '''output keys is similar to input keys''' nemo_inp_blk = 0 up_blocks = hf_tree['up_blocks'] - up_blocks_keys = sorted(list(up_blocks.nodes.keys()), key=intkey) + up_blocks_keys = sorted(list(up_blocks.nodes.keys()), key=int) for downblockid in up_blocks_keys: block = up_blocks[str(downblockid)] @@ -307,8 +303,8 @@ def convert_output_keys(hf_tree: SegTree): attentions = block.nodes.get('attentions', SegTree()) upsamplers = block.nodes.get('upsamplers', SegTree()) - if len(attentions) == 0: # no attentions, this is a DownBlock2d - for resid in sorted(list(resnets.nodes.keys()), key=intkey): + if len(attentions) == 0: # no attentions, this is a UpBlock2D + for resid in sorted(list(resnets.nodes.keys()), key=int): resid = str(resid) resnets[resid].convert_name = f"output_blocks.{nemo_inp_blk}.0" map_resnet_block(resnets[resid]) @@ -316,7 +312,7 @@ def convert_output_keys(hf_tree: SegTree): elif len(attentions) == len(resnets): # there are attention blocks here -- each resnet+attention becomes a block - for resid in sorted(list(resnets.nodes.keys()), key=intkey): + for resid in sorted(list(resnets.nodes.keys()), key=int): resid = str(resid) resnets[resid].convert_name = f"output_blocks.{nemo_inp_blk}.0" map_resnet_block(resnets[resid]) @@ -326,11 +322,13 @@ def convert_output_keys(hf_tree: SegTree): else: logging.warning("number of attention blocks is not the same as resnets - whats going on?") - # if there is a downsampler, then also append it + # if there is a upsampler, then also append it if len(upsamplers) > 0: - # for k in upsamplers.nodes.keys(): nemo_inp_blk -= 1 - upsamplers['0'].convert_name = f"output_blocks.{nemo_inp_blk}.2" + upsamplenum = ( + 1 if len(attentions) == 0 else 2 + ) # if there are attention modules, upsample is module2, else it is module 1 (to stay consistent with SD) + upsamplers['0'].convert_name = f"output_blocks.{nemo_inp_blk}.{upsamplenum}" dup_convert_name_recursive(upsamplers['0.conv'], 'conv') nemo_inp_blk += 1 @@ -387,6 +385,7 @@ def convert_decoder(hf_tree: SegTree): decoder['mid_block'].convert_name = 'mid' dup_convert_name_recursive(decoder[f'mid_block.resnets.0'], 'block_1') dup_convert_name_recursive(decoder[f'mid_block.resnets.1'], 'block_2') + # attention blocks att = decoder['mid_block.attentions.0'] att.convert_name = 'attn_1' dup_convert_name_recursive(att['group_norm'], 'norm') @@ -443,6 +442,7 @@ def convert(args): for hf_key, nemo_key in mapping.items(): nemo_ckpt[nemo_key] = hf_ckpt[hf_key] + # save this torch.save(nemo_ckpt, args.output_path) logging.info(f"Saved nemo file to {args.output_path}") diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 01be9ff63a0d..c0acd97e1b50 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -185,8 +185,8 @@ def get_args(argv): parser.add_argument( "-srs", "--start_rest_service", - default="False", - type=str, + default=False, + type=bool, help="Starts the REST service for OpenAI API support", ) parser.add_argument( diff --git a/scripts/multimodal_dataset_conversion/prepare_youmakeup.py b/scripts/multimodal_dataset_conversion/prepare_youmakeup.py deleted file mode 
100644 index 4b8f49f5410d..000000000000 --- a/scripts/multimodal_dataset_conversion/prepare_youmakeup.py +++ /dev/null @@ -1,325 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import json -import os -import random -import subprocess -from concurrent.futures import ThreadPoolExecutor -from multiprocessing import Pool -from pathlib import Path - -from moviepy.video.io.VideoFileClip import VideoFileClip -from tqdm import tqdm -from yt_dlp import YoutubeDL - - -def clean_video_folder(video_folder): - """Remove non-11-character.mp4 files from the video folder.""" - for video in video_folder.glob('*.mp4'): - if len(video.stem) != 11: - video.unlink() - - -def get_video_duration(video_path): - """Function to get the duration of a video using ffprobe.""" - cmd = [ - 'ffprobe', - '-v', - 'error', - '-show_entries', - 'format=duration', - '-of', - 'default=noprint_wrappers=1:nokey=1', - video_path, - ] - duration = subprocess.check_output(cmd).strip() - return float(duration) - - -def prepare_dataset(source, video_folder, chunk_length=120): - """Prepare dataset from source JSON and download videos.""" - - with open(source, 'r') as f: - data = json.load(f) - - dataset = {} - video_missing_counter = 0 - total_videos = len(data.keys()) - - for key, value in tqdm(data.items()): - if not os.path.exists(os.path.join(video_folder, key + ".mp4")): - print(f"Video if {key} does not exist") - video_missing_counter += 1 - continue - - duration = value['duration'] - timestamps = value['timestamps'] - sentences = value['sentences'] - - # Videos are too long, sliding window of 2 minutes - video_begin = 0 - video_end = duration - new_data = {} - counter = 0 - - # We need some negative samples, where timestamps doesn't have any events - last_end = 0 - empty_timestamps = [] - new_timestamps = [] - new_sentences = [] - for start, end in timestamps: - if start - last_end > 25 and start - last_end < 35: - empty_timestamps.append([last_end, start]) - last_end = end - - empty = 0.1 * len(timestamps) - empty_timestamps = random.choices(empty_timestamps, k=min(int(empty), len(empty_timestamps))) - - for start, end in empty_timestamps: - time_range = random.randint(20, end - start) - new_start_time = random.randint(start, end - time_range) - new_end_time = new_start_time + time_range - new_data[f"{key}_{counter}"] = { - "video_begin": new_start_time, - "video_end": new_end_time, - "timestamps": [], - "sentences": [], - } - counter += 1 - - # Normal samples - for idx, ((start, end), sentence) in enumerate(zip(timestamps, sentences)): - if idx == 0: - video_begin = max(0, start - 5) - video_end = end - if end - video_begin > chunk_length: # Use 2 minute chunks - since_last_end = start - new_timestamps[-1][1] if new_timestamps else 0 - since_last_end = min(since_last_end, 10) - pad = since_last_end // 2 - video_end = video_end + pad # pad the end - - new_data[f"{key}_{counter}"] = { - "video_begin": video_begin, - "video_end": 
video_end, - "timestamps": new_timestamps, - "sentences": new_sentences, - } - counter += 1 - - new_timestamps = [] - new_sentences = [] - video_begin = max(0, start - pad) # pad the start - - new_timestamps.append([int(start), int(end)]) - new_sentences.append(sentence) - video_end = end - - if idx == len(timestamps) - 1: - video_end = min(duration, end + 5) - - if len(new_timestamps) > 0: - new_data[f"{key}_{counter}"] = { - "video_begin": video_begin, - "video_end": video_end, - "timestamps": new_timestamps, - "sentences": new_sentences, - } - counter += 1 - - dataset.update(new_data) - - print(f"Got {len(dataset)} videos") - print(f"Total videos missing {video_missing_counter} out of total videos {total_videos}") - return dataset - - -def crop_video(input_file, output_file, start_seconds, end_seconds): - """Crop a video.""" - video = VideoFileClip(input_file) - cropped_video = video.subclip(start_seconds, end_seconds) - cropped_video = cropped_video.without_audio() - cropped_video.write_videofile(output_file, codec='libx264', audio_codec='aac') - - -def process_video(key, value, video_folder, ignore=False): - """Process a video.""" - video_begin = value['video_begin'] - video_end = value['video_end'] - timestamps = value['timestamps'] - sentences = value['sentences'] - key_orig = key.rsplit('_', 1)[0] - video_chunk_dir = os.path.join(Path(video_folder), 'videos') - os.makedirs(video_chunk_dir, exist_ok=True) - video_path = os.path.join(Path(video_folder), 'videos_original', key_orig + ".mp4") - save_video_path = os.path.join(video_chunk_dir, key + ".mp4") - if ignore == False: - crop_video(video_path, save_video_path, video_begin, video_end) - - try: - # vr = decord.VideoReader(str(save_video_path)) - # duration = vr._num_frame / vr.get_avg_fps() - duration = get_video_duration(save_video_path) - except Exception as e: - duration = video_end - video_begin - print(f"Fallback to {duration} for {save_video_path} because {e}") - - timestamps = [[start - video_begin, end - video_begin] for start, end in timestamps] - return key, { - "duration": duration, - "timestamps": timestamps, - "sentences": sentences, - } - - -def convert_to_seconds(time_str): - h, m, s = map(int, time_str.split(':')) - return h * 3600 + m * 60 + s - - -def download_video(video_id_output_folder): - - video_id, output_folder = video_id_output_folder - video_url = f'https://www.youtube.com/watch?v={video_id}' - output_path = os.path.join(output_folder, f'{video_id}.mp4') - - # Check if the video has already been downloaded - if os.path.exists(output_path): - print(f"Video {video_id} already exists. 
Skipping...") - return - - ydl_opts = { - 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4', - 'outtmpl': output_path, - } - - try: - with YoutubeDL(ydl_opts) as ydl: - print(f"Downloading video {video_id} from {video_url}...") - ydl.download([video_url]) - print(f"Video {video_id} downloaded successfully!") - except Exception as e: - print(f"Error downloading video {video_id} from {video_url}: {str(e)}") - - -def download_videos(json_file, output_folder): - # Create the output folder if it doesn't exist - output_folder = os.path.join(output_folder, 'videos_original') - os.makedirs(output_folder, exist_ok=True) - # List to store video ids - video_ids = [] - # Load video ids from the JSON file - with open(json_file, 'r') as f: - for line in f: - data = json.loads(line) - video_id = data['video_id'] - video_ids.append(video_id) - - # Create a pool of processes - with Pool(10) as executor: - # Map each video_id to the download_video function - executor.map(download_video, [(video_id, output_folder) for video_id in video_ids]) - - -def parse_dense_video_captions(original_file_path, output_dir): - - output_file_path = os.path.join(output_dir, 'train_original.json') - - data_dict = {} - with open(original_file_path, 'r') as f: - for line in f: - data = json.loads(line) - video_id = data['video_id'] - step_data = data['step'] - duration = convert_to_seconds(step_data[str(len(step_data))]['endtime']) - - timestamps = [] - sentences = [] - - for step in step_data.values(): - start_time = convert_to_seconds(step['startime']) - end_time = convert_to_seconds(step['endtime']) - timestamps.append([start_time, end_time]) - sentences.append(step['caption']) - - new_data = {'duration': duration, 'timestamps': timestamps, 'sentences': sentences} - - data_dict[video_id] = new_data - - with open(output_file_path, 'w') as nf: - json.dump(data_dict, nf, indent=2) - return output_file_path - - -def chunk(args, original_json): - - video_folder = os.path.join(args.output_dir, 'videos_original') - video_folder = Path(video_folder) - output_json = os.path.join(args.output_dir, 'train.json') - clean_video_folder(video_folder) - dataset = prepare_dataset(original_json, video_folder, args.chunk_length) - fixed_data = {} - - # Function to process a single video - def process_single_video(key, value): - video_chunk_dir = os.path.join(args.output_dir, 'videos') - os.makedirs(video_chunk_dir, exist_ok=True) - save_video_path = os.path.join(video_chunk_dir, f"{key}.mp4") - ignore = False - if os.path.exists(save_video_path): - print(f"Chunk for video {key} already exists. 
Skipping...") - ignore = True - - key, value = process_video(key, value, args.output_dir, ignore=ignore) - - return key, value - - max_threads = 10 # Change this value to adjust the number of threads - # Process videos in parallel - with ThreadPoolExecutor(max_workers=max_threads) as executor: - futures = [executor.submit(process_single_video, key, value) for key, value in tqdm(dataset.items())] - - for future in tqdm(futures): - result = future.result() - if result[1]: # If result is not None - fixed_data[result[0]] = result[1] - with open(output_json, 'w') as f: - json.dump(fixed_data, f, indent=2, ensure_ascii=False) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Prepare dataset and create JSON file.") - parser.add_argument("-d", "--download", type=bool, help="Whether to download videos.", default=False) - parser.add_argument("-i", "--input_json", help="Path to the input JSON file.") - parser.add_argument("-o", "--output_dir", help="Path to the output_dir.") - parser.add_argument( - "-l", - "--chunk_length", - type=int, - help="Length of each chunked video in seconds (Default=120).", - default=120, - required=False, - ) - - args = parser.parse_args() - - if args.download: - download_videos(args.input_json, args.output_dir) - print(f"Videos have been downloaded to {args.output_dir}") - else: - original_json = parse_dense_video_captions(args.input_json, args.output_dir) - chunk(args, original_json) - print(f"Dataset has been prepared at {args.output_dir}") diff --git a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py index b3251e75c84e..7ff2342e4087 100644 --- a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py +++ b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py @@ -46,11 +46,16 @@ python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \ model.data.train_ds.file_names=[/path/to/training.jsonl] \ model.data.train_ds.max_seq_length=2048 \ - +tokenizer_path=/path/to/tokenizer.model - +output_dir=/path/to/output_folder + +tokenizer_path= \ + +output_dir=/path/to/output_folder \ +pack_sizes=[2048,4096,8192] Note: + - Tokenizer path supports SentencePiece tokenizer and HF tokenizer. + For SentencePiece tokenizer, specify the file /path/to/tokenizer.model + For HF tokenizer, specify a folder /path/to/hf_folder which contains tokenizer.json, tokenizer_config.json + and special_tokens_map.json + - If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will need to pass in the same configs to ``model.data.train_ds`` as you would for training with unpacked dataset. @@ -83,9 +88,15 @@ def tokenize_dataset(cfg: 'DictConfig'): # using the same template as SFT/PEFT script. 
This may be overkill but guarantees the preprocess settings # are identical to normal SFT training data_cfg = cfg.model.data.train_ds + if os.path.isdir(cfg.tokenizer_path): + # pass in a Hugging Face folder which contains tokenizer.json + tokenizer = get_nmt_tokenizer(library="huggingface", model_name=cfg.tokenizer_path, use_fast=True) + else: + tokenizer = get_nmt_tokenizer(library="sentencepiece", tokenizer_model=cfg.tokenizer_path) + dataset = GPTSFTDataset( file_path=data_cfg.file_names[0], - tokenizer=get_nmt_tokenizer(library="sentencepiece", tokenizer_model=cfg.tokenizer_path), + tokenizer=tokenizer, max_seq_length=data_cfg.max_seq_length, min_seq_length=data_cfg.min_seq_length, pad_seq_length_to_mult=16, # adds padding in collate_fn so this value is irrelevant here @@ -149,23 +160,9 @@ def main(cfg: 'DictConfig') -> None: logging.info( f""" -✅ Packed datasets with pack sizes {args.pack_sizes} are prepared successfully. -To train with packed sequences, you need to change three things in the SFT/PEFT config file -1. Turn on the packed_sequence flag - > +model.data.train_ds.packed_sequence=True -2. Use the new dataset file instead of the original jsonl file - > model.data.train_ds.file_names=/path/to/packed_dataset.npy -3. Specify the packed sequence length. This should be one of the ``pack_sizes`` you specified during data preparation. - > model.data.train_ds.max_seq_length= -4. Adjust the batch sizes. - Micro batch size has to be set to 1 as a nominal constraint. This is because batches are now concatenated - in the preprocessing step. You can increase the pack_size to achieve the same purpose of increasing micro batch size. - Global batch size has to be reduced by the average number of sequences per pack `n`, - where n = total number of sequences / total number of packs. This ensures that each gradient iteration - sees (on average) the same number of sequences so that the recipe is maintained. - Please scroll up to see the value of n for each of your pack sizes. - > model.micro_batch_size=1 - > model.global_batch_size= +✅ Packed datasets with pack sizes {args.pack_sizes} are prepared successfully. +To train with packed sequences, you need to make changes to the SFT/PEFT config file. See NeMo Documentation +for more details: """ ) diff --git a/scripts/nlp_language_modeling/preprocess_data_for_megatron.py b/scripts/nlp_language_modeling/preprocess_data_for_megatron.py index e1f89182279b..cde14d83ec4b 100644 --- a/scripts/nlp_language_modeling/preprocess_data_for_megatron.py +++ b/scripts/nlp_language_modeling/preprocess_data_for_megatron.py @@ -338,7 +338,7 @@ def main(): if json_file.endswith('.gz'): fin = gzip.open(json_file, 'r') else: - fin = open(args.input, 'r', encoding='utf-8') + fin = open(json_file, 'r', encoding='utf-8') encoded_docs = pool.imap(encoder.encode, fin, 25) diff --git a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py index e6612974952b..4c3efdc79721 100644 --- a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py +++ b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py @@ -226,7 +226,10 @@ def map_dataset_to_nemo(batch): def convert_offline_dataset_to_nemo( - dataset: Dataset, cfg: HFDatasetConversionConfig, basedir: str, manifest_filepath: str, + dataset: Dataset, + cfg: HFDatasetConversionConfig, + basedir: str, + manifest_filepath: str, ): """ Converts a HF dataset to a audio-preprocessed Nemo dataset in Offline mode. 
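The tokenizer_path handling added to prepare_packed_ft_dataset.py above reduces to a small dispatch on the path type. A minimal sketch, assuming get_nmt_tokenizer is imported from its usual NeMo location (the import is not visible in this hunk):

    import os

    from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer


    def build_tokenizer(tokenizer_path: str):
        # A directory is treated as a Hugging Face tokenizer folder; it must contain
        # tokenizer.json, tokenizer_config.json and special_tokens_map.json.
        if os.path.isdir(tokenizer_path):
            return get_nmt_tokenizer(library="huggingface", model_name=tokenizer_path, use_fast=True)
        # Anything else is expected to be a single SentencePiece model file,
        # e.g. /path/to/tokenizer.model.
        return get_nmt_tokenizer(library="sentencepiece", tokenizer_model=tokenizer_path)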
diff --git a/scripts/speech_recognition/estimate_duration_bins_2d.py b/scripts/speech_recognition/estimate_duration_bins_2d.py new file mode 100644 index 000000000000..52d5b3620a2a --- /dev/null +++ b/scripts/speech_recognition/estimate_duration_bins_2d.py @@ -0,0 +1,331 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import ast +import math +from functools import partial +from itertools import islice +from pathlib import Path +from typing import Callable, Iterable + +import numpy as np +import pandas as pd +from lhotse.cut import Cut +from omegaconf import OmegaConf + +from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper +from nemo.collections.common.data.lhotse.cutset import read_cutset_from_config +from nemo.collections.common.data.lhotse.dataloader import ( + DurationFilter, + FixedBucketBatchSizeConstraint2D, + LhotseDataLoadingConfig, + TokenPerSecondFilter, + tokenize, +) +from nemo.collections.common.prompts.formatter import PromptFormatter +from nemo.collections.common.tokenizers import AggregateTokenizer, SentencePieceTokenizer + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Estimate duration bins for Lhotse dynamic bucketing using a sample of the input dataset. " + "The dataset is read either from one or more manifest files and supports data weighting. " + "Unlike estimate_duration_bins.py, this script prepares the setup for 2D bucketing. " + "This means that each main bucket for audio duration is sub-divided into sub-buckets " + "for the number of output tokens (supporting BPE and Aggregated tokenizers). " + "2D bucketing is especially useful for encoder-decoder models where input audio duration is often " + "not sufficient to stratify the sampling with an optimal GPU utilization.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "input", + help='Data input. Options: ' + '1) "path.json" - any single NeMo manifest; ' + '2) "[[path1.json],[path2.json],...]" - any collection of NeMo manifests; ' + '3) "[[path1.json,weight1],[path2.json,weight2],...]" - any collection of weighted NeMo manifests; ' + '4) "input_cfg.yaml" - a new option supporting input configs, same as in model training \'input_cfg\' arg; ' + '5) "path/to/shar_data" - a path to Lhotse Shar data directory; ' + '6) "key=val" - in case none of the previous variants cover your case: "key" is the key you\'d use in NeMo training config with its corresponding value ', + ) + parser.add_argument( + "-t", + "--tokenizer", + nargs="+", + required=True, + help="Path to one or more SPE tokenizers. More than one means we'll use AggregateTokenizer and --langs argument must also be used. When provided, we'll estimate a 2D distribution for input and output sequence lengths.", + ) + parser.add_argument( + "-a", "--langs", nargs="+", help="Language names for each of AggregateTokenizer sub-tokenizers." 
+ ) + parser.add_argument( + "-b", + "--buckets", + type=int, + default=30, + help="The desired number of buckets (dim0 => covers input sequence length / audio duration).", + ) + parser.add_argument( + "-s", + "--sub-buckets", + type=int, + default=2, + help="The desired number of sub-buckets (dim1 => covers output sequence length / num_tokens).", + ) + parser.add_argument("--text-field", default="text", help="The key in manifests to read transcripts from.") + parser.add_argument("--lang-field", default="lang", help="The key in manifests to read language from.") + parser.add_argument( + "-n", + "--num_examples", + type=int, + default=-1, + help="The number of examples (utterances) to estimate the bins. -1 means use all data " + "(be careful: it could be iterated over infinitely).", + ) + parser.add_argument( + "-l", + "--min_duration", + type=float, + default=-float("inf"), + help="If specified, we'll filter out utterances shorter than this.", + ) + parser.add_argument( + "-u", + "--max_duration", + type=float, + default=float("inf"), + help="If specified, we'll filter out utterances longer than this.", + ) + parser.add_argument( + "--max_tps", + type=float, + default=float("inf"), + help="If specified, we'll filter out utterances with more tokens/second than this. " + "On regular utterances and BPE tokenizers with 1024 tokens 10-12tps is generally a reasonable limit.", + ) + parser.add_argument( + "-q", "--quiet", type=bool, default=False, help="When specified, only print the estimated duration bins." + ) + parser.add_argument( + "-f", + "--prompt-format", + type=str, + help="When specified, we'll use a prompt formatter in addition to the tokenizer for the purpose of estimating token count bins. " + "This is useful for accurate 2D bucket estimation with models such as EncDecMultiTaskModel (Canary-1B), " + "or any model where the label sequence consists of a user prompt and a model's response.", + ) + parser.add_argument( + "-p", + "--prompt", + type=str, + help="Prompt slots provided as a Python list of dicts. It is used together with --prompt-format option." + "For example, with Canary-1B you may use: [{'role':'user','slots':{'source_lang':'en','target_lang':'en','task':'asr','pnc':'yes'}]", + ) + return parser.parse_args() + + +def estimate_duration_buckets( + cuts: Iterable[Cut], + num_buckets: int, + num_subbuckets: int, + max_tps: float, + max_duration: float, + quiet: bool, +) -> list[tuple[float, float]]: + """ + This function is based on lhotse.dataset.sampling.dynamic_bucketing.estimate_duration_buckets. + It extends it to a 2D bucketing case. + """ + assert num_buckets > 1 + + constraint = FixedBucketBatchSizeConstraint2D([(0.0, 0.0)], [0]) + + # Gather the duration and token count statistics for the dataset. + sizes = [] + num_tokens = [] + for c in cuts: + dur, toks = constraint.measure_length(c) + sizes.append(dur) + num_tokens.append(toks) + sizes = np.array(sizes, dtype=np.float32) + num_tokens = np.array(num_tokens, dtype=np.int32) + joint = np.rec.fromarrays([sizes, num_tokens]) + joint.sort() + sizes = joint.f0 + num_tokens = joint.f1 + + # We are building buckets with equal duration (empirically leads to more even bucket exhaustion over time). + # We need to determine how much duration to allocate per bucket. 
+ size_per_bucket = sizes.sum() / num_buckets + + if not quiet: + print("Duration distribution:") + print(pd.Series(sizes).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])) + if math.isinf(max_duration): + max_duration = sizes[-1] + + tps = num_tokens / sizes + if not quiet: + print("Token per second distribution:") + print(pd.Series(tps).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])) + if math.isinf(max_tps): + max_tps = tps.max() + del tps + + bins = [] + bin_indexes = [0] + tot = 0.0 + + def _estimate_token_buckets(max_bucket_duration): + # Since this is 2D bucketing, apply the same bin creation logic + # for the second dimension (i.e. token count) as for the first dimension (duration). + # That means we aim to have each bucket contain roughly the same number of tokens. + # Note that this estimation is biased towards more padding if you have + # a lot of zero-token examples (e.g. non-speech). + nonlocal bins + num_tokens_bucket = num_tokens[bin_indexes[-1] : binidx] + num_tokens_bucket.sort() + tokens_per_subbucket = num_tokens_bucket.sum() / num_subbuckets + tot_toks = 0 + # Iterate over token counts, and whenever we hit tokens_per_subbucket, create a new 2D bucket bin. + for num_toks in num_tokens_bucket: + # Threshold hit: we are creating a new (max_duration, max_num_tokens) bin. + if tot_toks > tokens_per_subbucket: + bins.append((max_bucket_duration, num_toks)) + tot_toks = 0 + tot_toks += num_toks + bins.append((size, math.ceil(size * max_tps))) + + # Iterate over data, and whenever we hit size_per_bucket, create a new bucket bin. + for binidx, size in enumerate(sizes): + if tot > size_per_bucket: + # Threshold hit: we are creating a new duration bin (multiplied by number of token bins). + _estimate_token_buckets(max_bucket_duration=size) + tot = 0.0 + tot += size + + # Estimate an extra 2D bin set for global max duration. 
+ _estimate_token_buckets(max_bucket_duration=max_duration) + + return bins + + +def load_tokenizer(paths: list[str], langs: list[str] = None) -> TokenizerWrapper: + if len(paths) == 1: + tok = SentencePieceTokenizer(paths[0]) + else: + assert langs is not None and len(paths) == len( + langs + ), f"Cannot create AggregateTokenizer; each tokenizer must have assigned a language via --langs option (we got --tokenizers={paths} and --langs={langs})" + tok = AggregateTokenizer({lang: SentencePieceTokenizer(p) for lang, p in zip(langs, paths)}) + return TokenizerWrapper(tok) + + +def apply_tokenizer(cut, tokenizer=None, prompt: PromptFormatter = None): + if prompt is not None: + turns = prompt.get_default_dialog_slots() + last_turn = {"role": prompt.OUTPUT_ROLE, "slots": prompt.get_slots(prompt.OUTPUT_ROLE)} + assert len(last_turn["slots"]) == 1 # TODO: not sure how to handle multi-slot for system output here + for key in last_turn["slots"]: + last_turn["slots"][key] = cut.supervisions[0].text + last_turn["slots"][prompt.PROMPT_LANGUAGE_SLOT] = cut.supervisions[0].language + turns.append(last_turn) + ans = prompt.encode_dialog(turns) + cut.supervisions[0].tokens = ans["input_ids"] + + elif tokenizer is not None: + cut = tokenize(cut, tokenizer) + + return cut + + +class RejectionsCounter: + def __init__(self, predicate: Callable, message: str): + self.predicate = predicate + self.message = message + self.total = 0 + self.rejected = 0 + + def __call__(self, example) -> bool: + ans = self.predicate(example) + self.total += 1 + if not ans: + self.rejected += 1 + return ans + + def print_report(self) -> None: + if self.rejected: + print(f"{self.message} | Rejected {self.rejected}/{self.total} examples.") + + +def main(): + args = parse_args() + + if not args.quiet: + pd.set_option('display.float_format', lambda x: '%.2f' % x) + + tokenizer = None + prompt = None + if args.tokenizer is not None: + tokenizer = load_tokenizer(args.tokenizer, args.langs) + if args.prompt_format is not None: + prompt_defaults = None + if args.prompt is not None: + prompt_defaults = ast.literal_eval(args.prompt) + prompt = PromptFormatter.resolve(args.prompt_format)(tokenizer._tokenizer, defaults=prompt_defaults) + + if '=' in args.input: + inp_arg = args.input + elif args.input.endswith(".yaml"): + inp_arg = f"input_cfg={args.input}" + elif Path(args.input).is_dir(): + inp_arg = f"shar_path={args.input}" + else: + inp_arg = f"manifest_filepath={args.input}" + config = OmegaConf.merge( + OmegaConf.structured(LhotseDataLoadingConfig), + OmegaConf.from_dotlist( + [inp_arg, "metadata_only=true", f"text_field={args.text_field}", f"lang_field={args.lang_field}"] + ), + ) + cuts, _ = read_cutset_from_config(config) + duration_filter = RejectionsCounter(DurationFilter(args.min_duration, args.max_duration), "Duration filtering") + cuts = cuts.filter(duration_filter) + cuts = cuts.map(partial(apply_tokenizer, tokenizer=tokenizer, prompt=prompt)) + tps_filter = RejectionsCounter(TokenPerSecondFilter(-1, args.max_tps), "Token per second filtering") + cuts = cuts.filter(tps_filter) + if (N := args.num_examples) > 0: + cuts = islice(cuts, N) + + duration_bins = estimate_duration_buckets( + cuts, + num_buckets=args.buckets, + num_subbuckets=args.sub_buckets, + max_tps=args.max_tps, + max_duration=args.max_duration, + quiet=args.quiet, + ) + duration_bins = "[" + ','.join(f"[{b:.3f},{sb:d}]" for b, sb in duration_bins) + "]" + if args.quiet: + print(duration_bins) + return + duration_filter.print_report() + tps_filter.print_report() + 
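    # Each printed bin is a [max_duration, max_num_tokens] pair defining one 2D bucket, so with
    # --buckets B and --sub-buckets S the list holds roughly B * S entries, for example
    # (illustrative values only): [[4.531,38],[4.531,61],[10.274,89],[10.274,142],...].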
print("Use the following options in your config:") + print(f"\tnum_buckets={args.buckets}") + print(f"\tbucket_duration_bins={duration_bins}") + + +if __name__ == "__main__": + main() diff --git a/scripts/speech_recognition/filter_tarred_audio_dataset.py b/scripts/speech_recognition/filter_tarred_audio_dataset.py new file mode 100644 index 000000000000..bbe88c6700f3 --- /dev/null +++ b/scripts/speech_recognition/filter_tarred_audio_dataset.py @@ -0,0 +1,142 @@ +from functools import partial +from io import BytesIO +from pathlib import Path + +import click +import lhotse +import torch.utils.data +from lhotse import CutSet, MonoCut +from lhotse.audio.backend import LibsndfileBackend +from lhotse.dataset import DynamicCutSampler, IterableDatasetWrapper +from lhotse.shar import JsonlShardWriter, TarWriter +from omegaconf import OmegaConf + +from nemo.collections.common.data.lhotse import read_cutset_from_config +from nemo.collections.common.data.lhotse.dataloader import LhotseDataLoadingConfig + + +@click.command() +@click.argument("manifest_filepath") +@click.argument("tarred_audio_filepaths") +@click.argument("filtered_manifest_filepath") +@click.argument("output_dir", type=click.Path()) +@click.option( + "-f", + "--output-format", + type=click.Choice(["lhotse_shar", "nemo_tarred"]), + default="lhotse_shar", + help="Which format should we use to save the filtered tarred data.", +) +@click.option("-s", "--shard-size", type=int, default=1000, help="Desired number of examples per output shard.") +def filter_tarred( + manifest_filepath: str, + tarred_audio_filepaths: str, + filtered_manifest_filepath: str, + output_dir: str, + output_format: str, + shard_size: int, +): + """ + Given an existing tarred dataset and manifests that point to a subset of examples, + create a new tarred dataset corresponding to the subset. + + This is useful if you want to "re-tar" an existing tarred dataset in order to efficiently + read some subset of it. 
+ """ + lhotse.set_dill_enabled(True) + all_cuts = read_cutset(manifest_filepath, tarred_audio_filepaths) + keep_cuts = {cut.id: cut for cut in read_cutset(filtered_manifest_filepath)} + filtered_cuts = bg_load( + all_cuts.filter(lambda c: c.id in keep_cuts).map(partial(attach_custom, cuts_with_custom=keep_cuts)) + ) + if not '://' in output_dir: # we support object store writing too + Path(output_dir).mkdir(exist_ok=True, parents=True) + if output_format == "lhotse_shar": + filtered_cuts.to_shar(output_dir=output_dir, fields={"recording": "flac"}, shard_size=shard_size) + elif output_format == "nemo_tarred": + export_to_nemo_tarred(cuts=filtered_cuts, output_dir=output_dir, shard_size=shard_size) + else: + raise RuntimeError(f"Unsupported output format: '{output_format}'") + + +def read_cutset(src: str, tar: str | None = None) -> CutSet: + inp_arg = ["force_finite=true"] + if tar is not None: + inp_arg += [f"manifest_filepath={src}", f"tarred_audio_filepaths={tar}"] + else: + inp_arg += ["metadata_only=true"] + if src.endswith(".yaml"): + inp_arg += [f"input_cfg={src}"] + elif Path(src).is_dir(): + inp_arg += [f"shar_path={src}"] + else: + inp_arg += [f"manifest_filepath={src}"] + config = OmegaConf.merge( + OmegaConf.structured(LhotseDataLoadingConfig), + OmegaConf.from_dotlist(inp_arg), + ) + cuts, _ = read_cutset_from_config(config) + return cuts + + +def export_to_nemo_tarred(cuts: CutSet, output_dir: str, shard_size: int) -> None: + with ( + TarWriter(pattern=f"{output_dir}/audio_%d.tar", shard_size=shard_size) as aw, + JsonlShardWriter(pattern=f"{output_dir}/manifest_%d.jsonl", shard_size=shard_size) as mw, + ): + for cut in cuts: + assert ( + isinstance(cut, MonoCut) and len(cut.supervisions) == 1 + ), f"Export to nemo_tarred format is possible only for mono cuts with a single supervision, but we got: {cut}" + # Prepare audio for writing. + audio_name = f"{cut.id}.flac" + audio = BytesIO() + LibsndfileBackend().save_audio(audio, cut.load_audio(), sampling_rate=cut.sampling_rate, format="flac") + audio.seek(0) + # Prepare manifest for writing. + ans = {"audio_filepath": audio_name, "duration": cut.duration} + if cut.supervisions[0].text: + ans["text"] = cut.supervisions[0].text + if cut.supervisions[0].language: + ans["lang"] = cut.supervisions[0].language + if cut.custom is not None: + # Ensure if we export anything custom, these are only simple built-in types compatible with JSON. + ans.update({k: v for k, v in cut.custom.items() if isinstance(v, (int, float, str, list, dict))}) + # Set the right shard_id. + shard_id = max(0, mw.num_shards - 1) + if mw.num_items > 0 and mw.num_items % mw.shard_size == 0: + shard_id += 1 + ans["shard_id"] = shard_id + # Write both items. 
+ aw.write(audio_name, audio) + mw.write(ans) + + +def attach_custom(cut, cuts_with_custom): + custom = cuts_with_custom[cut.id].custom + if custom is not None: + cut.custom.update(custom) + return cut + + +class Identity(torch.utils.data.Dataset): + def __getitem__(self, x): + cut = x[0] + for k in ["dataloading_info", "shard_id"]: + cut.custom.pop(k, None) + return cut + + +def bg_load(cuts: CutSet) -> CutSet: + return CutSet( + torch.utils.data.DataLoader( + IterableDatasetWrapper(Identity(), DynamicCutSampler(cuts, max_cuts=1)), + batch_size=None, + num_workers=1, + prefetch_factor=10, + ) + ) + + +if __name__ == '__main__': + filter_tarred() diff --git a/scripts/speech_recognition/oomptimizer.py b/scripts/speech_recognition/oomptimizer.py new file mode 100755 index 000000000000..165ac5ac692d --- /dev/null +++ b/scripts/speech_recognition/oomptimizer.py @@ -0,0 +1,528 @@ +#!/usr/bin/env python +import importlib +import math +import sys +from numbers import Number +from typing import Iterable, Literal + +import click +import pytorch_lightning as pl +import torch +from lhotse import compute_num_samples +from omegaconf import OmegaConf + +from nemo.collections.asr.models.asr_model import ASRModel +from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType +from nemo.utils import logging + + +class ProfilingBatchGenerator: + """ + ProfilingBatchGenerator is used to generate artificial mini-batches for model training + and tracking the progress of batch size optimization. + + The high-level usage API is the following:: + + >>> gen = ProfilingBatchGenerator(schema) + ... finished = False + ... while not finished: + ... batch = gen(input_seq_len, output_seq_len) + ... try: + ... training_step(model, batch) + ... oom = False + ... except torch.cuda.OutOfMemoryError: + ... oom = True + ... finished = gen.advance(oom) + ... solution = gen.max_batch_size # The solution of the search problem. + ... gen.reset() # Can re-use for other sequence lengths now. + + The search terminates once the difference between max working batch size and min OOM batch size + divided by the latter is smaller than ``rel_gap_thresh`` that difference amounts to a single element. + For example, a max working batch size is 96 and min OOM batch size is 100 indicates a gap of 0.04, + which would terminate the search with threshold of 0.05. + + In order to generate mini-batches compatible with a given model, the generator: + + * accepts a ``schema`` argument in its constructor, and + + * accepts input/output sequence lengths in each call to generate a mini-batch. + + ``schema`` has the following structure:: + + + >>> { + ... "cls": tuple | MyBatchType, + ... "inputs": [ + ... { + ... "type": NeuralType(...) | Literal["dummy"], + ... "seq_length": Literal["input", "output"], + ... "vocab_size": int, # optional, required only for LabelsType + ... "name": str, # optional, indicates kwarg + ... }, + ... ..., + ... ] + ... } + + ``cls`` indicates how we should construct the mini-batch. Typically you can just use ``tuple`` for most + batch schemas. However, if the model expects a specific, e.g., dataclass, you can tell ``ProfilingBatchGenerator`` + to use it. The mini-batch object will be constructed using the items in ``inputs``. + + Each element of ``inputs`` specifies a NeMo NeuralType which needs to have a defined ``elements_type``. + The supported types are ``AudioSignal``, ``LengthsType`` and ``LabelsType``. 
+ If "type" is not a NeuralType, we interpret that as a placeholder tensor that's not relevant but expected + by the model/batch constructor. In addition, ``"seq_length"`` key is used to determine whether we should apply + input or output sequence length to a given tensor. + + Optional keys: + + * ``vocab_size`` is required for ``LabelsType`` so that we can generate proper label values. + + * ``name`` is required if objects of ``cls`` have to be constructed using keyword arguments. + + A simple schema example for a model using audio/lengths tensor pair (unsupervised/self-supervised):: + + >>> { + ... "cls": tuple, + ... "inputs": [ + ... {"type": NeuralType(("B", "T"), AudioSignal()), "seq_length": "input"}, + ... {"type": NeuralType(("B"), LengthsType()), "seq_length": "input"}, + ... ] + ... } + + """ + + def __init__( + self, + schema: dict, + start_batch_size: int = 32, + rel_gap_thresh: float = 0.05, + device: str = "cuda", + ): + self.schema = schema + self.start_batch_size = start_batch_size + self.rel_gap_thresh = rel_gap_thresh + self.device = device + self.reset() + + def __call__(self, input_seq_length: int, output_seq_length: int): + B = self._current + select_seq_length = {"input": input_seq_length, "output": output_seq_length} + batch = [] + names = [] + for item in self.schema["inputs"]: + nt = item["type"] + if not isinstance(nt, NeuralType): # placeholder + tnsr = torch.tensor([]) + elif isinstance(nt.elements_type, AudioSignal): + seq_length = select_seq_length[item["seq_length"]] + tnsr = torch.randn(B, seq_length, dtype=torch.float32, device=self.device) + elif isinstance(nt.elements_type, LengthsType): + seq_length = select_seq_length[item["seq_length"]] + tnsr = torch.ones(B, dtype=torch.long, device=self.device) * seq_length + elif isinstance(nt.elements_type, LabelsType): + seq_length = select_seq_length[item["seq_length"]] + tnsr = torch.randint(0, item["vocab_size"], size=(B, seq_length), device=self.device) + else: + raise RuntimeError("Unexpected item in oomptimizer schema: {item}") + batch.append(tnsr) + names.append(item.get("name")) + args = [elem for name, elem in zip(names, batch) if name is None] + kwargs = {name: elem for name, elem in zip(names, batch) if name is not None} + if not kwargs and self.schema["cls"] == tuple: + return tuple(args) + return self.schema["cls"](*args, **kwargs) + + @property + def max_batch_size(self) -> int | None: + """ + Return the solution of the batch size search problem. + It will keep returning None until the search is done. + """ + if ( + self._max_ok is not None + and self._min_err is not None + and (self.current_rel_gap <= self.rel_gap_thresh or self._min_err - self._max_ok <= 1) + ): + return self._max_ok + return None + + @property + def current_rel_gap(self) -> float | None: + """ + Return the current gap between the largest batch that works and the smallest batch that triggers OOM. + The gap is defined as the batch size difference divided by the larger element. + E.g., if the best found batch size is 95 and the smallest that triggers OOM is 100, the gap is 0.05. + """ + if self._min_err is None or self._max_ok is None: + return None + return (self._min_err - self._max_ok) / self._min_err + + def reset(self): + """Reset the generator to prepare it for a new search.""" + self._current = self.start_batch_size + self._max_ok = None # max batch size that works + self._min_err = None # min batch size that doesn't work + + def advance(self, oom: bool) -> bool: + """ + Adjusts the current batch size based on the outcome. 
+ Returns a bool indicating whether the calibration is complete. + """ + if self.max_batch_size is not None: + return True + + if oom: + # Training step failed with OOM. + # Update the minimum known batch size that causes an error. + self._min_err = min(float("inf") if self._min_err is None else self._min_err, self._current) + # Training step failed on OOM + if self._max_ok is None: + # We haven't found a batch size that works yet, keep going 2x down. + self._current = round(self._current / 2) + else: + # Try the middle-point between the known extremes. + self._current = round((self._max_ok + self._min_err) / 2) + else: + # Training step successful. + # Update the maximum known batch size that works. + self._max_ok = max(-1 if self._max_ok is None else self._max_ok, self._current) + if self._min_err is None: + # We haven't found a batch size that causes an error yet, keep going 2x higher + self._current *= 2 + else: + # Try the middle-point between the known extremes. + self._current = round((self._max_ok + self._min_err) / 2) + + return False + + +class FloatList(click.Option): + """Support passing bucket duration bins as [1.1,2.5,5.6,...]""" + + name = "list[float]" + + def type_cast_value(self, ctx, value): + if isinstance(value, list) and all(isinstance(v, float) for v in value): + return value + try: + import ast + + ans = ast.literal_eval(value) + if isinstance(ans[0], list): + ans = [tuple(item) for item in ans] + return ans + except ValueError: + raise click.BadParameter(value) + + +@click.command(context_settings={'show_default': True}) +@click.option( + "-n", + "--pretrained-name", + type=str, + default=None, + help="Name of a pretrained model to use, e.g. 'nvidia/canary-1b'.", +) +@click.option( + "-m", + "--module-name", + type=str, + default=None, + help="Full path to NeMo's module corresponding to CONFIG_PATH, e.g. 'nemo.collections.asr.models.EncDecMultiTaskModel'.", +) +@click.option( + "-c", "--config-path", type=str, default=None, help="Path to the training configuration file for MODULE_NAME." +) +@click.option("-o", "--optimizer-name", type=str, default="adamw", help="Name of optimizer to use.") +@click.option( + "-b", + "--buckets", + cls=FloatList, + default=[5.0, 10.0, 15.0, 20.0, 25.0, 30.0], + help="List of upper-bound bucket bins (i.e. first bucket is [0.0 - item0), second bucket is [item0 - item1), etc.). " + "We also support a nested list for 2D bucketing, e.g. [[2.0, 10],[2.0,20],[4.5,15],[4.5,30],...], " + "where each item is a pair of (max_input_seq_len, max_output_seq_len) for a given bucket.", +) +@click.option( + "-t", + "--threshold", + type=float, + default=0.05, + help="Search stopping criterion in range [0, 1], lower is more precise. Interpret as the uncerainty gap, i.e. (min_oom_batch_size - max_ok_batch_size) / min_oom_batch_size.", +) +@click.option("-s", "--start-batch-size", type=int, default=32, help="Initial batch size to start the search from.") +@click.option( + "-r", + "--ratio", + type=int, + default=12, # conservative estimate towards longer transcripts + help="The output_sequence_length to input_sequence_length ratio for the purpose of determing the maximum output sequence lengths. " + "The interpretation depends on input and output modalities. Examples: for audio->text it's tokens per second. " + "For text->audio it's seconds per token. For audio->audio it's output seconds per input second. " + "For text->text it's output tokens per input token. " + "In general larger ratio means longer output sequences and increased memory consumption. 
" + "The default value is set adequately for automatic speech recognition. " + "This argument is ignored when 2D buckets are provided to --buckets option.", +) +@click.option( + "-f", + "--memory-fraction", + type=float, + default=0.9, + help="Limits the use of CUDA memory for this process to MEMORY_FRACTION of the total device memory. " + "By default we force 5% memory to be unused to account for non-training-loop related CUDA memory usage" + "in actual training scripts.", +) +@click.option( + "-d", + "--device", + default="cuda:0", + help="Device string to be passed to torch.device; due to MEMORY_FRACTION option, " + "it must specify the device index (e.g. cuda:0). " + "You can also leave the default index and select a specific GPU using env var CUDA_VISIBLE_DEVICES=", +) +@click.option( + "-y", + "--dtype", + default="bfloat16", + help="Float precision to use for computation (used together with autocast).", +) +@click.option( + "--ddp/--no-ddp", + type=bool, + default=True, + help="Whether we should simulate DDP GPU RAM usage. Stores an extra copy of the model in GPU memory. Enabled by default.", +) +def oomptimizer( + pretrained_name: str | None, + module_name: str | None, + config_path: str | None, + optimizer_name: str, + buckets: list[float], + threshold: float, + start_batch_size: int, + ratio: int, + memory_fraction: float, + device: str, + dtype: str, + ddp: bool, +): + """ + OOMptimizer finds the optimal batch sizes for training your model with bucketing dataloading. + It performs a search over batch sizes until it converges by measuring the GPU memory usage for + a model's training step and optimizer update. + + \b + There are two main usage patterns: for using a pretrained model or an untrained model configuration. + The latter is more flexible but requires the user to provide two separate arguments. Examples: + * python oomptimizer.py --pretrained-name nvidia/canary-1b + * python oomptimizer.py --module-name nemo.collections.asr.models.EncDecMultiTaskModel \ + --config-path examples/asr/conf/speech_multitask/fast-conformer_aed.yaml + + Dynamic bucketing is notoriously difficult to tune as you risk running into CUDA OOM many steps into the training. + In order to simplify finding the optimal settings, OOMptimizer scans each bucket to find the maximum possible + batch size that doesn't trigger a CUDA OOM. + + \b + The suggested workflow is the following: + 1) Run scripts/speech_recognition/estimate_duration_bins.py to get the duration distribution of your data. + (consider running estimate_duration_bins_2d.py for models with a strong dependency on output sequence length + such as attention-encoder-decoder models). + 2) Run OOMptimizer to find the optimal batch sizes for your specific model, optimizer, and GPU. + 3) Use these optimal settings in your actual training script and enjoy optimal GPU utilization OOM-free. + + In the unlikely event that OOMptimizer bucket batch sizes are still leading to OOMs, + please try a lower setting of the MEMORY_FRACTION option, e.g. 0.75 (75% of GPU memory). + This may be required in very complex setups where there are additional GPU RAM loads that can't be anticipated + through the combination of training_step and optimizer update. 
+ """ + if all(opt is None for opt in (pretrained_name, module_name, config_path)): + click.secho( + "You need to provide either PRETRAINED_NAME or the pair of MODULE_NAME and CONFIG_PATH.", fg="yellow" + ) + sys.exit(1) + logging.setLevel(logging.CRITICAL) + torch.cuda.set_per_process_memory_fraction(memory_fraction, device) + + trainer = pl.Trainer(barebones=True) + trainer.log_every_n_steps = 1000000 + model_clones = [] + for _ in range(2 if ddp else 1): + if pretrained_name is not None: + assert ( + config_path is None and module_name is None + ), "--pretrained-name cannot be used together with --module-name/--config-path" + click.echo(f"Intializing ASR model from pretrained checkpoint {pretrained_name}.") + model = ASRModel.from_pretrained(pretrained_name, trainer=trainer).to(device) + else: + assert config_path is not None, "--module-name requires --config-path to be specified as well." + assert module_name is not None, "--config-path requires --module-name to be specified as well." + cfg = OmegaConf.load(config_path) + namespace, name = module_name.rsplit('.', maxsplit=1) + model_cls = getattr(importlib.import_module(namespace), name) + model = model_cls(cfg=cfg.model, trainer=trainer).to(device) + model_clones.append(model) + model = model_clones[-1] + + if not hasattr(model, "oomptimizer_schema"): + click.secho( + f"We read model of type {type(model)} which doesn't seem to support OOMptimizer " + f"(we could not find the property .oomptimizer_schema).", + fg="red", + ) + sys.exit(1) + + schema = model.oomptimizer_schema + + click.echo("Setting up the optimizers.") + optimizer, _ = model.setup_optimization({"name": optimizer_name, "lr": 1e-7, "weight_decay": 0.0}) + + is_2d_bucketing = all( + isinstance(item, (list, tuple)) and len(item) == 2 and all(isinstance(v, Number) for v in item) + for item in buckets + ) + # Determine modality for input and output. + modalities = [ + ( + "text" + if any( + isinstance(item["type"].elements_type, LabelsType) and item["seq_length"] == direction + for item in schema["inputs"] + if item["type"] != "dummy" + ) + else "audio" + ) + for direction in ("input", "output") + ] + + def get_max_seq_lens(buckets): + + def _determine_lens_for_bucket(bin): + if is_2d_bucketing: + input_len, output_len = bin + else: + input_len = bin + output_len = math.ceil(ratio * input_len) + sampling_rate = getattr( + model, "sample_rate", 16000 + ) # TODO: may need to extend schema for broader model coverage + match modalities: + case "audio", "audio": + return ( + compute_num_samples(input_len, sampling_rate=sampling_rate), + compute_num_samples(output_len, sampling_rate=sampling_rate), + ) + case "audio", "text": + return (compute_num_samples(input_len, sampling_rate=sampling_rate), output_len) + case "text", "audio": + return ( + input_len, + compute_num_samples(output_len, sampling_rate=sampling_rate), + ) + case "text", "text": + return input_len, output_len + case _: + raise RuntimeError(f"Unexpected modality combination: {_}") + + return [_determine_lens_for_bucket(bin) for bin in buckets] + + click.echo("Starting profiling.") + max_seq_lens = get_max_seq_lens(buckets) + gen = ProfilingBatchGenerator(schema=schema, start_batch_size=start_batch_size, rel_gap_thresh=threshold) + profile = {} + + # Iterate buckets from the largest to the smallest sequences. This usually ends up creating + # a tiny bit smaller batches, likely due to worse memory fragmentation. 
+ with torch.autocast("cuda", getattr(torch, dtype)): + for bucket, (seq_len_in, seq_len_out) in reversed(list(zip(buckets, max_seq_lens))): + click.echo(f"The current sequence lengths are: input={seq_len_in} output={seq_len_out}.") + gen.reset() + batch_idx = 0 + + def step(): + click.echo( + f"\t[BEGIN step] [CUDA RAM CURRENT: {torch.cuda.memory_allocated() / (1024 * 1024):.1f}MB] [CUDA RAM MAX: {torch.cuda.max_memory_allocated() / (1024*1024):.1f}MB]" + ) + batch = gen(seq_len_in, seq_len_out) + oom = False + try: + click.echo(f"\tCurrent gap: {gen.current_rel_gap}... ", nl=False) + optimizer.zero_grad() + out = model.training_step(batch, batch_idx) + out['loss'].sum().backward() + optimizer.step() + except torch.cuda.OutOfMemoryError as e: + click.secho(f"OOM!", fg="yellow") + oom = True + except RuntimeError as e: + if "cuFFT error: CUFFT_INTERNAL_ERROR" not in str(e): + raise + click.secho(f"OOM!", fg="yellow") + oom = True + else: + click.secho(f"OK!", fg="green") + finally: + click.echo( + f"\t[END step] [CUDA RAM CURRENT: {torch.cuda.memory_allocated() / (1024 * 1024):.1f}MB] [CUDA RAM MAX: {torch.cuda.max_memory_allocated() / (1024*1024):.1f}MB]" + ) + del batch + # Note: We could call empty_cache() to free up some more memory on the GPU, + # but we have found out empirically that this causes a mismatched condition + # between OOMptimizer and the actual training. During training, there is some + # degree of memory fragmentation and it's better to simulate that in OOMptimizer. + # torch.cuda.memory.empty_cache() + torch.cuda.reset_max_memory_allocated() + return oom + + oom = step() + while not (finished := gen.advance(oom)): + click.echo("\t" + "=" * 80) + oom = step() + + click.secho( + f"=> Optimal setting for bucket={bucket} (input={seq_len_in} output={seq_len_out}) is max_batch_size={gen.max_batch_size}", + fg="green", + ) + profile[(bucket, seq_len_in, seq_len_out)] = gen.max_batch_size + gen.start_batch_size = gen.max_batch_size * 2 + + # Reverse the profile to be ascendingly sorted again. + profile = dict(reversed(list(profile.items()))) + + click.echo("The 1st stage profile is:") + for (bucket, seq_len_in, seq_len_out), bs in profile.items(): + click.echo(f"Bucket={bucket} (input={seq_len_in} output={seq_len_out}) => max_batch_size={bs}") + + if is_2d_bucketing: + # 2D bucketing doesn't support bucket merging. 
+ final_profile = [["[" + ",".join(map(str, b)) + "]", bs] for (b, _, __), bs in profile.items()] + max_input_len, max_output_len = buckets[-1] + ratio = max_output_len / max_input_len + else: + click.echo("Bucket merging stage...") + final_profile = [] + for idx, ((bucket, seq_len_in, seq_len_out), bs) in enumerate(profile.items()): + if idx == 0: + final_profile.append([bucket, bs]) + continue + if bs == final_profile[-1][1]: + click.echo(f"Merging bucket {idx} with bucket {idx-1} due to identical batch sizes.") + final_profile[-1][0] = bucket + continue + final_profile.append([bucket, bs]) + max_input_len = final_profile[-1][0] + + click.secho(f"The profile was created with the following settings:") + click.secho(f"* using {memory_fraction:.1%} of available GPU RAM.") + click.secho(f"* {'' if ddp else 'not '}simulating DDP memory overhead.") + click.secho(f"* using AMP with dtype={dtype}.") + click.secho("The final profile is:", bold=True) + click.secho("\tbucket_duration_bins=[" + ",".join(str(seqlen) for seqlen, bs in final_profile) + "]", bold=True) + click.secho("\tbucket_batch_size=[" + ",".join(str(bs) for seqlen, bs in final_profile) + "]", bold=True) + click.secho("\t(The following flags are suitable for ASR/speech-to-text models):") + click.secho(f"\tmax_tps={ratio}", bold=True) + click.secho(f"\tmax_duration={max_input_len}", bold=True) + + +if __name__ == "__main__": + oomptimizer() diff --git a/setup.py b/setup.py index 000de8aa0f66..7787c0ba9603 100644 --- a/setup.py +++ b/setup.py @@ -273,7 +273,7 @@ def finalize_options(self): # Custom commands. cmdclass={'style': StyleCommand}, entry_points={ - "sdk.factories": [ + "run.factories": [ "llm = nemo.collections.llm", ], }, diff --git a/tests/collections/asr/decoding/test_multi_task_decoding.py b/tests/collections/asr/decoding/test_multi_task_decoding.py new file mode 100644 index 000000000000..906caccad396 --- /dev/null +++ b/tests/collections/asr/decoding/test_multi_task_decoding.py @@ -0,0 +1,198 @@ +from unittest.mock import Mock + +import pytest +import torch + +from nemo.collections.asr.modules.transformer.transformer import TransformerDecoderNM +from nemo.collections.asr.modules.transformer.transformer_generators import ( + BeamSearchSequenceGenerator, + GreedySequenceGenerator, +) +from nemo.collections.asr.parts.submodules.multitask_beam_decoding import TransformerAEDBeamInfer +from nemo.collections.asr.parts.submodules.multitask_greedy_decoding import TransformerAEDGreedyInfer +from nemo.collections.asr.parts.submodules.token_classifier import TokenClassifier + + +@pytest.fixture() +def deterministic_rng(): + state = torch.get_rng_state() + torch.manual_seed(0) + yield + torch.set_rng_state(state) + + +@pytest.fixture() +def decoder_nm(deterministic_rng): + return TransformerDecoderNM( + vocab_size=8, + hidden_size=2, + num_layers=1, + inner_size=4, + num_attention_heads=1, + max_sequence_length=32, + ).eval() + + +@pytest.fixture() +def nnet(decoder_nm): + ans = ( + decoder_nm.embedding, + decoder_nm.decoder, + TokenClassifier(hidden_size=2, num_classes=8), + ) + ans = tuple(m.eval() for m in ans) + return ans + + +@pytest.fixture() +def inputs(): + B, T, C = 1, 5, 2 + return ( + torch.tensor([[1]], dtype=torch.long), # decoder_input_ids + torch.ones(B, T, C, dtype=torch.float), # encoder_hidden_states + torch.ones(B, T, dtype=torch.float), # encoder_input_mask + ) + + +@pytest.fixture() +def tokenizer(): + tok = Mock() + tok.pad = 0 + tok.bos = 1 + tok.eos = 2 + return tok + + +def test_greedy_decoding(inputs, 
nnet, deterministic_rng): + gen = GreedySequenceGenerator(*nnet) + output = gen(*inputs) + + assert len(output) == 2 + best_path, hypotheses = output + + assert best_path is not None + assert torch.is_tensor(best_path) + assert best_path.shape == (1, 25) + + assert hypotheses is None + + +def test_temperature_sampling_decoding(inputs, nnet): + gen = GreedySequenceGenerator(*nnet, temperature=10.0, n_samples=2) + output = gen(*inputs) + + assert len(output) == 2 + best_path, hypotheses = output + + assert best_path is not None + assert torch.is_tensor(best_path) + assert best_path.shape[0] == 1 + + assert isinstance(hypotheses, list) + assert len(hypotheses) == 1 + (seq0,) = hypotheses + assert seq0.shape[0] == 2 + assert (seq0[0] != seq0[1]).any() + + +def test_beam_decoding_beam_scores_false(inputs, nnet): + gen = BeamSearchSequenceGenerator(*nnet, beam_size=2) + output = gen(*inputs, return_beam_scores=False) + + assert len(output) == 1 + (best_path,) = output + + assert best_path is not None + assert torch.is_tensor(best_path) + assert best_path.shape == (26,) + + +def test_beam_decoding_beam_scores_true(inputs, nnet): + gen = BeamSearchSequenceGenerator(*nnet, beam_size=2) + output = gen(*inputs, return_beam_scores=True) + + assert len(output) == 3 + beam_paths, scores, best_path = output + + assert beam_paths is not None + assert isinstance(beam_paths, list) + assert len(beam_paths) == 1 + (beam_paths_seq0,) = beam_paths + assert torch.is_tensor(beam_paths_seq0) + assert beam_paths_seq0.shape == (2, 26) + + assert scores is not None + assert isinstance(scores, list) + assert len(scores) == 1 + (scores_seq0,) = scores + assert torch.is_tensor(scores_seq0) + assert scores_seq0.shape == (2,) + + assert best_path is not None + assert torch.is_tensor(best_path) + assert best_path.shape == (1, 26) + + +@pytest.fixture() +def prompted_inputs(): + B, T, C = 1, 5, 2 + return ( + torch.tensor([[1, 0, 2, 3, 4]], dtype=torch.long), # prompt + torch.ones(B, T, C, dtype=torch.float), # encoder_hidden_states + torch.ones(B, T, dtype=torch.float), # encoder_input_mask + ) + + +def test_transformer_aed_beam_infer_strips_prompt(prompted_inputs, decoder_nm, nnet, tokenizer): + decoder_input_ids, encoder_hidden_states, encoder_input_mask = prompted_inputs + *_, classifier = nnet + + # Run the actual top-level module used by MultiTask AED model for decoding. + # This module is expected to trim the prompt from the beginning, and eos and pad from the end. + gen = TransformerAEDBeamInfer(decoder_nm, classifier, tokenizer) + ans = gen( + encoder_hidden_states=encoder_hidden_states, + encoder_input_mask=encoder_input_mask, + decoder_input_ids=decoder_input_ids, + ) + best_path = ans[0][0].y_sequence + assert best_path is not None + assert torch.is_tensor(best_path) + + # Now run the underlying beam search generator that doesn't trim anything. + *_, (untrimmed,) = gen.beam_search(*prompted_inputs, return_beam_scores=True) + assert untrimmed is not None + assert torch.is_tensor(untrimmed) + + # Check that the expected trimming has indeed been done. 
+ torch.testing.assert_close( + untrimmed[decoder_input_ids.shape[1] :], best_path + ) # stripped the prompt from the beggining + + +def test_transformer_aed_greedy_infer_strips_prompt(prompted_inputs, decoder_nm, nnet, tokenizer): + decoder_input_ids, encoder_hidden_states, encoder_input_mask = prompted_inputs + decoder_input_ids = torch.tensor([[1, 0, 2, 3, 4]], dtype=torch.long) # prompt + *_, classifier = nnet + + # Run the actual top-level module used by MultiTask AED model for decoding. + # This module is expected to trim the prompt from the beginning, and eos and pad from the end. + gen = TransformerAEDGreedyInfer(decoder_nm, classifier, tokenizer) + ans = gen( + encoder_hidden_states=encoder_hidden_states, + encoder_input_mask=encoder_input_mask, + decoder_input_ids=decoder_input_ids, + ) + best_path = ans[0][0].y_sequence + assert best_path is not None + assert torch.is_tensor(best_path) + + # Now run the underlying beam search generator that doesn't trim anything. + (untrimmed,), _ = gen.greedy_search(*prompted_inputs) + assert untrimmed is not None + assert torch.is_tensor(untrimmed) + + # Check that the expected trimming has indeed been done. + torch.testing.assert_close( + untrimmed[decoder_input_ids.shape[1] :], best_path + ) # stripped the prompt from the beggining diff --git a/tests/collections/asr/test_asr_lhotse_dataset.py b/tests/collections/asr/test_asr_lhotse_dataset.py new file mode 100644 index 000000000000..e7521bfdf7d8 --- /dev/null +++ b/tests/collections/asr/test_asr_lhotse_dataset.py @@ -0,0 +1,53 @@ +import pytest +import torch +from lhotse import CutSet, SupervisionSegment +from lhotse.testing.dummies import DummyManifest + +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer, create_spt_model + + +@pytest.fixture(scope="session") +def tokenizer(tmp_path_factory) -> SentencePieceTokenizer: + tmpdir = tmp_path_factory.mktemp("klingon_tokens") + text_path = tmpdir / "text.txt" + text_path.write_text("\n".join(map(chr, range(ord('a'), ord('z'))))) + model_path, vocab_path = create_spt_model( + text_path, vocab_size=32, sample_size=-1, do_lower_case=False, output_dir=str(tmpdir) + ) + return SentencePieceTokenizer(model_path) + + +def test_lhotse_asr_dataset(tokenizer): + # 3 cuts of duration 1s with audio and a single supervision with text 'irrelevant' + cuts = DummyManifest(CutSet, begin_id=0, end_id=3, with_data=True) + + # cuts[0] is the default case: audio + single untokenized superivision + + # cuts[1]: audio + single pre-tokenized superivision + cuts[1].supervisions[0].tokens = tokenizer.text_to_ids(cuts[1].supervisions[0].text) + + # cuts[2]: audio + two supervisions + cuts[2].supervisions = [ + SupervisionSegment(id="cuts2-sup0", recording_id=cuts[2].recording_id, start=0, duration=0.5, text="first"), + SupervisionSegment(id="cuts2-sup1", recording_id=cuts[2].recording_id, start=0.5, duration=0.5, text="second"), + ] + + dataset = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer) + batch = dataset[cuts] + + assert isinstance(batch, tuple) + assert len(batch) == 4 + assert all(isinstance(t, torch.Tensor) for t in batch) + + audio, audio_lens, tokens, token_lens = batch + + assert audio.shape == (3, 16000) + assert audio_lens.tolist() == [16000] * 3 + + assert tokens.shape == (3, 13) + assert tokens[0].tolist() == [1, 10, 19, 19, 6, 13, 6, 23, 2, 15, 21, 0, 0] + assert tokens[1].tolist() == tokens[0].tolist() + assert 
tokens[2].tolist() == [1, 7, 10, 19, 20, 21, 1, 20, 6, 4, 16, 15, 5] + + assert token_lens.tolist() == [11, 11, 13] diff --git a/tests/collections/asr/test_asr_multitask_model_bpe.py b/tests/collections/asr/test_asr_multitask_model_bpe.py index 4e805c8f34de..3b3268423812 100644 --- a/tests/collections/asr/test_asr_multitask_model_bpe.py +++ b/tests/collections/asr/test_asr_multitask_model_bpe.py @@ -17,18 +17,28 @@ import pytest import torch +from lhotse import CutSet +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig +from nemo.collections.asr.data.audio_to_text_lhotse_prompted import ( + PromptedAudioToTextLhotseDataset, + PromptedAudioToTextMiniBatch, +) from nemo.collections.asr.models.aed_multitask_models import EncDecMultiTaskModel from nemo.collections.asr.parts.submodules import multitask_beam_decoding as beam_decode from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis -from nemo.collections.common.prompts.canary import CanaryPromptFormatter +from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED +from nemo.collections.common.prompts.canary import CanaryPromptFormatter, canary from nemo.collections.common.tokenizers import CanaryTokenizer @pytest.fixture() def asr_model(test_data_dir): - preprocessor = {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': dict({})} + preprocessor = { + 'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', + 'params': {"window_size": 0.02, "window_stride": 0.01, "features": 64}, + } model_defaults = {'asr_enc_hidden': 128, 'lm_enc_hidden': 64, 'lm_dec_hidden': 64} @@ -384,3 +394,113 @@ def test_build_tokenizer(self, asr_model, test_data_dir): for i, j in zip(ids1, ids2): assert i == j + + @pytest.mark.unit + def test_predict_step(self, asr_model, test_data_dir): + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + c = cuts[0] + c.supervisions[0].language = "en" + c.source_lang = "en" + c.target_lang = "en" + c.task = "asr" + c.pnc = "no" + dataset = PromptedAudioToTextLhotseDataset(tokenizer=asr_model.tokenizer, prompt_format_fn=canary) + batch = dataset[cuts] + + # Numpy array test + outputs = asr_model.predict_step(batch) + print(outputs) + assert len(outputs) == 1 + assert isinstance(outputs[0], str) + + @pytest.mark.unit + def test_FrameBatchMultiTaskAED(self, asr_model, test_data_dir): + model = FrameBatchMultiTaskAED(asr_model, batch_size=1) + + audio_file = os.path.join(test_data_dir, "asr", "train", "an4", "wav", "an46-mmap-b.wav") + meta = { + 'audio_filepath': audio_file, + 'duration': 100000, + 'source_lang': 'en', + 'taskname': 'asr', + 'target_lang': 'en', + 'pnc': 'yes', + 'answer': 'nothing', + } + model.read_audio_file(audio_file, delay=0.0, model_stride_in_secs=40.0, meta_data=meta) + outputs = model.transcribe() + assert isinstance(outputs, str) + + +@pytest.mark.unit +def test_prompted_dataset(asr_model): + dataset = PromptedAudioToTextLhotseDataset(tokenizer=asr_model.tokenizer, prompt_format_fn=canary) + + cuts = DummyManifest(CutSet, begin_id=0, end_id=3, with_data=True) + + c = cuts[0] + c.supervisions[0].language = "en" + c.source_lang = "en" + c.target_lang = "en" + c.task = "asr" + c.pnc = "no" + + c = cuts[1] + c.supervisions[0].language = "de" + c.supervisions[0].text = "unerheblich" + c.source_lang = "en" + c.target_lang = "de" + c.taskname = "ast" # note: testing for "taskname" as we support it together with "task" + c.pnc = "yes" + + c = cuts[2] + c.supervisions[0].language = 
"en" + c.supervisions[0].text = "" + c.source_lang = "en" + c.target_lang = "en" + c.task = "asr" + c.pnc = "yes" + + batch = dataset[cuts] + + assert isinstance(batch, PromptedAudioToTextMiniBatch) + assert batch.audio.shape == (3, 16000) + assert batch.audio_lens.tolist() == [16000, 16000, 16000] + + # Test example 0 (transcription) + i = 0 + assert ( + asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|transcribe|><|en|><|nopnc|>' + ) + assert batch.prompt_lens[i] == 5 + assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == 'i##r##r##el##e##v##a##nt' + assert batch.transcript_lens[i] == 8 + assert ( + asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i]) + == '<|startoftranscript|><|en|><|transcribe|><|en|><|nopnc|>i##r##r##el##e##v##a##nt<|endoftext|>' + ) + assert batch.prompted_transcript_lens[i] == 14 + + # Test example 1 (translation) + i = 1 + assert asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|translate|><|de|><|pnc|>' + assert batch.prompt_lens[i] == 5 + assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == 'u##ne##r##h##e##b##l##i##c##h' + assert batch.transcript_lens[i] == 10 + assert ( + asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i]) + == '<|startoftranscript|><|en|><|translate|><|de|><|pnc|>u##ne##r##h##e##b##l##i##c##h<|endoftext|>' + ) + assert batch.prompted_transcript_lens[i] == 16 + + # Test example 2 (no transcript, e.g. noise) + i = 2 + assert asr_model.tokenizer.ids_to_text(batch.prompt[i]) == '<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|>' + assert batch.prompt_lens[i] == 5 + assert asr_model.tokenizer.ids_to_text(batch.transcript[i]) == '' * 10 + assert batch.transcript_lens[i] == 0 + assert ( + asr_model.tokenizer.ids_to_text(batch.prompted_transcript[i]) + == '<|startoftranscript|><|en|><|transcribe|><|en|><|pnc|><|endoftext|>' + '' * 10 + ) + assert batch.prompted_transcript_lens[i] == 6 diff --git a/tests/collections/asr/utils/test_transcription_move_to_device.py b/tests/collections/asr/utils/test_transcription_move_to_device.py new file mode 100644 index 000000000000..6e95e66c5b26 --- /dev/null +++ b/tests/collections/asr/utils/test_transcription_move_to_device.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +import pytest +import torch + +from nemo.collections.asr.parts.mixins.transcription import move_to_device + + +@dataclass +class _Batch: + data: torch.Tensor + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="This test requires GPUs.") +@pytest.mark.parametrize( + "batch", + [ + torch.tensor([0]), + (torch.tensor([0]),), + [torch.tensor([0])], + {"data": torch.tensor([0])}, + _Batch(torch.tensor([0])), + "not a tensor", + ], +) +def test_transcription_move_to_device(batch): + cuda_batch = move_to_device(batch, device="cuda") + assert type(batch) == type(cuda_batch) + if isinstance(batch, _Batch): + assert cuda_batch.data.is_cuda + elif isinstance(batch, dict): + assert cuda_batch["data"].is_cuda + elif isinstance(batch, (list, tuple)): + assert cuda_batch[0].is_cuda + elif isinstance(batch, torch.Tensor): + assert cuda_batch.is_cuda + else: + assert cuda_batch == batch diff --git a/tests/collections/audio/test_audio_losses.py b/tests/collections/audio/test_audio_losses.py index 8c8dbdb47598..cf32a7868451 100644 --- a/tests/collections/audio/test_audio_losses.py +++ b/tests/collections/audio/test_audio_losses.py @@ -17,6 +17,7 @@ import torch from nemo.collections.audio.losses.audio import ( + MAELoss, MSELoss, 
SDRLoss, calculate_mse_batch, @@ -474,7 +475,7 @@ def test_sdr_convolution_invariant(self, num_channels: int, filter_length: int): @pytest.mark.parametrize('num_channels', [1, 4]) @pytest.mark.parametrize('ndim', [3, 4]) def test_mse(self, num_channels: int, ndim: int): - """Test SDR calculation""" + """Test MSE calculation""" batch_size = 8 num_samples = 50 num_features = 123 @@ -536,7 +537,7 @@ def test_mse(self, num_channels: int, ndim: int): @pytest.mark.parametrize('num_channels', [1, 4]) @pytest.mark.parametrize('ndim', [3, 4]) def test_mse_weighted(self, num_channels: int, ndim: int): - """Test SDR calculation with weighting for channels""" + """Test MSE calculation with weighting for channels""" batch_size = 8 num_samples = 50 num_features = 123 @@ -595,7 +596,7 @@ def test_mse_weighted(self, num_channels: int, ndim: int): @pytest.mark.parametrize('num_channels', [1, 4]) @pytest.mark.parametrize('ndim', [3, 4]) def test_mse_input_length(self, num_channels: int, ndim: int): - """Test SDR calculation with input length.""" + """Test MSE calculation with input length.""" batch_size = 8 max_num_samples = 50 num_features = 123 @@ -650,3 +651,178 @@ def test_mse_input_length(self, num_channels: int, ndim: int): assert np.allclose( uut_mse_loss.cpu().detach().numpy(), golden_mse, atol=atol ), f'MSELoss not matching for example {n}' + + @pytest.mark.unit + @pytest.mark.parametrize('num_channels', [1, 4]) + @pytest.mark.parametrize('ndim', [3, 4]) + def test_mae(self, num_channels: int, ndim: int): + """Test MAE calculation""" + batch_size = 8 + num_samples = 50 + num_features = 123 + num_batches = 10 + random_seed = 42 + atol = 1e-6 + + signal_shape = ( + (batch_size, num_channels, num_features, num_samples) + if ndim == 4 + else (batch_size, num_channels, num_samples) + ) + + reduction_dim = (-2, -1) if ndim == 4 else -1 + + mae_loss = MAELoss(ndim=ndim) + + _rng = np.random.default_rng(seed=random_seed) + + for n in range(num_batches): + + # Generate random signal + target = _rng.normal(size=signal_shape) + # Random noise + scaling + noise = _rng.uniform(low=0.01, high=1) * _rng.normal(size=signal_shape) + # Estimate + estimate = target + noise + + # DC bias for both + target += _rng.uniform(low=-1, high=1) + estimate += _rng.uniform(low=-1, high=1) + + # Tensors for testing the loss + tensor_estimate = torch.tensor(estimate) + tensor_target = torch.tensor(target) + + # Reference MSE + golden_mae = np.zeros((batch_size, num_channels)) + for b in range(batch_size): + for m in range(num_channels): + err = estimate[b, m, :] - target[b, m, :] + golden_mae[b, m] = np.mean(np.abs(err), axis=reduction_dim) + + # Calculate MSE loss + uut_mae_loss = mae_loss(estimate=tensor_estimate, target=tensor_target) + + # Compare + assert np.allclose( + uut_mae_loss.cpu().detach().numpy(), golden_mae.mean(), atol=atol + ), f'MAE not matching for example {n}' + + @pytest.mark.unit + @pytest.mark.parametrize('num_channels', [1, 4]) + @pytest.mark.parametrize('ndim', [3, 4]) + def test_mae_weighted(self, num_channels: int, ndim: int): + """Test MAE calculation with weighting for channels""" + batch_size = 8 + num_samples = 50 + num_features = 123 + num_batches = 10 + random_seed = 42 + atol = 1e-6 + + signal_shape = ( + (batch_size, num_channels, num_features, num_samples) + if ndim == 4 + else (batch_size, num_channels, num_samples) + ) + + reduction_dim = (-2, -1) if ndim == 4 else -1 + + _rng = np.random.default_rng(seed=random_seed) + + channel_weight = _rng.uniform(low=0.01, high=1.0, size=num_channels) + 
channel_weight = channel_weight / np.sum(channel_weight) + mae_loss = MAELoss(weight=channel_weight, ndim=ndim) + + for n in range(num_batches): + + # Generate random signal + target = _rng.normal(size=signal_shape) + # Random noise + scaling + noise = _rng.uniform(low=0.001, high=10) * _rng.normal(size=target.shape) + # Estimate + estimate = target + noise + + # Tensors for testing the loss + tensor_estimate = torch.tensor(estimate) + tensor_target = torch.tensor(target) + + # Reference MAE + golden_mae = 0 + for b in range(batch_size): + mae = [ + np.mean(np.abs(estimate[b, m, :] - target[b, m, :]), axis=reduction_dim) + for m in range(num_channels) + ] + # weighted sum + mae = np.sum(np.array(mae) * channel_weight) + golden_mae += mae + golden_mae /= batch_size # average over batch + + # Calculate MAE loss + uut_mae_loss = mae_loss(estimate=tensor_estimate, target=tensor_target) + + # Compare + assert np.allclose( + uut_mae_loss.cpu().detach().numpy(), golden_mae, atol=atol + ), f'MAELoss not matching for example {n}' + + @pytest.mark.unit + @pytest.mark.parametrize('num_channels', [1, 4]) + @pytest.mark.parametrize('ndim', [3, 4]) + def test_mae_input_length(self, num_channels: int, ndim: int): + """Test MAE calculation with input length.""" + batch_size = 8 + max_num_samples = 50 + num_features = 123 + num_batches = 10 + random_seed = 42 + atol = 1e-6 + + signal_shape = ( + (batch_size, num_channels, num_features, max_num_samples) + if ndim == 4 + else (batch_size, num_channels, max_num_samples) + ) + + reduction_dim = (-2, -1) if ndim == 4 else -1 + + _rng = np.random.default_rng(seed=random_seed) + + mae_loss = MAELoss(ndim=ndim) + + for n in range(num_batches): + + # Generate random signal + target = _rng.normal(size=signal_shape) + # Random noise + scaling + noise = _rng.uniform(low=0.001, high=10) * _rng.normal(size=target.shape) + # Estimate + estimate = target + noise + + # Limit calculation to random input_length samples + input_length = _rng.integers(low=1, high=max_num_samples, size=batch_size) + + # Tensors for testing the loss + tensor_estimate = torch.tensor(estimate) + tensor_target = torch.tensor(target) + tensor_input_length = torch.tensor(input_length) + + # Reference MSE + golden_mae = 0 + for b, b_len in enumerate(input_length): + mae = [ + np.mean(np.abs(estimate[b, m, ..., :b_len] - target[b, m, ..., :b_len]), axis=reduction_dim) + for m in range(num_channels) + ] + mae = np.mean(np.array(mae)) + golden_mae += mae + golden_mae /= batch_size # average over batch + + # Calculate MSE + uut_mae_loss = mae_loss(estimate=tensor_estimate, target=tensor_target, input_length=tensor_input_length) + + # Compare + assert np.allclose( + uut_mae_loss.cpu().detach().numpy(), golden_mae, atol=atol + ), f'MAELoss not matching for example {n}' diff --git a/tests/collections/common/test_2d_bucketing_constraint.py b/tests/collections/common/test_2d_bucketing_constraint.py new file mode 100644 index 000000000000..ba67d2e1fabb --- /dev/null +++ b/tests/collections/common/test_2d_bucketing_constraint.py @@ -0,0 +1,89 @@ +import numpy as np +import pytest +from lhotse import CutSet, Seconds, SupervisionSegment +from lhotse.dataset import DynamicBucketingSampler +from lhotse.testing.dummies import DummyManifest, dummy_cut +from nemo.collections.common.data.lhotse.dataloader import FixedBucketBatchSizeConstraint2D + + +@pytest.fixture +def cuts(): + def _cut(id_: int, duration: Seconds, num_tokens: int): + supervision = SupervisionSegment(f"blah-{id_}", f"blah-{id_}", 0.0, duration, 
text="a" * num_tokens) + supervision.tokens = np.zeros((num_tokens,), dtype=np.int32) + return dummy_cut(id_, duration=duration, supervisions=[supervision]) + + return CutSet( + [_cut(i, duration=2.0, num_tokens=4) for i in range(20)] + + [_cut(i, duration=2.0, num_tokens=8) for i in range(20)] + + [_cut(i, duration=2.0, num_tokens=12) for i in range(20)] + + [_cut(i, duration=8.0, num_tokens=8) for i in range(20)] + + [_cut(i, duration=8.0, num_tokens=12) for i in range(20)] + + [_cut(i, duration=8.0, num_tokens=16) for i in range(20)] + + [_cut(i, duration=14.0, num_tokens=12) for i in range(20)] + + [_cut(i, duration=14.0, num_tokens=16) for i in range(20)] + + [_cut(i, duration=14.0, num_tokens=20) for i in range(20)] + ) + + +def test_2d_bucketing_expected_bucket_allocation(cuts): + duration_bins = [ + (5.0, 5), + (5.0, 11), + (5.0, 15), + (7.0, 10), + (7.0, 13), + (7.0, 20), + (8.0, 15), + (8.0, 17), + (8.0, 25), + (15.0, 20), + (15.0, 29), + (15.0, 30), + ] + batch_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + sampler = DynamicBucketingSampler( + cuts.repeat(), + shuffle=True, + duration_bins=duration_bins, + constraint=FixedBucketBatchSizeConstraint2D( + max_seq_len_buckets=duration_bins, + batch_sizes=batch_sizes, + ), + buffer_size=1000, + seed=0, + ) + + for batch_idx, batch in enumerate(sampler): + # Run for 100 batches and check invariants on each. + if batch_idx == 100: + break + # Note: batch_sizes are indexes into duration_bins when subtracting 1. + # This way we can determine which bucket the data came from in this test. + bin_index = len(batch) - 1 + max_duration, max_num_tokens = duration_bins[bin_index] + for cut in batch: + # First, check that the sampled examples are indeed below the max duration/num_tokens for its bucket. + assert cut.duration <= max_duration + assert cut.supervisions[0].tokens.shape[0] <= max_num_tokens + # Then, find the previous compatible bucket for each of training example's dimensions, + # and verify that it was not possible to assign the example to that smaller bucket. + # We should skip this for bucket_idx==0 (no previous buckets available). + # Note: max will be an empty sequence in some cases, e.g. when it's the first bucket + # with a given max_duration, it has the smallest max_num_tokens, leaving previous candidates list + # for max_num_tokens empty. 
+ if bin_index > 0: + try: + prev_max_duration = max(dur for dur, tok in duration_bins[:bin_index] if dur < max_duration) + assert cut.duration > prev_max_duration + except ValueError as e: + if "max() arg is an empty sequence" not in str(e): + raise + try: + prev_max_num_tokens = max( + tok for dur, tok in duration_bins[:bin_index] if dur == max_duration and tok < max_num_tokens + ) + assert cut.supervisions[0].tokens.shape[0] > prev_max_num_tokens + except ValueError as e: + if "max() arg is an empty sequence" not in str(e): + raise diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index c481413a3a37..97fdca434843 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -21,10 +21,11 @@ import numpy as np import pytest import torch -from lhotse import CutSet, MonoCut, NumpyFilesWriter, Recording +from lhotse import CutSet, MonoCut, NumpyFilesWriter, Recording, SupervisionSegment, compute_num_samples from lhotse.audio import AudioLoadingError from lhotse.cut import Cut, MixedCut from lhotse.cut.text import TextPairExample +from lhotse.testing.dummies import dummy_recording from omegaconf import OmegaConf from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper @@ -199,6 +200,7 @@ def test_dataloader_from_lhotse_cuts(cutset_path: Path): "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -354,6 +356,7 @@ def test_dataloader_from_lhotse_shar_cuts(cutset_shar_path: Path): "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -400,6 +403,7 @@ def test_dataloader_from_nemo_manifest(nemo_manifest_path: Path): "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -477,6 +481,7 @@ def test_dataloader_from_tarred_nemo_manifest(nemo_tarred_manifest_path: tuple[s "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -524,6 +529,7 @@ def test_dataloader_from_tarred_nemo_manifest_weighted_combination(nemo_tarred_m "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -556,6 +562,7 @@ def test_dataloader_from_tarred_nemo_manifest_multi(nemo_tarred_manifest_path_mu "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -603,6 +610,7 @@ def test_dataloader_from_tarred_nemo_manifest_multi_max_open_streams(nemo_tarred "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "max_open_streams": 1, "drop_last": False, @@ -693,6 +701,7 @@ def test_dataloader_from_lhotse_shar_cuts_combine_datasets_unweighted( "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -745,6 +754,7 @@ def test_dataloader_from_lhotse_shar_cuts_combine_datasets_weighted( "num_workers": 0, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": 
False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -1577,6 +1587,69 @@ def test_dataloader_with_synth_rir(cutset_path: Path): assert isinstance(tfnm, ReverbWithImpulseResponse) +def test_dataloader_bucket_batch_size(nemo_tarred_manifest_path_multi: tuple[str, str]): + json_mft, tar_mft = nemo_tarred_manifest_path_multi + config = OmegaConf.create( + { + "manifest_filepath": json_mft, + "tarred_audio_filepaths": tar_mft, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + # lhotse specific + "use_bucketing": True, + "concurrent_bucketing": False, + # Note: all input cuts belong to the first bucket so the batch size will always be 2. + "bucket_duration_bins": [2.0, 4.0], + "bucket_batch_size": [2, 1], + "drop_last": False, + "shuffle_buffer_size": 10, + "bucket_buffer_size": 100, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + for b in islice(dl, 10): + assert len(b) == 2 + + +def test_dataloader_2d_bucketing(nemo_tarred_manifest_path_multi: tuple[str, str], en_es_tokenizer): + json_mft, tar_mft = nemo_tarred_manifest_path_multi + config = OmegaConf.create( + { + "manifest_filepath": json_mft, + "tarred_audio_filepaths": tar_mft, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + # lhotse specific + "use_bucketing": True, + "concurrent_bucketing": False, + # Here each bin has the format: [audio_duration, token_sequence_length] + "bucket_duration_bins": [[0.5, 1], [0.5, 2], [2.0, 5], [2.0, 15], [4.0, 10], [4.0, 20]], + "bucket_batch_size": [7, 6, 5, 4, 3, 2], + "drop_last": False, + "shuffle_buffer_size": 10, + "bucket_buffer_size": 100, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=Identity(), tokenizer=en_es_tokenizer + ) + + # All of our data have duration 1.0 and 10 tokens so they will fall to bin[3] with batch_size=4 + for b in islice(dl, 10): + assert len(b) == 4 + + @pytest.fixture(scope="session") def questions_path(tmp_path_factory) -> Path: """A text file with 10 lines containing question values""" @@ -1614,6 +1687,7 @@ def test_dataloader_from_nemo_nontarred_manifest_with_extra_questions_field_iter ) dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + b = next(iter(dl)) c = b[0] assert isinstance(c, MonoCut) @@ -1718,3 +1792,116 @@ def test_dataloader_from_nemo_manifest_with_extra_questions_field_sample( assert isinstance(c, MonoCut) assert hasattr(c, "question") assert c.question == "some question number 8" + + +@pytest.fixture(scope="session") +def nemo_tarred_manifest_path_with_offset(tmp_path_factory) -> Tuple[str, str]: + """10 utterances of length 1s as a NeMo tarred manifest.""" + from lhotse.serialization import SequentialJsonlWriter + from lhotse.shar.writers import TarWriter + + root = tmp_path_factory.mktemp("nemo_tar_offset") + root.mkdir(exist_ok=True) + recording = dummy_recording(0, duration=10.0, with_data=True) + + with ( + TarWriter(f"{root}/audios_0.tar", shard_size=None) as tar_writer, + SequentialJsonlWriter(root / "tarred_audio_filepaths.jsonl") as mft_writer, + ): + + def audio_path(n: int = None): + return recording.id + ("" if n is None else f"-sub{n}") + ".wav" + + tar_writer.write(audio_path(), BytesIO(recording.sources[0].source)) + mft_writer.write( + { # segment 0-3s + "audio_filepath": audio_path(), + 
"offset": 0.0, + "duration": 3.0, + "text": "irrelevant", + "lang": "en", + "shard_id": 0, + } + ) + mft_writer.write( + { # segment 4-9s + "audio_filepath": audio_path(1), + "offset": 4.0, + "duration": 5.0, + "text": "irrelevant-2", + "lang": "en", + "shard_id": 0, + } + ) + mft_writer.write( + { # full recording - for reference + "audio_filepath": audio_path(2), + "offset": 0.0, + "duration": 10.0, + "text": "irrelevant irrelevant-2", + "lang": "en", + "shard_id": 0, + } + ) + return mft_writer.path, tar_writer.output_paths[0] + + +def test_dataloader_from_tarred_nemo_manifest_with_offset(nemo_tarred_manifest_path_with_offset: tuple[str, str]): + json_mft, tar_mft = nemo_tarred_manifest_path_with_offset + config = OmegaConf.create( + { + "manifest_filepath": json_mft, + "tarred_audio_filepaths": tar_mft, + "sample_rate": 16000, + "shuffle": False, + "num_workers": 0, + "batch_size": 3, + "seed": 0, + "shard_seed": 0, + "force_finite": True, + } + ) + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Loads all three examples in a single mini-batch (that's why batch_size=3). + batches = [b for b in dl] + assert len(batches) == 1 + (batch,) = batches + assert len(batch) == 3 + + # Validate example containing full 10s recording. + full_cut = batch[1] + assert full_cut.start == 0.0 + assert full_cut.duration == 10.0 + assert full_cut.supervisions[0].text == "irrelevant irrelevant-2" + assert full_cut.supervisions[0].language == "en" + full_audio = full_cut.load_audio() + assert full_audio.shape[1] == full_cut.num_samples == 160000 # 10s * 16kHz + + # Validate segment 0-3s. + cut = batch[2] + assert cut.start == 0.0 + assert cut.duration == 3.0 + assert cut.supervisions[0].text == "irrelevant" + assert cut.supervisions[0].language == "en" + audio = cut.load_audio() + assert audio.shape[1] == cut.num_samples + # Check the audio for the segment is identical to a slice of the full audio. + np.testing.assert_equal(audio, full_audio[:, : compute_num_samples(cut.duration, cut.sampling_rate)]) + + # Validate segment 4-9s. + # Note: LazyNeMoTarredIterator removes the offset information, as it creates a new recording + # that's a "subset" of the original recording as a memory saving optimization. + # Hence, we will not see cut.start == 4.0. + cut = batch[0] + assert cut.start == 0.0 + assert cut.duration == 5.0 + assert cut.supervisions[0].text == "irrelevant-2" + assert cut.supervisions[0].language == "en" + audio = cut.load_audio() + assert audio.shape[1] == cut.num_samples + # Check the audio for the segment is identical to a slice of the full audio. 
+ np.testing.assert_equal( + audio, full_audio[:, compute_num_samples(4.0, cut.sampling_rate) : compute_num_samples(9.0, cut.sampling_rate)] + ) diff --git a/tests/collections/common/test_lhotse_multirank_rng.py b/tests/collections/common/test_lhotse_multirank_rng.py index 7fa828900e27..d7b883625aa7 100644 --- a/tests/collections/common/test_lhotse_multirank_rng.py +++ b/tests/collections/common/test_lhotse_multirank_rng.py @@ -38,7 +38,11 @@ def nemo_manifest_path(cutset_path: Path): nemo = [] for idx, c in enumerate(CutSet.from_file(cutset_path)): nemo.append( - {"audio_filepath": c.recording.sources[0].source, "text": f"irrelevant-{idx}", "duration": c.duration,} + { + "audio_filepath": c.recording.sources[0].source, + "text": f"irrelevant-{idx}", + "duration": c.duration, + } ) p = cutset_path.parent / "nemo_manifest.json" save_to_jsonl(nemo, p) @@ -50,9 +54,10 @@ def nemo_tarred_manifest_path(nemo_manifest_path: Path) -> tuple[str, str]: """5 shards, each with 2 utterances.""" root = nemo_manifest_path.parent / "nemo_tar" root.mkdir(exist_ok=True) - with TarWriter(f"{root}/audios_%01d.tar", shard_size=2) as tar_writer, JsonlShardWriter( - f"{root}/manifest_%01d.jsonl", shard_size=2 - ) as mft_writer: + with ( + TarWriter(f"{root}/audios_%01d.tar", shard_size=2) as tar_writer, + JsonlShardWriter(f"{root}/manifest_%01d.jsonl", shard_size=2) as mft_writer, + ): for idx, d in enumerate(load_jsonl(nemo_manifest_path)): p = d["audio_filepath"] name = Path(p).name @@ -74,6 +79,7 @@ def test_dataloader_multiple_ranks_deterministic_rng(nemo_tarred_manifest_path: "num_workers": 1, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -89,12 +95,22 @@ def test_dataloader_multiple_ranks_deterministic_rng(nemo_tarred_manifest_path: dp0 = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=2, dataset=_Identity()) # Data parallel, rank 0 copy (is the iteration deterministic? -> yes) - dp0_cpy = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=2, dataset=_Identity(),) + dp0_cpy = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=2, + dataset=_Identity(), + ) # Data parallel, rank 0, incremented seed (paranoia mode: does the iteration order change with the seed? -> yes) config2 = config.copy() config2["seed"] = config2["seed"] + 1 - dp0_incrseed = get_lhotse_dataloader_from_config(config=config2, global_rank=0, world_size=2, dataset=_Identity(),) + dp0_incrseed = get_lhotse_dataloader_from_config( + config=config2, + global_rank=0, + world_size=2, + dataset=_Identity(), + ) # Data parallel, rank 1 (is data different on each DP rank? -> yes) dp1 = get_lhotse_dataloader_from_config(config=config, global_rank=1, world_size=2, dataset=_Identity()) @@ -127,6 +143,7 @@ def test_dataloader_multiple_ranks_trng(nemo_tarred_manifest_path: tuple[str, st "num_workers": 1, # lhotse specific "use_bucketing": True, + "concurrent_bucketing": False, "num_buckets": 2, "drop_last": False, "batch_duration": 4.0, # seconds @@ -142,12 +159,22 @@ def test_dataloader_multiple_ranks_trng(nemo_tarred_manifest_path: tuple[str, st dp0 = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=2, dataset=_Identity()) # Data parallel, rank 0 copy (is the iteration deterministic? 
-> no, trng) - dp0_cpy = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=2, dataset=_Identity(),) + dp0_cpy = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=2, + dataset=_Identity(), + ) # Data parallel, rank 0, incremented seed (paranoia mode: does the iteration order change with the seed? -> yes) config2 = config.copy() config2["seed"] = config2["seed"] + 1 - dp0_incrseed = get_lhotse_dataloader_from_config(config=config2, global_rank=0, world_size=2, dataset=_Identity(),) + dp0_incrseed = get_lhotse_dataloader_from_config( + config=config2, + global_rank=0, + world_size=2, + dataset=_Identity(), + ) # Data parallel, rank 1 (is data different on each DP rank? -> yes) dp1 = get_lhotse_dataloader_from_config(config=config, global_rank=1, world_size=2, dataset=_Identity()) diff --git a/tests/collections/common/test_perf_metrics.py b/tests/collections/common/test_perf_metrics.py new file mode 100644 index 000000000000..0708bca0c24d --- /dev/null +++ b/tests/collections/common/test_perf_metrics.py @@ -0,0 +1,128 @@ +import pytest +import yaml + +from nemo.collections.common.metrics.perf_metrics import FLOPsMeasurementCallback + +LLAMA2_CFG_STR = """ + run: + name: train_llama2_7b_tp1_pp1_FP8_1node_15steps + trainer: + num_nodes: 1 + devices: 8 + precision: bf16 + exp_manager: + explicit_log_dir: "results/logs" + model: + micro_batch_size: 1 + global_batch_size: 128 + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 +""" + +NEMOTRON_CFG_STR = """ + run: + name: train_nemotron_8b_tp2_pp1_FP8_8node_20steps + trainer: + num_nodes: 8 + devices: 8 + precision: bf16 + exp_manager: + explicit_log_dir: null + model: + micro_batch_size: 4 + global_batch_size: 256 + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 16384 + num_attention_heads: 32 + fp8: true +""" + +UNSUPPORTED_MODEL_CFG_STR = """ + run: + name: unsupported_model + trainer: + num_nodes: 1 + devices: 8 + precision: bf64 + exp_manager: + explicit_log_dir: null + model: + micro_batch_size: 1 + global_batch_size: 1 + encoder_seq_length: 1 + max_position_embeddings: 1 + num_layers: 1 + hidden_size: 1 + ffn_hidden_size: 1 + num_attention_heads: 1 +""" + +NULL_MODEL_CFG_STR = """ + run: + name: null +""" + + +@pytest.fixture +def model_config(cfg): + return yaml.safe_load(cfg) + + +@pytest.mark.unit +@pytest.mark.parametrize( + "cfg, model_name, train_step_time, expected_value", + [ + (LLAMA2_CFG_STR, None, 8, 377.53), + (LLAMA2_CFG_STR, "llama2", 8, 377.53), + (LLAMA2_CFG_STR, None, [8, 8, 8, 8], 377.53), + (NEMOTRON_CFG_STR, None, 1.31, 642.73), + ( + UNSUPPORTED_MODEL_CFG_STR, + None, + 1, # model_name in config is unsupported + "Failed to extract valid model name from or missing FLOPs calculations for unsupported_model", + ), + ( + UNSUPPORTED_MODEL_CFG_STR, + "unknown_model", + 1, # overrided model name is unsupported + "Failed to extract valid model name from or missing FLOPs calculations for unknown_model", + ), + ( + NULL_MODEL_CFG_STR, + None, + 1, # both- config and overrided model name are None + "Failed to extract valid model name from or missing FLOPs calculations for None", + ), + ], +) +def test_eval_tflops_per_sec_per_gpu(model_config, model_name, train_step_time, expected_value): + if isinstance(expected_value, (int, float)): + flops_callback = FLOPsMeasurementCallback(model_config, model_name=model_name) + 
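+        # (Editorial annotation: per the parametrization above, train_step_time may be a
+        #  single step time in seconds or a list of per-step times.)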
tflops_per_sec_per_gpu = flops_callback.eval_tflops_per_sec_per_gpu(train_step_time) + assert tflops_per_sec_per_gpu == pytest.approx(expected_value, rel=1e-4) + + if model_name is None: + # extract valid model name with delimiter='-' + model_config["run"]["name"] = model_config["run"]["name"].replace("_", ".") + flops_callback = FLOPsMeasurementCallback(model_config, model_name=model_name) + tflops_per_sec_per_gpu = flops_callback.eval_tflops_per_sec_per_gpu(train_step_time) + assert tflops_per_sec_per_gpu == pytest.approx(expected_value, rel=1e-4) + + # # extract valid model name from a string + model_config["run"]["name"] = model_config["run"]["name"].replace("_", "") + flops_callback = FLOPsMeasurementCallback(model_config, model_name=model_name) + tflops_per_sec_per_gpu = flops_callback.eval_tflops_per_sec_per_gpu(train_step_time) + assert tflops_per_sec_per_gpu == pytest.approx(expected_value, rel=1e-4) + + if isinstance(expected_value, str): + flops_callback = FLOPsMeasurementCallback(model_config, model_name=model_name) + with pytest.raises(KeyError, match=expected_value): + _ = flops_callback.eval_tflops_per_sec_per_gpu(train_step_time) diff --git a/tests/collections/common/test_utils.py b/tests/collections/common/test_utils.py index 7d7a995c2b3a..9011c4a0d1d9 100644 --- a/tests/collections/common/test_utils.py +++ b/tests/collections/common/test_utils.py @@ -19,16 +19,16 @@ import numpy as np import pytest +import torch from nemo.collections.common.parts.preprocessing.manifest import get_full_path, is_tarred_dataset -from nemo.collections.common.parts.utils import flatten +from nemo.collections.common.parts.utils import flatten, mask_sequence_tensor class TestListUtils: @pytest.mark.unit def test_flatten(self): - """Test flattening an iterable with different values: str, bool, int, float, complex. - """ + """Test flattening an iterable with different values: str, bool, int, float, complex.""" test_cases = [] test_cases.append({'input': ['aa', 'bb', 'cc'], 'golden': ['aa', 'bb', 'cc']}) test_cases.append({'input': ['aa', ['bb', 'cc']], 'golden': ['aa', 'bb', 'cc']}) @@ -40,11 +40,43 @@ def test_flatten(self): assert flatten(test_case['input']) == test_case['golden'], f'Test case {n} failed!' 
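# ---------------------------------------------------------------------------
# Editor's sketch (not part of this diff): a back-of-envelope check of the
# 377.53 TFLOPs/sec/GPU value expected by the perf-metrics tests above for the
# Llama-2 7B config (8 GPUs, global batch 128, sequence length 4096, 8 s step).
# It assumes a vocab size of 32000 and the common "6 * params per token" rule
# plus the attention score/context matmuls; FLOPsMeasurementCallback's exact
# formula may differ slightly, so expect agreement only to within a few percent.


def approx_tflops_per_sec_per_gpu(
    num_layers: int = 32,
    hidden: int = 4096,
    ffn_hidden: int = 11008,
    seq_len: int = 4096,
    global_batch_size: int = 128,
    vocab_size: int = 32000,  # assumed; not part of the test config
    num_gpus: int = 8,
    train_step_time_s: float = 8.0,
) -> float:
    # Parameters: attention (4*h^2) + SwiGLU MLP (3*h*ffn) per layer, plus embeddings.
    params = num_layers * (4 * hidden**2 + 3 * hidden * ffn_hidden) + 2 * vocab_size * hidden
    tokens_per_step = global_batch_size * seq_len
    dense_flops = 6 * params * tokens_per_step  # fwd + bwd matmul FLOPs
    # Sequence-length-dependent attention matmuls (QK^T and AV), fwd + bwd.
    attn_flops = 12 * num_layers * hidden * seq_len * tokens_per_step
    return (dense_flops + attn_flops) / train_step_time_s / num_gpus / 1e12


if __name__ == "__main__":
    print(f"{approx_tflops_per_sec_per_gpu():.1f} TFLOPs/sec/GPU")  # ~384 vs. the expected 377.53
# ---------------------------------------------------------------------------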
+class TestMaskSequenceTensor: + @pytest.mark.unit + @pytest.mark.parametrize('ndim', [2, 3, 4, 5]) + def test_mask_sequence_tensor(self, ndim: int): + """Test masking a tensor based on the provided length.""" + num_examples = 20 + max_batch_size = 10 + max_max_len = 30 + + for n in range(num_examples): + batch_size = np.random.randint(low=1, high=max_batch_size) + max_len = np.random.randint(low=1, high=max_max_len) + + if ndim > 2: + tensor_shape = (batch_size,) + tuple(torch.randint(1, 30, (ndim - 2,))) + (max_len,) + else: + tensor_shape = (batch_size, max_len) + + tensor = torch.randn(tensor_shape) + lengths = torch.randint(low=1, high=max_len + 1, size=(batch_size,)) + + if ndim <= 4: + masked_tensor = mask_sequence_tensor(tensor=tensor, lengths=lengths) + + for b, l in enumerate(lengths): + assert torch.equal(masked_tensor[b, ..., :l], tensor[b, ..., :l]), f'Failed for example {n}' + assert torch.all(masked_tensor[b, ..., l:] == 0.0), f'Failed for example {n}' + else: + # Currently, supporting only up to 4D tensors + with pytest.raises(ValueError): + mask_sequence_tensor(tensor=tensor, lengths=lengths) + + class TestPreprocessingUtils: @pytest.mark.unit def test_get_full_path_local(self, tmpdir): - """Test with local paths - """ + """Test with local paths""" # Create a few files num_files = 10 @@ -150,8 +182,7 @@ def create_files(paths): @pytest.mark.unit def test_get_full_path_ais(self, tmpdir): - """Test with paths on AIStore. - """ + """Test with paths on AIStore.""" # Create a few files num_files = 10 @@ -234,8 +265,7 @@ def test_get_full_path_audio_file_len_limit(self): @pytest.mark.unit def test_get_full_path_invalid_type(self): - """Make sure exceptions are raised when audio_file is not a string or a list of strings. - """ + """Make sure exceptions are raised when audio_file is not a string or a list of strings.""" with pytest.raises(ValueError, match="Unexpected audio_file type"): get_full_path(1) diff --git a/tests/collections/nlp/test_rampup_batch_size.py b/tests/collections/nlp/test_rampup_batch_size.py index fea61571e70f..c7efb5f57f4c 100644 --- a/tests/collections/nlp/test_rampup_batch_size.py +++ b/tests/collections/nlp/test_rampup_batch_size.py @@ -16,12 +16,19 @@ import pytest import torch -from megatron.core.num_microbatches_calculator import get_num_microbatches from omegaconf import DictConfig from pytorch_lightning import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging + +try: + from megatron.core.num_microbatches_calculator import get_num_microbatches, update_num_microbatches + +except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches, update_num_microbatches DEVICE_CAPABILITY = None if torch.cuda.is_available(): @@ -29,7 +36,12 @@ def reset_microbatch_calculator(): - import megatron.core.num_microbatches_calculator as mb + try: + import megatron.core.num_microbatches_calculator as mb + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + import apex.transformer.pipeline_parallel.utils as mb mb._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None @@ -164,10 +176,6 @@ def test_rampup_bs(self, gpt_model, rampup_batch_size): @pytest.mark.unit def test_rampup_bs_schedule(self, gpt_model, 
trainer_cfg, rampup_batch_size_schedule): - - from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR - - num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR micro_batch_size = gpt_model.cfg.micro_batch_size num_devices = trainer_cfg["devices"] num_nodes = trainer_cfg["num_nodes"] @@ -179,7 +187,7 @@ def test_rampup_bs_schedule(self, gpt_model, trainer_cfg, rampup_batch_size_sche step += 1 current_global_batch_size = get_num_microbatches() * micro_batch_size * num_devices * num_nodes consumed_samples += current_global_batch_size - num_microbatch_calculator.update(consumed_samples=consumed_samples, consistency_check=True) + update_num_microbatches(consumed_samples=consumed_samples, consistency_check=True) if current_global_batch_size not in global_batch_size_schedule: global_batch_size_schedule.append(current_global_batch_size) diff --git a/tests/collections/tts/losses/test_audio_codec_loss.py b/tests/collections/tts/losses/test_audio_codec_loss.py index 60ea8d293655..dcbeb4bf65a9 100644 --- a/tests/collections/tts/losses/test_audio_codec_loss.py +++ b/tests/collections/tts/losses/test_audio_codec_loss.py @@ -16,8 +16,8 @@ import torch from torchmetrics import ScaleInvariantSignalDistortionRatio +from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.collections.tts.losses.audio_codec_loss import MaskedMAELoss, MaskedMSELoss, SISDRLoss -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor class TestAudioCodecLoss: diff --git a/tests/collections/tts/modules/test_audio_codec_modules.py b/tests/collections/tts/modules/test_audio_codec_modules.py index 28de02b6afb4..e1429df4fb70 100644 --- a/tests/collections/tts/modules/test_audio_codec_modules.py +++ b/tests/collections/tts/modules/test_audio_codec_modules.py @@ -15,6 +15,7 @@ import pytest import torch +from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.collections.tts.modules.audio_codec_modules import ( CodecActivation, Conv1dNorm, @@ -29,7 +30,6 @@ get_down_sample_padding, ) from nemo.collections.tts.modules.encodec_modules import GroupResidualVectorQuantizer, ResidualVectorQuantizer -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor class TestAudioCodecModules: @@ -205,8 +205,7 @@ def test_multiband_mel_encoder(self): class TestResidualVectorQuantizer: def setup_class(self): - """Setup common members - """ + """Setup common members""" self.batch_size = 2 self.max_len = 20 self.codebook_size = 256 @@ -315,8 +314,7 @@ def test_snake(self): class TestFiniteScalarQuantizer: def setup_class(self): - """Setup common members - """ + """Setup common members""" self.batch_size = 2 self.max_len = 20 self.num_examples = 10 diff --git a/tests/collections/tts/test_spectrogram_enhancer.py b/tests/collections/tts/test_spectrogram_enhancer.py index a3b0f2625060..b7e754d554ca 100644 --- a/tests/collections/tts/test_spectrogram_enhancer.py +++ b/tests/collections/tts/test_spectrogram_enhancer.py @@ -17,8 +17,8 @@ from einops import rearrange from omegaconf import DictConfig +from nemo.collections.common.parts.utils import mask_sequence_tensor from nemo.collections.tts.models import SpectrogramEnhancerModel -from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor @pytest.fixture diff --git a/tests/core/test_straggler_det.py b/tests/core/test_straggler_det.py index 53ba37ac28bb..ee5222854889 100644 --- a/tests/core/test_straggler_det.py +++ b/tests/core/test_straggler_det.py @@ -56,12 +56,12 @@ def 
on_train_start(self): rank = torch.distributed.get_rank() def train_dataloader(self): - dataset = OnesDataset(128) - return torch.utils.data.DataLoader(dataset, batch_size=2, num_workers=8) + dataset = OnesDataset(1024 * 1024) + return torch.utils.data.DataLoader(dataset, batch_size=2, num_workers=2) def val_dataloader(self): - dataset = OnesDataset(128) - return torch.utils.data.DataLoader(dataset, batch_size=2, num_workers=8) + dataset = OnesDataset(128 * 1024) + return torch.utils.data.DataLoader(dataset, batch_size=2, num_workers=2) def forward(self, batch): output = self.l1(batch) diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py index 5193fe951138..88efb2374555 100644 --- a/tests/deploy/nemo_deploy.py +++ b/tests/deploy/nemo_deploy.py @@ -164,6 +164,7 @@ def run_trt_llm_inference( use_embedding_sharing=False, max_input_len=128, max_output_len=128, + max_num_tokens=None, ptuning=False, p_tuning_checkpoint=None, lora=False, @@ -249,7 +250,7 @@ def run_trt_llm_inference( max_prompt_embedding_table_size=max_prompt_embedding_table_size, use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, - max_num_tokens=int(max_input_len * max_batch_size * 0.2), + max_num_tokens=max_num_tokens, opt_num_tokens=60, use_embedding_sharing=use_embedding_sharing, ) @@ -424,6 +425,7 @@ def run_existing_checkpoints( use_embedding_sharing=use_embedding_sharing, max_input_len=512, max_output_len=model_info["max_output_len"], + max_num_tokens=None, ptuning=ptuning, p_tuning_checkpoint=p_tuning_checkpoint, lora=lora, @@ -448,7 +450,6 @@ def get_args(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton and benchmark the models", ) - parser.add_argument( "--model_name", type=str, @@ -499,6 +500,10 @@ def get_args(): type=int, default=128, ) + parser.add_argument( + "--max_num_tokens", + type=int, + ) parser.add_argument( "--p_tuning_checkpoint", type=str, @@ -646,6 +651,7 @@ def run_inference_tests(args): max_batch_size=args.max_batch_size, max_input_len=args.max_input_len, max_output_len=args.max_output_len, + max_num_tokens=args.max_num_tokens, ptuning=args.ptuning, p_tuning_checkpoint=args.p_tuning_checkpoint, lora=args.lora, diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py index 6a296fdb92eb..557d6c07613d 100644 --- a/tests/export/nemo_export.py +++ b/tests/export/nemo_export.py @@ -223,6 +223,7 @@ def run_inference( use_embedding_sharing=False, max_input_len=128, max_output_len=128, + max_num_tokens=None, use_parallel_embedding=False, ptuning=False, p_tuning_checkpoint=None, @@ -322,7 +323,7 @@ def run_inference( max_prompt_embedding_table_size=max_prompt_embedding_table_size, use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, - max_num_tokens=int(max_input_len * max_batch_size * 0.2), + max_num_tokens=max_num_tokens, use_embedding_sharing=use_embedding_sharing, ) @@ -511,6 +512,7 @@ def run_existing_checkpoints( use_parallel_embedding=use_parallel_embedding, max_input_len=512, max_output_len=model_info["max_output_len"], + max_num_tokens=None, ptuning=ptuning, p_tuning_checkpoint=p_tuning_checkpoint, lora=lora, @@ -596,7 +598,6 @@ def get_args(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton and benchmark the models", ) - parser.add_argument( "--model_name", type=str, @@ -652,6 +653,10 @@ def get_args(): type=int, default=128, ) + parser.add_argument( + "--max_num_tokens", + type=int, + ) parser.add_argument( 
"--use_parallel_embedding", type=str, @@ -856,6 +861,7 @@ def run_inference_tests(args): max_batch_size=args.max_batch_size, max_input_len=args.max_input_len, max_output_len=args.max_output_len, + max_num_tokens=args.max_num_tokens, use_parallel_embedding=args.use_parallel_embedding, ptuning=args.ptuning, p_tuning_checkpoint=args.p_tuning_checkpoint, diff --git a/tests/lightning/io/test_mixin.py b/tests/lightning/io/test_mixin.py index 824608db6bf0..3a520b8e74ae 100644 --- a/tests/lightning/io/test_mixin.py +++ b/tests/lightning/io/test_mixin.py @@ -14,3 +14,10 @@ def test_reinit(self): assert copied is not dummy assert copied.a == dummy.a assert copied.b == dummy.b + + def test_init(self): + outputs = [] + for i in range(1001): + outputs.append(DummyClass(i, i)) + + assert len(outputs) == 1001 diff --git a/tests/lightning/pytorch/callbacks/test_nsys.py b/tests/lightning/pytorch/callbacks/test_nsys.py index e8734ad1c1ac..d87da58b8ad0 100644 --- a/tests/lightning/pytorch/callbacks/test_nsys.py +++ b/tests/lightning/pytorch/callbacks/test_nsys.py @@ -68,6 +68,7 @@ def test_on_train_batch_start_profiling( mock_get_rank.return_value = 0 callback = NsysCallback(start_step=10, end_step=20, ranks=[0], gen_shape=True) + mock_trainer.strategy.current_epoch_step = 10 callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10) mock_cudart().cudaProfilerStart.assert_called_once() @@ -80,6 +81,7 @@ def test_on_train_batch_start_no_profiling(self, mock_cudart, mock_get_rank, moc mock_get_rank.return_value = 0 callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) + mock_trainer.strategy.current_epoch_step = 9 callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 9) mock_cudart().cudaProfilerStart.assert_not_called() @@ -94,6 +96,7 @@ def test_on_train_batch_end_profiling( mock_get_rank.return_value = 0 callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) + mock_trainer.strategy.current_epoch_step = 20 callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20) mock_cudart().cudaProfilerStop.assert_called_once() @@ -163,6 +166,7 @@ def test_profiling_range( mock_get_rank.return_value = 0 callback = NsysCallback(start_step=start_step, end_step=end_step, ranks=[0]) + mock_trainer.strategy.current_epoch_step = batch_idx callback.on_train_batch_start(mock_trainer, mock_pl_module, None, batch_idx) if expected_call: @@ -183,13 +187,16 @@ def test_single_profile_range(self, mock_cudart, mock_get_rank, mock_trainer, mo mock_trainer.strategy.root_device.type = 'cuda' # Start of range + mock_trainer.strategy.current_epoch_step = 10 callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 10) assert mock_cudart().cudaProfilerStart.call_count == 1, "cudaProfilerStart was not called" # Middle of range + mock_trainer.strategy.current_epoch_step = 25 callback.on_train_batch_start(mock_trainer, mock_pl_module, None, 25) assert mock_cudart().cudaProfilerStart.call_count == 1, "cudaProfilerStart was called again" # End of range + mock_trainer.strategy.current_epoch_step = 40 callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 40) assert mock_cudart().cudaProfilerStop.call_count == 1, "cudaProfilerStop was not called" diff --git a/tests/lightning/pytorch/callbacks/test_preemption.py b/tests/lightning/pytorch/callbacks/test_preemption.py index 5fcb4a1458ee..a385582ea021 100644 --- a/tests/lightning/pytorch/callbacks/test_preemption.py +++ b/tests/lightning/pytorch/callbacks/test_preemption.py @@ -1,4 +1,3 @@ -import logging import signal from 
unittest.mock import MagicMock, PropertyMock, patch @@ -6,7 +5,7 @@ import torch from pytorch_lightning import Trainer -from nemo.lightning.pytorch.callbacks.preemption import PreemptionCallback, PreemptionException +from nemo.lightning.pytorch.callbacks.preemption import PreemptionCallback class TestPreemptionCallback: @@ -100,15 +99,9 @@ def test_on_train_end(self, callback, mock_trainer): @pytest.mark.parametrize("interrupted", [True, False]) def test_on_train_batch_end(self, callback, mock_trainer, interrupted): with patch.object(PreemptionCallback, 'interrupted', new_callable=lambda: property(lambda self: interrupted)): - callback.on_train_batch_end(mock_trainer, None, None, None, 0) + if interrupted: + with pytest.raises(SystemExit): + callback.on_train_batch_end(mock_trainer, None, None, None, 0) + else: + callback.on_train_batch_end(mock_trainer, None, None, None, 0) assert mock_trainer.should_stop == interrupted - - def test_on_exception_preemption(self, callback, mock_trainer): - exception = PreemptionException("Test preemption") - callback.on_exception(mock_trainer, None, exception) - assert mock_trainer.should_stop - - def test_on_exception_other(self, callback, mock_trainer): - exception = ValueError("Some other exception") - callback.on_exception(mock_trainer, None, exception) - assert not mock_trainer.should_stop diff --git a/tests/lightning/test_dist_ckpt.py b/tests/lightning/test_dist_ckpt.py new file mode 100644 index 000000000000..98fe8d4a6107 --- /dev/null +++ b/tests/lightning/test_dist_ckpt.py @@ -0,0 +1,167 @@ +import os +from pathlib import Path + +import pytest +import pytorch_lightning as pl +import torch +from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator + +import nemo.lightning as nl +from nemo.collections import llm +from nemo.lightning.io.pl import MegatronCheckpointIO +from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO, AsyncFinalizerCallback + + +def _get_strategy(): + strategy = nl.MegatronStrategy( + enable_nemo_ckpt_io=False, + ) + return strategy + + +def _get_last_checkpoint_dir(model: pl.LightningModule, suffix: str = '') -> Path: + return f'epoch={model.trainer.current_epoch - 1}-step={model.trainer.max_steps - 1}{suffix}' + + +def get_model_and_data(): + micro_batch_size = 2 + global_batch_size = 2 + seq_length = 128 + data = llm.MockDataModule( + seq_length=seq_length, micro_batch_size=micro_batch_size, global_batch_size=global_batch_size + ) + + config = llm.GPTConfig( + num_layers=2, + hidden_size=64, + ffn_hidden_size=256, + num_attention_heads=4, + seq_length=seq_length, + apply_query_key_layer_scaling=1, + ) + reconfigure_num_microbatches_calculator( + 0, + None, + global_batch_size, + micro_batch_size, + data_parallel_size=1, + ) + return llm.GPTModel(config, tokenizer=data.tokenizer), data + + +class TestDistCkptIO: + + @pytest.mark.run_only_on('GPU') + def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path): + + model, data = get_model_and_data() + + strategy = _get_strategy() + + trainer = nl.Trainer( + devices=1, + accelerator="gpu", + strategy=strategy, + enable_checkpointing=True, + max_steps=2, + default_root_dir=str(tmp_path), + logger=False, + ) + + trainer.fit(model, data) + + assert isinstance(trainer.strategy.checkpoint_io, MegatronCheckpointIO) + # Ckpt path doesn't contain the .ckpt suffix + ckpts = os.listdir(Path(tmp_path / "checkpoints")) + assert len(ckpts) == 1 + ckpt = ckpts[0] + assert str(ckpt) == _get_last_checkpoint_dir(model) + + 
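# ---------------------------------------------------------------------------
# Editor's sketch (not part of this diff): the async-vs-sync test below compares
# only torch.Tensor entries of the two loaded checkpoints, since at least one key
# holds an io.BytesIO payload. The same idea as a small standalone helper:
import torch


def tensor_entries_match(sync_sd: dict, async_sd: dict) -> bool:
    """Return True if every tensor-valued entry in sync_sd equals the one in async_sd.

    Non-tensor payloads (e.g. io.BytesIO blobs, RNG states) are skipped rather than
    compared, mirroring the assertion loop in the test below.
    """
    if sync_sd.keys() != async_sd.keys():
        return False
    for key, value in sync_sd.items():
        if isinstance(value, torch.Tensor) and not torch.equal(value, async_sd[key]):
            return False
    return True
# ---------------------------------------------------------------------------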
@pytest.mark.run_only_on('GPU') + def test_async_save_produces_same_checkpoints_as_sync(self, tmp_path): + + model, data = get_model_and_data() + + sync_ckpt_dir = tmp_path / 'sync_checkpoints' + async_ckpt_dir = tmp_path / 'async_checkpoints' + + sync_checkpoint_io = MegatronCheckpointIO('torch_dist') + async_checkpoint_io = AsyncFinalizableCheckpointIO(MegatronCheckpointIO('torch_dist', async_save=True)) + + # dummy_trainer just to initialize NCCL + dummy_trainer = pl.Trainer( + devices=1, + logger=False, + max_steps=2, + strategy=_get_strategy(), + ) + dummy_trainer.fit(model, data) + strategy = _get_strategy() + tmp_path = strategy.broadcast(tmp_path) + + ## reset the model and data and train with sync checkpointing + model, data = get_model_and_data() + sync_test_trainer = pl.Trainer( + devices=1, + enable_checkpointing=True, + logger=False, + max_steps=2, + strategy=_get_strategy(), + plugins=[sync_checkpoint_io], + default_root_dir=str(sync_ckpt_dir), + ) + sync_test_trainer.fit(model, data) + + ## reset the model and data and train with sync checkpointing + model, data = get_model_and_data() + async_test_trainer = pl.Trainer( + devices=1, + enable_checkpointing=True, + logger=False, + max_steps=2, + strategy=_get_strategy(), + plugins=[async_checkpoint_io], + callbacks=AsyncFinalizerCallback(), + default_root_dir=str(async_ckpt_dir), + ) + async_test_trainer.fit(model, data) + + checkpoint = {'sharded_state_dict': model.sharded_state_dict()} + + sync_state_dict = sync_checkpoint_io.load_checkpoint( + Path(f"{sync_ckpt_dir}/checkpoints/{_get_last_checkpoint_dir(model)}"), sharded_state_dict=checkpoint + ) + + async_state_dict = async_checkpoint_io.load_checkpoint( + Path(f"{async_ckpt_dir}/checkpoints/{_get_last_checkpoint_dir(model)}"), sharded_state_dict=checkpoint + ) + + ## one of the keys is a _io.BytesIO object + for k in sync_state_dict['sharded_state_dict'].keys(): + if isinstance(sync_state_dict['sharded_state_dict'][k], torch.Tensor): + assert torch.all(sync_state_dict['sharded_state_dict'][k] == async_state_dict['sharded_state_dict'][k]) + + def test_sharded_strategies(self): + + model_checkpoint = nl.ModelCheckpoint() + + strategy = nl.MegatronStrategy( + enable_nemo_ckpt_io=False, + save_ckpt_format='torch_dist', + ckpt_parallel_save=True, + ckpt_load_directly_on_device=False, + ckpt_async_save=True, + ) + trainer = nl.Trainer( + callbacks=[model_checkpoint], + strategy=strategy, + ) + + assert isinstance(strategy.checkpoint_io, AsyncFinalizableCheckpointIO) + assert isinstance(strategy.checkpoint_io._checkpoint_io, MegatronCheckpointIO) + + base_checkpoint_io = strategy.checkpoint_io._checkpoint_io + + assert base_checkpoint_io.save_ckpt_format == 'torch_dist' + assert base_checkpoint_io.parallel_save + assert base_checkpoint_io.load_directly_on_device == False diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 0dd49838d9e4..a0a16150c65f 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -1,3 +1,6 @@ +import os +import time +from pathlib import Path from unittest.mock import patch import pytest @@ -5,6 +8,8 @@ from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl +from nemo.constants import NEMO_ENV_VARNAME_VERSION +from nemo.utils.exp_manager import NotFoundError class TestNeMoLogger: @@ -16,7 +21,7 @@ def test_loggers(self): trainer = nl.Trainer(accelerator="cpu") logger = nl.NeMoLogger( update_logger_directory=True, - wandb=WandbLogger(save_dir="test", 
offline=True), + wandb=WandbLogger(save_dir="wandb_logs", offline=True), ) logger.setup(trainer) @@ -24,16 +29,25 @@ def test_loggers(self): assert len(logger.extra_loggers) == 0 assert len(trainer.loggers) == 2 assert isinstance(trainer.loggers[1], WandbLogger) - assert str(trainer.loggers[1].save_dir).endswith("nemo_experiments") + assert str(trainer.loggers[1].save_dir).endswith("nemo_experiments/wandb_logs") assert trainer.loggers[1]._name == "default" def test_explicit_log_dir(self, trainer): explicit_dir = "explicit_test_dir" logger = nl.NeMoLogger(name="test", explicit_log_dir=explicit_dir) - with patch("nemo.utils.exp_manager.check_explicit_log_dir") as mock_check: - logger.setup(trainer) - mock_check.assert_called_once_with(trainer, explicit_dir, None, "test", None) + app_state = logger.setup(trainer) + assert str(app_state.log_dir) == "explicit_test_dir" + assert app_state.name == "" ## name should be ignored when explicit_log_dir is passed in + assert app_state.version == "" + + def test_default_log_dir(self, trainer): + + if os.environ.get(NEMO_ENV_VARNAME_VERSION, None) is not None: + del os.environ[NEMO_ENV_VARNAME_VERSION] + logger = nl.NeMoLogger(use_datetime_version=False) + app_state = logger.setup(trainer) + assert app_state.log_dir == Path(Path.cwd() / "nemo_experiments" / "default") def test_custom_version(self, trainer): custom_version = "v1.0" @@ -58,3 +72,93 @@ def test_model_checkpoint_setup(self, trainer): ptl_ckpt = next(cb for cb in trainer.callbacks if isinstance(cb, PTLModelCheckpoint)) assert str(ptl_ckpt.dirpath).endswith("test_ckpt") assert ptl_ckpt.filename == "test-{epoch:02d}-{val_loss:.2f}" + + def test_resume(self, trainer, tmp_path): + """Tests the resume capabilities of NeMoLogger + AutoResume""" + + if os.environ.get(NEMO_ENV_VARNAME_VERSION, None) is not None: + del os.environ[NEMO_ENV_VARNAME_VERSION] + + # Error because explicit_log_dir does not exist + with pytest.raises(NotFoundError): + nl.AutoResume( + dirpath=str(tmp_path / "test_resume"), + resume_if_exists=True, + ).setup(trainer) + + # Error because checkpoints folder does not exist + with pytest.raises(NotFoundError): + nl.AutoResume( + dirpath=str(tmp_path / "test_resume" / "does_not_exist"), + path="does_not_exist", + resume_if_exists=True, + ).setup(trainer) + + # No error because we tell autoresume to ignore notfounderror + nl.AutoResume( + dirpath=str(tmp_path / "test_resume" / "does_not_exist"), + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + ).setup(trainer) + + path = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints").mkdir(parents=True) + # Error because checkpoints do not exist in folder + with pytest.raises(NotFoundError): + nl.AutoResume( + dirpath=path, + resume_if_exists=True, + ).setup(trainer) + + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").mkdir() + # Error because *end.ckpt is in folder indicating that training has already finished + with pytest.raises(ValueError): + nl.AutoResume( + dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_if_exists=True, + ).setup(trainer) + + ## if there are multiple "-last" checkpoints, choose the most recent one + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").rmdir() + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last").mkdir() + time.sleep(1) ## sleep for a second so the checkpoints are created at different times + Path(tmp_path / 
"test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").mkdir() + nl.AutoResume( + dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_if_exists=True, + ).setup(trainer) + assert str(trainer.ckpt_path) == str( + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last") + ) + + # Finally succeed + logger = nl.NeMoLogger( + name="default", + dir=str(tmp_path) + "/test_resume", + version="version_0", + use_datetime_version=False, + ) + logger.setup(trainer) + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").rmdir() + nl.AutoResume( + resume_if_exists=True, + ).setup(trainer) + checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last") + assert Path(trainer.ckpt_path).resolve() == checkpoint.resolve() + + trainer = nl.Trainer(accelerator="cpu", logger=False) + # Check that model loads from `dirpath` and not /checkpoints + dirpath_log_dir = Path(tmp_path / "test_resume" / "dirpath_test" / "logs") + dirpath_log_dir.mkdir(parents=True) + dirpath_checkpoint_dir = Path(tmp_path / "test_resume" / "dirpath_test" / "ckpts") + dirpath_checkpoint = Path(dirpath_checkpoint_dir / "mymodel--last") + dirpath_checkpoint.mkdir(parents=True) + logger = nl.NeMoLogger( + name="default", + explicit_log_dir=dirpath_log_dir, + ) + logger.setup(trainer) + nl.AutoResume( + resume_if_exists=True, + dirpath=str(dirpath_checkpoint_dir), + ).setup(trainer) + assert Path(trainer.ckpt_path).resolve() == dirpath_checkpoint.resolve() diff --git a/tests/lightning/test_precision_plugin.py b/tests/lightning/test_precision_plugin.py new file mode 100644 index 000000000000..bdd834c3bf7a --- /dev/null +++ b/tests/lightning/test_precision_plugin.py @@ -0,0 +1,95 @@ +import pytest +import pytorch_lightning as pl +import torch +from megatron.core.optimizer import OptimizerConfig + +from nemo import lightning as nl +from nemo.collections import llm + + +class DummyTokenizer: + def __init__(self): + self.vocab_size = 30000 + + +class TestMegatronMixedPrecision: + """Unit tests for the MegatronMixedPrecision class.""" + + @pytest.mark.run_only_on('GPU') + def test_precision_plugin_fp8_passed(self): + """Test __init__ with default parameters.""" + + class TrainerHook(nl.Trainer): + def connect(self, model: pl.LightningModule) -> None: + assert model.config.bf16 == False + assert model.config.fp8 is None + super().connect(model) + assert model.config.fp8 == 'e4m3' + assert model.config.bf16 == True + + trainer = TrainerHook( + devices=2, + accelerator="gpu", + max_steps=2, + strategy=nl.MegatronStrategy( + tensor_model_parallel_size=2, + sequence_parallel=True, + ckpt_include_optimizer=False, + ), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed", fp8='e4m3'), + limit_val_batches=0.0, + num_sanity_val_steps=0, + ) + + optim = nl.MegatronOptimizerModule( + config=OptimizerConfig( + optimizer="adam", + lr=1e-5, + use_distributed_optimizer=False, + fp16=True, + params_dtype=torch.float32, + ), + ) + config = llm.Llama2Config7B() + config.num_layers = 2 + model = llm.LlamaModel(config, tokenizer=DummyTokenizer(), optim=optim) + trainer.strategy.connect(model) + + @pytest.mark.run_only_on('GPU') + def test_precision_plugin_precision_params_override(self): + """Test __init__ with default parameters.""" + trainer = nl.Trainer( + devices=2, + accelerator="gpu", + max_steps=2, + strategy=nl.MegatronStrategy( + tensor_model_parallel_size=2, + 
sequence_parallel=True, + ckpt_include_optimizer=False, + ), + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + limit_val_batches=0.0, + num_sanity_val_steps=0, + ) + + optim = nl.MegatronOptimizerModule( + config=OptimizerConfig( + optimizer="adam", + lr=1e-5, + use_distributed_optimizer=False, + fp16=True, + params_dtype=torch.float32, + ), + ) + config = llm.Llama2Config7B() + config.num_layers = 2 + config.fp16 = True + config.bf16 = False + model = llm.LlamaModel(config, tokenizer=DummyTokenizer(), optim=optim) + trainer.strategy.connect(model) + assert optim.config.bf16 is not None + assert optim.config.fp16 is not None + assert optim.config.bf16 == True + assert optim.config.fp16 == False + assert model.config.fp16 == False + assert model.config.bf16 == True diff --git a/tests/utils/test_trainer_utils.py b/tests/utils/test_trainer_utils.py new file mode 100644 index 000000000000..ed13b0c4ac38 --- /dev/null +++ b/tests/utils/test_trainer_utils.py @@ -0,0 +1,20 @@ +from omegaconf import OmegaConf +from pytorch_lightning.strategies import DDPStrategy + +from nemo.utils.trainer_utils import resolve_trainer_cfg + + +def test_resolve_trainer_cfg_strategy(): + cfg = OmegaConf.create({"strategy": "ddp"}) + ans = resolve_trainer_cfg(cfg) + assert isinstance(ans, dict) + assert ans["strategy"] == "ddp" + + cfg = OmegaConf.create( + {"strategy": {"_target_": "pytorch_lightning.strategies.DDPStrategy", "gradient_as_bucket_view": True}} + ) + ans = resolve_trainer_cfg(cfg) + assert isinstance(ans, dict) + assert isinstance(ans["strategy"], DDPStrategy) + assert "gradient_as_bucket_view" in ans["strategy"]._ddp_kwargs + assert ans["strategy"]._ddp_kwargs["gradient_as_bucket_view"] == True diff --git a/tutorials/multimodal/LITA Tutorial.ipynb b/tutorials/multimodal/LITA Tutorial.ipynb index 1dfc70add81a..6858c9724a97 100644 --- a/tutorials/multimodal/LITA Tutorial.ipynb +++ b/tutorials/multimodal/LITA Tutorial.ipynb @@ -36,7 +36,7 @@ "source": [ "# LITA Introduction\n", "\n", - "[LITA](https://arxiv.org/pdf/2403.19046) stands for Language Instructed Temporal-Localization Assistan, which demonstrates strong performance on Reasoning Temporal Localization (RTL) task. It introduces time tokens to better help LLM understand 'When?' question in video. The below figure from [LITA paper](https://arxiv.org/pdf/2403.19046) shows a clear idea of how LITA works.\n", + "[LITA](https://arxiv.org/pdf/2403.19046) stands for Language Instructed Temporal-Localization Assistant, which demonstrates strong performance on Reasoning Temporal Localization (RTL) task. It introduces time tokens to better help LLM understand 'When?' question in video. The below figure from [LITA paper](https://arxiv.org/pdf/2403.19046) shows a clear idea of how LITA works.\n", "\n", "\"drawing\"" ] @@ -46,7 +46,7 @@ "metadata": {}, "source": [ "## Tokenizer and Checkpoint Conversion\n", - "As we learned that LITA introduces `time tokens` so that timestampes of events in a video would be represented as time tokens instead of the original float point timestamps. Therefore we need to add these time tokens to the tokenizer of the backbone/LLM model. In this example, we take `Llama-3-VILA1.5-8B` as an example to show how to integrate LITA to a LLaVA like model. 
You may also use similar steps to convert other llama or LLaVA like models that have backbone LLM as llama such as [vicuna](https://huggingface.co/lmsys/vicuna-13b-v1.5) and [llava-v1.6-vicuna-13b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-13b).\n", + "As we learned that LITA introduces `time tokens` so that timestamps of events in a video would be represented as time tokens instead of the original float point timestamps. Therefore we need to add these time tokens to the tokenizer of the backbone/LLM model. In this example, we take `Llama-3-VILA1.5-8B` as an example to show how to integrate LITA to a LLaVA like model. You may also use similar steps to convert other llama or LLaVA like models that have backbone LLM as llama such as [vicuna](https://huggingface.co/lmsys/vicuna-13b-v1.5) and [llava-v1.6-vicuna-13b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-13b).\n", "\n", "Please download the huggingface `Llama-3-VILA1.5-8B` model." ] @@ -56,13 +56,14 @@ "execution_count": null, "metadata": { "vscode": { - "languageId": "plaintext" + "languageId": "shellscript" } }, "outputs": [], "source": [ - "! mkdir pretrained_models && cd pretrained_models\n", - "! git clone https://huggingface.co/Efficient-Large-Model/Llama-3-VILA1.5-8B" + "%%bash\n", + "mkdir /ws/pretrained_models && cd /ws/pretrained_models\n", + "git clone https://huggingface.co/Efficient-Large-Model/Llama-3-VILA1.5-8B" ] }, { @@ -108,7 +109,7 @@ "extra_tokens = [\"\",\"\",\"\",\"\",\"\",\"\"]\n", "tokenizer.add_tokens(extra_tokens)\n", "tokenizer.save_pretrained(tokenizer_path)\n", - "print(tokenizer.vocab_size)" + "print(len(tokenizer.vocab))" ] }, { @@ -126,7 +127,7 @@ "source": [ "from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer\n", "tokenizer = get_nmt_tokenizer(library=\"huggingface\", model_name=tokenizer_path)\n", - "print(tokenizer.vocab_size)" + "print(len(tokenizer.vocab))" ] }, { @@ -147,27 +148,32 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, "outputs": [], "source": [ - "! cd /opt/\n", - "! git clone --depth 1 --branch v1.2.2 https://github.com/haotian-liu/LLaVA/\n", - "! export PYTHONPATH=/opt/LLaVA/:$PYTHONPATH\n", - "! cd /ws # do not run the below commands under `/opt` folder" + "%%bash\n", + "git clone --depth 1 --branch v1.2.2 https://github.com/haotian-liu/LLaVA/ /ws/LLaVA\n", + "cd /ws" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "vscode": { - "languageId": "plaintext" + "languageId": "shellscript" } }, "outputs": [], "source": [ + "%%bash\n", + "export PYTHONPATH=/ws/LLaVA:$PYTHONPATH\n", "# check the config file in /opt/NeMo/examples/multimodal/multimodal_llm/neva/conf/vita_config.yaml\n", - "! python /opt/NeMo/examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py \\\n", + "python /opt/NeMo/examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py \\\n", " --in-file /ws/pretrained_models/Llama-3-VILA1.5-8B/llm \\\n", " --mm-vision-tower /ws/pretrained_models/Llama-3-VILA1.5-8B/vision_tower \\\n", " --mm-projector-ckpt-dir /ws/pretrained_models/Llama-3-VILA1.5-8B/mm_projector \\\n", @@ -191,7 +197,7 @@ "source": [ "## Finetuning\n", "\n", - "In this section, we'll preprocess the Dense Videco Captioning dataset and then do finetuning with the nemo ckpt we just converted." + "In this section, we'll preprocess the Dense Video Captioning dataset and then do finetuning with the nemo ckpt we just converted." 
] }, { @@ -231,7 +237,6 @@ " {\"from\": \"human\", \"value\": \"