diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 8c61c767b4f1..3e2d63285ec4 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -43,7 +43,7 @@ jobs: steps: - name: Docker system cleanup run: | - docker system prune -a --filter "until=48h" --force + docker system prune -a --filter "until=48h" --force || true - name: Docker pull image run: | diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 3c82269cb9a6..0b753d59a826 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -1,28 +1,101 @@ name: Create PR to main with cherry-pick from release on: - pull_request_target: + push: branches: - - 'r*.*.*' - types: ["closed"] + - main jobs: - cherry-pick-release-commit: - name: Cherry-pick release commit + main: runs-on: ubuntu-latest + environment: + name: main steps: - name: Checkout uses: actions/checkout@v3 with: fetch-depth: 0 - - name: github-cherry-pick-action v1.0.3 - uses: carloscastrojumo/github-cherry-pick-action@bb0869df47c27be4ae4c7a2d93d22827aa5a0054 - with: - branch: main - labels: | - cherry-pick - reviewers: | - ${{ github.event.pull_request.user.login }} + token: ${{ secrets.PAT }} + + + - name: Cherry pick + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + set -x + set +e + + git config --global user.email "nemo-bot@nvidia.com" + git config --global user.name "NeMo Bot" + + SHA=$(git rev-list --no-merges -n 1 HEAD) + MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) + PR_ID=$(echo $MESSAGE | awk -F'#' '{print $2}' | awk -F')' '{print $1}' ) + + PR=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/NeMo/pulls/$PR_ID) + + LABELS=$(echo -E $PR | jq '.labels | [.[].name] | join(",")' | tr -d '"') + + TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'r[^,]*') + + if [[ $TARGET_BRANCHES == '' ]]; then + echo Nothing to cherry-pick + exit 0 + fi + + echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do + TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false) + + if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then + echo Release branch does not yet exist, will not cherry-pick + continue + fi + + ( + git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH + git switch --force-create cherry-pick-$PR_ID-$RELEASE_BRANCH $RELEASE_BRANCH + git cherry-pick $SHA + git push -u origin --force cherry-pick-$PR_ID-$RELEASE_BRANCH + git checkout ${CI_DEFAULT_BRANCH:-main} + ) + + CHERRYPICK_SUCCESSFUL=$? 
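+          # NOTE: because of "set +e" above, the subshell does not abort on a failed step, so $? here
+          # reflects its last command (the final "git checkout") rather than "git cherry-pick" itself.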
+ + if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/NeMo/pulls \ + -d '{"title":"Cherry-pick '$PR_ID' into '$RELEASE_BRANCH'","head":"cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'","base":"'$RELEASE_BRANCH'"}' + + else + URL=https://github.com/NVIDIA/NeMo/pull/${{ github.event.number }} + + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'${{ github.event.number }}'> failed" + } + } + ] + }' + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }} + + fi + + done + + env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index dd74e050a533..daf530d8bec6 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. name: "CICD NeMo" - on: pull_request: branches: - 'main' - 'r**' types: [ labeled ] + workflow_dispatch: inputs: test_to_run: @@ -122,112 +122,219 @@ jobs: ' ### \'\' - L0_Unit_Tests_GPU: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 60 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true + # L0: GPU unit tests + L0_Unit_Tests_GPU_ASR: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Audio: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Common: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_LLM: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Multimodal: + needs: [cicd-test-container-setup] + uses: 
./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_NLP: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_TTS: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --with_downloads + + OPTIONAL_L0_Unit_Tests_GPU_Core: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/core -m "not pleasefixme" --with_downloads + IS_OPTIONAL: true + + L0_Unit_Tests_GPU_Hydra: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads + + OPTIONAL_L0_Unit_Tests_GPU_Lightning: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads + IS_OPTIONAL: true + + L0_Unit_Tests_GPU_Others: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads \ + --ignore=tests/collections/asr \ + --ignore=tests/collections/audio \ + --ignore=tests/collections/common \ + --ignore=tests/collections/llm \ + --ignore=tests/collections/multimodal \ + --ignore=tests/collections/nlp \ + --ignore=tests/collections/tts \ + --ignore=tests/core \ + --ignore=tests/core_ptl \ + --ignore=tests/hydra \ + --ignore=tests/lightning \ + --ignore=tests/utils # L0: CPU unit tests L0_Unit_Tests_CPU_ASR: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 
'true' with: RUNNER: self-hosted-azure-cpu TIMEOUT: 20 SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Audio: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Common: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_LLM: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Multimodal: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_NLP: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_TTS: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Core: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: 
true L0_Unit_Tests_CPU_Hydra: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Lightning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true - L0_Unit_Tests_CPU_Ohers: + L0_Unit_Tests_CPU_Others: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | @@ -4965,12 +5072,59 @@ jobs: AFTER_SCRIPT: | rm -rf examples/llm/gpt_pretrain_results rm -rf examples/llm/gpt_index_mappings + + L2_NeMo_2_SSM_Pretraining: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \ + --devices 1 \ + --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain \ + --max-steps 10 \ + --data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document + + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain + + L2_NeMo_2_SSM_Finetuning: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \ + --devices 1 \ + --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft \ + --max-steps 10 \ + --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt + + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft Nemo_CICD_Test: needs: + - pre-flight - gpu-test - cicd-test-container-setup - - L0_Unit_Tests_GPU + + - L0_Unit_Tests_GPU_ASR + - L0_Unit_Tests_GPU_Audio + - L0_Unit_Tests_GPU_Common + - L0_Unit_Tests_GPU_LLM + - L0_Unit_Tests_GPU_Multimodal + - L0_Unit_Tests_GPU_NLP + - L0_Unit_Tests_GPU_TTS + #- OPTIONAL_L0_Unit_Tests_GPU_Core + - L0_Unit_Tests_GPU_Hydra + #- OPTIONAL_L0_Unit_Tests_GPU_Lightning + - L0_Unit_Tests_GPU_Others + - L0_Unit_Tests_CPU_ASR - L0_Unit_Tests_CPU_Audio - L0_Unit_Tests_CPU_Common @@ -4981,7 +5135,8 @@ jobs: - L0_Unit_Tests_CPU_Core - L0_Unit_Tests_CPU_Hydra - L0_Unit_Tests_CPU_Lightning - - L0_Unit_Tests_CPU_Ohers + - L0_Unit_Tests_CPU_Others + - L2_Community_LLM_Checkpoints_tests_Bert - 
L2_Community_LLM_Checkpoints_tests_Mamba2 - L2_Community_LLM_Checkpoints_tests_Llama @@ -5083,6 +5238,8 @@ jobs: #- OPTIONAL_L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - L2_NeMo_2_GPT_DDP_Param_Parity_check + - L2_NeMo_2_SSM_Pretraining + - L2_NeMo_2_SSM_Finetuning if: always() runs-on: ubuntu-latest steps: @@ -5176,3 +5333,4 @@ jobs: - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} run: | exit 1 + diff --git a/.github/workflows/config/.secrets.baseline b/.github/workflows/config/.secrets.baseline index 2bf4e372565c..4a56aaad3c58 100644 --- a/.github/workflows/config/.secrets.baseline +++ b/.github/workflows/config/.secrets.baseline @@ -123,6 +123,15 @@ } ], "results": { + ".github/workflows/cicd-main.yml": [ + { + "type": "Base64 High Entropy String", + "filename": ".github/workflows/cicd-main.yml", + "hashed_secret": "593951c440200143335452427205ae7c8580d463", + "is_verified": false, + "line_number": 1503 + } + ], "docs/source/nlp/question_answering.rst": [ { "type": "Hex High Entropy String", @@ -2074,5 +2083,5 @@ } ] }, - "generated_at": "2024-09-04T00:45:39Z" + "generated_at": "2024-09-08T19:00:15Z" } diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index f8d037271f36..7f8cd3dad8f5 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -7,6 +7,11 @@ on: description: 'MAJOR.MINOR.PATCH[rcN] (Example: 2.0.0rc1, or 2.1.0)' required: true type: string + is_prelease: + description: Whether to keep and bump the pre-release label + required: false + default: false + type: boolean mcore_version: description: 'Version of MCore to use (must be a valid git ref)' required: true @@ -27,25 +32,25 @@ jobs: fetch-depth: 0 fetch-tags: true ref: main - - - name: Get Previous tag - id: previous-tag - # git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags ==> refs/tags/vX.Y.Z in descending order of date - # awk 'FNR == 2 {print substr($1, 11, length($1))}') ==> Selects the 2nd tag from the list, then strips the /refs/tags/ part of the tag - # set-output name=tag_name:: ==> Takes the clean tag vX.Y.Z and sets it to steps.previous_tag.outputs.tag_name - run: | - TAG=$(git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags | awk 'FNR == 2 {print substr($1, 11, length($1))}') - echo "tag-name=$TAG" >> "$GITHUB_OUTPUT" + token: ${{ secrets.PAT }} - name: Get release branch ref id: release-branch run: | cd ${{ github.run_id }} - + + if [[ "${{ inputs.is_prelease }}" == "false" ]]; then + sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py + fi + VERSION=$(python -c 'import nemo; print(nemo.__version__)') - echo "Release version r$VERSION" > version + + echo "Release version r$VERSION" > version echo "version=$VERSION" >> "$GITHUB_OUTPUT" + git switch --force-create r$VERSION origin/main + git push -u origin r$VERSION --force + - name: Pin branch name in Notebooks run: | cd ${{ github.run_id }} @@ -56,34 +61,13 @@ jobs: cd ${{ github.run_id }} sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ inputs.mcore_version }}/' Dockerfile.ci - - name: Build Changelog - id: build-changelog - uses: mikepenz/release-changelog-builder-action@v3.3.1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - # Configuration file is setup with filters for domains - # owner:repo must point to current repo - # fromTag: Auto resolved from historical tag order (previous tag compared to current tag) - # toTag: Current tag reference - configuration: 
".github/workflows/config/changelog-config.json" - owner: ${{ github.repository_owner }} - repo: ${{ github.event.repository.name }} - ignorePreReleases: "false" - failOnError: "false" - fromTag: ${{ steps.previous-tag.outputs.tag-name }} - toTag: main - - - name: Append Changelog - run: | - echo "${{ steps.build-changelog.outputs.changelog }}" - - name: Create Release PR uses: peter-evans/create-pull-request@v6 id: create-pull-request with: path: ${{ github.run_id }} - branch: r${{ steps.release-branch.outputs.version }} + base: r${{ steps.release-branch.outputs.version }} + branch: ci/release-r${{ steps.release-branch.outputs.version }} title: 'Release `${{ steps.release-branch.outputs.version }}`' body: | 🚀 PR to release NeMo `${{ steps.release-branch.outputs.version }}`. @@ -101,22 +85,6 @@ jobs: assignees: okoenig labels: 'Run CICD' - - name: Add Summary comment - uses: peter-evans/create-or-update-comment@v4 - with: - issue-number: ${{ steps.create-pull-request.outputs.pull-request-number }} - body: | - # Highlights - __ - - - name: Add Changelog comment - uses: peter-evans/create-or-update-comment@v4 - with: - issue-number: ${{ steps.create-pull-request.outputs.pull-request-number }} - body: | - # Detailed Changelogs - ${{ steps.build-changelog.outputs.changelog }} - bump-next-version: runs-on: ubuntu-latest needs: [create-release-branch] diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index af09fa241c59..30033a80e6c7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -34,11 +34,12 @@ jobs: PAYLOAD=$(jq \ -n \ -c \ + --arg TAG_NAME "v${VERSION}" \ --arg CI_COMMIT_BRANCH "${{ inputs.branch }}" \ --arg NAME "$NAME" \ --arg BODY "$CHANGELOG" \ '{ - "tag_name": $CI_COMMIT_BRANCH, + "tag_name": $TAG_NAME, "target_commitish": $CI_COMMIT_BRANCH, "name": $NAME, "body": $BODY, diff --git a/.github/workflows/secrets-detector.yml b/.github/workflows/secrets-detector.yml index 4de052535cc1..a7793a9c62db 100644 --- a/.github/workflows/secrets-detector.yml +++ b/.github/workflows/secrets-detector.yml @@ -23,10 +23,14 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: + path: ${{ github.run_id }} ref: ${{ inputs.branch-name || github.head_ref }} + fetch-depth: 0 - name: Install secrets detector run: pip install detect-secrets - name: Run on change-set - run: git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .github/workflows/config/.secrets.baseline \ No newline at end of file + run: | + cd ${{ github.run_id }} + git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .github/workflows/config/.secrets.baseline \ No newline at end of file diff --git a/Dockerfile.ci b/Dockerfile.ci index 3d9a9d9b08a1..7e3ba798d62e 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -18,23 +18,29 @@ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3 FROM ${BASE_IMAGE} -ENV TRANSFORMERS_OFFLINE=0 +ENV TRANSFORMERS_OFFLINE=0 ENV HYDRA_FULL_ERROR=1 ENV PYTHONUNBUFFERED=1 # APT packages RUN <<"EOF" bash -ex apt-get update -apt-get install -y bc libsox-fmt-all -y +apt-get install -y bc libsox-fmt-all -y apt-get clean EOF WORKDIR /workspace +RUN pip install hatchling # needed to install nemo-run +ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2 +RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_TAG} + # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 
-ARG MCORE_TAG=3396356ab4ca83cc4c4d3272530b142a1702606e + +ARG MCORE_TAG=01945b98d1ea3a2acb5e8301e181a328104f4856 + ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md index eca42f2d0695..98dae2dc0a78 100644 --- a/docs/source/performance/performance_summary.md +++ b/docs/source/performance/performance_summary.md @@ -11,18 +11,18 @@ | Model | #-GPUs | GBS | MBS | Sequence Length| TP | PP | CP | VP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to train in days (10T tokens, 1K GPUs)*** | | ----- | ------ | --- | --- | ---------------| -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ | -| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 23574 | 770 | ***5*** | -| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5894 | 755 | ***19*** | -| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 745 | 802 | **152** | -| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 832 | [895](https://mlcommons.org/benchmarks/training/) | **136** | -| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16634 | 767 | ***7*** | +| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 23406 | 765 | ***5*** | +| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5851 | 750 | ***19*** | +| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 716 | 771 | **158** | +| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 825 | [888](https://mlcommons.org/benchmarks/training/) | **137** | +| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16934 | 780 | ***7*** | | LLAMA2-13B | 16 | 128 | 1 | 4096 | 1 | 4 | 1 | 10 | 8715 | 760 | ***13*** | -| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1717 | 763 | ***66*** | +| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1728 | 768 | ***65*** | | Nemotron-8B | 64 | 256 | 4 | 4096 | 2 | 1 | 1 | 1 | 12507 | 643 | ***9*** | -| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4289 | 559 | ***26*** | -| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 328 | 691 | ***344*** | -| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 11883 | 688 | ***10*** | -| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1549 | 746 | ***73*** | +| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4312 | 562 | ***26*** | +| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 326 | 686 | ***347*** | +| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 12273 | 711 | ***9*** | +| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1524 | 734 | ***74*** | ### Finetuning @@ -34,9 +34,9 @@ | Model | Task | #-GPUs | GBS | MBS | Packed Sequence Length | TP | PP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. 
time to finetune in mins (10M tokens)*** | | ----- | ---- | --- | --- | --- | --------------- | -- | -- | ------------------ | ----------------------- | -------------------------------------------------- | -| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 17617 | 702 | ***1.2*** | +| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 16891 | 673 | ***1.2*** | | LLAMA2-13B | SFT | 8 | 32 | 1 | 4096 | 1 | 4 | 10176 | 787 | ***2.0*** | -| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1812 | 747 | ***5.7*** | -| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 25206 | 673 | ***0.8*** | -| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14760 | 764 | ***1.4*** | +| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1816 | 749 | ***5.7*** | +| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 24824 | 663 | ***0.8*** | +| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14629 | 757 | ***1.4*** | | LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2621 | 722 | ***7.9*** | diff --git a/examples/llm/auto_configurator/README.md b/examples/llm/auto_configurator/README.md new file mode 100644 index 000000000000..26cf5cd75263 --- /dev/null +++ b/examples/llm/auto_configurator/README.md @@ -0,0 +1,85 @@ +> [!IMPORTANT] +> This is an early version of the Auto Configurator, and the code base can be modified as it will be integrated into the CLI. + +Use Auto Configurator to Find the Optimal Configuration +------------------------------------------------------- + +Auto Configurator searches for hyperparameters (HPs) that achieve the maximum highest training throughput when working with Large Language Models (LLMs) utilizing the NeMo Framework. + +> [!NOTE] +> Auto Configurator is only supported now for GPT-based models: GPT3, LLama, Mixtral, Mistral, Gemma and Nemotron. + +Auto Configurator Capabilities +------------------------------ + +Auto Configurator is intended to iterate over different model configurations quickly and find the best configuration, that is, the configuration that minimizes both time and financial expenditure. It offers a range of features to facilitate this, as detailed in the list below. + +- **Model size recommendation**: finds the optimal model size if the parameter is not specified. +- **Training time estimation**: estimates model training time based on input parameters. +- **Base configuration generation**: returns a basic model configuration. +- **Hyperparameters recommendation**: finds the optimal list of hyperparameters to be trained. +- **Optimal configuration recommendation**: calculates the performance after a short training of candidate configurations and finds the optimal model configuration. + +Model Size Recommendation +------------------------- + +If you have not decided what model size you want to train, Auto Configurator can recommend a model size for your use case. If you know the number of GPUs, TFLOPS per GPU, the maximum time to train, and the number of tokens to train for, it can recommend a model size that can be trained with the specified hardware and time constraints. + +For example, if you had 20 NVIDIA DGX nodes available (in 80 GB GPU memory), and wanted to train a GPT model for a maximum of 5 days, Auto Configurator would recommend using a 5B parameter GPT model. + +Training Time Estimation +------------------------ + +Auto Configurator calculates the estimated training time for your model. It provides a projection of the training time in days, based on the input dataset and parameters you provide. 
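+For intuition, a rough back-of-the-envelope sketch of such an estimate (using the common "6 x parameters x tokens" FLOPs approximation; this is only an illustration, not necessarily the exact formula Auto Configurator implements):
+
+```python
+# Rough estimate: total training FLOPs ~= 6 * parameter_count * token_count.
+def estimate_days(params_b: float, tokens_b: float, num_gpus: int, achieved_tflops_per_gpu: float) -> float:
+    total_flops = 6 * (params_b * 1e9) * (tokens_b * 1e9)
+    seconds = total_flops / (num_gpus * achieved_tflops_per_gpu * 1e12)
+    return seconds / 86400
+
+# Example: a 5B-parameter model, 300B tokens, 160 GPUs at ~400 achieved TFLOPS per GPU.
+print(f"{estimate_days(5, 300, 160, 400):.1f} days")
+```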
+ +Base Configuration Generation +----------------------------- + +When you provide the model size, or Auto Configurator has suggested one, it generates a base configuration for the target model. The base configuration is a valid configuration in NeMo 2.0 format. The optimization of throughput, however, is conducted in the next step. + +Hyperparameters Recommendation +------------------------------ + +After Auto Configurator generates the base configuration, it searches over four critical hyperparameters that have a great impact on training throughput but do not affect model convergence. These hyperparameters include Tensor Parallelism (TP), Pipeline Parallelism (PP), Context Parallelism (CP), Expert Parallelism (EP), Micro Batch Size (MBS), and Activation Checkpointing Layers (ActCkpt). Auto Configurator will also provide optimal Global Batch Size (GBS) if it's not specified. + +Auto Configurator initially applies heuristics to identify suitable candidates for the four key parameters, subsequently generating a grid of candidate configurations. It returns all of the candidate configurations in NeMo 2.0 format. + +> [!NOTE] +> Some of the candidate configurations may not work due to high-memory usage or other issues. + +Once the candidate configurations are generated, you can use NeMo Framework to launch the most promising candidates. + +When running the candidates on the cluster, you can limit job time and job max steps by using ``max_minutes_per_run`` and ``max_steps_per_run`` parameters. During this search, the jobs will run with the number of nodes specified in the configuration files, using the ``num_nodes`` parameter. Once all of the jobs have finished running, you'll need to run compare_throughput.py to get a ``.csv`` table with performance results for each succeeded job. + +Optimal Configuration Recommendation +------------------------------------ + +After all of the candidate jobs are done, Auto Configurator calculates performance parameters for each of the candidates. +Auto Configurator generates two ``.csv`` files: one detailing the performance measures of the candidates and another listing the candidates that failed due to out-of-memory errors. + +End-To-End Example +------------------ + +The following list shows the required input parameters for the Auto Configurator runner: + +- ``model``: model configuration based on NeMo 2.0. +- ``num_nodes``: number of nodes to be used for the training. +- ``seq_length``: sequence length to be used for the training. +- ``data_paths``: dataset to be used for the training. +- ``tokenizer_path``: path to tokenizer model if custom tokenizer will be used. + +The following list shows the optional parameters for the Auto Configurator runner: + +- ``global_batch_size``: global batch size to be used. +- ``tensor_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``pipeline_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``context_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``expert_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``micro_batch_sizes``: a list, such as ``[1, 2, 4]``. +- ``min_model_parallel_size``: a value for the minimum desired parallelism. +- ``max_model_parallel_size``: a value for the maximum desired parallelism. + +For each of the optional parameters, Auto Configurator will find the optimal value if the parameter is not specified. To view the full list of parameters, please visit [this page](https://github.com/NVIDIA/NeMo/blob/dpykhtar/nemo_autoconf/nemo/collections/llm/tools/auto_configurator/runner.py#L51). 
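+
+A condensed sketch of wiring these parameters together (based on the `examples/llm/auto_configurator/auto_config.py` script added in this PR; paths are placeholders and several arguments from that script are omitted for brevity):
+
+```python
+import nemo_run as run
+
+from nemo.collections.llm import GPTConfig126M
+from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs
+
+runner = AutoConfigurator(
+    model=run.Config(GPTConfig126M),       # model configuration in NeMo 2.0 format
+    num_nodes=1,
+    seq_length=512,
+    global_batch_size=16,                  # optional
+    micro_batch_sizes=[1, 2, 4],           # optional; searched if not specified
+    data_paths="/path/to/dataset_prefix",  # placeholder
+    path_to_logs="/path/to/logs",          # placeholder
+)
+
+# Returns the base configuration plus a dict of candidate configurations to launch.
+base_cfg, configs = generate_configs(runner)
+```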
+ +To view an end-to-end example of how to generate candidate configs, train them, and calculate the performance using Auto Configurator with NeMo Framework, please visit [this page](https://github.com/NVIDIA/NeMo/blob/dpykhtar/nemo_autoconf/examples/llm/auto_configurator/auto_config.py). + diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py new file mode 100644 index 000000000000..c202d4d33325 --- /dev/null +++ b/examples/llm/auto_configurator/auto_config.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import fiddle as fdl +import nemo_run as run + +from nemo.collections.llm import GPTConfig126M +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs, get_results + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--run_number", type=int, help="Number of config to run") + parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") + parser.add_argument("--data_path", type=str, help="Path to the dataset") + parser.add_argument("--get_results", action="store_true") + + return parser.parse_args() + + +def train_config(args): + # GPT-3 126M + # This example will generate 3 configs. + # It is expected that this script will be run 3 times with changing --run_number flag for each run from 0 to 2. + # After all configurations are trained, please trigger the script using --get_results flag. 
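+    # Typical invocations (flags defined in get_args above; paths are placeholders):
+    #   python auto_config.py --run_number <N> --logs_dir ./autoconf_logs --data_path /path/to/dataset
+    #   python auto_config.py --get_results --logs_dir ./autoconf_logs --data_path /path/to/dataset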
+ runner = AutoConfigurator( + model=run.Config(GPTConfig126M), + num_nodes=1, + gpus_per_node=1, + gpu_memory_gb=40, + global_batch_size=16, + seq_length=512, + tensor_parallel_sizes=[1], + pipeline_parallel_sizes=[1], + micro_batch_sizes=[1, 2, 4], + max_training_days=1, + max_steps_per_run=25, + num_tokens_in_b=10, + vocab_size=51200, + data_paths=args.data_path, + path_to_logs=args.logs_dir, + ) + + base_cfg, configs = generate_configs(runner) + if not args.get_results: + # Get generated configs + partials = list(configs.values()) + names = list(configs.keys()) + + # Run pre-training + partial = partials[args.run_number - 1] + partial.log.dir = os.path.join(args.logs_dir, names[args.run_number - 1]) + pretrain = fdl.build(partial) + pretrain() + else: + # # Get Auto Configurator results + get_results(base_cfg, runner, args.logs_dir) + print(f"The results were successfully saved to {args.logs_dir}.") + + +def main(): + args = get_args() + train_config(args) + + +if __name__ == '__main__': + main() diff --git a/examples/llm/pretrain/README.md b/examples/llm/pretrain/README.md new file mode 100644 index 000000000000..c9bb7331f972 --- /dev/null +++ b/examples/llm/pretrain/README.md @@ -0,0 +1,72 @@ +# Pre-training + +### Listing the available recipes for pretraining + +```bash +nemorun llm pretrain --help +``` + +![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png) + + +### Run pre-training with a default recipe + +```bash +nemorun llm pretrain --factory llama3_8b +``` + +![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png) + +We can also call the factory function with custom parameters: + +```bash +nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" +``` + +![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png) + + +The CLI allows you to overwrite any parameter. For example, to run the recipe with 2000 steps: + +```bash +nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000 +``` + +The syntax of the CLI is the same as the Python code. Which is great but in some cases you might want to inspect & edit a recipe interactively. An easy way to do this using the cli is the use the `--repl` flag. + +```bash +nemorun llm pretrain --factory llama3_70b --repl +``` + +![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif) + +We can also trigger a run from a jupyter notebook, see [pretrain.ipynb](pretrain.ipynb) for an example. This allows visualizes all configs in a structured format. See for instance the `llama3_8b` recipe: + +![llama3_8b_visualization](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_8b_config.svg) + + +### Create and run a custom recipe + +We can create a script that contains a custom recipe. See [custom_recipe.py](custom_recipe.py) for an example. + +Note that we end the script with a call to `run.cli.main()`, which uses the same syntax as the CLI but allows us to provide specific defaults. We still can overwrite any parameter using the syntax `param=value`. We can set nested parameters using dotted notation, e.g. `trainer.max_steps=2000`. + +When running the custom_recipe.py file, it will execute the `custom_llama3_8b` recipe by default. However, you can select different recipes or modify parameters using the following methods: + +1. 
To select the `custom_llama3_70b` recipe: + ```bash + python custom_recipe.py --factory custom_llama3_70b + ``` + This will automatically call the `custom_llama3_70b` function defined in the script. + +2. To overwrite any parameter: + ```bash + python custom_recipe.py trainer.max_steps=2000 + ``` + +3. You can even apply transformations when triggering the CLI as if it's Python code: + ```bash + python custom_recipe.py "trainer.max_steps=*2" + ``` + +These options provide flexibility in customizing your pretraining recipe directly from the command line. \ No newline at end of file diff --git a/examples/llm/pretrain/custom_recipe.py b/examples/llm/pretrain/custom_recipe.py new file mode 100644 index 000000000000..a522a1a8e1f5 --- /dev/null +++ b/examples/llm/pretrain/custom_recipe.py @@ -0,0 +1,44 @@ +import nemo_run as run + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama3_8b, llama3_70b + + +def custom_llama3_8b(): + pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8) + + pretrain.trainer.val_check_interval = 400 + pretrain.log.ckpt.save_top_k = -1 + pretrain.log.ckpt.every_n_train_steps = 400 + + pretrain.trainer.max_steps = 1000 + + return pretrain + + +def custom_llama3_70b(): + pretrain = llama3_70b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8) + + pretrain.trainer.val_check_interval = 400 + pretrain.log.ckpt.save_top_k = -1 + pretrain.log.ckpt.every_n_train_steps = 400 + + pretrain.trainer.max_steps = 1000 + + return pretrain + + +if __name__ == "__main__": + # When running this file, it will run the `custom_llama3_8b` recipe + + # To select the `custom_llama3_70b` recipe, use the following command: + # python custom_recipe.py --factory custom_llama3_70b + # This will automatically call the custom_llama3_70b that's defined above + + # Note that any parameter can be overwritten by using the following syntax: + # python custom_recipe.py trainer.max_steps=2000 + + # You can even apply transformations when triggering the CLI as if it's python code + # python custom_recipe.py "trainer.max_steps*=2" + + run.cli.main(llm.pretrain, default_factory=custom_llama3_8b) diff --git a/examples/llm/pretrain/default_executor.py b/examples/llm/pretrain/default_executor.py new file mode 100644 index 000000000000..2668d312f2b8 --- /dev/null +++ b/examples/llm/pretrain/default_executor.py @@ -0,0 +1,106 @@ +from typing import Optional +import nemo_run as run +from nemo.collections import llm + + +def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor: + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + + executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) + + return executor + + +def slurm_executor( + user: str, + host: str, + remote_job_dir: str, + account: str, + partition: str, + nodes: int, + devices: int, + time: str = "01:00:00", + custom_mounts: Optional[list[str]] = None, + custom_env_vars: Optional[dict[str, str]] = None, + container_image: str = "nvcr.io/nvidia/nemo:dev", + retries: int = 0, +) -> run.SlurmExecutor: + if not (user and host and remote_job_dir and account and partition and nodes and devices): + raise RuntimeError( + "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function." 
+ ) + + mounts = [] + if custom_mounts: + mounts.extend(custom_mounts) + + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + if custom_env_vars: + env_vars |= custom_env_vars + + executor = run.SlurmExecutor( + account=account, + partition=partition, + tunnel=run.SSHTunnel( + user=user, + host=host, + job_dir=remote_job_dir, + ), + nodes=nodes, + ntasks_per_node=devices, + gpus_per_node=devices, + mem="0", + exclusive=True, + gres="gpu:8", + packager=run.GitArchivePackager(subpath="examples/llm/run"), + ) + + executor.container_image = container_image + executor.container_mounts = mounts + executor.env_vars = env_vars + executor.retries = retries + executor.time = time + + return executor + + +def my_slurm_executor(): + # TODO: Set your custom parameters for the Slurm Executor. + return slurm_executor( + user="", + host="", + remote_job_dir="", + account="", + partition="", + nodes=1, + devices=2, + ) + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_executor=local_executor_torchrun) + + # This will re-expose the pretrain entrypoint with your custom local executor as default. + + # To run, for instance, the llama3_8b recipe, use the following command: + # python default_executor.py --factory llama3_8b + + # To run with any overrides, use the following command: + # python default_executor.py --factory llama3_8b trainer.max_steps=2000 + + # To use your custom Slurm executor, use the following command: + # python default_executor.py --executor my_slurm_executor --factory llama3_8b diff --git a/examples/llm/pretrain/pretrain.ipynb b/examples/llm/pretrain/pretrain.ipynb new file mode 100644 index 000000000000..194741a9da9f --- /dev/null +++ b/examples/llm/pretrain/pretrain.ipynb @@ -0,0 +1,737 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Trigger a run from a notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-08-29 17:14:25 nemo_logging:349] /Users/romeyn/base/code/.venv/lib/python3.10/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.\n", + " warnings.warn(\n", + " \n", + "[NeMo W 2024-08-29 17:14:25 nemo_logging:349] /Users/romeyn/base/code/.venv/lib/python3.10/site-packages/megatron/core/optimizer/clip_grads.py:31: UserWarning: Transformer Engine and Apex are not installed. 
Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale\n", + " warnings.warn(\n", + " \n" + ] + } + ], + "source": [ + "import nemo_run as run\n", + "from nemo.collections import llm\n", + "from nemo.collections.llm.recipes import llama3_8b\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "2\n", + "\n", + "\n", + "Config:\n", + " Llama3Config8B\n", + "\n", + "\n", + "no arguments\n", + "\n", + "\n", + "\n", + "1\n", + "\n", + "\n", + "Config:\n", + " LlamaModel\n", + "\n", + "\n", + "config\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "1:c--2:c\n", + "\n", + "\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "Partial:\n", + " pretrain\n", + "\n", + "\n", + "model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "data\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "trainer\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "log\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "resume\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "optim\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0:c--1:c\n", + "\n", + "\n", + "\n", + "\n", + "3\n", + "\n", + "\n", + "Config:\n", + " MockDataModule\n", + "\n", + "\n", + "seq_length\n", + "\n", + "8192\n", + "\n", + "\n", + "micro_batch_size\n", + "\n", + "1\n", + "\n", + "\n", + "global_batch_size\n", + "\n", + "512\n", + "\n", + "\n", + "\n", + "0:c--3:c\n", + "\n", + "\n", + "\n", + "\n", + "4\n", + "\n", + "\n", + "Config:\n", + " Trainer\n", + "\n", + "\n", + "accelerator\n", + "\n", + "'gpu'\n", + "\n", + "\n", + "accumulate_grad_batches\n", + "\n", + "1\n", + "\n", + "\n", + "callbacks\n", + "\n", + "\n", + "\n", + "list\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "devices\n", + "\n", + "8\n", + "\n", + "\n", + "gradient_clip_val\n", + "\n", + "1.0\n", + "\n", + "\n", + "limit_test_batches\n", + "\n", + "50\n", + "\n", + "\n", + "limit_val_batches\n", + "\n", + "32\n", + "\n", + "\n", + "log_every_n_steps\n", + "\n", + "10\n", + "\n", + "\n", + "max_steps\n", + "\n", + "1168251\n", + "\n", + "\n", + "num_nodes\n", + "\n", + "1\n", + "\n", + "\n", + "plugins\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "strategy\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "use_distributed_sampler\n", + "\n", + "False\n", + "\n", + "\n", + "val_check_interval\n", + "\n", + "2000\n", + "\n", + "\n", + "\n", + "0:c--4:c\n", + "\n", + "\n", + "\n", + "\n", + "9\n", + "\n", + "\n", + "Config:\n", + " NeMoLogger\n", + "\n", + "\n", + "name\n", + "\n", + "'default'\n", + "\n", + "\n", + "dir\n", + "\n", + "None\n", + "\n", + "\n", + "ckpt\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "tensorboard\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "wandb\n", + "\n", + "None\n", + "\n", + "\n", + "\n", + "0:c--9:c\n", + "\n", + "\n", + "\n", + "\n", + "12\n", + "\n", + "\n", + "Config:\n", + " AutoResume\n", + "\n", + "\n", + "resume_if_exists\n", + "\n", + "True\n", + "\n", + "\n", + "resume_ignore_no_checkpoint\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "0:c--12:c\n", + "\n", + "\n", + "\n", + "\n", + "13\n", + "\n", + "\n", + "Config:\n", + " MegatronOptimizerModule\n", + "\n", + "\n", + "config\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "lr_scheduler\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0:c--13:c\n", + "\n", + "\n", + "\n", + "\n", + "5\n", + "\n", + "\n", + "Config:\n", + " TimingCallback\n", + "\n", + "\n", + 
"no arguments\n", + "\n", + "\n", + "\n", + "4:c--5:c\n", + "\n", + "\n", + "\n", + "\n", + "6\n", + "\n", + "\n", + "Config:\n", + " MegatronMixedPrecision\n", + "\n", + "\n", + "precision\n", + "\n", + "'bf16-mixed'\n", + "\n", + "\n", + "params_dtype\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "pipeline_dtype\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "autocast_enabled\n", + "\n", + "False\n", + "\n", + "\n", + "grad_reduce_in_fp32\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "4:c--6:c\n", + "\n", + "\n", + "\n", + "\n", + "8\n", + "\n", + "\n", + "Config:\n", + " MegatronStrategy\n", + "\n", + "\n", + "tensor_model_parallel_size\n", + "\n", + "1\n", + "\n", + "\n", + "pipeline_model_parallel_size\n", + "\n", + "1\n", + "\n", + "\n", + "virtual_pipeline_model_parallel_size\n", + "\n", + "None\n", + "\n", + "\n", + "context_parallel_size\n", + "\n", + "2\n", + "\n", + "\n", + "sequence_parallel\n", + "\n", + "False\n", + "\n", + "\n", + "ckpt_include_optimizer\n", + "\n", + "True\n", + "\n", + "\n", + "pipeline_dtype\n", + "\n", + "None\n", + "\n", + "\n", + "ckpt_async_save\n", + "\n", + "True\n", + "\n", + "\n", + "ckpt_parallel_load\n", + "\n", + "True\n", + "\n", + "\n", + "gradient_as_bucket_view\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "4:c--8:c\n", + "\n", + "\n", + "\n", + "\n", + "7\n", + "\n", + "\n", + "dtype\n", + "\n", + "torch.bfloat16\n", + "\n", + "\n", + "\n", + "6:c--7:c\n", + "\n", + "\n", + "\n", + "\n", + "6:c--7:c\n", + "\n", + "\n", + "\n", + "\n", + "10\n", + "\n", + "\n", + "Config:\n", + " ModelCheckpoint\n", + "\n", + "\n", + "save_last\n", + "\n", + "True\n", + "\n", + "\n", + "save_top_k\n", + "\n", + "10\n", + "\n", + "\n", + "every_n_train_steps\n", + "\n", + "200\n", + "\n", + "\n", + "save_best_model\n", + "\n", + "False\n", + "\n", + "\n", + "filename\n", + "\n", + "'{model_name}--{val_loss:.2f}-{step}-{consumed_samples}'\n", + "\n", + "\n", + "\n", + "9:c--10:c\n", + "\n", + "\n", + "\n", + "\n", + "11\n", + "\n", + "\n", + "Config:\n", + " TensorBoardLogger\n", + "\n", + "\n", + "save_dir\n", + "\n", + "'tb_logs'\n", + "\n", + "\n", + "name\n", + "\n", + "'default'\n", + "\n", + "\n", + "\n", + "9:c--11:c\n", + "\n", + "\n", + "\n", + "\n", + "14\n", + "\n", + "\n", + "Config:\n", + " OptimizerConfig\n", + "\n", + "\n", + "optimizer\n", + "\n", + "'adam'\n", + "\n", + "\n", + "lr\n", + "\n", + "0.0003\n", + "\n", + "\n", + "weight_decay\n", + "\n", + "0.1\n", + "\n", + "\n", + "bf16\n", + "\n", + "True\n", + "\n", + "\n", + "adam_beta1\n", + "\n", + "0.9\n", + "\n", + "\n", + "adam_beta2\n", + "\n", + "0.95\n", + "\n", + "\n", + "adam_eps\n", + "\n", + "1e-05\n", + "\n", + "\n", + "use_distributed_optimizer\n", + "\n", + "True\n", + "\n", + "\n", + "overlap_grad_reduce\n", + "\n", + "True\n", + "\n", + "\n", + "overlap_param_gather\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "13:c--14:c\n", + "\n", + "\n", + "\n", + "\n", + "15\n", + "\n", + "\n", + "Config:\n", + " CosineAnnealingScheduler\n", + "\n", + "\n", + "warmup_steps\n", + "\n", + "2000\n", + "\n", + "\n", + "constant_steps\n", + "\n", + "0\n", + "\n", + "\n", + "min_lr\n", + "\n", + "2.9999999999999997e-05\n", + "\n", + "\n", + "\n", + "13:c--15:c\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + ")]>,\n", + " data=,\n", + " trainer=],\n", + " devices=8,\n", + " gradient_clip_val=1.0,\n", + " limit_test_batches=50,\n", + " limit_val_batches=32,\n", + " log_every_n_steps=10,\n", + " max_steps=1168251,\n", + " num_nodes=1,\n", + " plugins=,\n", + " 
strategy=,\n", + " use_distributed_sampler=False,\n", + " val_check_interval=2000)]>,\n", + " log=,\n", + " tensorboard=,\n", + " wandb=None)]>,\n", + " resume=,\n", + " optim=,\n", + " lr_scheduler=)]>)]>" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)\n", + "\n", + "pretrain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index 89e61a8b917c..0464d85b5480 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -210,6 +210,23 @@ model: image_folder: null image_aspect_ratio: 'square' +energon: + use_energon: False + data: + __module__: megatron.energon + __class__: Metadataset + splits: + # Train dataset, the datasets will be mixed according to their weights + train: + datasets: + - weight: 1.0 + path: null + val: + datasets: + - weight: 1.0 + path: null + + # Nsys profiling options nsys_profile: enabled: False diff --git a/examples/multimodal/multimodal_llm/neva/neva_finetune.py b/examples/multimodal/multimodal_llm/neva/neva_finetune.py index e94308ad89f3..1796a87bac9e 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_finetune.py +++ b/examples/multimodal/multimodal_llm/neva/neva_finetune.py @@ -22,8 +22,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_finetune") def main(cfg) -> None: diff --git a/examples/multimodal/multimodal_llm/neva/neva_peft.py b/examples/multimodal/multimodal_llm/neva/neva_peft.py index 2c0e1bc41ac2..0960dd260ad4 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_peft.py +++ b/examples/multimodal/multimodal_llm/neva/neva_peft.py @@ -23,8 +23,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_peft") def main(cfg) -> None: diff --git a/examples/multimodal/multimodal_llm/neva/neva_pretrain.py b/examples/multimodal/multimodal_llm/neva/neva_pretrain.py index 26e0dc294185..8aae9f2d655a 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_pretrain.py +++ b/examples/multimodal/multimodal_llm/neva/neva_pretrain.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- -import torch.multiprocessing as mp from omegaconf.omegaconf import OmegaConf from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel @@ -22,8 +20,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_config") def main(cfg) -> None: diff --git a/nemo/collections/common/parts/perf_metrics_utils.py b/nemo/collections/common/parts/perf_metrics_utils.py index 41273797e035..1633b1343340 100644 --- a/nemo/collections/common/parts/perf_metrics_utils.py +++ b/nemo/collections/common/parts/perf_metrics_utils.py @@ -2,7 +2,6 @@ import os from typing import List -from tensorboard.backend.event_processing import event_accumulator from nemo.utils import logging @@ -27,6 +26,7 @@ def read_tb_log(path: str, summary_name: str) -> List: Returns: summary_list: list, the values in the read summary list, formatted as a list. """ + from tensorboard.backend.event_processing import event_accumulator files = glob.glob(f"{path}/events*tfevents*") files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 8da00b0edd7f..614af0df400c 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -31,6 +31,11 @@ Baichuan2Config, Baichuan2Config7B, Baichuan2Model, + BaseMambaConfig1_3B, + BaseMambaConfig2_7B, + BaseMambaConfig130M, + BaseMambaConfig370M, + BaseMambaConfig780M, ChatGLM2Config6B, ChatGLM3Config6B, ChatGLMConfig, @@ -46,6 +51,12 @@ GemmaConfig7B, GemmaModel, GPTConfig, + GPTConfig5B, + GPTConfig7B, + GPTConfig20B, + GPTConfig40B, + GPTConfig126M, + GPTConfig175B, GPTModel, Llama2Config7B, Llama2Config13B, @@ -71,12 +82,15 @@ Nemotron4Config340B, NemotronConfig, NemotronModel, + NVIDIAMambaConfig8B, + NVIDIAMambaHybridConfig8B, Qwen2Config, Qwen2Config1P5B, Qwen2Config7B, Qwen2Config72B, Qwen2Config500M, Qwen2Model, + SSMConfig, Starcoder2Config, Starcoder2Config3B, Starcoder2Config7B, @@ -120,6 +134,14 @@ "Nemotron4Config22B", "Nemotron4Config340B", "NemotronConfig", + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", "LlamaConfig", "Llama2Config7B", "Llama2Config13B", diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index d330b42d08c4..847b87131925 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -18,10 +18,10 @@ from pathlib import Path from typing import Any, Callable, Optional, Union +import nemo_run as run import pytorch_lightning as pl from typing_extensions import Annotated -from nemo.collections.llm.utils import Config, task from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging @@ -29,13 +29,13 @@ TokenizerType = Any -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def train( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, tokenizer: 
Optional[TokenizerType] = None, model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None, @@ -87,13 +87,13 @@ def train( return app_state.exp_dir -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def pretrain( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, ) -> Path: """ @@ -135,13 +135,13 @@ def pretrain( ) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def finetune( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, peft: Optional[Union[PEFT, ModelTransform, Callable]] = None, ) -> Path: @@ -186,13 +186,13 @@ def finetune( ) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def validate( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional[TokenizerType] = None, model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None, @@ -311,7 +311,7 @@ def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, json.dump(args_dict, f) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def deploy( nemo_checkpoint: Path = None, model_type: str = "llama", @@ -400,7 +400,7 @@ def deploy( nm.stop() -@task(name="import", namespace="llm") +@run.cli.entrypoint(name="import", namespace="llm") def import_ckpt( model: pl.LightningModule, source: str, @@ -414,7 +414,7 @@ def load_connector_from_trainer_ckpt(path: Path, target: str) -> io.ModelConnect return io.load_context(path).model.exporter(target, path) -@task(name="export", namespace="llm") +@run.cli.entrypoint(name="export", namespace="llm") def export_ckpt( path: Path, target: str, diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 7fa5bd719581..46cab3163368 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -63,6 +63,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + pad_to_max_length: bool = False, ): super().__init__() self.seq_length = seq_length @@ -78,6 +79,7 @@ def __init__( self.rampup_batch_size = rampup_batch_size self.data_sampler = None self.max_train_samples = None + self.pad_to_max_length = pad_to_max_length def setup(self, stage: str): self.data_sampler = MegatronDataSampler( @@ -97,6 +99,7 @@ def train_dataloader(self) -> DataLoader: self._create_dataset( str(self.train_path), max_num_samples=self.max_train_samples, + pad_to_max_length=self.pad_to_max_length, ) ) @@ -105,6 +108,7 @@ def val_dataloader(self) -> DataLoader: 
self._create_dataset( str(self.validation_path), is_test=True, + pad_to_max_length=self.pad_to_max_length, ), ) @@ -114,6 +118,7 @@ def test_dataloader(self) -> DataLoader: str(self.test_path), tokens_to_generate=32, is_test=True, + pad_to_max_length=self.pad_to_max_length, ) ) diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index ccb2d21729ed..534922efe3a3 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import os import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional @@ -34,6 +35,66 @@ from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec +def is_number_tryexcept(s): + """Returns True if string is a number.""" + if s is None: + return False + try: + float(s) + return True + except ValueError: + return False + + +def is_zipped_list(paths): + # ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] + even = paths[::2] + if len(even) == 0: + return False + is_num = list(map(is_number_tryexcept, even)) + if any(is_num): + assert all(is_num), "Got malformed zipped list" + return is_num[0] + + +def validate_dataset_asset_accessibility(paths): + if paths is None: + raise ValueError("Expected path to have a value.") + + if isinstance(paths, tuple) or isinstance(paths, list): + if is_zipped_list(paths): + # remove weights from paths. + paths = paths[1::2] + for p in paths: + validate_dataset_asset_accessibility(p) + return + elif isinstance(paths, dict): + for p in paths.values(): + validate_dataset_asset_accessibility(p) + return + + if not isinstance(paths, str) and not isinstance(paths, Path): + raise ValueError("Expected path to be of string or Path type.") + + path = Path(paths) + suffices = ('.bin', '.idx') + if path.is_dir(): + if not os.access(path, os.R_OK): + raise PermissionError(f"Expected {str(path)} to be readable.") + # Will let the downstream class confirm contents are ok. + return + if path.exists(): + if not os.access(path, os.R_OK): + raise PermissionError(f"Expected {str(path)} to be readable.") + return + for suffix in suffices: + file_path = Path(str(path) + suffix) + if not file_path.exists(): + raise FileNotFoundError(f"Expected {str(file_path)} to exist.") + if not os.access(file_path, os.R_OK): + raise PermissionError(f"Expected {str(file_path)} to be readable.") + + class PreTrainingDataModule(pl.LightningDataModule, IOMixin): """PyTorch Lightning-compatible data module for pre-training GPT-style models.
@@ -100,6 +161,8 @@ def __init__( from megatron.core.datasets.utils import get_blend_from_list + validate_dataset_asset_accessibility(paths) + build_kwargs = {} if isinstance(paths, dict): if split is not None: diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index a2dfa12af69e..3f73d67ec61d 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -53,6 +53,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + pad_to_max_length: bool = False, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -69,6 +70,7 @@ def __init__( num_workers=num_workers, pin_memory=pin_memory, persistent_workers=persistent_workers, + pad_to_max_length=pad_to_max_length, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 81098040191c..aa3615b3ddfd 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -15,6 +15,12 @@ from nemo.collections.llm.gpt.model.baichuan import Baichuan2Config, Baichuan2Config7B, Baichuan2Model from nemo.collections.llm.gpt.model.base import ( GPTConfig, + GPTConfig5B, + GPTConfig7B, + GPTConfig20B, + GPTConfig40B, + GPTConfig126M, + GPTConfig175B, GPTModel, MaskedTokenLossReduction, gpt_data_step, @@ -71,6 +77,16 @@ Qwen2Config500M, Qwen2Model, ) +from nemo.collections.llm.gpt.model.ssm import ( + BaseMambaConfig1_3B, + BaseMambaConfig2_7B, + BaseMambaConfig130M, + BaseMambaConfig370M, + BaseMambaConfig780M, + NVIDIAMambaConfig8B, + NVIDIAMambaHybridConfig8B, + SSMConfig, +) from nemo.collections.llm.gpt.model.starcoder import StarcoderConfig, StarcoderConfig15B, StarcoderModel from nemo.collections.llm.gpt.model.starcoder2 import ( Starcoder2Config, @@ -137,6 +153,14 @@ "Qwen2Config7B", "Qwen2Config72B", "Qwen2Model", + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index b60c0430b8be..19a04a65a026 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -106,7 +106,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> Baichuan2Config: diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index a6b53f4e859d..e0d752bf3411 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -182,6 +182,60 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": ) +@dataclass +class GPTConfig126M(GPTConfig): + seq_length: int = 2048 + num_layers: int = 12 + hidden_size: int = 768 + ffn_hidden_size: int = 3072 + num_attention_heads: int = 12 + + +@dataclass +class GPTConfig5B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 24 + hidden_size: int = 4096 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 32 + + +@dataclass +class GPTConfig7B(GPTConfig): + 
seq_length: int = 2048 + num_layers: int = 32 + hidden_size: int = 4096 + ffn_hidden_size: int = 10880 + num_attention_heads: int = 32 + + +@dataclass +class GPTConfig20B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 44 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_attention_heads: int = 48 + + +@dataclass +class GPTConfig40B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 48 + hidden_size: int = 8192 + ffn_hidden_size: int = 32768 + num_attention_heads: int = 64 + + +@dataclass +class GPTConfig175B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 96 + hidden_size: int = 12288 + ffn_hidden_size: int = 49152 + num_attention_heads: int = 96 + + class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( self, diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 3b6453b2b891..162b42501d11 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -113,7 +113,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> ChatGLMConfig: diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 753d75165197..e28d4409437b 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -134,7 +134,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> GemmaConfig: diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 2c76b2fdd976..59d697f2f6b7 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -251,7 +251,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> LlamaConfig: diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index 73e6a34fd7c2..a6415769112a 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -142,7 +142,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> MistralConfig7B: diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index b0f40a2fc785..bc255ae8fb87 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -59,7 +59,7 @@ class MixtralConfig(GPTConfig): moe_aux_loss_coeff: float = 0.01 moe_expert_capacity_factor: float = 1.0 moe_pad_expert_input_to_capacity: bool = True - moe_router_topk: int = 1 + moe_router_topk: int = 2 
moe_router_pre_softmax: bool = True moe_token_dispatcher_type: str = "alltoall" @@ -104,7 +104,7 @@ class MixtralConfig8x7B(MixtralConfig): @dataclass class MixtralConfig8x22B(MixtralConfig): """ - Config for Mixtral-8x7B model + Config for Mixtral-8x22B model Official announcement: https://mistral.ai/news/mixtral-8x22b/ """ @@ -114,9 +114,6 @@ class MixtralConfig8x22B(MixtralConfig): ffn_hidden_size: int = 16384 max_position_embeddings: int = 4096 seq_length: int = 4096 - # MoE - num_moe_experts: int = 8 - moe_router_topk: int = 2 class MixtralModel(GPTModel): @@ -171,7 +168,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py index 44f10c0bee60..c8a8b5abee4b 100644 --- a/nemo/collections/llm/gpt/model/nemotron.py +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -173,7 +173,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> NemotronConfig: diff --git a/nemo/collections/llm/gpt/model/qwen2.py b/nemo/collections/llm/gpt/model/qwen2.py index 643bdda3ba8d..09ed910bac4c 100644 --- a/nemo/collections/llm/gpt/model/qwen2.py +++ b/nemo/collections/llm/gpt/model/qwen2.py @@ -141,7 +141,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> Qwen2Config: diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py new file mode 100644 index 000000000000..954fa8bfe9f7 --- /dev/null +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -0,0 +1,317 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
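+
+# Usage sketch (illustrative only): the SSM/Mamba configs defined in this module plug
+# into the generic ``GPTModel`` wrapper, exactly as the checkpoint importer below does
+# when it builds its target model, e.g.
+#
+#     from nemo.collections import llm
+#     model = llm.GPTModel(llm.BaseMambaConfig2_7B())
+#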
+ +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Literal, Optional + +import torch + +from nemo.utils import logging + +try: + from megatron.core import parallel_state + from megatron.core.models.mamba import MambaModel as MCoreMambaModel + from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec + + HAVE_MEGATRON_CORE_OR_TE = True + +except (ImportError, ModuleNotFoundError): + logging.warning("The package `megatron.core` was not imported in this environment which is needed for SSMs.") + HAVE_MEGATRON_CORE_OR_TE = False + +from megatron.core.transformer.transformer_config import TransformerConfig +from nemo.collections.llm.gpt.model.base import GPTModel, gpt_data_step +from nemo.lightning import get_vocab_size, io, teardown + + +def ssm_forward_step(model, batch) -> torch.Tensor: + + forward_args = { + "input_ids": batch["tokens"], + "position_ids": batch["position_ids"], + "labels": batch["labels"], + } + forward_args["attention_mask"] = None + return model(**forward_args) + + +@dataclass +class SSMConfig(TransformerConfig, io.IOMixin): + # From megatron.core.models.mamba.mamba_model.MambaModel + fp16_lm_cross_entropy: bool = False + parallel_output: bool = True + share_embeddings_and_output_weights: bool = False + num_layers: int = 2 + mamba_ssm_ngroups: int = 8 + num_attention_heads: int = 1 + hybrid_attention_ratio: float = 0.0 + hybrid_mlp_ratio: float = 0.0 + hybrid_override_pattern: str = None + post_process: bool = True + pre_process: bool = True + seq_length: int = 2048 + # Mamba with no attention has no need for position embeddings, so none is default + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'none' + rotary_percent: float = 1.0 + rotary_base: int = 10000 + seq_len_interpolation_factor: Optional[float] = None + apply_rope_fusion: bool = True + make_vocab_size_divisible_by: int = 128 + gated_linear_unit: bool = False + fp32_residual_connections: bool = True + normalization: str = 'RMSNorm' + add_bias_linear: bool = False + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + layernorm_epsilon: float = 1e-5 + # TODO: Move this to better places? 
+ get_attention_mask_from_fusion: bool = False + + forward_step_fn: Callable = ssm_forward_step + data_step_fn: Callable = gpt_data_step + + def configure_model(self, tokenizer) -> "MCoreMambaModel": + + return MCoreMambaModel( + self, + mamba_stack_spec=mamba_stack_spec, + vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), + max_sequence_length=self.seq_length, + mamba_ssm_ngroups=self.mamba_ssm_ngroups, + hybrid_attention_ratio=self.hybrid_attention_ratio, + hybrid_mlp_ratio=self.hybrid_mlp_ratio, + hybrid_override_pattern=self.hybrid_override_pattern, + position_embedding_type=self.position_embedding_type, + rotary_percent=self.rotary_percent, + rotary_base=self.rotary_base, + seq_len_interpolation_factor=self.seq_len_interpolation_factor, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + ) + + +@io.model_importer(GPTModel, "pytorch") +class PyTorchSSMImporter(io.ModelConnector["GPTModel", GPTModel]): + + def __new__(cls, path: str, model_config=None): + instance = super().__new__(cls, path) + instance.model_config = model_config + return instance + + def init(self) -> GPTModel: + + return GPTModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + + source = torch.load(str(self), map_location='cpu') + if 'model' in source: + source = source['model'] + + class ModelState: + def __init__(self, state_dict): + self._state_dict = state_dict + + def state_dict(self): + return self._state_dict + + source = ModelState(source) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + logging.info(f"Converted SSM model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + + if self.model_config.mapping_type == "base": + mapping = { + 'backbone.embedding.weight': 'embedding.word_embeddings.weight', + 'backbone.layers.*.mixer.A_log': 'decoder.layers.*.mixer.A_log', + 'backbone.layers.*.mixer.D': 'decoder.layers.*.mixer.D', + 'backbone.layers.*.mixer.conv1d.weight': 'decoder.layers.*.mixer.conv1d.weight', + 'backbone.layers.*.mixer.conv1d.bias': 'decoder.layers.*.mixer.conv1d.bias', + 'backbone.layers.*.mixer.in_proj.weight': 'decoder.layers.*.mixer.in_proj.weight', + 'backbone.layers.*.mixer.dt_bias': 'decoder.layers.*.mixer.dt_bias', + 'backbone.layers.*.mixer.out_proj.weight': 'decoder.layers.*.mixer.out_proj.weight', + 'backbone.layers.*.mixer.norm.weight': 'decoder.layers.*.mixer.norm.weight', + 'backbone.layers.*.norm.weight': 'decoder.layers.*.mixer.in_proj.layer_norm_weight', + 'backbone.norm_f.weight': 'decoder.final_norm.weight', + 'lm_head.weight': 'output_layer.weight', + } + elif "nvidia" in self.model_config.mapping_type: + mapping = { + 'embedding.word_embeddings.weight': 'embedding.word_embeddings.weight', + 'decoder.layers.*.mixer.A_log': 'decoder.layers.*.mixer.A_log', + 'decoder.layers.*.mixer.D': 'decoder.layers.*.mixer.D', + 'decoder.layers.*.mixer.conv1d.weight': 'decoder.layers.*.mixer.conv1d.weight', + 'decoder.layers.*.mixer.conv1d.bias': 'decoder.layers.*.mixer.conv1d.bias', + 'decoder.layers.*.mixer.in_proj.weight': 'decoder.layers.*.mixer.in_proj.weight', + 'decoder.layers.*.mixer.dt_bias': 'decoder.layers.*.mixer.dt_bias', + 'decoder.layers.*.mixer.out_proj.weight': 'decoder.layers.*.mixer.out_proj.weight', + 'decoder.layers.*.mixer.norm.weight': 
'decoder.layers.*.mixer.norm.weight', + 'decoder.layers.*.norm.weight': 'decoder.layers.*.mixer.in_proj.layer_norm_weight', + 'decoder.final_norm.weight': 'decoder.final_norm.weight', + 'output_layer.weight': 'output_layer.weight', + } + if "hybrid" in self.model_config.mapping_type: + mapping.update( + { + 'decoder.layers.*.mlp.linear_fc1.layer_norm_weight': 'decoder.layers.*.mlp.linear_fc1.layer_norm_weight', + 'decoder.layers.*.mlp.linear_fc1.weight': 'decoder.layers.*.mlp.linear_fc1.weight', + 'decoder.layers.*.mlp.linear_fc2.weight': 'decoder.layers.*.mlp.linear_fc2.weight', + 'decoder.layers.*.self_attention.linear_proj.weight': 'decoder.layers.*.self_attention.linear_proj.weight', + 'decoder.layers.*.self_attention.linear_qkv.layer_norm_weight': 'decoder.layers.*.self_attention.linear_qkv.layer_norm_weight', + 'decoder.layers.*.self_attention.linear_qkv.weight': 'decoder.layers.*.self_attention.linear_qkv.weight', + } + ) + else: + raise AttributeError(f"mapping type [{self.mapping_type}] not found.") + return io.apply_transforms(source, target, mapping=mapping) + + @property + def tokenizer(self): + from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + + tokenizer = get_nmt_tokenizer( + library=self.model_config.tokenizer_library, + model_name=self.model_config.tokenizer_name, + tokenizer_model=self.model_config.tokenizer_model_path, + use_fast=True, + ) + + return tokenizer + + @property + def config(self) -> SSMConfig: + return self.model_config + + +@dataclass +class BaseMambaConfig130M(SSMConfig): + hybrid_override_pattern: str = "M" * 24 + num_layers: int = 24 + seq_length: int = 2048 + hidden_size: int = 768 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 768 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig370M(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 1024 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 1024 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig780M(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 1536 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 1536 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig1_3B(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 2048 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 2048 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig2_7B(SSMConfig): + hybrid_override_pattern: str = "M" * 64 + num_layers: int = 64 + seq_length: int = 2048 + hidden_size: int = 2560 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 2560 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class NVIDIAMambaConfig8B(SSMConfig): + hybrid_override_pattern: str = "M" 
* 56 + num_layers: int = 56 + seq_length: int = 4096 + hidden_size: int = 4096 + mamba_ssm_ngroups: int = 8 + ffn_hidden_size: int = 4096 + make_vocab_size_divisible_by: int = 128 + tokenizer_library: str = 'megatron' + tokenizer_name: str = "GPTSentencePieceTokenizer" + mapping_type: str = "nvidia-pure" + + +@dataclass +class NVIDIAMambaHybridConfig8B(SSMConfig): + hybrid_override_pattern: str = "M-M-M--M-M*-M-M-M-M--M*-M-M-M-M-M*--M-M-M-M-M*-M--M-M-M-" + num_layers: int = 56 + seq_length: int = 4096 + hidden_size: int = 4096 + mamba_ssm_ngroups: int = 8 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 32 + num_query_groups: int = 8 + make_vocab_size_divisible_by: int = 128 + tokenizer_library: str = 'megatron' + tokenizer_name: str = "GPTSentencePieceTokenizer" + mapping_type: str = "nvidia-hybrid" + + +__all__ = [ + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", +] diff --git a/nemo/collections/llm/gpt/model/starcoder.py b/nemo/collections/llm/gpt/model/starcoder.py index 15deb0ba2191..e7cc3f411710 100644 --- a/nemo/collections/llm/gpt/model/starcoder.py +++ b/nemo/collections/llm/gpt/model/starcoder.py @@ -19,7 +19,6 @@ import torch.nn.functional as F from torch import nn -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown @@ -120,7 +119,9 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": - return AutoTokenizer(str(self)) + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> StarcoderConfig: diff --git a/nemo/collections/llm/gpt/model/starcoder2.py b/nemo/collections/llm/gpt/model/starcoder2.py index c49af006c6f5..57b8d3635ade 100644 --- a/nemo/collections/llm/gpt/model/starcoder2.py +++ b/nemo/collections/llm/gpt/model/starcoder2.py @@ -20,7 +20,6 @@ import torch.nn.functional as F from torch import nn -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown @@ -144,7 +143,9 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": - return AutoTokenizer(str(self)) + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> Starcoder2Config: diff --git a/nemo/collections/llm/recipes/ADD-RECIPE.md b/nemo/collections/llm/recipes/ADD-RECIPE.md new file mode 100644 index 000000000000..c506374e3784 --- /dev/null +++ b/nemo/collections/llm/recipes/ADD-RECIPE.md @@ -0,0 +1,100 @@ +# How to Add a New Recipe + +This guide explains the process of adding a new recipe to the NeMo LLM collection. + +## Step 1: Create a New Python File + +Create a new Python file in the `nemo/collections/llm/recipes/` directory. Name it according to the model and its specific configuration, e.g., `my_new_model_12b.py`. 
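+
+The existing recipes in this directory (for example `llama3_8b.py` and `llama3_70b.py`) share a common file header. Below is a minimal sketch of the typical imports; trim it to whatever your recipe actually uses:
+
+```python
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+
+from nemo import lightning as nl
+from nemo.collections.llm.api import finetune, pretrain
+from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+```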
+ +## Step 2: Define the Model Configuration + +Create a function called `model` to define the model configuration: + +```python +NAME = "my_new_model_12b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + return run.Config(YourModel, config=run.Config(YourModelConfig)) +``` + +## Step 3: Define the Trainer Configuration + +Create a function called `trainer` to set up the trainer: + +```python +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, + # Add other parameters as needed +) -> run.Config[nl.Trainer]: + strategy = run.Config( + nl.MegatronStrategy, + # Define your parallelism strategy here + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + devices=num_gpus_per_node, + num_nodes=num_nodes, + # Add other trainer configurations + ) + return trainer +``` + +## Step 4: Define the Recipe Configuration + +Create a function called `pretrain_recipe` or `finetune_recipe` to define the recipe configuration: + +```python +from nemo.collections.llm import pretrain + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + # Add other parameters as needed +) -> run.Config[nl.PretrainRecipe]: + return run.Config( + nl.PretrainRecipe, + model=model(), + trainer=trainer(), + # Add other recipe configurations + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) +``` + +```python +from nemo.collections.llm import finetune + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + # Add other parameters as needed +) -> run.Config[nl.FinetuneRecipe]: + return run.Config( + nl.FinetuneRecipe, + model=model(), + trainer=trainer(), + # Add other recipe configurations + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) +``` + + +## Step 5: Import the recipe in the __init__.py file + +Import the recipe in the [__init__.py](__init__.py) file in the same directory: + +```python +from .my_new_model_12b import pretrain_recipe, finetune_recipe +``` + + +## Step 6: Add tests for the recipe + +Add tests for the recipe in the [tests](../../../../tests/collections/llm/recipes) directory. You can use [test_llama3_8b.py](../../../../tests/collections/llm/recipes/test_llama3_8b.py) as an example. diff --git a/nemo/collections/llm/recipes/README.md b/nemo/collections/llm/recipes/README.md new file mode 100644 index 000000000000..a3cf715acffb --- /dev/null +++ b/nemo/collections/llm/recipes/README.md @@ -0,0 +1,46 @@ +# NeMo LLM Recipes + +This directory contains recipes for pre-training and fine-tuning large language models (LLMs) using NeMo. + +A recipe in NeMo is a Python file that defines a complete configuration for training or fine-tuning an LLM. Each recipe typically includes: + +1. Model configuration: Defines the architecture and hyperparameters of the LLM. +2. Training configuration: Specifies settings for the PyTorch Lightning Trainer, including distributed training strategies. +3. Data configuration: Sets up the data pipeline, including batch sizes and sequence lengths. +4. Optimization configuration: Defines the optimizer and learning rate schedule. +5. Logging and checkpointing configuration: Specifies how to save model checkpoints and log training progress. 
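+
+Concretely, a recipe's factory function assembles these five pieces into a single `run.Partial` object. The following is a condensed sketch, loosely based on the `llama3_70b` recipe in this directory; the `model()` and `trainer()` factories are assumed to be defined earlier in the same recipe module:
+
+```python
+import nemo_run as run
+
+from nemo.collections.llm.api import pretrain
+from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.recipes.log.default import default_log, default_resume
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+
+
+@run.cli.factory(target=pretrain, name="my_recipe")
+def pretrain_recipe(dir=None, name="default", num_nodes=1, num_gpus_per_node=8) -> run.Partial:
+    return run.Partial(
+        pretrain,
+        model=model(),  # 1. model configuration
+        trainer=trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node),  # 2. trainer configuration
+        data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),  # 3. data
+        optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),  # 4. optimization
+        log=default_log(dir=dir, name=name),  # 5. logging and checkpointing
+        resume=default_resume(),
+    )
+```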
+ +Recipes are designed to be modular and extensible, allowing users to easily customize settings for their specific use cases. + +## Usage + +### Command Line Interface + +You can use these recipes via the NeMo CLI: + +```bash +nemorun llm <task> --factory <recipe_name> +``` +Where: +- `<task>` is either `pretrain` or `finetune` +- `<recipe_name>` is the name of the recipe (e.g. `llama3_8b`) + +For example: +```bash +nemorun llm pretrain --factory llama3_8b +``` + + +### Customizing Parameters + +You can override any parameter in the recipe: + +```bash +nemorun llm pretrain --factory llama3_8b trainer.max_steps=2000 +``` + +For more details on running recipes, see [pre-train](../../../../examples/llm/pretrain/README.md). + +## Adding a New Recipe + +See [ADD-RECIPE.md](ADD-RECIPE.md) for instructions on how to add a new recipe. \ No newline at end of file diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 950ca6db7ac6..ec44d1c19864 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -1,3 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from nemo.collections.llm.recipes import ( llama3_8b, llama3_8b_16k, diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index cbf6b5e2e7a1..96c94fd6eeba 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -1,8 +1,23 @@ -from typing import Callable, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch -from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -13,32 +28,77 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from nemo.collections.llm.utils import Config, Partial from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b" -def model() -> Config[pl.LightningModule]: - return Config(LlamaModel, config=Config(Llama3Config70B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(LlamaModel, config=run.Config(Llama3Config70B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, + tensor_parallelism: int = 4, + pipeline_parallelism: int = 4, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 5, + context_parallelism: int = 2, + sequence_parallelism: bool = True, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model. + + This function sets up the distributed training strategy optimized for the large 70B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -51,7 +111,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -62,7 +122,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -71,42 +131,89 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b + $ nemo llm pretrain --factory "llama3_70b(num_nodes=4, name='my_70b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_pretrain", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model and requires significant computational resources. + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=2, - sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) +@run.cli.factory(target=pretrain, name=NAME + "_performance") def pretrain_recipe_performance( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - """'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default - due to being model specific or lacking sufficent support. For better compatibility please use - the default 'pretrain_recipe()' above.""" - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3 70B model. 
+ + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( - Config( + run.Config( MegatronCommOverlapCallback, tp_comm_overlap=True, tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, @@ -118,18 +225,66 @@ def pretrain_recipe_performance( return recipe -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Llama3 70B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/meta-llama/Meta-Llama-3-70B + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Llama3 70B model. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-70B"), + restore_config=run.Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-70B"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning of the large model. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b + $ nemo llm finetune --factory "llama3_70b(num_nodes=4, name='my_70b_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_finetune", num_nodes=4) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. 
Be aware that fine-tuning a 70B model + requires substantial computational resources. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index 87826661606f..3798088ff722 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_70b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration with 16k sequence length. - model = llama3_70b.model() - model.config.seq_length = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model with 16k sequence length. - trainer = llama3_70b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=4, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_70b.model() + model_config.config.seq_length = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model with 16k sequence length. 
+ This function sets up the distributed training strategy optimized for the large 70B model with longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_70b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = llama3_70b.model() - model.config.seq_length = 16384 + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b_16k ... - trainer = llama3_70b.trainer( + Python API usage: + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size and longer sequence length efficiently. + """ + return llama3_70b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b_16k + $ nemo llm pretrain --factory "llama3_70b_16k(num_nodes=4, name='my_70b_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_16k_pretrain", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model with longer sequences (16k). + It requires significant computational resources. + """ + recipe = llama3_70b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. 
+ num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b_16k + $ nemo llm finetune --factory "llama3_70b_16k(num_nodes=4, name='my_70b_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_16k_finetune", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning the large 70B model with longer sequences (16k). + It uses the SQuAD dataset adapted for 16k sequence length. Be aware that this configuration + requires substantial computational resources. + """ + recipe = llama3_70b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index 5185e6b2ec45..353bdd659947 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -1,72 +1,179 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_70b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration with 64k sequence length. - model = llama3_70b.model() - model.config.seq_length = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model with 64k sequence length. - trainer = llama3_70b.trainer( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=8, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b_64k ... 
- data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_70b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model with 64k sequence length. + This function sets up the distributed training strategy optimized for the large 70B model with long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_70b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = llama3_70b.model() - model.config.seq_length = 65536 + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b_64k ... - trainer = llama3_70b.trainer( - tensor_parallelism=2, + Python API usage: + >>> trainer_config = trainer(num_nodes=32, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size and long sequence length efficiently. + It requires a significant amount of computational resources. + """ + return llama3_70b.trainer( + tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=5, - context_parallelism=2, + context_parallelism=8, sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b_64k + $ nemo llm pretrain --factory "llama3_70b_64k(num_nodes=32, name='my_70b_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_64k_pretrain", num_nodes=32) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model with long sequences (64k). + It requires extensive computational resources due to the model size and extended sequence length. 
+ """ + recipe = llama3_70b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b_64k + $ nemo llm finetune --factory "llama3_70b_64k(num_nodes=32, name='my_70b_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_64k_finetune", num_nodes=32) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning the large 70B model with long sequences (64k). + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration + requires extensive computational resources due to the model size and extended sequence length. + """ + recipe = llama3_70b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 17d4e8b168b3..8b2ea2969273 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -1,5 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
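A minimal usage sketch for the llama3_70b_64k factories above, assuming run.Config fields can be overridden as attributes in the same way the recipes themselves do; the commented-out executor call is an assumption about nemo_run's launch API and is not part of this change:

import nemo_run as run
from nemo.collections.llm.recipes import llama3_70b_64k

# Build the 64k pre-training recipe, then shrink it for a quick smoke test.
recipe = llama3_70b_64k.pretrain_recipe(name="llama3_70b_64k_smoke", num_nodes=32)
recipe.trainer.max_steps = 10           # override a field on the nested run.Config
recipe.data.global_batch_size = 128     # MockDataModule configured by the recipe
print(recipe)
# run.run(recipe, executor=run.LocalExecutor())  # assumed nemo_run entry point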
+ + from typing import Callable, Optional +import nemo_run as run import pytorch_lightning as pl import torch from pytorch_lightning.callbacks.callback import Callback @@ -12,31 +28,77 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b" -def model() -> Config[pl.LightningModule]: - return Config(LlamaModel, config=Config(Llama3Config8B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(LlamaModel, config=run.Config(Llama3Config8B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
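A small sketch of how the trainer() keywords documented above map onto the Megatron strategy built just below, assuming run.Config exposes its constructor arguments as attributes (the override pattern used throughout this diff):

from nemo.collections.llm.recipes import llama3_8b

# The parallelism keywords become fields of the nested MegatronStrategy config.
trainer_cfg = llama3_8b.trainer(tensor_parallelism=2, context_parallelism=1, num_nodes=1)
assert trainer_cfg.strategy.tensor_model_parallel_size == 2
assert trainer_cfg.strategy.context_parallel_size == 1
print(trainer_cfg.num_nodes, trainer_cfg.devices)  # 1 node, 8 GPUs per node by default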
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -49,7 +111,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -60,7 +122,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -69,42 +131,93 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b + $ nemo llm pretrain --factory "llama3_8b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=1, - pipeline_parallelism=1, - pipeline_parallelism_type=None, - virtual_pipeline_parallelism=None, - context_parallelism=2, - sequence_parallelism=False, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) +@run.cli.factory(target=pretrain, name=NAME + "_optimized") def pretrain_recipe_performance( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - """'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default - due to being model specific or lacking sufficent support. For better compatibility please use - the default 'pretrain_recipe()' above.""" - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3 8B model. 
+ + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory llama3_8b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( - Config( + run.Config( MegatronCommOverlapCallback, tp_comm_overlap=False, ) @@ -112,18 +225,61 @@ def pretrain_recipe_performance( return recipe -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """Configure automatic resumption from a Hugging Face checkpoint. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/meta-llama/Meta-Llama-3-8B + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-8B"), + restore_config=run.Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-8B"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
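A hedged sketch of customizing this LoRA fine-tuning recipe; the `dim` field is grounded in the LoRA configurations used elsewhere in this diff, and the specific values are illustrative only:

from nemo.collections.llm.recipes import llama3_8b

recipe = llama3_8b.finetune_recipe(name="llama3_8b_lora_demo", num_nodes=1)
# The recipe already carries LoRA, SQuAD data and Hugging Face resume; tweak fields as needed.
recipe.peft.dim = 16                   # illustrative adapter rank
recipe.data.global_batch_size = 128    # illustrative batch size for a smaller cluster
print(recipe)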
+ """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 27762777c622..bd02f1975864 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_8b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_8b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration with 16k sequence length. - model = llama3_8b.model() - model.config.seq_length = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model with 16k sequence length. - trainer = llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=2, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_8b.model() + model_config.config.seq_length = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. 
-def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_8b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b_16k ... - model = llama3_8b.model() - model.config.seq_length = 16384 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = llama3_8b.trainer( + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. + """ + return llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,91 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b_16k + $ nemo llm pretrain --factory "llama3_8b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_16k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling longer sequences (16k) compared to the standard 8k version. + """ + recipe = llama3_8b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b_16k + $ nemo llm finetune --factory "llama3_8b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with longer sequences (16k) compared to the standard 8k version. + It uses the SQuAD dataset adapted for 16k sequence length. + """ + recipe = llama3_8b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index 90001c6189a0..e5845e4530ca 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_8b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_8b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration with 64k sequence length. - model = llama3_8b.model() - model.config.seq_length = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model with 64k sequence length. - trainer = llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=4, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b_64k ... 
- data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_8b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model with 64k sequence length. + This function sets up the distributed training strategy optimized for long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_8b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b_64k ... - model = llama3_8b.model() - model.config.seq_length = 65536 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = llama3_8b.trainer( + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + """ + return llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b_64k + $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=2, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard 8k version. + It requires significant computational resources due to the extended sequence length. 
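One quick consistency check worth illustrating (a sketch, assuming attribute access on run.Config as used above): the 64k recipe body below overrides both the model and the data module, and their sequence lengths must agree.

from nemo.collections.llm.recipes import llama3_8b_64k

recipe = llama3_8b_64k.pretrain_recipe(name="seq_len_check")
# model() sets config.seq_length = 65536 and the recipe pairs it with a 65536-token data module.
assert recipe.model.config.seq_length == recipe.data.seq_length == 65536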
+ """ + recipe = llama3_8b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b_64k + $ nemo llm finetune --factory "llama3_8b_64k(num_nodes=2, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_64k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard 8k version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the extended sequence length. + """ + recipe = llama3_8b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/log/__init__.py b/nemo/collections/llm/recipes/log/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/log/__init__.py +++ b/nemo/collections/llm/recipes/log/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index 4d5e9223b535..b59d549726c6 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -1,9 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from typing import Optional +from nemo_run import Config, cli from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from nemo import lightning as nl -from nemo.collections.llm.utils import Config def tensorboard_logger(name: str, save_dir: str = "tb_logs") -> Config[TensorBoardLogger]: @@ -24,15 +39,15 @@ def wandb_logger(project: str, name: str, entity: Optional[str] = None) -> Confi return cfg +@cli.factory(is_target_default=True) def default_log( - ckpt_dir: str, - name: str, + dir: Optional[str] = None, + name: str = "default", tensorboard_logger: Optional[Config[TensorBoardLogger]] = None, wandb_logger: Optional[Config[WandbLogger]] = None, ) -> Config[nl.NeMoLogger]: ckpt = Config( nl.ModelCheckpoint, - save_best_model=False, save_last=True, save_top_k=10, every_n_train_steps=200, @@ -45,13 +60,14 @@ def default_log( name=name, tensorboard=tensorboard_logger, wandb=wandb_logger, - dir=ckpt_dir, + dir=dir, ) -def default_resume() -> Config[nl.AutoResume]: +@cli.factory(is_target_default=True) +def default_resume(resume_if_exists=True, resume_ignore_no_checkpoint=True) -> Config[nl.AutoResume]: return Config( nl.AutoResume, - resume_if_exists=True, - resume_ignore_no_checkpoint=True, + resume_if_exists=resume_if_exists, + resume_ignore_no_checkpoint=resume_ignore_no_checkpoint, ) diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral.py index c504340348fe..902e7623afd2 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral.py @@ -1,61 +1,242 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
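A short sketch of the updated logging and resume helpers from log/default.py above, with a hypothetical results directory; the signatures follow the new `dir` and keyword defaults introduced in this diff:

from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger

# `default_log` now takes `dir` (previously `ckpt_dir`) and a defaulted `name`.
log_cfg = default_log(dir="/results/my_run", name="my_run",
                      tensorboard_logger=tensorboard_logger(name="my_run"))
# Resumption flags are now arguments rather than hard-coded values.
resume_cfg = default_resume(resume_if_exists=True, resume_ignore_no_checkpoint=True)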
+ + +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.peft.api import gpt_lora -from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.utils.exp_manager import TimingCallback NAME = "mistral" -@factory(name=NAME) -def model() -> pl.LightningModule: - return MistralModel(MistralConfig7B()) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mistral 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mistral 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mistral ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MistralModel, config=run.Config(MistralConfig7B)) + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 100, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mistral 7B model. -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + This function sets up the distributed training strategy and other training parameters. - return nl.Trainer( - devices=devices, - max_steps=100, + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mistral ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ) + + trainer = run.Config( + nl.Trainer, accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + use_distributed_sampler=False, + val_check_interval=2000, ) + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mistral 7B model. -@factory(name=NAME + "_hf") -def hf_resume() -> nl.AutoResume: - return nl.AutoResume(restore_config=nl.RestoreConfig(path="hf://mistralai/Mistral-7B-v0.3")) + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: - return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mistral + $ nemo llm pretrain --factory "mistral(num_nodes=2, name='my_mistral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mistral_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=1, + pipeline_parallelism=1, + pipeline_parallelism_type=None, + virtual_pipeline_parallelism=None, + context_parallelism=2, + sequence_parallelism=False, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), ) -@factory(name=NAME, for_task="llm.finetune") -def finetune_recipe() -> Partial: - return Partial( - finetune, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), - peft=gpt_lora, - resume=hf_resume, +@run.cli.factory(name=NAME + "_hf") +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mistral 7B model. 
+ + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mistral-7B-v0.3 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mistral 7B model. + """ + return run.Config( + nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mistral-7B-v0.3") ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mistral 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mistral + $ nemo llm finetune --factory "mistral(num_nodes=2, name='my_mistral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mistral_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) + recipe.resume = hf_resume() + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 209a5926a008..2320c89dfd2c 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -1,7 +1,24 @@ -from typing import Callable, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
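A brief sketch of the Mistral 7B fine-tuning recipe defined above, assuming nested run.Config fields read back as attributes; it only inspects values set by the recipe itself:

from nemo.collections.llm.recipes import mistral

recipe = mistral.finetune_recipe(name="mistral_lora_squad", num_nodes=1)
# finetune_recipe reuses pretrain_recipe, then swaps in LoRA, SQuAD and the HF checkpoint.
print(recipe.resume.restore_config.path)  # hf://mistralai/Mistral-7B-v0.3
print(recipe.data.seq_length)             # 4096, matching the SquadDataModule above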
+ +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -12,31 +29,76 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x22b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x22B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x22B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x22B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x22b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x22B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, - num_nodes: int = 1, + tensor_parallelism: int = 8, + pipeline_parallelism: int = 8, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 7, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, + num_nodes: int = 8, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x22B model. + + This function sets up the distributed training strategy optimized for the large Mixtral 8x22B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x22b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=8, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
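A back-of-the-envelope check of the defaults above (illustrative only; expert parallelism defaults to 1 and is ignored here):

# 8 nodes x 8 GPUs = 64 GPUs in total.
# Model-parallel group size = TP * PP * CP = 8 * 8 * 1 = 64,
# so the default layout holds exactly one model-parallel replica (data-parallel size 1);
# additional nodes must come in multiples of 8 to raise the data-parallel degree.
tp, pp, cp, nodes, gpus_per_node = 8, 8, 1, 8, 8
assert tp * pp * cp == nodes * gpus_per_node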
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -48,9 +110,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +128,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +137,107 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x22B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x22b + $ nemo llm pretrain --factory "mixtral_8x22b(num_nodes=2, name='my_mixtral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=8, - pipeline_parallelism=8, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=7, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[run.Config(TimingCallback)] ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mixtral 8x22B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mixtral-8x22B-v0.1 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mixtral 8x22B model. 
+ """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x22B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x22B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x22B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x22b + $ nemo llm finetune --factory "mixtral_8x22b(num_nodes=2, name='my_mixtral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py index 7dc8170e13e3..14318bea9e5a 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b.py @@ -1,5 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
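A hedged sketch of adjusting the LoRA adapter on the 8x22B fine-tuning recipe above; the rank and module list are illustrative values, not recommendations:

from nemo.collections.llm.recipes import mixtral_8x22b

recipe = mixtral_8x22b.finetune_recipe(name="mixtral_8x22b_lora", num_nodes=8)
# The recipe ships LoRA on linear_qkv/linear_proj with rank 32; adjust like any other field.
recipe.peft.dim = 16
recipe.peft.target_modules = ['linear_qkv']
print(recipe.peft)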
+ + from typing import Callable, Optional +import nemo_run as run import pytorch_lightning as pl import torch from pytorch_lightning.callbacks.callback import Callback @@ -12,31 +28,74 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x3b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x3B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x3B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, + tensor_parallelism: int = 4, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model. + + This function sets up the distributed training strategy optimized for the Mixtral 8x3B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -50,7 +109,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +120,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +129,108 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): Function to use for pre-training (default: nemo.collections.llm.api.pretrain). + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b + $ nemo llm pretrain --factory "mixtral_8x3b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=4, - pipeline_parallelism=1, - pipeline_parallelism_type=None, - virtual_pipeline_parallelism=None, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure the Hugging Face model resuming for Mixtral 8x3B model. + + This function sets up the configuration for resuming training from a Hugging Face model. + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from a Hugging Face model. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory "mixtral_8x3b(resume=hf_resume())" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) + >>> recipe.resume = hf_resume() + >>> print(recipe) + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b + $ nemo llm finetune --factory "mixtral_8x3b(num_nodes=2, name='my_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) + >>> print(recipe) + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) + recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py index dbf27f86415c..287ac331ee65 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py @@ -1,61 +1,82 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
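# --- Editorial example (not part of this patch) ------------------------------------
# A minimal sketch of how the mixtral_8x3b factories above are intended to be used from
# Python. It assumes the recipe module import path shown in this diff and that nemo_run
# exposes run.run / run.LocalExecutor as in its quickstart; the directory, run name and
# override values below are illustrative assumptions, not recommended settings.
from nemo.collections.llm.recipes import mixtral_8x3b

# Default fine-tuning recipe: LoRA on SQuAD, resuming from the Hugging Face checkpoint.
recipe = mixtral_8x3b.finetune_recipe(dir="/results", name="mixtral_8x3b_lora", num_nodes=1)

# The returned run.Partial stays editable before launch.
recipe.trainer.max_steps = 100          # short smoke-test run
recipe.data.global_batch_size = 128     # shrink the batch for a small cluster
recipe.peft.dim = 16                    # smaller LoRA rank

# Launching is optional; uncomment to execute locally with nemo_run.
# import nemo_run as run
# run.run(recipe, executor=run.LocalExecutor())
# ------------------------------------------------------------------------------------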
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback - NAME = "mixtral_8x3b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x3b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration with 16k sequence length. - model = mixtral_8x3b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 16k sequence length. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=2, - pipeline_parallelism=2, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=2, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x3b.model() + model_config.config.seq_length = 16384 + model_config.config.max_position_embeddings = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x3b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b_16k ... - model = mixtral_8x3b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = mixtral_8x3b.trainer( + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. 
+ """ + return mixtral_8x3b.trainer( tensor_parallelism=2, pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, @@ -65,13 +86,91 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b_16k + $ nemo llm pretrain --factory "mixtral_8x3b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_16k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling longer sequences (16k) compared to the standard version. + """ + recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b_16k + $ nemo llm finetune --factory "mixtral_8x3b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with longer sequences (16k) compared to the standard version. + It uses the SQuAD dataset adapted for 16k sequence length. 
+ """ + recipe = mixtral_8x3b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py index b2a7724b35a9..98cf2f4f9e7b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py @@ -1,62 +1,84 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x3b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x3b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration with 64k sequence length. - model = mixtral_8x3b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 64k sequence length. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b_64k ... - data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x3b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 64k sequence length. + This function sets up the distributed training strategy optimized for long sequences. 
-def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x3b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. - model = mixtral_8x3b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=2, - pipeline_parallelism=2, + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b_64k ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=8, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + """ + return mixtral_8x3b.trainer( + tensor_parallelism=4, + pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=8, context_parallelism=4, @@ -64,13 +86,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b_64k + $ nemo llm pretrain --factory "mixtral_8x3b_64k(num_nodes=8, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_64k_pretrain", num_nodes=8) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard version. + It requires significant computational resources due to the extended sequence length. + """ + recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b_64k + $ nemo llm finetune --factory "mixtral_8x3b_64k(num_nodes=8, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_64k_finetune", num_nodes=8) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the extended sequence length. + """ + recipe = mixtral_8x3b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index bacbfcab4e2d..21c9ef572a68 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -1,7 +1,24 @@ -from typing import Callable, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -12,31 +29,73 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x7B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x7B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, - num_nodes: int = 1, + tensor_parallelism: int = 8, + pipeline_parallelism: int = 2, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model. + + This function sets up the distributed training strategy optimized for the Mixtral 8x7B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -48,9 +107,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +125,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +134,107 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b + $ nemo llm pretrain --factory "mixtral_8x7b(num_nodes=2, name='my_mixtral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=8, - pipeline_parallelism=2, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=None, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[run.Config(TimingCallback)] ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mixtral 8x7B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mixtral-8x7B-v0.1 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mixtral 8x7B model. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b + $ nemo llm finetune --factory "mixtral_8x7b(num_nodes=2, name='my_mixtral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 0542f22836d6..4b5fd07a69e9 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -1,76 +1,174 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x7b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration with 16k sequence length. - model = mixtral_8x7b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model with 16k sequence length. - trainer = mixtral_8x7b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b_16k ... 
- data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x7b.model() + model_config.config.seq_length = 16384 + model_config.config.max_position_embeddings = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x7b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = mixtral_8x7b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b_16k ... - trainer = mixtral_8x7b.trainer( + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. + """ + return mixtral_8x7b.trainer( tensor_parallelism=2, - pipeline_parallelism=2, + pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=8, - context_parallelism=1, + context_parallelism=4, sequence_parallelism=True, - expert_parallelism=8, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. 
+ + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b_16k + $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=2) + >>> print(recipe) + """ + recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b_16k + $ nemo llm finetune --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = mixtral_8x7b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 4fb8de98063e..6a1f76961325 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -1,76 +1,180 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
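# --- Editorial example (not part of this patch) ------------------------------------
# The long-context variants above all follow one pattern: reuse the base recipe, then
# swap in a model, trainer and data module that agree on sequence length. As a hedged
# sketch, a hypothetical 32k variant of mixtral_8x7b could be written the same way;
# the parallelism numbers below are assumptions for illustration, not tuned settings.
import nemo_run as run
import torch

from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes import mixtral_8x7b


def pretrain_recipe_32k(num_nodes: int = 4, num_gpus_per_node: int = 8) -> run.Partial:
    recipe = mixtral_8x7b.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node)

    # Keep model, trainer and data module consistent at 32k tokens.
    recipe.model = mixtral_8x7b.model()
    recipe.model.config.seq_length = 32768
    recipe.model.config.max_position_embeddings = 32768

    recipe.trainer = mixtral_8x7b.trainer(
        tensor_parallelism=2,
        pipeline_parallelism=4,
        pipeline_parallelism_type=torch.bfloat16,
        virtual_pipeline_parallelism=8,
        context_parallelism=4,
        sequence_parallelism=True,
        expert_parallelism=1,
        num_nodes=num_nodes,
        num_gpus_per_node=num_gpus_per_node,
    )
    recipe.data = run.Config(MockDataModule, seq_length=32768, global_batch_size=512, micro_batch_size=1)
    return recipe
# ------------------------------------------------------------------------------------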
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x7b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration with 64k sequence length. - model = mixtral_8x7b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model with 64k sequence length. - trainer = mixtral_8x7b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=4, - context_parallelism=8, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b_64k ... - data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x7b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model with 64k sequence length. + This function sets up the distributed training strategy optimized for very long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x7b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = mixtral_8x7b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b_64k ... - trainer = mixtral_8x7b.trainer( - tensor_parallelism=2, + Python API usage: + >>> trainer_config = trainer(num_nodes=16, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + It requires a substantial amount of computational resources. 
+ """ + return mixtral_8x7b.trainer( + tensor_parallelism=4, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, + virtual_pipeline_parallelism=4, + context_parallelism=8, sequence_parallelism=True, expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b_64k + $ nemo llm pretrain --factory "mixtral_8x7b_64k(num_nodes=16, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_64k_pretrain", num_nodes=16) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard version. + It requires extensive computational resources due to the model size and extended sequence length. + """ + recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b_64k + $ nemo llm finetune --factory "mixtral_8x7b_64k(num_nodes=16, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_64k_finetune", num_nodes=16) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the model size and extended sequence length. 
+ """ + recipe = mixtral_8x7b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/optim/__init__.py b/nemo/collections/llm/recipes/optim/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/optim/__init__.py +++ b/nemo/collections/llm/recipes/optim/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index d38bbc09d8e6..77472d8a3755 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -1,11 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import nemo_run as run from megatron.core.optimizer import OptimizerConfig -from nemo.collections.llm.utils import Config from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, OptimizerModule -def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config[OptimizerModule]: - opt_cfg = Config( +@run.cli.factory +def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> run.Config[OptimizerModule]: + opt_cfg = run.Config( OptimizerConfig, optimizer="adam", lr=max_lr, @@ -20,14 +36,14 @@ def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config clip_grad=1.0, ) - sched = Config( + sched = run.Config( CosineAnnealingScheduler, warmup_steps=2000, constant_steps=0, min_lr=0.1 * max_lr, ) - return Config( + return run.Config( MegatronOptimizerModule, config=opt_cfg, lr_scheduler=sched, diff --git a/nemo/collections/llm/recipes/precision/__init__.py b/nemo/collections/llm/recipes/precision/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/precision/__init__.py +++ b/nemo/collections/llm/recipes/precision/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/precision/mixed_precision.py b/nemo/collections/llm/recipes/precision/mixed_precision.py index 6a9cb64404ce..3c0332a0b330 100644 --- a/nemo/collections/llm/recipes/precision/mixed_precision.py +++ b/nemo/collections/llm/recipes/precision/mixed_precision.py @@ -1,11 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import nemo_run as run import torch -from nemo.collections.llm.utils import Config from nemo.lightning.pytorch.plugins.mixed_precision import MegatronMixedPrecision -def bf16_mixed_plugin() -> Config[MegatronMixedPrecision]: - return Config( +@run.cli.factory +def bf16_mixed() -> run.Config[MegatronMixedPrecision]: + return run.Config( MegatronMixedPrecision, precision="bf16-mixed", params_dtype=torch.bfloat16, @@ -15,8 +31,9 @@ def bf16_mixed_plugin() -> Config[MegatronMixedPrecision]: ) -def fp16_mixed_plugin() -> Config[MegatronMixedPrecision]: - return Config( +@run.cli.factory +def fp16_mixed() -> run.Config[MegatronMixedPrecision]: + return run.Config( MegatronMixedPrecision, precision="16-mixed", params_dtype=torch.half, diff --git a/nemo/collections/llm/tokenizer.py b/nemo/collections/llm/tokenizer.py index 77320c4b9c02..ef8cc53db7e5 100644 --- a/nemo/collections/llm/tokenizer.py +++ b/nemo/collections/llm/tokenizer.py @@ -12,12 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
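# --- Editorial example (not part of this patch) ------------------------------------
# Hedged usage sketch for the optimizer and precision factories changed above: both
# return run.Config objects, so they can be dropped into any recipe and their fields
# tuned afterwards. Import paths mirror the modules in this diff; the override values
# are illustrative assumptions only.
from nemo.collections.llm.recipes import mixtral_8x7b
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import fp16_mixed

recipe = mixtral_8x7b.pretrain_recipe(name="mixtral_8x7b_fp16", num_nodes=2)

# Swap the default bf16 plugin for fp16 and lower the peak learning rate.
recipe.trainer.plugins = fp16_mixed()
recipe.optim = distributed_fused_adam_with_cosine_annealing(max_lr=1e-4)
recipe.optim.lr_scheduler.warmup_steps = 500  # scheduler fields stay editable on the Config
# ------------------------------------------------------------------------------------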
-from nemo.lightning.io.artifact import FileArtifact +from nemo.lightning.io.artifact import DirOrStringArtifact, FileArtifact from nemo.lightning.io.mixin import track_io __all__ = [] + +def extract_name(cls): + return str(cls).split('.')[-1].rstrip('>').rstrip("'") + + try: + # Track HF tokenizers + from transformers import AutoTokenizer as HfAutoTokenizer + from transformers.models.llama.tokenization_llama import LlamaTokenizer + from transformers.models.llama.tokenization_llama_fast import LlamaTokenizerFast + + for cls in [HfAutoTokenizer, LlamaTokenizer, LlamaTokenizerFast]: + track_io( + cls, + artifacts=[ + FileArtifact(attr_name, required=False) + for attr_name in ['vocab_file', 'merges_file', 'tokenizer_file', 'name_or_path'] + ], + ) + __all__.append(extract_name(cls)) + from nemo.collections.common.tokenizers import AutoTokenizer track_io( @@ -25,6 +45,7 @@ artifacts=[ FileArtifact("vocab_file", required=False), FileArtifact("merges_file", required=False), + DirOrStringArtifact("pretrained_model_name", required=False), ], ) __all__.append("AutoTokenizer") diff --git a/nemo/collections/llm/tools/auto_configurator/__init__.py b/nemo/collections/llm/tools/auto_configurator/__init__.py new file mode 100644 index 000000000000..5c6bde2c285a --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/__init__.py @@ -0,0 +1,2 @@ +from nemo.collections.llm.tools.auto_configurator.core.calculate_performance import get_results +from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator, generate_configs diff --git a/tests/lightning/fabric/__init__.py b/nemo/collections/llm/tools/auto_configurator/core/__init__.py similarity index 100% rename from tests/lightning/fabric/__init__.py rename to nemo/collections/llm/tools/auto_configurator/core/__init__.py diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py new file mode 100644 index 000000000000..ee1579f6f6e8 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -0,0 +1,367 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import TensorBoardLogger + +from nemo import lightning as nl +from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer +from nemo.collections.llm import PreTrainingDataModule +from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class BaseConfig: + def __init__(self, config=None): + """ + Args: + config (AutoConfigurator): auto configurator runner config. + """ + + self.config = config + + self.model = self.get_model() + self.optim = self.get_optim() + self.trainer = self.get_trainer() + self.data = self.get_data() + self.log = self.get_logger() + self.run = self.get_run_config() + self.tokenizer = self.get_tokenizer(config.tokenizer_type, config.tokenizer_path) + + def get_model(self): + """Function that returns model config. + + Returns: + Config: model config. + """ + + self.config.model.seq_length = self.config.seq_length + + return self.config.model + + def get_optim(self) -> Config[OptimizerConfig]: + """Function that returns optimizer config. + + Returns: + Config[OptimizerConfig]: optimizer config. + """ + optim_params = { + "optimizer": "adam", + "lr": 1e-4, + "min_lr": 1e-5, + "use_distributed_optimizer": True, + "bf16": True, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "overlap_grad_reduce": True, + "overlap_param_gather": True, + "clip_grad": 1.0, + "adam_eps": 1e-5, + } + + optim_config = Config( + OptimizerConfig, + **optim_params, + ) + + sched = Config( + CosineAnnealingScheduler, + warmup_steps=10, + constant_steps=0, + min_lr=optim_config.min_lr, + ) + + return Config( + MegatronOptimizerModule, + config=optim_config, + lr_scheduler=sched, + ) + + def get_trainer(self) -> Config[nl.Trainer]: + """Function that returns config for PTL trainer. + + Returns: + Config[nl.Trainer]: trainer config. + """ + + trainer_config = { + "accelerator": "gpu", + "enable_checkpointing": False, + "use_distributed_sampler": False, + "max_epochs": None, + "log_every_n_steps": 1, + "limit_val_batches": 1, + "limit_test_batches": 1, + "accumulate_grad_batches": 1, + "num_nodes": self.config.num_nodes, + "devices": self.config.num_gpus, + "max_steps": self.config.max_steps_per_run, + "val_check_interval": self.config.max_steps_per_run, + } + + strategy = Config( + nl.MegatronStrategy, + pipeline_dtype=torch.bfloat16, + ) + + return Config( + nl.Trainer, + **trainer_config, + strategy=strategy, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + callbacks=[Config(TimingCallback)], + ) + + def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: + """Function that returns the tokenizer config. + + Args: + tokenizer_type (str): tokenizer type. + tokenizer_path (str): path to the tokenizer. + + Returns: + Config: tokenizer config. + """ + + if tokenizer_type == "sentencepiece": + return Config(SentencePieceTokenizer, model_path=tokenizer_path) + else: + return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) + + def get_data(self) -> Config[PreTrainingDataModule]: + """Function that returns dataset config. + + Returns: + Config[PreTrainingDataModule]: data config. 
+ """ + + # Data config + data_config = { + "paths": self.config.data_paths, + "seq_length": self.config.seq_length, + "global_batch_size": self.config.global_batch_size, + "num_workers": 2, + "index_mapping_dir": None, + } + + # Define the tokenizer + tokenizer = self.get_tokenizer( + self.config.tokenizer_type, + self.config.tokenizer_path, + ) + + return Config( + PreTrainingDataModule, + **data_config, + tokenizer=tokenizer, + ) + + def get_logger(self) -> Config[nl.NeMoLogger]: + """Function that returns the training strategy. + + Returns: + Config[nl.NeMoLogger]: NeMo Logger config. + """ + + # Define TensorBoard Logger + tb_logger = Config(TensorBoardLogger, save_dir="tb_logs") + + ckpt = Config( + nl.ModelCheckpoint, + monitor="reduced_train_loss", + save_last=False, + save_top_k=0, + ) + + return Config( + nl.NeMoLogger, + ckpt=ckpt, + tensorboard=tb_logger, + wandb=None, + dir=self.config.path_to_logs, + ) + + def get_run_config(self) -> dict: + """Function that returns config for cluster job. + + Returns: + dict: cluster job config. + """ + + run_config = { + "name": self.config.model.__class__.__name__, + "time_limit": f"0-00:{self.config.max_minutes_per_run}:00", + } + + return run_config + + +def calculate_model_size( + gpu_count: int, + max_training_days: float, + model_size_in_b: float = None, + tflops_per_gpu: int = 140, + num_tokens_in_b: int = 300, + model_name: str = "gpt3", +) -> float: + """Estimates a model size to be trained given the constraints. If the + model_size is provided, it estimates the time to train it with the given + constraints. + + Example: + output 5B params to train for 7 days with 160 GPUs. + + Args: + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + max_training_days (float): number of days to train the model for. + model_size_in_b (float): number of parameters in the model, if known. + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model. + + Returns: + float: number of parameters to use for training. + """ + + # Model size is not known, must be estimated. + if model_size_in_b is None: + model_size_in_b = _estimate_model_size( + max_training_days=max_training_days, + gpu_count=gpu_count, + tflops_per_gpu=tflops_per_gpu, + num_tokens_in_b=num_tokens_in_b, + model_name=model_name, + ) + # Model size is known, so only time to train estimate is needed. + else: + max_training_days = _estimate_training_time( + model_size_in_b=model_size_in_b, + gpu_count=gpu_count, + tflops_per_gpu=tflops_per_gpu, + num_tokens_in_b=num_tokens_in_b, + model_name=model_name, + ) + + print( + f"You can train a {model_size_in_b}B parameter model in " + f"{max_training_days} days using {gpu_count} GPUs. This result assumes " + f"you are training to {num_tokens_in_b}B tokens, and each GPU achieves " + f"{tflops_per_gpu} TFLOPS." + ) + return model_size_in_b + + +def _estimate_model_size( + max_training_days: float, + gpu_count: int, + tflops_per_gpu: int, + num_tokens_in_b: int, + model_name: str, +) -> float: + """Estimates model size given time and hardware constraints. It's only used if the model size is not provided by the user. + + Args: + max_training_days (float): number of days to train the model for. + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. 
+ model_name (str): name of the model, such as gpt3, t5, mt5... + + Returns: + float: number of parameters to use for training. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. + """ + + model_penalty = 0.87 if model_name == "mt5" else 1.0 + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma", "nemotron"] + try: + if model_name in valid_models: + return round( + model_penalty + * (max_training_days * 3600 * 24 * gpu_count * tflops_per_gpu * 1e12) + / (8 * num_tokens_in_b * 1e9) + / 1e9, + 2, + ) + else: + raise NotImplementedError + except ValueError as err: + print(f"Input values were not valid: {err}") + except ZeroDivisionError as err: + print(f"Cannot divide by zero. This can happen if num_tokens_in_b is zero: {err}") + except NotImplementedError as err: + print(f"Model size estimation is only available for {valid_models}: {err}") + return None + + +def _estimate_training_time( + model_size_in_b: float, + gpu_count: int, + tflops_per_gpu: int, + num_tokens_in_b: int, + model_name: str, +) -> float: + """Estimates training time for a given model size and hardware constraint. To be used when a model size is provided by the user. + + Args: + model_size_in_b (float): number of parameters to use for training. + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model, such as gpt3, t5, mt5... + + Returns: + float: number of days it will take to train the model. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. + """ + + model_penalty = 1.15 if model_name == "mt5" else 1.0 + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma", "nemotron"] + try: + if model_name in valid_models: + return round( + model_penalty + * (model_size_in_b * 1e9 * 8 * num_tokens_in_b * 1e9) + / (3600 * 24 * gpu_count * tflops_per_gpu * 1e12), + 2, + ) + else: + raise NotImplementedError + except ValueError as err: + print(f"Input values were not valid: {err}") + except ZeroDivisionError as err: + print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") + except NotImplementedError as err: + print(f"Training time estimation is only available for {valid_models}: {err}") + return None diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py new file mode 100644 index 000000000000..5b7ac0ebc4d3 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -0,0 +1,334 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
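+"""Collects and ranks Auto Configurator candidate results from their TensorBoard logs.
+
+A minimal usage sketch (the path below is hypothetical; ``base_config`` and ``runner_config``
+stand for the objects produced by the Auto Configurator runner):
+
+    from nemo.collections.llm.tools.auto_configurator.core.calculate_performance import get_results
+
+    get_results(
+        base_config=base_config,       # BaseConfig describing the launched model/data settings
+        train_config=runner_config,    # AutoConfigurator runner config (model type, size, nodes, ...)
+        path_to_save="/results/grid",  # holds one sub-directory with tb_logs per candidate run
+        output_top_n=5,                # print the five fastest configs
+    )
+"""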
+
+import os
+import re
+from typing import Optional
+
+import pandas as pd
+from tensorboard.backend.event_processing import event_accumulator
+
+
+def get_results(
+    base_config=None,
+    train_config=None,
+    path_to_save: str = None,
+    output_top_n: Optional[int] = 10,
+):
+    """Generates performance results.
+
+    Args:
+        base_config (BaseConfig): base configuration of the model that was launched.
+        train_config (AutoConfigurator): auto configurator runner config.
+        path_to_save (str): path where the training logs were saved and where the performance results will be stored.
+        output_top_n (Optional[int]): number of configs to be printed out as best configs.
+    """
+
+    # Define needed variables
+    model_name = train_config.model_type
+    model_size = train_config.model_size_in_b
+    global_batch_size = base_config.data.global_batch_size
+    seq_length = base_config.data.seq_length
+
+    vocab_size = train_config.vocab_size
+    num_nodes = train_config.num_nodes
+    gpus_per_node = train_config.gpus_per_node
+
+    layers = base_config.model.num_layers
+    hs = base_config.model.hidden_size
+    ffn_hs = base_config.model.ffn_hidden_size
+
+    training_logs = path_to_save
+    final_result_logs = path_to_save
+
+    result_columns = [
+        "Model Name",
+        "Model Size",
+        "Seq Length",
+        "TP",
+        "PP",
+        "CP",
+        "EP",
+        "MBS",
+        "Act Ckpt Layers",
+        "Act Ckpt Micro Batches",
+        "Act Ckpt Layers per Pipeline",
+        "Num Layers",
+        "Hidden Size",
+        "FFN Hidden Size",
+        "GBS",
+        "Nodes",
+        "GPUs per Node",
+        "Time per Step",
+        "Samples per Second",
+        "Model TFLOPS / GPU",
+        "Model TFLOPS Aggregate",
+    ]
+    error_columns = [
+        "Model Name",
+        "Model Size",
+        "Seq Length",
+        "TP",
+        "PP",
+        "CP",
+        "EP",
+        "MBS",
+        "Act Ckpt Layers",
+        "Act Ckpt Micro Batches",
+        "Act Ckpt Layers per Pipeline",
+        "Num Layers",
+        "Hidden Size",
+        "FFN Hidden Size",
+        "GBS",
+        "Nodes",
+        "GPUs per Node",
+        "Error Message",
+    ]
+    result = []
+    errors = []
+    dirs = [f.path for f in os.scandir(training_logs) if f.is_dir()]
+
+    for candidate_dir in dirs:
+        logs_dir = os.path.join(training_logs, candidate_dir, "tb_logs/lightning_logs")
+        logs_folder = [f.path for f in os.scandir(logs_dir) if f.is_dir()][0]
+        tp, pp, cp, ep, mbs, act_ckpt, num_mbs_act, act_per_pipe = get_config(candidate_dir)
+
+        for f in os.listdir(logs_folder):
+            if f.endswith("0.txt"):
+                error_file = os.path.join(logs_folder, f)
+                error = find_error(error_file)
+                if error:
+                    errors.append(
+                        [
+                            model_name,
+                            model_size,
+                            seq_length,
+                            tp,
+                            pp,
+                            cp,
+                            ep,
+                            mbs,
+                            act_ckpt,
+                            num_mbs_act,
+                            act_per_pipe,
+                            layers,
+                            hs,
+                            ffn_hs,
+                            global_batch_size,
+                            num_nodes,
+                            gpus_per_node,
+                            error,
+                        ]
+                    )
+
+        files = os.listdir(logs_folder)
+        for f in files:
+            if f.startswith("events"):
+                event_file = os.path.join(logs_folder, f)
+                ea = event_accumulator.EventAccumulator(event_file)
+                ea.Reload()
+                try:
+                    timing_list = ea.Scalars("train_step_timing in s")
+                    if len(timing_list) <= 6:
+                        continue
+                    timing_list = [x.value for x in timing_list[5:]]
+                    avg_global_step_time = round(sum(timing_list) / len(timing_list), 4)
+                    samples_per_s = round(global_batch_size / avg_global_step_time, 2)
+                    m_tflops, m_tflops_gpu = calculate_tflops(
+                        model_name=model_name,
+                        gbs=global_batch_size,
+                        enc_seq_len=seq_length,
+                        dec_seq_len=seq_length,
+                        hs=hs,
+                        ffn_hs=ffn_hs,
+                        layers=layers,
+                        vocab=vocab_size,
+                        nodes=num_nodes,
+                        gpus_per_node=gpus_per_node,
+                        time_per_step=avg_global_step_time,
+                    )
+                    config_name = f"tp{tp}_pp{pp}_cp{cp}_ep{ep}_mbs{mbs}_act_{act_ckpt}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}"
+                    result.append(
+                        [
+                            model_name,
+                            model_size,
+                            seq_length,
+                            tp,
+                            pp,
+                            cp,
+                            ep,
+                            mbs,
+                            act_ckpt,
+                            num_mbs_act,
+                            act_per_pipe,
+                            layers,
+                            hs,
+                            ffn_hs,
+                            global_batch_size,
+                            num_nodes,
+                            gpus_per_node,
+                            avg_global_step_time,
+                            samples_per_s,
+                            m_tflops_gpu,
+                            m_tflops,
+                        ]
+                    )
+                except Exception:
+                    # Skip event files that do not contain the expected timing scalar.
+                    continue
+    result.sort(key=lambda x: x[17])
+    print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:")
+    for i, res in enumerate(result):
+        config_name = f"tp_{res[3]}_pp_{res[4]}_cp_{res[5]}_ep_{res[6]}_mbs_{res[7]}_act_ckpt_{res[8]}_num_mbs_act_{res[9]}_act_per_pipe_{res[10]}"
+        print(f"Config #{i+1}: {config_name} with {res[17]:.4f}s per global step.")
+        if i + 1 == output_top_n:
+            break
+
+    top_config = f"{model_name}_{model_size}b_{num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}"
+    print("\n==================================================")
+    print(f"Optimal config: {top_config} with {result[0][17]:.4f}s per global step.")
+    print("==================================================\n")
+
+    # Save results as a CSV file.
+    os.makedirs(final_result_logs, exist_ok=True)
+    result_df = pd.DataFrame(result, columns=result_columns)
+    result_df.to_csv(os.path.join(final_result_logs, f"final_summary_{num_nodes}nodes.csv"), index=False)
+
+    error_df = pd.DataFrame(errors, columns=error_columns)
+    error_df.to_csv(os.path.join(final_result_logs, f"failed_jobs_{num_nodes}nodes.csv"), index=False)
+
+
+def calculate_tflops(
+    model_name,
+    gbs,
+    enc_seq_len,
+    dec_seq_len,
+    hs,
+    ffn_hs,
+    layers,
+    vocab,
+    nodes,
+    gpus_per_node,
+    time_per_step,
+):
+    """Calculates model and hardware TFLOPS for each model.
+
+    GPT-3 Formula:
+        Model FLOPs = (24 * B * s * h^2 + 4 * B * s^2 * h) * (3 * num_layers) + 6 * B * s * h * V,
+        where B is the global batch size, s the sequence length, h the hidden size and V the vocab size.
+    T5/mT5 Formula:
+        Model FLOPs are accumulated per layer from the encoder/decoder self-attention, cross-attention and MLP terms below.
+    Bert Formula:
+        Model FLOPs = 72 * B * L * s * h^2 * (1 + s / (6 * h) + V / (12 * h * L))
+    """
+
+    if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral"]:
+        # Model FLOPS calculation
+        model_flops = (
+            (24 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs) * (3 * layers)
+            + (6 * gbs * enc_seq_len * hs * vocab)
+        ) / time_per_step
+        model_flops_per_gpu = model_flops / (nodes * gpus_per_node)
+
+        model_tflops = model_flops / 1e12
+        model_tflops_per_gpu = model_flops_per_gpu / 1e12
+
+    elif model_name == "bert":
+        model_flops = (
+            72 * gbs * layers * enc_seq_len * hs * hs * (1 + (enc_seq_len / (6 * hs)) + (vocab / (12 * hs * layers)))
+        ) / time_per_step
+        model_flops_per_gpu = model_flops / (nodes * gpus_per_node)
+        model_tflops = model_flops / 1e12
+        model_tflops_per_gpu = model_flops_per_gpu / 1e12
+
+    elif model_name in ["t5", "mt5"]:
+        # Encoder Layer FLOPS: include self attention + MLP
+        flops_self_attn_enc = 8 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs
+        flops_mlp_enc = 6 * gbs * enc_seq_len * hs * ffn_hs  # geglu needs two gemms for h -> ffn_h
+        flops_enc_layer = flops_self_attn_enc + flops_mlp_enc
+
+        # Decoder Layer FLOPS: include self_attn + cross_attn + MLP
+        flops_self_attn_dec = 8 * gbs * dec_seq_len * hs * hs + 4 * gbs * dec_seq_len * dec_seq_len * hs
+        flops_cross_attn_dec = (
+            4 * gbs * enc_seq_len * hs * hs
+            + 4 * gbs * dec_seq_len * hs * hs
+            + 4 * gbs * enc_seq_len * dec_seq_len * hs
+        )
+        flops_mlp_dec = 6 * gbs * dec_seq_len * hs * ffn_hs  # geglu needs two gemms for h -> ffn_h
+        flops_dec_layer = flops_self_attn_dec + flops_cross_attn_dec + flops_mlp_dec
+
+        # FLOPs of logits layer in the head
+        flops_logits = 2 * gbs * dec_seq_len * hs * vocab
+
+        # FLOPs of fprop
+        flops_fprop = (flops_enc_layer + flops_dec_layer) * (layers // 2) + flops_logits
+
+        # FLOPs of each train step (FLOPs of bprop is
2*fprop) + model_flops = 3 * flops_fprop / time_per_step + model_flops_per_gpu = model_flops / (nodes * gpus_per_node) + model_tflops = model_flops / 1e12 + model_tflops_per_gpu = model_flops_per_gpu / 1e12 + + else: + raise NotImplementedError("Model type not supported.") + return round(model_tflops, 2), round(model_tflops_per_gpu, 2) + + +def find_error(error_file: str, errors: list = ["CUDA out of memory"]): + """Function that finds the error among job output. + + Args: + errors (list): list of "popular" errors. + error_file (str): path to the job output. + + Returns: + str: serror message if job has been failed because of one of listed errors or None if not. + """ + + error = None + with open(error_file, "r") as f: + output = f.read() + for e in errors: + if e in output: + error = e + return error + + +def get_config(run_name: str) -> tuple: + """Function that extract model parallelism parameters + + Args: + run_name (str): name of the run. + + Returns: + tuple: model parallelism parameters. + """ + pattern = r'_(tp|pp|cp|ep|mbs|act_ckpt|num_mbs_act|act_per_pipe)_([^_]+)' + + # Find all matches in the input string + matches = re.findall(pattern, run_name) + + # Convert matches to a dictionary + params = {param: value for param, value in matches} + + return ( + params["tp"], + params["pp"], + params["cp"], + params["ep"], + params["mbs"], + params["act_ckpt"], + params["num_mbs_act"], + params["act_per_pipe"], + ) + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py new file mode 100644 index 000000000000..087bf3c6fb0e --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -0,0 +1,892 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Tuple + +from nemo.collections.llm.tools.auto_configurator.core import utils + + +GPT_BASED_MODELS = [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + + +def generate_grid_search_configs( + base_cfg: dict, + train_cfg: dict, +) -> Tuple[dict, dict]: + """Generates the grid of all possible configurations for the given model, and stores each different configuration in a yaml file. + + Args: + base_cfg (dict): base configuration of the model to be trained. + train_cfg (dict): train configuration of the model to be trained. + + Returns: + dict: base config. + dict: generated configs. + """ + + model_name = train_cfg.model_type + model_size_in_b = train_cfg.model_size_in_b + + # 2 * num_layers is needed because of encoder/decoder architecture. 
+ multiplier = 1 if model_name in GPT_BASED_MODELS else 2 + + seq_length = base_cfg.model.seq_length + num_layers = base_cfg.model.num_layers if model_name in GPT_BASED_MODELS else base_cfg.model.encoder.num_layers + + if model_name in GPT_BASED_MODELS: + act_method = None + else: + act_method = base_cfg.model.encoder.activations_checkpoint_method + + params = _calculate_tp_pp_mbs_grid( + model_size_in_b=model_size_in_b, + num_layers=num_layers, + model_name=model_name, + seq_length=seq_length, + train_cfg=train_cfg, + ) + + max_minutes = train_cfg.max_minutes_per_run + max_steps = train_cfg.max_steps_per_run + num_nodes = train_cfg.num_nodes + + valid_tp_pp_list = [] + for tp in params.tp: + for pp in params.pp: + for cp in params.cp: + for ep in params.ep: + for mbs in params.mbs: + num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices + base_cfg.data.global_batch_size = params.gbs + if model_name in GPT_BASED_MODELS: + att_heads = base_cfg.model.num_attention_heads + num_layers = base_cfg.model.num_layers + else: + att_heads = base_cfg.model.encoder.num_attention_heads + num_layers = base_cfg.model.encoder.num_layers + model_parallelism = (tp * pp * cp * ep) if (cp and ep) else (tp * pp) + mod_gbs = params.gbs % (mbs * num_gpus / model_parallelism) + mod_att_heads = att_heads % tp + mod_layers = (multiplier * num_layers) % pp + mod_cp = cp if cp else 1 + mod_ep = ep if ep else 1 + if ( + mod_gbs == 0 + and mod_att_heads == 0 + and mod_layers == 0 + and (tp, pp, cp, ep) not in valid_tp_pp_list + and (mod_cp // mod_ep == mod_cp or mod_ep // mod_cp == mod_ep) + and params.min_model_parallel <= model_parallelism <= params.max_model_parallel + ): + valid_tp_pp_list.append((tp, pp, cp, ep)) + + # Generate grid search configs. + configs = {} + for tp, pp, cp, ep in valid_tp_pp_list: + ( + virtual_pipelines, + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) = _set_activations_checkpoint_params( + tp, + pp, + cp, + ep, + num_layers, + act_method, + multiplier, + model_size_in_b, + model_name, + ) + for mbs in params.mbs: + kwargs = { + "base_cfg": base_cfg, + "act": None, + "num_mbs_act": None, + "act_per_pipe": None, + "tp": tp, + "pp": pp, + "cp": cp, + "ep": ep, + "virtual_pipelines": virtual_pipelines, + "mbs": mbs, + "max_minutes": max_minutes, + "max_steps": max_steps, + "num_nodes": num_nodes, + "model_name": model_name, + "model_size": model_size_in_b, + } + if act_ckpt_layers[0] is not None: + if act_layers is not None and act_layers != "auto": + act_ckpt_layers = act_layers + for act in act_ckpt_layers: + for num_mbs_act in num_micro_batches_partial_act_ckpt: + for act_per_pipe in act_ckpt_layers_per_pipeline: + kwargs["act"] = act + kwargs["num_mbs_act"] = num_mbs_act + kwargs["act_per_pipe"] = act_per_pipe + new_cfg = utils.modify_cfg(**kwargs) + if new_cfg: # Save candidate cfg. + configs[new_cfg["run"]["name"]] = new_cfg + else: + new_cfg = utils.modify_cfg(**kwargs) + if new_cfg: # Save candidate cfg. + config_name = new_cfg["run"]["name"] + new_cfg.pop("run") + configs[config_name] = new_cfg + + print(f"\nAll candidate configurations created correctly. 
Total number of configs: {len(configs)}.\n") + return base_cfg, configs + + +def _set_activations_checkpoint_params( + tp, pp, cp, ep, num_layers, act_method, multiplier, model_size_in_b, model_name +): + act_multiple = 4 // pp + if act_method == "block": + if 1.0 <= model_size_in_b < 11.3: + act_multiple = 8 // pp + elif 11.3 <= model_size_in_b < 26.0: + act_multiple = 16 // pp + elif 26.0 <= model_size_in_b < 60.0: + act_multiple = 16 // pp + elif 60.0 <= model_size_in_b: + act_multiple = 32 // pp + act_multiple = max(act_multiple, 1) + + virtual_pipelines = None + # Num micro batches with partial act ckpt + min_micro_b = 0 # 0 will not be used, minimum will be set to 1 later in the code. + max_micro_b = pp + interval_micro_b = 1 + # Act ckpt layers per pipeline + min_layers_per_pipe = 0 + max_layers_per_pipe = num_layers + interval_layers_per_pipe = act_multiple + if model_name in GPT_BASED_MODELS and pp > 2: # Interleaved pipeline scheduling. + virtual_pipelines = num_layers // pp # TODO: verify that this is the best value. + act_multiple = 1 + max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1 + interval_micro_b = virtual_pipelines * 8 + max_layers_per_pipe = multiplier * num_layers // pp // virtual_pipelines + 1 + + ( + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) = ([None], [None], [None]) + if act_method == "block": + # Act ckpt num layers + if virtual_pipelines is None: + act_ckpt_layers = range(0, multiplier * num_layers // pp + 1, act_multiple) + else: + act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple) + + if pp > 1 and model_name in GPT_BASED_MODELS: + # Num micro batches with partial act ckpt + num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b)) + if num_micro_batches_partial_act_ckpt[0] == 0: + num_micro_batches_partial_act_ckpt[0] = 1 + + # Act ckpt layers per pipeline + act_ckpt_layers_per_pipeline = range( + min_layers_per_pipe, max_layers_per_pipe + 1, interval_layers_per_pipe + ) + + return ( + virtual_pipelines, + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) + + +@dataclass +class GPT3GridSearch: + """Selects grid search space for TP, PP, CP, EP, MBS parameters for GPT-3 and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. 
+ """ + + model_size_in_b: int + valid_pp: List[int] + seq_length: int + gpu_memory_gb: int + + tp = [1, 2, 4, 8] + pp = [1] + cp = [1] + ep = [1] + mbs = [1, 2, 4, 8] + + gbs: int = 1024 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if seq_length == 2048: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [1, 2, 4, 8] + self.gbs = 2048 + elif model_size_in_b <= 23.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 8 + self.gbs = 2048 + elif model_size_in_b <= 45.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 95: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 130.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 16] + self.mbs = [1, 2, 4, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 195.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 16] + self.mbs = [1, 2, 4] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 395.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 32] + self.mbs = [1, 2, 4] + self.min_model_parallel = 64 + self.max_model_parallel = 512 + self.gbs = 2048 + elif model_size_in_b <= 790.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 100] + self.mbs = [1, 2, 4] + self.min_model_parallel = 128 + self.max_model_parallel = 1024 + self.gbs = 2048 + elif model_size_in_b <= 1100.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 16 <= x <= 130] + self.mbs = [1, 2, 4] + self.min_model_parallel = 256 + self.max_model_parallel = 2048 + self.gbs = 2048 + elif seq_length == 4096: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 128 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 512 + elif model_size_in_b <= 8.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1024 + elif model_size_in_b <= 45.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1, 2] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1024 + elif model_size_in_b <= 95: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 1024 + elif seq_length == 8192: 
+ if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 64 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 128 + elif model_size_in_b <= 8.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 256 + elif model_size_in_b <= 45.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 8] + self.mbs = [1] + self.min_model_parallel = 32 + self.max_model_parallel = 64 + self.gbs = 256 + elif seq_length == 16384: + if model_size_in_b <= 1.0: + self.tp = [2, 4] + self.mbs = [1, 2] + self.gbs = 32 + elif model_size_in_b <= 4.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 8.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 128 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 128 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 128 + elif seq_length == 32768: + if model_size_in_b <= 1.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 16 + elif model_size_in_b <= 4.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 32 + elif model_size_in_b <= 8.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 13.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 23.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 16 + self.max_model_parallel = 32 + self.gbs = 64 + elif gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4, 8] + self.mbs = [1, 2, 4, 8] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.pp = [1, 2] + self.mbs = [1, 2, 4] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [4, 8] + self.pp = [1, 2, 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 23.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 45.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 12] + self.mbs = [1, 2, 4] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 
95: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 16] + self.mbs = [1, 2, 4] + self.min_model_parallel = 16 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 130.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 26] + self.mbs = [1, 2] + self.min_model_parallel = 32 + self.max_model_parallel = 512 + self.gbs = 2048 + elif model_size_in_b <= 195.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 32] + self.mbs = [1, 2] + self.min_model_parallel = 64 + self.max_model_parallel = 1024 + self.gbs = 2048 + elif model_size_in_b <= 395.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 64] + self.mbs = [1, 2] + self.min_model_parallel = 128 + self.max_model_parallel = 2048 + self.gbs = 2048 + elif model_size_in_b <= 790.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 128] + self.mbs = [1, 2] + self.min_model_parallel = 256 + self.max_model_parallel = 4096 + self.gbs = 2048 + elif model_size_in_b <= 1100.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 192] + self.mbs = [1, 2] + self.min_model_parallel = 512 + self.max_model_parallel = 8192 + self.gbs = 2048 + + +@dataclass +class T5GridSearch: + """Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. + """ + + model_size_in_b: int + seq_length: int + gpu_memory_gb: int + valid_pp: List[int] + + tp = [1, 2, 4, 8] + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8, 12, 16] + + gbs: int = 1920 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.mbs = [16, 32, 64, 128] + self.gbs = 2048 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [4, 6, 8, 12, 16, 24, 32, 48] + self.gbs = 1920 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [4, 6, 8, 12, 16, 24, 32] + self.gbs = 1920 + elif model_size_in_b <= 14.5: + self.tp = [4, 8] + self.mbs = [2, 4, 6, 8, 12, 16, 24] + self.gbs = 1920 + elif model_size_in_b <= 25.9: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1920 + elif model_size_in_b <= 43.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 85.5: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 64 + self.gbs = 1920 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 16] + self.mbs = [1, 2, 4, 6] + self.min_model_parallel = 32 + self.max_model_parallel = 128 + self.gbs = 1920 + elif model_size_in_b <= 250: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 32] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 64 + self.max_model_parallel = 256 + self.gbs = 1920 + elif 
gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.mbs = [16, 32, 64, 128] + self.gbs = 2048 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [4, 8, 12, 16, 24, 32, 48] + self.gbs = 1920 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [4, 6, 8, 12, 16, 24] + self.gbs = 1920 + elif model_size_in_b <= 14.5: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [2, 4, 6, 8, 12, 16] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1920 + elif model_size_in_b <= 25.9: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 43.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 85.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 32 + self.max_model_parallel = 64 + self.gbs = 1920 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 32] + self.mbs = [1, 2, 4] + self.min_model_parallel = 64 + self.max_model_parallel = 128 + self.gbs = 1920 + elif model_size_in_b <= 250: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 64] + self.mbs = [1, 2, 4] + self.min_model_parallel = 128 + self.max_model_parallel = 256 + self.gbs = 1920 + + +@dataclass +class BertGridSearch: + """Selects grid search space for TP, PP, MBS parameters for BERT and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. 
+ """ + + model_size_in_b: int + seq_length: int + gpu_memory_gb: int + valid_pp: List[int] + + tp = [1, 2, 4, 8] + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8, 12, 16] + + gbs: int = 1920 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 3, 4, 6] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 25.0: + self.tp = [4, 8] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 4 + self.gbs = 2048 + elif model_size_in_b <= 46.5: + self.tp = [4, 8] + self.pp = [1, 2, 4] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 2048 + elif model_size_in_b <= 87.5: + self.tp = [4, 8] + self.pp = [2, 4, 6, 8] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 165.5: + self.tp = [4, 8] + self.pp = [4, 6, 8, 16] + self.mbs = [2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 250.5: + self.tp = [8] + self.pp = [4, 8, 16, 32] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + elif gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4, 8] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 25.0: + self.tp = [2, 4, 8] + self.pp = [1, 2] + self.mbs = [1, 2, 4] + self.min_model_parallel = 2 + self.max_model_parallel = 16 + self.gbs = 2048 + elif model_size_in_b <= 46.5: + self.tp = [4, 8] + self.pp = [1, 2, 4, 8] + self.mbs = [1, 2, 3] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 87.5: + self.tp = [4, 8] + self.pp = [2, 4, 6, 8] + self.mbs = [1, 2, 3] + self.min_model_parallel = 16 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [4, 6, 8, 16] + self.mbs = [1, 2] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 250.5: + self.tp = [8] + self.pp = [8, 16, 32] + self.mbs = [1, 2] + self.min_model_parallel = 64 + self.max_model_parallel = 512 + self.gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + + +def _calculate_tp_pp_mbs_grid( + model_size_in_b: float, + num_layers: int, + model_name: str, + seq_length: int, + train_cfg: dict, +) -> Tuple[int, int, int]: + """Selects grid search space for TP, PP, MBS parameters for any model, and calls the necessary heuristics function accordingly. + + Args: + model_size_in_b (float): number of parameters in the model. + num_layers (int): number of layers in the model config. 
+ model_name (str): name of the model to be used, such as gpt3, t5, mt5... + seq_length (int): sequence length to use for training. + train_cfg (dict): config of the model that will be launched. + + Returns: + dataclass object with model parallelism parameters. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. + """ + + tp_sizes = train_cfg.tensor_parallel_sizes + pp_sizes = train_cfg.pipeline_parallel_sizes + cp_sizes = train_cfg.context_parallel_sizes + ep_sizes = train_cfg.expert_parallel_sizes + min_model_parallel_size = train_cfg.min_model_parallel_size + max_model_parallel_size = train_cfg.max_model_parallel_size + mbs_sizes = train_cfg.micro_batch_sizes + gbs_size = train_cfg.global_batch_size + gpu_memory_gb = train_cfg.gpu_memory_gb + multiplier = 1 if model_name in GPT_BASED_MODELS else 2 + init_pp = [] if model_name in GPT_BASED_MODELS else [1] + valid_pp = init_pp + [ + multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 + ] # Only divisors of num_layers are possible. + + kwargs = { + "model_size_in_b": model_size_in_b, + "valid_pp": valid_pp, + "seq_length": seq_length, + "gpu_memory_gb": gpu_memory_gb, + } + + if model_name in GPT_BASED_MODELS: + search_class = GPT3GridSearch + elif model_name in ["t5", "mt5"]: + search_class = T5GridSearch + elif model_name == "bert": + search_class = BertGridSearch + else: + raise NotImplementedError("Model name not implemented.") + + params = search_class(**kwargs) + params.init_params() + + # Override the tp, pp, mbs search if indicated in the config params. + if tp_sizes is not None and tp_sizes != "auto": + params.tp = tp_sizes + if pp_sizes is not None and pp_sizes != "auto": + params.pp = pp_sizes + if cp_sizes is not None and cp_sizes != "auto": + params.cp = cp_sizes + if ep_sizes is not None and ep_sizes != "auto": + params.ep = ep_sizes + if mbs_sizes is not None and mbs_sizes != "auto": + params.mbs = mbs_sizes + if gbs_size is not None and gbs_size != "auto": + params.gbs = gbs_size + if min_model_parallel_size is not None and min_model_parallel_size != "auto": + params.min_model_parallel = min_model_parallel_size + if max_model_parallel_size is not None and max_model_parallel_size != "auto": + params.max_model_parallel = max_model_parallel_size + return params diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py new file mode 100644 index 000000000000..3441c7cdbf9b --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -0,0 +1,470 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
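+"""Shared Auto Configurator heuristics: base-config derivation and per-candidate config overrides.
+
+A small sketch of the model-size heuristic (the values below are illustrative only):
+
+    from nemo.collections.llm.tools.auto_configurator.core.utils import ModelSizeParams
+
+    params = ModelSizeParams(model_size_in_b=5.0, vocab_size=51200, seq_length=2048, model_name="gpt3")
+    params.init_params()
+    # params.layers, params.hs, params.att_h and params.lr now hold the derived layer count,
+    # hidden size, attention head count and learning rate for a model of roughly this size.
+"""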
+ +from dataclasses import dataclass + + +GPT_BASED_MODELS = [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + + +@dataclass +class ModelSizeParams: + """Calculates the parameters that affect model_size: hidden size, attention heads, KV channels, and FFN size. It also calculates the learning rate. + + Args: + model_size_in_b (float): number of parameters in the desired model config, in billions. + vocab_size (int): size of the vocabulary to use for training. + seq_length (int): sequence length to be used during training. + model_name (str): name of the model to be trained, i.e. gpt3, t5, mt5... + + Raises: + ValueError: if the model size is larger than the max supported model size. + NotImplementedError: if the model name is not supported. + """ + + model_size_in_b: float + vocab_size: int + seq_length: int + model_name: str + + # Model size params + layers: int = None + hs: int = None + att_h: int = None + ffn: int = None + kv: int = None + lr: float = None + + def init_params(self): + model_name = self.model_name + model_size_in_b = self.model_size_in_b + if model_name in GPT_BASED_MODELS: + if model_size_in_b < 0.25: + self.hs, self.att_h, self.lr = 768, 12, 6e-4 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.lr = 1024, 16, 3e-4 + elif model_size_in_b < 1: + self.hs, self.att_h, self.lr = 1536, 16, 2.5e-4 + elif model_size_in_b < 2: + self.hs, self.att_h, self.lr = 2048, 16, 2e-4 + elif model_size_in_b < 3: + self.hs, self.att_h, self.lr = 2560, 32, 1.6e-4 + elif model_size_in_b < 4.5: + self.hs, self.att_h, self.lr = 3072, 32, 1.4e-4 + elif model_size_in_b < 8: + self.hs, self.att_h, self.lr = 4096, 32, 1.2e-4 + elif model_size_in_b < 15: + self.hs, self.att_h, self.lr = 5120, 40, 1e-4 + elif model_size_in_b < 25: + self.hs, self.att_h, self.lr = 6144, 48, 1e-4 + elif model_size_in_b < 52: + self.hs, self.att_h, self.lr = 8192, 64, 0.8e-4 + elif model_size_in_b < 105: + self.hs, self.att_h, self.lr = 10240, 80, 0.7e-4 + elif model_size_in_b < 205: + self.hs, self.att_h, self.lr = 12288, 96, 0.6e-4 + elif model_size_in_b < 405: + self.hs, self.att_h, self.lr = 20480, 128, 0.5e-4 + elif model_size_in_b < 805: + self.hs, self.att_h, self.lr = 20480, 128, 0.4e-4 + elif model_size_in_b < 1105: + self.hs, self.att_h, self.lr = 25600, 160, 0.3e-4 + else: + raise ValueError("Model_size for GPT-3 must be smaller than 1.1T parameters.") + elif model_name == "t5": + self.kv, self.lr = 64, 1e-4 + if model_size_in_b < 0.1: + self.hs, self.att_h, self.ffn = 512, 6, 1024 + elif model_size_in_b < 0.4: + self.hs, self.att_h, self.ffn = 768, 12, 2048 + elif model_size_in_b < 1: + self.hs, self.att_h, self.ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + self.hs, self.att_h, self.ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + self.hs, self.att_h, self.ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + self.hs, self.att_h, self.ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + self.hs, self.att_h, self.ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + self.hs, self.att_h, self.ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + self.hs, self.att_h, self.ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + self.hs, self.att_h, self.ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for T5 must be smaller than 250B parameters.") + elif model_name == "mt5": + self.kv, self.lr = 64, 1e-4 + if model_size_in_b < 0.25: + self.hs, self.att_h, self.ffn = 512, 6, 1024 + 
elif model_size_in_b < 0.5: + self.hs, self.att_h, self.ffn = 768, 12, 2048 + elif model_size_in_b < 1.2: + self.hs, self.att_h, self.ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + self.hs, self.att_h, self.ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + self.hs, self.att_h, self.ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + self.hs, self.att_h, self.ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + self.hs, self.att_h, self.ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + self.hs, self.att_h, self.ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + self.hs, self.att_h, self.ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + self.hs, self.att_h, self.ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for mT5 must be smaller than 250B parameters.") + elif model_name == "bert": + self.lr = 1e-4 + if model_size_in_b < 0.25: + self.hs, self.att_h, self.lr = 768, 12, 2e-4 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.lr = 1024, 16, 2e-4 + elif model_size_in_b < 1: + self.hs, self.att_h = 1536, 16 + elif model_size_in_b < 2: + self.hs, self.att_h = 2048, 16 + elif model_size_in_b < 3: + self.hs, self.att_h = 2560, 32 + elif model_size_in_b < 4.5: + self.hs, self.att_h = 2560, 32 + elif model_size_in_b < 8: + self.hs, self.att_h = 4096, 32 + elif model_size_in_b < 15: + self.hs, self.att_h = 5120, 40 + elif model_size_in_b <= 25: + self.hs, self.att_h = 6144, 48 + elif model_size_in_b <= 46.5: + self.hs, self.att_h = 7680, 48 + elif model_size_in_b <= 87.5: + self.hs, self.att_h = 9216, 96 + elif model_size_in_b <= 165.5: + self.hs, self.att_h = 9216, 96 + elif model_size_in_b <= 250.5: + self.hs, self.att_h = 12288, 96 + else: + raise ValueError("Model_size for BERT must be smaller than 25B parameters.") + self.ffn = 4 * self.hs + else: + raise NotImplementedError("Model name is not valid.") + + # Try powers of 2 + margin = 0.01 + for attempt in range(0, 10): + for layers in (2**p for p in range(1, 10)): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 16 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(16, 201, 16): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 2 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(2, 201, 2): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. 
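+
+        # The remaining searches below behave the same way as the ones above: they keep
+        # widening the accepted size window and only set self.layers if no earlier pass
+        # has already found a matching layer count.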
+ + # Try multiples of 5 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(5, 201, 5): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try any valid number + margin = 0.01 + for attempt in range(0, 10): + for layers in range(1, 200): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + if not self.layers: + raise Exception("Number of layers not found, config is not possible.") + + +def _calculate_model_size( + vocab_size: int = None, + seq_length: int = None, + hidden_size: int = None, + num_layers: int = None, + ffn_size: int = None, + kv_channels: int = None, + att_heads: int = None, + model_name: str = "gpt3", +): + """Calculates the model size (number of parameters in billions), given the model parameters and name. + + Args: + vocab_size (int): vocabulary size to be used during training. + seq_length (int): input sequence length to be used during training. + hidden_size (int): size of the hidden layers of the model. + num_layers (int): number of layers in the model. + ffn_size (int): FFN size of the model. + kv_channels (int): number of KV channels in the transformer layers. + att_heads (int): number of attention heads in the transformer layers. + model_name (str): name of the model, i.e gpt3, t5, mt5... + + Returns: + float: size of the model in billions of parameters. + + Raises: + NotImplementedError: if the model name is not valid. + """ + + if model_name in GPT_BASED_MODELS: + model_size = ( + 12 + * num_layers + * hidden_size**2 + * (1 + (13 / (12 * hidden_size)) + ((vocab_size + seq_length) / (12 * num_layers * hidden_size))) + / 1e9 + ) + elif model_name in ["t5", "mt5"]: + # 2 L F + 3 L P + H (2 + 4 L F + L (21 + 12 P) + 1 S + 1 V) + proj_size = att_heads * kv_channels + model_size = ( + 2 * num_layers * 1.5 * ffn_size + + 3 * num_layers * proj_size + + hidden_size + * (2 + 4 * num_layers * 1.5 * ffn_size + num_layers * (21 + 12 * proj_size) + seq_length + vocab_size) + ) / 1e9 + elif model_name == "bert": + model_size = ( + num_layers * (ffn_size + hidden_size * (4 * hidden_size + 3 * att_heads + 2 * ffn_size + 6)) + + hidden_size * (vocab_size + seq_length + hidden_size + 5) + ) / 1e9 + + else: + raise NotImplementedError("Model name is not valid.") + + return model_size + + +def generic_base_config(config) -> dict: + """Generates a base config dictionary from a base config python file. + + Args: + config (AutoConfigurator): config object for the Auto Configurator tool. + + Returns: + BaseConfig: base configuration for the model. + AutoConfigurator: config object for the Auto Configurator tool. 
+ """ + + from nemo.collections.llm.tools.auto_configurator.core.base_config import BaseConfig, calculate_model_size + + default_model = False if config.model_size_in_b else True + + model_size_in_b = calculate_model_size( + config.gpu_count, + config.max_training_days, + config.model_size_in_b, + config.tflops_per_gpu, + config.num_tokens_in_b, + config.model_type, + ) + base_cfg = BaseConfig(config) + + if default_model: + params = ModelSizeParams( + model_size_in_b, + config.vocab_size, + config.seq_length, + config.model_type, + ) + params.init_params() + + if config.model_type in GPT_BASED_MODELS: + base_cfg.model.num_layers = params.layers + base_cfg.model.hidden_size = params.hs + base_cfg.model.num_attention_heads = params.att_h + base_cfg.model.kv_channels = params.kv + if not params.ffn: + base_cfg.model.ffn_hidden_size = params.hs * 4 + else: + base_cfg.model.ffn_hidden_size = params.ffn + + config.model_size_in_b = model_size_in_b + + return base_cfg, config + + +def modify_cfg( + base_cfg: dict, + act: int, + num_mbs_act: int, + act_per_pipe: int, + tp: int, + pp: int, + cp: int, + ep: int, + virtual_pipelines: int, + mbs: int, + max_minutes: int, + max_steps: int, + num_nodes: int, + model_name: str, + model_size, +) -> dict: + """Modify the base configuration for the model with the new parameters that are specific to the current model, which the Auto Configurator tool heuristics selected. + + Args: + base_cfg (dict): base configuration for the current model, which will be modified in this function. + act (int): number of activation checkpointing layers to use for the model. + num_mbs_act (int): sets the number of micro-batches where only a partial number of Transformer layers get checkpointed and recomputed within a window of micro-batches. + act_per_pipe (int): sets the number of Transformer layers to skip checkpointing at later pipeline stages. + tp (int): Tensor Parallelism (TP) value to be set for the model. + pp (int): Pipeline Parallelism (PP) value to be set for the model. + cp (int): Context Parallelism (CP) value to be set for the model. + ep (int): Expert Parallelism (EP) value to be set for the model. + virtual_pipelines (int): Virtual Pipelines value to be set for the model. + mbs (int): Micro Batch Size (MBS) value to be set for the model. + max_minutes (int): maximum amount of time to run this model for. + max_steps (int): maximum number of steps to run this model for. + num_nodes (int): number of nodes to use for the training run. + model_name (str): name of the model, i.e. gpt3, t5, mt5... + + Returns: + dict: dictionary containing the updated model configuration parameters. 
+ """ + + if model_name in GPT_BASED_MODELS: + att_heads = base_cfg.model.num_attention_heads + num_layers = base_cfg.model.num_layers + else: + att_heads = base_cfg.model.encoder.num_attention_heads + num_layers = base_cfg.model.encoder.num_layers + + # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) + num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices + gbs = base_cfg.data.global_batch_size + seq_len = base_cfg.model.seq_length + + new_cfg = dict(run=base_cfg.run) + if act is not None: + if model_name in GPT_BASED_MODELS: + new_cfg["activations_checkpoint_num_layers"] = act + else: + new_cfg["encoder"]["activations_checkpoint_num_layers"] = act // 2 + new_cfg["decoder"]["activations_checkpoint_num_layers"] = act // 2 + + if num_mbs_act is not None and model_name in GPT_BASED_MODELS: + new_cfg["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act + + if act_per_pipe is not None and model_name in GPT_BASED_MODELS: + new_cfg["activations_checkpoint_layers_per_pipeline"] = act_per_pipe + + if virtual_pipelines is not None and model_name in GPT_BASED_MODELS: + new_cfg["virtual_pipeline_model_parallel_size"] = virtual_pipelines + + new_cfg["tensor_model_parallel_size"] = tp + new_cfg["pipeline_model_parallel_size"] = pp + new_cfg["micro_batch_size"] = mbs + new_cfg["global_batch_size"] = gbs + + if cp is not None: + new_cfg["context_parallel_size"] = cp + + if ep is not None: + new_cfg["expert_model_parallel_size"] = ep + + mod_gbs = gbs % (mbs * num_gpus / (tp * pp)) + mod_att_heads = att_heads % tp + mod_layers = num_layers % pp + if mod_gbs == 0 and mod_att_heads == 0 and mod_layers == 0: + # Valid config + new_cfg["run"][ + "name" + ] = f"{model_name}_{str(model_size)}b_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" + print( + f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." + ) + return new_cfg + return None diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py new file mode 100644 index 000000000000..0c80c9a21a9e --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -0,0 +1,246 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
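+"""Auto Configurator runner: defines the search space and produces Partial pre-training recipes.
+
+A minimal sketch of the intended flow (the paths are hypothetical, and ``GPTConfig126M`` is
+only an example name: any supported GPT/Llama/Mixtral/Mistral/Gemma/Nemotron model config
+whose class name encodes the parameter count, e.g. ``...126M`` or ``...7B``, can be passed):
+
+    from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator, generate_configs
+    from nemo.collections.llm.utils import Config
+
+    runner = AutoConfigurator(
+        model=Config(GPTConfig126M),
+        num_nodes=4,
+        data_paths=["/data/my_dataset_text_document"],
+        path_to_logs="/results/grid",
+        seq_length=2048,
+    )
+    base_config, candidate_recipes = generate_configs(runner)
+"""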
+ +import copy +import re + +from typing import List, Optional + +from nemo.collections.llm import GPTModel +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs +from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config +from nemo.collections.llm.utils import Config, Partial +from nemo.utils import logging + +SUPPORTED_MODELS = [ + "gpt3", + "llama", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + +SUPPORTED_TOKENIZERS = [ + "autotokenizer", + "sentencepiece", + "huggingface", +] + + +class AutoConfigurator: + """Auto Configurator runner config class.""" + + def __init__( + self, + model: Config = None, + num_nodes: int = None, + data_paths: List = None, + path_to_logs: str = None, + tokenizer_type: Optional[str] = "autotokenizer", + tokenizer_path: Optional[str] = "GPT2BPETokenizer", + gpus_per_node: Optional[int] = 8, + gpu_memory_gb: Optional[int] = 80, + seq_length: Optional[int] = 2048, + global_batch_size: Optional[int] = "auto", + tensor_parallel_sizes: Optional[List[int]] = "auto", + pipeline_parallel_sizes: Optional[List[int]] = "auto", + micro_batch_sizes: Optional[List[int]] = "auto", + context_parallel_sizes: Optional[List[int]] = [1], + expert_parallel_sizes: Optional[List[int]] = [1], + min_model_parallel_size: Optional[int] = "auto", + max_model_parallel_size: Optional[int] = "auto", + num_tokens_in_b: Optional[int] = 300, + tflops_per_gpu: Optional[int] = 140, + max_minutes_per_run: Optional[int] = 30, + max_training_days: Optional[int] = 2, + max_steps_per_run: Optional[int] = 50, + vocab_size: Optional[int] = 51200, + ): + """ + Args: + model_type (Config): model type to be used for training. + num_nodes (int): number of nodes to be used for training. + data_paths (List): list of datafiles to be used for training. + path_to_logs (str): path to the directory where the logs will be stored. + tokenizer_type (Optional[str]): tokenizer type. + tokenizer_path (Optional[str]): path to the tokenizer model. + model_size (Optional[int]): size of model to be trained. + gpus_per_node (Optional[int]): number of GPUs per node to be used. + gpu_memory_gb (Optional[int]): memory per GPU, in GB. Currently 40GB and 80GB A100s/H100s supported. + seq_length (Optional[int]): model sequence length. Available seq_length list for GPT-based models: [2048, 4096, 8192, 16384, 32768]. + global_batch_size (Optional[int]): model global batch size. Set to "auto" if you want auto configurator to find optimal gbs. + tensor_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + pipeline_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + micro_batch_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + context_parallel_sizes (Optional[List[int]]): model context parallel size. A list, such as [1, 2, 4, 8]. + expert_parallel_sizes (Optional[List[int]]): model expert parallel size. A list, such as [1, 2, 4, 8]. + min_model_parallel_size (Optional[int]): set to "auto" to use our recommendation, or a value for the minimum desired parallelism. + max_model_parallel_size (Optional[int]): set to "auto" to use our recommendation, or a value for the maximum desired parallelism. + num_tokens_in_b (Optional[int]): number of tokens in billions in train dataset. 
+ tflops_per_gpu (Optional[int]): estimated tflops per GPU. + max_minutes_per_run (Optional[int]): maximum number of minutes per run for the grid search. + max_training_days (Optional[int]): number of days expected model to be trained. + max_steps_per_run (Optional[int]): maximum number of steps per run for the grid search. + vocab_size (Optional[int]): size of tokenizer vocabulary. + """ + + # Print out the config + config = locals() + config.pop('self') + for key, value in config.items(): + setattr(self, key, value) + logging.info(self._get_message(config)) + + model_type = self._get_model_type(model) + assert model_type in SUPPORTED_MODELS, f"model_type must be set to one of {SUPPORTED_MODELS}." + assert tokenizer_type in SUPPORTED_TOKENIZERS, f"tokenizer_type must be set to one of {SUPPORTED_TOKENIZERS}." + assert num_nodes, "num_nodes value must be specified." + assert data_paths, "training data must be specified." + assert path_to_logs, f"path_to_logs parameter must be specified." + gpu_count = num_nodes * gpus_per_node + assert gpu_count > 0, "num_nodes * gpus_per_node must be an int larger than zero." + assert gpu_memory_gb in ( + 40, + 80, + ), "gpu_memory_gb can only be 40 or 80." + assert max_minutes_per_run >= 10, "max_minutes_per_run must be an int and be at least 10 minutes." + + self.model_type = model_type + self.model_size_in_b = self._get_model_size(model) + self.gpu_count = gpu_count + self.num_gpus = gpus_per_node + + def _get_message(self, config: dict) -> str: + """ + Function that returns runner config line by line. + + Args: + config (dict): runner config. + + Returns: + str: runner config params. + """ + + message = "AutoConfigurator runner config:\n" + for key, value in config.items(): + message += f"{key}: {value}\n" + + return message + + def _get_model_type(self, model: Config) -> str: + """ + Function that returns model type from model class name. + + Args: + models (Config): model object. + + Returns: + str: model type. + """ + + match = re.search(r"\w+\d+[MB]", str(model)) + if match: + model = match.group(0) + + if "GPT" in model: + return "gpt3" + elif "Llama" in model: + return "llama" + elif "Mixtral" in model: + return "mixtral" + elif "Mistral" in model: + return "mistral" + elif "Gemma" in model: + return "gemma" + elif "Nemotron" in model: + return "nemotron" + else: + return None + + def _get_model_size(self, model: Config) -> int: + """ + Function that returns model size from model class name. + + Args: + model (Config): model class name. + + Returns: + int: model size. + """ + match = re.search(r'(\d+)([BM])', str(model)) + if match: + size = int(match.group(1)) + measure = match.group(2) + if measure == 'B': + return size + elif measure == 'M': + return size / 1000 # Convert millions to billions + return None + + +def generate_configs(runner_config: AutoConfigurator = None) -> dict: + """ + Function that returns a dictionary of Partial configs. + + Args: + config (AutoConfigurator): Auto Configurator object. + + Returns: + dict: dictionary of Partial configs. 
+ """ + + # Generate base config for the given model size + base_cfg, train_cfg = generic_base_config(runner_config) + + # Launch grid search for training constraints + base_config, train_configs = generate_grid_search_configs(base_cfg, train_cfg) + + tokenizer = base_config.tokenizer + model = Config(GPTModel, config=base_config.model, tokenizer=tokenizer) + + configs = {} + for name, config in train_configs.items(): + trainer = copy.deepcopy(base_config.trainer) + data = copy.deepcopy(base_config.data) + log = copy.deepcopy(base_config.log) + + # Set data params + data.micro_batch_size = config.get("micro_batch_size") + data.global_batch_size = config.get("global_batch_size") + + # Set strategy params + trainer.strategy.tensor_model_parallel_size = config.get("tensor_model_parallel_size") + trainer.strategy.pipeline_model_parallel_size = config.get("pipeline_model_parallel_size") + trainer.strategy.context_parallel_size = config.get("context_parallel_size") + trainer.strategy.expert_model_parallel_size = config.get("expert_model_parallel_size") + trainer.strategy.virtual_pipeline_model_parallel_size = config.get( + "virtual_pipeline_model_parallel_size", None + ) + if config.get("tensor_model_parallel_size") > 1: + trainer.strategy.sequence_parallel = True + + # Set the directory where to save the logs + configs[name] = Partial( + pretrain, + model=model, + trainer=trainer, + data=data, + optim=base_config.optim, + log=log, + resume=None, + ) + + return base_cfg, configs diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 89f1ab24f0a9..4bd4443e46f5 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -34,6 +34,7 @@ DEFAULT_IM_START_TOKEN["llama_3"] = "<|reserved_special_token_4|>" DEFAULT_IM_END_TOKEN["llama_3"] = "<|reserved_special_token_5|>" + DEFAULT_VID_START_TOKEN = "" DEFAULT_VID_END_TOKEN = "" TIME_TOKEN_TEMPLATE = "" @@ -507,6 +508,7 @@ def dict(self): sep2=DEFAULT_EOS_TOKEN, ) + default_conversation = conv_vicuna_v1 conv_templates = { "default": conv_vicuna_v0, diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 37f57ff21bba..f46b75e7b472 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -40,6 +40,7 @@ DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IMAGE_TOKEN, DEFAULT_LABELS_TOKEN, + DEFAULT_PAD_TOKEN, DEFAULT_VID_END_TOKEN, DEFAULT_VID_START_TOKEN, DEFAULT_VIDEO_TOKEN, @@ -353,8 +354,14 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in if use_plain: assert default_token in conversation[0]['value'] conversation[0]['value'] = default_token - for turn in conversation: - turn["value"] = turn["value"].replace(default_token, replace_token) + if multimodal_cfg["conv_template"] == "interleaved": + # directly replace the default_token in the conversation, + # since we don't use the conversation template + updated_conversation = conversation.replace(default_token, replace_token) + source['conversations'] = updated_conversation + else: + for turn in conversation: + turn["value"] = turn["value"].replace(default_token, replace_token) return sources @@ -791,6 +798,52 @@ def preprocess_v1( ) +def preprocess_interleaved_prompt( + sources: dict, + tokenizer, + cfg, +) -> Dict: + """tokenize the interleaved prompt and mask the text part of the prompt""" + 
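    # Overview of the steps below: the interleaved conversations are tokenized as-is (no
    # conversation template is applied), image patch token ids are zeroed out, and the label
    # tensor masks out image start/end tokens, padding and the zeroed patch positions so the
    # loss is computed on text tokens only; labels are then shifted by one position for
    # next-token prediction.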
conversations = [] + for source in sources: + conversations.append(source['conversations']) + add_extra_token = cfg.get("add_extra_token") + tokens = tokenize( + texts=conversations, + tokenizer=tokenizer, + context_length=cfg.get("context_length"), + add_extra_token=add_extra_token, + ) + + model_type = cfg['model_type'] + image_patch_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] + image_start_token = DEFAULT_IM_START_TOKEN[model_type] + image_end_token = DEFAULT_IM_END_TOKEN[model_type] + DEFAULT_TOKENS = [image_patch_token, image_start_token, image_end_token, DEFAULT_PAD_TOKEN] + img_patch_id, img_start_id, img_end_id, pad_id = get_tokens_ids(tokenizer, DEFAULT_TOKENS) + tokens[tokens == img_patch_id] = 0 # DEFAULT_IMAGE_PATCH_TOKEN + + labels = tokens.clone().detach() + + # Mask labels change for interleaved prompt + labels[labels == img_start_id] = IGNORE_INDEX + labels[labels == img_end_id] = IGNORE_INDEX + labels[labels == 0] = IGNORE_INDEX + labels[labels == pad_id] = IGNORE_INDEX + + if add_extra_token: + tokens = tokens[:, :-1].contiguous() + labels = labels[:, 1:].contiguous() + else: + labels = torch.roll(labels, shifts=-1, dims=-1) + labels[:, -1] = IGNORE_INDEX + + return dict( + tokens=tokens, + labels=labels, + ) + + def preprocess_nvgpt( sources: dict, tokenizer, @@ -1075,6 +1128,29 @@ def preprocess_plain( ) +def preprocess_conversations(self, sources): + if self.conv_template in ["nvgpt", "nv_steerlm"]: + return preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "nv_dpo": + return preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "v1": + return preprocess_v1(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_2": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_3": + return preprocess_llama_3(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "mistral": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg, is_mistral=True) + elif self.conv_template == "yi_34b": + return preprocess_yi_34b(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "plain": + return preprocess_plain(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "interleaved": + return preprocess_interleaved_prompt(sources, self.tokenizer, self.multimodal_cfg) + else: + raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + + class LazySupervisedDataset(Dataset): """Dataset for supervised fine-tuning.""" @@ -1215,57 +1291,7 @@ def expand2square(pil_img, background_color): media_tensors = torch.tensor([]) sources = copy.deepcopy(sources) - if self.conv_template in ["nvgpt", "nv_steerlm"]: - data_dict = preprocess_nvgpt( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "nv_dpo": - data_dict = preprocess_nv_dpo( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "v1": - data_dict = preprocess_v1( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "llama_2": - data_dict = preprocess_llama_2( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "llama_3": - data_dict = preprocess_llama_3( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "mistral": - data_dict = preprocess_llama_2( - sources, - self.tokenizer, - self.multimodal_cfg, - 
is_mistral=True, - ) - elif self.conv_template == "plain": - data_dict = preprocess_plain( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "yi_34b": - data_dict = preprocess_yi_34b( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - else: - raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + data_dict = preprocess_conversations(self, sources) if isinstance(i, int): data_dict = dict(tokens=data_dict["tokens"][0], labels=data_dict["labels"][0]) diff --git a/nemo/collections/multimodal/data/neva/neva_energon_dataset.py b/nemo/collections/multimodal/data/neva/neva_energon_dataset.py new file mode 100644 index 000000000000..a83e616f248f --- /dev/null +++ b/nemo/collections/multimodal/data/neva/neva_energon_dataset.py @@ -0,0 +1,506 @@ +import dataclasses +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import torch +from einops import rearrange +from megatron.energon import ( + Batch, + CaptioningSample, + DefaultTaskEncoder, + InterleavedSample, + SimilarityInterleavedSample, + VQASample, + batch_pad_stack, +) +from PIL import Image + +from nemo.collections.multimodal.data.neva.neva_dataset import ( + DEFAULT_IMAGE_TOKEN, + preprocess_conversations, + preprocess_interleaved_prompt, + preprocess_llama_2, + preprocess_llama_3, + preprocess_multimodal, + preprocess_nv_dpo, + preprocess_nvgpt, + preprocess_plain, + preprocess_v1, + preprocess_yi_34b, + process_image, +) + + +# Type for intermediate batch, after batch() +@dataclass +class ImageTaskSample: + __key__: str + __subflavor__: str + conversations: List[dict] + image: Optional[Union[str, List[str], torch.Tensor]] = None + video: Optional[Union[str, List[str]]] = None + + tokens: Optional[torch.Tensor] = None + labels: Optional[torch.Tensor] = None + attention_mask: Optional[torch.Tensor] = None + loss_mask: Optional[torch.Tensor] = None + position_ids: Optional[torch.Tensor] = None + + +# Typing for the resulting batch data after encode_batch() +@dataclass +class ImageTaskBatch(Batch): + tokens: torch.Tensor + labels: torch.Tensor + attention_mask: torch.Tensor + loss_mask: torch.Tensor + position_ids: torch.Tensor + media: Optional[torch.Tensor] = None + + +# Required for energon, https://nvidia.github.io/Megatron-Energon/task_encoders.html +class TaskEncoder(DefaultTaskEncoder[VQASample, InterleavedSample, ImageTaskBatch, dict]): + """A task encoder for data samples for captioning, pretraining, sft and interleaved multimodal tasks. + It defines how the data is processed after it is loaded from the dataset. + Currently, it supports captioning, pretraining, sft and interleaved multimodal tasks and datasets. 
+    """
+
+    def __init__(self, tokenizer, image_processor, multimodal_cfg: dict, data_cfg: dict):
+        super().__init__(batch_type=ImageTaskBatch)
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        self.multimodal_cfg = multimodal_cfg
+        self.data_cfg = data_cfg
+        self.conv_template = multimodal_cfg["conv_template"]
+        self.max_num_images = 6
+        self.image_following_text_only = False
+        self.caption_prompts = [
+            "Generate a short caption of the image.",
+            "Describe the image concisely.",
+            "Provide a brief description of the given image.",
+        ]
+        self.prompt_index = 0
+
+    def encode_sample(
+        self,
+        sample: Union[ImageTaskSample, CaptioningSample, VQASample, InterleavedSample, SimilarityInterleavedSample],
+    ) -> dict:
+        if isinstance(sample, InterleavedSample):
+            return self.encode_interleaved(sample)
+        elif isinstance(sample, VQASample):
+            return self.encode_pretrain(sample)
+        elif isinstance(sample, CaptioningSample):
+            return self.encode_captioning(sample)
+        elif isinstance(sample, SimilarityInterleavedSample) and self.conv_template == "interleaved":
+            return self.encode_similarity_interleaved(sample)
+        else:
+            return self.encode_sft(sample)
+
+    def encode_captioning(self, sample: CaptioningSample) -> dict:
+        """Preprocessing function for datasets like COCO, containing image-caption pairs.
+        See Energon codebase for more details on CaptioningSample.
+        https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/captioning.py
+        """
+        processed_image = self.process_images(sample.image)
+
+        prompt = f"<image>\n{self.caption_prompts[self.prompt_index]}\n"
+        self.prompt_index = (self.prompt_index + 1) % len(self.caption_prompts)
+
+        caption = sample.caption.strip()
+
+        conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": caption}]
+
+        processed_sample = {"conversations": conversation, "image": processed_image}
+
+        if self.multimodal_cfg['is_multimodal']:
+            cur_token_len = self.calculate_token_length(processed_sample["image"])
+            processed_sample = preprocess_multimodal(
+                [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain")
+            )[0]
+
+        processed = preprocess_conversations(self, [processed_sample])
+        tokens = processed["tokens"]
+        labels = processed["labels"]
+        attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels)
+
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __subflavor__=sample.__subflavor__,
+            conversations=conversation,
+            image=processed_sample["image"],
+            tokens=tokens.squeeze(0),
+            labels=labels.squeeze(0),
+            attention_mask=attention_mask.squeeze(0),
+            loss_mask=loss_mask.squeeze(0),
+            position_ids=position_ids,
+        )
+
+    def encode_pretrain(self, sample: VQASample) -> dict:
+        """Preprocessing function for datasets like LLaVA-Pretrain, multimodal conversations synthesized from image-caption pairs.
+        See Energon codebase for more details on VQASample.
+ https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/vqa.py + """ + conversations = [{"from": "human", "value": sample.context}, {"from": "gpt", "value": sample.answers}] + processed_sample = {"conversations": conversations} + + if self.multimodal_cfg['is_multimodal']: + if hasattr(sample, 'image') and sample.image is not None: + processed_sample["image"] = self.process_images(sample.image) + cur_token_len = self.calculate_token_length(processed_sample["image"]) + processed_sample = preprocess_multimodal( + [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain") + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + tokens = processed["tokens"] + labels = processed["labels"] + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=conversations, + image=processed_sample.get("image"), + video=processed_sample.get("video"), + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def encode_sft(self, sample: Union[ImageTaskSample, VQASample, InterleavedSample]) -> dict: + """Preprocessing function for datasets like LLaVA-Instruct, conversational multimodal instruction-following data. + See Energon codebase for more details on VQASample. + https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/vqa.py + """ + conversations = sample.texts if hasattr(sample, 'texts') else sample.conversations + processed_sample = {"conversations": conversations} + image_present = False + + if self.multimodal_cfg['is_multimodal']: + image_present = False + if hasattr(sample, 'image') and sample.image is not None: + processed_sample["image"] = self.process_images(sample.image) + image_present = True + elif hasattr(sample, 'images') and sample.images: + processed_sample["image"] = self.process_images(sample.images[0]) + image_present = True + elif hasattr(sample, 'video') and sample.video: + # Implement video processing if needed + pass + + if image_present: + processed_sample = preprocess_multimodal( + [processed_sample], + self.multimodal_cfg, + self.calculate_token_length(processed_sample["image"]), + use_plain=(self.conv_template == "plain"), + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + tokens = processed["tokens"] + labels = processed["labels"] + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + if not image_present: + processed_sample["image"] = torch.zeros( + 1, 3, self.multimodal_cfg["crop_size"][0], self.multimodal_cfg["crop_size"][1] + ) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=conversations, + # rewrite image so it creates tensor of zeros if not present + image=processed_sample.get("image", torch.tensor([])), + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def encode_similarity_interleaved(self, sample: SimilarityInterleavedSample) -> dict: + """Preprocessing function for datasets like MMC4, where text and images are interleaved via a similarity matrix or matched_text_indices. + See Energon codebase for more details on SimilarityInterleavedSample. 
+        https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/similarity_interleaved.py
+        """
+        # 4 fields: sample.images, sample.texts, sample.similarity_matrix, sample.matched_text_index
+        images, sentence_ixs = [], []
+        for sample_image, sim_vec in zip(sample.images, sample.matched_text_indices):
+            images.append(sample_image)
+            sentence_ixs.append(sim_vec)
+
+        # constrain max num images
+        max_num_images = self.max_num_images
+        if len(images) > max_num_images:
+            images = images[:max_num_images]
+            sentence_ixs = sentence_ixs[:max_num_images]
+
+        images = [images[i] for i in np.argsort(sentence_ixs)]
+
+        for ix in sentence_ixs:
+            sample.texts[ix] = f"{DEFAULT_IMAGE_TOKEN} {sample.texts[ix]}"
+
+        if self.image_following_text_only:
+            # use pad token to divide sentence pieces
+            text = self.tokenizer.pad_id.join(sample.texts)
+        else:
+            text = " ".join(sample.texts)
+
+        # strip spaces adjacent to image tokens
+        text = text.replace("<image> ", "<image>").replace(" <image>", "<image>")
+        text = f"{text}{self.tokenizer.eos_id}"
+
+        if len(images) > 0:
+            processed_images = self.process_images(images)
+        else:
+            processed_images = None
+
+        # check the case where the last token is the image token.
+        if text.endswith(DEFAULT_IMAGE_TOKEN):
+            text = text[: -len(DEFAULT_IMAGE_TOKEN)]
+
+        n_im_patch = text.count(DEFAULT_IMAGE_TOKEN)
+        processed_images = processed_images[:n_im_patch]
+        assert len(processed_images) == n_im_patch
+
+        processed_sample = {"conversations": text, "image": processed_images}
+
+        if self.multimodal_cfg['is_multimodal']:
+            if images:
+                cur_token_len = self.calculate_token_length(processed_sample["image"])
+                processed_sample = preprocess_multimodal(
+                    [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain")
+                )[0]
+
+        processed = preprocess_conversations(self, [processed_sample])
+
+        tokens = processed["tokens"]
+        labels = processed["labels"]
+        attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels)
+
+        # pad images
+        if images:
+            processed_sample["image"] = self.pad_images(processed_sample["image"], self.max_num_images)
+        else:
+            # add extra dummy images
+            processed_sample["image"] = torch.zeros(
+                self.max_num_images, 3, self.multimodal_cfg["crop_size"][0], self.multimodal_cfg["crop_size"][1]
+            )
+
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __subflavor__=sample.__subflavor__,
+            conversations=processed_sample["conversations"],
+            image=processed_sample["image"],
+            tokens=tokens.squeeze(0),
+            labels=labels.squeeze(0),
+            attention_mask=attention_mask.squeeze(0),
+            loss_mask=loss_mask.squeeze(0),
+            position_ids=position_ids,
+        )
+
+    def encode_interleaved(self, sample: InterleavedSample) -> dict:
+        """Preprocessing function for datasets like OBELISC, where text and images are strictly interleaved.
+        See Energon codebase for more details on InterleavedSample.
+ https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/interleaved.py + """ + interleaved_text = [] + images = [] + for item in sample.sequence: + if isinstance(item, str): + interleaved_text.append(item) + elif isinstance(item, torch.Tensor) or isinstance(item, Image.Image): + interleaved_text.append(DEFAULT_IMAGE_TOKEN) + images.append(item) + else: + raise ValueError(f"Unsupported type in interleaved sequence: {type(item)}") + + # constrain max num images + max_num_images = self.max_num_images + + n_im_patch = interleaved_text.count(DEFAULT_IMAGE_TOKEN) + if n_im_patch > max_num_images: + interleaved_text, kept_image_indices = self.remove_excess_image_tokens(interleaved_text, max_num_images) + images = [images[i] for i in kept_image_indices] + + if len(images) > max_num_images: + images = images[:max_num_images] + + if len(images) > 0: + processed_images = self.process_images(images) + else: + processed_images = None + + combined_text = ' '.join(interleaved_text) + + processed_sample = {"conversations": combined_text, "image": processed_images} + + if self.multimodal_cfg['is_multimodal']: + if images: + cur_token_len = self.calculate_token_length(processed_sample["image"]) + processed_sample = preprocess_multimodal( + [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain") + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + + tokens = processed["tokens"] + labels = processed["labels"] + + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + # pad images + if images: + processed_sample["image"] = self.pad_images(processed_sample["image"], self.max_num_images) + else: + processed_sample["image"] = torch.zeros( + self.max_num_images, 3, self.multimodal_cfg["crop_size"][0], self.multimodal_cfg["crop_size"][1] + ) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=processed_sample["conversations"], + image=processed_sample["image"], + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def remove_excess_image_tokens(self, interleaved_text, max_num_images): + if interleaved_text[-1] == DEFAULT_IMAGE_TOKEN: + interleaved_text = interleaved_text[:-1] + + image_indices = [i for i, token in enumerate(interleaved_text) if token == DEFAULT_IMAGE_TOKEN] + + if len(image_indices) <= max_num_images: + return interleaved_text, list(range(len(image_indices))) + + # we keep the images that are close to the text tokens + importance = [] + for i, img_idx in enumerate(image_indices): + has_text_before = img_idx > 0 and interleaved_text[img_idx - 1] != DEFAULT_IMAGE_TOKEN + has_text_after = ( + img_idx < len(interleaved_text) - 1 and interleaved_text[img_idx + 1] != DEFAULT_IMAGE_TOKEN + ) + + if has_text_before and has_text_after: + importance.append((0, img_idx)) # highest importance + elif has_text_before or has_text_after: + importance.append((1, img_idx)) + else: + importance.append((2, img_idx)) + + importance.sort(key=lambda x: (x[0], x[1])) + kept_indices = {idx for _, idx in importance[:max_num_images]} + + # update idx to map correctly to the original images array + kept_image_indices = [image_indices.index(i) for i in kept_indices if i in image_indices] + + new_interleaved_text = [ + token for i, token in enumerate(interleaved_text) if token != DEFAULT_IMAGE_TOKEN or i in kept_indices + ] + 
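        # Returned values: the interleaved text with excess image tokens dropped, plus the
        # positions (indices into the original images list) of the images that were kept.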
+ return new_interleaved_text, kept_image_indices + + def process_images(self, images): + if not isinstance(images, list): + images = [images] + processed_images = [] + for image in images: + image = process_image(self.image_processor, image, self.multimodal_cfg['image_aspect_ratio']) + processed_images.append(image) + return torch.stack(processed_images) # make it always 4D, otherwise has problem when len(images) > 1 + + def pad_images(self, images, max_num_images): + if len(images) < max_num_images: + pad_size = max_num_images - len(images) + padded_images = torch.cat([images, torch.zeros(pad_size, *images.size()[1:])], dim=0) + return padded_images + return images + + def batch(self, samples: List[ImageTaskSample]) -> ImageTaskBatch: + """Pads and stacks the samples in the batch.""" + batch = ImageTaskBatch( + tokens=batch_pad_stack([s.tokens for s in samples]), + labels=batch_pad_stack([s.labels for s in samples]), + attention_mask=batch_pad_stack([s.attention_mask for s in samples]), + loss_mask=batch_pad_stack([s.loss_mask for s in samples]), + position_ids=batch_pad_stack([s.position_ids for s in samples]), + media=( + torch.stack([s.image for s in samples if s.image is not None]) + if self.multimodal_cfg['is_multimodal'] + else None + ), + ) + + # TODO: cleanup, this is following logic in neva_dataset when we rearrange media tensor + if batch.media.shape[1] == 1: + batch.media = rearrange(batch.media, "b T c h w -> b T 1 c h w") + else: + batch.media = rearrange(batch.media, "b T c h w -> b T 1 c h w") + + return batch + + def preprocess_conversations(self, sources): + if self.conv_template == "nvgpt": + return preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "nv_dpo": + return preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "v1": + return preprocess_v1(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_2": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_3": + return preprocess_llama_3(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "mistral": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg, is_mistral=True) + elif self.conv_template == "yi_34b": + return preprocess_yi_34b(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "plain": + return preprocess_plain(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "interleaved": + return preprocess_interleaved_prompt(sources, self.tokenizer, self.multimodal_cfg) + else: + raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + + def encode_batch(self, batch: ImageTaskBatch) -> dict: + raw = dataclasses.asdict(batch) + return raw + + def calculate_token_length(self, media_tensor): + if len(media_tensor.shape) == 4: + height = media_tensor.shape[2] + width = media_tensor.shape[3] + else: + raise ValueError("Media tensor must be 4-dimensional") + patch_dim = self.multimodal_cfg['patch_dim'] + height_num_patches = height // patch_dim + width_num_patches = width // patch_dim + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + height_num_patches = (height_num_patches + 1) // 2 * 2 + width_num_patches = (width_num_patches + 1) // 2 * 2 + + return height_num_patches * width_num_patches + + def get_masks_and_position_ids(self, tokens, labels): + from nemo.collections.nlp.modules.common.megatron.utils import 
get_ltor_masks_and_position_ids + + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=self.tokenizer.eos_id, + eod_mask_loss=self.data_cfg.get("eod_mask_loss", False), + reset_attention_mask=False, + reset_position_ids=False, + ) + + loss_mask[labels == -1] = 0.0 + tokens[tokens == -1] = 0 + labels[labels == -1] = 0 + + return attention_mask, loss_mask, position_ids diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 6218332c2bde..07bc4f3960d3 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -21,7 +21,7 @@ import torch import torch.nn.functional as F from einops import rearrange, reduce, repeat -from omegaconf import DictConfig, ListConfig +from omegaconf import DictConfig, ListConfig, OmegaConf from pkg_resources import packaging from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPVisionModel, SiglipVisionModel @@ -69,6 +69,25 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +try: + from megatron.energon import ( + LimitDataset, + RepeatDataset, + WorkerConfig, + get_loader, + get_savable_loader, + get_train_dataset, + get_val_datasets, + ) + + from nemo.collections.multimodal.data.neva.neva_energon_dataset import TaskEncoder + + HAVE_ENERGON = True + +except (ImportError, ModuleNotFoundError): + + HAVE_ENERGON = False + try: from megatron.core import InferenceParams, dist_checkpointing, parallel_state, tensor_parallel from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace @@ -1226,10 +1245,22 @@ def setup(self, stage=None): else: # TODO: consider adding a ModelPT guard to check if model is being restored. # allowing restored models to optionally setup datasets - self.build_train_valid_test_datasets() - self.setup_training_data(self.cfg.data) - self.setup_validation_data(self.cfg.data) - self.setup_test_data(self.cfg.data) + + if self.cfg.get('energon', {}).get('use_energon', False): + if not HAVE_ENERGON: + raise ImportError( + "Megatron-Energon was not found. Please see the Energon README for installation instructions: https://github.com/NVIDIA/Megatron-Energon?tab=readme-ov-file#installation." + ) + assert not self.use_peft, "NeMo does not currently support the combination of Energon and PEFT." + logging.info( + "You are now using an experimental implementation of Megatron-Energon, https://github.com/NVIDIA/Megatron-Energon, for your NeVA dataloader. Further updates to Energon support in NeMo will be done in NeMo 2.0 implementation." 
+ ) + self.build_train_valid_test_datasets_energon() + else: + self.build_train_valid_test_datasets() + self.setup_training_data(self.cfg.data) + self.setup_validation_data(self.cfg.data) + self.setup_test_data(self.cfg.data) # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: @@ -1435,6 +1466,144 @@ def build_pretraining_data_loader( persistent_workers=True if self.cfg.data.num_workers > 0 else False, ) + def datasets_provider(self, worker_config=None): + """Create multimodal train, validation and test datasets.""" + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + micro_batch_size = self.cfg.micro_batch_size + else: + micro_batch_size = self.cfg.global_batch_size // parallel_state.get_data_parallel_world_size() + + dname = OmegaConf.to_container(self.cfg.energon.data, resolve=True) + + image_processor = ( + self.model.module.image_processor if hasattr(self.model, "module") else self.model.image_processor + ) + + add_extra_token = 1 + if getattr(self.cfg, 'no_seqlen_plus_one_input_tokens', False): + add_extra_token = 0 + + multimodal_cfg = dict( + is_multimodal=self.cfg.data.is_multimodal, + sep_image_conv_front=self.cfg.data.sep_image_conv_front, + model_type=self.cfg.mm_cfg.llm.get("model_type", "nvgpt"), + conv_template=self.cfg.data.get("conv_template", "nvgpt"), + patch_dim=self.cfg.mm_cfg.vision_encoder.patch_dim, + crop_size=self.cfg.mm_cfg.vision_encoder.get("crop_size", (336, 336)), + image_folder=self.cfg.data.get('image_folder', None), + video_folder=self.cfg.data.get('video_folder', None), + image_aspect_ratio=self.cfg.data.image_aspect_ratio, + use_im_start_end=getattr(self.cfg.mm_cfg, 'use_im_start_end', False), + image_processor=image_processor, + add_extra_token=add_extra_token, + context_length=self.cfg.encoder_seq_length, + media_type=self.cfg.data.get('media_type', 'image'), + num_frames=self.cfg.data.get('num_frames', -1), + use_lita=getattr(self.cfg.mm_cfg, 'use_lita', False), + lita=getattr(self.cfg.mm_cfg, 'lita', {}), + mm_mlp_adapter_type=self.cfg.mm_cfg.get('mm_mlp_adapter_type', 'linear'), + ) + + data_cfg = dict( + splice_single_frame=self.cfg.data.get('splice_single_frame', None), + num_frames=self.cfg.data.get('num_frames', -1), + sep_token_between_frames=self.cfg.data.get('sep_token_between_frames', False), + ) + + train_dataset = get_train_dataset( + dname, + batch_size=micro_batch_size, + task_encoder=TaskEncoder( + tokenizer=self.tokenizer, + image_processor=image_processor, + multimodal_cfg=multimodal_cfg, + data_cfg=data_cfg, + ), + worker_config=worker_config, + virtual_epoch_length=1000, + max_samples_per_sequence=100, + shuffle_buffer_size=100, + image_decode="pil", + ) + + val_datasets = get_val_datasets( + dname, + batch_size=micro_batch_size, + # This is the total number over all workers + task_encoder=TaskEncoder( + tokenizer=self.tokenizer, + image_processor=image_processor, + multimodal_cfg=multimodal_cfg, + data_cfg=data_cfg, + ), + worker_config=worker_config, + image_decode="pil", + ) + + val_datasets_without_source_datasets = [ + # Limit the dataset to eval_iters * num_microbatches + LimitDataset( + # Repeat the inner dataset in case it's too short + RepeatDataset(val_ds, worker_config=worker_config), + length=self.cfg.micro_batch_size * self.trainer.limit_val_batches, + worker_config=worker_config, + reset_after_epoch=True, + ) + for val_ds, _src_ds in val_datasets + ] + + return train_dataset, 
val_datasets_without_source_datasets, None + + # energon dataset builder + def build_train_valid_test_datasets_energon(self): + """Builds train and validation dataloaders using Megatron-Energon""" + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + data_parallel_group = parallel_state.get_data_parallel_group() + worker_debug_path = None + worker_log_level = 0 + logging.info( + f" Multimodal train dataloader initializing with rank {rank} world_size {world_size} data_parallel_group {data_parallel_group} ****** " + ) + + worker_config = WorkerConfig( + rank=rank, + world_size=world_size, + num_workers=1, + data_parallel_group=data_parallel_group, + worker_debug_path=worker_debug_path, + worker_log_level=worker_log_level, + ) + train_ds, valid_ds1, test_ds = self.datasets_provider(worker_config) + train_dataloader = get_savable_loader(train_ds, worker_config=worker_config) + + # Restore energon train dataloader state if we are resuming training + restore = os.path.exists(self.trainer.ckpt_path) if self.trainer.ckpt_path else False + if restore: + replica_id = ( + parallel_state.get_pipeline_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_context_parallel_rank(), + ) + sharded_state_dict = { + 'dataloader_state': ShardedObject( + data=None, + key='dataloader_state', + global_shape=[parallel_state.get_data_parallel_world_size()], + global_offset=[parallel_state.get_data_parallel_rank()], + replica_id=replica_id, + ) + } + state_dict = dist_checkpointing.load(sharded_state_dict, self.trainer.ckpt_path) + train_dataloader.restore_state_rank(state_dict['dataloader_state']) + logging.info(f"Restored dataset state from {self.trainer.ckpt_path}") + + valid_dataloader = [get_loader(valid_ds, worker_config=worker_config) for valid_ds in valid_ds1] + # valid_dataloader = get_loader(valid_ds1, worker_config=worker_config) + self._train_dl = train_dataloader + self._validation_dl = valid_dataloader + return self._train_dl, self._validation_dl + @classmethod def list_available_models(cls) -> Optional[PretrainedModelInfo]: """ @@ -1512,6 +1681,49 @@ def on_load_checkpoint(self, checkpoint) -> None: self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) parallel_state.set_virtual_pipeline_model_parallel_rank(0) + def on_save_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint + """ + + # Neva supports Megatron Energon dataloader, this requires saving the dataloader state on each data parallel group + def should_save_dataloader_state(): + if self._train_dl is None: + return False + if not hasattr(self._train_dl, "save_state"): + return False + first_rank = ( + parallel_state.is_pipeline_first_stage(ignore_virtual=True) + and parallel_state.get_tensor_model_parallel_rank() == 0 + ) + return first_rank + + def save_dataloader_state(): + train_dataloader_state_dict = self._train_dl.save_state_rank() + checkpoint['dataloader_state'] = ShardedObject( + data=train_dataloader_state_dict, + key='dataloader_state', + global_shape=[parallel_state.get_data_parallel_world_size()], + global_offset=[parallel_state.get_data_parallel_rank()], + ) + + # Save energon train dataloader state if conditions are met + if self.cfg.get('energon', False) and should_save_dataloader_state(): + save_dataloader_state() + + # mcore uses distributed checkpointing + # FSDP supports the lagecy checkpointing 
or torch-FSDP-native sharded checkpointing + if self.mcore_gpt and not self.use_fsdp: + checkpoint['sharded_state_dict'] = self.sharded_state_dict() + + # legacy checkpointing for interleaved + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + def sharded_state_dict(self, prefix: str = ''): if self.use_peft: return None diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index 2c3b30f2fc74..d38de8eb10b9 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -424,24 +424,26 @@ def __init__(self, *args, **kwargs): # TODO (yuya): need to handle post_process correctly in order to enable PP self.output_dim = kwargs.pop('output_dim') super().__init__(*args, **kwargs) - self.final_layernorm = TENorm( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - self.head = torch.nn.Linear( - self.config.hidden_size, - self.output_dim, - bias=False, - ) + if self.post_process: + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=False, + ) def forward(self, x): x = super().forward( x, ) - x = self.final_layernorm(x) - x = x[:, 0] - x = self.head(x) + if self.post_process: + x = self.final_layernorm(x) + x = x[:, 0] + x = self.head(x) return x diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index ea8053398a88..6ba2e8ca91f9 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -149,6 +149,7 @@ def load_nemo_model_weights(nemo_path, sharded_state_dict=None): checkpoint = dist_checkpointing.load( sharded_state_dict=checkpoint, checkpoint_dir=tmp_model_weights_dir, + strict=dist_checkpointing.validation.StrictHandling.LOG_UNEXPECTED, ) state_dict = checkpoint["state_dict"] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py index 54dff1cd7887..afbe85e0edbb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py @@ -67,9 +67,6 @@ def on_validation_epoch_end(self): averaged_loss = torch.tensor(0.0, dtype=torch.float32).cuda() return averaged_loss - def sharded_state_dict(self, prefix: str = ''): - return None - def _reset_activation_checkpointing_args(self): return diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 09f265ed2521..c6b7aac04a55 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -531,6 +531,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = 
preprocess_llama_3(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "mistral": record = { 'conversations': [ @@ -552,6 +553,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg, is_mistral=True) + elif multimodal_cfg["conv_template"] == "v1": record = { 'conversations': [ diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py new file mode 100644 index 000000000000..2aeeae299a7d --- /dev/null +++ b/nemo/collections/vlm/__init__.py @@ -0,0 +1,41 @@ +from nemo.collections.vlm.neva.data import ( + DataConfig, + ImageDataConfig, + ImageToken, + MockDataModule, + MultiModalToken, + NevaLazyDataModule, + VideoDataConfig, + VideoToken, +) +from nemo.collections.vlm.neva.model import ( + CLIPViTConfig, + HFCLIPVisionConfig, + Llava1_5Config7B, + Llava1_5Config13B, + LlavaConfig, + LlavaModel, + MultimodalProjectorConfig, + NevaConfig, + NevaModel, +) + +__all__ = [ + "MockDataModule", + "NevaLazyDataModule", + "DataConfig", + "ImageDataConfig", + "VideoDataConfig", + "MultiModalToken", + "ImageToken", + "VideoToken", + "CLIPViTConfig", + "HFCLIPVisionConfig", + "MultimodalProjectorConfig", + "NevaConfig", + "NevaModel", + "LlavaConfig", + "Llava1_5Config7B", + "Llava1_5Config13B", + "LlavaModel", +] diff --git a/nemo/collections/vlm/neva/__init__.py b/nemo/collections/vlm/neva/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/vlm/neva/data/__init__.py b/nemo/collections/vlm/neva/data/__init__.py new file mode 100644 index 000000000000..bbd502e21c80 --- /dev/null +++ b/nemo/collections/vlm/neva/data/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig, VideoDataConfig +from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule +from nemo.collections.vlm.neva.data.mock import MockDataModule +from nemo.collections.vlm.neva.data.multimodal_tokens import ImageToken, MultiModalToken, VideoToken + +__all__ = [ + "NevaLazyDataModule", + "MockDataModule", + "DataConfig", + "ImageDataConfig", + "VideoDataConfig", + "MultiModalToken", + "ImageToken", + "VideoToken", +] diff --git a/nemo/collections/vlm/neva/data/api.py b/nemo/collections/vlm/neva/data/api.py new file mode 100644 index 000000000000..c2e51e033d8a --- /dev/null +++ b/nemo/collections/vlm/neva/data/api.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytorch_lightning as pl + +from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule +from nemo.collections.vlm.neva.data.mock import MockDataModule + + +def mock() -> pl.LightningDataModule: + return MockDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + + +def lazy() -> pl.LightningDataModule: + return NevaLazyDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + + +__all__ = ["mock", "lazy"] diff --git a/nemo/collections/vlm/neva/data/config.py b/nemo/collections/vlm/neva/data/config.py new file mode 100644 index 000000000000..3b22d5a493b3 --- /dev/null +++ b/nemo/collections/vlm/neva/data/config.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional + +from .multimodal_tokens import ImageToken, MultiModalToken, VideoToken + + +@dataclass +class DataConfig: + media_type: str # currently supported: image or video + media_token: MultiModalToken + conv_template: str = "v1" # check `nemo/collections/multimodal/data/neva/conversation.py` + reset_position_ids: bool = False # Option to reset the position IDs in the dataset at an interval + reset_attention_mask: bool = False # Option to reset the attention mask from the dataset + eod_mask_loss: bool = False # Option to enable the EOD mask loss + + +@dataclass +class ImageDataConfig(DataConfig): + media_type: str = "image" + media_token: MultiModalToken = ImageToken + image_folder: Optional[str] = None + image_process_mode: str = 'pad' + + +@dataclass +class VideoDataConfig(DataConfig): + media_type: str = "video" + media_token: MultiModalToken = VideoToken + splice_single_frame: Optional[str] = None + # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded. + num_frames: int = 8 # Selects the number of frames to use from the video + sep_token_between_frames: bool = False # TODO: Allow usage of separator tokens between frames + video_folder: Optional[str] = None diff --git a/nemo/collections/vlm/neva/data/conversation.py b/nemo/collections/vlm/neva/data/conversation.py new file mode 100644 index 000000000000..22c435cb1fd2 --- /dev/null +++ b/nemo/collections/vlm/neva/data/conversation.py @@ -0,0 +1,677 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import dataclasses +import re +from collections import defaultdict +from enum import Enum, auto +from io import BytesIO +from typing import Any, List, Optional, Union + +from PIL import Image +from transformers import AutoTokenizer + + +class SeparatorStyle(Enum): + """Different separator style.""" + + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + CHATML = auto() + LLAMA_2 = auto() + LLAMA_3 = auto() + MISTRAL = auto() + NVGPT = auto() + QWEN = auto() + GEMMA = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + + system: Optional[str] + roles: tuple[str, str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + tokenizer_name_or_path: Any = None + stop_str: Union[str, List[str]] = None + stop_token_ids: List[int] = None + + skip_next: bool = False + + def process_prompt_with_images(self, messages): + # Process messages to handle potential image tokens. + return messages + + def process_chat_template(self, tokenizer_name_or_path, messages): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + if self.system is None: + chat = [] + else: + chat = [{"role": "system", "content": self.system}] + for role, message in messages: + chat.append({"role": role.lower(), "content": message}) + ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + return ret + + def get_prompt(self): + messages = self.messages + messages = self.process_prompt_with_images(messages) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.TWO: + """ + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
USER: {{ user_message_1 }} ASSISTANT: {{ model_answer_1 }}USER: {{ user_message_2 }} + """ + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.MISTRAL and self.version == "vila": + """ + [INST] {{ user_message_1 }} [/INST]{{ model_answer_1 }}[INST] {{ user_message_2 }} [/INST] + """ + wrap_sys = lambda msg: f"{msg}" + ("\n" if msg else "") + wrap_inst = lambda msg: f"[INST] {msg} [/INST]" + ret = "" + + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if message: + if type(message) is tuple: + message, _, _ = message + if i == 0: + message = wrap_sys(self.system) + message + if i % 2 == 0: + message = wrap_inst(message) + ret += self.sep + message + else: + ret += message + self.sep2 + else: + ret += "" + + elif self.sep_style == SeparatorStyle.LLAMA_2: + """ + [INST] <> + You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. + <> + + {{ user_message_1 }} [/INST] {{ model_answer_1 }} [INST] {{ user_message_2 }} [/INST] + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Llama-2-7b-chat-hf" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.LLAMA_3: + """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + + {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + {{ model_answer_1 }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_2 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Meta-Llama-3-8B-Instruct" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.NVGPT: + ret = self.sep2 + self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + '\n' + message + '\n' + self.sep + else: + ret += role + '\n' + + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + + elif self.sep_style == SeparatorStyle.MISTRAL: + """ + NOT tested in NeMo! + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "mistralai/Mistral-7B-Instruct-v0.2" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.CHATML: + """ + NOT tested in NeMo! + """ + ret = "" if self.system == "" else self.system + self.sep + "\n" + for role, message in messages: + if message: + if type(message) is tuple: + message, images = message + message = "" * len(images) + message + ret += role + "\n" + message + self.sep + "\n" + else: + ret += role + "\n" + return ret + + elif self.sep_style == SeparatorStyle.MPT: + """ + NOT tested in NeMo! 
+ """ + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + + elif self.sep_style == SeparatorStyle.GEMMA: + """ + NOT tested in NeMo! + """ + ret = "" + for i, (role, message) in enumerate(messages): + assert role == self.roles[i % 2], "Conversation should alternate user/assistant/user/assistant/..." + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def process_image(self, image, image_process_mode, return_pil=False, image_format="PNG"): + if image_process_mode == "Pad": + + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + + if type(image) is not Image.Image: + image = Image.open(image).convert("RGB") + + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 1008, 672 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + return image + else: + buffered = BytesIO() + image.save(buffered, format=image_format) + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + return img_b64_str + + def get_images(self, return_pil=False, return_path=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + if type(image) != list: + image = [image] + for img in image: + if not return_path: + img = self.process_image(img, image_process_mode, return_pil=return_pil) + images.append(img) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + if type(image) != list: + image = [image] + if len(image) == 1: + msg = "\n" + msg.replace("", "").strip() + else: + msg = re.sub(r"()\n(?=)", r"\1 ", msg) + for img in image: + img_b64_str = self.process_image(img, "Default", return_pil=False, image_format="JPEG") + img_str = f'' + msg = msg.replace("", img_str, 1).strip() + if len(msg) > 0: + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version, + ) + + def dict(self): + if len(self.get_images()) > 
0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +# Conversation Template for NVGPT +conv_nvgpt = Conversation( + system="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n""", + roles=("User", "Assistant"), + version="nvgpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.NVGPT, + sep="", + sep2=f"System\n", +) + +conv_nv_dpo = Conversation( + system="\n", + roles=("User", "Assistant"), + version="nv_dpo", + messages=[], + offset=0, + sep_style=SeparatorStyle.NVGPT, + sep="", + sep2=f"System\n", +) + +conv_vicuna_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=[ + ["Human", "What are the key differences between renewable and non-renewable energy sources?"], + [ + "Assistant", + "Renewable energy sources are those that can be replenished naturally in a relatively " + "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " + "Non-renewable energy sources, on the other hand, are finite and will eventually be " + "depleted, such as coal, oil, and natural gas. Here are some key differences between " + "renewable and non-renewable energy sources:\n" + "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable " + "energy sources are finite and will eventually run out.\n" + "2. Environmental impact: Renewable energy sources have a much lower environmental impact " + "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, " + "and other negative effects.\n" + "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically " + "have lower operational costs than non-renewable sources.\n" + "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote " + "locations than non-renewable sources.\n" + "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " + "situations and needs, while non-renewable sources are more rigid and inflexible.\n" + "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " + "non-renewable sources are not, and their depletion can lead to economic and social instability.\n", + ], + ], + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_vicuna_v1 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + stop_str="", +) + +conv_llama_2 = Conversation( + system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_3 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("user", "assistant"), + version="llama_v3", + messages=[], + offset=0, + sep="<|eot_id|>", + sep_style=SeparatorStyle.LLAMA_3, + tokenizer_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct", + stop_str="<|eot_id|>", +) + +conv_mistral_instruct = Conversation( + system="", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2_simple = Conversation( + system="Answer the questions about the visual content that the user provides.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2_mmtag = Conversation( + system="Answer the questions about the visual content that the user provides." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + version="llama_v2_mmtag", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_mpt = Conversation( + system="""<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_qwen = Conversation( + system="""<|im_start|>system +You are a helpful assistant.""", + roles=("<|im_start|>user", "<|im_start|>assistant"), + version="qwen", + messages=[], + offset=0, + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", +) + +conv_gemma_instruct = Conversation( + system="", + roles=("user\n", "model\n"), + version="gemma", + messages=[], + offset=0, + sep_style=SeparatorStyle.GEMMA, + sep="\n", +) + +conv_llava_plain = Conversation( + system="", + roles=("", ""), + messages=[], + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="", + sep2="\n", + stop_str="\n", +) + +conv_llava_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. 
" + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=[], + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_llava_v0_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("Human", "Assistant"), + messages=[], + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", + version="v0_mmtag", +) + +conv_llava_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llava_v1_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + version="v1_mmtag", +) + +conv_mistral_vila = Conversation( + system=None, + roles=("USER", "ASSISTANT"), + version="vila", + messages=[], + offset=0, + sep_style=SeparatorStyle.MISTRAL, + sep="", + sep2="", + stop_str="", +) + +conv_mistral_orca = Conversation( + system="""<|im_start|>system +You are MistralOrca, a large language model trained by Alignment Lab AI. 
Write out your reasoning step-by-step to be sure you get the right answers!""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_mistral_zephyr = Conversation( + system="""<|system|> +You are a helpful AI assistant.""", + roles=("<|user|>\n", "<|assistant|>\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="", +) + +conv_mistral_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_chatml_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +default_conversation = conv_vicuna_v1 +conv_templates = { + "default": conv_vicuna_v1, + "v0": conv_vicuna_v0, + "v1": conv_vicuna_v1, + "vicuna_v1": conv_vicuna_v1, + "llama_2": conv_llama_2, + "mistral_instruct": conv_mistral_instruct, + "mistral_orca": conv_mistral_orca, + "mistral_zephyr": conv_mistral_zephyr, + "mistral_direct": conv_mistral_direct, + "mistral": conv_mistral_vila, + "plain": conv_llava_plain, + "v0_plain": conv_llava_plain, + "chatml_direct": conv_chatml_direct, + "llava_v0": conv_llava_v0, + "llava_v0_mmtag": conv_llava_v0_mmtag, + "llava_v1": conv_llava_v1, + "llava_v1_mmtag": conv_llava_v1_mmtag, + "llava_llama_2": conv_llava_llama_2, + "llava_llama_3": conv_llava_llama_3, + "llava_llama_2_simple": conv_llava_llama_2_simple, + "llava_llama_2_mmtag": conv_llava_llama_2_mmtag, + "llava_mistral_instruct": conv_mistral_instruct, + "mpt": conv_mpt, + "qwen_1_5": conv_qwen, + "gemma_instruct": conv_gemma_instruct, + "nvgpt": conv_nvgpt, + "nv_steerlm": conv_nvgpt, + "nv_dpo": conv_nv_dpo, +} + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py new file mode 100644 index 000000000000..ca1179e24033 --- /dev/null +++ b/nemo/collections/vlm/neva/data/lazy.py @@ -0,0 +1,612 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
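The conversation templates registered above are looked up by name in the lazy dataset below (via `supported_conv_templates[data_config.conv_template]`). As a rough, illustrative usage sketch (not part of this patch; it assumes the `"v1"` vicuna template and the module path imported further down), a prompt is rendered by copying a template, appending alternating role turns, and calling `get_prompt()`:

from nemo.collections.vlm.neva.data.conversation import conv_templates

conv = conv_templates["v1"].copy()            # vicuna_v1, also the default template
conv.append_message(conv.roles[0], "Describe the scene in one sentence.")  # user turn
conv.append_message(conv.roles[1], None)      # empty assistant slot ends the prompt at the assistant role
print(conv.get_prompt())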
+ +from typing import TYPE_CHECKING, Optional + +import pytorch_lightning as pl +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils import data +from torch.utils.data import DataLoader + +from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig +from nemo.collections.vlm.neva.data.conversation import conv_templates as supported_conv_templates +from nemo.lightning.pytorch.plugins import MegatronDataSampler + +if TYPE_CHECKING: + pass + +import json +import logging +import os +import re +import tarfile +from typing import Any, Dict, List, Sequence + +import decord +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torch.utils.data import Dataset, default_collate +from transformers import CLIPImageProcessor, SiglipImageProcessor + +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.vlm.neva.data.multimodal_tokens import IGNORE_INDEX, SPECIAL_TOKEN_MAP + + +class TarOrFolderImageLoader: + """ + A class for loading images from a tar archive or a regular folder. + + This class provides functionality to open and read images from either a tar archive + (.tar file) or a standard directory with image files. It builds an index of images + if the source is a tar archive for efficient access. + + Attributes: + image_folder (str): The path to the tar archive or image folder. + tar_index (dict): A dictionary that maps file names to their tarfile member + objects if the image source is a tar archive. + + Methods: + __init__(self, image_folder): Initializes the loader with the specified image folder. + build_index(self): Builds an index of image file names and their corresponding + tarfile member objects for a tar archive. + open_image(self, file_name): Opens and returns an image by its file name. The image + is returned as an RGB PIL Image object. + """ + + def __init__(self, image_folder): + self.image_folder = image_folder + self.tar_index = {} + if self.image_folder.endswith('.tar'): + self.build_index() + + def build_index(self): + with tarfile.open(self.image_folder, 'r') as tar: + for member in tar.getmembers(): + self.tar_index[member.name] = member + + def open_image(self, file_name): + if self.image_folder.endswith('.tar'): + with tarfile.open(self.image_folder, 'r') as tar: + member = self.tar_index.get(file_name) + if member: + f = tar.extractfile(member) + return Image.open(f).convert('RGB') + else: + return Image.open(os.path.join(self.image_folder, file_name)).convert('RGB') + return None + + +class TarOrFolderVideoLoader: + """ + A class for loading videos from a tar archive or a regular folder. + + This class provides functionality to open and read videos from either a tar archive + (.tar file) or a standard directory with video files. It builds an index of videos + if the source is a tar archive for efficient access. + + Attributes: + video_folder (str): The path to the tar archive or video folder. + data_config (dict): A dictionary of configuration options for video decoding to frames + tar_index (dict): A dictionary that maps file names to their tarfile member + objects if the video source is a tar archive. + + Methods: + __init__(self, video_folder): Initializes the loader with the specified video folder. + build_index(self): Builds an index of image file names and their corresponding + tarfile member objects for a tar archive. + open_video(self, file_name): Opens and returns an video by its file name. 
The video + is returned as a list of RGB PIL Image objects. + flatten_frames(self, cap): Converts decord VideoReader video object to list of frame + images based on data config information. + """ + + def __init__(self, video_folder, data_config): + self.video_folder = video_folder + self.data_config = data_config + self.tar_index = {} + if self.video_folder.endswith('.tar'): + self.build_index() + + def build_index(self): + with tarfile.open(self.video_folder, 'r') as tar: + for member in tar.getmembers(): + self.tar_index[member.name] = member + + def open_video(self, file_name): + if self.video_folder.endswith('.tar'): + with tarfile.open(self.video_folder, 'r') as tar: + member = self.tar_index.get(file_name) + if member: + f = tar.extractfile(member) + cap = decord.VideoReader(f) + return self.flatten_frames(cap) + else: + # decord.bridge.set_bridge("torch") + cap = decord.VideoReader(os.path.join(self.video_folder, file_name)) + return self.flatten_frames(cap) + return None + + def flatten_frames(self, cap): + if self.data_config.splice_single_frame == 'first': + frame = cap[0].asnumpy() + return Image.fromarray(frame).convert('RGB') + elif self.data_config.splice_single_frame == 'middle': + frame = cap[len(cap) // 2].asnumpy() + return Image.fromarray(frame).convert('RGB') + elif self.data_config.splice_single_frame == 'last': + frame = cap[-1].asnumpy() + return Image.fromarray(frame).convert('RGB') + else: + if self.data_config.num_frames == -1: + frames = [] + for frame in cap: + rgb_frame = frame.asnumpy() + img = Image.fromarray(rgb_frame).convert('RGB') + frames.append(img) + return frames + else: + num_frames = min(len(cap), self.data_config.num_frames) + indices = np.linspace(0, len(cap) - 1, num_frames, dtype=int) + frames = [Image.fromarray(cap[i].asnumpy()).convert('RGB') for i in indices] + while len(frames) < self.data_config.num_frames: + frames.append(frames[-1]) + return frames + + +def process_image(processor, image, image_process_mode="square"): # this needs to be merged with conv's process image + if isinstance(processor, CLIPImageProcessor) or isinstance(processor, SiglipImageProcessor): + # image processor from HF + if image_process_mode == 'keep': + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 448, 224 + shortest_edge = int(min(max_len / aspect_ratio, min_len)) + image = processor.preprocess( + image, return_tensors='pt', do_center_crop=False, size={"shortest_edge": shortest_edge} + )['pixel_values'][0] + elif image_process_mode == 'pad': + + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + assert image_process_mode == 'square', 'NeMo image transform with setting `image_process_mode` to `square`.' 
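+ # Non-HF image processors are assumed here to be plain callables applied directly to the
+ # PIL image (e.g. a torchvision-style transform); only the default 'square' mode is
+ # supported on this path.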
+ image = processor(image) + return image + + +def tokenize_special_token(prompt, tokenizer, special_token_map=None): + """ + Tokenizes a given prompt with special handling for multiple special tokens. + + This function splits the prompt at special tokens, tokenizes each chunk separately, + and then reassembles the chunks with the corresponding special token inserted in place of the placeholders. + + Parameters: + prompt (str): The input prompt containing text and special token placeholders. + tokenizer: The tokenizer object used to tokenize the prompt chunks. + special_token_map (list, optional): A list containing tuples of special token strings + and their corresponding token indices. Defaults to SPECIAL_TOKEN_MAP. + + Returns: + torch.Tensor: A tensor of token IDs representing the tokenized prompt with special tokens. + """ + + # Use the default special token map if none is provided + if special_token_map is None: + special_token_map = SPECIAL_TOKEN_MAP + + # Create a mapping of special tokens to their indices + special_token_dict = {token: index for token, index in special_token_map} + + # Split the prompt into chunks and track special tokens + regex_pattern = '(' + '|'.join(re.escape(token) for token in special_token_dict.keys()) + ')' + chunks = re.split(regex_pattern, prompt) + + # Tokenize each chunk and replace special tokens with their indices + tokenized_chunks = [] + for chunk in chunks: + if chunk in special_token_dict: + tokenized_chunks.append(special_token_dict[chunk]) + elif len(chunk) > 0: + tokenized_chunks.extend(tokenizer(chunk, add_special_tokens=False).input_ids) + + return torch.tensor(tokenized_chunks, dtype=torch.long) + + +def find_pattern_indices(template, pattern, search_start_index=0, allow_first_token_mismatch=False): + template_len = len(template) + pattern_len = len(pattern) + for i in range(search_start_index, template_len - pattern_len + 1): + match = template[i : i + pattern_len] == pattern + if torch.all(match) or (allow_first_token_mismatch and torch.all(match[1:])): + return i, i + pattern_len + return -1, -1 + + +class LazySupervisedDataset(Dataset): + + def __init__( + self, + data_path, + data_config, + tokenizer, + image_processor, + ): + super().__init__() + if data_path is not None: + with open(data_path, "r") as file: + list_data_dict = json.load(file) + else: + list_data_dict = [] + + logging.warning("Formatting inputs...Skip in lazy mode") + self.data_config = data_config + self.tokenizer = tokenizer + self.image_processor = image_processor + + self.conv_template = data_config.conv_template + self.conv = supported_conv_templates[self.conv_template] + self.image_process_mode = data_config.image_process_mode + self.list_data_dict = list_data_dict + + image_folder = getattr(data_config, "image_folder", None) + video_folder = getattr(data_config, "video_folder", None) + + self.image_loader = TarOrFolderImageLoader(image_folder) if image_folder else None + self.video_loader = TarOrFolderVideoLoader(video_folder, data_config) if video_folder else None + + def __len__(self): + return len(self.list_data_dict) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + source = self.list_data_dict[i] + conversations = self._apply_prompt_templates(source, use_plain=self.conv_template == "plain") + tokens, labels = self._tokenize_and_label(conversations) + + media_tensors = self._process_images(source) + data_dict = dict( + image=media_tensors, + tokens=tokens, + labels=labels, + ) + return data_dict + + def _process_images(self, source): + media_tensors 
= torch.tensor([]) + if 'image' in source: + if not isinstance(source['image'], list): + source['image'] = [source['image']] + + images = [] + for image_file in source['image']: + image = self.image_loader.open_image(image_file) + if image is None: + logging.warning(f"Image {image_file} could not be found!") + image = process_image(self.image_processor, image, self.image_process_mode) + images.append(image) + + if images: + media_tensors = torch.stack(images) + return media_tensors + + def _apply_prompt_templates(self, source, use_plain=False): + conv = self.conv + + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + source = source['conversations'] + if roles[source[0]["from"]] != conv.roles[0]: + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{j}" + conv.append_message(role, sentence["value"]) + + if use_plain: + assert len(conv.messages) == 2, "Plain template requires image-caption pairs." + assert "" in conv.messages[0][1] + conv.messages[0][1] = "" + + return conv.get_prompt() + + def _tokenize_and_label(self, conversations): + tokens = tokenize_special_token(conversations, self.tokenizer) + labels = torch.ones_like(tokens) * IGNORE_INDEX + search_start_index = 0 + for i in range(1, len(self.conv.messages), 2): + stop_str = getattr(self.conv, "stop_str", None) + assert ( + stop_str is not None + ), "If `stop_str` is not provided, issues might occur in labeling the answer tokens." + answer_tokens = self.tokenizer.encode( + self.conv.messages[i][1] + ("" if stop_str is None else stop_str), + add_special_tokens=False, + return_tensors="pt", + )[0] + answer_start, answer_end = find_pattern_indices(tokens, answer_tokens, search_start_index) + labels[answer_start:answer_end] = tokens[answer_start:answer_end] + search_start_index = answer_end + tokens = tokens[:-1] + labels = labels[1:] + return tokens, labels + + def _get_crop_size(self): + if isinstance(self.image_processor, CLIPImageProcessor): + return [self.image_processor.crop_size['height'], self.image_processor.crop_size['width']] + else: + raise NotImplementedError + + +class NevaDataset(LazySupervisedDataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + data_path, + data_config, + tokenizer, + image_processor, + ): + + if data_path.endswith(".json"): + super().__init__(data_path, data_config, tokenizer, image_processor) + + elif data_path.endswith(".jsonl"): + super().__init__(None, data_config, tokenizer, image_processor) + logging.warning("Loading image inputs from SteerLM Dataset...") + if data_config.media_type == 'image': + image_folder = data_config.image_folder + for line in open(data_path, "r"): + record = json.loads(line) + + # This currently supports only a single image + # search for tag + + record['image'] = [] + for turn in record['conversations']: + matches = re.finditer('', "", turn['value']) + + self.list_data_dict.append(record) + + else: + raise ValueError(f"Formatting of {data_path} is not supported in Neva.") + + def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + data_config = self.data_config + packed_sequence = "cu_seqlens" in instances[0] + max_len = max(instance['tokens'].shape[0] for instance in instances) + for instance in instances: + pad_len = max_len - instance['tokens'].shape[0] + instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) + instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', 
IGNORE_INDEX) + if packed_sequence and instance["cu_seqlens"][-1] != max_len: + instance["cu_seqlens"] = torch.cat((instance["cu_seqlens"], torch.IntTensor([max_len])), 0) + + if packed_sequence: + max_len_cu = max(instance['cu_seqlens'].shape[0] for instance in instances) + max_len_image = max(instance['image'].shape[0] for instance in instances) + for instance in instances: + pad_len_cu = max_len_cu - instance['cu_seqlens'].shape[0] + instance['cu_seqlens'] = F.pad(instance['cu_seqlens'], (0, pad_len_cu), 'constant', max_len) + + x = instance['image'] + num_pad = max_len_image - x.shape[0] + pad_tensor = torch.zeros(num_pad, *x.shape[1:], dtype=x.dtype, device=x.device) + instance['image'] = torch.cat((x, pad_tensor), dim=0) + + media_type = data_config.media_type + if media_type == 'image': + media = [instance.pop('image') for instance in instances] + media = torch.cat(media, dim=0) + if media.size(0) == 0: + media = None + elif media_type == 'video': + media = [instance.pop('video', None) for instance in instances] + else: + raise ValueError(f"Unsupported media type {media_type}") + + batch = default_collate(instances) + tokenizer = self.tokenizer + + tokens = batch['tokens'] + labels = batch['labels'] + + if packed_sequence: + cu_seqlens = batch["cu_seqlens"] + position_ids = [] + for cu_seqlen in cu_seqlens: + position_ids.append([]) + for ind in range(0, len(cu_seqlen) - 1): + seqlen = cu_seqlen[ind + 1] - cu_seqlen[ind] + position_ids[-1].extend(list(range(seqlen))) + position_ids = torch.LongTensor(position_ids) + loss_mask = torch.ones(tokens.size(), dtype=torch.float, device=tokens.device) + attention_mask = torch.ones(tokens.size(), dtype=torch.long, device=tokens.device) + else: + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=tokenizer.eos_token_id, + eod_mask_loss=data_config.eod_mask_loss, + reset_attention_mask=data_config.reset_attention_mask, + reset_position_ids=data_config.reset_position_ids, + ) + + loss_mask[labels < 0] = 0.0 + + batch = { + 'tokens': tokens, + 'labels': labels, + 'attention_mask': attention_mask, + 'loss_mask': loss_mask, + 'position_ids': position_ids, + 'media': media, + } + if packed_sequence: + batch["cu_seqlens"] = cu_seqlens + return batch + + +class NevaLazyDataModule(pl.LightningDataModule): + def __init__( + self, + paths: str | List[str], + weights: Optional[List[float]] = None, + data_config: Optional[DataConfig] = ImageDataConfig, + seq_length: int = 2048, + tokenizer: Optional = None, + image_processor: Optional = None, + micro_batch_size: int = 4, + global_batch_size: int = 8, + num_train_samples: int = 10_000, + num_val_samples: int = 10_000, + num_test_samples: int = 10_000, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + use_packed_sequence: bool = False, + seed: int = 1234, + ) -> None: + super().__init__() + if not isinstance(paths, (list, tuple)): + paths = [paths] + if weights is not None: + assert len(weights) == len(paths) + if len(weights) == 1: + # weights must be None if there is only one dataset + weights = None + + self.paths = paths + self.weights = weights + self.data_config = data_config + self.seq_length = seq_length + self.tokenizer = tokenizer + self.image_processor = image_processor + self.num_train_samples = num_train_samples + self.num_val_samples = num_val_samples + self.num_test_samples = num_test_samples + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + 
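+ # NOTE: `seed` is only stored for now; the train/val split that would use it, and
+ # packed-sequence support, are both left as TODOs in `setup()`.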
self.seed = seed + self.use_packed_sequence = use_packed_sequence + self.init_global_step = 0 + + if tokenizer is None or image_processor is None: + logging.warning(f"Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.") + from transformers import AutoProcessor + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + self.tokenizer = tokenizer or processor.tokenizer + self.image_processor = image_processor or processor.image_processor + + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + dataloader_type="cyclic", + ) + + def setup(self, stage: str = "") -> None: + assert len(self.paths) == 1, "not yet support blend dataset in Neva 2.0!" + if self.use_packed_sequence: + pass # TODO + else: + # TODO: + # rng = torch.Generator().manual_seed(self.seed) + # train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=rng) + self._train_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) + self._validation_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._validation_ds) + + def test_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._test_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + self.init_global_step = self.trainer.global_step + self.data_sampler.init_global_step = self.init_global_step + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=getattr(dataset, 'collate_fn', data.dataloader.default_collate), + **kwargs, + ) + + def state_dict(self) -> Dict[str, Any]: + """Called when saving a checkpoint, implement to generate and save datamodule state. + + Returns: + A dictionary containing datamodule state. + + """ + consumed_samples = self.data_sampler.compute_consumed_samples(self.trainer.global_step - self.init_global_step) + return {'consumed_samples': consumed_samples} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """Called when loading a checkpoint, implement to reload datamodule state given datamodule stat + + Args: + state_dict: the datamodule state returned by ``state_dict``. + + """ + try: + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + except ModuleNotFoundError: + from nemo.lightning.apex_utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + consumed_samples = state_dict['consumed_samples'] + self.data_sampler.init_consumed_samples = consumed_samples + self.data_sampler.prev_consumed_samples = consumed_samples + self.if_first_step = 1 + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is not None: + num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR # noqa: SLF001 + + num_microbatch_calculator.update( + consumed_samples=consumed_samples, + consistency_check=False, + ) diff --git a/nemo/collections/vlm/neva/data/mock.py b/nemo/collections/vlm/neva/data/mock.py new file mode 100644 index 000000000000..ac4bc56a068c --- /dev/null +++ b/nemo/collections/vlm/neva/data/mock.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional + +import numpy as np +import pytorch_lightning as pl +import torch +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils import data +from torch.utils.data import DataLoader, Dataset + +from nemo.collections.vlm.neva.data.multimodal_tokens import IMAGE_TOKEN_INDEX +from nemo.lightning.pytorch.plugins import MegatronDataSampler + + +class MockDataModule(pl.LightningDataModule): + def __init__( + self, + seq_length: int = 2048, + tokenizer: Optional = None, + image_processor: Optional = None, + micro_batch_size: int = 4, + global_batch_size: int = 8, + rampup_batch_size: Optional[List[int]] = None, + num_train_samples: int = 10_000, + num_val_samples: int = 10_000, + num_test_samples: int = 10_000, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + ): + super().__init__() + self.seq_length = seq_length + self.num_train_samples = num_train_samples + self.num_val_samples = num_val_samples + self.num_test_samples = num_test_samples + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + + if tokenizer is None or image_processor is None: + from transformers import AutoProcessor + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + self.tokenizer = tokenizer or processor.tokenizer + self.image_processor = image_processor or processor.image_processor + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + ) + + def setup(self, stage: str = "") -> None: + self._train_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "train", self.num_train_samples, self.seq_length + ) + self._validation_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "valid", self.num_val_samples, self.seq_length + ) + self._test_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "test", self.num_test_samples, self.seq_length + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + if not hasattr(self, "_train_ds"): + self.setup() + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + if not hasattr(self, "_validation_ds"): + self.setup() + return self._create_dataloader(self._validation_ds) + + def test_dataloader(self) -> EVAL_DATALOADERS: + if not hasattr(self, "_test_ds"): + self.setup() + return self._create_dataloader(self._test_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=dataset.collate_fn, + **kwargs, + ) + + +class _MockNevaDataset(Dataset): + def __init__( + self, + tokenizer, + image_processor, + name: str, + num_samples: int, + seq_length: int, + seed: int = 42, + ) -> None: + 
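+ # Every sample is generated deterministically from `seed + idx` in `__getitem__`,
+ # so the mock data is reproducible across runs without any real image or text assets.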
super().__init__() + self.name = name + self.seq_length = seq_length + + self.vocab_size = tokenizer.vocab_size + + crop_size = image_processor.crop_size + self.image_height, self.image_width = crop_size["height"], crop_size["width"] + + self.length = num_samples + self.seed = seed + + self.loss_mask = torch.ones(self.seq_length, dtype=torch.float) + self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) + + def __len__(self) -> int: + return self.length + + def _get_text(self, idx: int) -> np.ndarray: + np_gen = np.random.default_rng(seed=(self.seed + idx)) + return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64) + + def __getitem__(self, idx) -> Dict[str, torch.Tensor]: + # Generate data of the expected size and datatype (based on GPTDataset). + np_gen = np.random.default_rng(seed=(self.seed + idx)) + tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64)) + tokens[2] = IMAGE_TOKEN_INDEX # ImageToken token index + labels = tokens.clone() + images = torch.from_numpy(np_gen.random(size=[3, self.image_height, self.image_width], dtype=np.float32)) + tokens = tokens[:-1] + labels = labels[1:] + return { + "media": images, + "tokens": tokens, + "labels": labels, + "loss_mask": self.loss_mask, + "position_ids": self.position_ids, + } + + def _collate_fn(self, batch): + """ + A default implementation of a collation function. + Users should override this method to define custom data loaders. + """ + collated_batch = data.dataloader.default_collate(batch) + collated_batch["attention_mask"] = None + return collated_batch + + def collate_fn(self, batch): + """Method that user pass as functor to DataLoader. + + The method optionally performs neural type checking and add types to the outputs. + + Please note, subclasses of Dataset should not implement `input_types`. + + # Usage: + dataloader = torch.utils.data.DataLoader( + ...., + collate_fn=dataset.collate_fn, + .... + ) + + Returns + ------- + Collated batch, with or without types. + """ + return self._collate_fn(batch) diff --git a/nemo/collections/vlm/neva/data/multimodal_tokens.py b/nemo/collections/vlm/neva/data/multimodal_tokens.py new file mode 100644 index 000000000000..8c4dcadad63c --- /dev/null +++ b/nemo/collections/vlm/neva/data/multimodal_tokens.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Callable, Optional + + +@dataclass +class MultiModalToken: + """ + Base class for multimodal tokens representing different media types. + """ + + token_str: str + token_index: int + media_type: str + use_start_end: bool + encoder_fn: Optional[Callable] = None + + +@dataclass +class ImageToken(MultiModalToken): + token_str: str = "" + token_index: int = -200 + media_type: str = "image" + use_start_end: bool = False + + +@dataclass +class VideoToken(MultiModalToken): + token_str: str = "