diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 8c61c767b4f1..3e2d63285ec4 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -43,7 +43,7 @@ jobs: steps: - name: Docker system cleanup run: | - docker system prune -a --filter "until=48h" --force + docker system prune -a --filter "until=48h" --force || true - name: Docker pull image run: | diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 3c82269cb9a6..0b753d59a826 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -1,28 +1,101 @@ name: Create PR to main with cherry-pick from release on: - pull_request_target: + push: branches: - - 'r*.*.*' - types: ["closed"] + - main jobs: - cherry-pick-release-commit: - name: Cherry-pick release commit + main: runs-on: ubuntu-latest + environment: + name: main steps: - name: Checkout uses: actions/checkout@v3 with: fetch-depth: 0 - - name: github-cherry-pick-action v1.0.3 - uses: carloscastrojumo/github-cherry-pick-action@bb0869df47c27be4ae4c7a2d93d22827aa5a0054 - with: - branch: main - labels: | - cherry-pick - reviewers: | - ${{ github.event.pull_request.user.login }} + token: ${{ secrets.PAT }} + + + - name: Cherry pick + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + set -x + set +e + + git config --global user.email "nemo-bot@nvidia.com" + git config --global user.name "NeMo Bot" + + SHA=$(git rev-list --no-merges -n 1 HEAD) + MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) + PR_ID=$(echo $MESSAGE | awk -F'#' '{print $2}' | awk -F')' '{print $1}' ) + + PR=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/NeMo/pulls/$PR_ID) + + LABELS=$(echo -E $PR | jq '.labels | [.[].name] | join(",")' | tr -d '"') + + TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'r[^,]*') + + if [[ $TARGET_BRANCHES == '' ]]; then + echo Nothing to cherry-pick + exit 0 + fi + + echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do + TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false) + + if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then + echo Release branch does not yet exist, will not cherry-pick + continue + fi + + ( + git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH + git switch --force-create cherry-pick-$PR_ID-$RELEASE_BRANCH $RELEASE_BRANCH + git cherry-pick $SHA + git push -u origin --force cherry-pick-$PR_ID-$RELEASE_BRANCH + git checkout ${CI_DEFAULT_BRANCH:-main} + ) + + CHERRYPICK_SUCCESSFUL=$? 
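+          # NOTE: because of "set +e" above, the subshell does not abort on a failed step, so $? here
+          # reflects its last command (the final "git checkout") rather than "git cherry-pick" itself.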
+ + if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/NeMo/pulls \ + -d '{"title":"Cherry-pick '$PR_ID' into '$RELEASE_BRANCH'","head":"cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'","base":"'$RELEASE_BRANCH'"}' + + else + URL=https://github.com/NVIDIA/NeMo/pull/${{ github.event.number }} + + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'${{ github.event.number }}'> failed" + } + } + ] + }' + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }} + + fi + + done + + env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index dd74e050a533..daf530d8bec6 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. name: "CICD NeMo" - on: pull_request: branches: - 'main' - 'r**' types: [ labeled ] + workflow_dispatch: inputs: test_to_run: @@ -122,112 +122,219 @@ jobs: ' ### \'\' - L0_Unit_Tests_GPU: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 60 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true + # L0: GPU unit tests + L0_Unit_Tests_GPU_ASR: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Audio: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Common: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_LLM: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_Multimodal: + needs: [cicd-test-container-setup] + uses: 
./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_NLP: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --with_downloads + + L0_Unit_Tests_GPU_TTS: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --with_downloads + + OPTIONAL_L0_Unit_Tests_GPU_Core: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + TIMEOUT: 20 + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/core -m "not pleasefixme" --with_downloads + IS_OPTIONAL: true + + L0_Unit_Tests_GPU_Hydra: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads + + OPTIONAL_L0_Unit_Tests_GPU_Lightning: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads + IS_OPTIONAL: true + + L0_Unit_Tests_GPU_Others: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads \ + --ignore=tests/collections/asr \ + --ignore=tests/collections/audio \ + --ignore=tests/collections/common \ + --ignore=tests/collections/llm \ + --ignore=tests/collections/multimodal \ + --ignore=tests/collections/nlp \ + --ignore=tests/collections/tts \ + --ignore=tests/core \ + --ignore=tests/core_ptl \ + --ignore=tests/hydra \ + --ignore=tests/lightning \ + --ignore=tests/utils # L0: CPU unit tests L0_Unit_Tests_CPU_ASR: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 
'true' with: RUNNER: self-hosted-azure-cpu TIMEOUT: 20 SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Audio: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Common: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_LLM: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Multimodal: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_NLP: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_TTS: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Core: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: 
true L0_Unit_Tests_CPU_Hydra: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true L0_Unit_Tests_CPU_Lightning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - IS_OPTIONAL: true - L0_Unit_Tests_CPU_Ohers: + L0_Unit_Tests_CPU_Others: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-cpu SCRIPT: | @@ -4965,12 +5072,59 @@ jobs: AFTER_SCRIPT: | rm -rf examples/llm/gpt_pretrain_results rm -rf examples/llm/gpt_index_mappings + + L2_NeMo_2_SSM_Pretraining: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \ + --devices 1 \ + --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain \ + --max-steps 10 \ + --data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document + + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain + + L2_NeMo_2_SSM_Finetuning: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \ + --devices 1 \ + --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft \ + --max-steps 10 \ + --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt + + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft Nemo_CICD_Test: needs: + - pre-flight - gpu-test - cicd-test-container-setup - - L0_Unit_Tests_GPU + + - L0_Unit_Tests_GPU_ASR + - L0_Unit_Tests_GPU_Audio + - L0_Unit_Tests_GPU_Common + - L0_Unit_Tests_GPU_LLM + - L0_Unit_Tests_GPU_Multimodal + - L0_Unit_Tests_GPU_NLP + - L0_Unit_Tests_GPU_TTS + #- OPTIONAL_L0_Unit_Tests_GPU_Core + - L0_Unit_Tests_GPU_Hydra + #- OPTIONAL_L0_Unit_Tests_GPU_Lightning + - L0_Unit_Tests_GPU_Others + - L0_Unit_Tests_CPU_ASR - L0_Unit_Tests_CPU_Audio - L0_Unit_Tests_CPU_Common @@ -4981,7 +5135,8 @@ jobs: - L0_Unit_Tests_CPU_Core - L0_Unit_Tests_CPU_Hydra - L0_Unit_Tests_CPU_Lightning - - L0_Unit_Tests_CPU_Ohers + - L0_Unit_Tests_CPU_Others + - L2_Community_LLM_Checkpoints_tests_Bert - 
L2_Community_LLM_Checkpoints_tests_Mamba2 - L2_Community_LLM_Checkpoints_tests_Llama @@ -5083,6 +5238,8 @@ jobs: #- OPTIONAL_L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - L2_NeMo_2_GPT_DDP_Param_Parity_check + - L2_NeMo_2_SSM_Pretraining + - L2_NeMo_2_SSM_Finetuning if: always() runs-on: ubuntu-latest steps: @@ -5176,3 +5333,4 @@ jobs: - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} run: | exit 1 + diff --git a/.github/workflows/config/.secrets.baseline b/.github/workflows/config/.secrets.baseline index 2bf4e372565c..4a56aaad3c58 100644 --- a/.github/workflows/config/.secrets.baseline +++ b/.github/workflows/config/.secrets.baseline @@ -123,6 +123,15 @@ } ], "results": { + ".github/workflows/cicd-main.yml": [ + { + "type": "Base64 High Entropy String", + "filename": ".github/workflows/cicd-main.yml", + "hashed_secret": "593951c440200143335452427205ae7c8580d463", + "is_verified": false, + "line_number": 1503 + } + ], "docs/source/nlp/question_answering.rst": [ { "type": "Hex High Entropy String", @@ -2074,5 +2083,5 @@ } ] }, - "generated_at": "2024-09-04T00:45:39Z" + "generated_at": "2024-09-08T19:00:15Z" } diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index f8d037271f36..7f8cd3dad8f5 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -7,6 +7,11 @@ on: description: 'MAJOR.MINOR.PATCH[rcN] (Example: 2.0.0rc1, or 2.1.0)' required: true type: string + is_prelease: + description: Whether to keep and bump the pre-release label + required: false + default: false + type: boolean mcore_version: description: 'Version of MCore to use (must be a valid git ref)' required: true @@ -27,25 +32,25 @@ jobs: fetch-depth: 0 fetch-tags: true ref: main - - - name: Get Previous tag - id: previous-tag - # git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags ==> refs/tags/vX.Y.Z in descending order of date - # awk 'FNR == 2 {print substr($1, 11, length($1))}') ==> Selects the 2nd tag from the list, then strips the /refs/tags/ part of the tag - # set-output name=tag_name:: ==> Takes the clean tag vX.Y.Z and sets it to steps.previous_tag.outputs.tag_name - run: | - TAG=$(git for-each-ref --sort=-creatordate --format '%(refname)' refs/tags | awk 'FNR == 2 {print substr($1, 11, length($1))}') - echo "tag-name=$TAG" >> "$GITHUB_OUTPUT" + token: ${{ secrets.PAT }} - name: Get release branch ref id: release-branch run: | cd ${{ github.run_id }} - + + if [[ "${{ inputs.is_prelease }}" == "false" ]]; then + sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py + fi + VERSION=$(python -c 'import nemo; print(nemo.__version__)') - echo "Release version r$VERSION" > version + + echo "Release version r$VERSION" > version echo "version=$VERSION" >> "$GITHUB_OUTPUT" + git switch --force-create r$VERSION origin/main + git push -u origin r$VERSION --force + - name: Pin branch name in Notebooks run: | cd ${{ github.run_id }} @@ -56,34 +61,13 @@ jobs: cd ${{ github.run_id }} sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ inputs.mcore_version }}/' Dockerfile.ci - - name: Build Changelog - id: build-changelog - uses: mikepenz/release-changelog-builder-action@v3.3.1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - # Configuration file is setup with filters for domains - # owner:repo must point to current repo - # fromTag: Auto resolved from historical tag order (previous tag compared to current tag) - # toTag: Current tag reference - configuration: 
".github/workflows/config/changelog-config.json" - owner: ${{ github.repository_owner }} - repo: ${{ github.event.repository.name }} - ignorePreReleases: "false" - failOnError: "false" - fromTag: ${{ steps.previous-tag.outputs.tag-name }} - toTag: main - - - name: Append Changelog - run: | - echo "${{ steps.build-changelog.outputs.changelog }}" - - name: Create Release PR uses: peter-evans/create-pull-request@v6 id: create-pull-request with: path: ${{ github.run_id }} - branch: r${{ steps.release-branch.outputs.version }} + base: r${{ steps.release-branch.outputs.version }} + branch: ci/release-r${{ steps.release-branch.outputs.version }} title: 'Release `${{ steps.release-branch.outputs.version }}`' body: | 🚀 PR to release NeMo `${{ steps.release-branch.outputs.version }}`. @@ -101,22 +85,6 @@ jobs: assignees: okoenig labels: 'Run CICD' - - name: Add Summary comment - uses: peter-evans/create-or-update-comment@v4 - with: - issue-number: ${{ steps.create-pull-request.outputs.pull-request-number }} - body: | - # Highlights - __ - - - name: Add Changelog comment - uses: peter-evans/create-or-update-comment@v4 - with: - issue-number: ${{ steps.create-pull-request.outputs.pull-request-number }} - body: | - # Detailed Changelogs - ${{ steps.build-changelog.outputs.changelog }} - bump-next-version: runs-on: ubuntu-latest needs: [create-release-branch] diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index af09fa241c59..30033a80e6c7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -34,11 +34,12 @@ jobs: PAYLOAD=$(jq \ -n \ -c \ + --arg TAG_NAME "v${VERSION}" \ --arg CI_COMMIT_BRANCH "${{ inputs.branch }}" \ --arg NAME "$NAME" \ --arg BODY "$CHANGELOG" \ '{ - "tag_name": $CI_COMMIT_BRANCH, + "tag_name": $TAG_NAME, "target_commitish": $CI_COMMIT_BRANCH, "name": $NAME, "body": $BODY, diff --git a/.github/workflows/secrets-detector.yml b/.github/workflows/secrets-detector.yml index 4de052535cc1..a7793a9c62db 100644 --- a/.github/workflows/secrets-detector.yml +++ b/.github/workflows/secrets-detector.yml @@ -23,10 +23,14 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: + path: ${{ github.run_id }} ref: ${{ inputs.branch-name || github.head_ref }} + fetch-depth: 0 - name: Install secrets detector run: pip install detect-secrets - name: Run on change-set - run: git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .github/workflows/config/.secrets.baseline \ No newline at end of file + run: | + cd ${{ github.run_id }} + git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .github/workflows/config/.secrets.baseline \ No newline at end of file diff --git a/Dockerfile.ci b/Dockerfile.ci index 3d9a9d9b08a1..7e3ba798d62e 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -18,23 +18,29 @@ ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3 FROM ${BASE_IMAGE} -ENV TRANSFORMERS_OFFLINE=0 +ENV TRANSFORMERS_OFFLINE=0 ENV HYDRA_FULL_ERROR=1 ENV PYTHONUNBUFFERED=1 # APT packages RUN <<"EOF" bash -ex apt-get update -apt-get install -y bc libsox-fmt-all -y +apt-get install -y bc libsox-fmt-all -y apt-get clean EOF WORKDIR /workspace +RUN pip install hatchling # needed to install nemo-run +ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2 +RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_TAG} + # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 
-ARG MCORE_TAG=3396356ab4ca83cc4c4d3272530b142a1702606e + +ARG MCORE_TAG=01945b98d1ea3a2acb5e8301e181a328104f4856 + ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ diff --git a/docs/source/performance/performance_summary.md b/docs/source/performance/performance_summary.md index eca42f2d0695..98dae2dc0a78 100644 --- a/docs/source/performance/performance_summary.md +++ b/docs/source/performance/performance_summary.md @@ -11,18 +11,18 @@ | Model | #-GPUs | GBS | MBS | Sequence Length| TP | PP | CP | VP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. time to train in days (10T tokens, 1K GPUs)*** | | ----- | ------ | --- | --- | ---------------| -- | -- | -- | -- | ------------------ | ----------------------- | ------------------------------------------------------ | -| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 23574 | 770 | ***5*** | -| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5894 | 755 | ***19*** | -| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 745 | 802 | **152** | -| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 832 | [895](https://mlcommons.org/benchmarks/training/) | **136** | -| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16634 | 767 | ***7*** | +| GPT3-5B | 64 | 2048 | 4 | 2048 | 1 | 1 | 1 | 1 | 23406 | 765 | ***5*** | +| GPT3-20B | 64 | 256 | 2 | 2048 | 2 | 1 | 1 | 1 | 5851 | 750 | ***19*** | +| GPT3-175B | 128 | 256 | 1 | 2048 | 4 | 8 | 1 | 6 | 716 | 771 | **158** | +| GPT3-175B | 512 | 2048 | 2 | 2048 | 4 | 8 | 1 | 6 | 825 | [888](https://mlcommons.org/benchmarks/training/) | **137** | +| LLAMA2-7B | 8 | 128 | 1 | 4096 | 1 | 1 | 1 | 1 | 16934 | 780 | ***7*** | | LLAMA2-13B | 16 | 128 | 1 | 4096 | 1 | 4 | 1 | 10 | 8715 | 760 | ***13*** | -| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1717 | 763 | ***66*** | +| LLAMA2-70B | 64 | 128 | 1 | 4096 | 4 | 4 | 1 | 20 | 1728 | 768 | ***65*** | | Nemotron-8B | 64 | 256 | 4 | 4096 | 2 | 1 | 1 | 1 | 12507 | 643 | ***9*** | -| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4289 | 559 | ***26*** | -| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 328 | 691 | ***344*** | -| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 11883 | 688 | ***10*** | -| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1549 | 746 | ***73*** | +| Nemotron-22B | 64 | 256 | 2 | 4096 | 2 | 4 | 1 | 10 | 4312 | 562 | ***26*** | +| Nemotron-340B | 128 | 32 | 1 | 4096 | 8 | 8 | 1 | 12 | 326 | 686 | ***347*** | +| LLAMA3-8B | 8 | 128 | 1 | 8192 | 1 | 1 | 2 | 1 | 12273 | 711 | ***9*** | +| LLAMA3-70B | 64 | 128 | 1 | 8192 | 4 | 4 | 2 | 5 | 1524 | 734 | ***74*** | ### Finetuning @@ -34,9 +34,9 @@ | Model | Task | #-GPUs | GBS | MBS | Packed Sequence Length | TP | PP | Tokens / sec / GPU | Model TFLOP / sec / GPU | ***Est. 
time to finetune in mins (10M tokens)*** | | ----- | ---- | --- | --- | --- | --------------- | -- | -- | ------------------ | ----------------------- | -------------------------------------------------- | -| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 17617 | 702 | ***1.2*** | +| LLAMA2-7B | SFT | 8 | 32 | 1 | 4096 | 1 | 1 | 16891 | 673 | ***1.2*** | | LLAMA2-13B | SFT | 8 | 32 | 1 | 4096 | 1 | 4 | 10176 | 787 | ***2.0*** | -| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1812 | 747 | ***5.7*** | -| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 25206 | 673 | ***0.8*** | -| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14760 | 764 | ***1.4*** | +| LLAMA2-70B | SFT | 16 | 32 | 1 | 4096 | 4 | 4 | 1816 | 749 | ***5.7*** | +| LLAMA2-7B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 24824 | 663 | ***0.8*** | +| LLAMA2-13B | LoRA | 8 | 32 | 1 | 4096 | 1 | 1 | 14629 | 757 | ***1.4*** | | LLAMA2-70B | LoRA | 8 | 32 | 1 | 4096 | 2 | 4 | 2621 | 722 | ***7.9*** | diff --git a/examples/llm/auto_configurator/README.md b/examples/llm/auto_configurator/README.md new file mode 100644 index 000000000000..26cf5cd75263 --- /dev/null +++ b/examples/llm/auto_configurator/README.md @@ -0,0 +1,85 @@ +> [!IMPORTANT] +> This is an early version of the Auto Configurator, and the code base can be modified as it will be integrated into the CLI. + +Use Auto Configurator to Find the Optimal Configuration +------------------------------------------------------- + +Auto Configurator searches for hyperparameters (HPs) that achieve the maximum highest training throughput when working with Large Language Models (LLMs) utilizing the NeMo Framework. + +> [!NOTE] +> Auto Configurator is only supported now for GPT-based models: GPT3, LLama, Mixtral, Mistral, Gemma and Nemotron. + +Auto Configurator Capabilities +------------------------------ + +Auto Configurator is intended to iterate over different model configurations quickly and find the best configuration, that is, the configuration that minimizes both time and financial expenditure. It offers a range of features to facilitate this, as detailed in the list below. + +- **Model size recommendation**: finds the optimal model size if the parameter is not specified. +- **Training time estimation**: estimates model training time based on input parameters. +- **Base configuration generation**: returns a basic model configuration. +- **Hyperparameters recommendation**: finds the optimal list of hyperparameters to be trained. +- **Optimal configuration recommendation**: calculates the performance after a short training of candidate configurations and finds the optimal model configuration. + +Model Size Recommendation +------------------------- + +If you have not decided what model size you want to train, Auto Configurator can recommend a model size for your use case. If you know the number of GPUs, TFLOPS per GPU, the maximum time to train, and the number of tokens to train for, it can recommend a model size that can be trained with the specified hardware and time constraints. + +For example, if you had 20 NVIDIA DGX nodes available (in 80 GB GPU memory), and wanted to train a GPT model for a maximum of 5 days, Auto Configurator would recommend using a 5B parameter GPT model. + +Training Time Estimation +------------------------ + +Auto Configurator calculates the estimated training time for your model. It provides a projection of the training time in days, based on the input dataset and parameters you provide. 
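+For intuition, a rough back-of-the-envelope sketch of such an estimate (using the common "6 x parameters x tokens" FLOPs approximation; this is only an illustration, not necessarily the exact formula Auto Configurator implements):
+
+```python
+# Rough estimate: total training FLOPs ~= 6 * parameter_count * token_count.
+def estimate_days(params_b: float, tokens_b: float, num_gpus: int, achieved_tflops_per_gpu: float) -> float:
+    total_flops = 6 * (params_b * 1e9) * (tokens_b * 1e9)
+    seconds = total_flops / (num_gpus * achieved_tflops_per_gpu * 1e12)
+    return seconds / 86400
+
+# Example: a 5B-parameter model, 300B tokens, 160 GPUs at ~400 achieved TFLOPS per GPU.
+print(f"{estimate_days(5, 300, 160, 400):.1f} days")
+```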
+ +Base Configuration Generation +----------------------------- + +When you provide the model size, or Auto Configurator has suggested one, it generates a base configuration for the target model. The base configuration is a valid configuration in NeMo 2.0 format. The optimization of throughput, however, is conducted in the next step. + +Hyperparameters Recommendation +------------------------------ + +After Auto Configurator generates the base configuration, it searches over four critical hyperparameters that have a great impact on training throughput but do not affect model convergence. These hyperparameters include Tensor Parallelism (TP), Pipeline Parallelism (PP), Context Parallelism (CP), Expert Parallelism (EP), Micro Batch Size (MBS), and Activation Checkpointing Layers (ActCkpt). Auto Configurator will also provide optimal Global Batch Size (GBS) if it's not specified. + +Auto Configurator initially applies heuristics to identify suitable candidates for the four key parameters, subsequently generating a grid of candidate configurations. It returns all of the candidate configurations in NeMo 2.0 format. + +> [!NOTE] +> Some of the candidate configurations may not work due to high-memory usage or other issues. + +Once the candidate configurations are generated, you can use NeMo Framework to launch the most promising candidates. + +When running the candidates on the cluster, you can limit job time and job max steps by using ``max_minutes_per_run`` and ``max_steps_per_run`` parameters. During this search, the jobs will run with the number of nodes specified in the configuration files, using the ``num_nodes`` parameter. Once all of the jobs have finished running, you'll need to run compare_throughput.py to get a ``.csv`` table with performance results for each succeeded job. + +Optimal Configuration Recommendation +------------------------------------ + +After all of the candidate jobs are done, Auto Configurator calculates performance parameters for each of the candidates. +Auto Configurator generates two ``.csv`` files: one detailing the performance measures of the candidates and another listing the candidates that failed due to out-of-memory errors. + +End-To-End Example +------------------ + +The following list shows the required input parameters for the Auto Configurator runner: + +- ``model``: model configuration based on NeMo 2.0. +- ``num_nodes``: number of nodes to be used for the training. +- ``seq_length``: sequence length to be used for the training. +- ``data_paths``: dataset to be used for the training. +- ``tokenizer_path``: path to tokenizer model if custom tokenizer will be used. + +The following list shows the optional parameters for the Auto Configurator runner: + +- ``global_batch_size``: global batch size to be used. +- ``tensor_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``pipeline_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``context_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``expert_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``micro_batch_sizes``: a list, such as ``[1, 2, 4]``. +- ``min_model_parallel_size``: a value for the minimum desired parallelism. +- ``max_model_parallel_size``: a value for the maximum desired parallelism. + +For each of the optional parameters, Auto Configurator will find the optimal value if the parameter is not specified. To view the full list of parameters, please visit [this page](https://github.com/NVIDIA/NeMo/blob/dpykhtar/nemo_autoconf/nemo/collections/llm/tools/auto_configurator/runner.py#L51). 
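+
+A condensed sketch of wiring these parameters together (based on the `examples/llm/auto_configurator/auto_config.py` script added in this PR; paths are placeholders and several arguments from that script are omitted for brevity):
+
+```python
+import nemo_run as run
+
+from nemo.collections.llm import GPTConfig126M
+from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs
+
+runner = AutoConfigurator(
+    model=run.Config(GPTConfig126M),       # model configuration in NeMo 2.0 format
+    num_nodes=1,
+    seq_length=512,
+    global_batch_size=16,                  # optional
+    micro_batch_sizes=[1, 2, 4],           # optional; searched if not specified
+    data_paths="/path/to/dataset_prefix",  # placeholder
+    path_to_logs="/path/to/logs",          # placeholder
+)
+
+# Returns the base configuration plus a dict of candidate configurations to launch.
+base_cfg, configs = generate_configs(runner)
+```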
+ +To view an end-to-end example of how to generate candidate configs, train them, and calculate the performance using Auto Configurator with NeMo Framework, please visit [this page](https://github.com/NVIDIA/NeMo/blob/dpykhtar/nemo_autoconf/examples/llm/auto_configurator/auto_config.py). + diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py new file mode 100644 index 000000000000..c202d4d33325 --- /dev/null +++ b/examples/llm/auto_configurator/auto_config.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import fiddle as fdl +import nemo_run as run + +from nemo.collections.llm import GPTConfig126M +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs, get_results + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--run_number", type=int, help="Number of config to run") + parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") + parser.add_argument("--data_path", type=str, help="Path to the dataset") + parser.add_argument("--get_results", action="store_true") + + return parser.parse_args() + + +def train_config(args): + # GPT-3 126M + # This example will generate 3 configs. + # It is expected that this script will be run 3 times with changing --run_number flag for each run from 0 to 2. + # After all configurations are trained, please trigger the script using --get_results flag. 
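+    # Typical invocations (flags defined in get_args above; paths are placeholders):
+    #   python auto_config.py --run_number <N> --logs_dir ./autoconf_logs --data_path /path/to/dataset
+    #   python auto_config.py --get_results --logs_dir ./autoconf_logs --data_path /path/to/dataset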
+ runner = AutoConfigurator( + model=run.Config(GPTConfig126M), + num_nodes=1, + gpus_per_node=1, + gpu_memory_gb=40, + global_batch_size=16, + seq_length=512, + tensor_parallel_sizes=[1], + pipeline_parallel_sizes=[1], + micro_batch_sizes=[1, 2, 4], + max_training_days=1, + max_steps_per_run=25, + num_tokens_in_b=10, + vocab_size=51200, + data_paths=args.data_path, + path_to_logs=args.logs_dir, + ) + + base_cfg, configs = generate_configs(runner) + if not args.get_results: + # Get generated configs + partials = list(configs.values()) + names = list(configs.keys()) + + # Run pre-training + partial = partials[args.run_number - 1] + partial.log.dir = os.path.join(args.logs_dir, names[args.run_number - 1]) + pretrain = fdl.build(partial) + pretrain() + else: + # # Get Auto Configurator results + get_results(base_cfg, runner, args.logs_dir) + print(f"The results were successfully saved to {args.logs_dir}.") + + +def main(): + args = get_args() + train_config(args) + + +if __name__ == '__main__': + main() diff --git a/examples/llm/pretrain/README.md b/examples/llm/pretrain/README.md new file mode 100644 index 000000000000..c9bb7331f972 --- /dev/null +++ b/examples/llm/pretrain/README.md @@ -0,0 +1,72 @@ +# Pre-training + +### Listing the available recipes for pretraining + +```bash +nemorun llm pretrain --help +``` + +![recipe-listing](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/list-recipes.png) + + +### Run pre-training with a default recipe + +```bash +nemorun llm pretrain --factory llama3_8b +``` + +![llama3_70b](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b.png) + +We can also call the factory function with custom parameters: + +```bash +nemorun llm pretrain --factory "llama3_70b(num_nodes=128)" +``` + +![llama3_70b-128-nodes](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_70b_128nodes.png) + + +The CLI allows you to overwrite any parameter. For example, to run the recipe with 2000 steps: + +```bash +nemorun llm pretrain --factory llama3_70b trainer.max_steps=2000 +``` + +The syntax of the CLI is the same as the Python code. Which is great but in some cases you might want to inspect & edit a recipe interactively. An easy way to do this using the cli is the use the `--repl` flag. + +```bash +nemorun llm pretrain --factory llama3_70b --repl +``` + +![repl](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/repl.gif) + +We can also trigger a run from a jupyter notebook, see [pretrain.ipynb](pretrain.ipynb) for an example. This allows visualizes all configs in a structured format. See for instance the `llama3_8b` recipe: + +![llama3_8b_visualization](https://github.com/NVIDIA/NeMo/releases/download/v2.0.0rc0/llama3_8b_config.svg) + + +### Create and run a custom recipe + +We can create a script that contains a custom recipe. See [custom_recipe.py](custom_recipe.py) for an example. + +Note that we end the script with a call to `run.cli.main()`, which uses the same syntax as the CLI but allows us to provide specific defaults. We still can overwrite any parameter using the syntax `param=value`. We can set nested parameters using dotted notation, e.g. `trainer.max_steps=2000`. + +When running the custom_recipe.py file, it will execute the `custom_llama3_8b` recipe by default. However, you can select different recipes or modify parameters using the following methods: + +1. 
To select the `custom_llama3_70b` recipe: + ```bash + python custom_recipe.py --factory custom_llama3_70b + ``` + This will automatically call the `custom_llama3_70b` function defined in the script. + +2. To overwrite any parameter: + ```bash + python custom_recipe.py trainer.max_steps=2000 + ``` + +3. You can even apply transformations when triggering the CLI as if it's Python code: + ```bash + python custom_recipe.py "trainer.max_steps=*2" + ``` + +These options provide flexibility in customizing your pretraining recipe directly from the command line. \ No newline at end of file diff --git a/examples/llm/pretrain/custom_recipe.py b/examples/llm/pretrain/custom_recipe.py new file mode 100644 index 000000000000..a522a1a8e1f5 --- /dev/null +++ b/examples/llm/pretrain/custom_recipe.py @@ -0,0 +1,44 @@ +import nemo_run as run + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama3_8b, llama3_70b + + +def custom_llama3_8b(): + pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8) + + pretrain.trainer.val_check_interval = 400 + pretrain.log.ckpt.save_top_k = -1 + pretrain.log.ckpt.every_n_train_steps = 400 + + pretrain.trainer.max_steps = 1000 + + return pretrain + + +def custom_llama3_70b(): + pretrain = llama3_70b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8) + + pretrain.trainer.val_check_interval = 400 + pretrain.log.ckpt.save_top_k = -1 + pretrain.log.ckpt.every_n_train_steps = 400 + + pretrain.trainer.max_steps = 1000 + + return pretrain + + +if __name__ == "__main__": + # When running this file, it will run the `custom_llama3_8b` recipe + + # To select the `custom_llama3_70b` recipe, use the following command: + # python custom_recipe.py --factory custom_llama3_70b + # This will automatically call the custom_llama3_70b that's defined above + + # Note that any parameter can be overwritten by using the following syntax: + # python custom_recipe.py trainer.max_steps=2000 + + # You can even apply transformations when triggering the CLI as if it's python code + # python custom_recipe.py "trainer.max_steps*=2" + + run.cli.main(llm.pretrain, default_factory=custom_llama3_8b) diff --git a/examples/llm/pretrain/default_executor.py b/examples/llm/pretrain/default_executor.py new file mode 100644 index 000000000000..2668d312f2b8 --- /dev/null +++ b/examples/llm/pretrain/default_executor.py @@ -0,0 +1,106 @@ +from typing import Optional +import nemo_run as run +from nemo.collections import llm + + +def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor: + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + + executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) + + return executor + + +def slurm_executor( + user: str, + host: str, + remote_job_dir: str, + account: str, + partition: str, + nodes: int, + devices: int, + time: str = "01:00:00", + custom_mounts: Optional[list[str]] = None, + custom_env_vars: Optional[dict[str, str]] = None, + container_image: str = "nvcr.io/nvidia/nemo:dev", + retries: int = 0, +) -> run.SlurmExecutor: + if not (user and host and remote_job_dir and account and partition and nodes and devices): + raise RuntimeError( + "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function." 
+ ) + + mounts = [] + if custom_mounts: + mounts.extend(custom_mounts) + + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + if custom_env_vars: + env_vars |= custom_env_vars + + executor = run.SlurmExecutor( + account=account, + partition=partition, + tunnel=run.SSHTunnel( + user=user, + host=host, + job_dir=remote_job_dir, + ), + nodes=nodes, + ntasks_per_node=devices, + gpus_per_node=devices, + mem="0", + exclusive=True, + gres="gpu:8", + packager=run.GitArchivePackager(subpath="examples/llm/run"), + ) + + executor.container_image = container_image + executor.container_mounts = mounts + executor.env_vars = env_vars + executor.retries = retries + executor.time = time + + return executor + + +def my_slurm_executor(): + # TODO: Set your custom parameters for the Slurm Executor. + return slurm_executor( + user="", + host="", + remote_job_dir="", + account="", + partition="", + nodes=1, + devices=2, + ) + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_executor=local_executor_torchrun) + + # This will re-expose the pretrain entrypoint with your custom local executor as default. + + # To run, for instance, the llama3_8b recipe, use the following command: + # python default_executor.py --factory llama3_8b + + # To run with any overrides, use the following command: + # python default_executor.py --factory llama3_8b trainer.max_steps=2000 + + # To use your custom Slurm executor, use the following command: + # python default_executor.py --executor my_slurm_executor --factory llama3_8b diff --git a/examples/llm/pretrain/pretrain.ipynb b/examples/llm/pretrain/pretrain.ipynb new file mode 100644 index 000000000000..194741a9da9f --- /dev/null +++ b/examples/llm/pretrain/pretrain.ipynb @@ -0,0 +1,737 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Trigger a run from a notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-08-29 17:14:25 nemo_logging:349] /Users/romeyn/base/code/.venv/lib/python3.10/site-packages/megatron/core/optimizer/__init__.py:18: UserWarning: Transformer Engine and Apex are not installed. Falling back to Torch optimizers.\n", + " warnings.warn(\n", + " \n", + "[NeMo W 2024-08-29 17:14:25 nemo_logging:349] /Users/romeyn/base/code/.venv/lib/python3.10/site-packages/megatron/core/optimizer/clip_grads.py:31: UserWarning: Transformer Engine and Apex are not installed. 
Falling back to local implementations of multi_tensor_applier, multi_tensor_l2norm, and multi_tensor_scale\n", + " warnings.warn(\n", + " \n" + ] + } + ], + "source": [ + "import nemo_run as run\n", + "from nemo.collections import llm\n", + "from nemo.collections.llm.recipes import llama3_8b\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "2\n", + "\n", + "\n", + "Config:\n", + " Llama3Config8B\n", + "\n", + "\n", + "no arguments\n", + "\n", + "\n", + "\n", + "1\n", + "\n", + "\n", + "Config:\n", + " LlamaModel\n", + "\n", + "\n", + "config\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "1:c--2:c\n", + "\n", + "\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "Partial:\n", + " pretrain\n", + "\n", + "\n", + "model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "data\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "trainer\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "log\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "resume\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "optim\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0:c--1:c\n", + "\n", + "\n", + "\n", + "\n", + "3\n", + "\n", + "\n", + "Config:\n", + " MockDataModule\n", + "\n", + "\n", + "seq_length\n", + "\n", + "8192\n", + "\n", + "\n", + "micro_batch_size\n", + "\n", + "1\n", + "\n", + "\n", + "global_batch_size\n", + "\n", + "512\n", + "\n", + "\n", + "\n", + "0:c--3:c\n", + "\n", + "\n", + "\n", + "\n", + "4\n", + "\n", + "\n", + "Config:\n", + " Trainer\n", + "\n", + "\n", + "accelerator\n", + "\n", + "'gpu'\n", + "\n", + "\n", + "accumulate_grad_batches\n", + "\n", + "1\n", + "\n", + "\n", + "callbacks\n", + "\n", + "\n", + "\n", + "list\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "devices\n", + "\n", + "8\n", + "\n", + "\n", + "gradient_clip_val\n", + "\n", + "1.0\n", + "\n", + "\n", + "limit_test_batches\n", + "\n", + "50\n", + "\n", + "\n", + "limit_val_batches\n", + "\n", + "32\n", + "\n", + "\n", + "log_every_n_steps\n", + "\n", + "10\n", + "\n", + "\n", + "max_steps\n", + "\n", + "1168251\n", + "\n", + "\n", + "num_nodes\n", + "\n", + "1\n", + "\n", + "\n", + "plugins\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "strategy\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "use_distributed_sampler\n", + "\n", + "False\n", + "\n", + "\n", + "val_check_interval\n", + "\n", + "2000\n", + "\n", + "\n", + "\n", + "0:c--4:c\n", + "\n", + "\n", + "\n", + "\n", + "9\n", + "\n", + "\n", + "Config:\n", + " NeMoLogger\n", + "\n", + "\n", + "name\n", + "\n", + "'default'\n", + "\n", + "\n", + "dir\n", + "\n", + "None\n", + "\n", + "\n", + "ckpt\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "tensorboard\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "wandb\n", + "\n", + "None\n", + "\n", + "\n", + "\n", + "0:c--9:c\n", + "\n", + "\n", + "\n", + "\n", + "12\n", + "\n", + "\n", + "Config:\n", + " AutoResume\n", + "\n", + "\n", + "resume_if_exists\n", + "\n", + "True\n", + "\n", + "\n", + "resume_ignore_no_checkpoint\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "0:c--12:c\n", + "\n", + "\n", + "\n", + "\n", + "13\n", + "\n", + "\n", + "Config:\n", + " MegatronOptimizerModule\n", + "\n", + "\n", + "config\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "lr_scheduler\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "0:c--13:c\n", + "\n", + "\n", + "\n", + "\n", + "5\n", + "\n", + "\n", + "Config:\n", + " TimingCallback\n", + "\n", + "\n", + 
"no arguments\n", + "\n", + "\n", + "\n", + "4:c--5:c\n", + "\n", + "\n", + "\n", + "\n", + "6\n", + "\n", + "\n", + "Config:\n", + " MegatronMixedPrecision\n", + "\n", + "\n", + "precision\n", + "\n", + "'bf16-mixed'\n", + "\n", + "\n", + "params_dtype\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "pipeline_dtype\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "autocast_enabled\n", + "\n", + "False\n", + "\n", + "\n", + "grad_reduce_in_fp32\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "4:c--6:c\n", + "\n", + "\n", + "\n", + "\n", + "8\n", + "\n", + "\n", + "Config:\n", + " MegatronStrategy\n", + "\n", + "\n", + "tensor_model_parallel_size\n", + "\n", + "1\n", + "\n", + "\n", + "pipeline_model_parallel_size\n", + "\n", + "1\n", + "\n", + "\n", + "virtual_pipeline_model_parallel_size\n", + "\n", + "None\n", + "\n", + "\n", + "context_parallel_size\n", + "\n", + "2\n", + "\n", + "\n", + "sequence_parallel\n", + "\n", + "False\n", + "\n", + "\n", + "ckpt_include_optimizer\n", + "\n", + "True\n", + "\n", + "\n", + "pipeline_dtype\n", + "\n", + "None\n", + "\n", + "\n", + "ckpt_async_save\n", + "\n", + "True\n", + "\n", + "\n", + "ckpt_parallel_load\n", + "\n", + "True\n", + "\n", + "\n", + "gradient_as_bucket_view\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "4:c--8:c\n", + "\n", + "\n", + "\n", + "\n", + "7\n", + "\n", + "\n", + "dtype\n", + "\n", + "torch.bfloat16\n", + "\n", + "\n", + "\n", + "6:c--7:c\n", + "\n", + "\n", + "\n", + "\n", + "6:c--7:c\n", + "\n", + "\n", + "\n", + "\n", + "10\n", + "\n", + "\n", + "Config:\n", + " ModelCheckpoint\n", + "\n", + "\n", + "save_last\n", + "\n", + "True\n", + "\n", + "\n", + "save_top_k\n", + "\n", + "10\n", + "\n", + "\n", + "every_n_train_steps\n", + "\n", + "200\n", + "\n", + "\n", + "save_best_model\n", + "\n", + "False\n", + "\n", + "\n", + "filename\n", + "\n", + "'{model_name}--{val_loss:.2f}-{step}-{consumed_samples}'\n", + "\n", + "\n", + "\n", + "9:c--10:c\n", + "\n", + "\n", + "\n", + "\n", + "11\n", + "\n", + "\n", + "Config:\n", + " TensorBoardLogger\n", + "\n", + "\n", + "save_dir\n", + "\n", + "'tb_logs'\n", + "\n", + "\n", + "name\n", + "\n", + "'default'\n", + "\n", + "\n", + "\n", + "9:c--11:c\n", + "\n", + "\n", + "\n", + "\n", + "14\n", + "\n", + "\n", + "Config:\n", + " OptimizerConfig\n", + "\n", + "\n", + "optimizer\n", + "\n", + "'adam'\n", + "\n", + "\n", + "lr\n", + "\n", + "0.0003\n", + "\n", + "\n", + "weight_decay\n", + "\n", + "0.1\n", + "\n", + "\n", + "bf16\n", + "\n", + "True\n", + "\n", + "\n", + "adam_beta1\n", + "\n", + "0.9\n", + "\n", + "\n", + "adam_beta2\n", + "\n", + "0.95\n", + "\n", + "\n", + "adam_eps\n", + "\n", + "1e-05\n", + "\n", + "\n", + "use_distributed_optimizer\n", + "\n", + "True\n", + "\n", + "\n", + "overlap_grad_reduce\n", + "\n", + "True\n", + "\n", + "\n", + "overlap_param_gather\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "13:c--14:c\n", + "\n", + "\n", + "\n", + "\n", + "15\n", + "\n", + "\n", + "Config:\n", + " CosineAnnealingScheduler\n", + "\n", + "\n", + "warmup_steps\n", + "\n", + "2000\n", + "\n", + "\n", + "constant_steps\n", + "\n", + "0\n", + "\n", + "\n", + "min_lr\n", + "\n", + "2.9999999999999997e-05\n", + "\n", + "\n", + "\n", + "13:c--15:c\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + ")]>,\n", + " data=,\n", + " trainer=],\n", + " devices=8,\n", + " gradient_clip_val=1.0,\n", + " limit_test_batches=50,\n", + " limit_val_batches=32,\n", + " log_every_n_steps=10,\n", + " max_steps=1168251,\n", + " num_nodes=1,\n", + " plugins=,\n", + " 
strategy=,\n", + " use_distributed_sampler=False,\n", + " val_check_interval=2000)]>,\n", + " log=,\n", + " tensorboard=,\n", + " wandb=None)]>,\n", + " resume=,\n", + " optim=,\n", + " lr_scheduler=)]>)]>" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)\n", + "\n", + "pretrain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index 89e61a8b917c..0464d85b5480 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -210,6 +210,23 @@ model: image_folder: null image_aspect_ratio: 'square' +energon: + use_energon: False + data: + __module__: megatron.energon + __class__: Metadataset + splits: + # Train dataset, the datasets will be mixed according to their weights + train: + datasets: + - weight: 1.0 + path: null + val: + datasets: + - weight: 1.0 + path: null + + # Nsys profiling options nsys_profile: enabled: False diff --git a/examples/multimodal/multimodal_llm/neva/neva_finetune.py b/examples/multimodal/multimodal_llm/neva/neva_finetune.py index e94308ad89f3..1796a87bac9e 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_finetune.py +++ b/examples/multimodal/multimodal_llm/neva/neva_finetune.py @@ -22,8 +22,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_finetune") def main(cfg) -> None: diff --git a/examples/multimodal/multimodal_llm/neva/neva_peft.py b/examples/multimodal/multimodal_llm/neva/neva_peft.py index 2c0e1bc41ac2..0960dd260ad4 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_peft.py +++ b/examples/multimodal/multimodal_llm/neva/neva_peft.py @@ -23,8 +23,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_peft") def main(cfg) -> None: diff --git a/examples/multimodal/multimodal_llm/neva/neva_pretrain.py b/examples/multimodal/multimodal_llm/neva/neva_pretrain.py index 26e0dc294185..8aae9f2d655a 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_pretrain.py +++ b/examples/multimodal/multimodal_llm/neva/neva_pretrain.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- -import torch.multiprocessing as mp from omegaconf.omegaconf import OmegaConf from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel @@ -22,8 +20,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="neva_config") def main(cfg) -> None: diff --git a/nemo/collections/common/parts/perf_metrics_utils.py b/nemo/collections/common/parts/perf_metrics_utils.py index 41273797e035..1633b1343340 100644 --- a/nemo/collections/common/parts/perf_metrics_utils.py +++ b/nemo/collections/common/parts/perf_metrics_utils.py @@ -2,7 +2,6 @@ import os from typing import List -from tensorboard.backend.event_processing import event_accumulator from nemo.utils import logging @@ -27,6 +26,7 @@ def read_tb_log(path: str, summary_name: str) -> List: Returns: summary_list: list, the values in the read summary list, formatted as a list. """ + from tensorboard.backend.event_processing import event_accumulator files = glob.glob(f"{path}/events*tfevents*") files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 8da00b0edd7f..614af0df400c 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -31,6 +31,11 @@ Baichuan2Config, Baichuan2Config7B, Baichuan2Model, + BaseMambaConfig1_3B, + BaseMambaConfig2_7B, + BaseMambaConfig130M, + BaseMambaConfig370M, + BaseMambaConfig780M, ChatGLM2Config6B, ChatGLM3Config6B, ChatGLMConfig, @@ -46,6 +51,12 @@ GemmaConfig7B, GemmaModel, GPTConfig, + GPTConfig5B, + GPTConfig7B, + GPTConfig20B, + GPTConfig40B, + GPTConfig126M, + GPTConfig175B, GPTModel, Llama2Config7B, Llama2Config13B, @@ -71,12 +82,15 @@ Nemotron4Config340B, NemotronConfig, NemotronModel, + NVIDIAMambaConfig8B, + NVIDIAMambaHybridConfig8B, Qwen2Config, Qwen2Config1P5B, Qwen2Config7B, Qwen2Config72B, Qwen2Config500M, Qwen2Model, + SSMConfig, Starcoder2Config, Starcoder2Config3B, Starcoder2Config7B, @@ -120,6 +134,14 @@ "Nemotron4Config22B", "Nemotron4Config340B", "NemotronConfig", + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", "LlamaConfig", "Llama2Config7B", "Llama2Config13B", diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index d330b42d08c4..847b87131925 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -18,10 +18,10 @@ from pathlib import Path from typing import Any, Callable, Optional, Union +import nemo_run as run import pytorch_lightning as pl from typing_extensions import Annotated -from nemo.collections.llm.utils import Config, task from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging @@ -29,13 +29,13 @@ TokenizerType = Any -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def train( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, tokenizer: 
Optional[TokenizerType] = None, model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None, @@ -87,13 +87,13 @@ def train( return app_state.exp_dir -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def pretrain( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, ) -> Path: """ @@ -135,13 +135,13 @@ def pretrain( ) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def finetune( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, peft: Optional[Union[PEFT, ModelTransform, Callable]] = None, ) -> Path: @@ -186,13 +186,13 @@ def finetune( ) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def validate( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, - log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None, - resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None, + log: Annotated[Optional[NeMoLogger], run.Config[NeMoLogger]] = None, + resume: Annotated[Optional[AutoResume], run.Config[AutoResume]] = None, optim: Optional[OptimizerModule] = None, tokenizer: Optional[TokenizerType] = None, model_transform: Optional[Union[PEFT, ModelTransform, Callable]] = None, @@ -311,7 +311,7 @@ def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, json.dump(args_dict, f) -@task(namespace="llm") +@run.cli.entrypoint(namespace="llm") def deploy( nemo_checkpoint: Path = None, model_type: str = "llama", @@ -400,7 +400,7 @@ def deploy( nm.stop() -@task(name="import", namespace="llm") +@run.cli.entrypoint(name="import", namespace="llm") def import_ckpt( model: pl.LightningModule, source: str, @@ -414,7 +414,7 @@ def load_connector_from_trainer_ckpt(path: Path, target: str) -> io.ModelConnect return io.load_context(path).model.exporter(target, path) -@task(name="export", namespace="llm") +@run.cli.entrypoint(name="export", namespace="llm") def export_ckpt( path: Path, target: str, diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 7fa5bd719581..46cab3163368 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -63,6 +63,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + pad_to_max_length: bool = False, ): super().__init__() self.seq_length = seq_length @@ -78,6 +79,7 @@ def __init__( self.rampup_batch_size = rampup_batch_size self.data_sampler = None self.max_train_samples = None + self.pad_to_max_length = pad_to_max_length def setup(self, stage: str): self.data_sampler = MegatronDataSampler( @@ -97,6 +99,7 @@ def train_dataloader(self) -> DataLoader: self._create_dataset( str(self.train_path), max_num_samples=self.max_train_samples, + pad_to_max_length=self.pad_to_max_length, ) ) @@ -105,6 +108,7 @@ def val_dataloader(self) -> DataLoader: 
self._create_dataset( str(self.validation_path), is_test=True, + pad_to_max_length=self.pad_to_max_length, ), ) @@ -114,6 +118,7 @@ def test_dataloader(self) -> DataLoader: str(self.test_path), tokens_to_generate=32, is_test=True, + pad_to_max_length=self.pad_to_max_length, ) ) diff --git a/nemo/collections/llm/gpt/data/pre_training.py b/nemo/collections/llm/gpt/data/pre_training.py index ccb2d21729ed..534922efe3a3 100644 --- a/nemo/collections/llm/gpt/data/pre_training.py +++ b/nemo/collections/llm/gpt/data/pre_training.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import os import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional @@ -34,6 +35,66 @@ from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec +def is_number_tryexcept(s): + """Returns True if string is a number.""" + if s is None: + return False + try: + float(s) + return True + except ValueError: + return False + + +def is_zipped_list(paths): + # ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] + even = paths[::2] + if len(even) == 0: + return False + is_num = list(map(is_number_tryexcept, even)) + if any(is_num): + assert all(is_num), "Got malformed zipped list" + return is_num[0] + + +def validate_dataset_asset_accessibility(paths): + if paths is None: + raise ValueError("Expected path to have a value.") + + if isinstance(paths, tuple) or isinstance(paths, list): + if is_zipped_list(paths): + # remove weights from paths. + paths = paths[1::2] + for p in paths: + validate_dataset_asset_accessibility(p) + return + elif isinstance(paths, dict): + for p in paths.values(): + validate_dataset_asset_accessibility(p) + return + + if not isinstance(paths, str) and not isinstance(paths, Path): + raise ValueError("Expected path to be of string or Path type.") + + path = Path(paths) + suffices = ('.bin', '.idx') + if path.is_dir(): + if not os.access(path, os.R_OK): + raise PermissionError(f"Expected {str(path)} to be readable.") + # Will let the downstream class confirm contents are ok. + return + if path.exists(): + if not os.access(path, os.R_OK): + raise PermissionError(f"Expected {str(path)} to be readable.") + return + for suffix in suffices: + file_path = Path(str(path) + suffix) + if not file_path.exists(): + raise FileNotFoundError(f"Expected {str(file_path)} to exist.") + if not os.access(file_path, os.R_OK): + raise PermissionError(f"Expected {str(file_path)} to be readable.") + + class PreTrainingDataModule(pl.LightningDataModule, IOMixin): """PyTorch Lightning-compatible data module for pre-training GPT-style models.
@@ -100,6 +161,8 @@ def __init__( from megatron.core.datasets.utils import get_blend_from_list + validate_dataset_asset_accessibility(paths) + build_kwargs = {} if isinstance(paths, dict): if split is not None: diff --git a/nemo/collections/llm/gpt/data/squad.py b/nemo/collections/llm/gpt/data/squad.py index a2dfa12af69e..3f73d67ec61d 100644 --- a/nemo/collections/llm/gpt/data/squad.py +++ b/nemo/collections/llm/gpt/data/squad.py @@ -53,6 +53,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + pad_to_max_length: bool = False, ): self.force_redownload = force_redownload self.delete_raw = delete_raw @@ -69,6 +70,7 @@ def __init__( num_workers=num_workers, pin_memory=pin_memory, persistent_workers=persistent_workers, + pad_to_max_length=pad_to_max_length, ) def prepare_data(self) -> None: diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 81098040191c..aa3615b3ddfd 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -15,6 +15,12 @@ from nemo.collections.llm.gpt.model.baichuan import Baichuan2Config, Baichuan2Config7B, Baichuan2Model from nemo.collections.llm.gpt.model.base import ( GPTConfig, + GPTConfig5B, + GPTConfig7B, + GPTConfig20B, + GPTConfig40B, + GPTConfig126M, + GPTConfig175B, GPTModel, MaskedTokenLossReduction, gpt_data_step, @@ -71,6 +77,16 @@ Qwen2Config500M, Qwen2Model, ) +from nemo.collections.llm.gpt.model.ssm import ( + BaseMambaConfig1_3B, + BaseMambaConfig2_7B, + BaseMambaConfig130M, + BaseMambaConfig370M, + BaseMambaConfig780M, + NVIDIAMambaConfig8B, + NVIDIAMambaHybridConfig8B, + SSMConfig, +) from nemo.collections.llm.gpt.model.starcoder import StarcoderConfig, StarcoderConfig15B, StarcoderModel from nemo.collections.llm.gpt.model.starcoder2 import ( Starcoder2Config, @@ -137,6 +153,14 @@ "Qwen2Config7B", "Qwen2Config72B", "Qwen2Model", + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index b60c0430b8be..19a04a65a026 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -106,7 +106,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> Baichuan2Config: diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index a6b53f4e859d..e0d752bf3411 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -182,6 +182,60 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": ) +@dataclass +class GPTConfig126M(GPTConfig): + seq_length: int = 2048 + num_layers: int = 12 + hidden_size: int = 768 + ffn_hidden_size: int = 3072 + num_attention_heads: int = 12 + + +@dataclass +class GPTConfig5B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 24 + hidden_size: int = 4096 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 32 + + +@dataclass +class GPTConfig7B(GPTConfig): + 
seq_length: int = 2048 + num_layers: int = 32 + hidden_size: int = 4096 + ffn_hidden_size: int = 10880 + num_attention_heads: int = 32 + + +@dataclass +class GPTConfig20B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 44 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_attention_heads: int = 48 + + +@dataclass +class GPTConfig40B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 48 + hidden_size: int = 8192 + ffn_hidden_size: int = 32768 + num_attention_heads: int = 64 + + +@dataclass +class GPTConfig175B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 96 + hidden_size: int = 12288 + ffn_hidden_size: int = 49152 + num_attention_heads: int = 96 + + class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( self, diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 3b6453b2b891..162b42501d11 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -113,7 +113,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> ChatGLMConfig: diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 753d75165197..e28d4409437b 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -134,7 +134,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> GemmaConfig: diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index 2c76b2fdd976..59d697f2f6b7 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -251,7 +251,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> LlamaConfig: diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index 73e6a34fd7c2..a6415769112a 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -142,7 +142,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> MistralConfig7B: diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index b0f40a2fc785..bc255ae8fb87 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -59,7 +59,7 @@ class MixtralConfig(GPTConfig): moe_aux_loss_coeff: float = 0.01 moe_expert_capacity_factor: float = 1.0 moe_pad_expert_input_to_capacity: bool = True - moe_router_topk: int = 1 + moe_router_topk: int = 2 
moe_router_pre_softmax: bool = True moe_token_dispatcher_type: str = "alltoall" @@ -104,7 +104,7 @@ class MixtralConfig8x7B(MixtralConfig): @dataclass class MixtralConfig8x22B(MixtralConfig): """ - Config for Mixtral-8x7B model + Config for Mixtral-8x22B model Official announcement: https://mistral.ai/news/mixtral-8x22b/ """ @@ -114,9 +114,6 @@ class MixtralConfig8x22B(MixtralConfig): ffn_hidden_size: int = 16384 max_position_embeddings: int = 4096 seq_length: int = 4096 - # MoE - num_moe_experts: int = 8 - moe_router_topk: int = 2 class MixtralModel(GPTModel): @@ -171,7 +168,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> MixtralConfig8x7B | MixtralConfig8x22B: diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py index 44f10c0bee60..c8a8b5abee4b 100644 --- a/nemo/collections/llm/gpt/model/nemotron.py +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -173,7 +173,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self)) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> NemotronConfig: diff --git a/nemo/collections/llm/gpt/model/qwen2.py b/nemo/collections/llm/gpt/model/qwen2.py index 643bdda3ba8d..09ed910bac4c 100644 --- a/nemo/collections/llm/gpt/model/qwen2.py +++ b/nemo/collections/llm/gpt/model/qwen2.py @@ -141,7 +141,7 @@ def convert_state(self, source, target): def tokenizer(self) -> "AutoTokenizer": from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer - return AutoTokenizer(str(self), trust_remote_code=True) + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self)), trust_remote_code=True) @property def config(self) -> Qwen2Config: diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py new file mode 100644 index 000000000000..954fa8bfe9f7 --- /dev/null +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -0,0 +1,317 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
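+
+# Usage sketch (illustrative only): the SSM/Mamba configs defined in this module plug
+# into the generic ``GPTModel`` wrapper, exactly as the checkpoint importer below does
+# when it builds its target model, e.g.
+#
+#     from nemo.collections import llm
+#     model = llm.GPTModel(llm.BaseMambaConfig2_7B())
+#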
+ +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Literal, Optional + +import torch + +from nemo.utils import logging + +try: + from megatron.core import parallel_state + from megatron.core.models.mamba import MambaModel as MCoreMambaModel + from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec + + HAVE_MEGATRON_CORE_OR_TE = True + +except (ImportError, ModuleNotFoundError): + logging.warning("The package `megatron.core` was not imported in this environment which is needed for SSMs.") + HAVE_MEGATRON_CORE_OR_TE = False + +from megatron.core.transformer.transformer_config import TransformerConfig +from nemo.collections.llm.gpt.model.base import GPTModel, gpt_data_step +from nemo.lightning import get_vocab_size, io, teardown + + +def ssm_forward_step(model, batch) -> torch.Tensor: + + forward_args = { + "input_ids": batch["tokens"], + "position_ids": batch["position_ids"], + "labels": batch["labels"], + } + forward_args["attention_mask"] = None + return model(**forward_args) + + +@dataclass +class SSMConfig(TransformerConfig, io.IOMixin): + # From megatron.core.models.mamba.mamba_model.MambaModel + fp16_lm_cross_entropy: bool = False + parallel_output: bool = True + share_embeddings_and_output_weights: bool = False + num_layers: int = 2 + mamba_ssm_ngroups: int = 8 + num_attention_heads: int = 1 + hybrid_attention_ratio: float = 0.0 + hybrid_mlp_ratio: float = 0.0 + hybrid_override_pattern: str = None + post_process: bool = True + pre_process: bool = True + seq_length: int = 2048 + # Mamba with no attention has no need for position embeddings, so none is default + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'none' + rotary_percent: float = 1.0 + rotary_base: int = 10000 + seq_len_interpolation_factor: Optional[float] = None + apply_rope_fusion: bool = True + make_vocab_size_divisible_by: int = 128 + gated_linear_unit: bool = False + fp32_residual_connections: bool = True + normalization: str = 'RMSNorm' + add_bias_linear: bool = False + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + layernorm_epsilon: float = 1e-5 + # TODO: Move this to better places? 
+ get_attention_mask_from_fusion: bool = False + + forward_step_fn: Callable = ssm_forward_step + data_step_fn: Callable = gpt_data_step + + def configure_model(self, tokenizer) -> "MCoreMambaModel": + + return MCoreMambaModel( + self, + mamba_stack_spec=mamba_stack_spec, + vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), + max_sequence_length=self.seq_length, + mamba_ssm_ngroups=self.mamba_ssm_ngroups, + hybrid_attention_ratio=self.hybrid_attention_ratio, + hybrid_mlp_ratio=self.hybrid_mlp_ratio, + hybrid_override_pattern=self.hybrid_override_pattern, + position_embedding_type=self.position_embedding_type, + rotary_percent=self.rotary_percent, + rotary_base=self.rotary_base, + seq_len_interpolation_factor=self.seq_len_interpolation_factor, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + ) + + +@io.model_importer(GPTModel, "pytorch") +class PyTorchSSMImporter(io.ModelConnector["GPTModel", GPTModel]): + + def __new__(cls, path: str, model_config=None): + instance = super().__new__(cls, path) + instance.model_config = model_config + return instance + + def init(self) -> GPTModel: + + return GPTModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + + source = torch.load(str(self), map_location='cpu') + if 'model' in source: + source = source['model'] + + class ModelState: + def __init__(self, state_dict): + self._state_dict = state_dict + + def state_dict(self): + return self._state_dict + + source = ModelState(source) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + logging.info(f"Converted SSM model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + + if self.model_config.mapping_type == "base": + mapping = { + 'backbone.embedding.weight': 'embedding.word_embeddings.weight', + 'backbone.layers.*.mixer.A_log': 'decoder.layers.*.mixer.A_log', + 'backbone.layers.*.mixer.D': 'decoder.layers.*.mixer.D', + 'backbone.layers.*.mixer.conv1d.weight': 'decoder.layers.*.mixer.conv1d.weight', + 'backbone.layers.*.mixer.conv1d.bias': 'decoder.layers.*.mixer.conv1d.bias', + 'backbone.layers.*.mixer.in_proj.weight': 'decoder.layers.*.mixer.in_proj.weight', + 'backbone.layers.*.mixer.dt_bias': 'decoder.layers.*.mixer.dt_bias', + 'backbone.layers.*.mixer.out_proj.weight': 'decoder.layers.*.mixer.out_proj.weight', + 'backbone.layers.*.mixer.norm.weight': 'decoder.layers.*.mixer.norm.weight', + 'backbone.layers.*.norm.weight': 'decoder.layers.*.mixer.in_proj.layer_norm_weight', + 'backbone.norm_f.weight': 'decoder.final_norm.weight', + 'lm_head.weight': 'output_layer.weight', + } + elif "nvidia" in self.model_config.mapping_type: + mapping = { + 'embedding.word_embeddings.weight': 'embedding.word_embeddings.weight', + 'decoder.layers.*.mixer.A_log': 'decoder.layers.*.mixer.A_log', + 'decoder.layers.*.mixer.D': 'decoder.layers.*.mixer.D', + 'decoder.layers.*.mixer.conv1d.weight': 'decoder.layers.*.mixer.conv1d.weight', + 'decoder.layers.*.mixer.conv1d.bias': 'decoder.layers.*.mixer.conv1d.bias', + 'decoder.layers.*.mixer.in_proj.weight': 'decoder.layers.*.mixer.in_proj.weight', + 'decoder.layers.*.mixer.dt_bias': 'decoder.layers.*.mixer.dt_bias', + 'decoder.layers.*.mixer.out_proj.weight': 'decoder.layers.*.mixer.out_proj.weight', + 'decoder.layers.*.mixer.norm.weight': 
'decoder.layers.*.mixer.norm.weight', + 'decoder.layers.*.norm.weight': 'decoder.layers.*.mixer.in_proj.layer_norm_weight', + 'decoder.final_norm.weight': 'decoder.final_norm.weight', + 'output_layer.weight': 'output_layer.weight', + } + if "hybrid" in self.model_config.mapping_type: + mapping.update( + { + 'decoder.layers.*.mlp.linear_fc1.layer_norm_weight': 'decoder.layers.*.mlp.linear_fc1.layer_norm_weight', + 'decoder.layers.*.mlp.linear_fc1.weight': 'decoder.layers.*.mlp.linear_fc1.weight', + 'decoder.layers.*.mlp.linear_fc2.weight': 'decoder.layers.*.mlp.linear_fc2.weight', + 'decoder.layers.*.self_attention.linear_proj.weight': 'decoder.layers.*.self_attention.linear_proj.weight', + 'decoder.layers.*.self_attention.linear_qkv.layer_norm_weight': 'decoder.layers.*.self_attention.linear_qkv.layer_norm_weight', + 'decoder.layers.*.self_attention.linear_qkv.weight': 'decoder.layers.*.self_attention.linear_qkv.weight', + } + ) + else: + raise AttributeError(f"mapping type [{self.mapping_type}] not found.") + return io.apply_transforms(source, target, mapping=mapping) + + @property + def tokenizer(self): + from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + + tokenizer = get_nmt_tokenizer( + library=self.model_config.tokenizer_library, + model_name=self.model_config.tokenizer_name, + tokenizer_model=self.model_config.tokenizer_model_path, + use_fast=True, + ) + + return tokenizer + + @property + def config(self) -> SSMConfig: + return self.model_config + + +@dataclass +class BaseMambaConfig130M(SSMConfig): + hybrid_override_pattern: str = "M" * 24 + num_layers: int = 24 + seq_length: int = 2048 + hidden_size: int = 768 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 768 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig370M(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 1024 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 1024 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig780M(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 1536 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 1536 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig1_3B(SSMConfig): + hybrid_override_pattern: str = "M" * 48 + num_layers: int = 48 + seq_length: int = 2048 + hidden_size: int = 2048 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 2048 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class BaseMambaConfig2_7B(SSMConfig): + hybrid_override_pattern: str = "M" * 64 + num_layers: int = 64 + seq_length: int = 2048 + hidden_size: int = 2560 + mamba_ssm_ngroups: int = 1 + ffn_hidden_size: int = 2560 + make_vocab_size_divisible_by: int = 16 + tokenizer_library: str = 'huggingface' + tokenizer_name: str = "EleutherAI/gpt-neox-20b" + mapping_type: str = "base" + + +@dataclass +class NVIDIAMambaConfig8B(SSMConfig): + hybrid_override_pattern: str = "M" 
* 56 + num_layers: int = 56 + seq_length: int = 4096 + hidden_size: int = 4096 + mamba_ssm_ngroups: int = 8 + ffn_hidden_size: int = 4096 + make_vocab_size_divisible_by: int = 128 + tokenizer_library: str = 'megatron' + tokenizer_name: str = "GPTSentencePieceTokenizer" + mapping_type: str = "nvidia-pure" + + +@dataclass +class NVIDIAMambaHybridConfig8B(SSMConfig): + hybrid_override_pattern: str = "M-M-M--M-M*-M-M-M-M--M*-M-M-M-M-M*--M-M-M-M-M*-M--M-M-M-" + num_layers: int = 56 + seq_length: int = 4096 + hidden_size: int = 4096 + mamba_ssm_ngroups: int = 8 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 32 + num_query_groups: int = 8 + make_vocab_size_divisible_by: int = 128 + tokenizer_library: str = 'megatron' + tokenizer_name: str = "GPTSentencePieceTokenizer" + mapping_type: str = "nvidia-hybrid" + + +__all__ = [ + "SSMConfig", + "BaseMambaConfig130M", + "BaseMambaConfig370M", + "BaseMambaConfig780M", + "BaseMambaConfig1_3B", + "BaseMambaConfig2_7B", + "NVIDIAMambaConfig8B", + "NVIDIAMambaHybridConfig8B", +] diff --git a/nemo/collections/llm/gpt/model/starcoder.py b/nemo/collections/llm/gpt/model/starcoder.py index 15deb0ba2191..e7cc3f411710 100644 --- a/nemo/collections/llm/gpt/model/starcoder.py +++ b/nemo/collections/llm/gpt/model/starcoder.py @@ -19,7 +19,6 @@ import torch.nn.functional as F from torch import nn -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown @@ -120,7 +119,9 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": - return AutoTokenizer(str(self)) + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> StarcoderConfig: diff --git a/nemo/collections/llm/gpt/model/starcoder2.py b/nemo/collections/llm/gpt/model/starcoder2.py index c49af006c6f5..57b8d3635ade 100644 --- a/nemo/collections/llm/gpt/model/starcoder2.py +++ b/nemo/collections/llm/gpt/model/starcoder2.py @@ -20,7 +20,6 @@ import torch.nn.functional as F from torch import nn -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown @@ -144,7 +143,9 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": - return AutoTokenizer(str(self)) + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(self.save_hf_tokenizer_assets(str(self))) @property def config(self) -> Starcoder2Config: diff --git a/nemo/collections/llm/recipes/ADD-RECIPE.md b/nemo/collections/llm/recipes/ADD-RECIPE.md new file mode 100644 index 000000000000..c506374e3784 --- /dev/null +++ b/nemo/collections/llm/recipes/ADD-RECIPE.md @@ -0,0 +1,100 @@ +# How to Add a New Recipe + +This guide explains the process of adding a new recipe to the NeMo LLM collection. + +## Step 1: Create a New Python File + +Create a new Python file in the `nemo/collections/llm/recipes/` directory. Name it according to the model and its specific configuration, e.g., `my_new_model_12b.py`. 
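+
+The existing recipes in this directory (for example `llama3_8b.py` and `llama3_70b.py`) share a common file header. Below is a minimal sketch of the typical imports; trim it to whatever your recipe actually uses:
+
+```python
+import nemo_run as run
+import pytorch_lightning as pl
+import torch
+
+from nemo import lightning as nl
+from nemo.collections.llm.api import finetune, pretrain
+from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+```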
+ +## Step 2: Define the Model Configuration + +Create a function called `model` to define the model configuration: + +```python +NAME = "my_new_model_12b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + return run.Config(YourModel, config=run.Config(YourModelConfig)) +``` + +## Step 3: Define the Trainer Configuration + +Create a function called `trainer` to set up the trainer: + +```python +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, + # Add other parameters as needed +) -> run.Config[nl.Trainer]: + strategy = run.Config( + nl.MegatronStrategy, + # Define your parallelism strategy here + ) + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + devices=num_gpus_per_node, + num_nodes=num_nodes, + # Add other trainer configurations + ) + return trainer +``` + +## Step 4: Define the Recipe Configuration + +Create a function called `pretrain_recipe` or `finetune_recipe` to define the recipe configuration: + +```python +from nemo.collections.llm import pretrain + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + # Add other parameters as needed +) -> run.Config[nl.PretrainRecipe]: + return run.Config( + nl.PretrainRecipe, + model=model(), + trainer=trainer(), + # Add other recipe configurations + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) +``` + +```python +from nemo.collections.llm import finetune + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + # Add other parameters as needed +) -> run.Config[nl.FinetuneRecipe]: + return run.Config( + nl.FinetuneRecipe, + model=model(), + trainer=trainer(), + # Add other recipe configurations + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) +``` + + +## Step 5: Import the recipe in the __init__.py file + +Import the recipe in the [__init__.py](__init__.py) file in the same directory: + +```python +from .my_new_model_12b import pretrain_recipe, finetune_recipe +``` + + +## Step 6: Add tests for the recipe + +Add tests for the recipe in the [tests](../../../../tests/collections/llm/recipes) directory. You can use [test_llama3_8b.py](../../../../tests/collections/llm/recipes/test_llama3_8b.py) as an example. diff --git a/nemo/collections/llm/recipes/README.md b/nemo/collections/llm/recipes/README.md new file mode 100644 index 000000000000..a3cf715acffb --- /dev/null +++ b/nemo/collections/llm/recipes/README.md @@ -0,0 +1,46 @@ +# NeMo LLM Recipes + +This directory contains recipes for pre-training and fine-tuning large language models (LLMs) using NeMo. + +A recipe in NeMo is a Python file that defines a complete configuration for training or fine-tuning an LLM. Each recipe typically includes: + +1. Model configuration: Defines the architecture and hyperparameters of the LLM. +2. Training configuration: Specifies settings for the PyTorch Lightning Trainer, including distributed training strategies. +3. Data configuration: Sets up the data pipeline, including batch sizes and sequence lengths. +4. Optimization configuration: Defines the optimizer and learning rate schedule. +5. Logging and checkpointing configuration: Specifies how to save model checkpoints and log training progress. 
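+
+Concretely, a recipe's factory function assembles these five pieces into a single `run.Partial` object. The following is a condensed sketch, loosely based on the `llama3_70b` recipe in this directory; the `model()` and `trainer()` factories are assumed to be defined earlier in the same recipe module:
+
+```python
+import nemo_run as run
+
+from nemo.collections.llm.api import pretrain
+from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.recipes.log.default import default_log, default_resume
+from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
+
+
+@run.cli.factory(target=pretrain, name="my_recipe")
+def pretrain_recipe(dir=None, name="default", num_nodes=1, num_gpus_per_node=8) -> run.Partial:
+    return run.Partial(
+        pretrain,
+        model=model(),  # 1. model configuration
+        trainer=trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node),  # 2. trainer configuration
+        data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),  # 3. data
+        optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),  # 4. optimization
+        log=default_log(dir=dir, name=name),  # 5. logging and checkpointing
+        resume=default_resume(),
+    )
+```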
+ +Recipes are designed to be modular and extensible, allowing users to easily customize settings for their specific use cases. + +## Usage + +### Command Line Interface + +You can use these recipes via the NeMo CLI: + +```bash +nemorun llm <task> --factory <recipe_name> +``` +Where: +- `<task>` is either `pretrain` or `finetune` +- `<recipe_name>` is the name of the recipe (e.g. `llama3_8b`) + +For example: +```bash +nemorun llm pretrain --factory llama3_8b +``` + + +### Customizing Parameters + +You can override any parameter in the recipe: + +```bash +nemorun llm pretrain --factory llama3_8b trainer.max_steps=2000 +``` + +For more details on running recipes, see [pre-train](../../../../examples/llm/pretrain/README.md). + +## Adding a New Recipe + +See [ADD-RECIPE.md](ADD-RECIPE.md) for instructions on how to add a new recipe. \ No newline at end of file diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 950ca6db7ac6..ec44d1c19864 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -1,3 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from nemo.collections.llm.recipes import ( llama3_8b, llama3_8b_16k, diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py index cbf6b5e2e7a1..96c94fd6eeba 100644 --- a/nemo/collections/llm/recipes/llama3_70b.py +++ b/nemo/collections/llm/recipes/llama3_70b.py @@ -1,8 +1,23 @@ -from typing import Callable, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch -from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -13,32 +28,77 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 -from nemo.collections.llm.utils import Config, Partial from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b" -def model() -> Config[pl.LightningModule]: - return Config(LlamaModel, config=Config(Llama3Config70B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(LlamaModel, config=run.Config(Llama3Config70B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, + tensor_parallelism: int = 4, + pipeline_parallelism: int = 4, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 5, + context_parallelism: int = 2, + sequence_parallelism: bool = True, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model. + + This function sets up the distributed training strategy optimized for the large 70B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -51,7 +111,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -62,7 +122,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -71,42 +131,89 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b + $ nemo llm pretrain --factory "llama3_70b(num_nodes=4, name='my_70b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_pretrain", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model and requires significant computational resources. + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=2, - sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) +@run.cli.factory(target=pretrain, name=NAME + "_performance") def pretrain_recipe_performance( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - """'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default - due to being model specific or lacking sufficent support. For better compatibility please use - the default 'pretrain_recipe()' above.""" - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3 70B model. 
+ + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( - Config( + run.Config( MegatronCommOverlapCallback, tp_comm_overlap=True, tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192, @@ -118,18 +225,66 @@ def pretrain_recipe_performance( return recipe -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Llama3 70B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/meta-llama/Meta-Llama-3-70B + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Llama3 70B model. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-70B"), + restore_config=run.Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-70B"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning of the large model. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b + $ nemo llm finetune --factory "llama3_70b(num_nodes=4, name='my_70b_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_finetune", num_nodes=4) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. 
Be aware that fine-tuning a 70B model + requires substantial computational resources. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b_16k.py b/nemo/collections/llm/recipes/llama3_70b_16k.py index 87826661606f..3798088ff722 100644 --- a/nemo/collections/llm/recipes/llama3_70b_16k.py +++ b/nemo/collections/llm/recipes/llama3_70b_16k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_70b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration with 16k sequence length. - model = llama3_70b.model() - model.config.seq_length = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model with 16k sequence length. - trainer = llama3_70b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=4, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_70b.model() + model_config.config.seq_length = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model with 16k sequence length. 
+ This function sets up the distributed training strategy optimized for the large 70B model with longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_70b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = llama3_70b.model() - model.config.seq_length = 16384 + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b_16k ... - trainer = llama3_70b.trainer( + Python API usage: + >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size and longer sequence length efficiently. + """ + return llama3_70b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b_16k + $ nemo llm pretrain --factory "llama3_70b_16k(num_nodes=4, name='my_70b_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_16k_pretrain", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model with longer sequences (16k). + It requires significant computational resources. + """ + recipe = llama3_70b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. 
+ num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b_16k + $ nemo llm finetune --factory "llama3_70b_16k(num_nodes=4, name='my_70b_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_16k_finetune", num_nodes=4) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning the large 70B model with longer sequences (16k). + It uses the SQuAD dataset adapted for 16k sequence length. Be aware that this configuration + requires substantial computational resources. + """ + recipe = llama3_70b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_70b_64k.py b/nemo/collections/llm/recipes/llama3_70b_64k.py index 5185e6b2ec45..353bdd659947 100644 --- a/nemo/collections/llm/recipes/llama3_70b_64k.py +++ b/nemo/collections/llm/recipes/llama3_70b_64k.py @@ -1,72 +1,179 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_70b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "llama3_70b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_70b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 70B model configuration with 64k sequence length. - model = llama3_70b.model() - model.config.seq_length = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 70B model with 64k sequence length. - trainer = llama3_70b.trainer( - tensor_parallelism=8, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=8, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_70b_64k ... 
- data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_70b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 70B model with 64k sequence length. + This function sets up the distributed training strategy optimized for the large 70B model with long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_70b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = llama3_70b.model() - model.config.seq_length = 65536 + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_70b_64k ... - trainer = llama3_70b.trainer( - tensor_parallelism=2, + Python API usage: + >>> trainer_config = trainer(num_nodes=32, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size and long sequence length efficiently. + It requires a significant amount of computational resources. + """ + return llama3_70b.trainer( + tensor_parallelism=8, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=5, - context_parallelism=2, + context_parallelism=8, sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 70B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_70b_64k + $ nemo llm pretrain --factory "llama3_70b_64k(num_nodes=32, name='my_70b_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_70b_64k_pretrain", num_nodes=32) + >>> print(recipe) + + Note: + This recipe is optimized for the large 70B model with long sequences (64k). + It requires extensive computational resources due to the model size and extended sequence length. 
+ """ + recipe = llama3_70b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 32, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 70B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_70b_64k + $ nemo llm finetune --factory "llama3_70b_64k(num_nodes=32, name='my_70b_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_70b_64k_finetune", num_nodes=32) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning the large 70B model with long sequences (64k). + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration + requires extensive computational resources due to the model size and extended sequence length. + """ + recipe = llama3_70b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py index 17d4e8b168b3..8b2ea2969273 100644 --- a/nemo/collections/llm/recipes/llama3_8b.py +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -1,5 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
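A minimal usage sketch for the llama3_70b_64k factories above, assuming run.Config fields can be overridden as attributes in the same way the recipes themselves do; the commented-out executor call is an assumption about nemo_run's launch API and is not part of this change:

import nemo_run as run
from nemo.collections.llm.recipes import llama3_70b_64k

# Build the 64k pre-training recipe, then shrink it for a quick smoke test.
recipe = llama3_70b_64k.pretrain_recipe(name="llama3_70b_64k_smoke", num_nodes=32)
recipe.trainer.max_steps = 10           # override a field on the nested run.Config
recipe.data.global_batch_size = 128     # MockDataModule configured by the recipe
print(recipe)
# run.run(recipe, executor=run.LocalExecutor())  # assumed nemo_run entry point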
+ + from typing import Callable, Optional +import nemo_run as run import pytorch_lightning as pl import torch from pytorch_lightning.callbacks.callback import Callback @@ -12,31 +28,77 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b" -def model() -> Config[pl.LightningModule]: - return Config(LlamaModel, config=Config(Llama3Config8B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(LlamaModel, config=run.Config(Llama3Config8B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model. + + This function sets up the distributed training strategy and other training parameters. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. 
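A small sketch of how the trainer() keywords documented above map onto the Megatron strategy built just below, assuming run.Config exposes its constructor arguments as attributes (the override pattern used throughout this diff):

from nemo.collections.llm.recipes import llama3_8b

# The parallelism keywords become fields of the nested MegatronStrategy config.
trainer_cfg = llama3_8b.trainer(tensor_parallelism=2, context_parallelism=1, num_nodes=1)
assert trainer_cfg.strategy.tensor_model_parallel_size == 2
assert trainer_cfg.strategy.context_parallel_size == 1
print(trainer_cfg.num_nodes, trainer_cfg.devices)  # 1 node, 8 GPUs per node by default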
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -49,7 +111,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -60,7 +122,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -69,42 +131,93 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b + $ nemo llm pretrain --factory "llama3_8b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + For more details on pre-training LLMs with NeMo, see the pre-training + guide in the `examples/llm/pretrain/` directory. + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=1, - pipeline_parallelism=1, - pipeline_parallelism_type=None, - virtual_pipeline_parallelism=None, - context_parallelism=2, - sequence_parallelism=False, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) +@run.cli.factory(target=pretrain, name=NAME + "_optimized") def pretrain_recipe_performance( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - """'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default - due to being model specific or lacking sufficent support. For better compatibility please use - the default 'pretrain_recipe()' above.""" - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a performance-optimized pre-training recipe for Llama3 8B model. 
+ + This recipe enables performance optimizations that may not be suitable for all use cases. + It builds upon the standard pre-training recipe and adds additional performance enhancements. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for performance-optimized pre-training. + + Examples: + $ nemo llm pretrain --factory llama3_8b_optimized + + Python API usage: + >>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4) + >>> print(recipe) + + Note: + Use this recipe with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) recipe.trainer.callbacks.append( - Config( + run.Config( MegatronCommOverlapCallback, tp_comm_overlap=False, ) @@ -112,18 +225,61 @@ def pretrain_recipe_performance( return recipe -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """Configure automatic resumption from a Hugging Face checkpoint. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/meta-llama/Meta-Llama-3-8B + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-8B"), + restore_config=run.Config(nl.RestoreConfig, path="hf://meta-llama/Meta-Llama-3-8B"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. 
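A hedged sketch of customizing this LoRA fine-tuning recipe; the `dim` field is grounded in the LoRA configurations used elsewhere in this diff, and the specific values are illustrative only:

from nemo.collections.llm.recipes import llama3_8b

recipe = llama3_8b.finetune_recipe(name="llama3_8b_lora_demo", num_nodes=1)
# The recipe already carries LoRA, SQuAD data and Hugging Face resume; tweak fields as needed.
recipe.peft.dim = 16                   # illustrative adapter rank
recipe.data.global_batch_size = 128    # illustrative batch size for a smaller cluster
print(recipe)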
+ """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py index 27762777c622..bd02f1975864 100644 --- a/nemo/collections/llm/recipes/llama3_8b_16k.py +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_8b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_8b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration with 16k sequence length. - model = llama3_8b.model() - model.config.seq_length = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model with 16k sequence length. - trainer = llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=2, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_8b.model() + model_config.config.seq_length = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. 
-def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_8b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b_16k ... - model = llama3_8b.model() - model.config.seq_length = 16384 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = llama3_8b.trainer( + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. + """ + return llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,91 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b_16k + $ nemo llm pretrain --factory "llama3_8b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_16k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling longer sequences (16k) compared to the standard 8k version. + """ + recipe = llama3_8b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b_16k + $ nemo llm finetune --factory "llama3_8b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with longer sequences (16k) compared to the standard 8k version. + It uses the SQuAD dataset adapted for 16k sequence length. + """ + recipe = llama3_8b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py index 90001c6189a0..e5845e4530ca 100644 --- a/nemo/collections/llm/recipes/llama3_8b_64k.py +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -1,57 +1,81 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import llama3_8b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback NAME = "llama3_8b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = llama3_8b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3 8B model configuration with 64k sequence length. - model = llama3_8b.model() - model.config.seq_length = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3 8B model with 64k sequence length. - trainer = llama3_8b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=5, - context_parallelism=4, - sequence_parallelism=True, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=llama3_8b_64k ... 
- data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = llama3_8b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Llama3 8B model with 64k sequence length. + This function sets up the distributed training strategy optimized for long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = llama3_8b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama3_8b_64k ... - model = llama3_8b.model() - model.config.seq_length = 65536 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = llama3_8b.trainer( + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + """ + return llama3_8b.trainer( tensor_parallelism=2, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, @@ -60,13 +84,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: sequence_parallelism=True, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3 8B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama3_8b_64k + $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=2, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard 8k version. + It requires significant computational resources due to the extended sequence length. 
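One quick consistency check worth illustrating (a sketch, assuming attribute access on run.Config as used above): the 64k recipe body below overrides both the model and the data module, and their sequence lengths must agree.

from nemo.collections.llm.recipes import llama3_8b_64k

recipe = llama3_8b_64k.pretrain_recipe(name="seq_len_check")
# model() sets config.seq_length = 65536 and the recipe pairs it with a 65536-token data module.
assert recipe.model.config.seq_length == recipe.data.seq_length == 65536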
+ """ + recipe = llama3_8b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3 8B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama3_8b_64k + $ nemo llm finetune --factory "llama3_8b_64k(num_nodes=2, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="llama3_8b_64k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard 8k version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the extended sequence length. + """ + recipe = llama3_8b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/log/__init__.py b/nemo/collections/llm/recipes/log/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/log/__init__.py +++ b/nemo/collections/llm/recipes/log/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index 4d5e9223b535..b59d549726c6 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -1,9 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from typing import Optional +from nemo_run import Config, cli from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger from nemo import lightning as nl -from nemo.collections.llm.utils import Config def tensorboard_logger(name: str, save_dir: str = "tb_logs") -> Config[TensorBoardLogger]: @@ -24,15 +39,15 @@ def wandb_logger(project: str, name: str, entity: Optional[str] = None) -> Confi return cfg +@cli.factory(is_target_default=True) def default_log( - ckpt_dir: str, - name: str, + dir: Optional[str] = None, + name: str = "default", tensorboard_logger: Optional[Config[TensorBoardLogger]] = None, wandb_logger: Optional[Config[WandbLogger]] = None, ) -> Config[nl.NeMoLogger]: ckpt = Config( nl.ModelCheckpoint, - save_best_model=False, save_last=True, save_top_k=10, every_n_train_steps=200, @@ -45,13 +60,14 @@ def default_log( name=name, tensorboard=tensorboard_logger, wandb=wandb_logger, - dir=ckpt_dir, + dir=dir, ) -def default_resume() -> Config[nl.AutoResume]: +@cli.factory(is_target_default=True) +def default_resume(resume_if_exists=True, resume_ignore_no_checkpoint=True) -> Config[nl.AutoResume]: return Config( nl.AutoResume, - resume_if_exists=True, - resume_ignore_no_checkpoint=True, + resume_if_exists=resume_if_exists, + resume_ignore_no_checkpoint=resume_ignore_no_checkpoint, ) diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral.py index c504340348fe..902e7623afd2 100644 --- a/nemo/collections/llm/recipes/mistral.py +++ b/nemo/collections/llm/recipes/mistral.py @@ -1,61 +1,242 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
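A short sketch of the updated logging and resume helpers from log/default.py above, with a hypothetical results directory; the signatures follow the new `dir` and keyword defaults introduced in this diff:

from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger

# `default_log` now takes `dir` (previously `ckpt_dir`) and a defaulted `name`.
log_cfg = default_log(dir="/results/my_run", name="my_run",
                      tensorboard_logger=tensorboard_logger(name="my_run"))
# Resumption flags are now arguments rather than hard-coded values.
resume_cfg = default_resume(resume_if_exists=True, resume_ignore_no_checkpoint=True)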
+ + +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.peft.api import gpt_lora -from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.peft.lora import LoRA +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.utils import Partial, factory +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.utils.exp_manager import TimingCallback NAME = "mistral" -@factory(name=NAME) -def model() -> pl.LightningModule: - return MistralModel(MistralConfig7B()) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mistral 7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mistral 7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mistral ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MistralModel, config=run.Config(MistralConfig7B)) + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 100, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mistral 7B model. -@factory(name=NAME) -def trainer(devices=8) -> nl.Trainer: - strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + This function sets up the distributed training strategy and other training parameters. - return nl.Trainer( - devices=devices, - max_steps=100, + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mistral ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_include_optimizer=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ) + + trainer = run.Config( + nl.Trainer, accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + gradient_clip_val=1.0, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + use_distributed_sampler=False, + val_check_interval=2000, ) + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mistral 7B model. -@factory(name=NAME + "_hf") -def hf_resume() -> nl.AutoResume: - return nl.AutoResume(restore_config=nl.RestoreConfig(path="hf://mistralai/Mistral-7B-v0.3")) + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. -@factory(name=NAME, for_task="llm.pretrain") -def pretrain_recipe() -> Partial: - return Partial( - pretrain, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mistral + $ nemo llm pretrain --factory "mistral(num_nodes=2, name='my_mistral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mistral_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( + fn, + model=model(), + trainer=trainer( + tensor_parallelism=1, + pipeline_parallelism=1, + pipeline_parallelism_type=None, + virtual_pipeline_parallelism=None, + context_parallelism=2, + sequence_parallelism=False, + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), ) -@factory(name=NAME, for_task="llm.finetune") -def finetune_recipe() -> Partial: - return Partial( - finetune, - model=model, - trainer=trainer, - data=squad, - log=default_log, - optim=distributed_fused_adam_with_cosine_annealing(), - peft=gpt_lora, - resume=hf_resume, +@run.cli.factory(name=NAME + "_hf") +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mistral 7B model. 
+ + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mistral-7B-v0.3 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mistral 7B model. + """ + return run.Config( + nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mistral-7B-v0.3") ) + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mistral 7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mistral + $ nemo llm finetune --factory "mistral(num_nodes=2, name='my_mistral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mistral_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) + recipe.resume = hf_resume() + recipe.peft = run.Config(LoRA) + recipe.data = run.Config(SquadDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1) + return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x22b.py b/nemo/collections/llm/recipes/mixtral_8x22b.py index 209a5926a008..2320c89dfd2c 100644 --- a/nemo/collections/llm/recipes/mixtral_8x22b.py +++ b/nemo/collections/llm/recipes/mixtral_8x22b.py @@ -1,7 +1,24 @@ -from typing import Callable, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
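A brief sketch of the Mistral 7B fine-tuning recipe defined above, assuming nested run.Config fields read back as attributes; it only inspects values set by the recipe itself:

from nemo.collections.llm.recipes import mistral

recipe = mistral.finetune_recipe(name="mistral_lora_squad", num_nodes=1)
# finetune_recipe reuses pretrain_recipe, then swaps in LoRA, SQuAD and the HF checkpoint.
print(recipe.resume.restore_config.path)  # hf://mistralai/Mistral-7B-v0.3
print(recipe.data.seq_length)             # 4096, matching the SquadDataModule above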
+ +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -12,31 +29,76 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x22b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x22B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x22B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x22B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x22b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x22B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, - num_nodes: int = 1, + tensor_parallelism: int = 8, + pipeline_parallelism: int = 8, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = 7, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, + num_nodes: int = 8, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x22B model. + + This function sets up the distributed training strategy optimized for the large Mixtral 8x22B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x22b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=8, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
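A back-of-the-envelope check of the defaults above (illustrative only; expert parallelism defaults to 1 and is ignored here):

# 8 nodes x 8 GPUs = 64 GPUs in total.
# Model-parallel group size = TP * PP * CP = 8 * 8 * 1 = 64,
# so the default layout holds exactly one model-parallel replica (data-parallel size 1);
# additional nodes must come in multiples of 8 to raise the data-parallel degree.
tp, pp, cp, nodes, gpus_per_node = 8, 8, 1, 8, 8
assert tp * pp * cp == nodes * gpus_per_node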
+ """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -48,9 +110,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +128,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +137,107 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x22B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x22b + $ nemo llm pretrain --factory "mixtral_8x22b(num_nodes=2, name='my_mixtral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=8, - pipeline_parallelism=8, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=7, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[run.Config(TimingCallback)] ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mixtral 8x22B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mixtral-8x22B-v0.1 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mixtral 8x22B model. 
+ """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x22B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x22B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x22B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x22b + $ nemo llm finetune --factory "mixtral_8x22b(num_nodes=2, name='my_mixtral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py index 7dc8170e13e3..14318bea9e5a 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b.py @@ -1,5 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
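A hedged sketch of adjusting the LoRA adapter on the 8x22B fine-tuning recipe above; the rank and module list are illustrative values, not recommendations:

from nemo.collections.llm.recipes import mixtral_8x22b

recipe = mixtral_8x22b.finetune_recipe(name="mixtral_8x22b_lora", num_nodes=8)
# The recipe ships LoRA on linear_qkv/linear_proj with rank 32; adjust like any other field.
recipe.peft.dim = 16
recipe.peft.target_modules = ['linear_qkv']
print(recipe.peft)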
+ + from typing import Callable, Optional +import nemo_run as run import pytorch_lightning as pl import torch from pytorch_lightning.callbacks.callback import Callback @@ -12,31 +28,74 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x3b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x3B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x3B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, + tensor_parallelism: int = 4, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model. + + This function sets up the distributed training strategy optimized for the Mixtral 8x3B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b ... 
+ + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -50,7 +109,7 @@ def trainer( ckpt_parallel_load=True, ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +120,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=bf16_mixed(), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +129,108 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): Function to use for pre-training (default: nemo.collections.llm.api.pretrain). + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b + $ nemo llm pretrain --factory "mixtral_8x3b(num_nodes=2, name='my_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=4, - pipeline_parallelism=1, - pipeline_parallelism_type=None, - virtual_pipeline_parallelism=None, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure the Hugging Face model resuming for Mixtral 8x3B model. + + This function sets up the configuration for resuming training from a Hugging Face model. + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from a Hugging Face model. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory "mixtral_8x3b(resume=hf_resume())" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) + >>> recipe.resume = hf_resume() + >>> print(recipe) + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b + $ nemo llm finetune --factory "mixtral_8x3b(num_nodes=2, name='my_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) + >>> print(recipe) + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) + recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py index dbf27f86415c..287ac331ee65 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py @@ -1,61 +1,82 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
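# --- Editorial example (not part of this patch) ------------------------------------
# A minimal sketch of how the mixtral_8x3b factories above are intended to be used from
# Python. It assumes the recipe module import path shown in this diff and that nemo_run
# exposes run.run / run.LocalExecutor as in its quickstart; the directory, run name and
# override values below are illustrative assumptions, not recommended settings.
from nemo.collections.llm.recipes import mixtral_8x3b

# Default fine-tuning recipe: LoRA on SQuAD, resuming from the Hugging Face checkpoint.
recipe = mixtral_8x3b.finetune_recipe(dir="/results", name="mixtral_8x3b_lora", num_nodes=1)

# The returned run.Partial stays editable before launch.
recipe.trainer.max_steps = 100          # short smoke-test run
recipe.data.global_batch_size = 128     # shrink the batch for a small cluster
recipe.peft.dim = 16                    # smaller LoRA rank

# Launching is optional; uncomment to execute locally with nemo_run.
# import nemo_run as run
# run.run(recipe, executor=run.LocalExecutor())
# ------------------------------------------------------------------------------------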
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.collections.llm.utils import Config, Partial -from nemo.utils.exp_manager import TimingCallback - NAME = "mixtral_8x3b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x3b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration with 16k sequence length. - model = mixtral_8x3b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 16k sequence length. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=2, - pipeline_parallelism=2, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=2, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b_16k ... - data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x3b.model() + model_config.config.seq_length = 16384 + model_config.config.max_position_embeddings = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x3b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b_16k ... - model = mixtral_8x3b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) - trainer = mixtral_8x3b.trainer( + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. 
+ """ + return mixtral_8x3b.trainer( tensor_parallelism=2, pipeline_parallelism=2, pipeline_parallelism_type=torch.bfloat16, @@ -65,13 +86,91 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b_16k + $ nemo llm pretrain --factory "mixtral_8x3b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_16k_pretrain", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for handling longer sequences (16k) compared to the standard version. + """ + recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b_16k + $ nemo llm finetune --factory "mixtral_8x3b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with longer sequences (16k) compared to the standard version. + It uses the SQuAD dataset adapted for 16k sequence length. 
+ """ + recipe = mixtral_8x3b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py index b2a7724b35a9..98cf2f4f9e7b 100644 --- a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py @@ -1,62 +1,84 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x3b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x3b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x3B model configuration with 64k sequence length. - model = mixtral_8x3b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 64k sequence length. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x3b_64k ... - data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x3b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 64k sequence length. + This function sets up the distributed training strategy optimized for long sequences. 
-def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x3b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. - model = mixtral_8x3b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - trainer = mixtral_8x3b.trainer( - tensor_parallelism=2, - pipeline_parallelism=2, + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x3b_64k ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=8, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + """ + return mixtral_8x3b.trainer( + tensor_parallelism=4, + pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=8, context_parallelism=4, @@ -64,13 +86,93 @@ def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x3B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x3b_64k + $ nemo llm pretrain --factory "mixtral_8x3b_64k(num_nodes=8, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x3b_64k_pretrain", num_nodes=8) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard version. + It requires significant computational resources due to the extended sequence length. + """ + recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 8, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x3B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x3b_64k + $ nemo llm finetune --factory "mixtral_8x3b_64k(num_nodes=8, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x3b_64k_finetune", num_nodes=8) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the extended sequence length. + """ + recipe = mixtral_8x3b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b.py b/nemo/collections/llm/recipes/mixtral_8x7b.py index bacbfcab4e2d..21c9ef572a68 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b.py @@ -1,7 +1,24 @@ -from typing import Callable, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run import pytorch_lightning as pl import torch +from megatron.core.distributed import DistributedDataParallelConfig from pytorch_lightning.callbacks.callback import Callback from nemo import lightning as nl @@ -12,31 +29,73 @@ from nemo.collections.llm.peft.lora import LoRA from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b" -def model() -> Config[pl.LightningModule]: - return Config(MixtralModel, config=Config(MixtralConfig8x7B)) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b ... 
+ + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(MixtralModel, config=run.Config(MixtralConfig8x7B)) def trainer( - tensor_parallelism: int, - pipeline_parallelism: int, - pipeline_parallelism_type: Optional[torch.dtype], - virtual_pipeline_parallelism: Optional[int], - context_parallelism: int, - sequence_parallelism: bool, - expert_parallelism: int, - num_nodes: int = 1, + tensor_parallelism: int = 8, + pipeline_parallelism: int = 2, + pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = True, + expert_parallelism: int = 1, + num_nodes: int = 2, num_gpus_per_node: int = 8, max_steps: int = 1168251, - callbacks: Optional[list[Config[Callback]]] = None, -) -> Config[nl.Trainer]: - strategy = Config( + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model. + + This function sets up the distributed training strategy optimized for the Mixtral 8x7B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + expert_parallelism (int): Degree of expert parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + """ + strategy = run.Config( nl.MegatronStrategy, tensor_model_parallel_size=tensor_parallelism, pipeline_model_parallel_size=pipeline_parallelism, @@ -48,9 +107,14 @@ def trainer( gradient_as_bucket_view=True, ckpt_async_save=True, ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + ), ) - trainer = Config( + trainer = run.Config( nl.Trainer, accelerator="gpu", accumulate_grad_batches=1, @@ -61,7 +125,7 @@ def trainer( log_every_n_steps=10, max_steps=max_steps, num_nodes=num_nodes, - plugins=bf16_mixed_plugin(), + plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), strategy=strategy, use_distributed_sampler=False, val_check_interval=2000, @@ -70,43 +134,107 @@ def trainer( return trainer +@run.cli.factory(target=pretrain, name=NAME) def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - return Partial( + dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. 
+ + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b + $ nemo llm pretrain --factory "mixtral_8x7b(num_nodes=2, name='my_mixtral_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_pretrain", num_nodes=2) + >>> print(recipe) + """ + return run.Partial( fn, model=model(), trainer=trainer( - tensor_parallelism=8, - pipeline_parallelism=2, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=None, - context_parallelism=1, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[run.Config(TimingCallback)] ), - data=Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(ckpt_dir=ckpt_dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), resume=default_resume(), ) -def hf_resume() -> Config[nl.AutoResume]: - return Config( +def hf_resume() -> run.Config[nl.AutoResume]: + """ + Configure automatic resumption from a Hugging Face checkpoint for Mixtral 8x7B model. + + This function sets up the configuration to resume training from a pre-trained + Hugging Face model checkpoint. + + More info about the model can be found at: https://huggingface.co/mistralai/Mixtral-8x7B-v0.1 + + Returns: + run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint. + + Note: + This is particularly useful for fine-tuning scenarios where you want to + start from the pre-trained Mixtral 8x7B model. + """ + return run.Config( nl.AutoResume, - restore_config=Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), + restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), ) -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune - ) +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. 
+ + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b + $ nemo llm finetune --factory "mixtral_8x7b(num_nodes=2, name='my_mixtral_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) recipe.resume = hf_resume() - recipe.peft = Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) + recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) + recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py index 0542f22836d6..4b5fd07a69e9 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_16k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_16k.py @@ -1,76 +1,174 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b_16k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x7b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration with 16k sequence length. - model = mixtral_8x7b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model with 16k sequence length. - trainer = mixtral_8x7b.trainer( - tensor_parallelism=2, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b_16k ... 
- data = Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x7b.model() + model_config.config.seq_length = 16384 + model_config.config.max_position_embeddings = 16384 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model with 16k sequence length. + This function sets up the distributed training strategy optimized for longer sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x7b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = mixtral_8x7b.model() - model.config.seq_length = 16384 - model.config.max_position_embeddings = 16384 + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b_16k ... - trainer = mixtral_8x7b.trainer( + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses increased parallelism to handle the longer sequence length efficiently. + """ + return mixtral_8x7b.trainer( tensor_parallelism=2, - pipeline_parallelism=2, + pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, virtual_pipeline_parallelism=8, - context_parallelism=1, + context_parallelism=4, sequence_parallelism=True, - expert_parallelism=8, + expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model with 16k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. 
+ + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b_16k + $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=2) + >>> print(recipe) + """ + recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 2, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model with 16k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 16k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b_16k + $ nemo llm finetune --factory "mixtral_8x7b_16k(num_nodes=2, name='my_16k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_16k_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. + """ + recipe = mixtral_8x7b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py index 4fb8de98063e..6a1f76961325 100644 --- a/nemo/collections/llm/recipes/mixtral_8x7b_64k.py +++ b/nemo/collections/llm/recipes/mixtral_8x7b_64k.py @@ -1,76 +1,180 @@ -from typing import Callable - +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
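# --- Editorial example (not part of this patch) ------------------------------------
# The long-context variants above all follow one pattern: reuse the base recipe, then
# swap in a model, trainer and data module that agree on sequence length. As a hedged
# sketch, a hypothetical 32k variant of mixtral_8x7b could be written the same way;
# the parallelism numbers below are assumptions for illustration, not tuned settings.
import nemo_run as run
import torch

from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes import mixtral_8x7b


def pretrain_recipe_32k(num_nodes: int = 4, num_gpus_per_node: int = 8) -> run.Partial:
    recipe = mixtral_8x7b.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node)

    # Keep model, trainer and data module consistent at 32k tokens.
    recipe.model = mixtral_8x7b.model()
    recipe.model.config.seq_length = 32768
    recipe.model.config.max_position_embeddings = 32768

    recipe.trainer = mixtral_8x7b.trainer(
        tensor_parallelism=2,
        pipeline_parallelism=4,
        pipeline_parallelism_type=torch.bfloat16,
        virtual_pipeline_parallelism=8,
        context_parallelism=4,
        sequence_parallelism=True,
        expert_parallelism=1,
        num_nodes=num_nodes,
        num_gpus_per_node=num_gpus_per_node,
    )
    recipe.data = run.Config(MockDataModule, seq_length=32768, global_batch_size=512, micro_batch_size=1)
    return recipe
# ------------------------------------------------------------------------------------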
+ + +from typing import Optional + +import nemo_run as run +import pytorch_lightning as pl import torch -from nemo.collections.llm.api import pretrain +from nemo.collections.llm.api import finetune, pretrain from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes import mixtral_8x7b -from nemo.collections.llm.utils import Config, Partial from nemo.utils.exp_manager import TimingCallback NAME = "mixtral_8x7b_64k" -def pretrain_recipe( - name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain -) -> Partial: - recipe = mixtral_8x7b.pretrain_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn - ) +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Mixtral 8x7B model configuration with 64k sequence length. - model = mixtral_8x7b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Returns: + run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model with 64k sequence length. - trainer = mixtral_8x7b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=4, - context_parallelism=8, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], - ) + Examples: + CLI usage: + $ nemo llm pretrain model=mixtral_8x7b_64k ... - data = Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + model_config = mixtral_8x7b.model() + model_config.config.seq_length = 65536 + return model_config - recipe.model = model - recipe.trainer = trainer - recipe.data = data - return recipe +def trainer( + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Config: + """ + Configure the NeMo Lightning Trainer for Mixtral 8x7B model with 64k sequence length. + This function sets up the distributed training strategy optimized for very long sequences. -def finetune_recipe(name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int) -> Partial: - recipe = mixtral_8x7b.finetune_recipe( - name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node - ) + Args: + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Config: Configuration for the NeMo Lightning Trainer. - model = mixtral_8x7b.model() - model.config.seq_length = 65536 - model.config.max_position_embeddings = 65536 + Examples: + CLI usage: + $ nemo llm pretrain trainer=mixtral_8x7b_64k ... - trainer = mixtral_8x7b.trainer( - tensor_parallelism=2, + Python API usage: + >>> trainer_config = trainer(num_nodes=16, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + This configuration uses significantly increased parallelism to handle the long sequence length efficiently. + It requires a substantial amount of computational resources. 
+ """ + return mixtral_8x7b.trainer( + tensor_parallelism=4, pipeline_parallelism=4, pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, + virtual_pipeline_parallelism=4, + context_parallelism=8, sequence_parallelism=True, expert_parallelism=1, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, - callbacks=[Config(TimingCallback)], + callbacks=[run.Config(TimingCallback)], ) - data = Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - recipe.model = model - recipe.trainer = trainer - recipe.data = data +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a pre-training recipe for Mixtral 8x7B model with 64k sequence length. + + This function sets up a complete configuration for pre-training, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory mixtral_8x7b_64k + $ nemo llm pretrain --factory "mixtral_8x7b_64k(num_nodes=16, name='my_64k_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="mixtral_8x7b_64k_pretrain", num_nodes=16) + >>> print(recipe) + + Note: + This recipe is optimized for handling long sequences (64k) compared to the standard version. + It requires extensive computational resources due to the model size and extended sequence length. + """ + recipe = mixtral_8x7b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 16, + num_gpus_per_node: int = 8, +) -> run.Partial: + """ + Create a fine-tuning recipe for Mixtral 8x7B model with 64k sequence length. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, and data settings optimized for 64k sequence length. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory mixtral_8x7b_64k + $ nemo llm finetune --factory "mixtral_8x7b_64k(num_nodes=16, name='my_64k_finetune')" + + Python API usage: + >>> recipe = finetune_recipe(name="mixtral_8x7b_64k_finetune", num_nodes=16) + >>> print(recipe) + + Note: + This recipe is optimized for fine-tuning with long sequences (64k) compared to the standard version. + It uses the SQuAD dataset adapted for 64k sequence length. Be aware that this configuration requires + substantial computational resources due to the model size and extended sequence length. 
+ """ + recipe = mixtral_8x7b.finetune_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + + recipe.model = model() + recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) + recipe.data = run.Config(SquadDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) return recipe diff --git a/nemo/collections/llm/recipes/optim/__init__.py b/nemo/collections/llm/recipes/optim/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/optim/__init__.py +++ b/nemo/collections/llm/recipes/optim/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py index d38bbc09d8e6..77472d8a3755 100644 --- a/nemo/collections/llm/recipes/optim/adam.py +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -1,11 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import nemo_run as run from megatron.core.optimizer import OptimizerConfig -from nemo.collections.llm.utils import Config from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, OptimizerModule -def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config[OptimizerModule]: - opt_cfg = Config( +@run.cli.factory +def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> run.Config[OptimizerModule]: + opt_cfg = run.Config( OptimizerConfig, optimizer="adam", lr=max_lr, @@ -20,14 +36,14 @@ def distributed_fused_adam_with_cosine_annealing(max_lr: float = 1e-4) -> Config clip_grad=1.0, ) - sched = Config( + sched = run.Config( CosineAnnealingScheduler, warmup_steps=2000, constant_steps=0, min_lr=0.1 * max_lr, ) - return Config( + return run.Config( MegatronOptimizerModule, config=opt_cfg, lr_scheduler=sched, diff --git a/nemo/collections/llm/recipes/precision/__init__.py b/nemo/collections/llm/recipes/precision/__init__.py index e69de29bb2d1..d9155f923f18 100644 --- a/nemo/collections/llm/recipes/precision/__init__.py +++ b/nemo/collections/llm/recipes/precision/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/recipes/precision/mixed_precision.py b/nemo/collections/llm/recipes/precision/mixed_precision.py index 6a9cb64404ce..3c0332a0b330 100644 --- a/nemo/collections/llm/recipes/precision/mixed_precision.py +++ b/nemo/collections/llm/recipes/precision/mixed_precision.py @@ -1,11 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import nemo_run as run import torch -from nemo.collections.llm.utils import Config from nemo.lightning.pytorch.plugins.mixed_precision import MegatronMixedPrecision -def bf16_mixed_plugin() -> Config[MegatronMixedPrecision]: - return Config( +@run.cli.factory +def bf16_mixed() -> run.Config[MegatronMixedPrecision]: + return run.Config( MegatronMixedPrecision, precision="bf16-mixed", params_dtype=torch.bfloat16, @@ -15,8 +31,9 @@ def bf16_mixed_plugin() -> Config[MegatronMixedPrecision]: ) -def fp16_mixed_plugin() -> Config[MegatronMixedPrecision]: - return Config( +@run.cli.factory +def fp16_mixed() -> run.Config[MegatronMixedPrecision]: + return run.Config( MegatronMixedPrecision, precision="16-mixed", params_dtype=torch.half, diff --git a/nemo/collections/llm/tokenizer.py b/nemo/collections/llm/tokenizer.py index 77320c4b9c02..ef8cc53db7e5 100644 --- a/nemo/collections/llm/tokenizer.py +++ b/nemo/collections/llm/tokenizer.py @@ -12,12 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
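# --- Editorial example (not part of this patch) ------------------------------------
# Hedged usage sketch for the optimizer and precision factories changed above: both
# return run.Config objects, so they can be dropped into any recipe and their fields
# tuned afterwards. Import paths mirror the modules in this diff; the override values
# are illustrative assumptions only.
from nemo.collections.llm.recipes import mixtral_8x7b
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import fp16_mixed

recipe = mixtral_8x7b.pretrain_recipe(name="mixtral_8x7b_fp16", num_nodes=2)

# Swap the default bf16 plugin for fp16 and lower the peak learning rate.
recipe.trainer.plugins = fp16_mixed()
recipe.optim = distributed_fused_adam_with_cosine_annealing(max_lr=1e-4)
recipe.optim.lr_scheduler.warmup_steps = 500  # scheduler fields stay editable on the Config
# ------------------------------------------------------------------------------------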
-from nemo.lightning.io.artifact import FileArtifact +from nemo.lightning.io.artifact import DirOrStringArtifact, FileArtifact from nemo.lightning.io.mixin import track_io __all__ = [] + +def extract_name(cls): + return str(cls).split('.')[-1].rstrip('>').rstrip("'") + + try: + # Track HF tokenizers + from transformers import AutoTokenizer as HfAutoTokenizer + from transformers.models.llama.tokenization_llama import LlamaTokenizer + from transformers.models.llama.tokenization_llama_fast import LlamaTokenizerFast + + for cls in [HfAutoTokenizer, LlamaTokenizer, LlamaTokenizerFast]: + track_io( + cls, + artifacts=[ + FileArtifact(attr_name, required=False) + for attr_name in ['vocab_file', 'merges_file', 'tokenizer_file', 'name_or_path'] + ], + ) + __all__.append(extract_name(cls)) + from nemo.collections.common.tokenizers import AutoTokenizer track_io( @@ -25,6 +45,7 @@ artifacts=[ FileArtifact("vocab_file", required=False), FileArtifact("merges_file", required=False), + DirOrStringArtifact("pretrained_model_name", required=False), ], ) __all__.append("AutoTokenizer") diff --git a/nemo/collections/llm/tools/auto_configurator/__init__.py b/nemo/collections/llm/tools/auto_configurator/__init__.py new file mode 100644 index 000000000000..5c6bde2c285a --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/__init__.py @@ -0,0 +1,2 @@ +from nemo.collections.llm.tools.auto_configurator.core.calculate_performance import get_results +from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator, generate_configs diff --git a/tests/lightning/fabric/__init__.py b/nemo/collections/llm/tools/auto_configurator/core/__init__.py similarity index 100% rename from tests/lightning/fabric/__init__.py rename to nemo/collections/llm/tools/auto_configurator/core/__init__.py diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py new file mode 100644 index 000000000000..ee1579f6f6e8 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -0,0 +1,367 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import TensorBoardLogger + +from nemo import lightning as nl +from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer +from nemo.collections.llm import PreTrainingDataModule +from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class BaseConfig: + def __init__(self, config=None): + """ + Args: + config (AutoConfigurator): auto configurator runner config. + """ + + self.config = config + + self.model = self.get_model() + self.optim = self.get_optim() + self.trainer = self.get_trainer() + self.data = self.get_data() + self.log = self.get_logger() + self.run = self.get_run_config() + self.tokenizer = self.get_tokenizer(config.tokenizer_type, config.tokenizer_path) + + def get_model(self): + """Function that returns model config. + + Returns: + Config: model config. + """ + + self.config.model.seq_length = self.config.seq_length + + return self.config.model + + def get_optim(self) -> Config[OptimizerConfig]: + """Function that returns optimizer config. + + Returns: + Config[OptimizerConfig]: optimizer config. + """ + optim_params = { + "optimizer": "adam", + "lr": 1e-4, + "min_lr": 1e-5, + "use_distributed_optimizer": True, + "bf16": True, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "overlap_grad_reduce": True, + "overlap_param_gather": True, + "clip_grad": 1.0, + "adam_eps": 1e-5, + } + + optim_config = Config( + OptimizerConfig, + **optim_params, + ) + + sched = Config( + CosineAnnealingScheduler, + warmup_steps=10, + constant_steps=0, + min_lr=optim_config.min_lr, + ) + + return Config( + MegatronOptimizerModule, + config=optim_config, + lr_scheduler=sched, + ) + + def get_trainer(self) -> Config[nl.Trainer]: + """Function that returns config for PTL trainer. + + Returns: + Config[nl.Trainer]: trainer config. + """ + + trainer_config = { + "accelerator": "gpu", + "enable_checkpointing": False, + "use_distributed_sampler": False, + "max_epochs": None, + "log_every_n_steps": 1, + "limit_val_batches": 1, + "limit_test_batches": 1, + "accumulate_grad_batches": 1, + "num_nodes": self.config.num_nodes, + "devices": self.config.num_gpus, + "max_steps": self.config.max_steps_per_run, + "val_check_interval": self.config.max_steps_per_run, + } + + strategy = Config( + nl.MegatronStrategy, + pipeline_dtype=torch.bfloat16, + ) + + return Config( + nl.Trainer, + **trainer_config, + strategy=strategy, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + callbacks=[Config(TimingCallback)], + ) + + def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: + """Function that returns the tokenizer config. + + Args: + tokenizer_type (str): tokenizer type. + tokenizer_path (str): path to the tokenizer. + + Returns: + Config: tokenizer config. + """ + + if tokenizer_type == "sentencepiece": + return Config(SentencePieceTokenizer, model_path=tokenizer_path) + else: + return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) + + def get_data(self) -> Config[PreTrainingDataModule]: + """Function that returns dataset config. + + Returns: + Config[PreTrainingDataModule]: data config. 
+ """ + + # Data config + data_config = { + "paths": self.config.data_paths, + "seq_length": self.config.seq_length, + "global_batch_size": self.config.global_batch_size, + "num_workers": 2, + "index_mapping_dir": None, + } + + # Define the tokenizer + tokenizer = self.get_tokenizer( + self.config.tokenizer_type, + self.config.tokenizer_path, + ) + + return Config( + PreTrainingDataModule, + **data_config, + tokenizer=tokenizer, + ) + + def get_logger(self) -> Config[nl.NeMoLogger]: + """Function that returns the training strategy. + + Returns: + Config[nl.NeMoLogger]: NeMo Logger config. + """ + + # Define TensorBoard Logger + tb_logger = Config(TensorBoardLogger, save_dir="tb_logs") + + ckpt = Config( + nl.ModelCheckpoint, + monitor="reduced_train_loss", + save_last=False, + save_top_k=0, + ) + + return Config( + nl.NeMoLogger, + ckpt=ckpt, + tensorboard=tb_logger, + wandb=None, + dir=self.config.path_to_logs, + ) + + def get_run_config(self) -> dict: + """Function that returns config for cluster job. + + Returns: + dict: cluster job config. + """ + + run_config = { + "name": self.config.model.__class__.__name__, + "time_limit": f"0-00:{self.config.max_minutes_per_run}:00", + } + + return run_config + + +def calculate_model_size( + gpu_count: int, + max_training_days: float, + model_size_in_b: float = None, + tflops_per_gpu: int = 140, + num_tokens_in_b: int = 300, + model_name: str = "gpt3", +) -> float: + """Estimates a model size to be trained given the constraints. If the + model_size is provided, it estimates the time to train it with the given + constraints. + + Example: + output 5B params to train for 7 days with 160 GPUs. + + Args: + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + max_training_days (float): number of days to train the model for. + model_size_in_b (float): number of parameters in the model, if known. + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model. + + Returns: + float: number of parameters to use for training. + """ + + # Model size is not known, must be estimated. + if model_size_in_b is None: + model_size_in_b = _estimate_model_size( + max_training_days=max_training_days, + gpu_count=gpu_count, + tflops_per_gpu=tflops_per_gpu, + num_tokens_in_b=num_tokens_in_b, + model_name=model_name, + ) + # Model size is known, so only time to train estimate is needed. + else: + max_training_days = _estimate_training_time( + model_size_in_b=model_size_in_b, + gpu_count=gpu_count, + tflops_per_gpu=tflops_per_gpu, + num_tokens_in_b=num_tokens_in_b, + model_name=model_name, + ) + + print( + f"You can train a {model_size_in_b}B parameter model in " + f"{max_training_days} days using {gpu_count} GPUs. This result assumes " + f"you are training to {num_tokens_in_b}B tokens, and each GPU achieves " + f"{tflops_per_gpu} TFLOPS." + ) + return model_size_in_b + + +def _estimate_model_size( + max_training_days: float, + gpu_count: int, + tflops_per_gpu: int, + num_tokens_in_b: int, + model_name: str, +) -> float: + """Estimates model size given time and hardware constraints. It's only used if the model size is not provided by the user. + + Args: + max_training_days (float): number of days to train the model for. + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. 
+ model_name (str): name of the model, such as gpt3, t5, mt5... + + Returns: + float: number of parameters to use for training. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. + """ + + model_penalty = 0.87 if model_name == "mt5" else 1.0 + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma", "nemotron"] + try: + if model_name in valid_models: + return round( + model_penalty + * (max_training_days * 3600 * 24 * gpu_count * tflops_per_gpu * 1e12) + / (8 * num_tokens_in_b * 1e9) + / 1e9, + 2, + ) + else: + raise NotImplementedError + except ValueError as err: + print(f"Input values were not valid: {err}") + except ZeroDivisionError as err: + print(f"Cannot divide by zero. This can happen if num_tokens_in_b is zero: {err}") + except NotImplementedError as err: + print(f"Model size estimation is only available for {valid_models}: {err}") + return None + + +def _estimate_training_time( + model_size_in_b: float, + gpu_count: int, + tflops_per_gpu: int, + num_tokens_in_b: int, + model_name: str, +) -> float: + """Estimates training time for a given model size and hardware constraint. To be used when a model size is provided by the user. + + Args: + model_size_in_b (float): number of parameters to use for training. + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model, such as gpt3, t5, mt5... + + Returns: + float: number of days it will take to train the model. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. + """ + + model_penalty = 1.15 if model_name == "mt5" else 1.0 + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma", "nemotron"] + try: + if model_name in valid_models: + return round( + model_penalty + * (model_size_in_b * 1e9 * 8 * num_tokens_in_b * 1e9) + / (3600 * 24 * gpu_count * tflops_per_gpu * 1e12), + 2, + ) + else: + raise NotImplementedError + except ValueError as err: + print(f"Input values were not valid: {err}") + except ZeroDivisionError as err: + print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") + except NotImplementedError as err: + print(f"Training time estimation is only available for {valid_models}: {err}") + return None diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py new file mode 100644 index 000000000000..5b7ac0ebc4d3 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -0,0 +1,334 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
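+"""Collects and ranks Auto Configurator candidate results from their TensorBoard logs.
+
+A minimal usage sketch (the path below is hypothetical; ``base_config`` and ``runner_config``
+stand for the objects produced by the Auto Configurator runner):
+
+    from nemo.collections.llm.tools.auto_configurator.core.calculate_performance import get_results
+
+    get_results(
+        base_config=base_config,       # BaseConfig describing the launched model/data settings
+        train_config=runner_config,    # AutoConfigurator runner config (model type, size, nodes, ...)
+        path_to_save="/results/grid",  # holds one sub-directory with tb_logs per candidate run
+        output_top_n=5,                # print the five fastest configs
+    )
+"""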
+
+import os
+import re
+from typing import Optional
+
+import pandas as pd
+from tensorboard.backend.event_processing import event_accumulator
+
+
+def get_results(
+    base_config=None,
+    train_config=None,
+    path_to_save: str = None,
+    output_top_n: Optional[int] = 10,
+):
+    """Generates performance results.
+
+    Args:
+        base_config (BaseConfig): base configuration of the model that was launched.
+        train_config (AutoConfigurator): auto configurator runner config.
+        path_to_save (str): path where the training logs were saved and where the performance results will be stored.
+        output_top_n (Optional[int]): number of configs to be printed out as best configs.
+    """
+
+    # Define needed variables
+    model_name = train_config.model_type
+    model_size = train_config.model_size_in_b
+    global_batch_size = base_config.data.global_batch_size
+    seq_length = base_config.data.seq_length
+
+    vocab_size = train_config.vocab_size
+    num_nodes = train_config.num_nodes
+    gpus_per_node = train_config.gpus_per_node
+
+    layers = base_config.model.num_layers
+    hs = base_config.model.hidden_size
+    ffn_hs = base_config.model.ffn_hidden_size
+
+    training_logs = path_to_save
+    final_result_logs = path_to_save
+
+    result_columns = [
+        "Model Name",
+        "Model Size",
+        "Seq Length",
+        "TP",
+        "PP",
+        "CP",
+        "EP",
+        "MBS",
+        "Act Ckpt Layers",
+        "Act Ckpt Micro Batches",
+        "Act Ckpt Layers per Pipeline",
+        "Num Layers",
+        "Hidden Size",
+        "FFN Hidden Size",
+        "GBS",
+        "Nodes",
+        "GPUs per Node",
+        "Time per Step",
+        "Samples per Second",
+        "Model TFLOPS / GPU",
+        "Model TFLOPS Aggregate",
+    ]
+    error_columns = [
+        "Model Name",
+        "Model Size",
+        "Seq Length",
+        "TP",
+        "PP",
+        "CP",
+        "EP",
+        "MBS",
+        "Act Ckpt Layers",
+        "Act Ckpt Micro Batches",
+        "Act Ckpt Layers per Pipeline",
+        "Num Layers",
+        "Hidden Size",
+        "FFN Hidden Size",
+        "GBS",
+        "Nodes",
+        "GPUs per Node",
+        "Error Message",
+    ]
+    result = []
+    errors = []
+    dirs = [f.path for f in os.scandir(training_logs) if f.is_dir()]
+
+    for candidate_dir in dirs:
+        logs_dir = os.path.join(training_logs, candidate_dir, "tb_logs/lightning_logs")
+        logs_folder = [f.path for f in os.scandir(logs_dir) if f.is_dir()][0]
+        tp, pp, cp, ep, mbs, act_ckpt, num_mbs_act, act_per_pipe = get_config(candidate_dir)
+
+        for f in os.listdir(logs_folder):
+            if f.endswith("0.txt"):
+                error_file = os.path.join(logs_folder, f)
+                error = find_error(error_file)
+                if error:
+                    errors.append(
+                        [
+                            model_name,
+                            model_size,
+                            seq_length,
+                            tp,
+                            pp,
+                            cp,
+                            ep,
+                            mbs,
+                            act_ckpt,
+                            num_mbs_act,
+                            act_per_pipe,
+                            layers,
+                            hs,
+                            ffn_hs,
+                            global_batch_size,
+                            num_nodes,
+                            gpus_per_node,
+                            error,
+                        ]
+                    )
+
+        files = os.listdir(logs_folder)
+        for f in files:
+            if f.startswith("events"):
+                event_file = os.path.join(logs_folder, f)
+                ea = event_accumulator.EventAccumulator(event_file)
+                ea.Reload()
+                try:
+                    timing_list = ea.Scalars("train_step_timing in s")
+                    if len(timing_list) <= 6:
+                        continue
+                    timing_list = [x.value for x in timing_list[5:]]
+                    avg_global_step_time = round(sum(timing_list) / len(timing_list), 4)
+                    samples_per_s = round(global_batch_size / avg_global_step_time, 2)
+                    m_tflops, m_tflops_gpu = calculate_tflops(
+                        model_name=model_name,
+                        gbs=global_batch_size,
+                        enc_seq_len=seq_length,
+                        dec_seq_len=seq_length,
+                        hs=hs,
+                        ffn_hs=ffn_hs,
+                        layers=layers,
+                        vocab=vocab_size,
+                        nodes=num_nodes,
+                        gpus_per_node=gpus_per_node,
+                        time_per_step=avg_global_step_time,
+                    )
+                    config_name = f"tp{tp}_pp{pp}_cp{cp}_ep{ep}_mbs{mbs}_act_{act_ckpt}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}"
+                    result.append(
+                        [
+                            model_name,
+                            model_size,
+                            seq_length,
+                            tp,
+                            pp,
+                            cp,
+                            ep,
+                            mbs,
+                            act_ckpt,
+                            num_mbs_act,
+                            act_per_pipe,
+                            layers,
+                            hs,
+                            ffn_hs,
+                            global_batch_size,
+                            num_nodes,
+                            gpus_per_node,
+                            avg_global_step_time,
+                            samples_per_s,
+                            m_tflops_gpu,
+                            m_tflops,
+                        ]
+                    )
+                except Exception:
+                    # Skip event files that do not contain the expected timing scalar.
+                    continue
+    result.sort(key=lambda x: x[17])
+    print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:")
+    for i, res in enumerate(result):
+        config_name = f"tp_{res[3]}_pp_{res[4]}_cp_{res[5]}_ep_{res[6]}_mbs_{res[7]}_act_ckpt_{res[8]}_num_mbs_act_{res[9]}_act_per_pipe_{res[10]}"
+        print(f"Config #{i+1}: {config_name} with {res[17]:.4f}s per global step.")
+        if i + 1 == output_top_n:
+            break
+
+    top_config = f"{model_name}_{model_size}b_{num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}"
+    print("\n==================================================")
+    print(f"Optimal config: {top_config} with {result[0][17]:.4f}s per global step.")
+    print("==================================================\n")
+
+    # Save results as a CSV file.
+    os.makedirs(final_result_logs, exist_ok=True)
+    result_df = pd.DataFrame(result, columns=result_columns)
+    result_df.to_csv(os.path.join(final_result_logs, f"final_summary_{num_nodes}nodes.csv"), index=False)
+
+    error_df = pd.DataFrame(errors, columns=error_columns)
+    error_df.to_csv(os.path.join(final_result_logs, f"failed_jobs_{num_nodes}nodes.csv"), index=False)
+
+
+def calculate_tflops(
+    model_name,
+    gbs,
+    enc_seq_len,
+    dec_seq_len,
+    hs,
+    ffn_hs,
+    layers,
+    vocab,
+    nodes,
+    gpus_per_node,
+    time_per_step,
+):
+    """Calculates model and hardware TFLOPS for each model.
+
+    GPT-3 Formula:
+        Model FLOPs = (24 * B * s * h^2 + 4 * B * s^2 * h) * (3 * num_layers) + 6 * B * s * h * V,
+        where B is the global batch size, s the sequence length, h the hidden size and V the vocab size.
+    T5/mT5 Formula:
+        Model FLOPs are accumulated per layer from the encoder/decoder self-attention, cross-attention and MLP terms below.
+    Bert Formula:
+        Model FLOPs = 72 * B * L * s * h^2 * (1 + s / (6 * h) + V / (12 * h * L))
+    """
+
+    if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral"]:
+        # Model FLOPS calculation
+        model_flops = (
+            (24 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs) * (3 * layers)
+            + (6 * gbs * enc_seq_len * hs * vocab)
+        ) / time_per_step
+        model_flops_per_gpu = model_flops / (nodes * gpus_per_node)
+
+        model_tflops = model_flops / 1e12
+        model_tflops_per_gpu = model_flops_per_gpu / 1e12
+
+    elif model_name == "bert":
+        model_flops = (
+            72 * gbs * layers * enc_seq_len * hs * hs * (1 + (enc_seq_len / (6 * hs)) + (vocab / (12 * hs * layers)))
+        ) / time_per_step
+        model_flops_per_gpu = model_flops / (nodes * gpus_per_node)
+        model_tflops = model_flops / 1e12
+        model_tflops_per_gpu = model_flops_per_gpu / 1e12
+
+    elif model_name in ["t5", "mt5"]:
+        # Encoder Layer FLOPS: include self attention + MLP
+        flops_self_attn_enc = 8 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs
+        flops_mlp_enc = 6 * gbs * enc_seq_len * hs * ffn_hs  # geglu needs two gemms for h -> ffn_h
+        flops_enc_layer = flops_self_attn_enc + flops_mlp_enc
+
+        # Decoder Layer FLOPS: include self_attn + cross_attn + MLP
+        flops_self_attn_dec = 8 * gbs * dec_seq_len * hs * hs + 4 * gbs * dec_seq_len * dec_seq_len * hs
+        flops_cross_attn_dec = (
+            4 * gbs * enc_seq_len * hs * hs
+            + 4 * gbs * dec_seq_len * hs * hs
+            + 4 * gbs * enc_seq_len * dec_seq_len * hs
+        )
+        flops_mlp_dec = 6 * gbs * dec_seq_len * hs * ffn_hs  # geglu needs two gemms for h -> ffn_h
+        flops_dec_layer = flops_self_attn_dec + flops_cross_attn_dec + flops_mlp_dec
+
+        # FLOPs of logits layer in the head
+        flops_logits = 2 * gbs * dec_seq_len * hs * vocab
+
+        # FLOPs of fprop
+        flops_fprop = (flops_enc_layer + flops_dec_layer) * (layers // 2) + flops_logits
+
+        # FLOPs of each train step (FLOPs of bprop is
2*fprop) + model_flops = 3 * flops_fprop / time_per_step + model_flops_per_gpu = model_flops / (nodes * gpus_per_node) + model_tflops = model_flops / 1e12 + model_tflops_per_gpu = model_flops_per_gpu / 1e12 + + else: + raise NotImplementedError("Model type not supported.") + return round(model_tflops, 2), round(model_tflops_per_gpu, 2) + + +def find_error(error_file: str, errors: list = ["CUDA out of memory"]): + """Function that finds the error among job output. + + Args: + errors (list): list of "popular" errors. + error_file (str): path to the job output. + + Returns: + str: serror message if job has been failed because of one of listed errors or None if not. + """ + + error = None + with open(error_file, "r") as f: + output = f.read() + for e in errors: + if e in output: + error = e + return error + + +def get_config(run_name: str) -> tuple: + """Function that extract model parallelism parameters + + Args: + run_name (str): name of the run. + + Returns: + tuple: model parallelism parameters. + """ + pattern = r'_(tp|pp|cp|ep|mbs|act_ckpt|num_mbs_act|act_per_pipe)_([^_]+)' + + # Find all matches in the input string + matches = re.findall(pattern, run_name) + + # Convert matches to a dictionary + params = {param: value for param, value in matches} + + return ( + params["tp"], + params["pp"], + params["cp"], + params["ep"], + params["mbs"], + params["act_ckpt"], + params["num_mbs_act"], + params["act_per_pipe"], + ) + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py new file mode 100644 index 000000000000..087bf3c6fb0e --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -0,0 +1,892 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Tuple + +from nemo.collections.llm.tools.auto_configurator.core import utils + + +GPT_BASED_MODELS = [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + + +def generate_grid_search_configs( + base_cfg: dict, + train_cfg: dict, +) -> Tuple[dict, dict]: + """Generates the grid of all possible configurations for the given model, and stores each different configuration in a yaml file. + + Args: + base_cfg (dict): base configuration of the model to be trained. + train_cfg (dict): train configuration of the model to be trained. + + Returns: + dict: base config. + dict: generated configs. + """ + + model_name = train_cfg.model_type + model_size_in_b = train_cfg.model_size_in_b + + # 2 * num_layers is needed because of encoder/decoder architecture. 
+ multiplier = 1 if model_name in GPT_BASED_MODELS else 2 + + seq_length = base_cfg.model.seq_length + num_layers = base_cfg.model.num_layers if model_name in GPT_BASED_MODELS else base_cfg.model.encoder.num_layers + + if model_name in GPT_BASED_MODELS: + act_method = None + else: + act_method = base_cfg.model.encoder.activations_checkpoint_method + + params = _calculate_tp_pp_mbs_grid( + model_size_in_b=model_size_in_b, + num_layers=num_layers, + model_name=model_name, + seq_length=seq_length, + train_cfg=train_cfg, + ) + + max_minutes = train_cfg.max_minutes_per_run + max_steps = train_cfg.max_steps_per_run + num_nodes = train_cfg.num_nodes + + valid_tp_pp_list = [] + for tp in params.tp: + for pp in params.pp: + for cp in params.cp: + for ep in params.ep: + for mbs in params.mbs: + num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices + base_cfg.data.global_batch_size = params.gbs + if model_name in GPT_BASED_MODELS: + att_heads = base_cfg.model.num_attention_heads + num_layers = base_cfg.model.num_layers + else: + att_heads = base_cfg.model.encoder.num_attention_heads + num_layers = base_cfg.model.encoder.num_layers + model_parallelism = (tp * pp * cp * ep) if (cp and ep) else (tp * pp) + mod_gbs = params.gbs % (mbs * num_gpus / model_parallelism) + mod_att_heads = att_heads % tp + mod_layers = (multiplier * num_layers) % pp + mod_cp = cp if cp else 1 + mod_ep = ep if ep else 1 + if ( + mod_gbs == 0 + and mod_att_heads == 0 + and mod_layers == 0 + and (tp, pp, cp, ep) not in valid_tp_pp_list + and (mod_cp // mod_ep == mod_cp or mod_ep // mod_cp == mod_ep) + and params.min_model_parallel <= model_parallelism <= params.max_model_parallel + ): + valid_tp_pp_list.append((tp, pp, cp, ep)) + + # Generate grid search configs. + configs = {} + for tp, pp, cp, ep in valid_tp_pp_list: + ( + virtual_pipelines, + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) = _set_activations_checkpoint_params( + tp, + pp, + cp, + ep, + num_layers, + act_method, + multiplier, + model_size_in_b, + model_name, + ) + for mbs in params.mbs: + kwargs = { + "base_cfg": base_cfg, + "act": None, + "num_mbs_act": None, + "act_per_pipe": None, + "tp": tp, + "pp": pp, + "cp": cp, + "ep": ep, + "virtual_pipelines": virtual_pipelines, + "mbs": mbs, + "max_minutes": max_minutes, + "max_steps": max_steps, + "num_nodes": num_nodes, + "model_name": model_name, + "model_size": model_size_in_b, + } + if act_ckpt_layers[0] is not None: + if act_layers is not None and act_layers != "auto": + act_ckpt_layers = act_layers + for act in act_ckpt_layers: + for num_mbs_act in num_micro_batches_partial_act_ckpt: + for act_per_pipe in act_ckpt_layers_per_pipeline: + kwargs["act"] = act + kwargs["num_mbs_act"] = num_mbs_act + kwargs["act_per_pipe"] = act_per_pipe + new_cfg = utils.modify_cfg(**kwargs) + if new_cfg: # Save candidate cfg. + configs[new_cfg["run"]["name"]] = new_cfg + else: + new_cfg = utils.modify_cfg(**kwargs) + if new_cfg: # Save candidate cfg. + config_name = new_cfg["run"]["name"] + new_cfg.pop("run") + configs[config_name] = new_cfg + + print(f"\nAll candidate configurations created correctly. 
Total number of configs: {len(configs)}.\n") + return base_cfg, configs + + +def _set_activations_checkpoint_params( + tp, pp, cp, ep, num_layers, act_method, multiplier, model_size_in_b, model_name +): + act_multiple = 4 // pp + if act_method == "block": + if 1.0 <= model_size_in_b < 11.3: + act_multiple = 8 // pp + elif 11.3 <= model_size_in_b < 26.0: + act_multiple = 16 // pp + elif 26.0 <= model_size_in_b < 60.0: + act_multiple = 16 // pp + elif 60.0 <= model_size_in_b: + act_multiple = 32 // pp + act_multiple = max(act_multiple, 1) + + virtual_pipelines = None + # Num micro batches with partial act ckpt + min_micro_b = 0 # 0 will not be used, minimum will be set to 1 later in the code. + max_micro_b = pp + interval_micro_b = 1 + # Act ckpt layers per pipeline + min_layers_per_pipe = 0 + max_layers_per_pipe = num_layers + interval_layers_per_pipe = act_multiple + if model_name in GPT_BASED_MODELS and pp > 2: # Interleaved pipeline scheduling. + virtual_pipelines = num_layers // pp # TODO: verify that this is the best value. + act_multiple = 1 + max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1 + interval_micro_b = virtual_pipelines * 8 + max_layers_per_pipe = multiplier * num_layers // pp // virtual_pipelines + 1 + + ( + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) = ([None], [None], [None]) + if act_method == "block": + # Act ckpt num layers + if virtual_pipelines is None: + act_ckpt_layers = range(0, multiplier * num_layers // pp + 1, act_multiple) + else: + act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple) + + if pp > 1 and model_name in GPT_BASED_MODELS: + # Num micro batches with partial act ckpt + num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b)) + if num_micro_batches_partial_act_ckpt[0] == 0: + num_micro_batches_partial_act_ckpt[0] = 1 + + # Act ckpt layers per pipeline + act_ckpt_layers_per_pipeline = range( + min_layers_per_pipe, max_layers_per_pipe + 1, interval_layers_per_pipe + ) + + return ( + virtual_pipelines, + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) + + +@dataclass +class GPT3GridSearch: + """Selects grid search space for TP, PP, CP, EP, MBS parameters for GPT-3 and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. 
+ """ + + model_size_in_b: int + valid_pp: List[int] + seq_length: int + gpu_memory_gb: int + + tp = [1, 2, 4, 8] + pp = [1] + cp = [1] + ep = [1] + mbs = [1, 2, 4, 8] + + gbs: int = 1024 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if seq_length == 2048: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [1, 2, 4, 8] + self.gbs = 2048 + elif model_size_in_b <= 23.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 8 + self.gbs = 2048 + elif model_size_in_b <= 45.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 95: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 130.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 16] + self.mbs = [1, 2, 4, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 195.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 16] + self.mbs = [1, 2, 4] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 395.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 32] + self.mbs = [1, 2, 4] + self.min_model_parallel = 64 + self.max_model_parallel = 512 + self.gbs = 2048 + elif model_size_in_b <= 790.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 100] + self.mbs = [1, 2, 4] + self.min_model_parallel = 128 + self.max_model_parallel = 1024 + self.gbs = 2048 + elif model_size_in_b <= 1100.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 16 <= x <= 130] + self.mbs = [1, 2, 4] + self.min_model_parallel = 256 + self.max_model_parallel = 2048 + self.gbs = 2048 + elif seq_length == 4096: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 128 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 512 + elif model_size_in_b <= 8.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1024 + elif model_size_in_b <= 45.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1, 2] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1024 + elif model_size_in_b <= 95: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 1024 + elif seq_length == 8192: 
+ if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 64 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 128 + elif model_size_in_b <= 8.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 256 + elif model_size_in_b <= 45.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 8] + self.mbs = [1] + self.min_model_parallel = 32 + self.max_model_parallel = 64 + self.gbs = 256 + elif seq_length == 16384: + if model_size_in_b <= 1.0: + self.tp = [2, 4] + self.mbs = [1, 2] + self.gbs = 32 + elif model_size_in_b <= 4.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 8.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 128 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 128 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 128 + elif seq_length == 32768: + if model_size_in_b <= 1.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 16 + elif model_size_in_b <= 4.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 32 + elif model_size_in_b <= 8.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 13.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 23.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 16 + self.max_model_parallel = 32 + self.gbs = 64 + elif gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4, 8] + self.mbs = [1, 2, 4, 8] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.pp = [1, 2] + self.mbs = [1, 2, 4] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [4, 8] + self.pp = [1, 2, 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 23.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 45.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 12] + self.mbs = [1, 2, 4] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 
95: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 16] + self.mbs = [1, 2, 4] + self.min_model_parallel = 16 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 130.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 26] + self.mbs = [1, 2] + self.min_model_parallel = 32 + self.max_model_parallel = 512 + self.gbs = 2048 + elif model_size_in_b <= 195.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 32] + self.mbs = [1, 2] + self.min_model_parallel = 64 + self.max_model_parallel = 1024 + self.gbs = 2048 + elif model_size_in_b <= 395.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 64] + self.mbs = [1, 2] + self.min_model_parallel = 128 + self.max_model_parallel = 2048 + self.gbs = 2048 + elif model_size_in_b <= 790.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 128] + self.mbs = [1, 2] + self.min_model_parallel = 256 + self.max_model_parallel = 4096 + self.gbs = 2048 + elif model_size_in_b <= 1100.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 192] + self.mbs = [1, 2] + self.min_model_parallel = 512 + self.max_model_parallel = 8192 + self.gbs = 2048 + + +@dataclass +class T5GridSearch: + """Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. + """ + + model_size_in_b: int + seq_length: int + gpu_memory_gb: int + valid_pp: List[int] + + tp = [1, 2, 4, 8] + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8, 12, 16] + + gbs: int = 1920 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.mbs = [16, 32, 64, 128] + self.gbs = 2048 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [4, 6, 8, 12, 16, 24, 32, 48] + self.gbs = 1920 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [4, 6, 8, 12, 16, 24, 32] + self.gbs = 1920 + elif model_size_in_b <= 14.5: + self.tp = [4, 8] + self.mbs = [2, 4, 6, 8, 12, 16, 24] + self.gbs = 1920 + elif model_size_in_b <= 25.9: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1920 + elif model_size_in_b <= 43.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 85.5: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 64 + self.gbs = 1920 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 16] + self.mbs = [1, 2, 4, 6] + self.min_model_parallel = 32 + self.max_model_parallel = 128 + self.gbs = 1920 + elif model_size_in_b <= 250: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 32] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 64 + self.max_model_parallel = 256 + self.gbs = 1920 + elif 
gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.mbs = [16, 32, 64, 128] + self.gbs = 2048 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [4, 8, 12, 16, 24, 32, 48] + self.gbs = 1920 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [4, 6, 8, 12, 16, 24] + self.gbs = 1920 + elif model_size_in_b <= 14.5: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [2, 4, 6, 8, 12, 16] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1920 + elif model_size_in_b <= 25.9: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 43.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 85.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 32 + self.max_model_parallel = 64 + self.gbs = 1920 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 32] + self.mbs = [1, 2, 4] + self.min_model_parallel = 64 + self.max_model_parallel = 128 + self.gbs = 1920 + elif model_size_in_b <= 250: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 64] + self.mbs = [1, 2, 4] + self.min_model_parallel = 128 + self.max_model_parallel = 256 + self.gbs = 1920 + + +@dataclass +class BertGridSearch: + """Selects grid search space for TP, PP, MBS parameters for BERT and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq_length (int): sequence length to use for training. + gpu_memory_gb (int): size of GPU memory in GB. 
+ """ + + model_size_in_b: int + seq_length: int + gpu_memory_gb: int + valid_pp: List[int] + + tp = [1, 2, 4, 8] + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8, 12, 16] + + gbs: int = 1920 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b + gpu_memory_gb = self.gpu_memory_gb + seq_length = self.seq_length + + if gpu_memory_gb == 80: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 3, 4, 6] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 25.0: + self.tp = [4, 8] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 4 + self.gbs = 2048 + elif model_size_in_b <= 46.5: + self.tp = [4, 8] + self.pp = [1, 2, 4] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 2048 + elif model_size_in_b <= 87.5: + self.tp = [4, 8] + self.pp = [2, 4, 6, 8] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 165.5: + self.tp = [4, 8] + self.pp = [4, 6, 8, 16] + self.mbs = [2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 250.5: + self.tp = [8] + self.pp = [4, 8, 16, 32] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + elif gpu_memory_gb == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4, 8] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 25.0: + self.tp = [2, 4, 8] + self.pp = [1, 2] + self.mbs = [1, 2, 4] + self.min_model_parallel = 2 + self.max_model_parallel = 16 + self.gbs = 2048 + elif model_size_in_b <= 46.5: + self.tp = [4, 8] + self.pp = [1, 2, 4, 8] + self.mbs = [1, 2, 3] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 87.5: + self.tp = [4, 8] + self.pp = [2, 4, 6, 8] + self.mbs = [1, 2, 3] + self.min_model_parallel = 16 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [4, 6, 8, 16] + self.mbs = [1, 2] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 250.5: + self.tp = [8] + self.pp = [8, 16, 32] + self.mbs = [1, 2] + self.min_model_parallel = 64 + self.max_model_parallel = 512 + self.gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + + +def _calculate_tp_pp_mbs_grid( + model_size_in_b: float, + num_layers: int, + model_name: str, + seq_length: int, + train_cfg: dict, +) -> Tuple[int, int, int]: + """Selects grid search space for TP, PP, MBS parameters for any model, and calls the necessary heuristics function accordingly. + + Args: + model_size_in_b (float): number of parameters in the model. + num_layers (int): number of layers in the model config. 
+ model_name (str): name of the model to be used, such as gpt3, t5, mt5... + seq_length (int): sequence length to use for training. + train_cfg (dict): config of the model that will be launched. + + Returns: + dataclass object with model parallelism parameters. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. + """ + + tp_sizes = train_cfg.tensor_parallel_sizes + pp_sizes = train_cfg.pipeline_parallel_sizes + cp_sizes = train_cfg.context_parallel_sizes + ep_sizes = train_cfg.expert_parallel_sizes + min_model_parallel_size = train_cfg.min_model_parallel_size + max_model_parallel_size = train_cfg.max_model_parallel_size + mbs_sizes = train_cfg.micro_batch_sizes + gbs_size = train_cfg.global_batch_size + gpu_memory_gb = train_cfg.gpu_memory_gb + multiplier = 1 if model_name in GPT_BASED_MODELS else 2 + init_pp = [] if model_name in GPT_BASED_MODELS else [1] + valid_pp = init_pp + [ + multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 + ] # Only divisors of num_layers are possible. + + kwargs = { + "model_size_in_b": model_size_in_b, + "valid_pp": valid_pp, + "seq_length": seq_length, + "gpu_memory_gb": gpu_memory_gb, + } + + if model_name in GPT_BASED_MODELS: + search_class = GPT3GridSearch + elif model_name in ["t5", "mt5"]: + search_class = T5GridSearch + elif model_name == "bert": + search_class = BertGridSearch + else: + raise NotImplementedError("Model name not implemented.") + + params = search_class(**kwargs) + params.init_params() + + # Override the tp, pp, mbs search if indicated in the config params. + if tp_sizes is not None and tp_sizes != "auto": + params.tp = tp_sizes + if pp_sizes is not None and pp_sizes != "auto": + params.pp = pp_sizes + if cp_sizes is not None and cp_sizes != "auto": + params.cp = cp_sizes + if ep_sizes is not None and ep_sizes != "auto": + params.ep = ep_sizes + if mbs_sizes is not None and mbs_sizes != "auto": + params.mbs = mbs_sizes + if gbs_size is not None and gbs_size != "auto": + params.gbs = gbs_size + if min_model_parallel_size is not None and min_model_parallel_size != "auto": + params.min_model_parallel = min_model_parallel_size + if max_model_parallel_size is not None and max_model_parallel_size != "auto": + params.max_model_parallel = max_model_parallel_size + return params diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py new file mode 100644 index 000000000000..3441c7cdbf9b --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -0,0 +1,470 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
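+"""Shared Auto Configurator heuristics: base-config derivation and per-candidate config overrides.
+
+A small sketch of the model-size heuristic (the values below are illustrative only):
+
+    from nemo.collections.llm.tools.auto_configurator.core.utils import ModelSizeParams
+
+    params = ModelSizeParams(model_size_in_b=5.0, vocab_size=51200, seq_length=2048, model_name="gpt3")
+    params.init_params()
+    # params.layers, params.hs, params.att_h and params.lr now hold the derived layer count,
+    # hidden size, attention head count and learning rate for a model of roughly this size.
+"""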
+ +from dataclasses import dataclass + + +GPT_BASED_MODELS = [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + + +@dataclass +class ModelSizeParams: + """Calculates the parameters that affect model_size: hidden size, attention heads, KV channels, and FFN size. It also calculates the learning rate. + + Args: + model_size_in_b (float): number of parameters in the desired model config, in billions. + vocab_size (int): size of the vocabulary to use for training. + seq_length (int): sequence length to be used during training. + model_name (str): name of the model to be trained, i.e. gpt3, t5, mt5... + + Raises: + ValueError: if the model size is larger than the max supported model size. + NotImplementedError: if the model name is not supported. + """ + + model_size_in_b: float + vocab_size: int + seq_length: int + model_name: str + + # Model size params + layers: int = None + hs: int = None + att_h: int = None + ffn: int = None + kv: int = None + lr: float = None + + def init_params(self): + model_name = self.model_name + model_size_in_b = self.model_size_in_b + if model_name in GPT_BASED_MODELS: + if model_size_in_b < 0.25: + self.hs, self.att_h, self.lr = 768, 12, 6e-4 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.lr = 1024, 16, 3e-4 + elif model_size_in_b < 1: + self.hs, self.att_h, self.lr = 1536, 16, 2.5e-4 + elif model_size_in_b < 2: + self.hs, self.att_h, self.lr = 2048, 16, 2e-4 + elif model_size_in_b < 3: + self.hs, self.att_h, self.lr = 2560, 32, 1.6e-4 + elif model_size_in_b < 4.5: + self.hs, self.att_h, self.lr = 3072, 32, 1.4e-4 + elif model_size_in_b < 8: + self.hs, self.att_h, self.lr = 4096, 32, 1.2e-4 + elif model_size_in_b < 15: + self.hs, self.att_h, self.lr = 5120, 40, 1e-4 + elif model_size_in_b < 25: + self.hs, self.att_h, self.lr = 6144, 48, 1e-4 + elif model_size_in_b < 52: + self.hs, self.att_h, self.lr = 8192, 64, 0.8e-4 + elif model_size_in_b < 105: + self.hs, self.att_h, self.lr = 10240, 80, 0.7e-4 + elif model_size_in_b < 205: + self.hs, self.att_h, self.lr = 12288, 96, 0.6e-4 + elif model_size_in_b < 405: + self.hs, self.att_h, self.lr = 20480, 128, 0.5e-4 + elif model_size_in_b < 805: + self.hs, self.att_h, self.lr = 20480, 128, 0.4e-4 + elif model_size_in_b < 1105: + self.hs, self.att_h, self.lr = 25600, 160, 0.3e-4 + else: + raise ValueError("Model_size for GPT-3 must be smaller than 1.1T parameters.") + elif model_name == "t5": + self.kv, self.lr = 64, 1e-4 + if model_size_in_b < 0.1: + self.hs, self.att_h, self.ffn = 512, 6, 1024 + elif model_size_in_b < 0.4: + self.hs, self.att_h, self.ffn = 768, 12, 2048 + elif model_size_in_b < 1: + self.hs, self.att_h, self.ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + self.hs, self.att_h, self.ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + self.hs, self.att_h, self.ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + self.hs, self.att_h, self.ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + self.hs, self.att_h, self.ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + self.hs, self.att_h, self.ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + self.hs, self.att_h, self.ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + self.hs, self.att_h, self.ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for T5 must be smaller than 250B parameters.") + elif model_name == "mt5": + self.kv, self.lr = 64, 1e-4 + if model_size_in_b < 0.25: + self.hs, self.att_h, self.ffn = 512, 6, 1024 + 
elif model_size_in_b < 0.5: + self.hs, self.att_h, self.ffn = 768, 12, 2048 + elif model_size_in_b < 1.2: + self.hs, self.att_h, self.ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + self.hs, self.att_h, self.ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + self.hs, self.att_h, self.ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + self.hs, self.att_h, self.ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + self.hs, self.att_h, self.ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + self.hs, self.att_h, self.ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + self.hs, self.att_h, self.ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + self.hs, self.att_h, self.ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for mT5 must be smaller than 250B parameters.") + elif model_name == "bert": + self.lr = 1e-4 + if model_size_in_b < 0.25: + self.hs, self.att_h, self.lr = 768, 12, 2e-4 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.lr = 1024, 16, 2e-4 + elif model_size_in_b < 1: + self.hs, self.att_h = 1536, 16 + elif model_size_in_b < 2: + self.hs, self.att_h = 2048, 16 + elif model_size_in_b < 3: + self.hs, self.att_h = 2560, 32 + elif model_size_in_b < 4.5: + self.hs, self.att_h = 2560, 32 + elif model_size_in_b < 8: + self.hs, self.att_h = 4096, 32 + elif model_size_in_b < 15: + self.hs, self.att_h = 5120, 40 + elif model_size_in_b <= 25: + self.hs, self.att_h = 6144, 48 + elif model_size_in_b <= 46.5: + self.hs, self.att_h = 7680, 48 + elif model_size_in_b <= 87.5: + self.hs, self.att_h = 9216, 96 + elif model_size_in_b <= 165.5: + self.hs, self.att_h = 9216, 96 + elif model_size_in_b <= 250.5: + self.hs, self.att_h = 12288, 96 + else: + raise ValueError("Model_size for BERT must be smaller than 25B parameters.") + self.ffn = 4 * self.hs + else: + raise NotImplementedError("Model name is not valid.") + + # Try powers of 2 + margin = 0.01 + for attempt in range(0, 10): + for layers in (2**p for p in range(1, 10)): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 16 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(16, 201, 16): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 2 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(2, 201, 2): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. 
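+
+        # The remaining searches below behave the same way as the ones above: they keep
+        # widening the accepted size window and only set self.layers if no earlier pass
+        # has already found a matching layer count.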
+ + # Try multiples of 5 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(5, 201, 5): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try any valid number + margin = 0.01 + for attempt in range(0, 10): + for layers in range(1, 200): + out_size = _calculate_model_size( + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, + num_layers=layers, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + if not self.layers: + raise Exception("Number of layers not found, config is not possible.") + + +def _calculate_model_size( + vocab_size: int = None, + seq_length: int = None, + hidden_size: int = None, + num_layers: int = None, + ffn_size: int = None, + kv_channels: int = None, + att_heads: int = None, + model_name: str = "gpt3", +): + """Calculates the model size (number of parameters in billions), given the model parameters and name. + + Args: + vocab_size (int): vocabulary size to be used during training. + seq_length (int): input sequence length to be used during training. + hidden_size (int): size of the hidden layers of the model. + num_layers (int): number of layers in the model. + ffn_size (int): FFN size of the model. + kv_channels (int): number of KV channels in the transformer layers. + att_heads (int): number of attention heads in the transformer layers. + model_name (str): name of the model, i.e gpt3, t5, mt5... + + Returns: + float: size of the model in billions of parameters. + + Raises: + NotImplementedError: if the model name is not valid. + """ + + if model_name in GPT_BASED_MODELS: + model_size = ( + 12 + * num_layers + * hidden_size**2 + * (1 + (13 / (12 * hidden_size)) + ((vocab_size + seq_length) / (12 * num_layers * hidden_size))) + / 1e9 + ) + elif model_name in ["t5", "mt5"]: + # 2 L F + 3 L P + H (2 + 4 L F + L (21 + 12 P) + 1 S + 1 V) + proj_size = att_heads * kv_channels + model_size = ( + 2 * num_layers * 1.5 * ffn_size + + 3 * num_layers * proj_size + + hidden_size + * (2 + 4 * num_layers * 1.5 * ffn_size + num_layers * (21 + 12 * proj_size) + seq_length + vocab_size) + ) / 1e9 + elif model_name == "bert": + model_size = ( + num_layers * (ffn_size + hidden_size * (4 * hidden_size + 3 * att_heads + 2 * ffn_size + 6)) + + hidden_size * (vocab_size + seq_length + hidden_size + 5) + ) / 1e9 + + else: + raise NotImplementedError("Model name is not valid.") + + return model_size + + +def generic_base_config(config) -> dict: + """Generates a base config dictionary from a base config python file. + + Args: + config (AutoConfigurator): config object for the Auto Configurator tool. + + Returns: + BaseConfig: base configuration for the model. + AutoConfigurator: config object for the Auto Configurator tool. 
+ """ + + from nemo.collections.llm.tools.auto_configurator.core.base_config import BaseConfig, calculate_model_size + + default_model = False if config.model_size_in_b else True + + model_size_in_b = calculate_model_size( + config.gpu_count, + config.max_training_days, + config.model_size_in_b, + config.tflops_per_gpu, + config.num_tokens_in_b, + config.model_type, + ) + base_cfg = BaseConfig(config) + + if default_model: + params = ModelSizeParams( + model_size_in_b, + config.vocab_size, + config.seq_length, + config.model_type, + ) + params.init_params() + + if config.model_type in GPT_BASED_MODELS: + base_cfg.model.num_layers = params.layers + base_cfg.model.hidden_size = params.hs + base_cfg.model.num_attention_heads = params.att_h + base_cfg.model.kv_channels = params.kv + if not params.ffn: + base_cfg.model.ffn_hidden_size = params.hs * 4 + else: + base_cfg.model.ffn_hidden_size = params.ffn + + config.model_size_in_b = model_size_in_b + + return base_cfg, config + + +def modify_cfg( + base_cfg: dict, + act: int, + num_mbs_act: int, + act_per_pipe: int, + tp: int, + pp: int, + cp: int, + ep: int, + virtual_pipelines: int, + mbs: int, + max_minutes: int, + max_steps: int, + num_nodes: int, + model_name: str, + model_size, +) -> dict: + """Modify the base configuration for the model with the new parameters that are specific to the current model, which the Auto Configurator tool heuristics selected. + + Args: + base_cfg (dict): base configuration for the current model, which will be modified in this function. + act (int): number of activation checkpointing layers to use for the model. + num_mbs_act (int): sets the number of micro-batches where only a partial number of Transformer layers get checkpointed and recomputed within a window of micro-batches. + act_per_pipe (int): sets the number of Transformer layers to skip checkpointing at later pipeline stages. + tp (int): Tensor Parallelism (TP) value to be set for the model. + pp (int): Pipeline Parallelism (PP) value to be set for the model. + cp (int): Context Parallelism (CP) value to be set for the model. + ep (int): Expert Parallelism (EP) value to be set for the model. + virtual_pipelines (int): Virtual Pipelines value to be set for the model. + mbs (int): Micro Batch Size (MBS) value to be set for the model. + max_minutes (int): maximum amount of time to run this model for. + max_steps (int): maximum number of steps to run this model for. + num_nodes (int): number of nodes to use for the training run. + model_name (str): name of the model, i.e. gpt3, t5, mt5... + + Returns: + dict: dictionary containing the updated model configuration parameters. 
+ """ + + if model_name in GPT_BASED_MODELS: + att_heads = base_cfg.model.num_attention_heads + num_layers = base_cfg.model.num_layers + else: + att_heads = base_cfg.model.encoder.num_attention_heads + num_layers = base_cfg.model.encoder.num_layers + + # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) + num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices + gbs = base_cfg.data.global_batch_size + seq_len = base_cfg.model.seq_length + + new_cfg = dict(run=base_cfg.run) + if act is not None: + if model_name in GPT_BASED_MODELS: + new_cfg["activations_checkpoint_num_layers"] = act + else: + new_cfg["encoder"]["activations_checkpoint_num_layers"] = act // 2 + new_cfg["decoder"]["activations_checkpoint_num_layers"] = act // 2 + + if num_mbs_act is not None and model_name in GPT_BASED_MODELS: + new_cfg["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act + + if act_per_pipe is not None and model_name in GPT_BASED_MODELS: + new_cfg["activations_checkpoint_layers_per_pipeline"] = act_per_pipe + + if virtual_pipelines is not None and model_name in GPT_BASED_MODELS: + new_cfg["virtual_pipeline_model_parallel_size"] = virtual_pipelines + + new_cfg["tensor_model_parallel_size"] = tp + new_cfg["pipeline_model_parallel_size"] = pp + new_cfg["micro_batch_size"] = mbs + new_cfg["global_batch_size"] = gbs + + if cp is not None: + new_cfg["context_parallel_size"] = cp + + if ep is not None: + new_cfg["expert_model_parallel_size"] = ep + + mod_gbs = gbs % (mbs * num_gpus / (tp * pp)) + mod_att_heads = att_heads % tp + mod_layers = num_layers % pp + if mod_gbs == 0 and mod_att_heads == 0 and mod_layers == 0: + # Valid config + new_cfg["run"][ + "name" + ] = f"{model_name}_{str(model_size)}b_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" + print( + f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." + ) + return new_cfg + return None diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py new file mode 100644 index 000000000000..0c80c9a21a9e --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -0,0 +1,246 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
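+"""Auto Configurator runner: defines the search space and produces Partial pre-training recipes.
+
+A minimal sketch of the intended flow (the paths are hypothetical, and ``GPTConfig126M`` is
+only an example name: any supported GPT/Llama/Mixtral/Mistral/Gemma/Nemotron model config
+whose class name encodes the parameter count, e.g. ``...126M`` or ``...7B``, can be passed):
+
+    from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator, generate_configs
+    from nemo.collections.llm.utils import Config
+
+    runner = AutoConfigurator(
+        model=Config(GPTConfig126M),
+        num_nodes=4,
+        data_paths=["/data/my_dataset_text_document"],
+        path_to_logs="/results/grid",
+        seq_length=2048,
+    )
+    base_config, candidate_recipes = generate_configs(runner)
+"""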
+ +import copy +import re + +from typing import List, Optional + +from nemo.collections.llm import GPTModel +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs +from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config +from nemo.collections.llm.utils import Config, Partial +from nemo.utils import logging + +SUPPORTED_MODELS = [ + "gpt3", + "llama", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + +SUPPORTED_TOKENIZERS = [ + "autotokenizer", + "sentencepiece", + "huggingface", +] + + +class AutoConfigurator: + """Auto Configurator runner config class.""" + + def __init__( + self, + model: Config = None, + num_nodes: int = None, + data_paths: List = None, + path_to_logs: str = None, + tokenizer_type: Optional[str] = "autotokenizer", + tokenizer_path: Optional[str] = "GPT2BPETokenizer", + gpus_per_node: Optional[int] = 8, + gpu_memory_gb: Optional[int] = 80, + seq_length: Optional[int] = 2048, + global_batch_size: Optional[int] = "auto", + tensor_parallel_sizes: Optional[List[int]] = "auto", + pipeline_parallel_sizes: Optional[List[int]] = "auto", + micro_batch_sizes: Optional[List[int]] = "auto", + context_parallel_sizes: Optional[List[int]] = [1], + expert_parallel_sizes: Optional[List[int]] = [1], + min_model_parallel_size: Optional[int] = "auto", + max_model_parallel_size: Optional[int] = "auto", + num_tokens_in_b: Optional[int] = 300, + tflops_per_gpu: Optional[int] = 140, + max_minutes_per_run: Optional[int] = 30, + max_training_days: Optional[int] = 2, + max_steps_per_run: Optional[int] = 50, + vocab_size: Optional[int] = 51200, + ): + """ + Args: + model_type (Config): model type to be used for training. + num_nodes (int): number of nodes to be used for training. + data_paths (List): list of datafiles to be used for training. + path_to_logs (str): path to the directory where the logs will be stored. + tokenizer_type (Optional[str]): tokenizer type. + tokenizer_path (Optional[str]): path to the tokenizer model. + model_size (Optional[int]): size of model to be trained. + gpus_per_node (Optional[int]): number of GPUs per node to be used. + gpu_memory_gb (Optional[int]): memory per GPU, in GB. Currently 40GB and 80GB A100s/H100s supported. + seq_length (Optional[int]): model sequence length. Available seq_length list for GPT-based models: [2048, 4096, 8192, 16384, 32768]. + global_batch_size (Optional[int]): model global batch size. Set to "auto" if you want auto configurator to find optimal gbs. + tensor_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + pipeline_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + micro_batch_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + context_parallel_sizes (Optional[List[int]]): model context parallel size. A list, such as [1, 2, 4, 8]. + expert_parallel_sizes (Optional[List[int]]): model expert parallel size. A list, such as [1, 2, 4, 8]. + min_model_parallel_size (Optional[int]): set to "auto" to use our recommendation, or a value for the minimum desired parallelism. + max_model_parallel_size (Optional[int]): set to "auto" to use our recommendation, or a value for the maximum desired parallelism. + num_tokens_in_b (Optional[int]): number of tokens in billions in train dataset. 
+ tflops_per_gpu (Optional[int]): estimated tflops per GPU. + max_minutes_per_run (Optional[int]): maximum number of minutes per run for the grid search. + max_training_days (Optional[int]): number of days expected model to be trained. + max_steps_per_run (Optional[int]): maximum number of steps per run for the grid search. + vocab_size (Optional[int]): size of tokenizer vocabulary. + """ + + # Print out the config + config = locals() + config.pop('self') + for key, value in config.items(): + setattr(self, key, value) + logging.info(self._get_message(config)) + + model_type = self._get_model_type(model) + assert model_type in SUPPORTED_MODELS, f"model_type must be set to one of {SUPPORTED_MODELS}." + assert tokenizer_type in SUPPORTED_TOKENIZERS, f"tokenizer_type must be set to one of {SUPPORTED_TOKENIZERS}." + assert num_nodes, "num_nodes value must be specified." + assert data_paths, "training data must be specified." + assert path_to_logs, f"path_to_logs parameter must be specified." + gpu_count = num_nodes * gpus_per_node + assert gpu_count > 0, "num_nodes * gpus_per_node must be an int larger than zero." + assert gpu_memory_gb in ( + 40, + 80, + ), "gpu_memory_gb can only be 40 or 80." + assert max_minutes_per_run >= 10, "max_minutes_per_run must be an int and be at least 10 minutes." + + self.model_type = model_type + self.model_size_in_b = self._get_model_size(model) + self.gpu_count = gpu_count + self.num_gpus = gpus_per_node + + def _get_message(self, config: dict) -> str: + """ + Function that returns runner config line by line. + + Args: + config (dict): runner config. + + Returns: + str: runner config params. + """ + + message = "AutoConfigurator runner config:\n" + for key, value in config.items(): + message += f"{key}: {value}\n" + + return message + + def _get_model_type(self, model: Config) -> str: + """ + Function that returns model type from model class name. + + Args: + models (Config): model object. + + Returns: + str: model type. + """ + + match = re.search(r"\w+\d+[MB]", str(model)) + if match: + model = match.group(0) + + if "GPT" in model: + return "gpt3" + elif "Llama" in model: + return "llama" + elif "Mixtral" in model: + return "mixtral" + elif "Mistral" in model: + return "mistral" + elif "Gemma" in model: + return "gemma" + elif "Nemotron" in model: + return "nemotron" + else: + return None + + def _get_model_size(self, model: Config) -> int: + """ + Function that returns model size from model class name. + + Args: + model (Config): model class name. + + Returns: + int: model size. + """ + match = re.search(r'(\d+)([BM])', str(model)) + if match: + size = int(match.group(1)) + measure = match.group(2) + if measure == 'B': + return size + elif measure == 'M': + return size / 1000 # Convert millions to billions + return None + + +def generate_configs(runner_config: AutoConfigurator = None) -> dict: + """ + Function that returns a dictionary of Partial configs. + + Args: + config (AutoConfigurator): Auto Configurator object. + + Returns: + dict: dictionary of Partial configs. 
+ """ + + # Generate base config for the given model size + base_cfg, train_cfg = generic_base_config(runner_config) + + # Launch grid search for training constraints + base_config, train_configs = generate_grid_search_configs(base_cfg, train_cfg) + + tokenizer = base_config.tokenizer + model = Config(GPTModel, config=base_config.model, tokenizer=tokenizer) + + configs = {} + for name, config in train_configs.items(): + trainer = copy.deepcopy(base_config.trainer) + data = copy.deepcopy(base_config.data) + log = copy.deepcopy(base_config.log) + + # Set data params + data.micro_batch_size = config.get("micro_batch_size") + data.global_batch_size = config.get("global_batch_size") + + # Set strategy params + trainer.strategy.tensor_model_parallel_size = config.get("tensor_model_parallel_size") + trainer.strategy.pipeline_model_parallel_size = config.get("pipeline_model_parallel_size") + trainer.strategy.context_parallel_size = config.get("context_parallel_size") + trainer.strategy.expert_model_parallel_size = config.get("expert_model_parallel_size") + trainer.strategy.virtual_pipeline_model_parallel_size = config.get( + "virtual_pipeline_model_parallel_size", None + ) + if config.get("tensor_model_parallel_size") > 1: + trainer.strategy.sequence_parallel = True + + # Set the directory where to save the logs + configs[name] = Partial( + pretrain, + model=model, + trainer=trainer, + data=data, + optim=base_config.optim, + log=log, + resume=None, + ) + + return base_cfg, configs diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 89f1ab24f0a9..4bd4443e46f5 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -34,6 +34,7 @@ DEFAULT_IM_START_TOKEN["llama_3"] = "<|reserved_special_token_4|>" DEFAULT_IM_END_TOKEN["llama_3"] = "<|reserved_special_token_5|>" + DEFAULT_VID_START_TOKEN = "" DEFAULT_VID_END_TOKEN = "" TIME_TOKEN_TEMPLATE = "" @@ -507,6 +508,7 @@ def dict(self): sep2=DEFAULT_EOS_TOKEN, ) + default_conversation = conv_vicuna_v1 conv_templates = { "default": conv_vicuna_v0, diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 37f57ff21bba..f46b75e7b472 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -40,6 +40,7 @@ DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IMAGE_TOKEN, DEFAULT_LABELS_TOKEN, + DEFAULT_PAD_TOKEN, DEFAULT_VID_END_TOKEN, DEFAULT_VID_START_TOKEN, DEFAULT_VIDEO_TOKEN, @@ -353,8 +354,14 @@ def preprocess_multimodal(sources: dict, multimodal_cfg: dict, cur_token_len: in if use_plain: assert default_token in conversation[0]['value'] conversation[0]['value'] = default_token - for turn in conversation: - turn["value"] = turn["value"].replace(default_token, replace_token) + if multimodal_cfg["conv_template"] == "interleaved": + # directly replace the default_token in the conversation, + # since we don't use the conversation template + updated_conversation = conversation.replace(default_token, replace_token) + source['conversations'] = updated_conversation + else: + for turn in conversation: + turn["value"] = turn["value"].replace(default_token, replace_token) return sources @@ -791,6 +798,52 @@ def preprocess_v1( ) +def preprocess_interleaved_prompt( + sources: dict, + tokenizer, + cfg, +) -> Dict: + """tokenize the interleaved prompt and mask the text part of the prompt""" + 
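    # Overview of the steps below: the interleaved conversations are tokenized as-is (no
    # conversation template is applied), image patch token ids are zeroed out, and the label
    # tensor masks out image start/end tokens, padding and the zeroed patch positions so the
    # loss is computed on text tokens only; labels are then shifted by one position for
    # next-token prediction.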
conversations = [] + for source in sources: + conversations.append(source['conversations']) + add_extra_token = cfg.get("add_extra_token") + tokens = tokenize( + texts=conversations, + tokenizer=tokenizer, + context_length=cfg.get("context_length"), + add_extra_token=add_extra_token, + ) + + model_type = cfg['model_type'] + image_patch_token = DEFAULT_IMAGE_PATCH_TOKEN[model_type] + image_start_token = DEFAULT_IM_START_TOKEN[model_type] + image_end_token = DEFAULT_IM_END_TOKEN[model_type] + DEFAULT_TOKENS = [image_patch_token, image_start_token, image_end_token, DEFAULT_PAD_TOKEN] + img_patch_id, img_start_id, img_end_id, pad_id = get_tokens_ids(tokenizer, DEFAULT_TOKENS) + tokens[tokens == img_patch_id] = 0 # DEFAULT_IMAGE_PATCH_TOKEN + + labels = tokens.clone().detach() + + # Mask labels change for interleaved prompt + labels[labels == img_start_id] = IGNORE_INDEX + labels[labels == img_end_id] = IGNORE_INDEX + labels[labels == 0] = IGNORE_INDEX + labels[labels == pad_id] = IGNORE_INDEX + + if add_extra_token: + tokens = tokens[:, :-1].contiguous() + labels = labels[:, 1:].contiguous() + else: + labels = torch.roll(labels, shifts=-1, dims=-1) + labels[:, -1] = IGNORE_INDEX + + return dict( + tokens=tokens, + labels=labels, + ) + + def preprocess_nvgpt( sources: dict, tokenizer, @@ -1075,6 +1128,29 @@ def preprocess_plain( ) +def preprocess_conversations(self, sources): + if self.conv_template in ["nvgpt", "nv_steerlm"]: + return preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "nv_dpo": + return preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "v1": + return preprocess_v1(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_2": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_3": + return preprocess_llama_3(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "mistral": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg, is_mistral=True) + elif self.conv_template == "yi_34b": + return preprocess_yi_34b(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "plain": + return preprocess_plain(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "interleaved": + return preprocess_interleaved_prompt(sources, self.tokenizer, self.multimodal_cfg) + else: + raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + + class LazySupervisedDataset(Dataset): """Dataset for supervised fine-tuning.""" @@ -1215,57 +1291,7 @@ def expand2square(pil_img, background_color): media_tensors = torch.tensor([]) sources = copy.deepcopy(sources) - if self.conv_template in ["nvgpt", "nv_steerlm"]: - data_dict = preprocess_nvgpt( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "nv_dpo": - data_dict = preprocess_nv_dpo( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "v1": - data_dict = preprocess_v1( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "llama_2": - data_dict = preprocess_llama_2( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "llama_3": - data_dict = preprocess_llama_3( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "mistral": - data_dict = preprocess_llama_2( - sources, - self.tokenizer, - self.multimodal_cfg, - 
is_mistral=True, - ) - elif self.conv_template == "plain": - data_dict = preprocess_plain( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - elif self.conv_template == "yi_34b": - data_dict = preprocess_yi_34b( - sources, - self.tokenizer, - self.multimodal_cfg, - ) - else: - raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + data_dict = preprocess_conversations(self, sources) if isinstance(i, int): data_dict = dict(tokens=data_dict["tokens"][0], labels=data_dict["labels"][0]) diff --git a/nemo/collections/multimodal/data/neva/neva_energon_dataset.py b/nemo/collections/multimodal/data/neva/neva_energon_dataset.py new file mode 100644 index 000000000000..a83e616f248f --- /dev/null +++ b/nemo/collections/multimodal/data/neva/neva_energon_dataset.py @@ -0,0 +1,506 @@ +import dataclasses +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import torch +from einops import rearrange +from megatron.energon import ( + Batch, + CaptioningSample, + DefaultTaskEncoder, + InterleavedSample, + SimilarityInterleavedSample, + VQASample, + batch_pad_stack, +) +from PIL import Image + +from nemo.collections.multimodal.data.neva.neva_dataset import ( + DEFAULT_IMAGE_TOKEN, + preprocess_conversations, + preprocess_interleaved_prompt, + preprocess_llama_2, + preprocess_llama_3, + preprocess_multimodal, + preprocess_nv_dpo, + preprocess_nvgpt, + preprocess_plain, + preprocess_v1, + preprocess_yi_34b, + process_image, +) + + +# Type for intermediate batch, after batch() +@dataclass +class ImageTaskSample: + __key__: str + __subflavor__: str + conversations: List[dict] + image: Optional[Union[str, List[str], torch.Tensor]] = None + video: Optional[Union[str, List[str]]] = None + + tokens: Optional[torch.Tensor] = None + labels: Optional[torch.Tensor] = None + attention_mask: Optional[torch.Tensor] = None + loss_mask: Optional[torch.Tensor] = None + position_ids: Optional[torch.Tensor] = None + + +# Typing for the resulting batch data after encode_batch() +@dataclass +class ImageTaskBatch(Batch): + tokens: torch.Tensor + labels: torch.Tensor + attention_mask: torch.Tensor + loss_mask: torch.Tensor + position_ids: torch.Tensor + media: Optional[torch.Tensor] = None + + +# Required for energon, https://nvidia.github.io/Megatron-Energon/task_encoders.html +class TaskEncoder(DefaultTaskEncoder[VQASample, InterleavedSample, ImageTaskBatch, dict]): + """A task encoder for data samples for captioning, pretraining, sft and interleaved multimodal tasks. + It defines how the data is processed after it is loaded from the dataset. + Currently, it supports captioning, pretraining, sft and interleaved multimodal tasks and datasets. 
+    """
+
+    def __init__(self, tokenizer, image_processor, multimodal_cfg: dict, data_cfg: dict):
+        super().__init__(batch_type=ImageTaskBatch)
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        self.multimodal_cfg = multimodal_cfg
+        self.data_cfg = data_cfg
+        self.conv_template = multimodal_cfg["conv_template"]
+        self.max_num_images = 6
+        self.image_following_text_only = False
+        self.caption_prompts = [
+            "Generate a short caption of the image.",
+            "Describe the image concisely.",
+            "Provide a brief description of the given image.",
+        ]
+        self.prompt_index = 0
+
+    def encode_sample(
+        self,
+        sample: Union[ImageTaskSample, CaptioningSample, VQASample, InterleavedSample, SimilarityInterleavedSample],
+    ) -> dict:
+        if isinstance(sample, InterleavedSample):
+            return self.encode_interleaved(sample)
+        elif isinstance(sample, VQASample):
+            return self.encode_pretrain(sample)
+        elif isinstance(sample, CaptioningSample):
+            return self.encode_captioning(sample)
+        elif isinstance(sample, SimilarityInterleavedSample) and self.conv_template == "interleaved":
+            return self.encode_similarity_interleaved(sample)
+        else:
+            return self.encode_sft(sample)
+
+    def encode_captioning(self, sample: CaptioningSample) -> dict:
+        """Preprocessing function for datasets like COCO, containing image-caption pairs.
+        See Energon codebase for more details on CaptioningSample.
+        https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/captioning.py
+        """
+        processed_image = self.process_images(sample.image)
+
+        prompt = f"<image>\n{self.caption_prompts[self.prompt_index]}\n"
+        self.prompt_index = (self.prompt_index + 1) % len(self.caption_prompts)
+
+        caption = sample.caption.strip()
+
+        conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": caption}]
+
+        processed_sample = {"conversations": conversation, "image": processed_image}
+
+        if self.multimodal_cfg['is_multimodal']:
+            cur_token_len = self.calculate_token_length(processed_sample["image"])
+            processed_sample = preprocess_multimodal(
+                [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain")
+            )[0]
+
+        processed = preprocess_conversations(self, [processed_sample])
+        tokens = processed["tokens"]
+        labels = processed["labels"]
+        attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels)
+
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __subflavor__=sample.__subflavor__,
+            conversations=conversation,
+            image=processed_sample["image"],
+            tokens=tokens.squeeze(0),
+            labels=labels.squeeze(0),
+            attention_mask=attention_mask.squeeze(0),
+            loss_mask=loss_mask.squeeze(0),
+            position_ids=position_ids,
+        )
+
+    def encode_pretrain(self, sample: VQASample) -> dict:
+        """Preprocessing function for datasets like LLaVA-Pretrain, multimodal conversations synthesized from image-caption pairs.
+        See Energon codebase for more details on VQASample.
+ https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/vqa.py + """ + conversations = [{"from": "human", "value": sample.context}, {"from": "gpt", "value": sample.answers}] + processed_sample = {"conversations": conversations} + + if self.multimodal_cfg['is_multimodal']: + if hasattr(sample, 'image') and sample.image is not None: + processed_sample["image"] = self.process_images(sample.image) + cur_token_len = self.calculate_token_length(processed_sample["image"]) + processed_sample = preprocess_multimodal( + [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain") + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + tokens = processed["tokens"] + labels = processed["labels"] + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=conversations, + image=processed_sample.get("image"), + video=processed_sample.get("video"), + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def encode_sft(self, sample: Union[ImageTaskSample, VQASample, InterleavedSample]) -> dict: + """Preprocessing function for datasets like LLaVA-Instruct, conversational multimodal instruction-following data. + See Energon codebase for more details on VQASample. + https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/vqa.py + """ + conversations = sample.texts if hasattr(sample, 'texts') else sample.conversations + processed_sample = {"conversations": conversations} + image_present = False + + if self.multimodal_cfg['is_multimodal']: + image_present = False + if hasattr(sample, 'image') and sample.image is not None: + processed_sample["image"] = self.process_images(sample.image) + image_present = True + elif hasattr(sample, 'images') and sample.images: + processed_sample["image"] = self.process_images(sample.images[0]) + image_present = True + elif hasattr(sample, 'video') and sample.video: + # Implement video processing if needed + pass + + if image_present: + processed_sample = preprocess_multimodal( + [processed_sample], + self.multimodal_cfg, + self.calculate_token_length(processed_sample["image"]), + use_plain=(self.conv_template == "plain"), + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + tokens = processed["tokens"] + labels = processed["labels"] + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + if not image_present: + processed_sample["image"] = torch.zeros( + 1, 3, self.multimodal_cfg["crop_size"][0], self.multimodal_cfg["crop_size"][1] + ) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=conversations, + # rewrite image so it creates tensor of zeros if not present + image=processed_sample.get("image", torch.tensor([])), + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def encode_similarity_interleaved(self, sample: SimilarityInterleavedSample) -> dict: + """Preprocessing function for datasets like MMC4, where text and images are interleaved via a similarity matrix or matched_text_indices. + See Energon codebase for more details on SimilarityInterleavedSample. 
+        https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/similarity_interleaved.py
+        """
+        # 4 fields: sample.images, sample.texts, sample.similarity_matrix, sample.matched_text_index
+        images, sentence_ixs = [], []
+        for sample_image, sim_vec in zip(sample.images, sample.matched_text_indices):
+            images.append(sample_image)
+            sentence_ixs.append(sim_vec)
+
+        # constrain max num images
+        max_num_images = self.max_num_images
+        if len(images) > max_num_images:
+            images = images[:max_num_images]
+            sentence_ixs = sentence_ixs[:max_num_images]
+
+        images = [images[i] for i in np.argsort(sentence_ixs)]
+
+        for ix in sentence_ixs:
+            sample.texts[ix] = f"{DEFAULT_IMAGE_TOKEN} {sample.texts[ix]}"
+
+        if self.image_following_text_only:
+            # use pad token to divide sentence pieces
+            text = self.tokenizer.pad_id.join(sample.texts)
+        else:
+            text = " ".join(sample.texts)
+
+        # strip spaces adjacent to image tokens
+        text = text.replace("<image> ", "<image>").replace(" <image>", "<image>")
+        text = f"{text}{self.tokenizer.eos_id}"
+
+        if len(images) > 0:
+            processed_images = self.process_images(images)
+        else:
+            processed_images = None
+
+        # check the case where the last token is the image token.
+        if text.endswith(DEFAULT_IMAGE_TOKEN):
+            text = text[: -len(DEFAULT_IMAGE_TOKEN)]
+
+        n_im_patch = text.count(DEFAULT_IMAGE_TOKEN)
+        processed_images = processed_images[:n_im_patch]
+        assert len(processed_images) == n_im_patch
+
+        processed_sample = {"conversations": text, "image": processed_images}
+
+        if self.multimodal_cfg['is_multimodal']:
+            if images:
+                cur_token_len = self.calculate_token_length(processed_sample["image"])
+                processed_sample = preprocess_multimodal(
+                    [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain")
+                )[0]
+
+        processed = preprocess_conversations(self, [processed_sample])
+
+        tokens = processed["tokens"]
+        labels = processed["labels"]
+        attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels)
+
+        # pad images
+        if images:
+            processed_sample["image"] = self.pad_images(processed_sample["image"], self.max_num_images)
+        else:
+            # add extra dummy images
+            processed_sample["image"] = torch.zeros(
+                self.max_num_images, 3, self.multimodal_cfg["crop_size"][0], self.multimodal_cfg["crop_size"][1]
+            )
+
+        return ImageTaskSample(
+            __key__=sample.__key__,
+            __subflavor__=sample.__subflavor__,
+            conversations=processed_sample["conversations"],
+            image=processed_sample["image"],
+            tokens=tokens.squeeze(0),
+            labels=labels.squeeze(0),
+            attention_mask=attention_mask.squeeze(0),
+            loss_mask=loss_mask.squeeze(0),
+            position_ids=position_ids,
+        )
+
+    def encode_interleaved(self, sample: InterleavedSample) -> dict:
+        """Preprocessing function for datasets like OBELISC, where text and images are strictly interleaved.
+        See Energon codebase for more details on InterleavedSample.
+ https://github.com/NVIDIA/Megatron-Energon/blob/develop/src/megatron/energon/flavors/interleaved.py + """ + interleaved_text = [] + images = [] + for item in sample.sequence: + if isinstance(item, str): + interleaved_text.append(item) + elif isinstance(item, torch.Tensor) or isinstance(item, Image.Image): + interleaved_text.append(DEFAULT_IMAGE_TOKEN) + images.append(item) + else: + raise ValueError(f"Unsupported type in interleaved sequence: {type(item)}") + + # constrain max num images + max_num_images = self.max_num_images + + n_im_patch = interleaved_text.count(DEFAULT_IMAGE_TOKEN) + if n_im_patch > max_num_images: + interleaved_text, kept_image_indices = self.remove_excess_image_tokens(interleaved_text, max_num_images) + images = [images[i] for i in kept_image_indices] + + if len(images) > max_num_images: + images = images[:max_num_images] + + if len(images) > 0: + processed_images = self.process_images(images) + else: + processed_images = None + + combined_text = ' '.join(interleaved_text) + + processed_sample = {"conversations": combined_text, "image": processed_images} + + if self.multimodal_cfg['is_multimodal']: + if images: + cur_token_len = self.calculate_token_length(processed_sample["image"]) + processed_sample = preprocess_multimodal( + [processed_sample], self.multimodal_cfg, cur_token_len, use_plain=(self.conv_template == "plain") + )[0] + + processed = preprocess_conversations(self, [processed_sample]) + + tokens = processed["tokens"] + labels = processed["labels"] + + attention_mask, loss_mask, position_ids = self.get_masks_and_position_ids(tokens, labels) + + # pad images + if images: + processed_sample["image"] = self.pad_images(processed_sample["image"], self.max_num_images) + else: + processed_sample["image"] = torch.zeros( + self.max_num_images, 3, self.multimodal_cfg["crop_size"][0], self.multimodal_cfg["crop_size"][1] + ) + + return ImageTaskSample( + __key__=sample.__key__, + __subflavor__=sample.__subflavor__, + conversations=processed_sample["conversations"], + image=processed_sample["image"], + tokens=tokens.squeeze(0), + labels=labels.squeeze(0), + attention_mask=attention_mask.squeeze(0), + loss_mask=loss_mask.squeeze(0), + position_ids=position_ids, + ) + + def remove_excess_image_tokens(self, interleaved_text, max_num_images): + if interleaved_text[-1] == DEFAULT_IMAGE_TOKEN: + interleaved_text = interleaved_text[:-1] + + image_indices = [i for i, token in enumerate(interleaved_text) if token == DEFAULT_IMAGE_TOKEN] + + if len(image_indices) <= max_num_images: + return interleaved_text, list(range(len(image_indices))) + + # we keep the images that are close to the text tokens + importance = [] + for i, img_idx in enumerate(image_indices): + has_text_before = img_idx > 0 and interleaved_text[img_idx - 1] != DEFAULT_IMAGE_TOKEN + has_text_after = ( + img_idx < len(interleaved_text) - 1 and interleaved_text[img_idx + 1] != DEFAULT_IMAGE_TOKEN + ) + + if has_text_before and has_text_after: + importance.append((0, img_idx)) # highest importance + elif has_text_before or has_text_after: + importance.append((1, img_idx)) + else: + importance.append((2, img_idx)) + + importance.sort(key=lambda x: (x[0], x[1])) + kept_indices = {idx for _, idx in importance[:max_num_images]} + + # update idx to map correctly to the original images array + kept_image_indices = [image_indices.index(i) for i in kept_indices if i in image_indices] + + new_interleaved_text = [ + token for i, token in enumerate(interleaved_text) if token != DEFAULT_IMAGE_TOKEN or i in kept_indices + ] + 
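        # Returned values: the interleaved text with excess image tokens dropped, plus the
        # positions (indices into the original images list) of the images that were kept.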
+ return new_interleaved_text, kept_image_indices + + def process_images(self, images): + if not isinstance(images, list): + images = [images] + processed_images = [] + for image in images: + image = process_image(self.image_processor, image, self.multimodal_cfg['image_aspect_ratio']) + processed_images.append(image) + return torch.stack(processed_images) # make it always 4D, otherwise has problem when len(images) > 1 + + def pad_images(self, images, max_num_images): + if len(images) < max_num_images: + pad_size = max_num_images - len(images) + padded_images = torch.cat([images, torch.zeros(pad_size, *images.size()[1:])], dim=0) + return padded_images + return images + + def batch(self, samples: List[ImageTaskSample]) -> ImageTaskBatch: + """Pads and stacks the samples in the batch.""" + batch = ImageTaskBatch( + tokens=batch_pad_stack([s.tokens for s in samples]), + labels=batch_pad_stack([s.labels for s in samples]), + attention_mask=batch_pad_stack([s.attention_mask for s in samples]), + loss_mask=batch_pad_stack([s.loss_mask for s in samples]), + position_ids=batch_pad_stack([s.position_ids for s in samples]), + media=( + torch.stack([s.image for s in samples if s.image is not None]) + if self.multimodal_cfg['is_multimodal'] + else None + ), + ) + + # TODO: cleanup, this is following logic in neva_dataset when we rearrange media tensor + if batch.media.shape[1] == 1: + batch.media = rearrange(batch.media, "b T c h w -> b T 1 c h w") + else: + batch.media = rearrange(batch.media, "b T c h w -> b T 1 c h w") + + return batch + + def preprocess_conversations(self, sources): + if self.conv_template == "nvgpt": + return preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "nv_dpo": + return preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "v1": + return preprocess_v1(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_2": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "llama_3": + return preprocess_llama_3(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "mistral": + return preprocess_llama_2(sources, self.tokenizer, self.multimodal_cfg, is_mistral=True) + elif self.conv_template == "yi_34b": + return preprocess_yi_34b(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "plain": + return preprocess_plain(sources, self.tokenizer, self.multimodal_cfg) + elif self.conv_template == "interleaved": + return preprocess_interleaved_prompt(sources, self.tokenizer, self.multimodal_cfg) + else: + raise ValueError(f"Conversation template `{self.conv_template}` is not supported in Neva now.") + + def encode_batch(self, batch: ImageTaskBatch) -> dict: + raw = dataclasses.asdict(batch) + return raw + + def calculate_token_length(self, media_tensor): + if len(media_tensor.shape) == 4: + height = media_tensor.shape[2] + width = media_tensor.shape[3] + else: + raise ValueError("Media tensor must be 4-dimensional") + patch_dim = self.multimodal_cfg['patch_dim'] + height_num_patches = height // patch_dim + width_num_patches = width // patch_dim + if self.multimodal_cfg['mm_mlp_adapter_type'] == 'mlp_downsample': + height_num_patches = (height_num_patches + 1) // 2 * 2 + width_num_patches = (width_num_patches + 1) // 2 * 2 + + return height_num_patches * width_num_patches + + def get_masks_and_position_ids(self, tokens, labels): + from nemo.collections.nlp.modules.common.megatron.utils import 
get_ltor_masks_and_position_ids + + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=self.tokenizer.eos_id, + eod_mask_loss=self.data_cfg.get("eod_mask_loss", False), + reset_attention_mask=False, + reset_position_ids=False, + ) + + loss_mask[labels == -1] = 0.0 + tokens[tokens == -1] = 0 + labels[labels == -1] = 0 + + return attention_mask, loss_mask, position_ids diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 6218332c2bde..07bc4f3960d3 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -21,7 +21,7 @@ import torch import torch.nn.functional as F from einops import rearrange, reduce, repeat -from omegaconf import DictConfig, ListConfig +from omegaconf import DictConfig, ListConfig, OmegaConf from pkg_resources import packaging from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPVisionModel, SiglipVisionModel @@ -69,6 +69,25 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +try: + from megatron.energon import ( + LimitDataset, + RepeatDataset, + WorkerConfig, + get_loader, + get_savable_loader, + get_train_dataset, + get_val_datasets, + ) + + from nemo.collections.multimodal.data.neva.neva_energon_dataset import TaskEncoder + + HAVE_ENERGON = True + +except (ImportError, ModuleNotFoundError): + + HAVE_ENERGON = False + try: from megatron.core import InferenceParams, dist_checkpointing, parallel_state, tensor_parallel from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace @@ -1226,10 +1245,22 @@ def setup(self, stage=None): else: # TODO: consider adding a ModelPT guard to check if model is being restored. # allowing restored models to optionally setup datasets - self.build_train_valid_test_datasets() - self.setup_training_data(self.cfg.data) - self.setup_validation_data(self.cfg.data) - self.setup_test_data(self.cfg.data) + + if self.cfg.get('energon', {}).get('use_energon', False): + if not HAVE_ENERGON: + raise ImportError( + "Megatron-Energon was not found. Please see the Energon README for installation instructions: https://github.com/NVIDIA/Megatron-Energon?tab=readme-ov-file#installation." + ) + assert not self.use_peft, "NeMo does not currently support the combination of Energon and PEFT." + logging.info( + "You are now using an experimental implementation of Megatron-Energon, https://github.com/NVIDIA/Megatron-Energon, for your NeVA dataloader. Further updates to Energon support in NeMo will be done in NeMo 2.0 implementation." 
+ ) + self.build_train_valid_test_datasets_energon() + else: + self.build_train_valid_test_datasets() + self.setup_training_data(self.cfg.data) + self.setup_validation_data(self.cfg.data) + self.setup_test_data(self.cfg.data) # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: @@ -1435,6 +1466,144 @@ def build_pretraining_data_loader( persistent_workers=True if self.cfg.data.num_workers > 0 else False, ) + def datasets_provider(self, worker_config=None): + """Create multimodal train, validation and test datasets.""" + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + micro_batch_size = self.cfg.micro_batch_size + else: + micro_batch_size = self.cfg.global_batch_size // parallel_state.get_data_parallel_world_size() + + dname = OmegaConf.to_container(self.cfg.energon.data, resolve=True) + + image_processor = ( + self.model.module.image_processor if hasattr(self.model, "module") else self.model.image_processor + ) + + add_extra_token = 1 + if getattr(self.cfg, 'no_seqlen_plus_one_input_tokens', False): + add_extra_token = 0 + + multimodal_cfg = dict( + is_multimodal=self.cfg.data.is_multimodal, + sep_image_conv_front=self.cfg.data.sep_image_conv_front, + model_type=self.cfg.mm_cfg.llm.get("model_type", "nvgpt"), + conv_template=self.cfg.data.get("conv_template", "nvgpt"), + patch_dim=self.cfg.mm_cfg.vision_encoder.patch_dim, + crop_size=self.cfg.mm_cfg.vision_encoder.get("crop_size", (336, 336)), + image_folder=self.cfg.data.get('image_folder', None), + video_folder=self.cfg.data.get('video_folder', None), + image_aspect_ratio=self.cfg.data.image_aspect_ratio, + use_im_start_end=getattr(self.cfg.mm_cfg, 'use_im_start_end', False), + image_processor=image_processor, + add_extra_token=add_extra_token, + context_length=self.cfg.encoder_seq_length, + media_type=self.cfg.data.get('media_type', 'image'), + num_frames=self.cfg.data.get('num_frames', -1), + use_lita=getattr(self.cfg.mm_cfg, 'use_lita', False), + lita=getattr(self.cfg.mm_cfg, 'lita', {}), + mm_mlp_adapter_type=self.cfg.mm_cfg.get('mm_mlp_adapter_type', 'linear'), + ) + + data_cfg = dict( + splice_single_frame=self.cfg.data.get('splice_single_frame', None), + num_frames=self.cfg.data.get('num_frames', -1), + sep_token_between_frames=self.cfg.data.get('sep_token_between_frames', False), + ) + + train_dataset = get_train_dataset( + dname, + batch_size=micro_batch_size, + task_encoder=TaskEncoder( + tokenizer=self.tokenizer, + image_processor=image_processor, + multimodal_cfg=multimodal_cfg, + data_cfg=data_cfg, + ), + worker_config=worker_config, + virtual_epoch_length=1000, + max_samples_per_sequence=100, + shuffle_buffer_size=100, + image_decode="pil", + ) + + val_datasets = get_val_datasets( + dname, + batch_size=micro_batch_size, + # This is the total number over all workers + task_encoder=TaskEncoder( + tokenizer=self.tokenizer, + image_processor=image_processor, + multimodal_cfg=multimodal_cfg, + data_cfg=data_cfg, + ), + worker_config=worker_config, + image_decode="pil", + ) + + val_datasets_without_source_datasets = [ + # Limit the dataset to eval_iters * num_microbatches + LimitDataset( + # Repeat the inner dataset in case it's too short + RepeatDataset(val_ds, worker_config=worker_config), + length=self.cfg.micro_batch_size * self.trainer.limit_val_batches, + worker_config=worker_config, + reset_after_epoch=True, + ) + for val_ds, _src_ds in val_datasets + ] + + return train_dataset, 
val_datasets_without_source_datasets, None + + # energon dataset builder + def build_train_valid_test_datasets_energon(self): + """Builds train and validation dataloaders using Megatron-Energon""" + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + data_parallel_group = parallel_state.get_data_parallel_group() + worker_debug_path = None + worker_log_level = 0 + logging.info( + f" Multimodal train dataloader initializing with rank {rank} world_size {world_size} data_parallel_group {data_parallel_group} ****** " + ) + + worker_config = WorkerConfig( + rank=rank, + world_size=world_size, + num_workers=1, + data_parallel_group=data_parallel_group, + worker_debug_path=worker_debug_path, + worker_log_level=worker_log_level, + ) + train_ds, valid_ds1, test_ds = self.datasets_provider(worker_config) + train_dataloader = get_savable_loader(train_ds, worker_config=worker_config) + + # Restore energon train dataloader state if we are resuming training + restore = os.path.exists(self.trainer.ckpt_path) if self.trainer.ckpt_path else False + if restore: + replica_id = ( + parallel_state.get_pipeline_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_context_parallel_rank(), + ) + sharded_state_dict = { + 'dataloader_state': ShardedObject( + data=None, + key='dataloader_state', + global_shape=[parallel_state.get_data_parallel_world_size()], + global_offset=[parallel_state.get_data_parallel_rank()], + replica_id=replica_id, + ) + } + state_dict = dist_checkpointing.load(sharded_state_dict, self.trainer.ckpt_path) + train_dataloader.restore_state_rank(state_dict['dataloader_state']) + logging.info(f"Restored dataset state from {self.trainer.ckpt_path}") + + valid_dataloader = [get_loader(valid_ds, worker_config=worker_config) for valid_ds in valid_ds1] + # valid_dataloader = get_loader(valid_ds1, worker_config=worker_config) + self._train_dl = train_dataloader + self._validation_dl = valid_dataloader + return self._train_dl, self._validation_dl + @classmethod def list_available_models(cls) -> Optional[PretrainedModelInfo]: """ @@ -1512,6 +1681,49 @@ def on_load_checkpoint(self, checkpoint) -> None: self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) parallel_state.set_virtual_pipeline_model_parallel_rank(0) + def on_save_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint + """ + + # Neva supports Megatron Energon dataloader, this requires saving the dataloader state on each data parallel group + def should_save_dataloader_state(): + if self._train_dl is None: + return False + if not hasattr(self._train_dl, "save_state"): + return False + first_rank = ( + parallel_state.is_pipeline_first_stage(ignore_virtual=True) + and parallel_state.get_tensor_model_parallel_rank() == 0 + ) + return first_rank + + def save_dataloader_state(): + train_dataloader_state_dict = self._train_dl.save_state_rank() + checkpoint['dataloader_state'] = ShardedObject( + data=train_dataloader_state_dict, + key='dataloader_state', + global_shape=[parallel_state.get_data_parallel_world_size()], + global_offset=[parallel_state.get_data_parallel_rank()], + ) + + # Save energon train dataloader state if conditions are met + if self.cfg.get('energon', False) and should_save_dataloader_state(): + save_dataloader_state() + + # mcore uses distributed checkpointing + # FSDP supports the lagecy checkpointing 
or torch-FSDP-native sharded checkpointing + if self.mcore_gpt and not self.use_fsdp: + checkpoint['sharded_state_dict'] = self.sharded_state_dict() + + # legacy checkpointing for interleaved + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + def sharded_state_dict(self, prefix: str = ''): if self.use_peft: return None diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index 2c3b30f2fc74..d38de8eb10b9 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -424,24 +424,26 @@ def __init__(self, *args, **kwargs): # TODO (yuya): need to handle post_process correctly in order to enable PP self.output_dim = kwargs.pop('output_dim') super().__init__(*args, **kwargs) - self.final_layernorm = TENorm( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - self.head = torch.nn.Linear( - self.config.hidden_size, - self.output_dim, - bias=False, - ) + if self.post_process: + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=False, + ) def forward(self, x): x = super().forward( x, ) - x = self.final_layernorm(x) - x = x[:, 0] - x = self.head(x) + if self.post_process: + x = self.final_layernorm(x) + x = x[:, 0] + x = self.head(x) return x diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index ea8053398a88..6ba2e8ca91f9 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -149,6 +149,7 @@ def load_nemo_model_weights(nemo_path, sharded_state_dict=None): checkpoint = dist_checkpointing.load( sharded_state_dict=checkpoint, checkpoint_dir=tmp_model_weights_dir, + strict=dist_checkpointing.validation.StrictHandling.LOG_UNEXPECTED, ) state_dict = checkpoint["state_dict"] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py index 54dff1cd7887..afbe85e0edbb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py @@ -67,9 +67,6 @@ def on_validation_epoch_end(self): averaged_loss = torch.tensor(0.0, dtype=torch.float32).cuda() return averaged_loss - def sharded_state_dict(self, prefix: str = ''): - return None - def _reset_activation_checkpointing_args(self): return diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 09f265ed2521..c6b7aac04a55 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -531,6 +531,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = 
preprocess_llama_3(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "mistral": record = { 'conversations': [ @@ -552,6 +553,7 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg, is_mistral=True) + elif multimodal_cfg["conv_template"] == "v1": record = { 'conversations': [ diff --git a/nemo/collections/vlm/__init__.py b/nemo/collections/vlm/__init__.py new file mode 100644 index 000000000000..2aeeae299a7d --- /dev/null +++ b/nemo/collections/vlm/__init__.py @@ -0,0 +1,41 @@ +from nemo.collections.vlm.neva.data import ( + DataConfig, + ImageDataConfig, + ImageToken, + MockDataModule, + MultiModalToken, + NevaLazyDataModule, + VideoDataConfig, + VideoToken, +) +from nemo.collections.vlm.neva.model import ( + CLIPViTConfig, + HFCLIPVisionConfig, + Llava1_5Config7B, + Llava1_5Config13B, + LlavaConfig, + LlavaModel, + MultimodalProjectorConfig, + NevaConfig, + NevaModel, +) + +__all__ = [ + "MockDataModule", + "NevaLazyDataModule", + "DataConfig", + "ImageDataConfig", + "VideoDataConfig", + "MultiModalToken", + "ImageToken", + "VideoToken", + "CLIPViTConfig", + "HFCLIPVisionConfig", + "MultimodalProjectorConfig", + "NevaConfig", + "NevaModel", + "LlavaConfig", + "Llava1_5Config7B", + "Llava1_5Config13B", + "LlavaModel", +] diff --git a/nemo/collections/vlm/neva/__init__.py b/nemo/collections/vlm/neva/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/vlm/neva/data/__init__.py b/nemo/collections/vlm/neva/data/__init__.py new file mode 100644 index 000000000000..bbd502e21c80 --- /dev/null +++ b/nemo/collections/vlm/neva/data/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig, VideoDataConfig +from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule +from nemo.collections.vlm.neva.data.mock import MockDataModule +from nemo.collections.vlm.neva.data.multimodal_tokens import ImageToken, MultiModalToken, VideoToken + +__all__ = [ + "NevaLazyDataModule", + "MockDataModule", + "DataConfig", + "ImageDataConfig", + "VideoDataConfig", + "MultiModalToken", + "ImageToken", + "VideoToken", +] diff --git a/nemo/collections/vlm/neva/data/api.py b/nemo/collections/vlm/neva/data/api.py new file mode 100644 index 000000000000..c2e51e033d8a --- /dev/null +++ b/nemo/collections/vlm/neva/data/api.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytorch_lightning as pl + +from nemo.collections.vlm.neva.data.lazy import NevaLazyDataModule +from nemo.collections.vlm.neva.data.mock import MockDataModule + + +def mock() -> pl.LightningDataModule: + return MockDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + + +def lazy() -> pl.LightningDataModule: + return NevaLazyDataModule(seq_length=4096, global_batch_size=16, micro_batch_size=2) + + +__all__ = ["mock", "lazy"] diff --git a/nemo/collections/vlm/neva/data/config.py b/nemo/collections/vlm/neva/data/config.py new file mode 100644 index 000000000000..3b22d5a493b3 --- /dev/null +++ b/nemo/collections/vlm/neva/data/config.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional + +from .multimodal_tokens import ImageToken, MultiModalToken, VideoToken + + +@dataclass +class DataConfig: + media_type: str # currently supported: image or video + media_token: MultiModalToken + conv_template: str = "v1" # check `nemo/collections/multimodal/data/neva/conversation.py` + reset_position_ids: bool = False # Option to reset the position IDs in the dataset at an interval + reset_attention_mask: bool = False # Option to reset the attention mask from the dataset + eod_mask_loss: bool = False # Option to enable the EOD mask loss + + +@dataclass +class ImageDataConfig(DataConfig): + media_type: str = "image" + media_token: MultiModalToken = ImageToken + image_folder: Optional[str] = None + image_process_mode: str = 'pad' + + +@dataclass +class VideoDataConfig(DataConfig): + media_type: str = "video" + media_token: MultiModalToken = VideoToken + splice_single_frame: Optional[str] = None + # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded. + num_frames: int = 8 # Selects the number of frames to use from the video + sep_token_between_frames: bool = False # TODO: Allow usage of separator tokens between frames + video_folder: Optional[str] = None diff --git a/nemo/collections/vlm/neva/data/conversation.py b/nemo/collections/vlm/neva/data/conversation.py new file mode 100644 index 000000000000..22c435cb1fd2 --- /dev/null +++ b/nemo/collections/vlm/neva/data/conversation.py @@ -0,0 +1,677 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import dataclasses +import re +from collections import defaultdict +from enum import Enum, auto +from io import BytesIO +from typing import Any, List, Optional, Union + +from PIL import Image +from transformers import AutoTokenizer + + +class SeparatorStyle(Enum): + """Different separator style.""" + + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + CHATML = auto() + LLAMA_2 = auto() + LLAMA_3 = auto() + MISTRAL = auto() + NVGPT = auto() + QWEN = auto() + GEMMA = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + + system: Optional[str] + roles: tuple[str, str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + tokenizer_name_or_path: Any = None + stop_str: Union[str, List[str]] = None + stop_token_ids: List[int] = None + + skip_next: bool = False + + def process_prompt_with_images(self, messages): + # Process messages to handle potential image tokens. + return messages + + def process_chat_template(self, tokenizer_name_or_path, messages): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + if self.system is None: + chat = [] + else: + chat = [{"role": "system", "content": self.system}] + for role, message in messages: + chat.append({"role": role.lower(), "content": message}) + ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + return ret + + def get_prompt(self): + messages = self.messages + messages = self.process_prompt_with_images(messages) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.TWO: + """ + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
USER: {{ user_message_1 }} ASSISTANT: {{ model_answer_1 }}USER: {{ user_message_2 }} + """ + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + + elif self.sep_style == SeparatorStyle.MISTRAL and self.version == "vila": + """ + [INST] {{ user_message_1 }} [/INST]{{ model_answer_1 }}[INST] {{ user_message_2 }} [/INST] + """ + wrap_sys = lambda msg: f"{msg}" + ("\n" if msg else "") + wrap_inst = lambda msg: f"[INST] {msg} [/INST]" + ret = "" + + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if message: + if type(message) is tuple: + message, _, _ = message + if i == 0: + message = wrap_sys(self.system) + message + if i % 2 == 0: + message = wrap_inst(message) + ret += self.sep + message + else: + ret += message + self.sep2 + else: + ret += "" + + elif self.sep_style == SeparatorStyle.LLAMA_2: + """ + [INST] <> + You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. + <> + + {{ user_message_1 }} [/INST] {{ model_answer_1 }} [INST] {{ user_message_2 }} [/INST] + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Llama-2-7b-chat-hf" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.LLAMA_3: + """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + + {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + {{ model_answer_1 }}<|eot_id|><|start_header_id|>user<|end_header_id|> + + {{ user_message_2 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "meta-llama/Meta-Llama-3-8B-Instruct" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.NVGPT: + ret = self.sep2 + self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + '\n' + message + '\n' + self.sep + else: + ret += role + '\n' + + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + + elif self.sep_style == SeparatorStyle.MISTRAL: + """ + NOT tested in NeMo! + """ + tokenizer_name_or_path = self.tokenizer_name_or_path or "mistralai/Mistral-7B-Instruct-v0.2" + ret = self.process_chat_template(tokenizer_name_or_path, messages) + + elif self.sep_style == SeparatorStyle.CHATML: + """ + NOT tested in NeMo! + """ + ret = "" if self.system == "" else self.system + self.sep + "\n" + for role, message in messages: + if message: + if type(message) is tuple: + message, images = message + message = "" * len(images) + message + ret += role + "\n" + message + self.sep + "\n" + else: + ret += role + "\n" + return ret + + elif self.sep_style == SeparatorStyle.MPT: + """ + NOT tested in NeMo! 
+ """ + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + + elif self.sep_style == SeparatorStyle.GEMMA: + """ + NOT tested in NeMo! + """ + ret = "" + for i, (role, message) in enumerate(messages): + assert role == self.roles[i % 2], "Conversation should alternate user/assistant/user/assistant/..." + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def process_image(self, image, image_process_mode, return_pil=False, image_format="PNG"): + if image_process_mode == "Pad": + + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + + if type(image) is not Image.Image: + image = Image.open(image).convert("RGB") + + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 1008, 672 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + return image + else: + buffered = BytesIO() + image.save(buffered, format=image_format) + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + return img_b64_str + + def get_images(self, return_pil=False, return_path=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + if type(image) != list: + image = [image] + for img in image: + if not return_path: + img = self.process_image(img, image_process_mode, return_pil=return_pil) + images.append(img) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + if type(image) != list: + image = [image] + if len(image) == 1: + msg = "\n" + msg.replace("", "").strip() + else: + msg = re.sub(r"()\n(?=)", r"\1 ", msg) + for img in image: + img_b64_str = self.process_image(img, "Default", return_pil=False, image_format="JPEG") + img_str = f'' + msg = msg.replace("", img_str, 1).strip() + if len(msg) > 0: + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version, + ) + + def dict(self): + if len(self.get_images()) > 
0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +# Conversation Template for NVGPT +conv_nvgpt = Conversation( + system="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n""", + roles=("User", "Assistant"), + version="nvgpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.NVGPT, + sep="", + sep2=f"System\n", +) + +conv_nv_dpo = Conversation( + system="\n", + roles=("User", "Assistant"), + version="nv_dpo", + messages=[], + offset=0, + sep_style=SeparatorStyle.NVGPT, + sep="", + sep2=f"System\n", +) + +conv_vicuna_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=[ + ["Human", "What are the key differences between renewable and non-renewable energy sources?"], + [ + "Assistant", + "Renewable energy sources are those that can be replenished naturally in a relatively " + "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " + "Non-renewable energy sources, on the other hand, are finite and will eventually be " + "depleted, such as coal, oil, and natural gas. Here are some key differences between " + "renewable and non-renewable energy sources:\n" + "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable " + "energy sources are finite and will eventually run out.\n" + "2. Environmental impact: Renewable energy sources have a much lower environmental impact " + "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, " + "and other negative effects.\n" + "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically " + "have lower operational costs than non-renewable sources.\n" + "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote " + "locations than non-renewable sources.\n" + "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " + "situations and needs, while non-renewable sources are more rigid and inflexible.\n" + "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " + "non-renewable sources are not, and their depletion can lead to economic and social instability.\n", + ], + ], + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_vicuna_v1 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + stop_str="", +) + +conv_llama_2 = Conversation( + system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_3 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("user", "assistant"), + version="llama_v3", + messages=[], + offset=0, + sep="<|eot_id|>", + sep_style=SeparatorStyle.LLAMA_3, + tokenizer_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct", + stop_str="<|eot_id|>", +) + +conv_mistral_instruct = Conversation( + system="", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2_simple = Conversation( + system="Answer the questions about the visual content that the user provides.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_llava_llama_2_mmtag = Conversation( + system="Answer the questions about the visual content that the user provides." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + version="llama_v2_mmtag", + messages=[], + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", + stop_str=" ", +) + +conv_mpt = Conversation( + system="""<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_qwen = Conversation( + system="""<|im_start|>system +You are a helpful assistant.""", + roles=("<|im_start|>user", "<|im_start|>assistant"), + version="qwen", + messages=[], + offset=0, + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", +) + +conv_gemma_instruct = Conversation( + system="", + roles=("user\n", "model\n"), + version="gemma", + messages=[], + offset=0, + sep_style=SeparatorStyle.GEMMA, + sep="\n", +) + +conv_llava_plain = Conversation( + system="", + roles=("", ""), + messages=[], + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="", + sep2="\n", + stop_str="\n", +) + +conv_llava_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. 
" + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=[], + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_llava_v0_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("Human", "Assistant"), + messages=[], + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", + version="v0_mmtag", +) + +conv_llava_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llava_v1_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + messages=[], + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + version="v1_mmtag", +) + +conv_mistral_vila = Conversation( + system=None, + roles=("USER", "ASSISTANT"), + version="vila", + messages=[], + offset=0, + sep_style=SeparatorStyle.MISTRAL, + sep="", + sep2="", + stop_str="", +) + +conv_mistral_orca = Conversation( + system="""<|im_start|>system +You are MistralOrca, a large language model trained by Alignment Lab AI. 
Write out your reasoning step-by-step to be sure you get the right answers!""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_mistral_zephyr = Conversation( + system="""<|system|> +You are a helpful AI assistant.""", + roles=("<|user|>\n", "<|assistant|>\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="", +) + +conv_mistral_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_chatml_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +default_conversation = conv_vicuna_v1 +conv_templates = { + "default": conv_vicuna_v1, + "v0": conv_vicuna_v0, + "v1": conv_vicuna_v1, + "vicuna_v1": conv_vicuna_v1, + "llama_2": conv_llama_2, + "mistral_instruct": conv_mistral_instruct, + "mistral_orca": conv_mistral_orca, + "mistral_zephyr": conv_mistral_zephyr, + "mistral_direct": conv_mistral_direct, + "mistral": conv_mistral_vila, + "plain": conv_llava_plain, + "v0_plain": conv_llava_plain, + "chatml_direct": conv_chatml_direct, + "llava_v0": conv_llava_v0, + "llava_v0_mmtag": conv_llava_v0_mmtag, + "llava_v1": conv_llava_v1, + "llava_v1_mmtag": conv_llava_v1_mmtag, + "llava_llama_2": conv_llava_llama_2, + "llava_llama_3": conv_llava_llama_3, + "llava_llama_2_simple": conv_llava_llama_2_simple, + "llava_llama_2_mmtag": conv_llava_llama_2_mmtag, + "llava_mistral_instruct": conv_mistral_instruct, + "mpt": conv_mpt, + "qwen_1_5": conv_qwen, + "gemma_instruct": conv_gemma_instruct, + "nvgpt": conv_nvgpt, + "nv_steerlm": conv_nvgpt, + "nv_dpo": conv_nv_dpo, +} + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py new file mode 100644 index 000000000000..ca1179e24033 --- /dev/null +++ b/nemo/collections/vlm/neva/data/lazy.py @@ -0,0 +1,612 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
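The conversation templates registered above are looked up by name in the lazy dataset below (via `supported_conv_templates[data_config.conv_template]`). As a rough, illustrative usage sketch (not part of this patch; it assumes the `"v1"` vicuna template and the module path imported further down), a prompt is rendered by copying a template, appending alternating role turns, and calling `get_prompt()`:

from nemo.collections.vlm.neva.data.conversation import conv_templates

conv = conv_templates["v1"].copy()            # vicuna_v1, also the default template
conv.append_message(conv.roles[0], "Describe the scene in one sentence.")  # user turn
conv.append_message(conv.roles[1], None)      # empty assistant slot ends the prompt at the assistant role
print(conv.get_prompt())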
+ +from typing import TYPE_CHECKING, Optional + +import pytorch_lightning as pl +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils import data +from torch.utils.data import DataLoader + +from nemo.collections.vlm.neva.data.config import DataConfig, ImageDataConfig +from nemo.collections.vlm.neva.data.conversation import conv_templates as supported_conv_templates +from nemo.lightning.pytorch.plugins import MegatronDataSampler + +if TYPE_CHECKING: + pass + +import json +import logging +import os +import re +import tarfile +from typing import Any, Dict, List, Sequence + +import decord +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torch.utils.data import Dataset, default_collate +from transformers import CLIPImageProcessor, SiglipImageProcessor + +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.vlm.neva.data.multimodal_tokens import IGNORE_INDEX, SPECIAL_TOKEN_MAP + + +class TarOrFolderImageLoader: + """ + A class for loading images from a tar archive or a regular folder. + + This class provides functionality to open and read images from either a tar archive + (.tar file) or a standard directory with image files. It builds an index of images + if the source is a tar archive for efficient access. + + Attributes: + image_folder (str): The path to the tar archive or image folder. + tar_index (dict): A dictionary that maps file names to their tarfile member + objects if the image source is a tar archive. + + Methods: + __init__(self, image_folder): Initializes the loader with the specified image folder. + build_index(self): Builds an index of image file names and their corresponding + tarfile member objects for a tar archive. + open_image(self, file_name): Opens and returns an image by its file name. The image + is returned as an RGB PIL Image object. + """ + + def __init__(self, image_folder): + self.image_folder = image_folder + self.tar_index = {} + if self.image_folder.endswith('.tar'): + self.build_index() + + def build_index(self): + with tarfile.open(self.image_folder, 'r') as tar: + for member in tar.getmembers(): + self.tar_index[member.name] = member + + def open_image(self, file_name): + if self.image_folder.endswith('.tar'): + with tarfile.open(self.image_folder, 'r') as tar: + member = self.tar_index.get(file_name) + if member: + f = tar.extractfile(member) + return Image.open(f).convert('RGB') + else: + return Image.open(os.path.join(self.image_folder, file_name)).convert('RGB') + return None + + +class TarOrFolderVideoLoader: + """ + A class for loading videos from a tar archive or a regular folder. + + This class provides functionality to open and read videos from either a tar archive + (.tar file) or a standard directory with video files. It builds an index of videos + if the source is a tar archive for efficient access. + + Attributes: + video_folder (str): The path to the tar archive or video folder. + data_config (dict): A dictionary of configuration options for video decoding to frames + tar_index (dict): A dictionary that maps file names to their tarfile member + objects if the video source is a tar archive. + + Methods: + __init__(self, video_folder): Initializes the loader with the specified video folder. + build_index(self): Builds an index of image file names and their corresponding + tarfile member objects for a tar archive. + open_video(self, file_name): Opens and returns an video by its file name. 
The video + is returned as a list of RGB PIL Image objects. + flatten_frames(self, cap): Converts decord VideoReader video object to list of frame + images based on data config information. + """ + + def __init__(self, video_folder, data_config): + self.video_folder = video_folder + self.data_config = data_config + self.tar_index = {} + if self.video_folder.endswith('.tar'): + self.build_index() + + def build_index(self): + with tarfile.open(self.video_folder, 'r') as tar: + for member in tar.getmembers(): + self.tar_index[member.name] = member + + def open_video(self, file_name): + if self.video_folder.endswith('.tar'): + with tarfile.open(self.video_folder, 'r') as tar: + member = self.tar_index.get(file_name) + if member: + f = tar.extractfile(member) + cap = decord.VideoReader(f) + return self.flatten_frames(cap) + else: + # decord.bridge.set_bridge("torch") + cap = decord.VideoReader(os.path.join(self.video_folder, file_name)) + return self.flatten_frames(cap) + return None + + def flatten_frames(self, cap): + if self.data_config.splice_single_frame == 'first': + frame = cap[0].asnumpy() + return Image.fromarray(frame).convert('RGB') + elif self.data_config.splice_single_frame == 'middle': + frame = cap[len(cap) // 2].asnumpy() + return Image.fromarray(frame).convert('RGB') + elif self.data_config.splice_single_frame == 'last': + frame = cap[-1].asnumpy() + return Image.fromarray(frame).convert('RGB') + else: + if self.data_config.num_frames == -1: + frames = [] + for frame in cap: + rgb_frame = frame.asnumpy() + img = Image.fromarray(rgb_frame).convert('RGB') + frames.append(img) + return frames + else: + num_frames = min(len(cap), self.data_config.num_frames) + indices = np.linspace(0, len(cap) - 1, num_frames, dtype=int) + frames = [Image.fromarray(cap[i].asnumpy()).convert('RGB') for i in indices] + while len(frames) < self.data_config.num_frames: + frames.append(frames[-1]) + return frames + + +def process_image(processor, image, image_process_mode="square"): # this needs to be merged with conv's process image + if isinstance(processor, CLIPImageProcessor) or isinstance(processor, SiglipImageProcessor): + # image processor from HF + if image_process_mode == 'keep': + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 448, 224 + shortest_edge = int(min(max_len / aspect_ratio, min_len)) + image = processor.preprocess( + image, return_tensors='pt', do_center_crop=False, size={"shortest_edge": shortest_edge} + )['pixel_values'][0] + elif image_process_mode == 'pad': + + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + assert image_process_mode == 'square', 'NeMo image transform with setting `image_process_mode` to `square`.' 
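+ # Non-HF image processors are assumed here to be plain callables applied directly to the
+ # PIL image (e.g. a torchvision-style transform); only the default 'square' mode is
+ # supported on this path.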
+ image = processor(image) + return image + + +def tokenize_special_token(prompt, tokenizer, special_token_map=None): + """ + Tokenizes a given prompt with special handling for multiple special tokens. + + This function splits the prompt at special tokens, tokenizes each chunk separately, + and then reassembles the chunks with the corresponding special token inserted in place of the placeholders. + + Parameters: + prompt (str): The input prompt containing text and special token placeholders. + tokenizer: The tokenizer object used to tokenize the prompt chunks. + special_token_map (list, optional): A list containing tuples of special token strings + and their corresponding token indices. Defaults to SPECIAL_TOKEN_MAP. + + Returns: + torch.Tensor: A tensor of token IDs representing the tokenized prompt with special tokens. + """ + + # Use the default special token map if none is provided + if special_token_map is None: + special_token_map = SPECIAL_TOKEN_MAP + + # Create a mapping of special tokens to their indices + special_token_dict = {token: index for token, index in special_token_map} + + # Split the prompt into chunks and track special tokens + regex_pattern = '(' + '|'.join(re.escape(token) for token in special_token_dict.keys()) + ')' + chunks = re.split(regex_pattern, prompt) + + # Tokenize each chunk and replace special tokens with their indices + tokenized_chunks = [] + for chunk in chunks: + if chunk in special_token_dict: + tokenized_chunks.append(special_token_dict[chunk]) + elif len(chunk) > 0: + tokenized_chunks.extend(tokenizer(chunk, add_special_tokens=False).input_ids) + + return torch.tensor(tokenized_chunks, dtype=torch.long) + + +def find_pattern_indices(template, pattern, search_start_index=0, allow_first_token_mismatch=False): + template_len = len(template) + pattern_len = len(pattern) + for i in range(search_start_index, template_len - pattern_len + 1): + match = template[i : i + pattern_len] == pattern + if torch.all(match) or (allow_first_token_mismatch and torch.all(match[1:])): + return i, i + pattern_len + return -1, -1 + + +class LazySupervisedDataset(Dataset): + + def __init__( + self, + data_path, + data_config, + tokenizer, + image_processor, + ): + super().__init__() + if data_path is not None: + with open(data_path, "r") as file: + list_data_dict = json.load(file) + else: + list_data_dict = [] + + logging.warning("Formatting inputs...Skip in lazy mode") + self.data_config = data_config + self.tokenizer = tokenizer + self.image_processor = image_processor + + self.conv_template = data_config.conv_template + self.conv = supported_conv_templates[self.conv_template] + self.image_process_mode = data_config.image_process_mode + self.list_data_dict = list_data_dict + + image_folder = getattr(data_config, "image_folder", None) + video_folder = getattr(data_config, "video_folder", None) + + self.image_loader = TarOrFolderImageLoader(image_folder) if image_folder else None + self.video_loader = TarOrFolderVideoLoader(video_folder, data_config) if video_folder else None + + def __len__(self): + return len(self.list_data_dict) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + source = self.list_data_dict[i] + conversations = self._apply_prompt_templates(source, use_plain=self.conv_template == "plain") + tokens, labels = self._tokenize_and_label(conversations) + + media_tensors = self._process_images(source) + data_dict = dict( + image=media_tensors, + tokens=tokens, + labels=labels, + ) + return data_dict + + def _process_images(self, source): + media_tensors 
= torch.tensor([]) + if 'image' in source: + if not isinstance(source['image'], list): + source['image'] = [source['image']] + + images = [] + for image_file in source['image']: + image = self.image_loader.open_image(image_file) + if image is None: + logging.warning(f"Image {image_file} could not be found!") + image = process_image(self.image_processor, image, self.image_process_mode) + images.append(image) + + if images: + media_tensors = torch.stack(images) + return media_tensors + + def _apply_prompt_templates(self, source, use_plain=False): + conv = self.conv + + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + source = source['conversations'] + if roles[source[0]["from"]] != conv.roles[0]: + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{j}" + conv.append_message(role, sentence["value"]) + + if use_plain: + assert len(conv.messages) == 2, "Plain template requires image-caption pairs." + assert "" in conv.messages[0][1] + conv.messages[0][1] = "" + + return conv.get_prompt() + + def _tokenize_and_label(self, conversations): + tokens = tokenize_special_token(conversations, self.tokenizer) + labels = torch.ones_like(tokens) * IGNORE_INDEX + search_start_index = 0 + for i in range(1, len(self.conv.messages), 2): + stop_str = getattr(self.conv, "stop_str", None) + assert ( + stop_str is not None + ), "If `stop_str` is not provided, issues might occur in labeling the answer tokens." + answer_tokens = self.tokenizer.encode( + self.conv.messages[i][1] + ("" if stop_str is None else stop_str), + add_special_tokens=False, + return_tensors="pt", + )[0] + answer_start, answer_end = find_pattern_indices(tokens, answer_tokens, search_start_index) + labels[answer_start:answer_end] = tokens[answer_start:answer_end] + search_start_index = answer_end + tokens = tokens[:-1] + labels = labels[1:] + return tokens, labels + + def _get_crop_size(self): + if isinstance(self.image_processor, CLIPImageProcessor): + return [self.image_processor.crop_size['height'], self.image_processor.crop_size['width']] + else: + raise NotImplementedError + + +class NevaDataset(LazySupervisedDataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + data_path, + data_config, + tokenizer, + image_processor, + ): + + if data_path.endswith(".json"): + super().__init__(data_path, data_config, tokenizer, image_processor) + + elif data_path.endswith(".jsonl"): + super().__init__(None, data_config, tokenizer, image_processor) + logging.warning("Loading image inputs from SteerLM Dataset...") + if data_config.media_type == 'image': + image_folder = data_config.image_folder + for line in open(data_path, "r"): + record = json.loads(line) + + # This currently supports only a single image + # search for tag + + record['image'] = [] + for turn in record['conversations']: + matches = re.finditer('', "", turn['value']) + + self.list_data_dict.append(record) + + else: + raise ValueError(f"Formatting of {data_path} is not supported in Neva.") + + def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + data_config = self.data_config + packed_sequence = "cu_seqlens" in instances[0] + max_len = max(instance['tokens'].shape[0] for instance in instances) + for instance in instances: + pad_len = max_len - instance['tokens'].shape[0] + instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) + instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', 
IGNORE_INDEX) + if packed_sequence and instance["cu_seqlens"][-1] != max_len: + instance["cu_seqlens"] = torch.cat((instance["cu_seqlens"], torch.IntTensor([max_len])), 0) + + if packed_sequence: + max_len_cu = max(instance['cu_seqlens'].shape[0] for instance in instances) + max_len_image = max(instance['image'].shape[0] for instance in instances) + for instance in instances: + pad_len_cu = max_len_cu - instance['cu_seqlens'].shape[0] + instance['cu_seqlens'] = F.pad(instance['cu_seqlens'], (0, pad_len_cu), 'constant', max_len) + + x = instance['image'] + num_pad = max_len_image - x.shape[0] + pad_tensor = torch.zeros(num_pad, *x.shape[1:], dtype=x.dtype, device=x.device) + instance['image'] = torch.cat((x, pad_tensor), dim=0) + + media_type = data_config.media_type + if media_type == 'image': + media = [instance.pop('image') for instance in instances] + media = torch.cat(media, dim=0) + if media.size(0) == 0: + media = None + elif media_type == 'video': + media = [instance.pop('video', None) for instance in instances] + else: + raise ValueError(f"Unsupported media type {media_type}") + + batch = default_collate(instances) + tokenizer = self.tokenizer + + tokens = batch['tokens'] + labels = batch['labels'] + + if packed_sequence: + cu_seqlens = batch["cu_seqlens"] + position_ids = [] + for cu_seqlen in cu_seqlens: + position_ids.append([]) + for ind in range(0, len(cu_seqlen) - 1): + seqlen = cu_seqlen[ind + 1] - cu_seqlen[ind] + position_ids[-1].extend(list(range(seqlen))) + position_ids = torch.LongTensor(position_ids) + loss_mask = torch.ones(tokens.size(), dtype=torch.float, device=tokens.device) + attention_mask = torch.ones(tokens.size(), dtype=torch.long, device=tokens.device) + else: + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=tokenizer.eos_token_id, + eod_mask_loss=data_config.eod_mask_loss, + reset_attention_mask=data_config.reset_attention_mask, + reset_position_ids=data_config.reset_position_ids, + ) + + loss_mask[labels < 0] = 0.0 + + batch = { + 'tokens': tokens, + 'labels': labels, + 'attention_mask': attention_mask, + 'loss_mask': loss_mask, + 'position_ids': position_ids, + 'media': media, + } + if packed_sequence: + batch["cu_seqlens"] = cu_seqlens + return batch + + +class NevaLazyDataModule(pl.LightningDataModule): + def __init__( + self, + paths: str | List[str], + weights: Optional[List[float]] = None, + data_config: Optional[DataConfig] = ImageDataConfig, + seq_length: int = 2048, + tokenizer: Optional = None, + image_processor: Optional = None, + micro_batch_size: int = 4, + global_batch_size: int = 8, + num_train_samples: int = 10_000, + num_val_samples: int = 10_000, + num_test_samples: int = 10_000, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + use_packed_sequence: bool = False, + seed: int = 1234, + ) -> None: + super().__init__() + if not isinstance(paths, (list, tuple)): + paths = [paths] + if weights is not None: + assert len(weights) == len(paths) + if len(weights) == 1: + # weights must be None if there is only one dataset + weights = None + + self.paths = paths + self.weights = weights + self.data_config = data_config + self.seq_length = seq_length + self.tokenizer = tokenizer + self.image_processor = image_processor + self.num_train_samples = num_train_samples + self.num_val_samples = num_val_samples + self.num_test_samples = num_test_samples + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + 
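+ # NOTE: `seed` is only stored for now; the train/val split that would use it, and
+ # packed-sequence support, are both left as TODOs in `setup()`.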
self.seed = seed + self.use_packed_sequence = use_packed_sequence + self.init_global_step = 0 + + if tokenizer is None or image_processor is None: + logging.warning(f"Processor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.") + from transformers import AutoProcessor + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + self.tokenizer = tokenizer or processor.tokenizer + self.image_processor = image_processor or processor.image_processor + + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + dataloader_type="cyclic", + ) + + def setup(self, stage: str = "") -> None: + assert len(self.paths) == 1, "not yet support blend dataset in Neva 2.0!" + if self.use_packed_sequence: + pass # TODO + else: + # TODO: + # rng = torch.Generator().manual_seed(self.seed) + # train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=rng) + self._train_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) + self._validation_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._validation_ds) + + def test_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._test_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + self.init_global_step = self.trainer.global_step + self.data_sampler.init_global_step = self.init_global_step + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=getattr(dataset, 'collate_fn', data.dataloader.default_collate), + **kwargs, + ) + + def state_dict(self) -> Dict[str, Any]: + """Called when saving a checkpoint, implement to generate and save datamodule state. + + Returns: + A dictionary containing datamodule state. + + """ + consumed_samples = self.data_sampler.compute_consumed_samples(self.trainer.global_step - self.init_global_step) + return {'consumed_samples': consumed_samples} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """Called when loading a checkpoint, implement to reload datamodule state given datamodule stat + + Args: + state_dict: the datamodule state returned by ``state_dict``. + + """ + try: + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + except ModuleNotFoundError: + from nemo.lightning.apex_utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + consumed_samples = state_dict['consumed_samples'] + self.data_sampler.init_consumed_samples = consumed_samples + self.data_sampler.prev_consumed_samples = consumed_samples + self.if_first_step = 1 + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is not None: + num_microbatch_calculator = _GLOBAL_NUM_MICROBATCHES_CALCULATOR # noqa: SLF001 + + num_microbatch_calculator.update( + consumed_samples=consumed_samples, + consistency_check=False, + ) diff --git a/nemo/collections/vlm/neva/data/mock.py b/nemo/collections/vlm/neva/data/mock.py new file mode 100644 index 000000000000..ac4bc56a068c --- /dev/null +++ b/nemo/collections/vlm/neva/data/mock.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional + +import numpy as np +import pytorch_lightning as pl +import torch +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils import data +from torch.utils.data import DataLoader, Dataset + +from nemo.collections.vlm.neva.data.multimodal_tokens import IMAGE_TOKEN_INDEX +from nemo.lightning.pytorch.plugins import MegatronDataSampler + + +class MockDataModule(pl.LightningDataModule): + def __init__( + self, + seq_length: int = 2048, + tokenizer: Optional = None, + image_processor: Optional = None, + micro_batch_size: int = 4, + global_batch_size: int = 8, + rampup_batch_size: Optional[List[int]] = None, + num_train_samples: int = 10_000, + num_val_samples: int = 10_000, + num_test_samples: int = 10_000, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + ): + super().__init__() + self.seq_length = seq_length + self.num_train_samples = num_train_samples + self.num_val_samples = num_val_samples + self.num_test_samples = num_test_samples + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + + if tokenizer is None or image_processor is None: + from transformers import AutoProcessor + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + self.tokenizer = tokenizer or processor.tokenizer + self.image_processor = image_processor or processor.image_processor + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + ) + + def setup(self, stage: str = "") -> None: + self._train_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "train", self.num_train_samples, self.seq_length + ) + self._validation_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "valid", self.num_val_samples, self.seq_length + ) + self._test_ds = _MockNevaDataset( + self.tokenizer, self.image_processor, "test", self.num_test_samples, self.seq_length + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + if not hasattr(self, "_train_ds"): + self.setup() + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + if not hasattr(self, "_validation_ds"): + self.setup() + return self._create_dataloader(self._validation_ds) + + def test_dataloader(self) -> EVAL_DATALOADERS: + if not hasattr(self, "_test_ds"): + self.setup() + return self._create_dataloader(self._test_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=dataset.collate_fn, + **kwargs, + ) + + +class _MockNevaDataset(Dataset): + def __init__( + self, + tokenizer, + image_processor, + name: str, + num_samples: int, + seq_length: int, + seed: int = 42, + ) -> None: + 
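+ # Every sample is generated deterministically from `seed + idx` in `__getitem__`,
+ # so the mock data is reproducible across runs without any real image or text assets.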
super().__init__() + self.name = name + self.seq_length = seq_length + + self.vocab_size = tokenizer.vocab_size + + crop_size = image_processor.crop_size + self.image_height, self.image_width = crop_size["height"], crop_size["width"] + + self.length = num_samples + self.seed = seed + + self.loss_mask = torch.ones(self.seq_length, dtype=torch.float) + self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) + + def __len__(self) -> int: + return self.length + + def _get_text(self, idx: int) -> np.ndarray: + np_gen = np.random.default_rng(seed=(self.seed + idx)) + return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64) + + def __getitem__(self, idx) -> Dict[str, torch.Tensor]: + # Generate data of the expected size and datatype (based on GPTDataset). + np_gen = np.random.default_rng(seed=(self.seed + idx)) + tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64)) + tokens[2] = IMAGE_TOKEN_INDEX # ImageToken token index + labels = tokens.clone() + images = torch.from_numpy(np_gen.random(size=[3, self.image_height, self.image_width], dtype=np.float32)) + tokens = tokens[:-1] + labels = labels[1:] + return { + "media": images, + "tokens": tokens, + "labels": labels, + "loss_mask": self.loss_mask, + "position_ids": self.position_ids, + } + + def _collate_fn(self, batch): + """ + A default implementation of a collation function. + Users should override this method to define custom data loaders. + """ + collated_batch = data.dataloader.default_collate(batch) + collated_batch["attention_mask"] = None + return collated_batch + + def collate_fn(self, batch): + """Method that user pass as functor to DataLoader. + + The method optionally performs neural type checking and add types to the outputs. + + Please note, subclasses of Dataset should not implement `input_types`. + + # Usage: + dataloader = torch.utils.data.DataLoader( + ...., + collate_fn=dataset.collate_fn, + .... + ) + + Returns + ------- + Collated batch, with or without types. + """ + return self._collate_fn(batch) diff --git a/nemo/collections/vlm/neva/data/multimodal_tokens.py b/nemo/collections/vlm/neva/data/multimodal_tokens.py new file mode 100644 index 000000000000..8c4dcadad63c --- /dev/null +++ b/nemo/collections/vlm/neva/data/multimodal_tokens.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Callable, Optional + + +@dataclass +class MultiModalToken: + """ + Base class for multimodal tokens representing different media types. + """ + + token_str: str + token_index: int + media_type: str + use_start_end: bool + encoder_fn: Optional[Callable] = None + + +@dataclass +class ImageToken(MultiModalToken): + token_str: str = "" + token_index: int = -200 + media_type: str = "image" + use_start_end: bool = False + + +@dataclass +class VideoToken(MultiModalToken): + token_str: str = "