
Commit

Merge branch 'main' into export_wordlist_fix
JimmyZhang12 authored Apr 16, 2024
2 parents ec29505 + 468d5b6 commit 9c54ba6
Showing 112 changed files with 5,695 additions and 1,212 deletions.
22 changes: 12 additions & 10 deletions .github/workflows/cicd-main.yml
@@ -15,9 +15,12 @@ name: "CICD NeMo"

on:
pull_request:
types: [opened, reopened, ready_for_review]
branches: [ "main" ]

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
gpu-test:
runs-on: self-hosted-azure
@@ -28,7 +31,7 @@ jobs:
nvidia-smi
cicd-cluster-clean:
-runs-on: self-hosted-azure
+runs-on: self-hosted-azure-cpu
steps:
- name: Clean server from old files
run: |
@@ -50,7 +53,7 @@ jobs:

cicd-test-container-setup:
needs: [cicd-cluster-clean]
-runs-on: self-hosted-azure
+runs-on: self-hosted-azure-cpu
# uses: actions/cache@v2
#container:
# image: nvcr.io/nvidia/pytorch:24.01-py3
@@ -70,7 +73,7 @@ jobs:
- name: Container setup
run: |
# Pull base PyTorch container
-docker pull nvcr.io/nvidia/pytorch:24.01-py3
+docker pull nvcr.io/nvidia/pytorch:24.02-py3
docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.01-py3 /bin/bash -c '
set -x
@@ -90,21 +93,19 @@ jobs:
# NeMo Installation
./reinstall.sh release
-# Transformer Engine 1.2.0
+# Transformer Engine installation
git clone https://github.com/NVIDIA/TransformerEngine.git && \
pushd TransformerEngine && \
-git fetch origin 9b2fed514ea419141146f843ab2c84b22b86bfd7 && \
+git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . && \
popd
-# Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760
+# Apex installation
git clone https://github.com/NVIDIA/apex.git && \
pushd apex && \
-git checkout b496d85fb88a801d8e680872a12822de310951fd && \
+git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \
cp -R apex /usr/local/lib/python3.10/dist-packages && \
popd
@@ -113,12 +114,13 @@ jobs:
# Megatron Core installation
git clone https://github.com/NVIDIA/Megatron-LM.git && \
pushd Megatron-LM && \
-git checkout 43792028f003ed25a3ee8c5a0d4cad82317d81b5 && \
+git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \
pip install . && \
pushd megatron/core/datasets && \
make && \
popd && \
popd
+export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
# Install only for test: L2: Segmentation Tool
pushd tools/ctc_segmentation && \
@@ -194,7 +196,7 @@ jobs:

L0_Unit_Tests_CPU:
needs: [cicd-test-container-setup]
-runs-on: self-hosted-azure
+runs-on: self-hosted-azure-cpu
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
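Taken together, the workflow changes move CPU-only jobs to the self-hosted-azure-cpu pool, bump the base container toward 24.02, and re-pin Transformer Engine, Apex, and Megatron Core. A hypothetical smoke test of those pins inside the built CI image (the image name follows the workflow's tagging scheme; RUN_ID stands in for the actual github.run_id) might look like:

    # Hypothetical sanity check -- not part of this commit.
    # Confirm the three re-pinned dependencies import cleanly in the CI image.
    docker run --rm --gpus all nemoci.azurecr.io/nemo_container_$RUN_ID \
        python -c 'import transformer_engine, apex, megatron.core; print("deps OK")'
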
2 changes: 1 addition & 1 deletion Dockerfile
@@ -141,7 +141,7 @@ COPY . .

# start building the final container
FROM nemo-deps as nemo
-ARG NEMO_VERSION=1.23.0
+ARG NEMO_VERSION=2.0.0

# Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container
# version information as runtime environment variable for introspection purposes
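Because NEMO_VERSION is a Docker build argument, the bump to 2.0.0 only changes the default; a minimal sketch of passing it explicitly at build time (tag name is illustrative):

    # Sketch: restate or override the version when building the image.
    docker build --build-arg NEMO_VERSION=2.0.0 -t nemo:2.0.0 .
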
106 changes: 78 additions & 28 deletions Jenkinsfile
@@ -1,7 +1,7 @@
pipeline {
agent {
docker {
-image 'nvcr.io/nvidia/pytorch:24.01-py3'
+image 'nvcr.io/nvidia/pytorch:24.02-py3'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1'
}
}
@@ -63,44 +63,35 @@ pipeline {
}
}

-// Transformer Engine 1.2.0
stage('Transformer Engine installation') {
steps {
sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
-git fetch origin 8c9abbb80dba196f086b8b602a7cf1bce0040a6a && \
+git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .'
}
}

-// Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760
stage('Apex installation') {
steps {
sh 'git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
-git checkout c07a4cf67102b9cd3f97d1ba36690f985bae4227 && \
+git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \
cp -R apex /usr/local/lib/python3.10/dist-packages'
}
}

stage('Pytorch lightning installation') {
steps {
sh 'git clone -b bug_fix https://github.com/athitten/pytorch-lightning.git && \
cd pytorch-lightning && \
PACKAGE_NAME=pytorch pip install -e .'
}
}

// pip package should be working with main, if not we can update the commit here
// until the pip package is updated
stage('Megatron Core installation') {
steps {
sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
-git checkout a5415fcfacef2a37416259bd38b7c4b673583675 && \
-pip install .'
+git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \
+pip install . && \
+cd megatron/core/datasets && \
+make'
+sh 'export PYTHONPATH="${PYTHONPATH}:/mnt/D3/JenkinsWorkDir/workspace/NeMo-multibranch_${GIT_BRANCH}/Megatron-LM"'
}
}
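For local debugging, the same pin-and-build sequence can be reproduced outside Jenkins; a rough sketch, with the Jenkins workspace path replaced by the clone directory:

    # Sketch: reproduce the Megatron Core install from this stage locally.
    git clone https://github.com/NVIDIA/Megatron-LM.git
    cd Megatron-LM
    git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f
    pip install .
    (cd megatron/core/datasets && make)          # build the dataset helper extension
    export PYTHONPATH="${PYTHONPATH}:$(pwd)"     # expose the source tree, as the pipeline does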

@@ -134,6 +125,7 @@ pipeline {
sh 'python tests/core_ptl/check_imports.py --domain "nlp"'
}
}

stage('L0: Unit Tests GPU') {
steps {
sh 'NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads'
@@ -236,7 +228,7 @@ pipeline {
trainer.max_steps=20 \
model.micro_batch_size=1 \
model.global_batch_size=1 \
model.data.synthetic_data=True \
model.data.synthetic_data=True \
model.first_stage_key=images_moments \
model.cond_stage_key=clip_encoded \
model.optim.name=megatron_fused_adam \
@@ -3526,6 +3518,64 @@ pipeline {
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \
trainer.num_nodes=1 \
trainer.devices=2 \
trainer.precision=bf16 \
trainer.accelerator=gpu \
model.data.data_prefix=['none'] \
exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \
model.mcore_gpt=True \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.optim.name=distributed_fused_adam \
model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \
model.data.num_workers=4 \
model.micro_batch_size=1 \
model.data.shuffle_documents=False \
trainer.val_check_interval=30 \
+trainer.num_sanity_val_steps=0 \
model.init_method_std=0.023 \
model.optim.lr=6.0e-4 \
model.megatron_amp_O2=True \
model.data.splits_string=\'\"98,2,0\"\' \
model.data.dataloader_type=cyclic \
trainer.max_steps=10"
sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \
trainer.num_nodes=1 \
trainer.devices=2 \
trainer.precision=bf16 \
trainer.accelerator=gpu \
model.data.data_prefix=['none'] \
exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \
model.mcore_gpt=True \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.optim.name=distributed_fused_adam \
model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \
model.data.num_workers=4 \
model.micro_batch_size=1 \
model.data.shuffle_documents=False \
trainer.val_check_interval=30 \
+trainer.num_sanity_val_steps=0 \
model.init_method_std=0.023 \
model.optim.lr=6.0e-4 \
model.megatron_amp_O2=True \
model.data.splits_string=\'\"98,2,0\"\' \
model.data.dataloader_type=cyclic \
trainer.max_steps=20"
sh "rm -rf examples/nlp/language_modeling/mcore_retro_results"
}
}
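The stage above runs an identical command twice against the same exp_dir, first to trainer.max_steps=10 and then to 20, so the second run must resume from the first run's checkpoints. Schematically (train.py is a placeholder script name):

    # Schematic of the pretrain-then-resume pattern exercised above.
    python train.py exp_manager.exp_dir=results trainer.max_steps=10   # writes checkpoints
    python train.py exp_manager.exp_dir=results trainer.max_steps=20   # resumes, continues to step 20
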
stage('L2: (Legacy) Megatron RETRO Pretraining and Resume Training') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \
trainer.devices=2 \
trainer.num_nodes=1 \
trainer.accelerator=gpu \
@@ -3536,7 +3586,7 @@ pipeline {
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
trainer.val_check_interval=10 \
-exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \
+exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \
model.data.data_prefix='' \
model.data.knn_index='' \
model.data.retrieval_prefix='' \
@@ -3555,7 +3605,7 @@ pipeline {
model.enc_cross_attention=[1] \
model.dec_cross_attention=[1] \
+model.data.mock=True"
sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \
sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \
trainer.devices=2 \
trainer.num_nodes=1 \
trainer.accelerator=gpu \
Expand All @@ -3566,7 +3616,7 @@ pipeline {
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
trainer.val_check_interval=10 \
-exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \
+exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \
model.data.data_prefix='' \
model.data.knn_index='' \
model.data.retrieval_prefix='' \
@@ -3585,10 +3635,10 @@ pipeline {
model.enc_cross_attention=[1] \
model.dec_cross_attention=[1] \
+model.data.mock=True"
sh "rm -rf examples/nlp/language_modeling/retro_results"
sh "rm -rf examples/nlp/language_modeling/retro_legacy_results"
}
}
-stage('L2: Megatron RETRO muTransfer Pretraining Performance') {
+stage('L2: (Legacy) Megatron RETRO muTransfer Pretraining Performance') {
when {
anyOf {
branch 'main'
@@ -3609,7 +3659,7 @@ pipeline {
trainer.limit_val_batches=0 \
trainer.gradient_clip_val=1.0 \
+trainer.num_sanity_val_steps=0 \
-exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \
+exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results/ \
+exp_manager.version=smalltest \
model.data.neighbors=2 \
model.megatron_amp_O2=False \
@@ -3660,15 +3710,15 @@ import torch
if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()):
import sys
sys.exit(0)
-event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0]
+event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_legacy_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0]
ea = EventAccumulator(str(event_file)).Reload()
vals = []
for i in ea.Scalars('reduced_train_loss'):
vals.append(i.value)
training_curve = pd.DataFrame({'loss': vals})
gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv')
assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf examples/nlp/language_modeling/retro_results"
sh "rm -rf examples/nlp/language_modeling/retro_legacy_results"
}
}
stage('L2: BioMegatron Bert NER Task') {
@@ -4654,7 +4704,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.sequence_parallel=true \
-model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
+model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
model.peft.peft_scheme='lora' \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -5858,4 +5908,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
cleanWs()
}
}
}
}
2 changes: 1 addition & 1 deletion README.rst
@@ -36,7 +36,7 @@
.. _main-readme:

**NVIDIA NeMo Framework**
-===============
+=========================

Latest News
-----------
