Commit 7b8cfd9
Merge remote-tracking branch 'origin/main' into huvu/t5_nemo2.0_nemoci
Huy Vu2 committed Nov 19, 2024
2 parents dd2c4aa + da79f6a
Showing 615 changed files with 11,960 additions and 1,857 deletions.
28 changes: 14 additions & 14 deletions .github/workflows/cicd-main.yml
@@ -2943,7 +2943,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+ python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
@@ -2975,7 +2975,7 @@ jobs:
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+ python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
@@ -3398,8 +3398,8 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \
- --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
+ python examples/nlp/language_modeling/megatron_t5_eval.py \
+ --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
--prompt "How do I fix my GPU memory issue? I am seeing <mask> out of memory." \
--tensor_model_parallel_size 1
@@ -3410,7 +3410,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
+ python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
@@ -3421,7 +3421,7 @@
exp_manager.exp_dir=/tmp/nlp_mcore_t5_lora_tuning_tp2 \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
- model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
+ model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -3433,8 +3433,8 @@
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
- model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
+ python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
+ model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
model.peft.restore_from_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \
model.peft.restore_from_ckpt_name=null \
model.peft.restore_from_hparams_path=null \
@@ -3852,14 +3852,14 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \
+ python tests/collections/llm/megatron_t5_pretraining.py \
--devices=2 \
--max-steps=3 \
--experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \
--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \
--index-mapping-dir=tests/collections/llm/t5_index_mappings/${{ github.run_id }}
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \
+ python tests/collections/llm/megatron_t5_pretraining.py \
--devices=2 \
--max-steps=6 \
--experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \
@@ -3876,11 +3876,11 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \
+ python tests/collections/llm/megatron_t5_finetuning.py \
--devices=2 \
--max-steps=250 \
--experiment-dir=tests/collections/llm/t5_finetune_results/${{ github.run_id }} \
- --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps
+ --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_padding_attnmasktype_150steps
AFTER_SCRIPT: |
rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }}
@@ -3891,12 +3891,12 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \
+ python tests/collections/llm/megatron_t5_finetuning.py \
--devices=2 \
--max-steps=250 \
--peft=lora \
--experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \
- --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps
+ --checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_padding_attnmasktype_150steps
AFTER_SCRIPT: |
rm -rf tests/collections/llm/t5_peft_results/${{ github.run_id }}
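A note on the recurring edit in this workflow: NVTE_FUSED_ATTN=0 and NVTE_FLASH_ATTN=0 are Transformer Engine environment variables that force-disable the fused and flash attention backends, and every T5 job above drops them while switching to the new padding_attnmasktype checkpoints. A minimal Python sketch of how such an env-var toggle is typically read (the helper below is hypothetical; Transformer Engine's actual parsing is not part of this diff):

    import os

    # Hypothetical reader for the NVTE_* kill switches removed above:
    # "0" disables a backend; unset (or "1") leaves it enabled.
    def backend_enabled(var: str, default: str = "1") -> bool:
        return os.environ.get(var, default) != "0"

    use_flash = backend_enabled("NVTE_FLASH_ATTN")
    use_fused = backend_enabled("NVTE_FUSED_ATTN")
    print(f"flash attention: {use_flash}, fused attention: {use_fused}")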
2 changes: 1 addition & 1 deletion .github/workflows/monitor-vms.yml
@@ -27,7 +27,7 @@ jobs:
| jq -c '[
.runners[]
| select(.status == "online")
- | select(.name | contains("gpu"))
+ | select(.name | contains("cpu") | not)
| {
"vm": .name,
"n_gpus": [
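The jq change above loosens runner selection: the monitor previously kept only online runners whose name contains "gpu"; it now keeps every online runner whose name does not contain "cpu". A small Python sketch of the same predicate, using made-up runner names:

    # Same selection logic as the new jq filter, over hypothetical runner records.
    runners = [
        {"name": "azure-gpu-01", "status": "online"},
        {"name": "azure-cpu-01", "status": "online"},
        {"name": "azure-h100-02", "status": "online"},  # no "gpu" in the name: the old filter missed it
    ]
    selected = [r for r in runners if r["status"] == "online" and "cpu" not in r["name"]]
    print([r["name"] for r in selected])  # ['azure-gpu-01', 'azure-h100-02']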
3 changes: 2 additions & 1 deletion .github/workflows/release.yml
@@ -23,7 +23,7 @@ on:

jobs:
release:
- uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.10.0
+ uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3
with:
release-ref: ${{ inputs.release-ref }}
image-name: nemo_container
@@ -39,3 +39,4 @@ jobs:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+ PAT: ${{ secrets.PAT }}
8 changes: 6 additions & 2 deletions .secrets.baseline
@@ -90,6 +90,10 @@
{
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
},
+ {
+ "path": "detect_secrets.filters.common.is_baseline_file",
+ "filename": ".secrets.baseline"
+ },
{
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
"min_level": 2
@@ -273,7 +277,7 @@
"filename": "scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py",
"hashed_secret": "e0308bd21bffc156d79208f9ecf130370a015002",
"is_verified": false,
"line_number": 460
"line_number": 471
}
],
"scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py": [
@@ -2083,5 +2087,5 @@
}
]
},
"generated_at": "2024-10-25T13:43:17Z"
"generated_at": "2024-11-14T09:37:19Z"
}
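The block added to .secrets.baseline registers detect-secrets' is_baseline_file filter, which keeps the baseline file itself out of secret scanning. Its contract amounts to a filename match; a rough sketch of that behavior (an assumption based on the filter's name and the "filename" argument recorded above, not the library's actual code):

    # Rough sketch: findings located in the baseline file itself are filtered out.
    def is_baseline_file(filename: str, baseline_filename: str = ".secrets.baseline") -> bool:
        return filename == baseline_filename

    print(is_baseline_file(".secrets.baseline"))  # True  -> finding dropped
    print(is_baseline_file("src/app.py"))         # False -> finding kept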
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -54,7 +54,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.19.0
- ARG MCORE_TAG=aded519cfb1de2abf96f36ca059f992294b7876f
+ ARG MCORE_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
58 changes: 0 additions & 58 deletions docs/source/nlp/distillation.rst

This file was deleted.

67 changes: 0 additions & 67 deletions docs/source/nlp/nemo_megatron/model_distillation/drop_layers.rst

This file was deleted.

2 changes: 1 addition & 1 deletion docs/source/nlp/punctuation_and_capitalization.rst
@@ -240,7 +240,7 @@ An example of a config file is
- trainer config
-
- Parameters of
- `pytorch_lightning.Trainer <https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api>`_.
+ `lightning.pytorch.Trainer <https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api>`_.
* - **exp_manager**
- exp manager config
-
2 changes: 1 addition & 1 deletion docs/source/starthere/fundamentals.rst
@@ -116,7 +116,7 @@ Below is an example training script for our ``ExampleEncDecModel`` model. We hig
:linenos:
:emphasize-lines: 10, 11, 12
- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from nemo.collections.path_to_model_class import ExampleEncDecModel
from nemo.core.config import hydra_runner
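The documentation edits above and every remaining file in this commit apply the same one-line rename: the standalone pytorch_lightning import becomes lightning.pytorch, the namespace shipped by the unified lightning package. Because the module is aliased to pl, call sites stay untouched; a minimal sketch (assuming the lightning package is installed):

    # Old: import pytorch_lightning as pl
    import lightning.pytorch as pl

    # The pl alias keeps downstream code identical, e.g. building a Trainer:
    trainer = pl.Trainer(devices=1, max_steps=10)
    print(type(trainer).__module__)  # lightning.pytorch.trainer.trainer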
2 changes: 1 addition & 1 deletion examples/asr/asr_adapters/eval_asr_adapter.py
@@ -36,7 +36,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf, open_dict

from nemo.collections.asr.models import ASRModel
2 changes: 1 addition & 1 deletion examples/asr/asr_adapters/train_asr_adapter.py
@@ -84,7 +84,7 @@
import os
from dataclasses import is_dataclass

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import DictConfig, OmegaConf, open_dict

from nemo.collections.asr.models import ASRModel
@@ -49,7 +49,7 @@
from dataclasses import dataclass
from typing import Optional

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
import torch
from omegaconf import OmegaConf

@@ -42,7 +42,7 @@
from dataclasses import dataclass
from typing import Optional

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
import torch
from omegaconf import OmegaConf

@@ -64,7 +64,7 @@
from dataclasses import dataclass
from typing import Optional

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
import torch
from omegaconf import OmegaConf, open_dict

2 changes: 1 addition & 1 deletion examples/asr/asr_ctc/speech_to_text_ctc.py
@@ -68,7 +68,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecCTCModel
2 changes: 1 addition & 1 deletion examples/asr/asr_ctc/speech_to_text_ctc_bpe.py
@@ -64,7 +64,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
@@ -58,7 +58,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
@@ -69,7 +69,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecHybridRNNTCTCModel
2 changes: 1 addition & 1 deletion examples/asr/asr_transducer/speech_to_text_rnnt.py
@@ -67,7 +67,7 @@
"""

- import pytorch_lightning as pl
+ import lightning.pytorch as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecRNNTModel