Merge branch 'main' into auto_cudagraph
akoumpa authored Sep 6, 2024
2 parents 134c210 + ad5ef75 commit 0c33371
Showing 16 changed files with 614 additions and 71 deletions.
135 changes: 124 additions & 11 deletions .github/workflows/cicd-main.yml
@@ -133,16 +133,119 @@ jobs:
NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

# # TODO refactor: Commenting this test out until it is fixed & works properly again (test passes again)
# OPTIONAL_L0_Unit_Tests_CPU:
# needs: [cicd-test-container-setup]
# uses: ./.github/workflows/_test_template.yml
# with:
# RUNNER: self-hosted-azure-cpu
# TIMEOUT: 60
# SCRIPT: |
# CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
# IS_OPTIONAL: true
# L0: CPU unit tests
L0_Unit_Tests_CPU_ASR:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Audio:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Common:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_LLM:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Multimodal:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_NLP:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_TTS:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Core:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Hydra:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Lightning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Others:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat \
--ignore=tests/collections/asr \
--ignore=tests/collections/audio \
--ignore=tests/collections/common \
--ignore=tests/collections/llm \
--ignore=tests/collections/multimodal \
--ignore=tests/collections/nlp \
--ignore=tests/collections/tts \
--ignore=tests/core \
--ignore=tests/core_ptl \
--ignore=tests/hydra \
--ignore=tests/lightning \
--ignore=tests/utils
IS_OPTIONAL: true


L0_Setup_Test_Data_And_Models:
needs: [cicd-test-container-setup]
@@ -4868,7 +4971,17 @@ jobs:
- gpu-test
- cicd-test-container-setup
- L0_Unit_Tests_GPU
#- OPTIONAL_L0_Unit_Tests_CPU
- L0_Unit_Tests_CPU_ASR
- L0_Unit_Tests_CPU_Audio
- L0_Unit_Tests_CPU_Common
- L0_Unit_Tests_CPU_LLM
- L0_Unit_Tests_CPU_Multimodal
- L0_Unit_Tests_CPU_NLP
- L0_Unit_Tests_CPU_TTS
- L0_Unit_Tests_CPU_Core
- L0_Unit_Tests_CPU_Hydra
- L0_Unit_Tests_CPU_Lightning
- L0_Unit_Tests_CPU_Others
- L2_Community_LLM_Checkpoints_tests_Bert
- L2_Community_LLM_Checkpoints_tests_Mamba2
- L2_Community_LLM_Checkpoints_tests_Llama
1 change: 0 additions & 1 deletion examples/llm/megatron_gpt_pretraining.py
@@ -71,7 +71,6 @@ def get_args():
strategy = nl.MegatronStrategy()
checkpoint_callback = ModelCheckpoint(
every_n_train_steps=5000,
enable_nemo_ckpt_io=False,
)
callbacks = [checkpoint_callback]

11 changes: 11 additions & 0 deletions nemo/collections/llm/gpt/model/base.py
@@ -32,6 +32,15 @@

_, HAVE_TE = safe_import("transformer_engine")

# Gradient accumulation fusion may be enabled if available; for more information, see:
# https://github.com/NVIDIA/Megatron-LM/blob/01945b98d1ea3a2acb5e8301e181a328104f4856/megatron/core/tensor_parallel/layers.py#L575
# TODO: Clean this up with a getter and install instructions
_grad_accum_fusion_available = True
try:
import fused_weight_gradient_mlp_cuda
except ImportError:
_grad_accum_fusion_available = False

if TYPE_CHECKING:
from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel

@@ -124,6 +133,8 @@ class GPTConfig(TransformerConfig, io.IOMixin):
seq_length: int = 1024
attention_softmax_in_fp32: bool = False
masked_softmax_fusion: bool = True
cross_entropy_loss_fusion: bool = True
gradient_accumulation_fusion: bool = _grad_accum_fusion_available
deallocate_pipeline_outputs = True

transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = default_layer_spec
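The two new GPTConfig fields default on (gradient accumulation fusion only when fused_weight_gradient_mlp_cuda imports), but both remain ordinary dataclass fields that can be overridden per config. A minimal, hedged sketch of such an override, not part of this commit; the model dimensions are illustrative placeholders:

from nemo.collections.llm.gpt.model.base import GPTConfig

config = GPTConfig(
    num_layers=12,               # placeholder model size
    hidden_size=768,
    num_attention_heads=12,
    seq_length=1024,
    cross_entropy_loss_fusion=True,      # fused cross-entropy loss (new default)
    gradient_accumulation_fusion=False,  # opt out even if the fused kernel is installed
)
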
25 changes: 25 additions & 0 deletions nemo/collections/llm/recipes/llama3_70b.py
@@ -14,7 +14,9 @@
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192
from nemo.collections.llm.utils import Config, Partial
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "llama3_70b"
@@ -93,6 +95,29 @@ def pretrain_recipe(
)


def pretrain_recipe_performance(
name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain
) -> Partial:
"""'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default
due to being model specific or lacking sufficent support. For better compatibility please use
the default 'pretrain_recipe()' above."""
recipe = pretrain_recipe(
name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn
)

recipe.trainer.callbacks.append(
Config(
MegatronCommOverlapCallback,
tp_comm_overlap=True,
tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192,
defer_embedding_wgrad_compute=True,
wgrad_deferral_limit=22,
)
)

return recipe


def hf_resume() -> Config[nl.AutoResume]:
return Config(
nl.AutoResume,
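For context, a hedged usage sketch of the new 70B performance recipe, not part of this commit; the recipe name, checkpoint directory, and node counts are illustrative:

from nemo.collections.llm.recipes import llama3_70b

recipe = llama3_70b.pretrain_recipe_performance(
    name="llama3_70b_perf",
    ckpt_dir="/results/llama3_70b",
    num_nodes=4,
    num_gpus_per_node=8,
)
# The returned recipe is the default pretraining recipe plus the
# MegatronCommOverlapCallback configured with the bf16 H100 TP4 userbuffers
# overlap settings and deferred embedding wgrad compute.
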
20 changes: 20 additions & 0 deletions nemo/collections/llm/recipes/llama3_8b.py
@@ -14,6 +14,7 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin
from nemo.collections.llm.utils import Config, Partial
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "llama3_8b"
@@ -92,6 +93,25 @@ def pretrain_recipe(
)


def pretrain_recipe_performance(
name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain
) -> Partial:
"""'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default
due to being model specific or lacking sufficent support. For better compatibility please use
the default 'pretrain_recipe()' above."""
recipe = pretrain_recipe(
name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn
)

recipe.trainer.callbacks.append(
Config(
MegatronCommOverlapCallback,
tp_comm_overlap=False,
)
)
return recipe


def hf_resume() -> Config[nl.AutoResume]:
return Config(
nl.AutoResume,
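A hedged sketch for the 8B variant, not part of this commit, showing how the appended callback can be adjusted on the returned config; argument values are illustrative:

from nemo.collections.llm.recipes import llama3_8b

recipe = llama3_8b.pretrain_recipe_performance(
    name="llama3_8b_perf",
    ckpt_dir="/results/llama3_8b",
    num_nodes=1,
    num_gpus_per_node=8,
)

# The callback appended above disables TP communication overlap by default;
# it can be flipped back on the returned config if the environment supports it.
recipe.trainer.callbacks[-1].tp_comm_overlap = True
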
Empty file.
73 changes: 73 additions & 0 deletions nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py
@@ -0,0 +1,73 @@
from dataclasses import dataclass


@dataclass
class TPOverlapCfg:
pass


@dataclass
class PipelineOverlapCfg(TPOverlapCfg):
num_sm: int
cga_size: int
num_splits: int
set_sm_margin: bool
fp8_buf: bool = False
method: str = 'pipeline'


@dataclass
class RingExchangeOverlapCfg(TPOverlapCfg):
aggregate: bool = False
method: str = 'ring_exchange'


@dataclass
class BulkOverlapCfg(TPOverlapCfg):
num_sm: int
cga_size: int
set_sm_margin: bool
method: str = 'bulk'


@dataclass
class TransformerLayerTPOverlapCfg:
qkv_dgrad: TPOverlapCfg
qkv_wgrad: TPOverlapCfg
fc1_dgrad: TPOverlapCfg
fc1_wgrad: TPOverlapCfg
qkv_fprop: TPOverlapCfg
proj_dgrad: TPOverlapCfg
fc1_fprop: TPOverlapCfg
fc2_dgrad: TPOverlapCfg
proj_fprop: TPOverlapCfg
fc2_fprop: TPOverlapCfg


# TODO: Add more configs and create a getter function to expose a single API
# Model configs: H100/70B/TP4/MBS1/SeqLen8K
userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True),
)

userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
)
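
The TODO above mentions a getter to expose these configs behind a single API. A hedged sketch of what such a helper could look like; the registry and function below are hypothetical and not part of this commit:

_TP_OVERLAP_CFGS = {
    "bf16_h100_h8192_tp4_mbs1_seqlen8192": userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192,
    "fp8_h100_h8192_tp4_mbs1_seqlen8192": userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192,
}


def get_tp_overlap_cfg(name: str) -> TransformerLayerTPOverlapCfg:
    """Return a registered userbuffers TP-overlap config by name (hypothetical helper)."""
    try:
        return _TP_OVERLAP_CFGS[name]
    except KeyError as err:
        raise ValueError(f"Unknown TP overlap config '{name}'. Available: {sorted(_TP_OVERLAP_CFGS)}") from err
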
4 changes: 2 additions & 2 deletions nemo/lightning/io/mixin.py
@@ -141,7 +141,7 @@ def io_dump(self, output: Path):
will be stored.
"""
output_path = Path(output)
local_artifacts_dir = "artifacts"
local_artifacts_dir = "."
artifacts_dir = output_path / local_artifacts_dir
artifacts_dir.mkdir(parents=True, exist_ok=True)

@@ -518,7 +518,7 @@ def _io_path_elements_fn(x):
return x.__io__.__path_elements__()


def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "artifacts"):
def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."):
for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []):
current_val = getattr(cfg, artifact.attr)
if current_val is None:
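The effect of switching local_artifacts_dir from "artifacts" to "." is that dumped artifacts now land directly in the output directory instead of an "artifacts/" subfolder. A small illustrative sketch of the path arithmetic; the paths are placeholders:

from pathlib import Path

output_path = Path("/tmp/model_dump")

old_dir = output_path / "artifacts"   # previously: /tmp/model_dump/artifacts
new_dir = output_path / "."           # now: pathlib collapses "." -> /tmp/model_dump

print(old_dir, new_dir)
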
(Diffs for the remaining changed files are not shown here.)
