Merge branch 'main' into auto_cudagraph
akoumpa authored Sep 6, 2024
2 parents 134c210 + ad5ef75 commit 0c33371
Showing 16 changed files with 614 additions and 71 deletions.
135 changes: 124 additions & 11 deletions .github/workflows/cicd-main.yml
@@ -133,16 +133,119 @@ jobs:
NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads
IS_OPTIONAL: true

# # TODO refactor: Commenting this test out until it is fixed & works properly again (test passes again)
# OPTIONAL_L0_Unit_Tests_CPU:
# needs: [cicd-test-container-setup]
# uses: ./.github/workflows/_test_template.yml
# with:
# RUNNER: self-hosted-azure-cpu
# TIMEOUT: 60
# SCRIPT: |
# CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
# IS_OPTIONAL: true
# L0: CPU unit tests
L0_Unit_Tests_CPU_ASR:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
TIMEOUT: 20
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Audio:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Common:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_LLM:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Multimodal:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_NLP:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_TTS:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Core:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Hydra:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Lightning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
IS_OPTIONAL: true

L0_Unit_Tests_CPU_Others:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-cpu
SCRIPT: |
CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat \
--ignore=tests/collections/asr \
--ignore=tests/collections/audio \
--ignore=tests/collections/common \
--ignore=tests/collections/llm \
--ignore=tests/collections/multimodal \
--ignore=tests/collections/nlp \
--ignore=tests/collections/tts \
--ignore=tests/core \
--ignore=tests/core_ptl \
--ignore=tests/hydra \
--ignore=tests/lightning \
--ignore=tests/utils
IS_OPTIONAL: true


L0_Setup_Test_Data_And_Models:
needs: [cicd-test-container-setup]
@@ -4868,7 +4971,17 @@ jobs:
- gpu-test
- cicd-test-container-setup
- L0_Unit_Tests_GPU
#- OPTIONAL_L0_Unit_Tests_CPU
- L0_Unit_Tests_CPU_ASR
- L0_Unit_Tests_CPU_Audio
- L0_Unit_Tests_CPU_Common
- L0_Unit_Tests_CPU_LLM
- L0_Unit_Tests_CPU_Multimodal
- L0_Unit_Tests_CPU_NLP
- L0_Unit_Tests_CPU_TTS
- L0_Unit_Tests_CPU_Core
- L0_Unit_Tests_CPU_Hydra
- L0_Unit_Tests_CPU_Lightning
- L0_Unit_Tests_CPU_Others
- L2_Community_LLM_Checkpoints_tests_Bert
- L2_Community_LLM_Checkpoints_tests_Mamba2
- L2_Community_LLM_Checkpoints_tests_Llama
1 change: 0 additions & 1 deletion examples/llm/megatron_gpt_pretraining.py
@@ -71,7 +71,6 @@ def get_args():
strategy = nl.MegatronStrategy()
checkpoint_callback = ModelCheckpoint(
every_n_train_steps=5000,
enable_nemo_ckpt_io=False,
)
callbacks = [checkpoint_callback]

11 changes: 11 additions & 0 deletions nemo/collections/llm/gpt/model/base.py
@@ -32,6 +32,15 @@

_, HAVE_TE = safe_import("transformer_engine")

# Gradient accumulation fusion may be enabled if available; for more information, see:
# https://github.com/NVIDIA/Megatron-LM/blob/01945b98d1ea3a2acb5e8301e181a328104f4856/megatron/core/tensor_parallel/layers.py#L575
# TODO: Clean this up with a getter and install instructions
_grad_accum_fusion_available = True
try:
import fused_weight_gradient_mlp_cuda
except ImportError:
_grad_accum_fusion_available = False

if TYPE_CHECKING:
from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel

@@ -124,6 +133,8 @@ class GPTConfig(TransformerConfig, io.IOMixin):
seq_length: int = 1024
attention_softmax_in_fp32: bool = False
masked_softmax_fusion: bool = True
cross_entropy_loss_fusion: bool = True
gradient_accumulation_fusion: bool = _grad_accum_fusion_available
deallocate_pipeline_outputs = True

transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = default_layer_spec
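The two new GPTConfig fields default on (gradient accumulation fusion only when fused_weight_gradient_mlp_cuda imports), but both remain ordinary dataclass fields that can be overridden per config. A minimal, hedged sketch of such an override, not part of this commit; the model dimensions are illustrative placeholders:

from nemo.collections.llm.gpt.model.base import GPTConfig

config = GPTConfig(
    num_layers=12,               # placeholder model size
    hidden_size=768,
    num_attention_heads=12,
    seq_length=1024,
    cross_entropy_loss_fusion=True,      # fused cross-entropy loss (new default)
    gradient_accumulation_fusion=False,  # opt out even if the fused kernel is installed
)
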
25 changes: 25 additions & 0 deletions nemo/collections/llm/recipes/llama3_70b.py
@@ -14,7 +14,9 @@
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192
from nemo.collections.llm.utils import Config, Partial
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "llama3_70b"
@@ -93,6 +95,29 @@ def pretrain_recipe(
)


def pretrain_recipe_performance(
name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain
) -> Partial:
"""'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default
due to being model specific or lacking sufficent support. For better compatibility please use
the default 'pretrain_recipe()' above."""
recipe = pretrain_recipe(
name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn
)

recipe.trainer.callbacks.append(
Config(
MegatronCommOverlapCallback,
tp_comm_overlap=True,
tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192,
defer_embedding_wgrad_compute=True,
wgrad_deferral_limit=22,
)
)

return recipe


def hf_resume() -> Config[nl.AutoResume]:
return Config(
nl.AutoResume,
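For context, a hedged usage sketch of the new 70B performance recipe, not part of this commit; the recipe name, checkpoint directory, and node counts are illustrative:

from nemo.collections.llm.recipes import llama3_70b

recipe = llama3_70b.pretrain_recipe_performance(
    name="llama3_70b_perf",
    ckpt_dir="/results/llama3_70b",
    num_nodes=4,
    num_gpus_per_node=8,
)
# The returned recipe is the default pretraining recipe plus the
# MegatronCommOverlapCallback configured with the bf16 H100 TP4 userbuffers
# overlap settings and deferred embedding wgrad compute.
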
20 changes: 20 additions & 0 deletions nemo/collections/llm/recipes/llama3_8b.py
@@ -14,6 +14,7 @@
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed_plugin
from nemo.collections.llm.utils import Config, Partial
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "llama3_8b"
@@ -92,6 +93,25 @@ def pretrain_recipe(
)


def pretrain_recipe_performance(
name: str, ckpt_dir: str, num_nodes: int, num_gpus_per_node: int, fn: Callable = pretrain
) -> Partial:
"""'pretrain_recipe_performance' turns on performance optimizations that cannot be enabled by default
due to being model specific or lacking sufficent support. For better compatibility please use
the default 'pretrain_recipe()' above."""
recipe = pretrain_recipe(
name=name, ckpt_dir=ckpt_dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn
)

recipe.trainer.callbacks.append(
Config(
MegatronCommOverlapCallback,
tp_comm_overlap=False,
)
)
return recipe


def hf_resume() -> Config[nl.AutoResume]:
return Config(
nl.AutoResume,
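A hedged sketch for the 8B variant, not part of this commit, showing how the appended callback can be adjusted on the returned config; argument values are illustrative:

from nemo.collections.llm.recipes import llama3_8b

recipe = llama3_8b.pretrain_recipe_performance(
    name="llama3_8b_perf",
    ckpt_dir="/results/llama3_8b",
    num_nodes=1,
    num_gpus_per_node=8,
)

# The callback appended above disables TP communication overlap by default;
# it can be flipped back on the returned config if the environment supports it.
recipe.trainer.callbacks[-1].tp_comm_overlap = True
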
Empty file.
73 changes: 73 additions & 0 deletions nemo/collections/llm/recipes/tp_overlap_configs/userbuffers.py
@@ -0,0 +1,73 @@
from dataclasses import dataclass


@dataclass
class TPOverlapCfg:
pass


@dataclass
class PipelineOverlapCfg(TPOverlapCfg):
num_sm: int
cga_size: int
num_splits: int
set_sm_margin: bool
fp8_buf: bool = False
method: str = 'pipeline'


@dataclass
class RingExchangeOverlapCfg(TPOverlapCfg):
aggregate: bool = False
method: str = 'ring_exchange'


@dataclass
class BulkOverlapCfg(TPOverlapCfg):
num_sm: int
cga_size: int
set_sm_margin: bool
method: str = 'bulk'


@dataclass
class TransformerLayerTPOverlapCfg:
qkv_dgrad: TPOverlapCfg
qkv_wgrad: TPOverlapCfg
fc1_dgrad: TPOverlapCfg
fc1_wgrad: TPOverlapCfg
qkv_fprop: TPOverlapCfg
proj_dgrad: TPOverlapCfg
fc1_fprop: TPOverlapCfg
fc2_dgrad: TPOverlapCfg
proj_fprop: TPOverlapCfg
fc2_fprop: TPOverlapCfg


# TODO: Add more configs and create a getter function to expose a single API
# Model configs: H100/70B/TP4/MBS1/SeqLen8K
userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True),
)

userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
)
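
The TODO above mentions a getter to expose these configs behind a single API. A hedged sketch of what such a helper could look like; the registry and function below are hypothetical and not part of this commit:

_TP_OVERLAP_CFGS = {
    "bf16_h100_h8192_tp4_mbs1_seqlen8192": userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192,
    "fp8_h100_h8192_tp4_mbs1_seqlen8192": userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192,
}


def get_tp_overlap_cfg(name: str) -> TransformerLayerTPOverlapCfg:
    """Return a registered userbuffers TP-overlap config by name (hypothetical helper)."""
    try:
        return _TP_OVERLAP_CFGS[name]
    except KeyError as err:
        raise ValueError(f"Unknown TP overlap config '{name}'. Available: {sorted(_TP_OVERLAP_CFGS)}") from err
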
4 changes: 2 additions & 2 deletions nemo/lightning/io/mixin.py
@@ -141,7 +141,7 @@ def io_dump(self, output: Path):
will be stored.
"""
output_path = Path(output)
local_artifacts_dir = "artifacts"
local_artifacts_dir = "."
artifacts_dir = output_path / local_artifacts_dir
artifacts_dir.mkdir(parents=True, exist_ok=True)

@@ -518,7 +518,7 @@ def _io_path_elements_fn(x):
return x.__io__.__path_elements__()


def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "artifacts"):
def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."):
for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []):
current_val = getattr(cfg, artifact.attr)
if current_val is None:
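The effect of switching local_artifacts_dir from "artifacts" to "." is that dumped artifacts now land directly in the output directory instead of an "artifacts/" subfolder. A small illustrative sketch of the path arithmetic; the paths are placeholders:

from pathlib import Path

output_path = Path("/tmp/model_dump")

old_dir = output_path / "artifacts"   # previously: /tmp/model_dump/artifacts
new_dir = output_path / "."           # now: pathlib collapses "." -> /tmp/model_dump

print(old_dir, new_dir)
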
(Diffs for the remaining changed files are not shown here.)
