diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py
index b2d6a953a9ab..52de2aaead19 100644
--- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py
+++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py
@@ -63,11 +63,6 @@
     from megatron.core import parallel_state
     from megatron.core.distributed import DistributedDataParallel as McoreDDP
     from megatron.core.distributed import DistributedDataParallelConfig
-    from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
-    from megatron.core.models.gpt import GPTModel as MCoreGPTModel
-    from megatron.core.models.vision.clip_vit_model import CLIPViTModel
-    from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
-    from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules
     from megatron.core.extensions.transformer_engine import (
         TEColumnParallelLinear,
         TEDotProductAttention,
@@ -75,6 +70,11 @@
         TENorm,
         TERowParallelLinear,
     )
+    from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+    from megatron.core.models.gpt import GPTModel as MCoreGPTModel
+    from megatron.core.models.vision.clip_vit_model import CLIPViTModel
+    from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
+    from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules
     from megatron.core.transformer.enums import AttnMaskType as MCoreAttnMaskType
     from megatron.core.transformer.identity_op import IdentityOp
     from megatron.core.transformer.mlp import MLP, MLPSubmodules
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py
index 48fdffb057e5..05f371466614 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py
@@ -14,16 +14,16 @@

 try:
-    from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
-    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
-    from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
-    from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
     from megatron.core.extensions.transformer_engine import (
         TEColumnParallelLinear,
         TEDotProductAttention,
         TENorm,
         TERowParallelLinear,
     )
+    from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
+    from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
+    from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
     from megatron.core.transformer.dot_product_attention import DotProductAttention
     from megatron.core.transformer.enums import AttnMaskType
     from megatron.core.transformer.identity_op import IdentityOp
@@ -59,7 +59,11 @@
         self_attn_bda=get_bias_dropout_add,
         post_att_layernorm=TENorm,
         mlp=ModuleSpec(
-            module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,),
+            module=MLP,
+            submodules=MLPSubmodules(
+                linear_fc1=TEColumnParallelLinear,
+                linear_fc2=TERowParallelLinear,
+            ),
         ),
         mlp_bda=get_bias_dropout_add,
         post_mlp_layernorm=TENorm,
@@ -84,7 +88,11 @@
         self_attn_bda=get_bias_dropout_add,
         post_att_layernorm=FusedLayerNorm,
         mlp=ModuleSpec(
-            module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,),
+            module=MLP,
+            submodules=MLPSubmodules(
+                linear_fc1=ColumnParallelLinear,
+                linear_fc2=RowParallelLinear,
+            ),
         ),
         mlp_bda=get_bias_dropout_add,
         post_mlp_layernorm=FusedLayerNorm,
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py
index 4539c0e27d25..441ecae47aab 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py
@@ -15,14 +15,14 @@
 from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults

 try:
-    from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
-    from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
     from megatron.core.extensions.transformer_engine import (
         TEColumnParallelLinear,
         TEDotProductAttention,
         TENorm,
         TERowParallelLinear,
     )
+    from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+    from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
     from megatron.core.transformer.enums import AttnMaskType
     from megatron.core.transformer.identity_op import IdentityOp
     from megatron.core.transformer.mlp import MLP, MLPSubmodules
@@ -62,7 +62,11 @@ def get_falcon_layer_spec() -> ModuleSpec:
         self_attn_bda=get_bias_dropout_add,
         pre_mlp_layernorm=TENorm,
         mlp=ModuleSpec(
-            module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,),
+            module=MLP,
+            submodules=MLPSubmodules(
+                linear_fc1=TEColumnParallelLinear,
+                linear_fc2=TERowParallelLinear,
+            ),
         ),
         mlp_bda=get_bias_dropout_add,
     )
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
index 7459b9d1f95f..6aa05f15eece 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
@@ -36,11 +36,11 @@
 try:
     from megatron.core import parallel_state, tensor_parallel
     from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
+    from megatron.core.transformer.graphs import CudaGraphManager
     from megatron.core.transformer.spec_utils import ModuleSpec
     from megatron.core.transformer.transformer_block import TransformerBlockSubmodules, get_num_layers_to_build
     from megatron.core.transformer.transformer_layer import BaseTransformerLayer
     from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
-    from megatron.core.transformer.graphs import CudaGraphManager

     HAVE_MEGATRON_CORE = True

@@ -241,9 +241,7 @@ def __init__(self, config, layer_number=1, hidden_dropout=None):
         super().__init__(**transformer_layer_args)

         if config.enable_cuda_graph and self.training:
-            assert (
-                not config.cpu_offloading and config.recompute_granularity is None
-            ), "Cudagraphs not supported"
+            assert not config.cpu_offloading and config.recompute_granularity is None, "Cudagraphs not supported"
             self.add_module('cudagraph_manager', CudaGraphManager())

     # Called by MCore's TransformerBlock.forward
@@ -334,6 +332,7 @@ def __call__(self, *args, **kwargs):
             return self.cudagraph_manager(self, args, kwargs)
         return super().__call__(*args, **kwargs)

+
 # Use this spec to use the full Transformer layer from Transformer Engine
 def get_gpt_full_te_layer_autocast_spec(transformer_config) -> ModuleSpec:
     if not HAVE_MEGATRON_CORE or not HAVE_TE:
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
index 6cf068b85ebc..e05c61bf3d24 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
@@ -13,10 +13,10 @@
 # limitations under the License.

 try:
+    from megatron.core.extensions.transformer_engine import TENorm
     from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
     from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
     from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
-    from megatron.core.extensions.transformer_engine import TENorm
     from megatron.core.transformer.dot_product_attention import DotProductAttention
     from megatron.core.transformer.enums import AttnMaskType
     from megatron.core.transformer.identity_op import IdentityOp
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_block.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_block.py
index 63441ec51597..b98913dd98c6 100755
--- a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_block.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_block.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from torch import Tensor, nn
+
 from nemo.collections.nlp.models.language_modeling.megatron.griffin.griffin_layer_spec import (
     griffin_mqa_layer_with_transformer_engine_spec,
     griffin_recurrent_layer_with_transformer_engine_spec,
@@ -20,9 +21,9 @@

 try:
     from megatron.core import parallel_state, tensor_parallel
+    from megatron.core.extensions.transformer_engine import TENorm, te_checkpoint
     from megatron.core.models.common.language_module.language_module import LanguageModule
     from megatron.core.packed_seq_params import PackedSeqParams
-    from megatron.core.extensions.transformer_engine import TENorm, te_checkpoint
     from megatron.core.transformer.spec_utils import build_module
     from megatron.core.transformer.transformer_config import TransformerConfig

diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_layer_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_layer_spec.py
index f7b3bc4c198d..397be833a12c 100755
--- a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_layer_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_layer_spec.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
-from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
 from megatron.core.extensions.transformer_engine import (
     TEDotProductAttention,
     TELayerNormColumnParallelLinear,
     TERowParallelLinear,
 )
+from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
 from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.mlp import MLP, MLPSubmodules
@@ -53,7 +53,10 @@
         self_attn_bda=get_bias_dropout_add,
         mlp=ModuleSpec(
             module=MLP,
-            submodules=MLPSubmodules(linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,),
+            submodules=MLPSubmodules(
+                linear_fc1=TELayerNormColumnParallelLinear,
+                linear_fc2=TERowParallelLinear,
+            ),
         ),
         mlp_bda=get_bias_dropout_add,
     ),
@@ -74,7 +77,10 @@
         recurrent_bda=get_bias_dropout_add,
         mlp=ModuleSpec(
             module=MLP,
-            submodules=MLPSubmodules(linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,),
+            submodules=MLPSubmodules(
+                linear_fc1=TELayerNormColumnParallelLinear,
+                linear_fc2=TERowParallelLinear,
+            ),
         ),
         mlp_bda=get_bias_dropout_add,
     ),
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 25e740b4027d..571d93120308 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -843,7 +843,6 @@ def training_step(self, dataloader_iter):
                 if hasattr(module, 'embedding'):
                     for param in module.embedding.parameters():
                         param.data_ptr()
-

         if self.cfg.get('pipeline_model_parallel_size', 1) > 1 and parallel_state.is_pipeline_last_stage(
             ignore_virtual=True
@@ -2141,7 +2140,7 @@ def build_transformer_config(self) -> TransformerConfig:
             'moe_z_loss_coeff': self.cfg.get('moe_z_loss_coeff', None),  # 1e-3 would be a good start value for z-loss
             'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None),
             'moe_token_dropping': self.cfg.get('moe_token_dropping', False),  # TODO: Support token dropping.
-            'enable_cuda_graph': self.cfg.get('enable_cuda_graph', False)
+            'enable_cuda_graph': self.cfg.get('enable_cuda_graph', False),
         }
         if model_specific_configs['num_moe_experts'] is not None:
             assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE'
diff --git a/nemo/collections/nlp/modules/common/hyena/hyena.py b/nemo/collections/nlp/modules/common/hyena/hyena.py
index 61bbfe7ca933..f1b4fe20f537 100644
--- a/nemo/collections/nlp/modules/common/hyena/hyena.py
+++ b/nemo/collections/nlp/modules/common/hyena/hyena.py
@@ -23,10 +23,7 @@
 import torch
 import torch.nn as nn
 from einops import rearrange
-from megatron.core.extensions.transformer_engine import (
-    TELayerNormColumnParallelLinear,
-    TERowParallelLinear,
-)
+from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear, TERowParallelLinear
 from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.transformer.transformer_config import TransformerConfig
diff --git a/nemo/collections/nlp/modules/common/hyena/hyena_spec.py b/nemo/collections/nlp/modules/common/hyena/hyena_spec.py
index 2b06e277d004..58797cf27838 100644
--- a/nemo/collections/nlp/modules/common/hyena/hyena_spec.py
+++ b/nemo/collections/nlp/modules/common/hyena/hyena_spec.py
@@ -1,9 +1,6 @@
 import torch.nn as nn
+from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear, TERowParallelLinear
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
-from megatron.core.extensions.transformer_engine import (
-    TELayerNormColumnParallelLinear,
-    TERowParallelLinear,
-)
 from megatron.core.transformer.spec_utils import ModuleSpec

 from nemo.collections.nlp.modules.common.hyena.hyena import (
diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
index 0ac9298d1f6d..5128b4ca6b16 100644
--- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
+++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py
@@ -15,6 +15,7 @@
 import torch
 import torch.nn.functional as F
 from megatron.core import InferenceParams
+from megatron.core.extensions.transformer_engine import SplitAlongDim
 from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl
 from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl
 from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl
@@ -22,7 +23,6 @@
 from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.transformer.attention import SelfAttention
-from megatron.core.extensions.transformer_engine import SplitAlongDim
 from megatron.core.transformer.mlp import MLP
 from megatron.core.transformer.moe.experts import SequentialMLP
 from megatron.core.transformer.transformer_block import TransformerBlock