Apply isort and black reformatting
Signed-off-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>
JimmyZhang12 committed Aug 20, 2024
1 parent e9270a6 commit 51c7c87
Showing 11 changed files with 46 additions and 35 deletions.
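The diff below is mechanical: isort reorders the import blocks (so, for example, megatron.core.extensions.* now sorts ahead of megatron.core.fusions.*) and black re-wraps calls and statements. For readers who want to reproduce the same pass on a snippet, here is a minimal sketch using the two tools' Python APIs; the profile="black" and line_length=119 settings are assumptions for illustration and may not match NeMo's actual lint configuration.

# Minimal sketch: reproduce an "isort + black" pass programmatically.
# Assumptions: isort >= 5 and black are installed; profile/line_length below are
# illustrative, not necessarily the values configured in the NeMo repo.
import black
import isort

source = (
    "from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add\n"
    "from megatron.core.extensions.transformer_engine import (TEColumnParallelLinear,\n"
    "    TERowParallelLinear,)\n"
)

# isort groups and alphabetizes the imports ("extensions" sorts before "fusions"),
# then black normalizes wrapping, indentation, and trailing commas.
sorted_source = isort.code(source, profile="black", line_length=119)
formatted = black.format_str(sorted_source, mode=black.Mode(line_length=119))
print(formatted)
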
@@ -63,18 +63,18 @@
from megatron.core import parallel_state
from megatron.core.distributed import DistributedDataParallel as McoreDDP
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt import GPTModel as MCoreGPTModel
from megatron.core.models.vision.clip_vit_model import CLIPViTModel
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TENorm,
TERowParallelLinear,
)
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt import GPTModel as MCoreGPTModel
from megatron.core.models.vision.clip_vit_model import CLIPViTModel
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType as MCoreAttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
@@ -14,16 +14,16 @@


try:
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TENorm,
TERowParallelLinear,
)
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
@@ -59,7 +59,11 @@
self_attn_bda=get_bias_dropout_add,
post_att_layernorm=TENorm,
mlp=ModuleSpec(
module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,),
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear,
linear_fc2=TERowParallelLinear,
),
),
mlp_bda=get_bias_dropout_add,
post_mlp_layernorm=TENorm,
@@ -84,7 +88,11 @@
self_attn_bda=get_bias_dropout_add,
post_att_layernorm=FusedLayerNorm,
mlp=ModuleSpec(
module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,),
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear,
linear_fc2=RowParallelLinear,
),
),
mlp_bda=get_bias_dropout_add,
post_mlp_layernorm=FusedLayerNorm,
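
The hunks above that turn a one-line ModuleSpec(module=MLP, submodules=MLPSubmodules(...),) into a multi-line call are black's magic-trailing-comma rule at work: the original call already ends with a trailing comma, so black keeps one argument per line even though the call would fit within the line limit. A small self-contained demo, assuming a black release new enough to have this behavior (roughly 20.8b0 onward) with the default magic-trailing-comma setting:

# Demo of black's "magic trailing comma": the trailing comma inside MLPSubmodules(...)
# makes black expand the call to one keyword argument per line, as in the hunks above.
# black only formats the text; the megatron names inside the string are never imported.
import black

before = (
    "mlp = MLPSubmodules(linear_fc1=TEColumnParallelLinear, "
    "linear_fc2=TERowParallelLinear,)\n"
)
print(black.format_str(before, mode=black.Mode(line_length=119)))
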
@@ -15,14 +15,14 @@
from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults

try:
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TENorm,
TERowParallelLinear,
)
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
@@ -62,7 +62,11 @@ def get_falcon_layer_spec() -> ModuleSpec:
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=TENorm,
mlp=ModuleSpec(
module=MLP, submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,),
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear,
linear_fc2=TERowParallelLinear,
),
),
mlp_bda=get_bias_dropout_add,
)
@@ -36,11 +36,11 @@
try:
from megatron.core import parallel_state, tensor_parallel
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.graphs import CudaGraphManager
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules, get_num_layers_to_build
from megatron.core.transformer.transformer_layer import BaseTransformerLayer
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
from megatron.core.transformer.graphs import CudaGraphManager

HAVE_MEGATRON_CORE = True

@@ -241,9 +241,7 @@ def __init__(self, config, layer_number=1, hidden_dropout=None):
super().__init__(**transformer_layer_args)

if config.enable_cuda_graph and self.training:
assert (
not config.cpu_offloading and config.recompute_granularity is None
), "Cudagraphs not supported"
assert not config.cpu_offloading and config.recompute_granularity is None, "Cudagraphs not supported"
self.add_module('cudagraph_manager', CudaGraphManager())

# Called by MCore's TransformerBlock.forward
@@ -334,6 +332,7 @@ def __call__(self, *args, **kwargs):
return self.cudagraph_manager(self, args, kwargs)
return super().__call__(*args, **kwargs)


# Use this spec to use the full Transformer layer from Transformer Engine
def get_gpt_full_te_layer_autocast_spec(transformer_config) -> ModuleSpec:
if not HAVE_MEGATRON_CORE or not HAVE_TE:
@@ -13,10 +13,10 @@
# limitations under the License.

try:
from megatron.core.extensions.transformer_engine import TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.extensions.transformer_engine import TENorm
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from torch import Tensor, nn

from nemo.collections.nlp.models.language_modeling.megatron.griffin.griffin_layer_spec import (
griffin_mqa_layer_with_transformer_engine_spec,
griffin_recurrent_layer_with_transformer_engine_spec,
@@ -20,9 +21,9 @@

try:
from megatron.core import parallel_state, tensor_parallel
from megatron.core.extensions.transformer_engine import TENorm, te_checkpoint
from megatron.core.models.common.language_module.language_module import LanguageModule
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.extensions.transformer_engine import TENorm, te_checkpoint
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.transformer_config import TransformerConfig

@@ -12,13 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.extensions.transformer_engine import (
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TERowParallelLinear,
)
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
@@ -53,7 +53,10 @@
self_attn_bda=get_bias_dropout_add,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,),
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear,
linear_fc2=TERowParallelLinear,
),
),
mlp_bda=get_bias_dropout_add,
),
@@ -74,7 +77,10 @@
recurrent_bda=get_bias_dropout_add,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,),
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear,
linear_fc2=TERowParallelLinear,
),
),
mlp_bda=get_bias_dropout_add,
),
@@ -843,7 +843,6 @@ def training_step(self, dataloader_iter):
if hasattr(module, 'embedding'):
for param in module.embedding.parameters():
param.data_ptr()


if self.cfg.get('pipeline_model_parallel_size', 1) > 1 and parallel_state.is_pipeline_last_stage(
ignore_virtual=True
@@ -2141,7 +2140,7 @@ def build_transformer_config(self) -> TransformerConfig:
'moe_z_loss_coeff': self.cfg.get('moe_z_loss_coeff', None), # 1e-3 would be a good start value for z-loss
'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None),
'moe_token_dropping': self.cfg.get('moe_token_dropping', False), # TODO: Support token dropping.
'enable_cuda_graph': self.cfg.get('enable_cuda_graph', False)
'enable_cuda_graph': self.cfg.get('enable_cuda_graph', False),
}
if model_specific_configs['num_moe_experts'] is not None:
assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE'
5 changes: 1 addition & 4 deletions nemo/collections/nlp/modules/common/hyena/hyena.py
@@ -23,10 +23,7 @@
import torch
import torch.nn as nn
from einops import rearrange
from megatron.core.extensions.transformer_engine import (
TELayerNormColumnParallelLinear,
TERowParallelLinear,
)
from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear, TERowParallelLinear
from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
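
Imports move in the opposite direction in this hunk (and in the hyena_spec.py hunk below): a parenthesized three-line import collapses back onto one line. That is isort's doing rather than black's, since isort owns import wrapping and joins a wrapped import when it fits under the configured line length. A hedged demo, again assuming profile="black" and line_length=119 rather than NeMo's exact settings:

# Hedged demo: isort joins a parenthesized import onto one line when it fits under
# the configured line length, which is why the multi-line transformer_engine import
# above collapses to a single line. The settings here are assumptions for illustration.
import isort

multi_line = (
    "from megatron.core.extensions.transformer_engine import (\n"
    "    TELayerNormColumnParallelLinear,\n"
    "    TERowParallelLinear,\n"
    ")\n"
)
print(isort.code(multi_line, profile="black", line_length=119))
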
5 changes: 1 addition & 4 deletions nemo/collections/nlp/modules/common/hyena/hyena_spec.py
@@ -1,9 +1,6 @@
import torch.nn as nn
from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear, TERowParallelLinear
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.extensions.transformer_engine import (
TELayerNormColumnParallelLinear,
TERowParallelLinear,
)
from megatron.core.transformer.spec_utils import ModuleSpec

from nemo.collections.nlp.modules.common.hyena.hyena import (
@@ -15,14 +15,14 @@
import torch
import torch.nn.functional as F
from megatron.core import InferenceParams
from megatron.core.extensions.transformer_engine import SplitAlongDim
from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl
from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl
from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.attention import SelfAttention
from megatron.core.extensions.transformer_engine import SplitAlongDim
from megatron.core.transformer.mlp import MLP
from megatron.core.transformer.moe.experts import SequentialMLP
from megatron.core.transformer.transformer_block import TransformerBlock