diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 43cc8c26444f..0db32c7dfa6e 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -343,7 +343,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
                         module.cuda(torch.cuda.current_device())
                 else:
                     self.model.cuda(torch.cuda.current_device())
 
-            self._wrap_model_for_O2()
 
         self.enable_autocast = (
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index 5d52c8e7cf04..8206ae8e84b3 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -309,6 +309,7 @@ def build(
                 nemo_model_config=nemo_model_config,
                 reshard_model=self.reshard_model,
                 mapping=mapping,
+                trt_model_type=trt_model_type,
             )
 
             print_mem("pre build_and_save_engine")
diff --git a/nemo/export/trt_llm/nemo_utils.py b/nemo/export/trt_llm/nemo_utils.py
index f40882a3f343..23439ef4f0bd 100644
--- a/nemo/export/trt_llm/nemo_utils.py
+++ b/nemo/export/trt_llm/nemo_utils.py
@@ -257,7 +257,8 @@ def nemo_llm_model_to_model_config(
     tokenizer,
     nemo_model_config,
     reshard_model,
-    mapping
+    mapping,
+    trt_model_type,
 ) -> Tuple[PretrainedConfig, dict]:
     """Converts the NEMO model object and construct the `ModelConfig` before tensorrt_llm deployment."""
     from megatron.core import parallel_state
@@ -270,6 +270,21 @@ def nemo_llm_model_to_model_config(
         tokenizer_vocab_size=tokenizer.vocab_size,
         reshard_model=reshard_model)
 
+    renamed_weight_dict = {}
+    if trt_model_type == 'GPTForCausalLM':
+        for key, val in weights_dict.items():
+            if 'layernorm' in key:
+                new_key = key.replace("pre_mlp_layernorm", "post_layernorm")
+            else:
+                new_key = key
+            renamed_weight_dict[new_key] = val
+
+    activation = None
+    if nemo_model_config['activation'] == 'fast-swiglu':
+        activation = 'silu'
+    else:
+        activation = nemo_model_config['activation']
+
     if isinstance(nemo_model, list):
         torch_dtype = next(iter(nemo_model[0].state_dict().values())).dtype
     else:
@@ -277,7 +293,7 @@
     str_dtype = trt_dtype_to_str(np_dtype_to_trt(torch_dtype_to_np(torch_dtype)))
 
     model_config = PretrainedConfig(
-        architecture='LlamaForCausalLM',
+        architecture=trt_model_type,
         dtype=str_dtype,
         logits_dtype='float32',
         vocab_size=tokenizer.vocab_size,
@@ -286,7 +302,7 @@
         num_hidden_layers=nemo_model_config.get('num_layers'),
         num_attention_heads=nemo_model_config.get('num_attention_heads'),
         num_key_value_heads=nemo_model_config.get('num_query_groups'),
-        hidden_act='silu',
+        hidden_act=activation,
         intermediate_size=nemo_model_config.get('ffn_hidden_size'),
         norm_epsilon=nemo_model_config.get('layernorm_epsilon'),
         position_embedding_type="rope_gpt_neox",
@@ -301,15 +317,16 @@
                       'pre_quant_scale': False,
                       'exclude_modules': None},
         kv_dtype=str_dtype,
-        rotary_scaling=None,
         moe_normalization_mode=None,
-        rotary_base=10000.0,
+        rotary_pct=nemo_model_config.get('rotary_percentage', 1.0),
+        rotary_base=nemo_model_config.get('rotary_base', 10000),
         moe_num_experts=0,
         moe_top_k=0,
         moe_tp_mode=2,
-        attn_bias=False,
         disable_weight_only_quant_plugin=False,
-        mlp_bias=False
+        attn_bias=False,
+        mlp_bias=False,
+        bias=False
     )
     model_config.mapping = mapping
-    return model_config, weights_dict
+    return model_config, renamed_weight_dict
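
For reference, here is a minimal standalone sketch of the weight-key renaming and activation mapping that the nemo_utils.py hunk introduces. The helper names (rename_layernorm_keys, map_activation) and the sample keys are illustrative only, not part of the NeMo or TensorRT-LLM APIs, and passing the weights through unchanged for architectures other than GPTForCausalLM is an assumption made for this sketch.

# Illustrative sketch only; helper names are hypothetical and the non-GPT
# pass-through below is an assumption, not taken from the patch above.

def rename_layernorm_keys(weights_dict: dict, trt_model_type: str) -> dict:
    """Rename NeMo's 'pre_mlp_layernorm' keys to the 'post_layernorm' names
    used for the GPTForCausalLM architecture; all other keys are unchanged."""
    if trt_model_type != 'GPTForCausalLM':
        # Assumption: non-GPT architectures keep their original key names.
        return dict(weights_dict)
    renamed = {}
    for key, val in weights_dict.items():
        if 'layernorm' in key:
            renamed[key.replace("pre_mlp_layernorm", "post_layernorm")] = val
        else:
            renamed[key] = val
    return renamed


def map_activation(nemo_activation: str) -> str:
    """Map NeMo's 'fast-swiglu' activation name to TensorRT-LLM's 'silu';
    any other activation name is passed through unchanged."""
    return 'silu' if nemo_activation == 'fast-swiglu' else nemo_activation


if __name__ == "__main__":
    sample = {
        "transformer.layers.0.pre_mlp_layernorm.weight": [1.0],
        "transformer.layers.0.attention.qkv.weight": [2.0],
    }
    print(rename_layernorm_keys(sample, 'GPTForCausalLM'))
    # {'transformer.layers.0.post_layernorm.weight': [1.0],
    #  'transformer.layers.0.attention.qkv.weight': [2.0]}
    print(map_activation('fast-swiglu'))  # -> silu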