diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index 8b484941f6da..4e81b3b4b41d 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -288,7 +288,7 @@ def build(
         # So we manipulate TRTLLM to emulate a TP->PP single node setup
         tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
         device_ids = [
-            (i+torch.cuda.current_device()-mp_rank)
+            ((i+mp_rank-torch.cuda.current_device())+mp_size)%mp_size
             for i in range(mp_size)]
 
         mapping = tensorrt_llm.Mapping(
diff --git a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py
index 007712028b7f..0d8392ef23b5 100644
--- a/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py
+++ b/nemo/export/trt_llm/nemo/nemo_ckpt_convert.py
@@ -393,7 +393,6 @@ def convert_nemo_model(
     starmap_args = []
 
     import time
-    tic = time.time()
 
     layers_per_pp = num_layers // pp_size
     layers_per_chunk = layers_per_pp // vp_size
@@ -435,9 +434,6 @@ def convert_nemo_model(
                 gathered_params[key2] = weight_list[idx]
         tl_params = gathered_params
 
-    toc = time.time()
-    print(f" PP Reshard save took {toc-tic}")
-
     # ----------------Convert layer level weights----------------
     layer_params = extract_layers_with_prefix(tl_params, transformer_layer_prefix)
     layer_params = {
@@ -458,7 +454,7 @@ def broadcast_item(item, group, src_rank):
 
     #broadcast a tensor across PP group and save it
     def save_pp_weight(
-            src_key_or_tensor, dst_key, pp_src_idx, transpose_weights=False, weight_type=None):
+            src_key_or_tensor, dst_key, pp_src_idx, transpose_weights=True, weight_type=None):
 
         have_tensor = False
         if torch.distributed.get_rank() == pp_src_idx:
@@ -501,14 +497,12 @@ def save_pp_weight(
         get_layer_name("final_layernorm.weight", transformer_layer_prefix),
         "ln_f.weight",
         pp_last_rank,
-        transpose_weights=True,
         weight_type='layernorm_weight'
     )
     save_pp_weight(
         get_layer_name("final_layernorm.bias", transformer_layer_prefix),
         "ln_f.bias",
         pp_last_rank,
-        transpose_weights=True,
     )
 
     # ----------------Convert Embeddings----------------
@@ -532,7 +526,6 @@ def remove_vocab_padding(tensor):
         world_embed,
         "vocab_embedding.weight",
         pp_first_rank,
-        transpose_weights=True,
     )
 
     if pp_is_last or reshard_model:
@@ -547,7 +540,6 @@ def remove_vocab_padding(tensor):
         lm_head,
         "lm_head.weight",
         pp_last_rank,
-        transpose_weights=True,
     )
 
     tic = time.time()
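
A note on the device_ids change in build(): the old expression (i + torch.cuda.current_device() - mp_rank) can produce indices outside [0, mp_size) whenever the current CUDA device differs from mp_rank, while the new expression wraps with a modulo so every entry stays a valid device index. The snippet below is an illustrative sketch only; the values mp_size=4, mp_rank=2, and a current device of 0 are assumed for demonstration and are not taken from the patch.

    # Illustrative sketch: compare the old and new device_ids formulas.
    mp_size = 4
    mp_rank = 2
    current_device = 0  # stand-in for torch.cuda.current_device()

    old_device_ids = [i + current_device - mp_rank for i in range(mp_size)]
    new_device_ids = [((i + mp_rank - current_device) + mp_size) % mp_size for i in range(mp_size)]

    print(old_device_ids)  # [-2, -1, 0, 1] -> negative entries are not valid CUDA device ids
    print(new_device_ids)  # [2, 3, 0, 1]   -> always within [0, mp_size)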