fix crash
Signed-off-by: root <worker@nvidia.com>
root committed May 11, 2024
1 parent fc1bbf0 commit ba40124
Showing 2 changed files with 2 additions and 10 deletions.
nemo/export/tensorrt_llm.py (2 changes: 1 addition & 1 deletion)
@@ -288,7 +288,7 @@ def build(
         # So we manipulate TRTLLM to emulate a TP->PP single node setup
         tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
         device_ids = [
-            (i+torch.cuda.current_device()-mp_rank)
+            ((i+mp_rank-torch.cuda.current_device())+mp_size)%mp_size
             for i in range(mp_size)]

         mapping = tensorrt_llm.Mapping(
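This first hunk is presumably the crash fix itself: when a process's model-parallel rank and its local CUDA device index disagree, the old expression produces device ids outside [0, mp_size), including negative values, while the new expression wraps the index back into range with a modulo. A standalone sketch (assumed helper names and example values, not code from the repository) contrasting the two expressions:

# Sketch only: contrasts the old and new device_ids expressions from the hunk above.
def old_device_ids(mp_size, mp_rank, current_device):
    # Old expression: falls outside [0, mp_size) whenever current_device != mp_rank.
    return [i + current_device - mp_rank for i in range(mp_size)]

def new_device_ids(mp_size, mp_rank, current_device):
    # New expression: the added modulo wraps every index back into [0, mp_size).
    return [((i + mp_rank - current_device) + mp_size) % mp_size for i in range(mp_size)]

# Assumed example: 4-way model parallelism, rank 3 bound to CUDA device 0.
print(old_device_ids(4, 3, 0))  # [-3, -2, -1, 0] -> negative device ids, invalid
print(new_device_ids(4, 3, 0))  # [3, 0, 1, 2]    -> every id is a valid device index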
nemo/export/trt_llm/nemo/nemo_ckpt_convert.py (10 changes: 1 addition & 9 deletions)
@@ -393,7 +393,6 @@ def convert_nemo_model(
     starmap_args = []

     import time
-    tic = time.time()

     layers_per_pp = num_layers // pp_size
     layers_per_chunk = layers_per_pp // vp_size
@@ -435,9 +434,6 @@ def convert_nemo_model(
                 gathered_params[key2] = weight_list[idx]
             tl_params = gathered_params

-    toc = time.time()
-    print(f" PP Reshard save took {toc-tic}")
-
     # ----------------Convert layer level weights----------------
     layer_params = extract_layers_with_prefix(tl_params, transformer_layer_prefix)
     layer_params = {
@@ -458,7 +454,7 @@ def broadcast_item(item, group, src_rank):

     #broadcast a tensor across PP group and save it
     def save_pp_weight(
-        src_key_or_tensor, dst_key, pp_src_idx, transpose_weights=False, weight_type=None):
+        src_key_or_tensor, dst_key, pp_src_idx, transpose_weights=True, weight_type=None):

         have_tensor = False
         if torch.distributed.get_rank() == pp_src_idx:
@@ -501,14 +497,12 @@ def save_pp_weight(
             get_layer_name("final_layernorm.weight", transformer_layer_prefix),
             "ln_f.weight",
             pp_last_rank,
-            transpose_weights=True,
             weight_type='layernorm_weight'
         )
         save_pp_weight(
             get_layer_name("final_layernorm.bias", transformer_layer_prefix),
             "ln_f.bias",
             pp_last_rank,
-            transpose_weights=True,
         )

     # ----------------Convert Embeddings----------------
@@ -532,7 +526,6 @@ def remove_vocab_padding(tensor):
             world_embed,
             "vocab_embedding.weight",
             pp_first_rank,
-            transpose_weights=True,
         )

     if pp_is_last or reshard_model:
@@ -547,7 +540,6 @@ def remove_vocab_padding(tensor):
             lm_head,
             "lm_head.weight",
             pp_last_rank,
-            transpose_weights=True,
         )

     tic = time.time()
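Apart from dropping the tic/toc timing print, the changes in this file go together: the default for transpose_weights on save_pp_weight flips from False to True, so the explicit transpose_weights=True argument can be dropped from the final-layernorm, embedding, and lm_head call sites without changing what they do. A small sketch of that refactor pattern (the bodies below are stand-ins; the repository's save_pp_weight broadcasts the tensor across the PP group before saving, as its comment says):

import torch

# Stand-in bodies used only to illustrate the default-argument flip.
def save_pp_weight_old(tensor, transpose_weights=False):
    return tensor.T if transpose_weights else tensor

def save_pp_weight_new(tensor, transpose_weights=True):
    return tensor.T if transpose_weights else tensor

w = torch.arange(6).reshape(2, 3)
# Old call sites passed the kwarg explicitly; the new ones rely on the default.
assert torch.equal(save_pp_weight_old(w, transpose_weights=True),
                   save_pp_weight_new(w))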
