Skip to content

Commit

Permalink
Apply isort and black reformatting
Browse files Browse the repository at this point in the history
Signed-off-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>
  • Loading branch information
JimmyZhang12 committed May 17, 2024
1 parent df36d00 commit a4a6243
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 158 deletions.
45 changes: 23 additions & 22 deletions nemo/export/tensorrt_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@
except Exception:
use_deploy = False

def print_mem(prefix):
    """Print a one-line CUDA memory report tagged with ``prefix``.

    The printed line has the form
    ``Mem Usage | <prefix> | <allocated> <used> | <used - allocated>``
    where every figure is in GiB:

    * ``allocated`` is the value of ``torch.cuda.memory_allocated()``.
    * ``used`` is total minus free device memory, both taken from a single
      ``torch.cuda.mem_get_info()`` query.

    Args:
        prefix: Label identifying the call site in the printed line.
    """
    gib = 1024**3
    # Drop cached allocator blocks before measuring.
    torch.cuda.empty_cache()
    allocated = torch.cuda.memory_allocated() / gib
    # mem_get_info() returns (free, total); query it once and unpack instead
    # of issuing two separate calls to build a single number.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    used = (total_bytes - free_bytes) / gib
    print(f"Mem Usage | {prefix} | {allocated} {used} | {used - allocated}")

def print_mem(prefix):
    """Print a one-line CUDA memory report tagged with ``prefix``.

    The printed line has the form
    ``Mem Usage | <prefix> | <allocated> <used> | <used - allocated>``
    where every figure is in GiB:

    * ``allocated`` is the value of ``torch.cuda.memory_allocated()``.
    * ``used`` is total minus free device memory, both taken from a single
      ``torch.cuda.mem_get_info()`` query.

    Args:
        prefix: Label identifying the call site in the printed line.
    """
    gib = 1024**3
    # Drop cached allocator blocks before measuring.
    torch.cuda.empty_cache()
    allocated = torch.cuda.memory_allocated() / gib
    # mem_get_info() returns (free, total); query it once and unpack instead
    # of issuing two separate calls to build a single number.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    used = (total_bytes - free_bytes) / gib
    print(f"Mem Usage | {prefix} | {allocated} {used} | {used - allocated}")


@wrapt.decorator
Expand Down Expand Up @@ -277,10 +277,11 @@ def build(
max_batch_size: int = 4,
use_refit: bool = True,
reshard_model: bool = False,
):
):
origdev = torch.cuda.current_device()

from megatron.core import parallel_state

assert tensorrt_llm.mpi_rank() == torch.distributed.get_rank()

gpus_per_node = 8
Expand All @@ -298,29 +299,26 @@ def build(

if reshard_model and pp_size > 1:
self.reshard_model = True
dp_size = dp_size*pp_size
dp_size = dp_size * pp_size
dp_rank = torch.distributed.get_rank() // tp_size
pp_rank = 0
pp_size = 1
mp_group = parallel_state.get_tensor_model_parallel_group()
else:
self.reshard_model = False
mp_group = parallel_state.get_model_parallel_group()
mp_size = tp_size*pp_size
mp_rank = tp_size*pp_rank + tp_rank
mp_size = tp_size * pp_size
mp_rank = tp_size * pp_rank + tp_rank
if dp_size > 1:
self.model_dir = os.path.join(self.model_dir, f"dp_rank{dp_rank}")

# TRT-LLM asserts that the rank equals the device number; however, this
# does not hold for the Megatron-Core rank mapping (TP->DP->PP).
# So we manipulate TRT-LLM to emulate a single-node TP->PP setup.
tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
mapping = tensorrt_llm.Mapping(
world_size = mp_size,
rank = mp_rank,
gpus_per_node = mp_size,
tp_size = tp_size,
pp_size = pp_size)
world_size=mp_size, rank=mp_rank, gpus_per_node=mp_size, tp_size=tp_size, pp_size=pp_size
)

LOGGER.info(
f'''TRT-LLM rank mapping: Rank {torch.distributed.get_rank()}, mp_rank {mp_rank}:
Expand All @@ -330,20 +328,21 @@ def build(
mp_group_ranks = torch.distributed.distributed_c10d.get_process_group_ranks(mp_group)
print(f"{torch.distributed.get_rank()} color {dp_rank} mp_rank {mp_rank} mp_group_ranks {mp_group_ranks}")
print(f"trtllm mpi : {tensorrt_llm.bindings.MpiComm.getRank()} {tensorrt_llm.bindings.MpiComm.getSize()}")

model_config, weights = nemo_llm_model_to_model_config(
nemo_model=nemo_model,
tokenizer=self.tokenizer,
nemo_model_config=nemo_model_config,
reshard_model=self.reshard_model,
mapping=mapping,
trt_model_type=trt_model_type)
trt_model_type=trt_model_type,
)

print_mem("pre build_and_save_engine")
engine = build_and_save_engine(
max_input_len=max_input_len,
max_output_len=max_output_len,
max_input_tokens=max_input_tokens,
max_input_tokens=max_input_tokens,
max_batch_size=max_batch_size,
model_config=model_config,
model_weights=weights,
Expand All @@ -370,13 +369,15 @@ def refit(
):
from .trt_llm.nemo.nemo_ckpt_convert import convert_nemo_model
from .trt_llm.tensorrt_llm_run import create_gpt_session

assert self.use_refit, "TRT-LLM model must be built() with refit=True"

print_mem("pre refit")
import time

tic = time.time()
# Build or refit TRT-LLM engine from a nemo model.

# Build or refit TRT-LLM engine from a nemo model.
weights = convert_nemo_model(
nemo_model=nemo_model,
nemo_model_config=nemo_model_config,
Expand All @@ -396,7 +397,7 @@ def refit(

tic = time.time()
session = self.model_runner.session
session.refit_engine(weights, self.session_params.model_config.data_type)
session.refit_engine(weights, self.session_params.model_config.data_type)
toc = time.time()
print(f"refit_runtime_engine took {toc-tic}")

Expand Down
40 changes: 19 additions & 21 deletions nemo/export/trt_llm/nemo/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t
return weights_dict


#Similar to split_save_weight but done on GPU for performance
# Similar to split_and_save_weight, but done on the GPU for performance
@torch.no_grad()
def save_weight_torch(key, val, config):
num_layers = config["num_layers"]
Expand All @@ -425,23 +425,23 @@ def save_weight_torch(key, val, config):
num_kv_heads = config["num_kv_heads"]
move_to_cpu = config["move_to_cpu"]
save_dict = config["save_dict"]

def save(key, tensor, add_prefix=True):
assert torch.is_tensor(tensor)
if add_prefix:
key = f"transformer.{key}"

if len(tensor.shape) >= 2:
tensor = tensor.reshape(tensor.shape[0], -1)
tensor = torch.transpose(tensor, 0 , 1)
tensor = torch.transpose(tensor, 0, 1)
tensor = tensor.detach().contiguous()
tensor = tensor.to(storage_type)

if move_to_cpu:
if key not in save_dict:
cpu_copy = torch.empty(
tensor.size(), dtype=tensor.dtype,
layout=tensor.layout, device="cpu", pin_memory=True)
tensor.size(), dtype=tensor.dtype, layout=tensor.layout, device="cpu", pin_memory=True
)
cpu_copy.copy_(tensor, non_blocking=True)
save_dict[key] = cpu_copy
else:
Expand All @@ -450,12 +450,13 @@ def save(key, tensor, add_prefix=True):
save_dict[key] = tensor.cuda()

if config.get("transpose_weights", False) and val.ndim == 2:
val = val.T
val = val.T

if "self_attention" in key:
key = key.replace("self_attention", "attention")

if ('layer_norm_weight' in key
if (
'layer_norm_weight' in key
or 'layernorm.weight' in key
or "final_layernorm.weight" in key
or "ln_f.weight" in key
Expand All @@ -471,8 +472,8 @@ def save(key, tensor, add_prefix=True):
val = val.float() + 1.0
save(key, val)
elif (
"input_layernorm.bias" in key
or "pre_mlp_layernorm.bias" in key
"input_layernorm.bias" in key
or "pre_mlp_layernorm.bias" in key
or "ln_f.bias" in key
or "vocab_embedding" in key
):
Expand All @@ -485,7 +486,7 @@ def save(key, tensor, add_prefix=True):
or "mlp.dense_4h_to_h.weight" in key
or "attention.linear_proj.weight" in key
or "mlp.linear_fc2.weight" in key
or "mlp.dense_h_to_4h_2.weight" in key
or "mlp.dense_h_to_4h_2.weight" in key
or "mlp.dense_h_to_4h_2.bias" in key
):
if "attention.linear_proj.weight" in key:
Expand All @@ -501,12 +502,12 @@ def save(key, tensor, add_prefix=True):
or "mlp.linear_fc1.bias" in key
):
if split_gated_activation:
val, gate = torch.chunk(val, 2, axis=-1)
val, gate = torch.chunk(val, 2, axis=-1)

if "mlp.linear_fc1" in key:
key = key.replace("mlp.linear_fc1", "mlp.fc")
save(key, val)

if split_gated_activation:
key = key.replace("mlp.fc", "mlp.gate")
save(key, gate)
Expand All @@ -522,17 +523,14 @@ def save(key, tensor, add_prefix=True):
val = val.reshape(hidden_dim, num_kv_heads // tp_size, q_num + 2, size_per_head)

# Split the QKV to separate variables.
#[qqqqkkvv] - > [qqqq,kk,vv]
# [qqqqkkvv] -> [qqqq,kk,vv]
qkv = torch.split(val, [q_num, 1, 1], dim=2)
split_vals = torch.concatenate([
qkv[0].reshape(hidden_dim, -1),
qkv[1].reshape(hidden_dim, -1),
qkv[2].reshape(hidden_dim, -1)
], dim=1)
split_vals = torch.concatenate(
[qkv[0].reshape(hidden_dim, -1), qkv[1].reshape(hidden_dim, -1), qkv[2].reshape(hidden_dim, -1)], dim=1
)
save(key, split_vals)

elif "lm_head.weight" in key:
save(key, val, add_prefix=False)
else:
raise RuntimeError(f"{key} not handled by NeMo->TRTLLM converter!")

Loading

0 comments on commit a4a6243

Please sign in to comment.