Apply isort and black reformatting

Signed-off-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>
JimmyZhang12 · May 17, 2024 · a4a6243
1 parent df36d00
commit a4a6243
Show file tree

Hide file tree

Showing 6 changed files with 164 additions and 158 deletions.
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
@@ -43,12 +43,12 @@
 except Exception:
     use_deploy = False
 
-def print_mem(prefix):
-        torch.cuda.empty_cache()
-        pyt = torch.cuda.memory_allocated() / (1024**3)
-        el = (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / (1024**3)
-        print(f"Mem Usage | {prefix} | {pyt} {el} | {el-pyt}")
 
+def print_mem(prefix):
+    torch.cuda.empty_cache()
+    pyt = torch.cuda.memory_allocated() / (1024**3)
+    el = (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / (1024**3)
+    print(f"Mem Usage | {prefix} | {pyt} {el} | {el-pyt}")
 
 
 @wrapt.decorator
@@ -277,10 +277,11 @@ def build(
         max_batch_size: int = 4,
         use_refit: bool = True,
         reshard_model: bool = False,
-    ):  
+    ):
         origdev = torch.cuda.current_device()
 
         from megatron.core import parallel_state
+
         assert tensorrt_llm.mpi_rank() == torch.distributed.get_rank()
 
         gpus_per_node = 8
@@ -298,29 +299,26 @@ def build(
 
         if reshard_model and pp_size > 1:
             self.reshard_model = True
-            dp_size = dp_size*pp_size
+            dp_size = dp_size * pp_size
             dp_rank = torch.distributed.get_rank() // tp_size
             pp_rank = 0
             pp_size = 1
             mp_group = parallel_state.get_tensor_model_parallel_group()
         else:
             self.reshard_model = False
             mp_group = parallel_state.get_model_parallel_group()
-        mp_size = tp_size*pp_size
-        mp_rank = tp_size*pp_rank + tp_rank
+        mp_size = tp_size * pp_size
+        mp_rank = tp_size * pp_rank + tp_rank
         if dp_size > 1:
             self.model_dir = os.path.join(self.model_dir, f"dp_rank{dp_rank}")
 
         # TRTLLM asserts that rank equals the device num however this
         # is not true for the megatron core mapping TP->DP->PP.
         # So we manipulate TRTLLM to emulate a TP->PP single node setup
-        tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)        
+        tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
         mapping = tensorrt_llm.Mapping(
-            world_size = mp_size,
-            rank = mp_rank,
-            gpus_per_node = mp_size, 
-            tp_size = tp_size,
-            pp_size = pp_size)
+            world_size=mp_size, rank=mp_rank, gpus_per_node=mp_size, tp_size=tp_size, pp_size=pp_size
+        )
 
         LOGGER.info(
             f'''TRT-LLM rank mapping: Rank {torch.distributed.get_rank()}, mp_rank {mp_rank}:
@@ -330,20 +328,21 @@ def build(
         mp_group_ranks = torch.distributed.distributed_c10d.get_process_group_ranks(mp_group)
         print(f"{torch.distributed.get_rank()} color {dp_rank} mp_rank {mp_rank} mp_group_ranks {mp_group_ranks}")
         print(f"trtllm mpi : {tensorrt_llm.bindings.MpiComm.getRank()} {tensorrt_llm.bindings.MpiComm.getSize()}")
-        
+
         model_config, weights = nemo_llm_model_to_model_config(
             nemo_model=nemo_model,
             tokenizer=self.tokenizer,
             nemo_model_config=nemo_model_config,
             reshard_model=self.reshard_model,
             mapping=mapping,
-            trt_model_type=trt_model_type)
+            trt_model_type=trt_model_type,
+        )
 
         print_mem("pre build_and_save_engine")
         engine = build_and_save_engine(
             max_input_len=max_input_len,
             max_output_len=max_output_len,
-            max_input_tokens=max_input_tokens,  
+            max_input_tokens=max_input_tokens,
             max_batch_size=max_batch_size,
             model_config=model_config,
             model_weights=weights,
@@ -370,13 +369,15 @@ def refit(
     ):
         from .trt_llm.nemo.nemo_ckpt_convert import convert_nemo_model
         from .trt_llm.tensorrt_llm_run import create_gpt_session
+
         assert self.use_refit, "TRT-LLM model must be built() with refit=True"
-    
+
         print_mem("pre refit")
         import time
+
         tic = time.time()
-        
-        # Build or refit TRT-LLM engine from a nemo model. 
+
+        # Build or refit TRT-LLM engine from a nemo model.
         weights = convert_nemo_model(
             nemo_model=nemo_model,
             nemo_model_config=nemo_model_config,
@@ -396,7 +397,7 @@ def refit(
 
         tic = time.time()
         session = self.model_runner.session
-        session.refit_engine(weights, self.session_params.model_config.data_type) 
+        session.refit_engine(weights, self.session_params.model_config.data_type)
         toc = time.time()
         print(f"refit_runtime_engine took {toc-tic}")
 

diff --git a/nemo/export/trt_llm/nemo/convert.py b/nemo/export/trt_llm/nemo/convert.py
@@ -413,7 +413,7 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t
     return weights_dict
 
 
-#Similar to split_save_weight but done on GPU for performance
+# Similar to split_and_save_weight, but done on GPU for performance
 @torch.no_grad()
 def save_weight_torch(key, val, config):
     num_layers = config["num_layers"]
@@ -425,23 +425,23 @@ def save_weight_torch(key, val, config):
     num_kv_heads = config["num_kv_heads"]
     move_to_cpu = config["move_to_cpu"]
     save_dict = config["save_dict"]
-    
+
     def save(key, tensor, add_prefix=True):
         assert torch.is_tensor(tensor)
         if add_prefix:
             key = f"transformer.{key}"
-        
+
         if len(tensor.shape) >= 2:
             tensor = tensor.reshape(tensor.shape[0], -1)
-            tensor = torch.transpose(tensor, 0 , 1)
+            tensor = torch.transpose(tensor, 0, 1)
         tensor = tensor.detach().contiguous()
         tensor = tensor.to(storage_type)
-        
+
         if move_to_cpu:
             if key not in save_dict:
                 cpu_copy = torch.empty(
-                    tensor.size(), dtype=tensor.dtype,
-                    layout=tensor.layout, device="cpu", pin_memory=True)
+                    tensor.size(), dtype=tensor.dtype, layout=tensor.layout, device="cpu", pin_memory=True
+                )
                 cpu_copy.copy_(tensor, non_blocking=True)
                 save_dict[key] = cpu_copy
             else:
@@ -450,12 +450,13 @@ def save(key, tensor, add_prefix=True):
             save_dict[key] = tensor.cuda()
 
     if config.get("transpose_weights", False) and val.ndim == 2:
-        val = val.T 
+        val = val.T
 
     if "self_attention" in key:
         key = key.replace("self_attention", "attention")
 
-    if ('layer_norm_weight' in key
+    if (
+        'layer_norm_weight' in key
         or 'layernorm.weight' in key
         or "final_layernorm.weight" in key
         or "ln_f.weight" in key
@@ -471,8 +472,8 @@ def save(key, tensor, add_prefix=True):
             val = val.float() + 1.0
         save(key, val)
     elif (
-        "input_layernorm.bias" in key 
-        or "pre_mlp_layernorm.bias" in key 
+        "input_layernorm.bias" in key
+        or "pre_mlp_layernorm.bias" in key
         or "ln_f.bias" in key
         or "vocab_embedding" in key
     ):
@@ -485,7 +486,7 @@ def save(key, tensor, add_prefix=True):
         or "mlp.dense_4h_to_h.weight" in key
         or "attention.linear_proj.weight" in key
         or "mlp.linear_fc2.weight" in key
-        or "mlp.dense_h_to_4h_2.weight" in key 
+        or "mlp.dense_h_to_4h_2.weight" in key
         or "mlp.dense_h_to_4h_2.bias" in key
     ):
         if "attention.linear_proj.weight" in key:
@@ -501,12 +502,12 @@ def save(key, tensor, add_prefix=True):
         or "mlp.linear_fc1.bias" in key
     ):
         if split_gated_activation:
-            val, gate = torch.chunk(val, 2, axis=-1) 
+            val, gate = torch.chunk(val, 2, axis=-1)
 
         if "mlp.linear_fc1" in key:
             key = key.replace("mlp.linear_fc1", "mlp.fc")
         save(key, val)
-        
+
         if split_gated_activation:
             key = key.replace("mlp.fc", "mlp.gate")
             save(key, gate)
@@ -522,17 +523,14 @@ def save(key, tensor, add_prefix=True):
         val = val.reshape(hidden_dim, num_kv_heads // tp_size, q_num + 2, size_per_head)
 
         # Split the QKV to separate variables.
-        #[qqqqkkvv] - > [qqqq,kk,vv]
+        # [qqqqkkvv] -> [qqqq,kk,vv]
         qkv = torch.split(val, [q_num, 1, 1], dim=2)
-        split_vals = torch.concatenate([
-                qkv[0].reshape(hidden_dim, -1), 
-                qkv[1].reshape(hidden_dim, -1), 
-                qkv[2].reshape(hidden_dim, -1)
-            ], dim=1)
+        split_vals = torch.concatenate(
+            [qkv[0].reshape(hidden_dim, -1), qkv[1].reshape(hidden_dim, -1), qkv[2].reshape(hidden_dim, -1)], dim=1
+        )
         save(key, split_vals)
 
     elif "lm_head.weight" in key:
         save(key, val, add_prefix=False)
     else:
         raise RuntimeError(f"{key} not handled by NeMo->TRTLLM converter!")
-