multinode fix again
Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>
jiemingz committed May 17, 2024
1 parent a4a6243 commit 6718d0a
Showing 2 changed files with 27 additions and 2 deletions.
24 changes: 23 additions & 1 deletion nemo/export/tensorrt_llm.py
@@ -315,10 +315,31 @@ def build(
# TRTLLM asserts that rank equals the device num however this
# is not true for the megatron core mapping TP->DP->PP.
# So we manipulate TRTLLM to emulate a TP->PP single node setup
# Determine the actual number of GPUs per node across all ranks.
global_devices = [None for _ in range(torch.distributed.get_world_size())]
torch.distributed.all_gather_object(global_devices, torch.cuda.current_device())
gpus_per_node = max(global_devices) + 1

# Roll the device ids so that index (mp_rank % gpus_per_node) is this rank's local CUDA device.
roll_amt = (torch.cuda.current_device() - mp_rank % gpus_per_node + gpus_per_node) % gpus_per_node
device_ids = [i for i in range(gpus_per_node)]
for _ in range(roll_amt):
    device_ids.append(device_ids.pop(0))

print(f"{torch.cuda.current_device()} mp_rank {mp_rank}: device_ids {device_ids} roll_amt {roll_amt}")

tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
mapping = tensorrt_llm.Mapping(
    world_size=mp_size,
    rank=mp_rank,
    gpus_per_node=gpus_per_node,
    tp_size=tp_size,
    pp_size=pp_size,
)

LOGGER.info(
f'''TRT-LLM rank mapping: Rank {torch.distributed.get_rank()}, mp_rank {mp_rank}:
@@ -357,7 +378,8 @@ def build(
json.dump(engine.config.to_dict(), f, indent=4)

print_mem("post build_and_save_engine")
self.model_runner, self.session_params = load_refit(engine_dir=self.model_dir)
self.model_runner, self.session_params = load_refit(
engine_dir=self.model_dir, device_ids=device_ids, gpus_per_node=gpus_per_node)
print_mem("post load_refit")

print(f"engine saved to {self.model_dir} device: {origdev} {torch.cuda.current_device()}")
5 changes: 4 additions & 1 deletion nemo/export/trt_llm/tensorrt_llm_run.py
@@ -321,7 +321,7 @@ def create_gpt_session(session_params: GptSession_params, engine_data: bytearray
)


def load_refit(engine_dir):
def load_refit(engine_dir, device_ids, gpus_per_node):
"""Loaded the compiled LLM model and run it.
It also supports running the TRT LLM model on multi-GPU.
@@ -337,10 +337,13 @@ def load_refit(engine_dir):

# TRTLLM assumes rank < gpus_per_node but this is not true for multinode setups
# so the caller passes in the actual gpus_per_node and the rolled device_ids.

world_config = WorldConfig.mpi(
gpus_per_node=gpus_per_node, tensor_parallelism=tp_size, pipeline_parallelism=pp_size, device_ids=device_ids
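For context, a hedged sketch of the call site in nemo/export/tensorrt_llm.py that feeds this updated signature; the engine path and the 8-GPU-per-node values are placeholders, not values taken from the diff:

# Illustrative call only; engine_dir and the device layout are placeholders.
# device_ids is the rolled per-node list computed in build(), and gpus_per_node
# comes from the all_gather of torch.cuda.current_device() across ranks.
model_runner, session_params = load_refit(
    engine_dir="/path/to/trt_llm_engine_dir",   # hypothetical path
    device_ids=[5, 6, 7, 0, 1, 2, 3, 4],        # example roll for an 8-GPU node
    gpus_per_node=8,
)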
