multinode fix again
Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>
jiemingz committed May 17, 2024
1 parent a4a6243 commit 6718d0a
Showing 2 changed files with 27 additions and 2 deletions.
24 changes: 23 additions & 1 deletion nemo/export/tensorrt_llm.py
@@ -315,10 +315,31 @@ def build(
# TRTLLM asserts that rank equals the device num however this
# is not true for the megatron core mapping TP->DP->PP.
# So we manipulate TRTLLM to emulate a TP->PP single node setup
# Determine the actual number of GPUs per node across all ranks.
global_devices = [None for _ in range(torch.distributed.get_world_size())]
torch.distributed.all_gather_object(global_devices, torch.cuda.current_device())
gpus_per_node = max(global_devices) + 1

# Roll the device ids so that index (mp_rank % gpus_per_node) is this rank's local CUDA device.
roll_amt = (torch.cuda.current_device() - mp_rank % gpus_per_node + gpus_per_node) % gpus_per_node
device_ids = [i for i in range(gpus_per_node)]
for _ in range(roll_amt):
    device_ids.append(device_ids.pop(0))

print(f"{torch.cuda.current_device()} mp_rank {mp_rank}: device_ids {device_ids} roll_amt {roll_amt}")

tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
mapping = tensorrt_llm.Mapping(
    world_size=mp_size,
    rank=mp_rank,
    gpus_per_node=gpus_per_node,
    tp_size=tp_size,
    pp_size=pp_size,
)

LOGGER.info(
f'''TRT-LLM rank mapping: Rank {torch.distributed.get_rank()}, mp_rank {mp_rank}:
@@ -357,7 +378,8 @@ def build(
json.dump(engine.config.to_dict(), f, indent=4)

print_mem("post build_and_save_engine")
self.model_runner, self.session_params = load_refit(engine_dir=self.model_dir)
self.model_runner, self.session_params = load_refit(
engine_dir=self.model_dir, device_ids=device_ids, gpus_per_node=gpus_per_node)
print_mem("post load_refit")

print(f"engine saved to {self.model_dir} device: {origdev} {torch.cuda.current_device()}")
5 changes: 4 additions & 1 deletion nemo/export/trt_llm/tensorrt_llm_run.py
@@ -321,7 +321,7 @@ def create_gpt_session(session_params: GptSession_params, engine_data: bytearray
)


def load_refit(engine_dir):
def load_refit(engine_dir, device_ids, gpus_per_node):
"""Loaded the compiled LLM model and run it.
It also supports running the TRT LLM model on multi-GPU.
@@ -337,10 +337,13 @@ def load_refit(engine_dir):

# TRTLLM assumes rank < gpus_per_node but this is not true for multinode setups
# so the caller passes in the actual gpus_per_node and the rolled device_ids.

world_config = WorldConfig.mpi(
gpus_per_node=gpus_per_node, tensor_parallelism=tp_size, pipeline_parallelism=pp_size, device_ids=device_ids
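For context, a hedged sketch of the call site in nemo/export/tensorrt_llm.py that feeds this updated signature; the engine path and the 8-GPU-per-node values are placeholders, not values taken from the diff:

# Illustrative call only; engine_dir and the device layout are placeholders.
# device_ids is the rolled per-node list computed in build(), and gpus_per_node
# comes from the all_gather of torch.cuda.current_device() across ranks.
model_runner, session_params = load_refit(
    engine_dir="/path/to/trt_llm_engine_dir",   # hypothetical path
    device_ids=[5, 6, 7, 0, 1, 2, 3, 4],        # example roll for an 8-GPU node
    gpus_per_node=8,
)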
