diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index 7fdd6eac314c..76a182706481 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -307,7 +307,6 @@ def build(
         mp_group = parallel_state.get_model_parallel_group()
         mp_size = tp_size*pp_size
         mp_rank = tp_size*pp_rank + tp_rank
-
         if dp_size > 1:
             self.model_dir = os.path.join(self.model_dir, f"dp_rank{dp_rank}")
 
@@ -316,9 +315,9 @@ def build(
         # So we manipulate TRTLLM to emulate a TP->PP single node setup
         tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
         device_ids = [
-            ((i+mp_rank-torch.cuda.current_device())+mp_size)%mp_size
+            ((i+torch.cuda.current_device()-mp_rank)+mp_size)%mp_size
             for i in range(mp_size)]
-
+
         mapping = tensorrt_llm.Mapping(
             world_size = mp_size,
             rank = mp_rank,
@@ -327,7 +326,7 @@ def build(
             pp_size = pp_size)
 
         LOGGER.info(
-            f'''TRT-LLM rank mapping: Rank {torch.distributed.get_rank()} -> {mp_rank}:
+            f'''TRT-LLM rank mapping: Rank {torch.distributed.get_rank()}, mp_rank {mp_rank}:
                tp_rank {parallel_state.get_tensor_model_parallel_rank()} -> {mapping.tp_rank},
                pp_rank {parallel_state.get_pipeline_model_parallel_rank()} -> {mapping.pp_rank}'''
         )
@@ -354,13 +353,14 @@ def build(
             model_dir=self.model_dir,
         )
         torch.distributed.barrier()
-        print(f"engine saved to {self.model_dir}")
 
-        if torch.cuda.current_device() == 0:
-            cfg_path = Path(os.path.join(self.model_dir, 'config.json'))
-            if not cfg_path.exists():
-                with open(cfg_path, "w", encoding="utf-8") as f:
-                    json.dump(engine.config.to_dict(), f, indent=4)
+        myrank = torch.distributed.get_rank()
+        cfg_path = Path(os.path.join(self.model_dir, f'config_{myrank}.json'))
+        print(f"engine saved to {self.model_dir}")
+        print(self.model_dir, f'config_{myrank}.json')
+        if not cfg_path.exists():
+            with open(cfg_path, "w", encoding="utf-8") as f:
+                json.dump(engine.config.to_dict(), f, indent=4)
 
         print_mem("post build_and_save_engine")
 
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index 1ce666e83e4f..e6778b576103 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -313,7 +313,7 @@ def load_refit(engine_dir, device_ids):
 
     It also supports running the TRT LLM model on multi-GPU.
     """
-    config_path = Path(engine_dir) / "config.json"
+    config_path = Path(engine_dir) / f"config_{torch.distributed.get_rank()}.json"
     json_config = GptJsonConfig.parse_file(config_path)
     model_config = json_config.model_config
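
Note on the `device_ids` change above (not part of the patch): the corrected expression maps each model-parallel rank index to the local CUDA device that hosts it, assuming every rank in the group shares the same constant offset between its local device index and its `mp_rank`. A minimal standalone sketch of that computation; the helper name `remap_device_ids` and the example values are illustrative, not code from the repository:

```python
# Sketch of the corrected device_ids remapping: for each model-parallel rank i,
# return the local CUDA device index that hosts rank i, given this process's
# own mp_rank and local device. Assumes a constant (device - mp_rank) offset
# across the group.
def remap_device_ids(mp_rank, current_device, mp_size):
    offset = current_device - mp_rank
    return [(i + offset + mp_size) % mp_size for i in range(mp_size)]

# Example: a process with mp_rank 1 on local device 2 in a group of 4 sees
# rank 0 on device 1, rank 1 on itself (device 2), rank 2 on 3, rank 3 on 0.
assert remap_device_ids(mp_rank=1, current_device=2, mp_size=4) == [1, 2, 3, 0]
```

The patch also makes the engine config per-rank: `build` writes `config_{myrank}.json` and `load_refit` reads `config_{torch.distributed.get_rank()}.json`, so both sides must derive the file name from the same rank value.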