diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index bef20b13f198..3223c4a90646 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -361,6 +361,7 @@ def build(
 
         print_mem("post build_and_save_engine")
 
         self.model_runner, self.session_params = load_refit(engine_dir=self.model_dir)
+        print_mem("post load_refit")
 
         print(f"device: {origdev} {torch.cuda.current_device()}")
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
index c8c81b277180..17eb13d3a999 100644
--- a/nemo/export/trt_llm/tensorrt_llm_build.py
+++ b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -411,9 +411,10 @@ def build_and_save_engine(
         build_config.lora_config = lora_config
 
     model = model_cls.from_config(model_config)
+    # use_parallel_embedding=True,
+
     model = optimize_model(
         model,
-        use_parallel_embedding=True,
         share_embedding_table=model_config.share_embedding_table,
     )
     preprocess_weights(model_weights, model_config)
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index f5f6a963cc10..6d9cce99bdbb 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -323,10 +323,11 @@ def load_refit(engine_dir):
 
     # TRTLLM assumes rank < gpus_per_node but this is not true for multinode setups
     # So hack around this using an arbitrarily big gpus_per_node to avoid asserts
-    gpus_per_node = 9999
+    gpus_per_node = 64
     mp_rank = tensorrt_llm.bindings.MpiComm.getRank()
     device_ids = [
-        (i+torch.cuda.current_device()-mp_rank) for i in range(mp_size)]
+        (i+torch.cuda.current_device()-mp_rank+gpus_per_node)%gpus_per_node
+        for i in range(mp_size)]
     print(f"{torch.cuda.current_device()} device_ids {device_ids}")
 
     world_config = WorldConfig.mpi(gpus_per_node=gpus_per_node,
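
The effect of the reworked device_ids expression in load_refit can be checked in isolation. The snippet below is a minimal sketch, not part of the patch: it reimplements the same wrap-around arithmetic with hypothetical values for current_device, mp_rank, mp_size, and gpus_per_node to show why adding gpus_per_node before the modulo keeps every index non-negative and inside [0, gpus_per_node).

# Standalone sketch of the device-id mapping from load_refit (hypothetical inputs,
# no torch/MPI needed); same arithmetic as the patched list comprehension.
def compute_device_ids(current_device, mp_rank, mp_size, gpus_per_node):
    return [
        (i + current_device - mp_rank + gpus_per_node) % gpus_per_node
        for i in range(mp_size)
    ]

# Hypothetical example: local device 1, model-parallel rank 9 of 16, gpus_per_node=64.
# The old expression (i + current_device - mp_rank) would go negative for i < 8;
# the new one wraps those values back into range.
print(compute_device_ids(current_device=1, mp_rank=9, mp_size=16, gpus_per_node=64))
# [56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 2, 3, 4, 5, 6, 7]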