diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index bef20b13f198..3223c4a90646 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -361,6 +361,7 @@ def build(
 
         print_mem("post build_and_save_engine")
 
         self.model_runner, self.session_params = load_refit(engine_dir=self.model_dir)
+        print_mem("post load_refit")
 
         print(f"device: {origdev} {torch.cuda.current_device()}")
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
index c8c81b277180..17eb13d3a999 100644
--- a/nemo/export/trt_llm/tensorrt_llm_build.py
+++ b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -411,9 +411,10 @@ def build_and_save_engine(
         build_config.lora_config = lora_config
 
     model = model_cls.from_config(model_config)
+    # use_parallel_embedding=True,
+
     model = optimize_model(
         model,
-        use_parallel_embedding=True,
         share_embedding_table=model_config.share_embedding_table,
     )
     preprocess_weights(model_weights, model_config)
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index f5f6a963cc10..6d9cce99bdbb 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -323,10 +323,11 @@ def load_refit(engine_dir):
 
     # TRTLLM assumes rank < gpus_per_node but this is not true for multinode setups
     # So hack around this using an arbitrarily big gpus_per_node to avoid asserts
-    gpus_per_node = 9999
+    gpus_per_node = 64
     mp_rank = tensorrt_llm.bindings.MpiComm.getRank()
     device_ids = [
-        (i+torch.cuda.current_device()-mp_rank) for i in range(mp_size)]
+        (i+torch.cuda.current_device()-mp_rank+gpus_per_node)%gpus_per_node
+        for i in range(mp_size)]
     print(f"{torch.cuda.current_device()} device_ids {device_ids}")
 
     world_config = WorldConfig.mpi(gpus_per_node=gpus_per_node,
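
The effect of the reworked device_ids expression in load_refit can be checked in isolation. The snippet below is a minimal sketch, not part of the patch: it reimplements the same wrap-around arithmetic with hypothetical values for current_device, mp_rank, mp_size, and gpus_per_node to show why adding gpus_per_node before the modulo keeps every index non-negative and inside [0, gpus_per_node).

# Standalone sketch of the device-id mapping from load_refit (hypothetical inputs,
# no torch/MPI needed); same arithmetic as the patched list comprehension.
def compute_device_ids(current_device, mp_rank, mp_size, gpus_per_node):
    return [
        (i + current_device - mp_rank + gpus_per_node) % gpus_per_node
        for i in range(mp_size)
    ]

# Hypothetical example: local device 1, model-parallel rank 9 of 16, gpus_per_node=64.
# The old expression (i + current_device - mp_rank) would go negative for i < 8;
# the new one wraps those values back into range.
print(compute_device_ids(current_device=1, mp_rank=9, mp_size=16, gpus_per_node=64))
# [56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 2, 3, 4, 5, 6, 7]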