
Commit

tiny cleanup
Signed-off-by: Terry Kong <terryk@nvidia.com>
terrykong committed Oct 2, 2024
1 parent f3550aa commit 1d19459
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions nemo/export/trt_llm/tensorrt_llm_run.py
@@ -478,13 +478,15 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
         device_ids.append(device_ids.pop(0))
     engine_index = model_parallel_rank
     mpi_rank = mpi_comm().Get_rank()
-    # TODO: copied from worldConfig.h (getDevice())
+    # Copied from worldConfig.h (getDevice())
     mpi_device = mpi_rank % gpus_per_node

Check notice (Code scanning / CodeQL): Unused local variable. Variable mpi_device is not used.
+    # TODO: Consider re-enabling
+    # assert torch.cuda.current_device() == mpi_device
+
     # TODO: check if API exists (copied from gptJsonConfig.cpp)
     # https://github.com/terrykong/TensorRT-LLM/blob/05316d3313360012536ace46c781518f5afae75e/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp#L478
     engine_filename = f"rank{engine_index}.engine"
     serialize_path = Path(engine_dir) / engine_filename
-    # $#$#$assert torch.cuda.current_device() == mpi_device
     with open(serialize_path, "rb") as f:
         engine_data = bytearray(f.read())
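Note: the still-commented assert in this hunk checks the node-local rank-to-GPU mapping. A minimal sketch of that check, assuming mpi4py stands in for NeMo's mpi_comm() helper (the function name device_binding_ok is hypothetical, not part of the diff):

from mpi4py import MPI

import torch


def device_binding_ok(gpus_per_node: int) -> bool:
    """True when this process is bound to the GPU the MPI mapping predicts."""
    mpi_rank = MPI.COMM_WORLD.Get_rank()
    # Same mapping as worldConfig.h (getDevice()): ranks fill each node's GPUs in order.
    mpi_device = mpi_rank % gpus_per_node
    return torch.cuda.current_device() == mpi_device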

@@ -494,9 +496,8 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
     engine = Engine.from_buffer(engine_buffer=engine_data, json_config_str=json_config_str, rank=model_parallel_rank)
     decoder = ModelRunner.from_engine(
         engine=engine,
-        # rank=world_config.rank,
         # We want the engine to have the mp_rank, but the python runtime to not reassign the device of the current process
-        # So we will set it to the current
+        # So we will set it to the current device
         rank=torch.cuda.current_device(),
         _disable_torch_cuda_device_set=True,
     )
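For context on the second hunk: the engine buffer is built for the model-parallel rank, while the Python runtime should not re-bind the process to a different GPU. A rough sketch of why passing rank=torch.cuda.current_device() together with _disable_torch_cuda_device_set=True achieves that, assuming a runtime that would otherwise derive the device from the rank (the runner class below is illustrative, not TensorRT-LLM's actual code):

import torch


class _IllustrativeRunner:
    """Toy stand-in for a runtime that maps rank -> device unless told not to."""

    def __init__(self, rank: int, disable_device_set: bool):
        if not disable_device_set:
            # A naive runtime would re-bind the process like this, which can
            # fight with a device binding already chosen by the MPI launcher:
            torch.cuda.set_device(rank % torch.cuda.device_count())
        self.rank = rank


# Passing the current device as the rank and disabling the device set keeps
# the process on the GPU it already owns:
runner = _IllustrativeRunner(
    rank=torch.cuda.current_device(),
    disable_device_set=True,
)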
