NCCL P2P communication chunk size
Signed-off-by: Sangkug Lym <slym@nvidia.com>
erhoo82 committed Nov 15, 2024
1 parent bf7cc64 commit d4661a1
Showing 1 changed file with 8 additions and 0 deletions.
8 changes: 8 additions & 0 deletions nemo/lightning/run/plugins.py
@@ -260,6 +260,7 @@ class PerfEnvPlugin(run.Plugin):
     enable_layernorm_sm_margin: bool = True
     layernorm_sm_margin: int = 16
     enable_vboost: bool = False
+    nccl_pp_comm_chunksize: int = None
 
     def get_vboost_srun_cmd(self, nodes, job_dir):
         import shlex
@@ -294,6 +295,13 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
             executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)
             executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)
 
+        # Use a large chunk size (2MB) for P2P communications to reduce the buffering overhead
+        # from the communication kernel execution time
+        pp_size = task.trainer.strategy.pipeline_model_parallel_size
+        if pp_size > 1 and self.nccl_pp_comm_chunksize is not None:
+            assert isinstance(self.nccl_pp_comm_chunksize, int) and self.nccl_pp_comm_chunksize > 1
+            executor.env_vars["NCCL_P2P_NET_CHUNKSIZE"] = str(self.nccl_pp_comm_chunksize)
+
         # Improve perf by steering power to tensor cores, may not work on all systems
         if self.enable_vboost and isinstance(executor, run.SlurmExecutor):
             vboost_cmd = self.get_vboost_srun_cmd(executor.nodes, executor.job_dir)
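For context, below is a minimal sketch of how the new option might be used from a NeMo-Run launch script. Only PerfEnvPlugin, its nccl_pp_comm_chunksize field, and the NCCL_P2P_NET_CHUNKSIZE environment variable come from this commit; the import alias, recipe, and executor names are assumptions shown purely for illustration.

    # Illustrative only: wire the new chunk-size knob into a training launch.
    import nemo_run as run  # assumed NeMo-Run entry point
    from nemo.lightning.run.plugins import PerfEnvPlugin

    # 2 MB chunks, matching the "(2MB)" value suggested in the code comment above.
    # The plugin only exports NCCL_P2P_NET_CHUNKSIZE when this field is set and
    # pipeline_model_parallel_size > 1, and it requires an int value greater than 1.
    perf_env = PerfEnvPlugin(nccl_pp_comm_chunksize=2 * 1024 * 1024)

    # Hypothetical call site; the recipe and executor are placeholders:
    # run.run(pretrain_recipe, executor=slurm_executor, plugins=[perf_env])

With that setting, setup() would place NCCL_P2P_NET_CHUNKSIZE=2097152 in the job's environment variables for pipeline-parallel runs.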
