NCCL P2P communication chunk size
Signed-off-by: Sangkug Lym <slym@nvidia.com>
erhoo82 committed Nov 15, 2024
1 parent bf7cc64 commit d4661a1
Showing 1 changed file with 8 additions and 0 deletions.
8 changes: 8 additions & 0 deletions nemo/lightning/run/plugins.py
@@ -260,6 +260,7 @@ class PerfEnvPlugin(run.Plugin):
     enable_layernorm_sm_margin: bool = True
     layernorm_sm_margin: int = 16
     enable_vboost: bool = False
+    nccl_pp_comm_chunksize: int = None
 
     def get_vboost_srun_cmd(self, nodes, job_dir):
         import shlex
@@ -294,6 +295,13 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
             executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)
             executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)
 
+        # Use a large chunk size (2MB) for P2P communications to reduce the buffering overhead
+        # from the communication kernel execution time
+        pp_size = task.trainer.strategy.pipeline_model_parallel_size
+        if pp_size > 1 and self.nccl_pp_comm_chunksize is not None:
+            assert isinstance(self.nccl_pp_comm_chunksize, int) and self.nccl_pp_comm_chunksize > 1
+            executor.env_vars["NCCL_P2P_NET_CHUNKSIZE"] = str(self.nccl_pp_comm_chunksize)
+
         # Improve perf by steering power to tensor cores, may not work on all systems
         if self.enable_vboost and isinstance(executor, run.SlurmExecutor):
             vboost_cmd = self.get_vboost_srun_cmd(executor.nodes, executor.job_dir)
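For context, below is a minimal sketch of how the new option might be used from a NeMo-Run launch script. Only PerfEnvPlugin, its nccl_pp_comm_chunksize field, and the NCCL_P2P_NET_CHUNKSIZE environment variable come from this commit; the import alias, recipe, and executor names are assumptions shown purely for illustration.

    # Illustrative only: wire the new chunk-size knob into a training launch.
    import nemo_run as run  # assumed NeMo-Run entry point
    from nemo.lightning.run.plugins import PerfEnvPlugin

    # 2 MB chunks, matching the "(2MB)" value suggested in the code comment above.
    # The plugin only exports NCCL_P2P_NET_CHUNKSIZE when this field is set and
    # pipeline_model_parallel_size > 1, and it requires an int value greater than 1.
    perf_env = PerfEnvPlugin(nccl_pp_comm_chunksize=2 * 1024 * 1024)

    # Hypothetical call site; the recipe and executor are placeholders:
    # run.run(pretrain_recipe, executor=slurm_executor, plugins=[perf_env])

With that setting, setup() would place NCCL_P2P_NET_CHUNKSIZE=2097152 in the job's environment variables for pipeline-parallel runs.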
