fix: adds missing support for mcore dist opt and adds test for moe
Signed-off-by: Terry Kong <terryk@nvidia.com>

moe test is all2all

Signed-off-by: Terry Kong <terryk@nvidia.com>

other params

Signed-off-by: Terry Kong <terryk@nvidia.com>

fix peft mixtral

Signed-off-by: Terry Kong <terryk@nvidia.com>

dockerfile bump to be on dev

Signed-off-by: Terry Kong <terryk@nvidia.com>

just take dockerfile on dev

Signed-off-by: Terry Kong <terryk@nvidia.com>
terrykong committed Dec 6, 2024
1 parent b0dd4d5 commit e897fd7
Showing 21 changed files with 185 additions and 51 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -95,6 +95,8 @@ jobs:
- kd-llama3
- sft-llama3
- rm-llama3
- dpo-mixtral-ep
- dpo-mixtral-peft-tp-sp
with:
RUNNER: self-hosted-azure
# Fairly aggressive timeout that all functional tests should try to adhere to
20 changes: 10 additions & 10 deletions Dockerfile
@@ -13,8 +13,8 @@ ARG MAX_JOBS=8
# Git refs for dependencies
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG PYTRITON_VERSION=0.5.10
ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main
ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main
ARG NEMO_TAG=06eae2895c0fea09f8dd7c34feff0163e55c419a # On: main
ARG MLM_TAG=844119f5c856a3037ec7c7f6d6ef7b3518ceee6b # On: main
ARG ALIGNER_COMMIT=main
ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4
@@ -123,19 +123,19 @@ RUN cd /opt/NeMo-Aligner && \

RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch

# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
# NOTE: Comment this layer out if it is not needed
# NOTE: This section exists to allow cherry-picking PRs in cases where
# we do not wish to simply update to the top-of-tree. Sometimes PRs
# cannot be cherry-picked cleanly if rebased a few times to top-of-tree
# so this logic also requires you to select a SHA (can be dangling) from
# the PR.
RUN <<"EOF" bash -exu
cd NeMo
# Ensures we don't cherry-pick "future" origin/main commits
git fetch -a
# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
# (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
# d27dd28b4186f6ecd9f46f1c5679a5eef9bad14e: fix: export weight name mapping if model is nemo model#11497
for pr_and_commit in \
"10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
"10652 60e677423667c029dd05875da72bf0719774f844" \
"10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
"11497 d27dd28b4186f6ecd9f46f1c5679a5eef9bad14e" \
; do
pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
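The loop above is truncated in this view, but the mechanism is the one the comment describes: for each "PR SHA" pair, fetch the pull request's head ref and cherry-pick the recorded commit, which may be dangling on origin/main. A minimal Python sketch of that flow, assuming the standard refs/pull/<number>/head convention; the helper and repo path are illustrative only, not the Dockerfile's actual shell code:

import subprocess

def cherry_pick_pr(pr: str, head_commit: str, repo_dir: str = "NeMo") -> None:
    """Fetch a GitHub PR ref and cherry-pick a specific (possibly dangling) SHA."""
    def git(*args: str) -> None:
        subprocess.run(["git", "-C", repo_dir, *args], check=True)

    # GitHub exposes every pull request's head at refs/pull/<number>/head,
    # so the commit stays fetchable even if it is no longer on origin/main.
    git("fetch", "origin", f"pull/{pr}/head")
    # cherry-pick needs user.name/user.email configured in the build image
    git("cherry-pick", head_commit)

for pr, sha in [
    ("10651", "0c92fe17df4642ffc33d5d8c0c83fda729e3910c"),
    ("11497", "d27dd28b4186f6ecd9f46f1c5679a5eef9bad14e"),
]:
    cherry_pick_pr(pr, sha)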
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_dpo.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# dpo specific args
dpo:
@@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
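The two-line pattern above for gpt_dpo.yaml repeats in each config file that follows: a new trainer.gradient_clip_val: 0.0, which the inline comment marks as the value the Megatron Core optimizer picks up, alongside the existing algorithm-level gradient_clip_val: 1.0 that stays until the Megatron Core optimizer becomes the default. A short, self-contained snippet for reading both values with OmegaConf; the paths come from the YAML above, and the snippet itself is not part of the change:

from omegaconf import OmegaConf

# Self-contained stand-in for the relevant part of gpt_dpo.yaml
cfg = OmegaConf.create(
    """
    trainer:
      gradient_clip_val: 0.0   # per the comment above, the value the Megatron Core optimizer picks up
      dpo:
        limit_val_batches: 1.0
        gradient_clip_val: 1.0 # legacy value, slated for removal once the Megatron Core optimizer is the default
    """
)

# Both settings coexist during the transition; downstream code reads them from
# different levels of the trainer section.
print(cfg.trainer.gradient_clip_val)      # 0.0
print(cfg.trainer.dpo.gradient_clip_val)  # 1.0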
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_kto.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# kto specific args
kto:
@@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_ppo_actor.yaml
@@ -7,6 +7,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

ppo:
# How many steps we warm up the critic for (without training the policy)
@@ -21,6 +22,7 @@ trainer:
max_steps: -1 # max PPO steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# PPO args to generate the data for training
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_ppo_critic.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

ppo:
port: 5556
@@ -15,6 +16,7 @@ trainer:

# used to set the learning rate scheduler
max_steps: 10000
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# a PyTriton parameter to specify
4 changes: 3 additions & 1 deletion examples/nlp/gpt/conf/gpt_rs_actor.yaml
@@ -7,12 +7,14 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

rs:
max_epochs: 1
max_steps: -1 # max rs steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# pick up from the model
@@ -177,4 +179,4 @@ model:
# define fields from the base model's config that should be ignored when merging with this config.
overwrite_base_config:
data:
data_prefix: True
data_prefix: True
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_sft.yaml
@@ -5,6 +5,7 @@ trainer:
devices: 1
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

sft:
max_epochs: 1
@@ -15,6 +16,7 @@ trainer:
limit_train_batches: 1.0

limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# can be used to register any custom metrics that require token-by-token generation
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/gpt_spin.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16-mixed
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# spin specific args
spin:
@@ -18,6 +19,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
2 changes: 2 additions & 0 deletions examples/nlp/gpt/conf/training_rm.yaml
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# rm specific args
rm:
@@ -20,6 +21,7 @@ trainer:
# set to float for a percentage
# of the validation dataset
limit_val_batches: 1.0
# TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/critic_server_trainer.py
@@ -322,7 +322,7 @@ def run_training(self, tokens=None, returns=None, prev_values=None, mask=None):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
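The same one-line change, self.optimizer.step() becoming self.optimizer.step(closure=None), is applied in dpo.py, ppo.py, rs.py, spin.py, and supervised.py below. A plausible reading (an assumption, not stated in the diff) is that the Megatron Core distributed optimizer wrapper declares step(self, closure) without a default, whereas torch.optim optimizers use step(self, closure=None), so passing closure=None explicitly keeps a single call site valid for both. A minimal sketch with hypothetical stand-in classes:

class TorchLikeOptimizer:
    def step(self, closure=None):
        return "stepped (torch-style)"

class McoreLikeOptimizer:
    def step(self, closure):  # no default: closure must be supplied
        assert closure is None, "closures not supported in this sketch"
        return "stepped (mcore-style)"

for opt in (TorchLikeOptimizer(), McoreLikeOptimizer()):
    # opt.step() would raise TypeError for McoreLikeOptimizer;
    # the explicit keyword argument works for both wrappers.
    print(opt.step(closure=None))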
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/dpo.py
@@ -220,7 +220,7 @@ def train_single_step(self, global_batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/ppo.py
@@ -440,7 +440,7 @@ def run_training(self, dataloader_iter):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/rs.py
@@ -294,7 +294,7 @@ def run_training(self, dataloader_iter):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

if grad_norm is not None:
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/spin.py
@@ -195,7 +195,7 @@ def train_single_step(self, global_batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
2 changes: 1 addition & 1 deletion nemo_aligner/algorithms/supervised.py
@@ -150,7 +150,7 @@ def train_single_step(self, batch):
grad_norm = grad_norm.item() if torch.is_tensor(grad_norm) else grad_norm
lr = self.optimizer.param_groups[0]["lr"]

self.optimizer.step()
self.optimizer.step(closure=None)
self.scheduler.step()

trainer_metrics = {}
56 changes: 44 additions & 12 deletions nemo_aligner/utils/train_utils.py
@@ -101,31 +101,52 @@ def prepare_for_training_step(ptl_model, zero_grad=True):
param.data_ptr()


# TODO: Delete this once API introduced in NeMo (https://github.com/NVIDIA/NeMo/pull/10803)
# TODO: Update PR to move this logic into staticmethod in nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
def grad_reductions(ptl_model):
# when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced
if ptl_model.cfg.get("tensor_model_parallel_size", 1) > 1 and ptl_model.cfg.get("sequence_parallel", False):
ptl_model.allreduce_sequence_parallel_gradients()

if ptl_model.with_distributed_adam:
# synchronize asynchronous grad reductions
# note: not necessary, but reduces performance degradation
# from multiple simultaneous NCCL calls
ptl_model._optimizer._finish_bucket_grad_sync()
# Mcore DistOpt handles this, so we don't have to
if not ptl_model.use_mcore_dist_optim:
ptl_model.megatron_timer_start("allreduce_sequence_parallel_gradients", log_level=1)
ptl_model.allreduce_sequence_parallel_gradients()
ptl_model.megatron_timer_stop("allreduce_sequence_parallel_gradients")

ptl_model.megatron_timer_start("gradient_allreduce", log_level=1)
if ptl_model.use_fsdp:
# Reduce the gradients omitted from FSDP-sharding
ptl_model.allreduce_fsdp_sharding_omitted_gradients()
elif ptl_model.with_distributed_adam:
if not ptl_model.use_mcore_dist_optim:
# synchronize asynchronous grad reductions
# note: not necessary, but reduces performance degradation
# from multiple simultaneous NCCL calls
ptl_model._optimizer._finish_bucket_grad_sync()
# else: Mcore distributed optim calls finalize_model_grads to finish grad sync
elif ptl_model.megatron_amp_O2:
# when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously)
if ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 or ptl_model.cfg.get("sequence_parallel", False):
if (
ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1
or ptl_model.cfg.get("sequence_parallel", False)
or not ptl_model.cfg.get("async_grad_allreduce", True)
):
# main grads are stored in the MainParamsOptimizer wrapper
ptl_model._optimizer.allreduce_main_grads()
else:
# async grad allreduce is not currently implemented for O1/autocasting mixed precision training
# so we all-reduce gradients after the pipeline
ptl_model.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf)
ptl_model.megatron_timer_stop("gradient_allreduce")

if ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1 and ptl_model.cfg.get(
"share_embeddings_and_output_weights", True
if (
not ptl_model.use_mcore_dist_optim
and ptl_model.cfg.get("pipeline_model_parallel_size", 1) > 1
and ptl_model.cfg.get("share_embeddings_and_output_weights", True)
):
ptl_model.megatron_timer_start("allreduce_first_last_embeddings", log_level=1)
# when using pipeline parallelism the first and last stage must keep embeddings in sync
ptl_model.allreduce_first_last_embeddings()
ptl_model.megatron_timer_stop("allreduce_first_last_embeddings")


def prepare_for_validation_step(ptl_model):
@@ -155,14 +176,26 @@ def set_eval(ptl_model):
ptl_model.eval()


# TODO: adapt the version in /opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
def clip_gradients(ptl_model, clip_val):
"""PTL hook to configure gradients.
We use gradient clipping implementation from megatron-lm.
"""
if clip_val is None:
return

clip_val = float(clip_val)
if clip_val <= 0:
return

if ptl_model.with_megatron_fused_adam or ptl_model.use_mcore_dist_optim:
# Gradient clipping is done in optimizer step
return

if ptl_model.grad_clip_pl_default:
# use the default behavior
return super().configure_gradient_clipping(*args, **kwargs)

if ptl_model.with_distributed_adam:
grad_norm = clip_grad_norm_distributed_optimizer(ptl_model._optimizer, clip_val)
else:
@@ -171,6 +204,5 @@ def clip_gradients(ptl_model, clip_val):
parameters = ptl_model._optimizer.get_parameters_with_grad()
else:
parameters = ptl_model.get_parameters_with_grad()
grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val)

grad_norm = clip_grad_norm_fp32(parameters=parameters, max_norm=clip_val, use_fsdp=ptl_model.use_fsdp,)
return grad_norm
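The rewritten clip_gradients() ties back to the config change: a clip value of 0.0 or None is now a no-op, and with megatron_fused_adam or the Megatron Core distributed optimizer, clipping happens inside optimizer.step() instead. The stub below mirrors only that dispatch and is not the NeMo-Aligner implementation:

def clip_path(clip_val, *, with_megatron_fused_adam=False, use_mcore_dist_optim=False,
              grad_clip_pl_default=False, with_distributed_adam=False, use_fsdp=False):
    if clip_val is None or float(clip_val) <= 0:
        return "no-op"  # trainer.gradient_clip_val: 0.0 lands here
    if with_megatron_fused_adam or use_mcore_dist_optim:
        return "clipped inside optimizer.step()"
    if grad_clip_pl_default:
        return "PTL default clipping"
    if with_distributed_adam:
        return "clip_grad_norm_distributed_optimizer"
    return "clip_grad_norm_fp32 (use_fsdp=%s)" % use_fsdp

assert clip_path(0.0) == "no-op"
assert clip_path(1.0, use_mcore_dist_optim=True) == "clipped inside optimizer.step()"
assert clip_path(1.0, use_fsdp=True) == "clip_grad_norm_fp32 (use_fsdp=True)"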