From 29853925298dc7b4c42f31137b880df402969829 Mon Sep 17 00:00:00 2001 From: yisheng Date: Sun, 4 Aug 2024 20:23:09 -0700 Subject: [PATCH] fix nan issue when running megatron-deepspeed --- megatron/training.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 6ba26f3944..e7d7aed809 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -672,7 +672,6 @@ def train_step(forward_step_func, data_iterator, timers = get_timers() if args.deepspeed and args.ds_pipeline_enabled: - skipped_iter = 0 num_zeros_in_grad = 0 assert isinstance(model[0], deepspeed.PipelineEngine) loss = model[0].train_batch(data_iter=data_iterator) @@ -682,6 +681,8 @@ def train_step(forward_step_func, data_iterator, if additional_losses is not None: loss_dict.update(additional_losses) grad_norm = model[0].get_global_grad_norm() + update_successful = model[0].was_step_applied() + skipped_iter = 0 if update_successful else 1 return loss_dict, skipped_iter, grad_norm, num_zeros_in_grad # Set grad to zero. @@ -760,7 +761,7 @@ def train_step(forward_step_func, data_iterator, # Update learning rate. if args.deepspeed: - skipped_iter = 0 + skipped_iter = 0 if update_successful else 1 grad_norm = None num_zeros_in_grad = None