Skip to content

Commit

Permalink
Fix NaN issue when running Megatron-DeepSpeed
Browse files Browse the repository at this point in the history
  • Loading branch information
ys950902 committed Aug 8, 2024
1 parent fc989b8 commit 2985392
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions megatron/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,6 @@ def train_step(forward_step_func, data_iterator,
timers = get_timers()

if args.deepspeed and args.ds_pipeline_enabled:
- skipped_iter = 0
num_zeros_in_grad = 0
assert isinstance(model[0], deepspeed.PipelineEngine)
loss = model[0].train_batch(data_iter=data_iterator)
Expand All @@ -682,6 +681,8 @@ def train_step(forward_step_func, data_iterator,
if additional_losses is not None:
loss_dict.update(additional_losses)
grad_norm = model[0].get_global_grad_norm()
+ update_successful = model[0].was_step_applied()
+ skipped_iter = 0 if update_successful else 1
return loss_dict, skipped_iter, grad_norm, num_zeros_in_grad

# Set grad to zero.
Expand Down Expand Up @@ -760,7 +761,7 @@ def train_step(forward_step_func, data_iterator,

# Update learning rate.
if args.deepspeed:
- skipped_iter = 0
+ skipped_iter = 0 if update_successful else 1
grad_norm = None
num_zeros_in_grad = None

Expand Down

0 comments on commit 2985392

Please sign in to comment.