Skip to content

Commit

Permalink
Fix NaN issue when running Megatron-DeepSpeed
Browse files: browse the repository at this point in the history
  • Loading branch information
ys950902 committed Aug 5, 2024
1 parent fc989b8 commit 836a9f3
Showing 1 changed file with 5 additions and 6 deletions.
11 changes: 5 additions & 6 deletions megatron/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,12 +883,11 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
if not skipped_iter:
total_loss_dict[key] = total_loss_dict.get(
key, get_accelerator().FloatTensor([0.0])) + loss_dict[key]
else:
value = loss_dict[key].float().sum().item()
is_nan = value == float('inf') or \
value == -float('inf') or \
value != value
got_nan = got_nan or is_nan
value = loss_dict[key].float().sum().item()
is_nan = value == float('inf') or \
value == -float('inf') or \
value != value
got_nan = got_nan or is_nan
total_loss_dict[nan_iters_key] = total_loss_dict.get(
nan_iters_key, 0) + int(got_nan)

Expand Down

0 comments on commit 836a9f3

Please sign in to comment.