grad_weight can't be None when running with DeepSpeed, because ZeRO-3 will divide the gradient
microsoft · Jul 22, 2024 · 59b511a · 59b511a
1 parent fc989b8
commit 59b511a
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py
@@ -368,7 +368,7 @@ def backward(ctx, grad_output):
         #     grad_weight = grad_output.t().matmul(total_input)
         from megatron.core.tensor_parallel.weight_grad_store import WeightGradStore
         WeightGradStore.put(total_input, grad_output, weight, gradientUpdateFunction)
-        grad_weight = None
+        grad_weight = weight.grad
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
         if ctx.sequence_parallel: