Commit d7959ba
fixes #131, module 'eole.utils' has no attribute 'distributed' error when training multi-gpu (#132)
isanvicente authored Oct 25, 2024
1 parent 5369b07 commit d7959ba
Showing 1 changed file with 3 additions and 6 deletions.
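
Background on the error: import eole.utils loads the eole.utils package but does not import its distributed submodule, so the attribute lookup in eole.utils.distributed.all_gather_list(...) raises AttributeError ("module 'eole.utils' has no attribute 'distributed'") unless some other module has already imported the submodule, which is why the failure only surfaced on the multi-GPU code path. A minimal, self-contained sketch of this Python import behavior, using the standard library's xml package as a stand-in:

import importlib

import xml  # importing a package does not import its submodules

try:
    xml.etree  # submodule was never imported, so attribute lookup fails
except AttributeError as err:
    print(err)  # module 'xml' has no attribute 'etree'

# Explicitly importing the submodule binds it onto the parent package:
import xml.etree
print(xml.etree is importlib.import_module("xml.etree"))  # True

The commit fixes this by importing the needed functions directly, which guarantees that eole.utils.distributed is loaded before the trainer calls them.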
eole/trainer.py: 3 additions & 6 deletions
@@ -14,6 +14,7 @@
 import torch
 import traceback
 import eole.utils
+from eole.utils.distributed import all_gather_list, all_reduce_and_rescale_tensors
 from eole.utils.loss import LossCompute
 from eole.utils.logging import logger
 from eole.utils.misc import clear_gpu_cache, get_autocast
@@ -333,9 +334,7 @@ def train(
             self._maybe_update_estim_lambda(step)
 
             if self.n_gpu > 1 and self.parallel_mode == "data_parallel":
-                normalization = sum(
-                    eole.utils.distributed.all_gather_list(normalization)
-                )
+                normalization = sum(all_gather_list(normalization))
 
             self._gradient_accumulation(
                 batches, normalization, total_stats, report_stats
@@ -570,9 +569,7 @@ def _gradient_accumulation(
                     for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None
                 ]
-                eole.utils.distributed.all_reduce_and_rescale_tensors(
-                    grads, float(self.n_gpu)
-                )
+                all_reduce_and_rescale_tensors(grads, float(self.n_gpu))
 
             self.optim.step()
 
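For readers unfamiliar with the two helpers: judging from their use in this diff, all_gather_list collects one Python object from each rank (so every rank can sum the per-rank normalization counts into the same global value), and all_reduce_and_rescale_tensors sums the gradient tensors across ranks and rescales them by the given denominator, here the GPU count. A rough single-process simulation of that arithmetic, with fake_* stand-ins rather than eole's real collectives:

from typing import List

def fake_all_gather_list(per_rank_values: List[int]) -> List[int]:
    # Stand-in: the real helper is called with this rank's local value
    # and returns the values contributed by every rank.
    return list(per_rank_values)

def fake_all_reduce_and_rescale(grads_per_rank: List[List[float]], n_gpu: int) -> List[float]:
    # Stand-in: element-wise sum across ranks, then divide by the GPU count.
    summed = [sum(vals) for vals in zip(*grads_per_rank)]
    return [g / n_gpu for g in summed]

# Two "ranks", each holding a local token count and local gradients:
print(sum(fake_all_gather_list([100, 140])))  # 240, the same on every rank
print(fake_all_reduce_and_rescale([[0.25, 0.5], [0.75, 1.5]], n_gpu=2))  # [0.5, 1.0]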
