From f72eb32c399ff9de0f4db99b2b1a4f1ed9ea512e Mon Sep 17 00:00:00 2001 From: isanvicente Date: Tue, 22 Oct 2024 12:12:50 +0000 Subject: [PATCH 01/10] fix issue #131, module 'eole.utils' has no attribute 'distributed' error when training multi-gpu --- eole/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eole/trainer.py b/eole/trainer.py index 8c92ad0f..4f17a48f 100644 --- a/eole/trainer.py +++ b/eole/trainer.py @@ -14,6 +14,7 @@ import torch import traceback import eole.utils +from eole.utils import distributed from eole.utils.loss import LossCompute from eole.utils.logging import logger from eole.utils.misc import clear_gpu_cache, get_autocast From f6b6fdd18cb0fbdab5b34c9475beef62d5a1ad15 Mon Sep 17 00:00:00 2001 From: isanvicente Date: Tue, 22 Oct 2024 21:55:17 +0000 Subject: [PATCH 02/10] fix issue #131, import only functions --- eole/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eole/trainer.py b/eole/trainer.py index 4f17a48f..dc4f146e 100644 --- a/eole/trainer.py +++ b/eole/trainer.py @@ -14,7 +14,7 @@ import torch import traceback import eole.utils -from eole.utils import distributed +from eole.utils.distributed import all_gather_list,all_reduce_and_rescale_tensors from eole.utils.loss import LossCompute from eole.utils.logging import logger from eole.utils.misc import clear_gpu_cache, get_autocast From b733b7353fe53bd80dc2e270be5c869ffeab78dd Mon Sep 17 00:00:00 2001 From: isanvicente Date: Wed, 23 Oct 2024 09:11:58 +0000 Subject: [PATCH 03/10] apply black formatter to eole/trainer.py --- eole/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eole/trainer.py b/eole/trainer.py index dc4f146e..948e41ab 100644 --- a/eole/trainer.py +++ b/eole/trainer.py @@ -14,7 +14,7 @@ import torch import traceback import eole.utils -from eole.utils.distributed import all_gather_list,all_reduce_and_rescale_tensors +from eole.utils.distributed import all_gather_list, all_reduce_and_rescale_tensors from eole.utils.loss import LossCompute from eole.utils.logging import logger from eole.utils.misc import clear_gpu_cache, get_autocast From 7ebf69613d4256d4972bdc4105f1b7fdfedd45c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 24 Oct 2024 09:36:31 +0200 Subject: [PATCH 04/10] make flake happy --- eole/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eole/trainer.py b/eole/trainer.py index 948e41ab..e9b1f672 100644 --- a/eole/trainer.py +++ b/eole/trainer.py @@ -335,7 +335,7 @@ def train( if self.n_gpu > 1 and self.parallel_mode == "data_parallel": normalization = sum( - eole.utils.distributed.all_gather_list(normalization) + all_gather_list(normalization) ) self._gradient_accumulation( @@ -571,7 +571,7 @@ def _gradient_accumulation( for p in self.model.parameters() if p.requires_grad and p.grad is not None ] - eole.utils.distributed.all_reduce_and_rescale_tensors( + all_reduce_and_rescale_tensors( grads, float(self.n_gpu) ) From ab6e151ecd51c853f9e2c6b2b9804084dd79dfda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 24 Oct 2024 09:52:56 +0200 Subject: [PATCH 05/10] make black happy --- eole/trainer.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/eole/trainer.py b/eole/trainer.py index e9b1f672..81236b83 100644 --- a/eole/trainer.py +++ b/eole/trainer.py @@ -334,9 +334,7 @@ def train( self._maybe_update_estim_lambda(step) if self.n_gpu > 1 and self.parallel_mode == "data_parallel": - normalization = sum( - all_gather_list(normalization) - ) + normalization = sum(all_gather_list(normalization)) self._gradient_accumulation( batches, normalization, total_stats, report_stats @@ -571,9 +569,7 @@ def _gradient_accumulation( for p in self.model.parameters() if p.requires_grad and p.grad is not None ] - all_reduce_and_rescale_tensors( - grads, float(self.n_gpu) - ) + all_reduce_and_rescale_tensors(grads, float(self.n_gpu)) self.optim.step() From 63dfd152f1c1ce6aa262f4de73507b1ccb28390d Mon Sep 17 00:00:00 2001 From: isanvicente Date: Mon, 28 Oct 2024 14:40:09 +0000 Subject: [PATCH 06/10] updated eole/bin/model/average_models.py to work with safetensors model format. --- eole/bin/model/average_models.py | 64 +++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/eole/bin/model/average_models.py b/eole/bin/model/average_models.py index 2d45104c..6d070c99 100755 --- a/eole/bin/model/average_models.py +++ b/eole/bin/model/average_models.py @@ -1,41 +1,56 @@ #!/usr/bin/env python import torch from eole.bin import BaseBin, register_bin +from eole.models import model_saver +from eole.config import recursive_model_fields_set +from safetensors.torch import load_file, save_file +import os +import json +import pdb -def average_models(model_files, fp32=False): +def average_models(model_paths, fp32=False): vocab = None config = None avg_model = None avg_generator = None - for i, model_file in enumerate(model_files): - m = torch.load(model_file, map_location="cpu") - model_weights = m["model"] - generator_weights = m["generator"] + for i, model_path in enumerate(model_paths): + # torch pt code + # m = torch.load(model_file, map_location="cpu") + # model_weights = m["model"] + # generator_weights = m["generator"] + # safetensor checkpoint load + m = model_saver.load_checkpoint(model_path) + model_weights = load_file(os.path.join(model_path, "model.00.safetensors")) + # pdb.set_trace() if fp32: for k, v in model_weights.items(): model_weights[k] = v.float() - for k, v in generator_weights.items(): - generator_weights[k] = v.float() + # generator weights are already in model_weights.items() + # for k, v in generator_weights.items(): + # generator_weights[k] = v.float() if i == 0: - vocab, config = m["vocab"], m["config"] + vocab, config, optim = m["vocab"], m["config"], m["optim"] avg_model = model_weights - avg_generator = generator_weights + # generator weights are already in model_weights.items() + # avg_generator = generator_weights else: for k, v in avg_model.items(): avg_model[k].mul_(i).add_(model_weights[k]).div_(i + 1) - for k, v in avg_generator.items(): - avg_generator[k].mul_(i).add_(generator_weights[k]).div_(i + 1) + # for k, v in avg_generator.items(): + # avg_generator[k].mul_(i).add_(generator_weights[k]).div_(i + 1) + + # Deleted from final + # "generator": avg_generator, final = { "vocab": vocab, "config": config, - "optim": None, - "generator": avg_generator, + "optim": optim, "model": avg_model, } return final @@ -56,4 +71,25 @@ def add_args(cls, parser): @classmethod def run(cls, args): final = average_models(args.models, args.fp32) - torch.save(final, args.output) + + if not os.path.isdir(args.output): + os.makedirs(args.output, exist_ok=True) + + # pdb.set_trace() + # this maybe better implemented using model_saver classes + # config + with open(os.path.join(args.output, "config.json"), "w") as f: + json.dump( + recursive_model_fields_set(final["config"]), + f, + indent=2, + ensure_ascii=False, + ) + # vocab + with open(os.path.join(args.output, "vocab.json"), "w") as f: + json.dump(final["vocab"], f, indent=2, ensure_ascii=False) + # optimizer + torch.save(final["optim"], os.path.join(args.output, "optimizer.pt")) + # model weights + save_file(final["model"], os.path.join(args.output, "model.00.safetensors")) + # torch.save(final, args.output) From 17444a769652ba1ffaa52ff6802e8a7db71c5ef5 Mon Sep 17 00:00:00 2001 From: isanvicente Date: Mon, 28 Oct 2024 16:20:54 +0000 Subject: [PATCH 07/10] make flake happy --- eole/bin/model/average_models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/eole/bin/model/average_models.py b/eole/bin/model/average_models.py index 6d070c99..f4f9ded1 100755 --- a/eole/bin/model/average_models.py +++ b/eole/bin/model/average_models.py @@ -6,14 +6,12 @@ from safetensors.torch import load_file, save_file import os import json -import pdb def average_models(model_paths, fp32=False): vocab = None config = None avg_model = None - avg_generator = None for i, model_path in enumerate(model_paths): # torch pt code From df26b408ee4c5b2a6f0dc90d903fc3a04af131b9 Mon Sep 17 00:00:00 2001 From: isanvicente Date: Tue, 29 Oct 2024 12:05:18 +0000 Subject: [PATCH 08/10] delete comments --- eole/bin/model/average_models.py | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/eole/bin/model/average_models.py b/eole/bin/model/average_models.py index f4f9ded1..5592fd96 100755 --- a/eole/bin/model/average_models.py +++ b/eole/bin/model/average_models.py @@ -14,37 +14,20 @@ def average_models(model_paths, fp32=False): avg_model = None for i, model_path in enumerate(model_paths): - # torch pt code - # m = torch.load(model_file, map_location="cpu") - # model_weights = m["model"] - # generator_weights = m["generator"] - # safetensor checkpoint load m = model_saver.load_checkpoint(model_path) model_weights = load_file(os.path.join(model_path, "model.00.safetensors")) - # pdb.set_trace() - + if fp32: for k, v in model_weights.items(): model_weights[k] = v.float() - # generator weights are already in model_weights.items() - # for k, v in generator_weights.items(): - # generator_weights[k] = v.float() - + if i == 0: vocab, config, optim = m["vocab"], m["config"], m["optim"] avg_model = model_weights - # generator weights are already in model_weights.items() - # avg_generator = generator_weights else: for k, v in avg_model.items(): avg_model[k].mul_(i).add_(model_weights[k]).div_(i + 1) - - # for k, v in avg_generator.items(): - # avg_generator[k].mul_(i).add_(generator_weights[k]).div_(i + 1) - - # Deleted from final - # "generator": avg_generator, - + final = { "vocab": vocab, "config": config, @@ -73,9 +56,6 @@ def run(cls, args): if not os.path.isdir(args.output): os.makedirs(args.output, exist_ok=True) - # pdb.set_trace() - # this maybe better implemented using model_saver classes - # config with open(os.path.join(args.output, "config.json"), "w") as f: json.dump( recursive_model_fields_set(final["config"]), @@ -83,11 +63,9 @@ def run(cls, args): indent=2, ensure_ascii=False, ) - # vocab + with open(os.path.join(args.output, "vocab.json"), "w") as f: json.dump(final["vocab"], f, indent=2, ensure_ascii=False) - # optimizer + torch.save(final["optim"], os.path.join(args.output, "optimizer.pt")) - # model weights save_file(final["model"], os.path.join(args.output, "model.00.safetensors")) - # torch.save(final, args.output) From 55ccd8137a6acffa07e5bf20d01f521d350f6b80 Mon Sep 17 00:00:00 2001 From: isanvicente Date: Tue, 29 Oct 2024 12:07:42 +0000 Subject: [PATCH 09/10] delete comments --- eole/bin/model/average_models.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/eole/bin/model/average_models.py b/eole/bin/model/average_models.py index 5592fd96..5b15c35c 100755 --- a/eole/bin/model/average_models.py +++ b/eole/bin/model/average_models.py @@ -56,6 +56,8 @@ def run(cls, args): if not os.path.isdir(args.output): os.makedirs(args.output, exist_ok=True) + # this maybe better implemented using model_saver classes + # config with open(os.path.join(args.output, "config.json"), "w") as f: json.dump( recursive_model_fields_set(final["config"]), @@ -63,9 +65,10 @@ def run(cls, args): indent=2, ensure_ascii=False, ) - + # vocab with open(os.path.join(args.output, "vocab.json"), "w") as f: json.dump(final["vocab"], f, indent=2, ensure_ascii=False) - + # optimizer torch.save(final["optim"], os.path.join(args.output, "optimizer.pt")) + # model weights save_file(final["model"], os.path.join(args.output, "model.00.safetensors")) From 4dab32fc9eb4b50e34f629e9ebb6ba9fb00a2627 Mon Sep 17 00:00:00 2001 From: isanvicente Date: Tue, 29 Oct 2024 13:22:16 +0000 Subject: [PATCH 10/10] delete comments black formatted --- eole/bin/model/average_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eole/bin/model/average_models.py b/eole/bin/model/average_models.py index 5b15c35c..3a4b8b57 100755 --- a/eole/bin/model/average_models.py +++ b/eole/bin/model/average_models.py @@ -16,18 +16,18 @@ def average_models(model_paths, fp32=False): for i, model_path in enumerate(model_paths): m = model_saver.load_checkpoint(model_path) model_weights = load_file(os.path.join(model_path, "model.00.safetensors")) - + if fp32: for k, v in model_weights.items(): model_weights[k] = v.float() - + if i == 0: vocab, config, optim = m["vocab"], m["config"], m["optim"] avg_model = model_weights else: for k, v in avg_model.items(): avg_model[k].mul_(i).add_(model_weights[k]).div_(i + 1) - + final = { "vocab": vocab, "config": config,