From eba20f3be21c4484a96512edd98dbebcb90609b9 Mon Sep 17 00:00:00 2001
From: Anh Uong
Date: Wed, 15 May 2024 15:43:06 -0600
Subject: [PATCH] fix: merging of model for multi-gpu (#158)

* only copy over if adapter found, problem when lora multi-gpu train

Signed-off-by: Anh-Uong

* formatting and helpful comment

Signed-off-by: Anh-Uong

---------

Signed-off-by: Anh-Uong
---
 build/launch_training.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/build/launch_training.py b/build/launch_training.py
index b03388e4f..bf9c1538b 100644
--- a/build/launch_training.py
+++ b/build/launch_training.py
@@ -142,12 +142,16 @@ def main():
                 export_path,
             )
 
-            create_merged_model(
-                checkpoint_models=full_checkpoint_dir,
-                export_path=export_path,
-                base_model=model_args.model_name_or_path,
-                save_tokenizer=True,
-            )
+            # ensure checkpoint dir has correct files, important with multi-gpu tuning
+            if os.path.exists(
+                os.path.join(full_checkpoint_dir, "adapter_config.json")
+            ):
+                create_merged_model(
+                    checkpoint_models=full_checkpoint_dir,
+                    export_path=export_path,
+                    base_model=model_args.model_name_or_path,
+                    save_tokenizer=True,
+                )
         except Exception as e:  # pylint: disable=broad-except
             logging.error(traceback.format_exc())
             write_termination_log(
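
Note on the change, with a minimal standalone sketch of the guard it
introduces: during multi-GPU LoRA tuning the checkpoint directory may lack an
adapter_config.json, so the merge is now skipped unless the adapter files are
present. Only the os.path.exists check on adapter_config.json comes from the
patch itself; the helper name maybe_merge and its merge_fn parameter below are
illustrative, not part of the repo.

    import os

    def maybe_merge(checkpoint_dir: str, merge_fn) -> bool:
        """Run merge_fn only when checkpoint_dir holds a LoRA adapter.

        With multi-GPU tuning a checkpoint dir may be missing
        adapter_config.json; merging such a dir would fail, so skip it.
        """
        if os.path.exists(os.path.join(checkpoint_dir, "adapter_config.json")):
            merge_fn(checkpoint_dir)
            return True
        return False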