From eba20f3be21c4484a96512edd98dbebcb90609b9 Mon Sep 17 00:00:00 2001
From: Anh Uong
Date: Wed, 15 May 2024 15:43:06 -0600
Subject: [PATCH] fix: merging of model for multi-gpu (#158)

* only copy over if adapter found, problem when lora multi-gpu train

Signed-off-by: Anh-Uong

* formatting and helpful comment

Signed-off-by: Anh-Uong

---------

Signed-off-by: Anh-Uong
---
 build/launch_training.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/build/launch_training.py b/build/launch_training.py
index b03388e4f..bf9c1538b 100644
--- a/build/launch_training.py
+++ b/build/launch_training.py
@@ -142,12 +142,16 @@ def main():
                 export_path,
             )
 
-            create_merged_model(
-                checkpoint_models=full_checkpoint_dir,
-                export_path=export_path,
-                base_model=model_args.model_name_or_path,
-                save_tokenizer=True,
-            )
+            # ensure checkpoint dir has correct files, important with multi-gpu tuning
+            if os.path.exists(
+                os.path.join(full_checkpoint_dir, "adapter_config.json")
+            ):
+                create_merged_model(
+                    checkpoint_models=full_checkpoint_dir,
+                    export_path=export_path,
+                    base_model=model_args.model_name_or_path,
+                    save_tokenizer=True,
+                )
         except Exception as e:  # pylint: disable=broad-except
             logging.error(traceback.format_exc())
             write_termination_log(
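
Note on the change, with a minimal standalone sketch of the guard it
introduces: during multi-GPU LoRA tuning the checkpoint directory may lack an
adapter_config.json, so the merge is now skipped unless the adapter files are
present. Only the os.path.exists check on adapter_config.json comes from the
patch itself; the helper name maybe_merge and its merge_fn parameter below are
illustrative, not part of the repo.

    import os

    def maybe_merge(checkpoint_dir: str, merge_fn) -> bool:
        """Run merge_fn only when checkpoint_dir holds a LoRA adapter.

        With multi-GPU tuning a checkpoint dir may be missing
        adapter_config.json; merging such a dir would fail, so skip it.
        """
        if os.path.exists(os.path.join(checkpoint_dir, "adapter_config.json")):
            merge_fn(checkpoint_dir)
            return True
        return False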