nemo2 peft merge #11017

Open

wants to merge 25 commits into main
Changes from 20 commits

Commits (25)
d2f9cf7
initial draft
HuiyingLi Oct 7, 2024
c527914
refactor wip
HuiyingLi Oct 9, 2024
65e252d
refac v2 WIP
HuiyingLi Oct 11, 2024
f1e52ed
update address comments and add model dump
HuiyingLi Oct 17, 2024
acd56b0
remove merge script
HuiyingLi Oct 17, 2024
03c4ea0
move driver script
HuiyingLi Oct 17, 2024
5d86702
Merge branch 'main' into huiyingl/nemo2_peftmerge
HuiyingLi Oct 17, 2024
1740d5d
format
HuiyingLi Oct 17, 2024
c8b3a40
format
HuiyingLi Oct 22, 2024
31e0c0a
Merge branch 'main' into huiyingl/nemo2_peftmerge
HuiyingLi Oct 22, 2024
169656d
Apply isort and black reformatting
HuiyingLi Oct 24, 2024
3071ed8
Merge branch 'main' into huiyingl/peftmerge
HuiyingLi Oct 24, 2024
2615f89
update with nemo2 main
HuiyingLi Oct 24, 2024
50b45ce
Merge branch 'huiyingl/peftmerge' of github.com:NVIDIA/NeMo into huiy…
HuiyingLi Oct 24, 2024
933fadf
Apply isort and black reformatting
HuiyingLi Oct 24, 2024
82a1c25
cleanup import
HuiyingLi Oct 24, 2024
45bffef
Merge branch 'main' into huiyingl/peftmerge
HuiyingLi Nov 11, 2024
45b5f88
merge api v3
HuiyingLi Nov 12, 2024
dd38c32
cleanup
HuiyingLi Nov 12, 2024
a135a73
refac merge func to transform(by ChenCui)
HuiyingLi Nov 14, 2024
deeb74d
read base model from io instead of user input and bug fix
HuiyingLi Nov 16, 2024
db7497f
Apply isort and black reformatting
HuiyingLi Nov 16, 2024
3cc2965
add docstring
HuiyingLi Nov 16, 2024
9992509
Merge branch 'main' into huiyingl/peftmerge
HuiyingLi Nov 16, 2024
75ea08e
Merge branch 'main' into huiyingl/peftmerge
HuiyingLi Nov 17, 2024
4 changes: 2 additions & 2 deletions nemo/collections/llm/peft/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from nemo.collections.llm.peft.api import gpt_lora
+from nemo.collections.llm.peft.api import gpt_lora, merge_lora
 from nemo.collections.llm.peft.lora import LoRA
 
-__all__ = ["LoRA", "gpt_lora"]
+__all__ = ["LoRA", "gpt_lora", "merge_lora"]
114 changes: 112 additions & 2 deletions nemo/collections/llm/peft/api.py
@@ -12,14 +12,124 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from nemo.collections.llm.peft.lora import LoRA
+import json
+from pathlib import Path
+from typing import Any, Dict, Union
+
+import pytorch_lightning as pl
+from megatron.core import dist_checkpointing
+from pytorch_lightning.trainer.states import TrainerFn
+
+from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+from nemo.collections.llm.peft.lora import LoRA, LoRAMerge
 from nemo.collections.llm.utils import factory
+from nemo.lightning import MegatronStrategy, Trainer, _strategy_lib, io
+from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME, ckpt_to_context_subdir
+from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir
+from nemo.lightning.pytorch.callbacks import PEFT
 from nemo.lightning.pytorch.callbacks.peft import PEFT
+from nemo.lightning.pytorch.strategies.utils import RestoreConfig
+from nemo.utils import logging
 
 
 @factory
 def gpt_lora() -> PEFT:
     return LoRA()
 
 
-__all__ = ["gpt_lora"]
+def merge_lora(
+    model: pl.LightningModule,
+    lora_checkpoint_path: str,
+    output_path: str,
+):
+    """
+    Merges the LoRA adapter weights into the base model's weights.
+
+    Python Usage:
+    ```python
+    def llama3_8b() -> pl.LightningModule:
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
+        return llm.LlamaModel(llm.Llama3Config8B(), tokenizer=tokenizer)
+
+
+    if __name__ == '__main__':
+        llm.peft.merge_lora(
+            model=llama3_8b(),
+            lora_checkpoint_path=your_lora_checkpoint_path,
+            output_path=your_output_path,
+        )
+    ```
+
+    Args:
+        model: The base model instance to merge the LoRA adapter weights into.
+        lora_checkpoint_path: The path to the LoRA checkpoint.
+        output_path: The path to save the merged checkpoint.
+
+    """
+    from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+
+    trainer = Trainer(
+        devices=1,
+        accelerator="cpu",
+        strategy=MegatronStrategy(ddp="pytorch", setup_optimizers=False, plugins=bf16_mixed()),
+    )
+
+    if (
+        adapter_meta_path := ckpt_to_weights_subdir(lora_checkpoint_path, is_saving=False) / ADAPTER_META_FILENAME
+    ).exists():
+        with open(adapter_meta_path, "r") as f:
+            metadata = json.load(f)
+        restore_config = RestoreConfig(
+            path=metadata["model_ckpt_path"],
+            load_model_state=True,
+            load_optim_state=False,
+        )
+    else:
+        raise ValueError(f"Cannot find adapter meta file in {lora_checkpoint_path}")
+
+    trainer.strategy.restore_config = restore_config
+    trainer.strategy._setup_optimizers = False
+    trainer.ckpt_path = None
+    trainer.strategy.connect(model)
+    trainer.strategy.setup_environment()
+
+    if not model.state_dict():
+        with _strategy_lib.megatron_cpu_init_context(model.config):
+            model.configure_model()
+
+    trainer.strategy.setup(trainer)
+    trainer.state.fn = TrainerFn.TESTING
+    trainer.strategy.setup_megatron_parallel(trainer=trainer)
+    trainer.strategy.trainer = trainer
+
+    lora: Union[io.TrainerContext, LoRA] = io.load_context(
+        ckpt_to_context_subdir(lora_checkpoint_path), "model.model_transform"
+    )
+    assert isinstance(lora, LoRA), "LoRA config not found in checkpoint"
+    model = lora(model)
+    adapter_sharded_state_dict = {
+        k: v for k, v in trainer.strategy.megatron_parallel.sharded_state_dict().items() if ".adapter." in k
+    }
+    adapter_state = trainer.strategy.checkpoint_io.load_checkpoint(
+        ckpt_to_weights_subdir(lora_checkpoint_path, is_saving=False), sharded_state_dict=adapter_sharded_state_dict
+    )
+    trainer.strategy.load_model_state_dict(adapter_state, strict=False)
+
+    lora_merge = LoRAMerge()
+    merged_model = lora_merge(trainer.strategy.megatron_parallel)
+    merged_weights = {k: v for k, v in merged_model.sharded_state_dict().items() if ".adapter." not in k}
+    weight_path = ckpt_to_weights_subdir(output_path, is_saving=True)
+    Path(weight_path).mkdir(parents=True, exist_ok=True)
+    dist_checkpointing.save(merged_weights, str(ckpt_to_weights_subdir(output_path, is_saving=True)))
+    if hasattr(model.tokenizer, "save_pretrained"):
+        model.tokenizer.save_pretrained("/tmp/nemo_tokenizer")
+        model.tokenizer = AutoTokenizer("/tmp/nemo_tokenizer")
+    if hasattr(trainer.model, "__io__") and hasattr(trainer.model.tokenizer, '__io__'):
+        trainer.model.__io__.tokenizer = trainer.model.tokenizer.__io__
+    TrainerContext.from_trainer(trainer).io_dump(ckpt_to_context_subdir(output_path), yaml_attrs=["model"])
+    logging.info(f"Merged checkpoint saved to {output_path}")

Review comment (Collaborator):
This function is a little long; could you break it into multiple sub-functions for readability?
Maybe: 1) set up the trainer, 2) load the model, 3) merge (this part is already compartmentalized, so no need to change it), and 4) save the weights.


__all__ = ["gpt_lora", "merge_lora"]
13 changes: 13 additions & 0 deletions nemo/collections/llm/peft/lora.py
@@ -244,3 +244,16 @@ def wildcard_match(pattern, key):
             )
             return AdapterParallelAdd(m, adapter)
         return m
+
+
+class LoRAMerge(PEFT):
+    @torch.no_grad()
+    def transform(self, m: nn.Module, name=None, prefix=None):
+        print(f"merging module", (prefix if prefix else "") + "." + (name if name else ""))
+        if not isinstance(m, AdapterParallelAdd):
+            return m
+        base_weight = m.to_wrap.weight
+        lora_weight = m.adapter.linear_out.weight.to(base_weight) @ m.adapter.linear_in.weight.to(base_weight.device)
+        merged_weight = base_weight + lora_weight
+        m.to_wrap.weight.data = merged_weight
+        return m
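To make the arithmetic in LoRAMerge.transform concrete, the standalone sketch below performs the same merge on plain torch tensors. The shapes and variable names are illustrative only; the PR's version operates on Megatron parallel linear modules and their adapter.linear_in / adapter.linear_out weights.

```python
import torch

# Toy shapes for illustration only.
d_out, d_in, rank = 8, 16, 4
base_weight = torch.randn(d_out, d_in)   # frozen base weight W of the wrapped linear layer
lora_in = torch.randn(rank, d_in)        # adapter.linear_in weight (projects d_in -> rank)
lora_out = torch.randn(d_out, rank)      # adapter.linear_out weight (projects rank -> d_out)

# Same update as transform(): fold the low-rank product into the base weight.
merged_weight = base_weight + lora_out @ lora_in

# After merging, a single matmul reproduces base path + adapter path,
# so the adapter branch can be dropped at inference time.
x = torch.randn(d_in)
expected = base_weight @ x + lora_out @ (lora_in @ x)
assert torch.allclose(merged_weight @ x, expected, atol=1e-4)
```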