Implements Vera
#763
base: main
Changes from 10 commits
@@ -478,11 +478,20 @@ class LoRAConfig(AdapterConfig):
            (addition of decomposed matrix, as in LoRA) or "scale" (element-wise multiplication of vector, as in
            (IA)^3). "scale" can only be used together with r=1. Defaults to "add".
        init_weights (:obj:`str`, optional): Initialization method for the weights of the LoRA modules.
-           Currently, this can be either "lora" (default) or "bert".
+           Currently, this can be "lora" (default), "bert", or "vera".
        use_gating (:obj:`bool`, optional):
            Place a trainable gating module besides the added parameter module to control module activation. This is
            e.g. used for UniPELT. Defaults to False. Note that modules with use_gating=True cannot be merged using
            `merge_adapter()`.
+       d (:obj:`bool` or :obj:`float`, optional):
+           The value of d used in the VeraConfig. Defaults to None. Places a trainable
+           scaling parameter `d` before the decomposition matrix A to allow scaling of the
+           internal weights.
+       b (:obj:`bool` or :obj:`float`, optional):
+           The value of b used in the VeraConfig. Defaults to None. Places a trainable
+           scaling parameter `b` before the decomposition matrix B to allow scaling of the
+           internal weights.
    """

    architecture: Optional[str] = "lora"
@@ -499,6 +508,8 @@ class LoRAConfig(AdapterConfig):
    composition_mode: str = "add"
    init_weights: str = "lora"
    use_gating: bool = False
+   d: Union[bool, float] = None
+   b: Union[bool, float] = None


@dataclass(eq=False)
@@ -523,6 +534,27 @@ class IA3Config(LoRAConfig):
    use_gating: bool = False


+@dataclass(eq=False)
+class VeraConfig(LoRAConfig):
+    """
+    LoRA config that applies vector-based random matrix adaptation (VeRA). It adds
+    trainable scaling vectors 'd' and 'b' while keeping the original LoRA matrices
+    frozen, random, and shared across layers. For details, see the paper:
+    https://arxiv.org/pdf/2106.09685. Note that `r` still needs to be supplied, since
+    the decomposition matrices A and B are still initialized.
+    The `composition_mode` parameter should also be set to `add`.
+    """

Review comment: the paper link still needs updating :)

+
+    selfattn_lora: bool = True
+    intermediate_lora: bool = False
+    output_lora: bool = False
+
+    r: int = 8
+    d: Union[bool, float] = 0.1
+    b: Union[bool, float] = 0.0
+    init_weights: str = "vera"


@dataclass(eq=False)
class ReftConfig(AdapterConfig):
    """
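For orientation, here is a minimal usage sketch of the new config. It assumes the usual adapters-library workflow (`adapters.init`, `add_adapter`, `train_adapter`) and that `VeraConfig` is exported at package level once this PR lands; the model checkpoint and adapter name are placeholders.

```python
# Sketch only: adding a VeRA adapter with the VeraConfig introduced in this PR.
import adapters
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
adapters.init(model)  # enable adapter support on the HF model

# assumes VeraConfig is exported from the package once merged; defaults shown explicitly
vera_config = adapters.VeraConfig(r=8, d=0.1, b=0.0)

model.add_adapter("my_vera", config=vera_config)
model.train_adapter("my_vera")  # freeze base weights, train only the VeRA parameters
```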
@@ -16,6 +16,7 @@

from ..composition import Average, BatchSplit, Parallel, Stack
from ..configuration import LoRAConfig, ModelAdaptersConfig
from ..context import ForwardContext
from .adapter_layer_base import AdapterLayerBase, ComposableAdapterLayerBase
from .utils import dequantize_bnb_weight

@@ -68,6 +69,9 @@ def __init__(
        elif config.init_weights == "ia3":
            nn.init.ones_(self.lora_A)
            nn.init.ones_(self.lora_B)
+       elif config.init_weights == "vera":
+           nn.init.kaiming_uniform_(self.lora_A)
+           nn.init.kaiming_uniform_(self.lora_B)
        else:
            raise ValueError("Unknown init_weights type: {}".format(config.init_weights))

@@ -90,6 +94,7 @@ def com_inv(self, weights: torch.Tensor, added: torch.Tensor) -> torch.Tensor:
        return weights - added * self.scaling

    def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tensor):
+       print("triggered")

Review comment: to remove?

        if hidden_states is None:
            hidden_states = layer_input
        hidden_states = self.lora_dropout(hidden_states) @ torch.t(self.lora_A) @ torch.t(self.lora_B)
@@ -131,7 +136,7 @@ def __init__(
        # For compatibility with LoRA, allow all init_weights types here.
        # Usually should be "ia3".
        if config.init_weights == "lora":
-           logger.warning("(IA)^3 module initialized with LoRA zeo init. Ignore if this is intended.")
+           logger.warning("(IA)^3 module initialized with LoRA zero init. Ignore if this is intended.")
            nn.init.zeros_(self.lora_B)
        elif config.init_weights == "bert":
            nn.init.normal_(self.lora_B, std=0.02)
@@ -174,6 +179,111 @@ def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tensor):
        return hidden_states, gate


+class Vera(nn.Module):
+    def __init__(
+        self,
+        lora_A_shape,
+        lora_B_shape,
+        config: LoRAConfig,
+        gating_heads: int = 1,
+    ):
+        super().__init__()

Review comment: we should also add an assert for composition mode "add" here (same as in the LoRA init), just to make sure.

+        self.d = config.d
+        self.b = config.b
+        self.r = config.r
+        self.alpha = config.alpha
+        self.use_gating = config.use_gating
+
+        # Optional dropout
+        if config.dropout > 0.0:
+            self.lora_dropout = nn.Dropout(p=config.dropout)
+
+        self.lora_A_shape = lora_A_shape
+        self.lora_B_shape = lora_B_shape
+        self.d_shape = self.lora_A_shape[0]
+        self.b_shape = self.lora_B_shape[0]
+
+        # Actual trainable parameters
+        self.vera_D = nn.Parameter(torch.diag(torch.ones(self.d_shape) * self.d))
+        self.vera_B = nn.Parameter(torch.diag(torch.ones(self.b_shape) * self.b))
+        self.scaling = self.alpha / self.r
+
+        if self.use_gating:
+            self.gate = nn.Linear(lora_A_shape[-1], gating_heads)
+            nn.init.normal_(self.gate.weight, std=0.02)
+
+    @property
+    def delta_w(self) -> torch.Tensor:
+        parameters = ForwardContext.get_context().shared_parameters[self.name]
+        lora_A = parameters["lora_A"]
+        lora_B = parameters["lora_B"]
+        return self.vera_B @ lora_B @ self.vera_D @ lora_A
+
+    def com(self, weights: torch.Tensor, added: torch.Tensor, scaling=None) -> torch.Tensor:
+        """Performs the composition operation between existing and injected weights."""
+        if scaling is None:
+            scaling = self.scaling
+        return weights + added * scaling
+
+    def com_inv(self, weights: torch.Tensor, added: torch.Tensor) -> torch.Tensor:
+        """Inverts the composition operation between existing and injected weights."""
+        return weights - added * self.scaling
+
+    def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tensor):
+        parameters = ForwardContext.get_context().shared_parameters[self.name]
+        lora_A = parameters["lora_A"]
+        lora_B = parameters["lora_B"]
+
+        if hidden_states is None:
+            hidden_states = layer_input
+
+        if getattr(self, "lora_dropout", None) is not None:
+            hidden_states = self.lora_dropout(hidden_states)
+
+        hidden_states = hidden_states @ self.vera_B @ lora_B @ self.vera_D @ lora_A

Review comment: shouldn't the order be reversed here? I.e. we matmul hidden states with lora_A -> vera_D -> lora_B -> vera_B, according to §3.1 (2) of the paper?

+        if self.use_gating:
+            gate = torch.sigmoid(self.gate(layer_input))
+            gate = torch.mean(gate, dim=1).unsqueeze(-1)
+            hidden_states = hidden_states * gate
+        else:
+            gate = None

Review comment: as this is likely merged after #770, the same fix from there should be applied here.

+        return hidden_states, gate
+
+    def set_vera_adapter_name(self, name):
+        self.name = name
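To make the ordering question above concrete, here is a small standalone sketch (dummy tensors, hypothetical sizes) contrasting the multiplication order currently used in `forward` with the order from §3.1 of the VeRA paper (h = Λ_b B Λ_d A x), written for row-vector inputs as in the existing LoRA forward:

```python
# Sketch only: comparing the two orderings on dummy tensors (sizes are hypothetical).
import torch

batch, seq, hidden, r = 2, 4, 16, 8
x = torch.randn(batch, seq, hidden)

lora_A = torch.randn(r, hidden)                   # frozen, shared (r x hidden)
lora_B = torch.randn(hidden, r)                   # frozen, shared (hidden x r)
vera_D = torch.diag(torch.full((r,), 0.1))        # trainable scaling applied to A's output
vera_B = torch.diag(torch.full((hidden,), 0.5))   # trainable scaling applied to B's output

# Order as currently written in Vera.forward():
out_current = x @ vera_B @ lora_B @ vera_D @ lora_A

# Order from the paper (h = Lambda_b B Lambda_d A x), transposed for row vectors;
# the diagonal matrices are symmetric, so they need no transpose:
out_paper = x @ lora_A.T @ vera_D @ lora_B.T @ vera_B

print(out_current.shape, out_paper.shape)  # both torch.Size([2, 4, 16]); values differ in general
```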
+def init_shared_Vera_parameters(model_config, adapter_config, device):

Review comment: nit: ideally lower-case "v" in the middle of method names.

+    hidden_size = model_config.hidden_size
+    r = adapter_config["r"]
+
+    parameters = nn.ParameterDict()
+
+    # initialize frozen, random tensors A, B
+    parameters["lora_A"] = torch.zeros(r, hidden_size).to(device)
+    parameters["lora_B"] = torch.zeros(hidden_size, r).to(device)
+
+    if adapter_config["init_weights"] == "lora":
+        # initialize A the same way as the default for nn.Linear and B to zero
+        nn.init.kaiming_uniform_(parameters["lora_A"], a=math.sqrt(5))
+        nn.init.zeros_(parameters["lora_B"])
+    elif adapter_config["init_weights"] == "bert":
+        nn.init.normal_(parameters["lora_A"], std=0.02)
+        nn.init.normal_(parameters["lora_B"], std=0.02)
+    elif adapter_config["init_weights"] == "ia3":
+        nn.init.ones_(parameters["lora_A"])
+        nn.init.ones_(parameters["lora_B"])
+    elif adapter_config["init_weights"] == "vera":
+        nn.init.kaiming_uniform_(parameters["lora_A"])
+        nn.init.kaiming_uniform_(parameters["lora_B"])
+    else:
+        raise ValueError("Unknown init_weights type: {}".format(adapter_config["init_weights"]))
+
+    return parameters
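For intuition, a standalone sketch of what the "vera" branch above produces for hypothetical sizes (hidden_size=768, r=8); the shared projections are plain tensors rather than nn.Parameter, so they carry no gradients by default:

```python
# Sketch only: shapes and frozen-ness of the shared VeRA projections (hypothetical sizes).
import torch
import torch.nn as nn

hidden_size, r = 768, 8  # hypothetical model width and rank

lora_A = torch.zeros(r, hidden_size)   # shared across layers
lora_B = torch.zeros(hidden_size, r)
nn.init.kaiming_uniform_(lora_A)       # "vera" init branch
nn.init.kaiming_uniform_(lora_B)

print(lora_A.shape, lora_B.shape)                   # torch.Size([8, 768]) torch.Size([768, 8])
print(lora_A.requires_grad, lora_B.requires_grad)   # False False -> frozen by default
```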
class LoRALayer(AdapterLayerBase):
    adapter_modules_name = "loras"

@@ -199,6 +309,7 @@ def _get_lora_shapes(self, config: LoRAConfig):

    def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
        self.layer_idx = layer_idx

        lora_config = self.adapters_config.match(
            adapter_name,
            config_type=LoRAConfig,

@@ -207,7 +318,10 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
        )
        if lora_config is not None and self._check_lora_location(lora_config):
            if lora_config.composition_mode == "add":
-               lora_cls = LoRA
+               if isinstance(lora_config.d, float) or isinstance(lora_config.b, float):
+                   lora_cls = Vera
+               else:
+                   lora_cls = LoRA
            elif lora_config.composition_mode == "scale":
                lora_cls = IA3
            else:

@@ -217,6 +331,10 @@ def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
                lora_config,
                gating_heads=self.get_n_heads(lora_config),
            )
+           # if we're using Vera, then set the adapter name on the Vera object
+           if lora_cls == Vera:
+               lora.set_vera_adapter_name(name=adapter_name)

Review comment: feels a bit hacky to do this only for Vera, as the name is not specific to this type. What do you think of always passing the name directly to the […]

Author reply: I've thought about that idea as well but opted for now to implement this idea first, since right now […]

            lora.train(self.training)
            lora = lora.to(self.weight.device)
            self.loras[adapter_name] = lora
Review comment: could we name these "vera_b" and "vera_d", to make it more obvious what they are related to?

Review comment: also, why can these be bools? I.e. what happens when I set d=True, b=True?

Author reply: Good catch, I think this is a typo based on a previous idea I had which I scrapped later. Thanks.
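A small aside on the bool question above: Python's bool is a subclass of int, not float, so boolean values for d/b would not satisfy the isinstance(..., float) check in add_adapter, and the plain LoRA class would be selected. A quick illustration:

```python
# Illustration of the dispatch condition used in add_adapter():
d, b = True, True
print(isinstance(d, float) or isinstance(b, float))  # False -> lora_cls = LoRA

d, b = 0.1, 0.0
print(isinstance(d, float) or isinstance(b, float))  # True  -> lora_cls = Vera
```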