Upgrade Transformers to v4.33.3 #586

Merged 6 commits on Oct 5, 2023
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -1,7 +1,7 @@
# Installation

The `adapters` package is designed as an add-on for Hugging Face's Transformers library.
-It currently supports Python 3.7+ and PyTorch 1.3.1+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first.
+It currently supports Python 3.8+ and PyTorch 1.10+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first.

```{eval-rst}
.. important::
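For reference, a minimal sketch of checking the updated minimums at runtime before installing the add-on; the version parsing below is an illustration, not part of this PR:

```python
# Minimal sketch: assert the documented minimums (Python 3.8+, PyTorch 1.10+)
# before installing/importing the adapters add-on.
import sys

import torch
from packaging import version

assert sys.version_info >= (3, 8), "adapters requires Python 3.8+"
assert version.parse(torch.__version__).release >= (1, 10), "adapters requires PyTorch 1.10+"
```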
2 changes: 1 addition & 1 deletion hf_transformers
Submodule hf_transformers updated 1003 files
6 changes: 3 additions & 3 deletions setup.py
@@ -21,7 +21,7 @@
# We try to follow their general layout wherever sensible.

_deps = [
-    "accelerate>=0.20.1",
+    "accelerate>=0.20.3",
    "black==22.3", # after updating to black 2023, also update Python version in pyproject.toml to 3.7
    "datasets!=2.5.0",
    "dill<0.3.5",
@@ -60,8 +60,8 @@
    "sphinx-intl",
    "sphinx-multiversion",
    "timeout-decorator",
-    "torch>=1.7,!=1.12.0",
-    "transformers==4.31.0",
+    "torch>=1.10,!=1.12.0",
+    "transformers==4.33.3",
    "beautifulsoup4",
]

14 changes: 14 additions & 0 deletions src/adapters/head_utils.py
@@ -483,6 +483,20 @@
        },
        "layers": [None, "qa_outputs"],
    },
+    "T5ForSequenceClassification": {
+        "config": {
+            "head_type": "classification",
+            "layers": 2,
+            "activation_function": "tanh",
+        },
+        "layers": [
+            None,
+            "classification_head.dense",
+            None,
+            None,
+            "classification_head.out_proj",
+        ],
+    },
    "DebertaV2ForSequenceClassification": {
        "config": {
            "head_type": "classification",
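The `config` block added above describes a two-layer classification head with tanh activation, mirroring how Transformers lays out the static `T5ForSequenceClassification` head. A rough sketch of the equivalent flexible head on the adapters side (model class, checkpoint, and head name are illustrative, not taken from this diff):

```python
# Illustrative sketch: a flex head matching the static-to-flex mapping above,
# i.e. two layers with tanh activation; the None entries in "layers" have no
# weight-bearing module to map.
from adapters import T5AdapterModel

model = T5AdapterModel.from_pretrained("t5-small")
model.add_classification_head("cls", num_labels=2, layers=2, activation_function="tanh")
```

With this entry in place, checkpoints saved with the static `T5ForSequenceClassification` head are meant to be convertible to such a flexible head when loaded.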
47 changes: 28 additions & 19 deletions src/adapters/heads/base.py
@@ -554,9 +554,9 @@ def tie_weights(self):
                self = getattr(self, self.base_model_prefix)
            self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix)

-    def _resize_token_embeddings(self, new_num_tokens):
+    def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
        old_embeddings = self.get_input_embeddings()
-        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
        self.set_input_embeddings(new_embeddings)

        # if word embeddings are not tied, make sure that lm head is resized as well
@@ -730,6 +730,27 @@ def delete_head(self, head_name: str):
        if self.active_head == head_name:
            self.active_head = None

+    def _get_used_heads(self, head_name: str = None):
+        if head_name:
+            used_heads = [head_name]
+        # together with context, check if we have heads at all to allow for models without heads
+        elif len(self.heads) > 0 and AdapterSetup.get_context_head_setup():
+            used_heads = AdapterSetup.get_context_head_setup()
+            if isinstance(used_heads, str):
+                used_heads = [used_heads]
+        elif self._active_heads:
+            used_heads = self._active_heads
+        else:
+            return []
+
+        head_modules = []
+        for head in used_heads:
+            if head not in self.heads:
+                raise ValueError("Unknown head_name '{}'".format(head))
+            head_modules.append(self.heads[head])
+
+        return head_modules
+
    def forward_head(
        self, all_outputs, head_name=None, cls_output=None, attention_mask=None, return_dict=False, **kwargs
    ):
@@ -750,16 +771,8 @@ def forward_head(
            return_dict (bool): Whether or not to return a ``ModelOutput`` instead of a plain tuple.
            **kwargs: Additional keyword arguments passed to the forward pass of the head.
        """
-        if head_name:
-            used_heads = [head_name]
-        # together with context, check if we have heads at all to allow for models without heads
-        elif len(self.heads) > 0 and AdapterSetup.get_context_head_setup():
-            used_heads = AdapterSetup.get_context_head_setup()
-            if isinstance(used_heads, str):
-                used_heads = [used_heads]
-        elif self._active_heads:
-            used_heads = self._active_heads
-        else:
+        used_head_modules = self._get_used_heads(head_name)
+        if len(used_head_modules) == 0:
            logger.debug("No prediction head is used.")
            return all_outputs

@@ -787,9 +800,6 @@ def _get_head_input(outputs, cls_out, batch):
        if inv_adapter:
            kwargs["invertible_adapter"] = inv_adapter

-        for head in used_heads:
-            if head not in self.heads:
-                raise ValueError("Unknown head_name '{}'".format(head))
        if isinstance(self.active_head, BatchSplit):
            if sum(self.active_head.batch_sizes) != all_outputs[0].size()[0]:
                raise ValueError(
@@ -830,14 +840,13 @@ def _get_head_input(outputs, cls_out, batch):
                else None
            )
            return_output = MultiHeadOutput(head_outputs=head_outputs, loss=combined_loss)
-        elif len(used_heads) > 1:
+        elif len(used_head_modules) > 1:
            head_outputs = []
-            for head in used_heads:
-                head_module = self.heads[head]
+            for head_module in used_head_modules:
                head_outputs.append(head_module(all_outputs, cls_output, attention_mask, return_dict, **kwargs))
            return_output = MultiHeadOutput(head_outputs=head_outputs)
        else:
-            head_module = self.heads[used_heads[0]]
+            head_module = used_head_modules[0]
            return_output = head_module(all_outputs, cls_output, attention_mask, return_dict, **kwargs)

        if isinstance(return_output, ModelOutput):
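With `_get_used_heads()` factored out, an explicit `head_name`, a head set through the `AdapterSetup` context, and the model's active head all resolve through the same lookup before `forward_head()` runs the head modules. A hedged usage sketch of the three paths (model, adapter, and head names are illustrative, not part of this diff):

```python
# Sketch of the three head-resolution paths now centralized in _get_used_heads().
from adapters import AdapterSetup, AutoAdapterModel
from transformers import AutoTokenizer

model = AutoAdapterModel.from_pretrained("bert-base-uncased")
model.add_adapter("my_adapter")
model.add_classification_head("my_head", num_labels=2)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer("Adapters are neat.", return_tensors="pt")

outputs = model(**batch, head="my_head")             # 1) explicit head_name
with AdapterSetup("my_adapter", head_setup="my_head"):
    outputs = model(**batch)                          # 2) AdapterSetup context
model.active_head = "my_head"
outputs = model(**batch)                              # 3) model's active head(s)
```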
5 changes: 4 additions & 1 deletion src/adapters/models/__init__.py
@@ -15,7 +15,8 @@
    CLIPVisionModelAdaptersMixin,
)
from .distilbert.mixin_distilbert import DistilBertModelAdaptersMixin, DistilBertTransformerAdaptersMixin
-from .gptj.mixin_gptj import GPTJMLPAdaptersMixin
+from .gpt2.mixin_gpt2 import GPT2ModelAdapterMixin
+from .gptj.mixin_gptj import GPTJMLPAdaptersMixin, GPTJModelAdapterMixin
from .llama.mixin_llama import LlamaModelAdapterMixin
from .t5.mixin_t5 import T5BlockAdaptersMixin, T5ModelAdaptersMixin, T5ModelAdaptersWithHeadsMixin
from .vit.mixin_vit import ViTIntermediateAdaptersMixin, ViTModelAdaptersMixin
@@ -49,7 +50,9 @@
    "MBartDecoder": BartDecoderAdaptersMixin,
    "MBartDecoderWrapper": BartDecoderWrapperAdaptersMixin,
    "MBartModel": BartModelAdaptersMixin,
+    "GPT2Model": GPT2ModelAdapterMixin,
    "GPTJMLP": GPTJMLPAdaptersMixin,
+    "GPTJModel": GPTJModelAdapterMixin,
    "RobertaLayer": BertLayerAdaptersMixin,
    "RobertaModel": BertModelAdaptersMixin,
    "T5Block": T5BlockAdaptersMixin,
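Registering `GPT2Model` and `GPTJModel` in this mapping is what lets plain Transformers GPT-2/GPT-J instances pick up the adapter mixins when they are wrapped. A rough sketch of the intended usage (the adapter name is illustrative):

```python
# Sketch: adapters.init() applies the mixin mapping above, so a vanilla GPT2Model
# gains adapter methods via GPT2ModelAdapterMixin.
import adapters
from transformers import GPT2Model

model = GPT2Model.from_pretrained("gpt2")
adapters.init(model)
model.add_adapter("bottleneck_adapter")
model.train_adapter("bottleneck_adapter")
```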
4 changes: 2 additions & 2 deletions src/adapters/models/gpt2/mixin_gpt2.py
@@ -5,7 +5,7 @@
from ...layer import AdapterLayer
from ...lora import Linear as LoRALinear
from ...lora import MergedLinear as LoRAMergedLinear
-from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin
+from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin
from ...prefix_tuning import PrefixTuningShim


@@ -54,7 +54,7 @@ def init_adapters(self, model_config, adapters_config):
        self.output_adapters = AdapterLayer("output_adapter")


-class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin):
+class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin):
    def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]:
        for i, layer in enumerate(self.base_model.h):
            yield i, layer
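Since `GPT2ModelAdapterMixin` now builds on `ModelBaseAdaptersMixin`, the adapter machinery locates GPT-2's transformer blocks through `iter_layers()`. A small sketch of what that iteration yields (assumed usage, for illustration only):

```python
# Sketch: after wrapping, iter_layers() walks model.h, one entry per GPT2Block.
import adapters
from transformers import GPT2Model

model = GPT2Model.from_pretrained("gpt2")
adapters.init(model)
for idx, block in model.iter_layers():
    print(idx, type(block).__name__)  # 0..11 and "GPT2Block" for the 12-layer "gpt2"
```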