From 0645d4afe1d6c064f6e0bbf2d0fbaa84b3f5874b Mon Sep 17 00:00:00 2001 From: calpt Date: Sat, 9 Sep 2023 10:44:47 +0200 Subject: [PATCH 1/6] Upgrade Transformers to v4.33.1 --- docs/installation.md | 2 +- hf_transformers | 2 +- setup.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 917992173d..c3b8468eb8 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,7 +1,7 @@ # Installation The `adapters` package is designed as an add-on for Hugging Face's Transformers library. -It currently supports Python 3.7+ and PyTorch 1.3.1+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first. +It currently supports Python 3.8+ and PyTorch 1.10+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first. ```{eval-rst} .. important:: diff --git a/hf_transformers b/hf_transformers index e42587f596..fa6107c97e 160000 --- a/hf_transformers +++ b/hf_transformers @@ -1 +1 @@ -Subproject commit e42587f596181396e1c4b63660abf0c736b10dae +Subproject commit fa6107c97edf7cf725305a34735a57875b67d85e diff --git a/setup.py b/setup.py index fd5490935a..86e2063880 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ # We try to follow their general layout wherever sensible. _deps = [ - "accelerate>=0.20.1", + "accelerate>=0.20.3", "black==22.3", # after updating to black 2023, also update Python version in pyproject.toml to 3.7 "datasets!=2.5.0", "dill<0.3.5", @@ -60,8 +60,8 @@ "sphinx-intl", "sphinx-multiversion", "timeout-decorator", - "torch>=1.7,!=1.12.0", - "transformers==4.31.0", + "torch>=1.10,!=1.12.0", + "transformers==4.33.1", "beautifulsoup4", ] From 2fe51eebf2c1d4112254ae09a788e3b97e42b218 Mon Sep 17 00:00:00 2001 From: calpt Date: Sat, 9 Sep 2023 10:55:08 +0200 Subject: [PATCH 2/6] Remove copied GPT-2/ GPT-J model classes --- src/adapters/models/__init__.py | 5 +- src/adapters/models/gpt2/mixin_gpt2.py | 4 +- src/adapters/models/gpt2/modeling_gpt2.py | 214 +--------------------- src/adapters/models/gptj/modeling_gptj.py | 195 +------------------- 4 files changed, 10 insertions(+), 408 deletions(-) diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py index b0e59abf14..11da5d325e 100644 --- a/src/adapters/models/__init__.py +++ b/src/adapters/models/__init__.py @@ -15,7 +15,8 @@ CLIPVisionModelAdaptersMixin, ) from .distilbert.mixin_distilbert import DistilBertModelAdaptersMixin, DistilBertTransformerAdaptersMixin -from .gptj.mixin_gptj import GPTJMLPAdaptersMixin +from .gpt2.mixin_gpt2 import GPT2ModelAdapterMixin +from .gptj.mixin_gptj import GPTJMLPAdaptersMixin, GPTJModelAdapterMixin from .llama.mixin_llama import LlamaModelAdapterMixin from .t5.mixin_t5 import T5BlockAdaptersMixin, T5ModelAdaptersMixin, T5ModelAdaptersWithHeadsMixin from .vit.mixin_vit import ViTIntermediateAdaptersMixin, ViTModelAdaptersMixin @@ -49,7 +50,9 @@ "MBartDecoder": BartDecoderAdaptersMixin, "MBartDecoderWrapper": BartDecoderWrapperAdaptersMixin, "MBartModel": BartModelAdaptersMixin, + "GPT2Model": GPT2ModelAdapterMixin, "GPTJMLP": GPTJMLPAdaptersMixin, + "GPTJModel": GPTJModelAdapterMixin, "RobertaLayer": BertLayerAdaptersMixin, "RobertaModel": BertModelAdaptersMixin, "T5Block": T5BlockAdaptersMixin, diff --git a/src/adapters/models/gpt2/mixin_gpt2.py b/src/adapters/models/gpt2/mixin_gpt2.py index 19acec262a..b3cbf12219 100644 --- a/src/adapters/models/gpt2/mixin_gpt2.py +++ b/src/adapters/models/gpt2/mixin_gpt2.py @@ -5,7 +5,7 @@ from ...layer import 
AdapterLayer from ...lora import Linear as LoRALinear from ...lora import MergedLinear as LoRAMergedLinear -from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin +from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin from ...prefix_tuning import PrefixTuningShim @@ -54,7 +54,7 @@ def init_adapters(self, model_config, adapters_config): self.output_adapters = AdapterLayer("output_adapter") -class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin): +class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin): def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: for i, layer in enumerate(self.base_model.h): yield i, layer diff --git a/src/adapters/models/gpt2/modeling_gpt2.py b/src/adapters/models/gpt2/modeling_gpt2.py index a1397153be..1c571c23fe 100644 --- a/src/adapters/models/gpt2/modeling_gpt2.py +++ b/src/adapters/models/gpt2/modeling_gpt2.py @@ -20,16 +20,10 @@ import torch import torch.utils.checkpoint -from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions -from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2Model -from transformers.utils import logging +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_ -from ...context import ForwardContext -from .mixin_gpt2 import GPT2AttentionAdaptersMixin, GPT2DecoderBlockAdaptersMixin, GPT2ModelAdapterMixin - - -logger = logging.get_logger(__name__) +from .mixin_gpt2 import GPT2AttentionAdaptersMixin, GPT2DecoderBlockAdaptersMixin class GPT2AttentionWithAdapters(GPT2AttentionAdaptersMixin, GPT2Attention): @@ -151,207 +145,3 @@ def forward( outputs = (hidden_states,) + outputs[1:] return outputs # hidden_states, present, (attentions, cross_attentions) - - -class GPT2ModelWithAdapters(GPT2ModelAdapterMixin, GPT2Model): - def __init__(self, config): - super().__init__(config) - - self.init_adapters(config, None) - - @ForwardContext.wrap - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) 
- batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - # GPT2Attention mask. - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
- attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.add_cross_attention and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # head_mask has shape n_layer x batch x n_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure layer_past is on same device as hidden_states (might not be correct) - if layer_past is not None: - layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if isinstance(head_mask, torch.Tensor): - head_mask = head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - # also adjust output shape if necessary - if getattr(ForwardContext.get_context(), "adapters_parallelized", False): - output_shape = hidden_states.size() - - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] - if v is not None - ) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) diff --git a/src/adapters/models/gptj/modeling_gptj.py b/src/adapters/models/gptj/modeling_gptj.py index b9f9c0fa00..453f0c9b6d 100644 --- a/src/adapters/models/gptj/modeling_gptj.py +++ b/src/adapters/models/gptj/modeling_gptj.py @@ -19,19 +19,11 @@ import torch import torch.utils.checkpoint -from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.models.gptj.modeling_gptj import ( - GPTJAttention, - GPTJBlock, - GPTJModel, - apply_rotary_pos_emb, - get_embed_positions, -) +from transformers.models.gptj.modeling_gptj import GPTJAttention, GPTJBlock, apply_rotary_pos_emb, get_embed_positions from transformers.utils.import_utils import is_torch_fx_proxy from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_ -from ...context import ForwardContext -from .mixin_gptj import GPTJAttentionAdaptersMixin, GPTJDecoderBlockAdaptersMixin, GPTJModelAdapterMixin +from .mixin_gptj import GPTJAttentionAdaptersMixin, GPTJDecoderBlockAdaptersMixin class GPTJAttentionWithAdapters(GPTJAttentionAdaptersMixin, GPTJAttention): @@ -152,186 +144,3 @@ def forward( outputs = (hidden_states,) + outputs[1:] return outputs # hidden_states, present, (attentions) - - -class GPTJModelWithAdapters(GPTJModelAdapterMixin, GPTJModel): - @ForwardContext.wrap - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: 
Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]).long() - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - # Attention mask. - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
- attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x num_attention_heads x N x N - # head_mask has shape n_layer x batch x num_attention_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - hidden_states = inputs_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - # logger.warning_once( - # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - # ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure layer_past is on same device as hidden_states (might not be correct) - if layer_past is not None: - layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if isinstance(head_mask, torch.Tensor): - head_mask = head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - if use_cache: - # logger.warning( - # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- # ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - None, - attention_mask, - head_mask[i], - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - # also adjust output shape if necessary - if getattr(ForwardContext.get_context(), "adapters_parallelized", False): - output_shape = hidden_states.size() - - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) From 0a3b96b1fdfd9db3455dd04ec716e333c21dffc7 Mon Sep 17 00:00:00 2001 From: calpt Date: Sat, 9 Sep 2023 12:07:24 +0200 Subject: [PATCH 3/6] Transformers upgrade fixes: - Add argument to `_resize_token_embeddings()` - Add seq. 
classification head to T5 - Fix test config of Llama --- src/adapters/head_utils.py | 14 +++++++ src/adapters/heads/base.py | 4 +- src/adapters/models/t5/adapter_model.py | 52 ++++++++++++++++++++++++- tests_adapters/test_llama.py | 1 + tests_adapters/test_t5.py | 4 -- 5 files changed, 68 insertions(+), 7 deletions(-) diff --git a/src/adapters/head_utils.py b/src/adapters/head_utils.py index b4f9ba437e..7673857adc 100644 --- a/src/adapters/head_utils.py +++ b/src/adapters/head_utils.py @@ -483,6 +483,20 @@ }, "layers": [None, "qa_outputs"], }, + "T5ForSequenceClassification": { + "config": { + "head_type": "classification", + "layers": 2, + "activation_function": "tanh", + }, + "layers": [ + None, + "classification_head.dense", + None, + None, + "classification_head.out_proj", + ], + }, "DebertaV2ForSequenceClassification": { "config": { "head_type": "classification", diff --git a/src/adapters/heads/base.py b/src/adapters/heads/base.py index 75666aa41c..2a097c74ad 100644 --- a/src/adapters/heads/base.py +++ b/src/adapters/heads/base.py @@ -554,9 +554,9 @@ def tie_weights(self): self = getattr(self, self.base_model_prefix) self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix) - def _resize_token_embeddings(self, new_num_tokens): + def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None): old_embeddings = self.get_input_embeddings() - new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of) self.set_input_embeddings(new_embeddings) # if word embeddings are not tied, make sure that lm head is resized as well diff --git a/src/adapters/models/t5/adapter_model.py b/src/adapters/models/t5/adapter_model.py index af442ae893..5522748291 100644 --- a/src/adapters/models/t5/adapter_model.py +++ b/src/adapters/models/t5/adapter_model.py @@ -5,7 +5,14 @@ from transformers.models.t5.modeling_t5 import T5_INPUTS_DOCSTRING, T5_START_DOCSTRING, T5Model, T5PreTrainedModel from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward -from ...heads import ModelWithFlexibleHeadsAdaptersMixin, QuestionAnsweringHead, Seq2SeqLMHead +from ...composition import adjust_tensors_for_parallel +from ...heads import ( + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + QuestionAnsweringHead, + Seq2SeqLMHead, +) from ...model_mixin import EmbeddingAdaptersWrapperMixin from ...wrappers import init @@ -102,11 +109,24 @@ def forward( else: model_output["last_hidden_state"] = new_hidden_state + # sequence classification based on last token in sequence + if input_ids is not None and sequence_output.shape[1] == input_ids.shape[1]: + eos_mask = input_ids.eq(self.config.eos_token_id) + (eos_mask,) = adjust_tensors_for_parallel(sequence_output, eos_mask) + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + cls_representation = sequence_output[eos_mask, :].view( + sequence_output.size(0), -1, sequence_output.size(-1) + )[:, -1, :] + else: + cls_representation = sequence_output + if head or self.active_head: kwargs["labels"] = labels head_outputs = self.forward_head( model_output, head_name=head, + cls_output=cls_representation, return_dict=return_dict, **kwargs, ) @@ -175,6 +195,8 @@ def _reorder_cache(self, past, beam_idx): head_types = { "seq2seq_lm": Seq2SeqLMHead, "question_answering": QuestionAnsweringHead, + 
"classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, } def add_seq2seq_lm_head(self, head_name, overwrite_ok=False): @@ -199,3 +221,31 @@ def add_qa_head( ): head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) self.add_prediction_head(head, overwrite_ok) + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) diff --git a/tests_adapters/test_llama.py b/tests_adapters/test_llama.py index f1c7940b3e..2fd455c174 100644 --- a/tests_adapters/test_llama.py +++ b/tests_adapters/test_llama.py @@ -30,6 +30,7 @@ class LlamaAdapterTestBase(AdapterTestBase): intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, + pad_token_id=0, ) tokenizer_name = "openlm-research/open_llama_13b" diff --git a/tests_adapters/test_t5.py b/tests_adapters/test_t5.py index 102586ae54..7061f68ed3 100644 --- a/tests_adapters/test_t5.py +++ b/tests_adapters/test_t5.py @@ -38,10 +38,6 @@ class T5AdapterTestBase(AdapterTestBase): ) tokenizer_name = "t5-base" - def add_head(self, model, name, **kwargs): - model.add_seq2seq_lm_head(name) - return self.default_input_samples_shape[-1] - def dataset(self, tokenizer=None): # setup tokenizer if tokenizer is None: From 5065d27c0219f2d36d13891e0ccee4cda1c10881 Mon Sep 17 00:00:00 2001 From: calpt Date: Sat, 16 Sep 2023 16:29:20 +0200 Subject: [PATCH 4/6] Use seq. classification head in T5 tests. Move used heads retrieval to new method. 
--- src/adapters/heads/base.py | 43 +++++++++++-------- src/adapters/models/t5/adapter_model.py | 32 +++++++------- src/adapters/models/t5/modeling_t5.py | 18 +++++--- tests_adapters/composition/test_parallel.py | 4 +- tests_adapters/methods/test_adapter_common.py | 4 +- tests_adapters/methods/test_prefix_tuning.py | 3 +- tests_adapters/test_t5.py | 43 +------------------ 7 files changed, 61 insertions(+), 86 deletions(-) diff --git a/src/adapters/heads/base.py b/src/adapters/heads/base.py index 2a097c74ad..dd43a4e658 100644 --- a/src/adapters/heads/base.py +++ b/src/adapters/heads/base.py @@ -730,6 +730,27 @@ def delete_head(self, head_name: str): if self.active_head == head_name: self.active_head = None + def _get_used_heads(self, head_name: str = None): + if head_name: + used_heads = [head_name] + # together with context, check if we have heads at all to allow for models without heads + elif len(self.heads) > 0 and AdapterSetup.get_context_head_setup(): + used_heads = AdapterSetup.get_context_head_setup() + if isinstance(used_heads, str): + used_heads = [used_heads] + elif self._active_heads: + used_heads = self._active_heads + else: + return [] + + head_modules = [] + for head in used_heads: + if head not in self.heads: + raise ValueError("Unknown head_name '{}'".format(head)) + head_modules.append(self.heads[head]) + + return head_modules + def forward_head( self, all_outputs, head_name=None, cls_output=None, attention_mask=None, return_dict=False, **kwargs ): @@ -750,16 +771,8 @@ def forward_head( return_dict (bool): Whether or not to return a ``ModelOutput`` instead of a plain tuple. **kwargs: Additional keyword arguments passed to the forward pass of the head. """ - if head_name: - used_heads = [head_name] - # together with context, check if we have heads at all to allow for models without heads - elif len(self.heads) > 0 and AdapterSetup.get_context_head_setup(): - used_heads = AdapterSetup.get_context_head_setup() - if isinstance(used_heads, str): - used_heads = [used_heads] - elif self._active_heads: - used_heads = self._active_heads - else: + used_head_modules = self._get_used_heads(head_name) + if len(used_head_modules) == 0: logger.debug("No prediction head is used.") return all_outputs @@ -787,9 +800,6 @@ def _get_head_input(outputs, cls_out, batch): if inv_adapter: kwargs["invertible_adapter"] = inv_adapter - for head in used_heads: - if head not in self.heads: - raise ValueError("Unknown head_name '{}'".format(head)) if isinstance(self.active_head, BatchSplit): if sum(self.active_head.batch_sizes) != all_outputs[0].size()[0]: raise ValueError( @@ -830,14 +840,13 @@ def _get_head_input(outputs, cls_out, batch): else None ) return_output = MultiHeadOutput(head_outputs=head_outputs, loss=combined_loss) - elif len(used_heads) > 1: + elif len(used_head_modules) > 1: head_outputs = [] - for head in used_heads: - head_module = self.heads[head] + for head_module in used_head_modules: head_outputs.append(head_module(all_outputs, cls_output, attention_mask, return_dict, **kwargs)) return_output = MultiHeadOutput(head_outputs=head_outputs) else: - head_module = self.heads[used_heads[0]] + head_module = used_head_modules[0] return_output = head_module(all_outputs, cls_output, attention_mask, return_dict, **kwargs) if isinstance(return_output, ModelOutput): diff --git a/src/adapters/models/t5/adapter_model.py b/src/adapters/models/t5/adapter_model.py index 5522748291..66441727c7 100644 --- a/src/adapters/models/t5/adapter_model.py +++ b/src/adapters/models/t5/adapter_model.py @@ -73,9 
+73,14 @@ def forward( **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) + if decoder_input_ids is None and decoder_inputs_embeds is None: + # Check if we're using a LM head + if labels is not None and any([isinstance(head, Seq2SeqLMHead) for head in self._get_used_heads(head)]): + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + else: + # decoder_input_ids from input_ids if no decoder_input_ids are provided + decoder_input_ids = self._shift_right(input_ids) model_output = self.transformer( input_ids=input_ids, @@ -121,18 +126,15 @@ def forward( else: cls_representation = sequence_output - if head or self.active_head: - kwargs["labels"] = labels - head_outputs = self.forward_head( - model_output, - head_name=head, - cls_output=cls_representation, - return_dict=return_dict, - **kwargs, - ) - return head_outputs - else: - return model_output + kwargs["labels"] = labels + head_outputs = self.forward_head( + model_output, + head_name=head, + cls_output=cls_representation, + return_dict=return_dict, + **kwargs, + ) + return head_outputs # Copied from T5ForConditionalGeneration def prepare_inputs_for_generation( diff --git a/src/adapters/models/t5/modeling_t5.py b/src/adapters/models/t5/modeling_t5.py index 820c092027..7d7e467f0a 100644 --- a/src/adapters/models/t5/modeling_t5.py +++ b/src/adapters/models/t5/modeling_t5.py @@ -292,7 +292,8 @@ def forward( raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") if inputs_embeds is None: - assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" + if self.embed_tokens is None: + raise ValueError("You have to initialize the model with valid token embeddings") inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -301,7 +302,8 @@ def forward( mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length if use_cache is True: - assert self.is_decoder, f"`use_cache` can only be set to `True` if {self} is used as a decoder" + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) @@ -330,6 +332,13 @@ def forward( else: encoder_extended_attention_mask = None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -369,11 +378,6 @@ def forward( all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False def create_custom_forward(module): def custom_forward(*inputs): diff --git a/tests_adapters/composition/test_parallel.py b/tests_adapters/composition/test_parallel.py index c33d5e362f..56ea422308 100644 --- a/tests_adapters/composition/test_parallel.py +++ b/tests_adapters/composition/test_parallel.py @@ -234,7 +234,7 @@ def run_parallel_training_equivalent_to_single(self, adapter_config): dataset = [] for i in range(3): input_data = self.get_input_samples(config=model.config) - if isinstance(model, T5AdapterModel) or isinstance(model, BertGenerationAdapterModel): + if isinstance(model, BertGenerationAdapterModel): input_data["labels"] = torch.randint(0, 2, (3, 64)) else: input_data["labels"] = torch.randint(0, 2, (3, 1)) @@ -291,7 +291,7 @@ def test_parallel_training_single_forward_pass(self): self.assertTrue(torch.equal(v, state_dict[k.replace(b1, b2)])) input_data = self.get_input_samples(config=model.config) - if isinstance(model, T5AdapterModel) or isinstance(model, BertGenerationAdapterModel): + if isinstance(model, BertGenerationAdapterModel): input_data["labels"] = torch.randint(0, 2, (3, 64), device=torch_device) else: input_data["labels"] = torch.randint(0, 2, (3, 1), device=torch_device) diff --git a/tests_adapters/methods/test_adapter_common.py b/tests_adapters/methods/test_adapter_common.py index 81033924a0..616e6a99e8 100644 --- a/tests_adapters/methods/test_adapter_common.py +++ b/tests_adapters/methods/test_adapter_common.py @@ -19,7 +19,7 @@ SeqBnInvConfig, ) from adapters.heads.language_modeling import CausalLMHead -from transformers import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING +from transformers import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, CLIPConfig from transformers.testing_utils import require_torch, torch_device from .base import AdapterMethodBaseTestMixin, create_twin_models @@ -148,7 +148,7 @@ def test_get_adapter(self): n_layers = len(list(model.iter_layers())) if model.config.is_encoder_decoder: n_prefix_layers = 3 - elif model.config.is_composition: + elif model.config.is_composition or isinstance(model.config, CLIPConfig): n_prefix_layers = 2 else: n_prefix_layers = 1 diff --git a/tests_adapters/methods/test_prefix_tuning.py b/tests_adapters/methods/test_prefix_tuning.py index f08a9a492f..798f4b19d4 100644 --- a/tests_adapters/methods/test_prefix_tuning.py +++ b/tests_adapters/methods/test_prefix_tuning.py @@ -1,6 +1,7 @@ import torch from adapters import ADAPTER_MODEL_MAPPING, AutoAdapterModel, PrefixTuningConfig +from transformers import CLIPConfig from transformers.testing_utils import require_torch, torch_device from .base import AdapterMethodBaseTestMixin @@ -24,7 +25,7 @@ def test_get_prefix_tuning(self): model = self.get_model() if model.config.is_encoder_decoder: n_prefix_layers = 3 - elif model.config.is_composition: + elif model.config.is_composition or isinstance(model.config, CLIPConfig): n_prefix_layers = 2 else: n_prefix_layers = 1 diff --git a/tests_adapters/test_t5.py b/tests_adapters/test_t5.py index 7061f68ed3..c8717d8b54 100644 --- a/tests_adapters/test_t5.py +++ b/tests_adapters/test_t5.py @@ -1,8 +1,6 @@ import unittest -from datasets import load_dataset - -from transformers import AutoTokenizer, T5Config +from transformers import T5Config from transformers.testing_utils import require_torch from .composition.test_parallel import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin @@ -38,45 +36,6 @@ class T5AdapterTestBase(AdapterTestBase): ) tokenizer_name = "t5-base" - def dataset(self, tokenizer=None): - # 
setup tokenizer - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - def preprocess_function(examples): - inputs = examples["document"] - targets = examples["summary"] - inputs = ["Summarize: " + inp for inp in inputs] - model_inputs = tokenizer(inputs, padding=True, truncation=True) - - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, padding=True, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - data_args = { - "task_name": "xsum", - "path": "./hf_transformers/tests/fixtures/tests_samples/xsum/sample.json", - } - dataset = load_dataset("json", data_files=data_args["path"]) - train_dataset = dataset["train"] - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - desc="Running tokenizer on train dataset", - ) - return train_dataset - @require_torch class T5AdapterTest( From 188a2311f732c80c2e03550a6654610da781ebfe Mon Sep 17 00:00:00 2001 From: calpt Date: Sun, 17 Sep 2023 12:09:02 +0200 Subject: [PATCH 5/6] Bump to v4.33.2 --- hf_transformers | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hf_transformers b/hf_transformers index fa6107c97e..6da93f5580 160000 --- a/hf_transformers +++ b/hf_transformers @@ -1 +1 @@ -Subproject commit fa6107c97edf7cf725305a34735a57875b67d85e +Subproject commit 6da93f5580e109fad5f7b523cf2b6e8a5bafb623 diff --git a/setup.py b/setup.py index 86e2063880..0a0e73ad84 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ "sphinx-multiversion", "timeout-decorator", "torch>=1.10,!=1.12.0", - "transformers==4.33.1", + "transformers==4.33.2", "beautifulsoup4", ] From 3bd1a2e0da4c1ee8db713aacfb30148ecb3daa2f Mon Sep 17 00:00:00 2001 From: calpt Date: Wed, 4 Oct 2023 23:24:41 +0200 Subject: [PATCH 6/6] Bump to v4.33.3 --- hf_transformers | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hf_transformers b/hf_transformers index 6da93f5580..bffac926ca 160000 --- a/hf_transformers +++ b/hf_transformers @@ -1 +1 @@ -Subproject commit 6da93f5580e109fad5f7b523cf2b6e8a5bafb623 +Subproject commit bffac926ca6bc6c965a92bfbfd00c567a2c0fb90 diff --git a/setup.py b/setup.py index 0a0e73ad84..aa83e52ec2 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ "sphinx-multiversion", "timeout-decorator", "torch>=1.10,!=1.12.0", - "transformers==4.33.2", + "transformers==4.33.3", "beautifulsoup4", ]
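
For reference, a minimal usage sketch of the sequence classification head that PATCH 3/6 adds to `T5AdapterModel` (pooled from the last EOS token of the decoder output). The checkpoint name, head name, and label count below are illustrative assumptions, not taken from the patches:

    # Sketch only: exercises the new add_classification_head() API on T5.
    from adapters import T5AdapterModel

    # Illustrative checkpoint; any T5 checkpoint compatible with the adapters
    # package should work the same way.
    model = T5AdapterModel.from_pretrained("t5-small")

    # New in this patch series: a classification head on T5. Defaults follow
    # the added method signature: num_labels=2, layers=2,
    # activation_function="tanh", multilabel=False.
    model.add_classification_head("sentiment", num_labels=2)

With this head active, a forward pass with `input_ids` (and `labels` for training) routes the decoder's EOS-token representation through the classification head instead of the seq2seq LM head, as implemented in the updated `forward()` of `adapter_model.py`.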