From 0645d4afe1d6c064f6e0bbf2d0fbaa84b3f5874b Mon Sep 17 00:00:00 2001 From: calpt Date: Sat, 9 Sep 2023 10:44:47 +0200 Subject: [PATCH 1/6] Upgrade Transformers to v4.33.1 --- docs/installation.md | 2 +- hf_transformers | 2 +- setup.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 917992173d..c3b8468eb8 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,7 +1,7 @@ # Installation The `adapters` package is designed as an add-on for Hugging Face's Transformers library. -It currently supports Python 3.7+ and PyTorch 1.3.1+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first. +It currently supports Python 3.8+ and PyTorch 1.10+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first. ```{eval-rst} .. important:: diff --git a/hf_transformers b/hf_transformers index e42587f596..fa6107c97e 160000 --- a/hf_transformers +++ b/hf_transformers @@ -1 +1 @@ -Subproject commit e42587f596181396e1c4b63660abf0c736b10dae +Subproject commit fa6107c97edf7cf725305a34735a57875b67d85e diff --git a/setup.py b/setup.py index fd5490935a..86e2063880 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ # We try to follow their general layout wherever sensible. _deps = [ - "accelerate>=0.20.1", + "accelerate>=0.20.3", "black==22.3", # after updating to black 2023, also update Python version in pyproject.toml to 3.7 "datasets!=2.5.0", "dill<0.3.5", @@ -60,8 +60,8 @@ "sphinx-intl", "sphinx-multiversion", "timeout-decorator", - "torch>=1.7,!=1.12.0", - "transformers==4.31.0", + "torch>=1.10,!=1.12.0", + "transformers==4.33.1", "beautifulsoup4", ] From 2fe51eebf2c1d4112254ae09a788e3b97e42b218 Mon Sep 17 00:00:00 2001 From: calpt Date: Sat, 9 Sep 2023 10:55:08 +0200 Subject: [PATCH 2/6] Remove copied GPT-2/ GPT-J model classes --- src/adapters/models/__init__.py | 5 +- src/adapters/models/gpt2/mixin_gpt2.py | 4 +- src/adapters/models/gpt2/modeling_gpt2.py | 214 +--------------------- src/adapters/models/gptj/modeling_gptj.py | 195 +------------------- 4 files changed, 10 insertions(+), 408 deletions(-) diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py index b0e59abf14..11da5d325e 100644 --- a/src/adapters/models/__init__.py +++ b/src/adapters/models/__init__.py @@ -15,7 +15,8 @@ CLIPVisionModelAdaptersMixin, ) from .distilbert.mixin_distilbert import DistilBertModelAdaptersMixin, DistilBertTransformerAdaptersMixin -from .gptj.mixin_gptj import GPTJMLPAdaptersMixin +from .gpt2.mixin_gpt2 import GPT2ModelAdapterMixin +from .gptj.mixin_gptj import GPTJMLPAdaptersMixin, GPTJModelAdapterMixin from .llama.mixin_llama import LlamaModelAdapterMixin from .t5.mixin_t5 import T5BlockAdaptersMixin, T5ModelAdaptersMixin, T5ModelAdaptersWithHeadsMixin from .vit.mixin_vit import ViTIntermediateAdaptersMixin, ViTModelAdaptersMixin @@ -49,7 +50,9 @@ "MBartDecoder": BartDecoderAdaptersMixin, "MBartDecoderWrapper": BartDecoderWrapperAdaptersMixin, "MBartModel": BartModelAdaptersMixin, + "GPT2Model": GPT2ModelAdapterMixin, "GPTJMLP": GPTJMLPAdaptersMixin, + "GPTJModel": GPTJModelAdapterMixin, "RobertaLayer": BertLayerAdaptersMixin, "RobertaModel": BertModelAdaptersMixin, "T5Block": T5BlockAdaptersMixin, diff --git a/src/adapters/models/gpt2/mixin_gpt2.py b/src/adapters/models/gpt2/mixin_gpt2.py index 19acec262a..b3cbf12219 100644 --- a/src/adapters/models/gpt2/mixin_gpt2.py +++ b/src/adapters/models/gpt2/mixin_gpt2.py @@ -5,7 +5,7 @@ from ...layer import 
AdapterLayer from ...lora import Linear as LoRALinear from ...lora import MergedLinear as LoRAMergedLinear -from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin +from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin from ...prefix_tuning import PrefixTuningShim @@ -54,7 +54,7 @@ def init_adapters(self, model_config, adapters_config): self.output_adapters = AdapterLayer("output_adapter") -class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin): +class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin): def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: for i, layer in enumerate(self.base_model.h): yield i, layer diff --git a/src/adapters/models/gpt2/modeling_gpt2.py b/src/adapters/models/gpt2/modeling_gpt2.py index a1397153be..1c571c23fe 100644 --- a/src/adapters/models/gpt2/modeling_gpt2.py +++ b/src/adapters/models/gpt2/modeling_gpt2.py @@ -20,16 +20,10 @@ import torch import torch.utils.checkpoint -from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions -from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2Model -from transformers.utils import logging +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_ -from ...context import ForwardContext -from .mixin_gpt2 import GPT2AttentionAdaptersMixin, GPT2DecoderBlockAdaptersMixin, GPT2ModelAdapterMixin - - -logger = logging.get_logger(__name__) +from .mixin_gpt2 import GPT2AttentionAdaptersMixin, GPT2DecoderBlockAdaptersMixin class GPT2AttentionWithAdapters(GPT2AttentionAdaptersMixin, GPT2Attention): @@ -151,207 +145,3 @@ def forward( outputs = (hidden_states,) + outputs[1:] return outputs # hidden_states, present, (attentions, cross_attentions) - - -class GPT2ModelWithAdapters(GPT2ModelAdapterMixin, GPT2Model): - def __init__(self, config): - super().__init__(config) - - self.init_adapters(config, None) - - @ForwardContext.wrap - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) 
- batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - # GPT2Attention mask. - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
- attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.add_cross_attention and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # head_mask has shape n_layer x batch x n_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure layer_past is on same device as hidden_states (might not be correct) - if layer_past is not None: - layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if isinstance(head_mask, torch.Tensor): - head_mask = head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - # also adjust output shape if necessary - if getattr(ForwardContext.get_context(), "adapters_parallelized", False): - output_shape = hidden_states.size() - - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] - if v is not None - ) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) diff --git a/src/adapters/models/gptj/modeling_gptj.py b/src/adapters/models/gptj/modeling_gptj.py index b9f9c0fa00..453f0c9b6d 100644 --- a/src/adapters/models/gptj/modeling_gptj.py +++ b/src/adapters/models/gptj/modeling_gptj.py @@ -19,19 +19,11 @@ import torch import torch.utils.checkpoint -from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.models.gptj.modeling_gptj import ( - GPTJAttention, - GPTJBlock, - GPTJModel, - apply_rotary_pos_emb, - get_embed_positions, -) +from transformers.models.gptj.modeling_gptj import GPTJAttention, GPTJBlock, apply_rotary_pos_emb, get_embed_positions from transformers.utils.import_utils import is_torch_fx_proxy from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_ -from ...context import ForwardContext -from .mixin_gptj import GPTJAttentionAdaptersMixin, GPTJDecoderBlockAdaptersMixin, GPTJModelAdapterMixin +from .mixin_gptj import GPTJAttentionAdaptersMixin, GPTJDecoderBlockAdaptersMixin class GPTJAttentionWithAdapters(GPTJAttentionAdaptersMixin, GPTJAttention): @@ -152,186 +144,3 @@ def forward( outputs = (hidden_states,) + outputs[1:] return outputs # hidden_states, present, (attentions) - - -class GPTJModelWithAdapters(GPTJModelAdapterMixin, GPTJModel): - @ForwardContext.wrap - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: 
Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]).long() - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - # Attention mask. - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
- attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x num_attention_heads x N x N - # head_mask has shape n_layer x batch x num_attention_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - hidden_states = inputs_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - # logger.warning_once( - # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - # ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure layer_past is on same device as hidden_states (might not be correct) - if layer_past is not None: - layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if isinstance(head_mask, torch.Tensor): - head_mask = head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - if use_cache: - # logger.warning( - # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- # ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - None, - attention_mask, - head_mask[i], - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - # also adjust output shape if necessary - if getattr(ForwardContext.get_context(), "adapters_parallelized", False): - output_shape = hidden_states.size() - - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) From 0a3b96b1fdfd9db3455dd04ec716e333c21dffc7 Mon Sep 17 00:00:00 2001 From: calpt Date: Sat, 9 Sep 2023 12:07:24 +0200 Subject: [PATCH 3/6] Transformers upgrade fixes: - Add argument to `_resize_token_embeddings()` - Add seq. 
classification head to T5 - Fix test config of Llama --- src/adapters/head_utils.py | 14 +++++++ src/adapters/heads/base.py | 4 +- src/adapters/models/t5/adapter_model.py | 52 ++++++++++++++++++++++++- tests_adapters/test_llama.py | 1 + tests_adapters/test_t5.py | 4 -- 5 files changed, 68 insertions(+), 7 deletions(-) diff --git a/src/adapters/head_utils.py b/src/adapters/head_utils.py index b4f9ba437e..7673857adc 100644 --- a/src/adapters/head_utils.py +++ b/src/adapters/head_utils.py @@ -483,6 +483,20 @@ }, "layers": [None, "qa_outputs"], }, + "T5ForSequenceClassification": { + "config": { + "head_type": "classification", + "layers": 2, + "activation_function": "tanh", + }, + "layers": [ + None, + "classification_head.dense", + None, + None, + "classification_head.out_proj", + ], + }, "DebertaV2ForSequenceClassification": { "config": { "head_type": "classification", diff --git a/src/adapters/heads/base.py b/src/adapters/heads/base.py index 75666aa41c..2a097c74ad 100644 --- a/src/adapters/heads/base.py +++ b/src/adapters/heads/base.py @@ -554,9 +554,9 @@ def tie_weights(self): self = getattr(self, self.base_model_prefix) self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix) - def _resize_token_embeddings(self, new_num_tokens): + def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None): old_embeddings = self.get_input_embeddings() - new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of) self.set_input_embeddings(new_embeddings) # if word embeddings are not tied, make sure that lm head is resized as well diff --git a/src/adapters/models/t5/adapter_model.py b/src/adapters/models/t5/adapter_model.py index af442ae893..5522748291 100644 --- a/src/adapters/models/t5/adapter_model.py +++ b/src/adapters/models/t5/adapter_model.py @@ -5,7 +5,14 @@ from transformers.models.t5.modeling_t5 import T5_INPUTS_DOCSTRING, T5_START_DOCSTRING, T5Model, T5PreTrainedModel from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward -from ...heads import ModelWithFlexibleHeadsAdaptersMixin, QuestionAnsweringHead, Seq2SeqLMHead +from ...composition import adjust_tensors_for_parallel +from ...heads import ( + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + QuestionAnsweringHead, + Seq2SeqLMHead, +) from ...model_mixin import EmbeddingAdaptersWrapperMixin from ...wrappers import init @@ -102,11 +109,24 @@ def forward( else: model_output["last_hidden_state"] = new_hidden_state + # sequence classification based on last token in sequence + if input_ids is not None and sequence_output.shape[1] == input_ids.shape[1]: + eos_mask = input_ids.eq(self.config.eos_token_id) + (eos_mask,) = adjust_tensors_for_parallel(sequence_output, eos_mask) + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + cls_representation = sequence_output[eos_mask, :].view( + sequence_output.size(0), -1, sequence_output.size(-1) + )[:, -1, :] + else: + cls_representation = sequence_output + if head or self.active_head: kwargs["labels"] = labels head_outputs = self.forward_head( model_output, head_name=head, + cls_output=cls_representation, return_dict=return_dict, **kwargs, ) @@ -175,6 +195,8 @@ def _reorder_cache(self, past, beam_idx): head_types = { "seq2seq_lm": Seq2SeqLMHead, "question_answering": QuestionAnsweringHead, + 
"classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, } def add_seq2seq_lm_head(self, head_name, overwrite_ok=False): @@ -199,3 +221,31 @@ def add_qa_head( ): head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) self.add_prediction_head(head, overwrite_ok) + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. + activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) diff --git a/tests_adapters/test_llama.py b/tests_adapters/test_llama.py index f1c7940b3e..2fd455c174 100644 --- a/tests_adapters/test_llama.py +++ b/tests_adapters/test_llama.py @@ -30,6 +30,7 @@ class LlamaAdapterTestBase(AdapterTestBase): intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, + pad_token_id=0, ) tokenizer_name = "openlm-research/open_llama_13b" diff --git a/tests_adapters/test_t5.py b/tests_adapters/test_t5.py index 102586ae54..7061f68ed3 100644 --- a/tests_adapters/test_t5.py +++ b/tests_adapters/test_t5.py @@ -38,10 +38,6 @@ class T5AdapterTestBase(AdapterTestBase): ) tokenizer_name = "t5-base" - def add_head(self, model, name, **kwargs): - model.add_seq2seq_lm_head(name) - return self.default_input_samples_shape[-1] - def dataset(self, tokenizer=None): # setup tokenizer if tokenizer is None: From 5065d27c0219f2d36d13891e0ccee4cda1c10881 Mon Sep 17 00:00:00 2001 From: calpt Date: Sat, 16 Sep 2023 16:29:20 +0200 Subject: [PATCH 4/6] Use seq. classification head in T5 tests. Move used heads retrieval to new method. 
--- src/adapters/heads/base.py | 43 +++++++++++-------- src/adapters/models/t5/adapter_model.py | 32 +++++++------- src/adapters/models/t5/modeling_t5.py | 18 +++++--- tests_adapters/composition/test_parallel.py | 4 +- tests_adapters/methods/test_adapter_common.py | 4 +- tests_adapters/methods/test_prefix_tuning.py | 3 +- tests_adapters/test_t5.py | 43 +------------------ 7 files changed, 61 insertions(+), 86 deletions(-) diff --git a/src/adapters/heads/base.py b/src/adapters/heads/base.py index 2a097c74ad..dd43a4e658 100644 --- a/src/adapters/heads/base.py +++ b/src/adapters/heads/base.py @@ -730,6 +730,27 @@ def delete_head(self, head_name: str): if self.active_head == head_name: self.active_head = None + def _get_used_heads(self, head_name: str = None): + if head_name: + used_heads = [head_name] + # together with context, check if we have heads at all to allow for models without heads + elif len(self.heads) > 0 and AdapterSetup.get_context_head_setup(): + used_heads = AdapterSetup.get_context_head_setup() + if isinstance(used_heads, str): + used_heads = [used_heads] + elif self._active_heads: + used_heads = self._active_heads + else: + return [] + + head_modules = [] + for head in used_heads: + if head not in self.heads: + raise ValueError("Unknown head_name '{}'".format(head)) + head_modules.append(self.heads[head]) + + return head_modules + def forward_head( self, all_outputs, head_name=None, cls_output=None, attention_mask=None, return_dict=False, **kwargs ): @@ -750,16 +771,8 @@ def forward_head( return_dict (bool): Whether or not to return a ``ModelOutput`` instead of a plain tuple. **kwargs: Additional keyword arguments passed to the forward pass of the head. """ - if head_name: - used_heads = [head_name] - # together with context, check if we have heads at all to allow for models without heads - elif len(self.heads) > 0 and AdapterSetup.get_context_head_setup(): - used_heads = AdapterSetup.get_context_head_setup() - if isinstance(used_heads, str): - used_heads = [used_heads] - elif self._active_heads: - used_heads = self._active_heads - else: + used_head_modules = self._get_used_heads(head_name) + if len(used_head_modules) == 0: logger.debug("No prediction head is used.") return all_outputs @@ -787,9 +800,6 @@ def _get_head_input(outputs, cls_out, batch): if inv_adapter: kwargs["invertible_adapter"] = inv_adapter - for head in used_heads: - if head not in self.heads: - raise ValueError("Unknown head_name '{}'".format(head)) if isinstance(self.active_head, BatchSplit): if sum(self.active_head.batch_sizes) != all_outputs[0].size()[0]: raise ValueError( @@ -830,14 +840,13 @@ def _get_head_input(outputs, cls_out, batch): else None ) return_output = MultiHeadOutput(head_outputs=head_outputs, loss=combined_loss) - elif len(used_heads) > 1: + elif len(used_head_modules) > 1: head_outputs = [] - for head in used_heads: - head_module = self.heads[head] + for head_module in used_head_modules: head_outputs.append(head_module(all_outputs, cls_output, attention_mask, return_dict, **kwargs)) return_output = MultiHeadOutput(head_outputs=head_outputs) else: - head_module = self.heads[used_heads[0]] + head_module = used_head_modules[0] return_output = head_module(all_outputs, cls_output, attention_mask, return_dict, **kwargs) if isinstance(return_output, ModelOutput): diff --git a/src/adapters/models/t5/adapter_model.py b/src/adapters/models/t5/adapter_model.py index 5522748291..66441727c7 100644 --- a/src/adapters/models/t5/adapter_model.py +++ b/src/adapters/models/t5/adapter_model.py @@ -73,9 
+73,14 @@ def forward( **kwargs ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) + if decoder_input_ids is None and decoder_inputs_embeds is None: + # Check if we're using a LM head + if labels is not None and any([isinstance(head, Seq2SeqLMHead) for head in self._get_used_heads(head)]): + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + else: + # decoder_input_ids from input_ids if no decoder_input_ids are provided + decoder_input_ids = self._shift_right(input_ids) model_output = self.transformer( input_ids=input_ids, @@ -121,18 +126,15 @@ def forward( else: cls_representation = sequence_output - if head or self.active_head: - kwargs["labels"] = labels - head_outputs = self.forward_head( - model_output, - head_name=head, - cls_output=cls_representation, - return_dict=return_dict, - **kwargs, - ) - return head_outputs - else: - return model_output + kwargs["labels"] = labels + head_outputs = self.forward_head( + model_output, + head_name=head, + cls_output=cls_representation, + return_dict=return_dict, + **kwargs, + ) + return head_outputs # Copied from T5ForConditionalGeneration def prepare_inputs_for_generation( diff --git a/src/adapters/models/t5/modeling_t5.py b/src/adapters/models/t5/modeling_t5.py index 820c092027..7d7e467f0a 100644 --- a/src/adapters/models/t5/modeling_t5.py +++ b/src/adapters/models/t5/modeling_t5.py @@ -292,7 +292,8 @@ def forward( raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") if inputs_embeds is None: - assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" + if self.embed_tokens is None: + raise ValueError("You have to initialize the model with valid token embeddings") inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape @@ -301,7 +302,8 @@ def forward( mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length if use_cache is True: - assert self.is_decoder, f"`use_cache` can only be set to `True` if {self} is used as a decoder" + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") if attention_mask is None: attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) @@ -330,6 +332,13 @@ def forward( else: encoder_extended_attention_mask = None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) @@ -369,11 +378,6 @@ def forward( all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False def create_custom_forward(module): def custom_forward(*inputs): diff --git a/tests_adapters/composition/test_parallel.py b/tests_adapters/composition/test_parallel.py index c33d5e362f..56ea422308 100644 --- a/tests_adapters/composition/test_parallel.py +++ b/tests_adapters/composition/test_parallel.py @@ -234,7 +234,7 @@ def run_parallel_training_equivalent_to_single(self, adapter_config): dataset = [] for i in range(3): input_data = self.get_input_samples(config=model.config) - if isinstance(model, T5AdapterModel) or isinstance(model, BertGenerationAdapterModel): + if isinstance(model, BertGenerationAdapterModel): input_data["labels"] = torch.randint(0, 2, (3, 64)) else: input_data["labels"] = torch.randint(0, 2, (3, 1)) @@ -291,7 +291,7 @@ def test_parallel_training_single_forward_pass(self): self.assertTrue(torch.equal(v, state_dict[k.replace(b1, b2)])) input_data = self.get_input_samples(config=model.config) - if isinstance(model, T5AdapterModel) or isinstance(model, BertGenerationAdapterModel): + if isinstance(model, BertGenerationAdapterModel): input_data["labels"] = torch.randint(0, 2, (3, 64), device=torch_device) else: input_data["labels"] = torch.randint(0, 2, (3, 1), device=torch_device) diff --git a/tests_adapters/methods/test_adapter_common.py b/tests_adapters/methods/test_adapter_common.py index 81033924a0..616e6a99e8 100644 --- a/tests_adapters/methods/test_adapter_common.py +++ b/tests_adapters/methods/test_adapter_common.py @@ -19,7 +19,7 @@ SeqBnInvConfig, ) from adapters.heads.language_modeling import CausalLMHead -from transformers import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING +from transformers import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, CLIPConfig from transformers.testing_utils import require_torch, torch_device from .base import AdapterMethodBaseTestMixin, create_twin_models @@ -148,7 +148,7 @@ def test_get_adapter(self): n_layers = len(list(model.iter_layers())) if model.config.is_encoder_decoder: n_prefix_layers = 3 - elif model.config.is_composition: + elif model.config.is_composition or isinstance(model.config, CLIPConfig): n_prefix_layers = 2 else: n_prefix_layers = 1 diff --git a/tests_adapters/methods/test_prefix_tuning.py b/tests_adapters/methods/test_prefix_tuning.py index f08a9a492f..798f4b19d4 100644 --- a/tests_adapters/methods/test_prefix_tuning.py +++ b/tests_adapters/methods/test_prefix_tuning.py @@ -1,6 +1,7 @@ import torch from adapters import ADAPTER_MODEL_MAPPING, AutoAdapterModel, PrefixTuningConfig +from transformers import CLIPConfig from transformers.testing_utils import require_torch, torch_device from .base import AdapterMethodBaseTestMixin @@ -24,7 +25,7 @@ def test_get_prefix_tuning(self): model = self.get_model() if model.config.is_encoder_decoder: n_prefix_layers = 3 - elif model.config.is_composition: + elif model.config.is_composition or isinstance(model.config, CLIPConfig): n_prefix_layers = 2 else: n_prefix_layers = 1 diff --git a/tests_adapters/test_t5.py b/tests_adapters/test_t5.py index 7061f68ed3..c8717d8b54 100644 --- a/tests_adapters/test_t5.py +++ b/tests_adapters/test_t5.py @@ -1,8 +1,6 @@ import unittest -from datasets import load_dataset - -from transformers import AutoTokenizer, T5Config +from transformers import T5Config from transformers.testing_utils import require_torch from .composition.test_parallel import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin @@ -38,45 +36,6 @@ class T5AdapterTestBase(AdapterTestBase): ) tokenizer_name = "t5-base" - def dataset(self, tokenizer=None): - # 
setup tokenizer - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - def preprocess_function(examples): - inputs = examples["document"] - targets = examples["summary"] - inputs = ["Summarize: " + inp for inp in inputs] - model_inputs = tokenizer(inputs, padding=True, truncation=True) - - # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, padding=True, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - data_args = { - "task_name": "xsum", - "path": "./hf_transformers/tests/fixtures/tests_samples/xsum/sample.json", - } - dataset = load_dataset("json", data_files=data_args["path"]) - train_dataset = dataset["train"] - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - desc="Running tokenizer on train dataset", - ) - return train_dataset - @require_torch class T5AdapterTest( From 188a2311f732c80c2e03550a6654610da781ebfe Mon Sep 17 00:00:00 2001 From: calpt Date: Sun, 17 Sep 2023 12:09:02 +0200 Subject: [PATCH 5/6] Bump to v4.33.2 --- hf_transformers | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hf_transformers b/hf_transformers index fa6107c97e..6da93f5580 160000 --- a/hf_transformers +++ b/hf_transformers @@ -1 +1 @@ -Subproject commit fa6107c97edf7cf725305a34735a57875b67d85e +Subproject commit 6da93f5580e109fad5f7b523cf2b6e8a5bafb623 diff --git a/setup.py b/setup.py index 86e2063880..0a0e73ad84 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ "sphinx-multiversion", "timeout-decorator", "torch>=1.10,!=1.12.0", - "transformers==4.33.1", + "transformers==4.33.2", "beautifulsoup4", ] From 3bd1a2e0da4c1ee8db713aacfb30148ecb3daa2f Mon Sep 17 00:00:00 2001 From: calpt Date: Wed, 4 Oct 2023 23:24:41 +0200 Subject: [PATCH 6/6] Bump to v4.33.3 --- hf_transformers | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hf_transformers b/hf_transformers index 6da93f5580..bffac926ca 160000 --- a/hf_transformers +++ b/hf_transformers @@ -1 +1 @@ -Subproject commit 6da93f5580e109fad5f7b523cf2b6e8a5bafb623 +Subproject commit bffac926ca6bc6c965a92bfbfd00c567a2c0fb90 diff --git a/setup.py b/setup.py index 0a0e73ad84..aa83e52ec2 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ "sphinx-multiversion", "timeout-decorator", "torch>=1.10,!=1.12.0", - "transformers==4.33.2", + "transformers==4.33.3", "beautifulsoup4", ]
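
For reference, a minimal usage sketch of the sequence classification head that PATCH 3/6 adds to `T5AdapterModel` (pooled from the last EOS token of the decoder output). The checkpoint name, head name, and label count below are illustrative assumptions, not taken from the patches:

    # Sketch only: exercises the new add_classification_head() API on T5.
    from adapters import T5AdapterModel

    # Illustrative checkpoint; any T5 checkpoint compatible with the adapters
    # package should work the same way.
    model = T5AdapterModel.from_pretrained("t5-small")

    # New in this patch series: a classification head on T5. Defaults follow
    # the added method signature: num_labels=2, layers=2,
    # activation_function="tanh", multilabel=False.
    model.add_classification_head("sentiment", num_labels=2)

With this head active, a forward pass with `input_ids` (and `labels` for training) routes the decoder's EOS-token representation through the classification head instead of the seq2seq LM head, as implemented in the updated `forward()` of `adapter_model.py`.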