Upgrade Transformers to v4.33.3 (#586)
Upgrade notes:
- Remove copying of model classes for GPT-2 and GPT-J, due to changes to Transformers merged in huggingface/transformers#25188.
- Add the extra `pad_to_multiple_of` argument to `_resize_token_embeddings()` (see the sketch after this list).
- Add a sequence classification head to T5 and use it in the adapter tests.
- Fix the test config of Llama.
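
A minimal sketch of what the `_resize_token_embeddings()` change enables, assuming the resize is triggered through the public `resize_token_embeddings()` API as in upstream Transformers; the checkpoint name and the multiple-of-8 padding are illustrative only:

```python
# Sketch only: recent Transformers releases (including 4.33.3) forward an
# optional pad_to_multiple_of argument through resize_token_embeddings(),
# so the flex-head override of _resize_token_embeddings() must accept it too.
import adapters
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = adapters.AutoAdapterModel.from_pretrained("roberta-base")

tokenizer.add_tokens(["<new_token>"])
# Internally calls _resize_token_embeddings(new_num_tokens, pad_to_multiple_of).
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
```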
calpt authored Oct 5, 2023
1 parent 009649b commit 5652c0b
Showing 16 changed files with 142 additions and 504 deletions.
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -1,7 +1,7 @@
 # Installation
 
 The `adapters` package is designed as an add-on for Hugging Face's Transformers library.
-It currently supports Python 3.7+ and PyTorch 1.3.1+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first.
+It currently supports Python 3.8+ and PyTorch 1.10+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first.
 
 ```{eval-rst}
 .. important::
2 changes: 1 addition & 1 deletion hf_transformers
Submodule hf_transformers updated 1003 files
6 changes: 3 additions & 3 deletions setup.py
@@ -21,7 +21,7 @@
 # We try to follow their general layout wherever sensible.
 
 _deps = [
-    "accelerate>=0.20.1",
+    "accelerate>=0.20.3",
     "black==22.3", # after updating to black 2023, also update Python version in pyproject.toml to 3.7
     "datasets!=2.5.0",
     "dill<0.3.5",
@@ -60,8 +60,8 @@
"sphinx-intl",
"sphinx-multiversion",
"timeout-decorator",
"torch>=1.7,!=1.12.0",
"transformers==4.31.0",
"torch>=1.10,!=1.12.0",
"transformers==4.33.3",
"beautifulsoup4",
]

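
A hedged sanity-check sketch (not part of the diff) against the new pins; the version strings are taken from the lines above:

```python
# Sketch only: verify that the installed environment satisfies the updated pins.
import torch
import transformers
from packaging import version

assert transformers.__version__ == "4.33.3"
assert version.parse(torch.__version__) >= version.parse("1.10")
assert version.parse(torch.__version__) != version.parse("1.12.0")
```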
14 changes: 14 additions & 0 deletions src/adapters/head_utils.py
@@ -483,6 +483,20 @@
         },
         "layers": [None, "qa_outputs"],
     },
+    "T5ForSequenceClassification": {
+        "config": {
+            "head_type": "classification",
+            "layers": 2,
+            "activation_function": "tanh",
+        },
+        "layers": [
+            None,
+            "classification_head.dense",
+            None,
+            None,
+            "classification_head.out_proj",
+        ],
+    },
     "DebertaV2ForSequenceClassification": {
         "config": {
             "head_type": "classification",
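
A hedged sketch of how the new mapping lines up with the flex-head API; the checkpoint and head names are placeholders, and the keyword arguments simply mirror the config entry above:

```python
# Sketch only: a T5 flex-head classification head shaped like the new
# T5ForSequenceClassification mapping (2 layers, tanh activation).
from adapters import T5AdapterModel

model = T5AdapterModel.from_pretrained("t5-small")
model.add_classification_head(
    "example_head",              # placeholder head name
    num_labels=3,
    layers=2,                    # dense -> out_proj, as in the mapping above
    activation_function="tanh",
)
```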
47 changes: 28 additions & 19 deletions src/adapters/heads/base.py
@@ -554,9 +554,9 @@ def tie_weights(self):
                 self = getattr(self, self.base_model_prefix)
             self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix)
 
-    def _resize_token_embeddings(self, new_num_tokens):
+    def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
         old_embeddings = self.get_input_embeddings()
-        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
         self.set_input_embeddings(new_embeddings)
 
         # if word embeddings are not tied, make sure that lm head is resized as well
@@ -730,6 +730,27 @@ def delete_head(self, head_name: str):
         if self.active_head == head_name:
             self.active_head = None
 
+    def _get_used_heads(self, head_name: str = None):
+        if head_name:
+            used_heads = [head_name]
+        # together with context, check if we have heads at all to allow for models without heads
+        elif len(self.heads) > 0 and AdapterSetup.get_context_head_setup():
+            used_heads = AdapterSetup.get_context_head_setup()
+            if isinstance(used_heads, str):
+                used_heads = [used_heads]
+        elif self._active_heads:
+            used_heads = self._active_heads
+        else:
+            return []
+
+        head_modules = []
+        for head in used_heads:
+            if head not in self.heads:
+                raise ValueError("Unknown head_name '{}'".format(head))
+            head_modules.append(self.heads[head])
+
+        return head_modules
+
     def forward_head(
         self, all_outputs, head_name=None, cls_output=None, attention_mask=None, return_dict=False, **kwargs
     ):
@@ -750,16 +771,8 @@ def forward_head(
             return_dict (bool): Whether or not to return a ``ModelOutput`` instead of a plain tuple.
             **kwargs: Additional keyword arguments passed to the forward pass of the head.
         """
-        if head_name:
-            used_heads = [head_name]
-        # together with context, check if we have heads at all to allow for models without heads
-        elif len(self.heads) > 0 and AdapterSetup.get_context_head_setup():
-            used_heads = AdapterSetup.get_context_head_setup()
-            if isinstance(used_heads, str):
-                used_heads = [used_heads]
-        elif self._active_heads:
-            used_heads = self._active_heads
-        else:
+        used_head_modules = self._get_used_heads(head_name)
+        if len(used_head_modules) == 0:
             logger.debug("No prediction head is used.")
             return all_outputs
 
@@ -787,9 +800,6 @@ def _get_head_input(outputs, cls_out, batch):
         if inv_adapter:
             kwargs["invertible_adapter"] = inv_adapter
 
-        for head in used_heads:
-            if head not in self.heads:
-                raise ValueError("Unknown head_name '{}'".format(head))
         if isinstance(self.active_head, BatchSplit):
             if sum(self.active_head.batch_sizes) != all_outputs[0].size()[0]:
                 raise ValueError(
@@ -830,14 +840,13 @@ def _get_head_input(outputs, cls_out, batch):
                 else None
             )
             return_output = MultiHeadOutput(head_outputs=head_outputs, loss=combined_loss)
-        elif len(used_heads) > 1:
+        elif len(used_head_modules) > 1:
             head_outputs = []
-            for head in used_heads:
-                head_module = self.heads[head]
+            for head_module in used_head_modules:
                 head_outputs.append(head_module(all_outputs, cls_output, attention_mask, return_dict, **kwargs))
             return_output = MultiHeadOutput(head_outputs=head_outputs)
         else:
-            head_module = self.heads[used_heads[0]]
+            head_module = used_head_modules[0]
             return_output = head_module(all_outputs, cls_output, attention_mask, return_dict, **kwargs)
 
         if isinstance(return_output, ModelOutput):
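
For context, a hedged usage sketch of the extracted helper; the checkpoint and head names are placeholders, and `forward_head()` now performs the same lookup and raises the same `ValueError` for unknown names:

```python
# Sketch only: _get_used_heads() resolves head modules from an explicit name,
# the AdapterSetup context, or the currently active heads.
import adapters

model = adapters.AutoAdapterModel.from_pretrained("roberta-base")
model.add_classification_head("demo", num_labels=2)

head_modules = model._get_used_heads("demo")
print([type(m).__name__ for m in head_modules])    # e.g. ['ClassificationHead']

try:
    model._get_used_heads("missing")
except ValueError as err:                          # unknown head name
    print(err)
```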
5 changes: 4 additions & 1 deletion src/adapters/models/__init__.py
@@ -15,7 +15,8 @@
     CLIPVisionModelAdaptersMixin,
 )
 from .distilbert.mixin_distilbert import DistilBertModelAdaptersMixin, DistilBertTransformerAdaptersMixin
-from .gptj.mixin_gptj import GPTJMLPAdaptersMixin
+from .gpt2.mixin_gpt2 import GPT2ModelAdapterMixin
+from .gptj.mixin_gptj import GPTJMLPAdaptersMixin, GPTJModelAdapterMixin
 from .llama.mixin_llama import LlamaModelAdapterMixin
 from .t5.mixin_t5 import T5BlockAdaptersMixin, T5ModelAdaptersMixin, T5ModelAdaptersWithHeadsMixin
 from .vit.mixin_vit import ViTIntermediateAdaptersMixin, ViTModelAdaptersMixin
@@ -49,7 +50,9 @@
"MBartDecoder": BartDecoderAdaptersMixin,
"MBartDecoderWrapper": BartDecoderWrapperAdaptersMixin,
"MBartModel": BartModelAdaptersMixin,
"GPT2Model": GPT2ModelAdapterMixin,
"GPTJMLP": GPTJMLPAdaptersMixin,
"GPTJModel": GPTJModelAdapterMixin,
"RobertaLayer": BertLayerAdaptersMixin,
"RobertaModel": BertModelAdaptersMixin,
"T5Block": T5BlockAdaptersMixin,
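
With `GPT2Model` and `GPTJModel` registered in this mapping, adapter support comes from wrapping the stock Transformers classes instead of maintaining copied model code; a hedged sketch (the adapter name is a placeholder):

```python
# Sketch only: the mixin mapping lets adapters.init() attach
# GPT2ModelAdapterMixin behaviour to a plain transformers GPT2Model.
import adapters
from transformers import GPT2Model

model = GPT2Model.from_pretrained("gpt2")
adapters.init(model)                 # injects the registered adapter mixin
model.add_adapter("demo_adapter")    # placeholder adapter name
model.set_active_adapters("demo_adapter")
```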
4 changes: 2 additions & 2 deletions src/adapters/models/gpt2/mixin_gpt2.py
@@ -5,7 +5,7 @@
 from ...layer import AdapterLayer
 from ...lora import Linear as LoRALinear
 from ...lora import MergedLinear as LoRAMergedLinear
-from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin
+from ...model_mixin import EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin
 from ...prefix_tuning import PrefixTuningShim
 
 
@@ -54,7 +54,7 @@ def init_adapters(self, model_config, adapters_config):
         self.output_adapters = AdapterLayer("output_adapter")
 
 
-class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin):
+class GPT2ModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelBaseAdaptersMixin):
     def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]:
         for i, layer in enumerate(self.base_model.h):
             yield i, layer
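
The `iter_layers()` hook shown above is how the library locates the blocks to adapt; a hedged sketch of what it yields on a wrapped GPT-2 model:

```python
# Sketch only: iter_layers() enumerates GPT-2's transformer blocks
# (base_model.h), the injection points for adapter modules.
import adapters
from transformers import GPT2Model

gpt2 = GPT2Model.from_pretrained("gpt2")
adapters.init(gpt2)
for idx, block in gpt2.iter_layers():
    print(idx, type(block).__name__)   # 0 GPT2Block, 1 GPT2Block, ...
```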
(Diffs for the remaining changed files are not shown here.)
