refactor: move tokenizer_data_utils with the rest of utils, add further unit testing. (foundation-model-stack#348)

* cleanup: Move tokenizer_data_utils from /data to /utils and update imports (new import path sketched after the changed-files summary below)

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>

* tests: Add additional tests for test_embedding_resize to check resize with special tokens and resize with multiple_of; fmt

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>

* lint

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>

* fix: more thorough testing of the function's output

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>

* test: move assertion

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>

---------

Signed-off-by: Will Johnson <mwjohnson728@gmail.com>
willmj authored Oct 8, 2024
1 parent 63d0c5d commit ee2cd66
Showing 6 changed files with 75 additions and 22 deletions.
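For orientation, the consolidated call pattern after this move is sketched below. This is a minimal, illustrative sketch based only on the call sites and tests in this diff: the import path, the keyword arguments, and the return keys come from the changed files; the model name and the special-tokens dict are example values, not part of the commit.

# Minimal usage sketch (not part of this commit) showing the new import path.
# Model name and special tokens below are illustrative only.
from transformers import AutoModelForCausalLM, AutoTokenizer

from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0")
model = AutoModelForCausalLM.from_pretrained("Maykeye/TinyLLama-v0")

# Add special tokens and resize the model's embeddings, padding the
# embedding size up to a multiple of 8.
resize_result = tokenizer_and_embedding_resize(
    special_tokens_dict={"pad_token": "<PAD>"},
    tokenizer=tokenizer,
    model=model,
    multiple_of=8,
)

# The tests in this commit rely on these return keys.
print(resize_result["num_new_tokens"], resize_result["new_embedding_size"])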
4 changes: 2 additions & 2 deletions scripts/run_inference.py
@@ -34,7 +34,7 @@
import torch

# Local
from tuning.data import tokenizer_data_utils
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


### Utilities
@@ -219,7 +219,7 @@ def load(
# where the model's layers are modified, in our case the embedding layer
# is modified, so we resize the backbone model's embedding layer with our own
# utility before passing it along to load the PEFT model.
tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
{}, tokenizer=tokenizer, model=base_model
)
model = PeftModel.from_pretrained(
74 changes: 70 additions & 4 deletions tests/utils/test_embedding_resize.py
@@ -20,9 +20,10 @@
import torch

# Local
from tuning.data import tokenizer_data_utils
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

MODEL_NAME = "Maykeye/TinyLLama-v0"
INPUT_TEXT = "### Text: @NortonSupport Thanks much.\n\n### Label:"


def _inference(
@@ -41,16 +42,16 @@ def _inference(


def test_output_unaltered_across_embedding_resizes():
input_text = "### Text: @NortonSupport Thanks much.\n\n### Label:"
input_text = INPUT_TEXT
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_not_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
special_tokens_dict={}, tokenizer=tokenizer, model=model_resized, multiple_of=8
)

tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
special_tokens_dict={},
tokenizer=tokenizer,
model=model_not_resized,
@@ -74,3 +75,68 @@ def test_output_unaltered_across_embedding_resizes():
)

assert output_from_model_not_resized == output_from_model_resized


def test_resize_with_special_tokens():
input_text = INPUT_TEXT
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

input_tokenizer_len = len(tokenizer.get_vocab())

special_tokens = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
resize_result = tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens,
tokenizer=tokenizer,
model=model,
multiple_of=1,
)

assert "<SEP>" in tokenizer.get_vocab()
assert "<PAD>" in tokenizer.get_vocab()

output_tokenizer_len = len(tokenizer.get_vocab())

assert output_tokenizer_len == input_tokenizer_len + 2
assert resize_result["num_new_tokens"] == output_tokenizer_len - input_tokenizer_len

output = _inference(
tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
)
assert output is not None


def test_no_resize_when_no_special_tokens():
input_text = INPUT_TEXT
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

input_tokenizer_len = len(tokenizer.get_vocab())

resize_result = tokenizer_and_embedding_resize(
special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=1
)

output_tokenizer_len = len(tokenizer.get_vocab())

assert input_tokenizer_len == output_tokenizer_len
assert resize_result["num_new_tokens"] == 0

output = _inference(
tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
)

assert output is not None


def test_resize_with_multiple_of():
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

resize_result = tokenizer_and_embedding_resize(
special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=8
)

# Check the resized vocabulary size (number of embedding rows), not the hidden dimension
assert model.get_input_embeddings().num_embeddings % 8 == 0
assert resize_result["new_embedding_size"] % 8 == 0
assert model.get_output_embeddings().out_features % 8 == 0
2 changes: 1 addition & 1 deletion tests/utils/test_tokenizer_data_utils.py
@@ -7,7 +7,7 @@

# Local
# First party
from tuning.data.tokenizer_data_utils import tokenizer_and_embedding_resize
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


def test_tokenizer_and_embedding_resize_return_values():
13 changes: 0 additions & 13 deletions tuning/data/__init__.py

This file was deleted.

4 changes: 2 additions & 2 deletions tuning/sft_trainer.py
@@ -53,7 +53,6 @@
FileLoggingTrackerConfig,
TrackerConfigFactory,
)
from tuning.data import tokenizer_data_utils
from tuning.trackers.tracker_factory import FILE_LOGGING_TRACKER, get_tracker
from tuning.trainercontroller import TrainerControllerCallback
from tuning.utils.config_utils import get_hf_peft_config, get_json_config
@@ -70,6 +69,7 @@
is_pretokenized_dataset,
validate_data_args,
)
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


def train(
@@ -294,7 +294,7 @@ def train(

# TODO: lower priority but understand if resizing impacts inference quality and why its needed.
# It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
added_tokens_dict = tokenizer_data_utils.tokenizer_and_embedding_resize(
added_tokens_dict = tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model,
tuning/data/tokenizer_data_utils.py → tuning/utils/tokenizer_data_utils.py
File renamed without changes.
