From f079e6fa36ab784e6113ede32202b7082f5ccce0 Mon Sep 17 00:00:00 2001
From: Scott Lundberg
Date: Fri, 22 Dec 2023 18:39:19 +0000
Subject: [PATCH] Fix #555 (patch huggingface bug)

---
 guidance/models/transformers/_transformers.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 049247e58..28f3471b6 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -71,7 +71,15 @@ def __init__(self, model=None, tokenizer=None, echo=True, caching=True, temperat
         self._cache_state["cache_token_ids"] = []
 
     def _joint_tokenize(self, token_ids):
-        return self._orig_tokenizer(self._orig_tokenizer.decode(token_ids), add_special_tokens=False)["input_ids"]
+        first_decode = self._orig_tokenizer.decode(token_ids)
+        new_ids = self._orig_tokenizer(first_decode, add_special_tokens=False)["input_ids"]
+
+        # HACK: check for a bug in the HuggingFace tokenizer (that will just add extra spaces during an encode-decode cycle)
+        second_decode = self._orig_tokenizer.decode(new_ids)
+        if second_decode != first_decode and len(second_decode) == len(first_decode) + 1 and second_decode.startswith(" "):
+            new_ids = new_ids[0:1] + new_ids[2:]
+
+        return new_ids
 
     def _model_and_tokenizer(self, model, tokenizer, **kwargs):