diff --git a/src/open_clip/tokenizer.py b/src/open_clip/tokenizer.py index 0c6315fed..3b762c2fa 100644 --- a/src/open_clip/tokenizer.py +++ b/src/open_clip/tokenizer.py @@ -21,7 +21,6 @@ _nltk_init = False DEFAULT_CONTEXT_LENGTH = 77 # default context length for OpenAI CLIP -RE_DEDUPLICATE_WHITESPACE = re.compile(r"\s+") @lru_cache() @@ -71,7 +70,7 @@ def basic_clean(text): def whitespace_clean(text): - text = RE_DEDUPLICATE_WHITESPACE.sub(" ", text) + text = " ".join(text.split()) text = text.strip() return text @@ -127,7 +126,7 @@ def canonicalize_text( else: text = text.translate(trans_punctuation) text = text.lower() - text = RE_DEDUPLICATE_WHITESPACE.sub(" ", text) + text = " ".join(text.split()) return text.strip() @@ -191,7 +190,7 @@ def bpe(self, token): j = word.index(first, i) new_word.extend(word[i:j]) i = j - except: + except Exception: new_word.extend(word[i:]) break