From 07ab8ae86ba8b459b9007f992168b7a112d0d09a Mon Sep 17 00:00:00 2001
From: Santiago Castro
Date: Tue, 17 Oct 2023 21:18:34 -0400
Subject: [PATCH] Fix `random_mask_tokenize` when the text is long

Without this patch, the function crashes for long texts. See https://colab.research.google.com/drive/1SHBAUEnI1dNJmXQPUqZekFqXm7xrwH65?usp=sharing
---
 src/open_clip/tokenizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/open_clip/tokenizer.py b/src/open_clip/tokenizer.py
index 3e651aed5..954ebb6d2 100644
--- a/src/open_clip/tokenizer.py
+++ b/src/open_clip/tokenizer.py
@@ -250,7 +250,7 @@ def random_mask_tokenize(texts: Union[str, List[str]], context_length: int = 77)
         if len(tokens) > context_length - 2: # 2 for sot and eot token
             indices = np.random.permutation(len(tokens)).tolist()
             indices = indices[:context_length - 2]
-            tokens = tokens[indices]
+            tokens = [tokens[i] for i in indices]
         tokens = [sot_token,] + tokens + [eot_token,]
         result[i, :len(tokens)] = torch.tensor(tokens)
 
@@ -350,4 +350,4 @@ def get_order(x):
             tokens[-1] = eot_token
         result[i, :len(tokens)] = torch.tensor(tokens)
 
-    return result
\ No newline at end of file
+    return result