From 7f719c17cc132d724dfb3c77c8ad863988471bf5 Mon Sep 17 00:00:00 2001 From: bhavnicksm Date: Wed, 6 Nov 2024 20:16:51 +0530 Subject: [PATCH] Update encoding methods in BaseChunker to use encode and batch_encode_plus for improved compatibility --- src/chonkie/chunker/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py index 805a30e..eae8ada 100644 --- a/src/chonkie/chunker/base.py +++ b/src/chonkie/chunker/base.py @@ -39,7 +39,7 @@ def _get_tokenizer_backend(self): def _encode(self, text: str): """Encode text using the backend tokenizer.""" if self._tokenizer_backend == "transformers": - return self.tokenizer(text) + return self.tokenizer.encode(text) elif self._tokenizer_backend == "tokenizers": return self.tokenizer.encode(text).ids elif self._tokenizer_backend == "tiktoken": @@ -50,7 +50,7 @@ def _encode(self, text: str): def _encode_batch(self, texts: List[str]): """Encode a batch of texts using the backend tokenizer.""" if self._tokenizer_backend == "transformers": - return self.tokenizer(texts) + return self.tokenizer.batch_encode_plus(texts)['input_ids'] elif self._tokenizer_backend == "tokenizers": return self.tokenizer.encode_batch(texts) elif self._tokenizer_backend == "tiktoken":