Skip to content

Commit

Permalink
Update encoding methods in BaseChunker to use encode and batch_encode_plus for improved compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
bhavnicksm committed Nov 6, 2024
1 parent 78730e3 commit 7f719c1
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/chonkie/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def _get_tokenizer_backend(self):
def _encode(self, text: str):
"""Encode text using the backend tokenizer."""
if self._tokenizer_backend == "transformers":
return self.tokenizer(text)
return self.tokenizer.encode(text)
elif self._tokenizer_backend == "tokenizers":
return self.tokenizer.encode(text).ids
elif self._tokenizer_backend == "tiktoken":
Expand All @@ -50,7 +50,7 @@ def _encode(self, text: str):
def _encode_batch(self, texts: List[str]):
"""Encode a batch of texts using the backend tokenizer."""
if self._tokenizer_backend == "transformers":
return self.tokenizer(texts)
return self.tokenizer.batch_encode_plus(texts)['input_ids']
elif self._tokenizer_backend == "tokenizers":
return self.tokenizer.encode_batch(texts)
elif self._tokenizer_backend == "tiktoken":
Expand Down

0 comments on commit 7f719c1

Please sign in to comment.