Skip to content

Commit

Permalink
✅ Based on the PR comments, changed test case to check for an expected number instead of checking if length is non-zero; added `return_attention_mask=True` in the `run_tokenizer` method
Browse files Browse the repository at this point in the history

Signed-off-by: m-misiura <mmisiura@redhat.com>
  • Loading branch information
m-misiura committed Dec 4, 2024
1 parent 261e1a3 commit f629248
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ def run_tokenizer(
The token count
"""
error.type_check("<NLP48137045E>", str, text=text)
tokenized_output = self.model.tokenizer(text)
tokenized_output = self.model.tokenizer(text, return_attention_mask=True)
return TokenizationResults(
token_count=len(tokenized_output["input_ids"]),
)
Expand Down
4 changes: 2 additions & 2 deletions tests/modules/text_generation/test_text_generation_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,10 +228,10 @@ def test_run_tokenizer_edge_cases(disable_wip, set_cpu_device):
short_text = "This is a test sentence."
short_result = model.run_tokenizer(short_text)
assert isinstance(short_result, TokenizationResults)
assert short_result.token_count > 0
assert short_result.token_count == len(model.model.tokenizer.encode(short_text))

# Edge case: Long input
long_text = "This is a test sentence. " * 1000
long_result = model.run_tokenizer(long_text)
assert isinstance(long_result, TokenizationResults)
assert long_result.token_count > 0
assert long_result.token_count == len(model.model.tokenizer.encode(long_text))

0 comments on commit f629248

Please sign in to comment.