Add a fix for custom code tokenizers in pipelines (huggingface#32300)
* Add a fix for the case when tokenizers are passed as a string

* Support image processors and feature extractors as well

* Reverting load_feature_extractor and load_image_processor

* Add test

* Test is torch-only

* Add tests for preprocessors and feature extractors and move test

* Extremely experimental fix

* Revert that change, wrong branch!

* Typo!

* Split tests
Rocketknight1 authored and zucchini-nlp committed Aug 30, 2024
1 parent e8435ea commit 625d48b
Showing 2 changed files with 44 additions and 1 deletion.
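
For context, the bug this commit fixes: a minimal reproduction, based on the new test below and on https://github.com/huggingface/transformers/issues/31669. Before the fix, passing the tokenizer as a repo-name string alongside a custom code model failed, because the pipeline factory never resolved the string into a tokenizer object:

    from transformers import pipeline

    # Custom-code model: its config class is not registered in TOKENIZER_MAPPING,
    # so the old load_tokenizer condition evaluated to False and the string
    # below was never turned into a tokenizer object.
    generator = pipeline(
        "text-generation",
        model="Rocketknight1/fake-custom-model-test",
        tokenizer="Rocketknight1/fake-custom-model-test",
        trust_remote_code=True,
    )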
6 changes: 5 additions & 1 deletion src/transformers/pipelines/__init__.py
@@ -904,7 +904,11 @@ def pipeline(
 
     model_config = model.config
     hub_kwargs["_commit_hash"] = model.config._commit_hash
-    load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
+    load_tokenizer = (
+        type(model_config) in TOKENIZER_MAPPING
+        or model_config.tokenizer_class is not None
+        or isinstance(tokenizer, str)
+    )
     load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
     load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
 
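The `load_tokenizer` flag computed above gates the branch further down in `pipeline()` that actually instantiates the tokenizer. A simplified sketch of that downstream logic (a paraphrase, not the verbatim source; the real code also handles `(name, kwargs)` tuples) shows why the new `isinstance(tokenizer, str)` clause matters: for a custom code model the config class is not in `TOKENIZER_MAPPING` and `tokenizer_class` is typically unset, so the flag used to stay `False` and a tokenizer passed as a repo-name string was handed to the pipeline unresolved:

    # Simplified paraphrase of the loading branch later in pipeline():
    if load_tokenizer:
        if isinstance(tokenizer, str):
            # Resolve the repo-name string into an actual tokenizer object.
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer, trust_remote_code=trust_remote_code, **hub_kwargs
            )
    # When load_tokenizer was False, this branch was skipped entirely and the
    # raw string reached the pipeline, which then failed when tokenizing.
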
39 changes: 39 additions & 0 deletions tests/pipelines/test_pipelines_common.py
@@ -26,10 +26,13 @@
 from requests.exceptions import HTTPError
 
 from transformers import (
+    AutomaticSpeechRecognitionPipeline,
     AutoModelForSequenceClassification,
     AutoTokenizer,
     DistilBertForSequenceClassification,
+    MaskGenerationPipeline,
     TextClassificationPipeline,
+    TextGenerationPipeline,
     TFAutoModelForSequenceClassification,
     pipeline,
 )
@@ -859,6 +862,42 @@ def new_forward(*args, **kwargs):
 
         self.assertEqual(self.COUNT, 1)
 
+    @require_torch
+    def test_custom_code_with_string_tokenizer(self):
+        # This test checks for an edge case - tokenizer loading used to fail when using a custom code model
+        # with a separate tokenizer that was passed as a repo name rather than a tokenizer object.
+        # See https://github.com/huggingface/transformers/issues/31669
+        text_generator = pipeline(
+            "text-generation",
+            model="Rocketknight1/fake-custom-model-test",
+            tokenizer="Rocketknight1/fake-custom-model-test",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(text_generator, TextGenerationPipeline)  # Assert successful loading
+
+    @require_torch
+    def test_custom_code_with_string_feature_extractor(self):
+        speech_recognizer = pipeline(
+            "automatic-speech-recognition",
+            model="Rocketknight1/fake-custom-wav2vec2",
+            feature_extractor="Rocketknight1/fake-custom-wav2vec2",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(speech_recognizer, AutomaticSpeechRecognitionPipeline)  # Assert successful loading
+
+    @require_torch
+    def test_custom_code_with_string_preprocessor(self):
+        mask_generator = pipeline(
+            "mask-generation",
+            model="Rocketknight1/fake-custom-sam",
+            processor="Rocketknight1/fake-custom-sam",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(mask_generator, MaskGenerationPipeline)  # Assert successful loading
+
 
 @require_torch
 @is_staging_test
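All three tests exercise the same root cause. A minimal sketch of why the old one-line condition evaluated to `False` for custom code models; `CustomConfig` here is a hypothetical stand-in for a config class loaded via `trust_remote_code`:

    from transformers import PretrainedConfig, TOKENIZER_MAPPING

    class CustomConfig(PretrainedConfig):  # hypothetical remote-code config class
        model_type = "fake-custom-model"

    config = CustomConfig()
    print(type(config) in TOKENIZER_MAPPING)   # False: remote-code classes are not registered
    print(config.tokenizer_class is not None)  # False unless the config sets tokenizer_class
    # With both checks False, tokenizer loading was skipped even when the caller
    # explicitly passed a tokenizer repo name as a string.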
