Add 'auto' dataset option

huggingface · Nov 26, 2024 · 09b3927 · 09b3927
1 parent 7c072e1
commit 09b3927
Show file tree

Hide file tree

Showing 5 changed files with 20 additions and 10 deletions.
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
@@ -136,8 +136,11 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "The dataset used for data-aware compression or quantization with NNCF. "
-            "You can use the one from the list ['wikitext2','c4','c4-new'] for language models "
-            "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
+            "For language models you can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the "
+            "dataset will be collected from model's generations. "
+            "For diffusion models use one of ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS',"
+            "'laion/filtered-wit']. "
+            "For visual language models the dataset must be set to 'contextual'."
         ),
     )
     optional_group.add_argument(

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
@@ -314,9 +314,12 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
                 - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                     using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
         dataset (`str or List[str]`, *optional*):
-            The dataset used for data-aware compression with NNCF. For language models you can provide your own dataset
-            in a list of strings or just use the one from the list ['wikitext2','c4','c4-new']. For diffusion models it
-            must be one of ['conceptual_captions', 'laion/220k-GPT4Vision-captions-from-LIVIS', 'laion/filtered-wit'].
+            The dataset used for data-aware compression with NNCF.
+            - For language models you can provide your own dataset in a list of strings or just use one from the list
+                ['auto', 'wikitext2','c4','c4-new']. With 'auto' the dataset will be collected from model's generations.
+            - For diffusion models the dataset must be one of ['conceptual_captions',
+                'laion/220k-GPT4Vision-captions-from-LIVIS', 'laion/filtered-wit'].
+            - For visual language models the dataset must be set to 'contextual'.
             Alternatively, you can provide data objects via `calibration_dataset` argument of `OVQuantizer.quantize()`
             method.
         ratio (`float`, defaults to 1.0):
@@ -423,7 +426,7 @@ def post_init(self):
                 f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
             )
         if self.dataset is not None and isinstance(self.dataset, str):
-            lm_datasets = ["wikitext2", "c4", "c4-new"]
+            lm_datasets = ["wikitext2", "c4", "c4-new", "auto"]
             visual_lm_datasets = list(PREDEFINED_VISUAL_LM_DATASETS.keys())
             stable_diffusion_datasets = list(PREDEFINED_SD_DATASETS.keys())
             if self.dataset not in lm_datasets + visual_lm_datasets + stable_diffusion_datasets:

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
@@ -734,7 +734,11 @@ def _prepare_causal_lm_dataset(self, quantization_config: OVWeightQuantizationCo
         nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
         config_dataset = quantization_config.dataset
         if isinstance(config_dataset, str):
-            calibration_dataset = get_dataset(config_dataset, tokenizer, seqlen=32, nsamples=nsamples)
+            if config_dataset == "auto":
+                generated_data = nncf.data.generate_text_data(self.model, tokenizer, dataset_size=nsamples)
+                calibration_dataset = [tokenizer(text, return_tensors="pt") for text in generated_data]
+            else:
+                calibration_dataset = get_dataset(config_dataset, tokenizer, seqlen=32, nsamples=nsamples)
         elif isinstance(config_dataset, list) and all(isinstance(it, str) for it in config_dataset):
             calibration_dataset = [tokenizer(text, return_tensors="pt") for text in config_dataset[:nsamples]]
         else:

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
@@ -134,7 +134,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         (
             "text-generation-with-past",
             "llama_awq",
-            "int4 --ratio 1.0 --sym --group-size 16 --lora --dataset wikitext2 --num-samples 1",
+            "int4 --ratio 1.0 --sym --group-size 16 --lora --dataset auto --num-samples 16",
             {"int8": 60, "int4": 14},
         ),
         ("text-generation-with-past", "llama_awq", "int4 --group-size 16 --backup-precision none", {"int4": 28}),

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
@@ -313,8 +313,8 @@ class OVWeightCompressionTest(unittest.TestCase):
             dict(
                 bits=4,
                 group_size=16,
-                num_samples=1,
-                dataset="c4",
+                num_samples=16,
+                dataset="auto",
                 lora=True,
             ),
             {"int4": 28, "int8": 60},