Skip to content

Commit

Permalink
Add 'auto' dataset option
Browse files Browse the repository at this point in the history
  • Loading branch information
nikita-savelyevv committed Nov 26, 2024
1 parent 7c072e1 commit 09b3927
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 10 deletions.
7 changes: 5 additions & 2 deletions optimum/commands/export/openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,11 @@ def parse_args_openvino(parser: "ArgumentParser"):
default=None,
help=(
"The dataset used for data-aware compression or quantization with NNCF. "
"You can use the one from the list ['wikitext2','c4','c4-new'] for language models "
"or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
"For language models you can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the "
"dataset will be collected from model's generations. "
"For diffusion models use one of ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS',"
"'laion/filtered-wit']. "
"For visual language models the dataset must be set to 'contextual'."
),
)
optional_group.add_argument(
Expand Down
11 changes: 7 additions & 4 deletions optimum/intel/openvino/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,9 +314,12 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
- A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
dataset (`str or List[str]`, *optional*):
The dataset used for data-aware compression with NNCF. For language models you can provide your own dataset
in a list of strings or just use the one from the list ['wikitext2','c4','c4-new']. For diffusion models it
must be one of ['conceptual_captions', 'laion/220k-GPT4Vision-captions-from-LIVIS', 'laion/filtered-wit'].
The dataset used for data-aware compression with NNCF.
- For language models you can provide your own dataset in a list of strings or just use one from the list
['auto', 'wikitext2','c4','c4-new']. With 'auto' the dataset will be collected from model's generations.
- For diffusion models the dataset must be one of ['conceptual_captions',
'laion/220k-GPT4Vision-captions-from-LIVIS', 'laion/filtered-wit'].
- For visual language models the dataset must be set to 'contextual'.
Alternatively, you can provide data objects via the `calibration_dataset` argument of the
`OVQuantizer.quantize()` method.
ratio (`float`, defaults to 1.0):
Expand Down Expand Up @@ -423,7 +426,7 @@ def post_init(self):
f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
)
if self.dataset is not None and isinstance(self.dataset, str):
lm_datasets = ["wikitext2", "c4", "c4-new"]
lm_datasets = ["wikitext2", "c4", "c4-new", "auto"]
visual_lm_datasets = list(PREDEFINED_VISUAL_LM_DATASETS.keys())
stable_diffusion_datasets = list(PREDEFINED_SD_DATASETS.keys())
if self.dataset not in lm_datasets + visual_lm_datasets + stable_diffusion_datasets:
Expand Down
6 changes: 5 additions & 1 deletion optimum/intel/openvino/quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,11 @@ def _prepare_causal_lm_dataset(self, quantization_config: OVWeightQuantizationCo
nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
config_dataset = quantization_config.dataset
if isinstance(config_dataset, str):
calibration_dataset = get_dataset(config_dataset, tokenizer, seqlen=32, nsamples=nsamples)
if config_dataset == "auto":
generated_data = nncf.data.generate_text_data(self.model, tokenizer, dataset_size=nsamples)
calibration_dataset = [tokenizer(text, return_tensors="pt") for text in generated_data]
else:
calibration_dataset = get_dataset(config_dataset, tokenizer, seqlen=32, nsamples=nsamples)
elif isinstance(config_dataset, list) and all(isinstance(it, str) for it in config_dataset):
calibration_dataset = [tokenizer(text, return_tensors="pt") for text in config_dataset[:nsamples]]
else:
Expand Down
2 changes: 1 addition & 1 deletion tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ class OVCLIExportTestCase(unittest.TestCase):
(
"text-generation-with-past",
"llama_awq",
"int4 --ratio 1.0 --sym --group-size 16 --lora --dataset wikitext2 --num-samples 1",
"int4 --ratio 1.0 --sym --group-size 16 --lora --dataset auto --num-samples 16",
{"int8": 60, "int4": 14},
),
("text-generation-with-past", "llama_awq", "int4 --group-size 16 --backup-precision none", {"int4": 28}),
Expand Down
4 changes: 2 additions & 2 deletions tests/openvino/test_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,8 @@ class OVWeightCompressionTest(unittest.TestCase):
dict(
bits=4,
group_size=16,
num_samples=1,
dataset="c4",
num_samples=16,
dataset="auto",
lora=True,
),
{"int4": 28, "int8": 60},
Expand Down

0 comments on commit 09b3927

Please sign in to comment.