diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 9d8f23f9dd..3c52bbcd6a 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -167,7 +167,7 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
 
         bits (`int`, defaults to 8):
             The number of bits to quantize to.
-        sym (`bool`, *optional*, defaults to `False`):
+        sym (`bool`, defaults to `False`):
             Whether to use symetric quantization.
         tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
             The tokenizer used to process the dataset. You can pass either:
@@ -177,26 +177,24 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin):
                     user or organization name, like `dbmdz/bert-base-german-cased`.
                 - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                     using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
-        dataset (`Union[List[str]]`, *optional*):
+        dataset (`str or List[str]`, *optional*):
             The dataset used for data-aware compression. You can provide your own dataset in a list of string or just use the
             the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs or
-            ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models
-        group_size (`int`, *optional*, defaults to 128):
-            The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
-        ratio (`float`, *optional*, defaults to 1.0):
+            ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for SD models.
+        ratio (`float`, defaults to 1.0):
             The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
             and the rest to INT8_ASYM).
+        group_size (`int`, *optional*):
+            The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
         all_layers (`bool`, *optional*):
             Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit presicion.
-        sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
+        sensitivity_metric (`str`, *optional*):
             The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy of the model,
             the more sensitive layers receives a higher precision.
-        awq (`bool`, *optional*):
-            Enables AWQ method to unify weight ranges and improve overall model accuracy.
-        ignored_scope (`nncf.IgnoredScope`, *optional*):
+        ignored_scope (`dict`, *optional*):
             An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization.
-        subset_size (`int`, *optional*, defaults to 128):
-            Number of data samples to calculate activation statistics.
+        num_samples (`int`, *optional*):
+            The maximum number of samples composing the calibration dataset.
     """
@@ -205,13 +203,13 @@ def __init__(
         bits: int = 8,
         sym: bool = False,
         tokenizer: Optional[Any] = None,
-        dataset: Optional[str] = None,
+        dataset: Optional[Union[str, List[str]]] = None,
         ratio: float = 1.0,
         group_size: Optional[int] = None,
         all_layers: Optional[bool] = None,
         sensitivity_metric: Optional[str] = None,
         ignored_scope: Optional[dict] = None,
-        subset_size: int = 128,
+        num_samples: Optional[int] = None,
         **kwargs,
     ):
         self.bits = bits
@@ -223,7 +221,7 @@ def __init__(
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
         self.ignored_scope = ignored_scope
-        self.subset_size = subset_size
+        self.num_samples = num_samples
         self.quant_method = "default"  # TODO : enable AWQ after nncf v2.9.0 release
         self.post_init()
 
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 92a2ce436d..aab94c9e99 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -635,7 +635,8 @@ def _from_pretrained(
 
             # from optimum.gptq.utils import get_seqlen
             # seqlen = get_seqlen(causal_model)
-            dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32)
+            nsamples = quantization_config.num_samples if quantization_config.num_samples else 128
+            dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
             dataset = prepare_dataset(dataset)
             quantization_config = copy.deepcopy(quantization_config)
             quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x))
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 7d9e625d9f..c6c963355d 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -17,6 +17,7 @@
 import math
 import os
 import shutil
+from copy import deepcopy
 from pathlib import Path
 from tempfile import TemporaryDirectory, gettempdir
 from typing import Any, Dict, List, Optional, Union
@@ -35,7 +36,6 @@
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
 from huggingface_hub import snapshot_download
-from nncf import Dataset
 from openvino._offline_transformations import compress_model_transformation
 from openvino.runtime import Core
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
@@ -276,17 +276,15 @@ def _from_pretrained(
                 kwargs[name] = load_method(new_model_save_dir)
 
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
-
-        dataset = None
-        if quantization_config:
-            dataset = quantization_config.dataset
-            quantization_config.dataset = None  # apply weight compression without dataset
-
+        weight_quantization_config = deepcopy(quantization_config)
         unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name
-        if quantization_config and dataset is None:
-            unet = cls.load_model(unet_path, quantization_config)
-        else:
+        if weight_quantization_config is not None and weight_quantization_config.dataset is not None:
+            # load the UNet model uncompressed to apply hybrid quantization further
             unet = cls.load_model(unet_path)
+            # Apply weights compression to other `components` without dataset
+            weight_quantization_config.dataset = None
+        else:
+            unet = cls.load_model(unet_path, quantization_config)
 
         components = {
             "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
@@ -296,12 +294,12 @@ def _from_pretrained(
         }
 
         for key, value in components.items():
-            components[key] = cls.load_model(value, quantization_config) if value.is_file() else None
+            components[key] = cls.load_model(value, weight_quantization_config) if value.is_file() else None
 
         if model_save_dir is None:
             model_save_dir = new_model_save_dir
 
-        if quantization_config and dataset is not None:
+        if quantization_config and quantization_config.dataset is not None:
             sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
 
             supported_pipelines = (
@@ -313,23 +311,23 @@ def _from_pretrained(
                 raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")
 
             num_inference_steps = 4 if isinstance(sd_model, OVLatentConsistencyModelPipeline) else 50
-            quantization_config.dataset = dataset
+            nsamples = quantization_config.num_samples if quantization_config.num_samples else 200
+            dataset = deepcopy(quantization_config.dataset)
 
-            if isinstance(quantization_config.dataset, str):
+            if isinstance(dataset, str):
                 from .quantization import get_stable_diffusion_dataset
 
-                dataset_name = quantization_config.dataset
-                num_samples = math.ceil(quantization_config.subset_size / num_inference_steps)
-                quantization_config.dataset = get_stable_diffusion_dataset(dataset_name, num_samples)
+                num_unet_runs = math.ceil(nsamples / num_inference_steps)
+                dataset = get_stable_diffusion_dataset(dataset, num_unet_runs)
 
-            unet_inputs = sd_model.prepare_inputs(
-                quantization_config.dataset, quantization_config.subset_size, num_inference_steps
-            )
-            quantization_config.dataset = unet_inputs
+            unet_inputs = sd_model._prepare_unet_inputs(dataset, nsamples, num_inference_steps)
 
             from .quantization import _hybrid_quantization
 
-            unet = _hybrid_quantization(sd_model.unet.model, quantization_config)
+            hybrid_quantization_config = deepcopy(quantization_config)
+            hybrid_quantization_config.dataset = unet_inputs
+            hybrid_quantization_config.num_samples = nsamples
+            unet = _hybrid_quantization(sd_model.unet.model, hybrid_quantization_config)
 
         return cls(
             unet=unet,
@@ -340,27 +338,26 @@ def _from_pretrained(
             **kwargs,
         )
 
-    def prepare_inputs(
+    def _prepare_unet_inputs(
         self,
-        dataset: Dataset,
-        subset_size: int,
+        dataset: List[str],
+        num_samples: int,
         num_inference_steps: int,
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         **kwargs,
-    ) -> Dataset:
+    ) -> Dict[str, Any]:
         self.compile()
 
         calibration_data = []
         from .quantization import InferRequestWrapper
 
         self.unet.request = InferRequestWrapper(self.unet.request, calibration_data)
-        for prompt in dataset.get_inference_data():
+        for prompt in dataset:
             _ = self.__call__(prompt, num_inference_steps=num_inference_steps, height=height, width=width)
-            if len(calibration_data) >= subset_size:
-                break
+
         self.unet.request = self.unet.request.request
-        return Dataset(calibration_data)
+        return calibration_data[:num_samples]
 
     @classmethod
     def _from_transformers(
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 8bc9dbec7d..d4228a65fa 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -19,7 +19,7 @@
 from collections import deque
 from copy import deepcopy
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import nncf
 import openvino
@@ -548,7 +548,7 @@ def _remove_unused_columns(self, dataset: "Dataset"):
 
 def _weight_only_quantization(
     model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict]
-):
+) -> openvino.runtime.Model:
     config = quantization_config
     if isinstance(config, dict):
         config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -562,7 +562,8 @@ def _weight_only_quantization(
 
             from optimum.gptq.data import get_dataset, prepare_dataset
 
-            dataset = get_dataset(config.dataset, tokenizer, seqlen=32)
+            nsamples = config.num_samples if config.num_samples else 128
+            dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples)
             dataset = prepare_dataset(dataset)
 
     sensitivity_metric = None
@@ -588,7 +589,7 @@ def _weight_only_quantization(
         # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0
         ignored_scope=ignored_scope,
         dataset=dataset,
-        # subset_size=config.subset_size, # TODO : enable from nncf v2.9.0
+        # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0
     )
 
 
@@ -639,7 +640,7 @@ def _collect_ops_with_weights(model):
 
 def get_stable_diffusion_dataset(
     dataset_name: str, nsamples: int = 50, seed: int = 0, text_column: str = "caption"
-) -> nncf.Dataset:
+) -> List[str]:
     if dataset_name not in ["conceptual_captions", "laion/220k-GPT4Vision-captions-from-LIVIS", "laion/filtered-wit"]:
         raise ValueError(
             f"""You have entered a string value for dataset. You can only choose between
@@ -649,37 +650,46 @@ def get_stable_diffusion_dataset(
 
     data = load_dataset(dataset_name, split="train", streaming=True).shuffle(seed=seed).take(nsamples)
     dataset = [batch[text_column] for batch in data]
-    return nncf.Dataset(dataset)
+    return dataset
 
 
-def _hybrid_quantization(model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict]):
-    dataset = quantization_config.dataset
-    wc_ignored_scope = deepcopy(quantization_config.ignored_scope)
-
-    if isinstance(wc_ignored_scope, dict):
-        wc_ignored_scope["types"] = wc_ignored_scope.get("types", []) + ["Convolution"]
-    else:
-        assert wc_ignored_scope is None
-        wc_ignored_scope = {"types": ["Convolution"]}
+def _hybrid_quantization(
+    model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig
+) -> openvino.runtime.Model:
+    """
+    Quantize a model in hybrid mode with NNCF which means that we quantize:
+    weights of MatMul and Embedding layers and activations of other layers.
+    The optimization specifications defined in `quantization_config`.
+
+    Args:
+        model (`openvino.runtime.Model`):
+            The OpenVINO Runtime model for applying hybrid quantization.
+        quantization_config (`OVWeightQuantizationConfig`):
+            The configuration containing the parameters related to quantization.
+    Returns:
+        The OpenVINO Runtime model with applied hybrid quantization.
+    """
+    ignored_scope = quantization_config.ignored_scope if quantization_config.ignored_scope is not None else {}
 
     ops_to_compress = _collect_ops_with_weights(model)
-    ptq_ignored_scope = deepcopy(quantization_config.ignored_scope)
-    if isinstance(ptq_ignored_scope, dict):
-        ptq_ignored_scope["names"] = ptq_ignored_scope.get("names", []) + ops_to_compress
-    else:
-        assert ptq_ignored_scope is None
-        ptq_ignored_scope = {"names": ops_to_compress}
+    ptq_ignored_scope = deepcopy(ignored_scope)
+    ptq_ignored_scope["names"] = ignored_scope.get("names", []) + ops_to_compress
 
-    quantization_config.dataset = None  # Apply Weight Compression without dataset
-    quantization_config.ignored_scope = wc_ignored_scope
-    compressed_model = _weight_only_quantization(model, quantization_config)
+    wc_quantization_config = deepcopy(quantization_config)
+    wc_quantization_config.ignored_scope = ignored_scope
+    wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"]
+    # Apply Weight Compression without dataset
+    wc_quantization_config.dataset = None
+    compressed_model = _weight_only_quantization(model, wc_quantization_config)
 
+    subset_size = quantization_config.num_samples if quantization_config.num_samples else 200
     quantized_model = nncf.quantize(
-        compressed_model,
-        dataset,
+        model=compressed_model,
+        calibration_dataset=nncf.Dataset(quantization_config.dataset),
         model_type=nncf.ModelType.TRANSFORMER,
         ignored_scope=nncf.IgnoredScope(**ptq_ignored_scope),
+        # The SQ algo should be disabled for MatMul nodes because their weights are already compressed
         advanced_parameters=nncf.AdvancedQuantizationParameters(AdvancedSmoothQuantParameters(matmul=-1)),
-        subset_size=quantization_config.subset_size,
+        subset_size=subset_size,
     )
     return quantized_model
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 39dc01212c..9b8b5a7575 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -362,7 +362,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
     def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8):
         model_id = MODEL_NAMES[model_type]
-        quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", subset_size=5)
+        quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2)
         with tempfile.TemporaryDirectory() as tmp_dir:
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
 
@@ -373,18 +373,18 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f
             model.save_pretrained(tmp_dir)
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION)
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[2:])
     def test_ovmodel_hybrid_quantization_with_custom_dataset(
         self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8
     ):
         model_id = MODEL_NAMES[model_type]
-        dataset_name = "daspartho/stable-diffusion-prompts"
-        dataset = load_dataset(dataset_name, split="train", streaming=True)
-        quantization_dataset = nncf.Dataset(dataset, lambda x: x["prompt"])
+        dataset = [
+            "dream rose covered with clean crystal, sharp edges, transparent, beautiful, highly detailed, high render"
+        ]
         model = model_cls.from_pretrained(
            model_id,
            export=True,
-            quantization_config=OVWeightQuantizationConfig(bits=8, dataset=quantization_dataset, subset_size=3),
+            quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3),
         )
         num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet)
         self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
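
Usage note (not part of the patch): a minimal sketch of how the renamed `num_samples` option is meant to be used for hybrid quantization of a diffusion pipeline, mirroring `test_ovmodel_hybrid_quantization` above. The model id is a placeholder; any exportable Stable Diffusion checkpoint should behave the same way.

    from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

    # Passing a dataset triggers hybrid quantization: the UNet is calibrated on at most
    # `num_samples` collected inputs (200 when left unset), while the remaining components
    # receive weight-only compression without a dataset.
    quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=200)
    pipeline = OVStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # placeholder model id
        export=True,
        quantization_config=quantization_config,
    )

For causal language models, the same field caps the number of calibration samples passed to `optimum.gptq`'s `get_dataset` (128 when left unset), replacing the former `subset_size` argument.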