diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx
index bcb6ad93b7..11a2a286f5 100644
--- a/docs/source/optimization_ov.mdx
+++ b/docs/source/optimization_ov.mdx
@@ -75,6 +75,15 @@ For 4-bit quantization you can also specify the following arguments in the quant
 
 Smaller `group_size` and `ratio` values usually improve accuracy at the sacrifice of the model size and inference latency.
 
+The quality of a 4-bit weight-compressed model can be further improved by employing one of the following data-dependent methods:
+* AWQ, which stands for Activation-Aware Quantization, is an algorithm that tunes model weights for more accurate 4-bit compression. It slightly improves the generation quality of compressed LLMs, but requires significant additional time and memory to tune the weights on a calibration dataset. Please note that there may be no matching patterns in the model to apply AWQ to, in which case it will be skipped.
+* Scale Estimation is a method that tunes quantization scales to minimize the `L2` error between the original and compressed layers. Providing a dataset is required to run scale estimation. Using this method also incurs additional time and memory overhead.
+
+The AWQ and Scale Estimation algorithms can be applied together or separately. To do so, provide the corresponding arguments to the 4-bit `OVWeightQuantizationConfig` together with a dataset. For example:
+```python
+quantization_config = OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, quant_method="awq", scale_estimation=True, dataset="wikitext2")
+```
+
 ### Static quantization
 
 When applying post-training static quantization, both the weights and the activations are quantized.
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 07e1dcffae..f58b37228b 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -141,6 +141,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "case it will be skipped."
         ),
     )
+    optional_group.add_argument(
+        "--scale-estimation",
+        action="store_true",
+        default=None,
+        help=(
+            "Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original "
+            "and compressed layers. Providing a dataset is required to run scale estimation. Please note that "
+            "applying scale estimation takes additional memory and time."
+        ),
+    )
     optional_group.add_argument(
         "--sensitivity-metric",
         type=str,
@@ -255,6 +265,7 @@ def run(self):
                 "num_samples": self.args.num_samples,
                 "quant_method": QuantizationMethod.AWQ if self.args.awq else None,
                 "sensitivity_metric": self.args.sensitivity_metric,
+                "scale_estimation": self.args.scale_estimation,
             }
 
             if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index eb233f3d15..ab54e257c3 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -172,7 +172,19 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         num_samples (`int`, *optional*):
             The maximum number of samples composing the calibration dataset.
         quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT):
-            Weight compression method to apply.
+            Weight compression method to apply. Possible options:
+                - "default": default weight quantization will be applied.
+                - "awq": compressed weights will be computed according to the Activation-Aware-Quantization (AWQ)
+                  method. AWQ improves generation quality of INT4-compressed LLMs, but requires
+                  additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is
+                  required. Note: it's possible that there will be no matching patterns in the model to apply AWQ, in
+                  which case it will be skipped.
+                - "hybrid": The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and
+                  activations of other layers, facilitating accuracy preservation post-optimization while reducing
+                  the model size. Hybrid mode performs well when applied to a UNet model in diffusion pipelines.
+        scale_estimation (`bool`, *optional*):
+            Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
+            compressed layers. Providing a dataset is required to run scale estimation.
     """
 
     def __init__(
@@ -188,6 +200,7 @@ def __init__(
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = None,
         quant_method: Union[QuantizationMethod, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT,
+        scale_estimation: bool = None,
         **kwargs,
     ):
         super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
@@ -198,6 +211,7 @@ def __init__(
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
         self.quant_method = quant_method
+        self.scale_estimation = scale_estimation
         self.post_init()
 
     def post_init(self):
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 43cf1dd93b..3ddd01b720 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -823,6 +823,7 @@ def _weight_only_quantization(
         ignored_scope=config.get_ignored_scope_instance(),
         dataset=dataset,
         subset_size=config.num_samples if config.num_samples else 128,
+        scale_estimation=config.scale_estimation,
     )
 
 
diff --git a/setup.py b/setup.py
index 23fce0f827..6c35a264bc 100644
--- a/setup.py
+++ b/setup.py
@@ -60,8 +60,8 @@
 
 EXTRAS_REQUIRE = {
     "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
-    "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"],
-    "nncf": ["nncf>=2.10.0"],
+    "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"],
+    "nncf": ["nncf>=2.11.0"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index c81761bc9f..dadb3104a3 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -97,6 +97,13 @@ class OVCLIExportTestCase(unittest.TestCase):
             4,
             28,
         ),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ",
+            4,
+            28,
+        ),
     ]
 
     def _openvino_export(
@@ -218,6 +225,7 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec
         self.assertEqual(expected_int8, num_int8)
         self.assertEqual(expected_int4, num_int4)
         self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
+        self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)
 
     def test_exporters_cli_help(self):
         subprocess.run(
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index bae0ad772f..0a553848c4 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -230,6 +230,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="ptb",
                 quant_method=QuantizationMethod.AWQ,
+                scale_estimation=True,
             ),
             16,
         ),
@@ -456,8 +457,8 @@ def test_ovmodel_4bit_auto_compression_with_config(
         with tempfile.TemporaryDirectory() as tmp_dir:
            quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
-            if quantization_config.quant_method == QuantizationMethod.AWQ:
-                # TODO: Check that AWQ was actually applied
+            if quantization_config.quant_method == QuantizationMethod.AWQ or quantization_config.scale_estimation:
+                # TODO: Check that AWQ and SE were actually applied
                 pass
 
             tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -558,6 +559,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self):
                 "ignored_scope": nncf.IgnoredScope(),
                 "awq": None,
                 "subset_size": 128,
+                "scale_estimation": None,
             }
             compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)
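
Taken together, the patch threads the new `scale_estimation` option from the `--scale-estimation` CLI flag through `OVWeightQuantizationConfig` down to the `nncf.compress_weights()` call in `_weight_only_quantization()`. The sketch below shows roughly how the option would be used from the Python API once the patch is applied; the model id and output directory are illustrative placeholders, and the quantization arguments simply mirror the ones used in the documentation example and tests above.

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Illustrative 4-bit configuration combining AWQ with the new Scale Estimation
# option; both are data-dependent, so a calibration dataset must be provided.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    sym=False,
    ratio=0.8,
    group_size=128,
    quant_method="awq",
    scale_estimation=True,
    dataset="wikitext2",
    num_samples=100,
)

# Placeholder model id; any causal LM exportable to OpenVINO should work the same way.
model = OVModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("llama-2-7b-int4-awq-se")
```

From the command line, the equivalent export should be reachable with the flag added by this patch, along the lines of the option string exercised in `test_exporters_cli_int4`: `optimum-cli export openvino --model <model_id> --weight-format int4 --awq --scale-estimation --dataset wikitext2 --num-samples 100 <output_dir>` (placeholders in angle brackets).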