Update to NNCF 2.11 #763

Merged · 9 commits · Jun 18, 2024
9 changes: 9 additions & 0 deletions docs/source/optimization_ov.mdx
@@ -75,6 +75,15 @@ For 4-bit quantization you can also specify the following arguments in the quant

Smaller `group_size` and `ratio` values usually improve accuracy at the expense of larger model size and higher inference latency.

The quality of a 4-bit weight-compressed model can be further improved by employing one of the following data-dependent methods:
* AWQ (Activation-Aware Quantization) is an algorithm that tunes model weights for more accurate 4-bit compression. It slightly improves generation quality of compressed LLMs, but requires significant additional time and memory for tuning weights on a calibration dataset. Note that if the model contains no patterns to which AWQ can be applied, the algorithm is skipped.
* Scale Estimation is a method that tunes quantization scales to minimize the `L2` error between the original and compressed layers. Running scale estimation requires a dataset and also incurs additional time and memory overhead.

AWQ and Scale Estimation can be applied together or separately. To do so, provide the corresponding arguments to the 4-bit `OVWeightQuantizationConfig` together with a dataset. For example:
```python
quantization_config = OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, quant_method="awq", scale_estimation=True, dataset="wikitext2")
```
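
The resulting configuration is then passed to the model class when exporting. Below is a minimal sketch of that step; the model ID and output directory are placeholders, and the exact arguments depend on your model:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4, sym=False, ratio=0.8, quant_method="awq", scale_estimation=True, dataset="wikitext2"
)

# AWQ and Scale Estimation run on the calibration dataset during export,
# so expect extra time and memory compared to data-free 4-bit compression.
model = OVModelForCausalLM.from_pretrained(
    "model-id-placeholder",  # placeholder: any supported causal LM
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("model-int4-awq-se")  # placeholder output directory
```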

### Static quantization

When applying post-training static quantization, both the weights and the activations are quantized.
11 changes: 11 additions & 0 deletions optimum/commands/export/openvino.py
@@ -141,6 +141,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
"case it will be skipped."
),
)
optional_group.add_argument(
"--scale-estimation",
action="store_true",
default=None,
help=(
"Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original "
"and compressed layers. Providing a dataset is required to run scale estimation. Please note, that "
"applying scale estimation takes additional memory and time."
),
)
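# Illustration only (not part of this change): with this flag in place, a 4-bit export
# with scale estimation could be invoked from the command line roughly as
#   optimum-cli export openvino --model <model_id> --weight-format int4 \
#       --scale-estimation --dataset wikitext2 <output_dir>
# where <model_id> and <output_dir> are placeholders.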
optional_group.add_argument(
"--sensitivity-metric",
type=str,
@@ -255,6 +265,7 @@ def run(self):
"num_samples": self.args.num_samples,
"quant_method": QuantizationMethod.AWQ if self.args.awq else None,
"sensitivity_metric": self.args.sensitivity_metric,
"scale_estimation": self.args.scale_estimation,
}

if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
16 changes: 15 additions & 1 deletion optimum/intel/openvino/configuration.py
@@ -172,7 +172,19 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
quant_method (`str`, defaults to OVQuantizationMethod.DEFAULT):
Weight compression method to apply.
Weight compression method to apply. Possible options:
- "default": default weight quantization will be applied.
- "awq": compressed weights will be computed according to the Activation-Aware-Quantization (AWQ)
method. AWQ improves generation quality of INT4-compressed LLMs, but requires
additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is
required. Note: it's possible that there will be no matching patterns in the model to apply AWQ, in
which case it will be skipped.
- "hybrid": The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and
activations of other layers, facilitating accuracy preservation post-optimization while reducing
the model size. Hybrid mode performs well when applied to a UNet model in diffusion pipelines.
scale_estimation (`bool`, *optional*):
Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
compressed layers. Providing a dataset is required to run scale estimation.
"""

def __init__(
@@ -188,6 +200,7 @@ def __init__(
ignored_scope: Optional[dict] = None,
num_samples: Optional[int] = None,
quant_method: Union[QuantizationMethod, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT,
scale_estimation: bool = None,
**kwargs,
):
super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
@@ -198,6 +211,7 @@
self.all_layers = all_layers
self.sensitivity_metric = sensitivity_metric
self.quant_method = quant_method
self.scale_estimation = scale_estimation
self.post_init()

def post_init(self):
1 change: 1 addition & 0 deletions optimum/intel/openvino/quantization.py
@@ -823,6 +823,7 @@ def _weight_only_quantization(
ignored_scope=config.get_ignored_scope_instance(),
dataset=dataset,
subset_size=config.num_samples if config.num_samples else 128,
scale_estimation=config.scale_estimation,
)
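# Annotation (not part of the diff): `scale_estimation` is forwarded as-is to NNCF's
# `compress_weights` call above; the keyword is available starting with NNCF 2.11,
# hence the `nncf>=2.11.0` bump in setup.py below. Leaving it as `None` defers to
# NNCF's default behavior.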


4 changes: 2 additions & 2 deletions setup.py
@@ -60,8 +60,8 @@

EXTRAS_REQUIRE = {
"neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
"openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"],
"nncf": ["nncf>=2.10.0"],
"openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"],
"nncf": ["nncf>=2.11.0"],
"ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"],
"diffusers": ["diffusers"],
"quality": QUALITY_REQUIRE,
8 changes: 8 additions & 0 deletions tests/openvino/test_exporters_cli.py
@@ -97,6 +97,13 @@ class OVCLIExportTestCase(unittest.TestCase):
4,
28,
),
(
"text-generation-with-past",
"llama_awq",
"int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ",
4,
28,
),
]

def _openvino_export(
@@ -218,6 +225,7 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec
self.assertEqual(expected_int8, num_int8)
self.assertEqual(expected_int4, num_int4)
self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout)

def test_exporters_cli_help(self):
subprocess.run(
6 changes: 4 additions & 2 deletions tests/openvino/test_quantization.py
@@ -230,6 +230,7 @@ class OVWeightCompressionTest(unittest.TestCase):
sensitivity_metric="mean_activation_magnitude",
dataset="ptb",
quant_method=QuantizationMethod.AWQ,
scale_estimation=True,
),
16,
),
@@ -456,8 +457,8 @@ def test_ovmodel_4bit_auto_compression_with_config(
with tempfile.TemporaryDirectory() as tmp_dir:
quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
if quantization_config.quant_method == QuantizationMethod.AWQ:
# TODO: Check that AWQ was actually applied
if quantization_config.quant_method == QuantizationMethod.AWQ or quantization_config.scale_estimation:
# TODO: Check that AWQ and SE were actually applied
pass

tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -558,6 +559,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self):
"ignored_scope": nncf.IgnoredScope(),
"awq": None,
"subset_size": 128,
"scale_estimation": None,
}
compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)
