Commit b796fd0: Apply comments; some additional tweaks

nikita-savelyevv committed Jun 17, 2024
1 parent: 6c3daef

Showing 4 changed files with 18 additions and 16 deletions.
2 changes: 1 addition & 1 deletion optimum/commands/export/openvino.py
@@ -263,7 +263,7 @@ def run(self):
"all_layers": None if is_int8 else self.args.all_layers,
"dataset": self.args.dataset,
"num_samples": self.args.num_samples,
"awq": self.args.awq,
"quant_method": QuantizationMethod.AWQ if self.args.awq else None,
"sensitivity_metric": self.args.sensitivity_metric,
"scale_estimation": self.args.scale_estimation,
}
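For illustration, a minimal sketch (not part of the repository) of the flag translation this hunk performs: the exporter keeps its boolean `--awq` switch but now forwards it as a `quant_method` value instead of a raw `awq` key. The import path is the one transformers uses for `QuantizationMethod`; the helper name is hypothetical.

```python
# Hypothetical helper mirroring the hunk above; not part of optimum-intel.
from transformers.utils.quantization_config import QuantizationMethod


def quantization_kwargs(awq_flag: bool) -> dict:
    # --awq no longer maps to a raw "awq" key; it selects the AWQ quant_method.
    return {"quant_method": QuantizationMethod.AWQ if awq_flag else None}


assert quantization_kwargs(True)["quant_method"] is QuantizationMethod.AWQ
assert quantization_kwargs(False)["quant_method"] is None
```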
24 changes: 13 additions & 11 deletions optimum/intel/openvino/configuration.py
@@ -172,12 +172,18 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
num_samples (`int`, *optional*):
The maximum number of samples composing the calibration dataset.
quant_method (`str`, defaults to OVQuantizationMethod.DEFAULT):
-     Weight compression method to apply.
+     Weight compression method to apply. Possible options:
+         - "default": default weight quantization will be applied.
+         - "awq": compressed weights will be computed according to the Activation-Aware Quantization (AWQ)
+           method. AWQ improves generation quality of INT4-compressed LLMs, but requires additional time
+           for tuning weights on a calibration dataset. To run AWQ, providing a dataset is required.
+           Note: it's possible that there will be no matching patterns in the model to apply AWQ; in such
+           a case it will be skipped.
+         - "hybrid": the hybrid mode involves quantization of weights in MatMul and Embedding layers, and
+           activations of other layers, facilitating accuracy preservation post-optimization while reducing
+           the model size. Hybrid mode performs well when applied to a UNet model in diffusion pipelines.
awq (`bool`, *optional*):
-     Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires
-     additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is required.
-     Note: it's possible that there will be no matching patterns in the model to apply AWQ, in such case it
-     will be skipped.
+     Alias for `quant_method="awq"`.
scale_estimation (`bool`, *optional*):
Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
compressed layers. Providing a dataset is required to run scale estimation.
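As a usage illustration of the options documented above, a sketch constructing the config for the "default" and "awq" paths; the `dataset="wikitext2"` preset and the plain-string `quant_method` are assumptions (they rely on `QuantizationMethod` being a string enum and on the dataset presets available at the time):

```python
from optimum.intel import OVWeightQuantizationConfig

# Default INT4 weight-only compression; no calibration data needed.
default_cfg = OVWeightQuantizationConfig(bits=4)

# AWQ tunes weights against activations, so a calibration dataset is required.
awq_cfg = OVWeightQuantizationConfig(bits=4, quant_method="awq", dataset="wikitext2")
```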
@@ -208,8 +214,9 @@ def __init__(
self.all_layers = all_layers
self.sensitivity_metric = sensitivity_metric
self.quant_method = quant_method
- self.awq = awq
self.scale_estimation = scale_estimation
+ if awq:
+     self.quant_method = QuantizationMethod.AWQ
self.post_init()

def post_init(self):
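A sketch of the alias semantics introduced in this hunk: `awq=True` is folded into `quant_method` at construction time, so downstream code only inspects `quant_method`. It assumes the constructor still accepts the `awq` keyword, as the updated docstring states, and that a dataset is supplied to satisfy validation:

```python
from transformers.utils.quantization_config import QuantizationMethod
from optimum.intel import OVWeightQuantizationConfig

cfg = OVWeightQuantizationConfig(bits=4, awq=True, dataset="wikitext2")
# The boolean flag is not stored; it is rewritten into quant_method.
assert cfg.quant_method == QuantizationMethod.AWQ
```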
@@ -255,11 +262,6 @@ def post_init(self):
if self.tokenizer is not None and not isinstance(self.tokenizer, str):
raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")

- if self.quant_method == QuantizationMethod.AWQ:
-     self.quant_method = OVQuantizationMethod.DEFAULT
-     self.awq = True
-     logger.warning('Using quant_method="AWQ" is deprecated. Please use awq=True instead in the future.')


@dataclass
class OVDynamicQuantizationConfig(OVWeightQuantizationConfig):
2 changes: 1 addition & 1 deletion optimum/intel/openvino/quantization.py
@@ -819,7 +819,7 @@ def _weight_only_quantization(
group_size=config.group_size,
all_layers=config.all_layers,
sensitivity_metric=sensitivity_metric,
- awq=config.awq,
+ awq=config.quant_method == QuantizationMethod.AWQ or None,
ignored_scope=config.get_ignored_scope_instance(),
dataset=dataset,
subset_size=config.num_samples if config.num_samples else 128,
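The trailing `or None` is a small tri-state idiom: when AWQ is not requested, `nncf.compress_weights` receives `None` (its default) rather than an explicit `False`. A self-contained sketch of just that mapping, with a hypothetical helper name:

```python
from typing import Optional


def awq_argument(quant_method: str) -> Optional[bool]:
    # `X or None` maps True -> True and False -> None, so the callee sees its
    # default (None) rather than an explicit False when AWQ is not requested.
    return (quant_method == "awq") or None


assert awq_argument("awq") is True
assert awq_argument("default") is None
```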
6 changes: 3 additions & 3 deletions tests/openvino/test_quantization.py
@@ -229,7 +229,7 @@ class OVWeightCompressionTest(unittest.TestCase):
ratio=0.8,
sensitivity_metric="mean_activation_magnitude",
dataset="ptb",
- awq=True,
+ quant_method=QuantizationMethod.AWQ,
scale_estimation=True,
),
16,
@@ -457,8 +457,8 @@ def test_ovmodel_4bit_auto_compression_with_config(
with tempfile.TemporaryDirectory() as tmp_dir:
quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
- if quantization_config.awq:
-     # TODO: Check that AWQ was actually applied
+ if quantization_config.quant_method == QuantizationMethod.AWQ or quantization_config.scale_estimation:
+     # TODO: Check that AWQ and SE were actually applied
pass

tokenizer = AutoTokenizer.from_pretrained(model_id)
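Finally, a sketch of the broadened test guard: the verification branch (still a TODO upstream) now triggers when either AWQ or scale estimation is configured, not only AWQ. The helper name is hypothetical:

```python
from transformers.utils.quantization_config import QuantizationMethod


def needs_algorithm_check(quant_method, scale_estimation: bool) -> bool:
    # Mirrors the updated guard in test_ovmodel_4bit_auto_compression_with_config.
    return quant_method == QuantizationMethod.AWQ or scale_estimation


assert needs_algorithm_check(QuantizationMethod.AWQ, scale_estimation=False)
assert needs_algorithm_check("default", scale_estimation=True)
assert not needs_algorithm_check("default", scale_estimation=False)
```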
