diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index deb117530..f58b37228 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -263,7 +263,7 @@ def run(self):
                 "all_layers": None if is_int8 else self.args.all_layers,
                 "dataset": self.args.dataset,
                 "num_samples": self.args.num_samples,
-                "awq": self.args.awq,
+                "quant_method": QuantizationMethod.AWQ if self.args.awq else None,
                 "sensitivity_metric": self.args.sensitivity_metric,
                 "scale_estimation": self.args.scale_estimation,
             }
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index e6eb12e86..e1a29c18e 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -172,12 +172,18 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         num_samples (`int`, *optional*):
             The maximum number of samples composing the calibration dataset.
         quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT):
-            Weight compression method to apply.
+            Weight compression method to apply. Possible options:
+            - "default": default weight quantization will be applied.
+            - "awq": compressed weights will be computed according to the Activation-aware Weight Quantization (AWQ)
+              method. AWQ improves the generation quality of INT4-compressed LLMs, but requires
+              additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is
+              required. Note: there may be no matching patterns in the model to apply AWQ, in
+              which case it will be skipped.
+            - "hybrid": The hybrid mode quantizes weights in MatMul and Embedding layers, as well as
+              activations of other layers, preserving accuracy after optimization while reducing
+              the model size. Hybrid mode performs well when applied to a UNet model in diffusion pipelines.
         awq (`bool`, *optional*):
-            Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires
-            additional time for tuning weights on a calibration dataset. To run AWQ, providing a dataset is required.
-            Note: it's possible that there will be no matching patterns in the model to apply AWQ, in such case it
-            will be skipped.
+            Alias for `quant_method="awq"`.
         scale_estimation (`bool`, *optional*):
             Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original
             and compressed layers. Providing a dataset is required to run scale estimation.
@@ -208,8 +214,9 @@ def __init__(
         self.all_layers = all_layers
         self.sensitivity_metric = sensitivity_metric
         self.quant_method = quant_method
-        self.awq = awq
         self.scale_estimation = scale_estimation
+        if awq:
+            self.quant_method = QuantizationMethod.AWQ
         self.post_init()
 
     def post_init(self):
@@ -255,11 +262,6 @@ def post_init(self):
         if self.tokenizer is not None and not isinstance(self.tokenizer, str):
             raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")
 
-        if self.quant_method == QuantizationMethod.AWQ:
-            self.quant_method = OVQuantizationMethod.DEFAULT
-            self.awq = True
-            logger.warning('Using quant_method="AWQ" is deprecated. Please use awq=True instead in the future.')
-
 
 @dataclass
 class OVDynamicQuantizationConfig(OVWeightQuantizationConfig):
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 271ee2326..3ddd01b72 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -819,7 +819,7 @@ def _weight_only_quantization(
         group_size=config.group_size,
         all_layers=config.all_layers,
         sensitivity_metric=sensitivity_metric,
-        awq=config.awq,
+        awq=(config.quant_method == QuantizationMethod.AWQ) or None,
         ignored_scope=config.get_ignored_scope_instance(),
         dataset=dataset,
         subset_size=config.num_samples if config.num_samples else 128,
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index d957129ab..b7b770a85 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -229,7 +229,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="ptb",
-                awq=True,
+                quant_method=QuantizationMethod.AWQ,
                 scale_estimation=True,
             ),
             16,
@@ -457,8 +457,8 @@ def test_ovmodel_4bit_auto_compression_with_config(
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
-            if quantization_config.awq:
+            if quantization_config.quant_method == QuantizationMethod.AWQ or quantization_config.scale_estimation:
                 # TODO: Check that AWQ and SE were actually applied
                 pass
 
             tokenizer = AutoTokenizer.from_pretrained(model_id)
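For illustration, a minimal sketch of how the two spellings relate after this change. It assumes `QuantizationMethod` is still importable from `transformers.utils.quantization_config` and uses "wikitext2" purely as an example dataset name; both configs below should end up with `quant_method == QuantizationMethod.AWQ`, since `__init__` now folds `awq=True` into `quant_method`:

    from transformers.utils.quantization_config import QuantizationMethod

    from optimum.intel import OVWeightQuantizationConfig

    # New canonical spelling: select AWQ via quant_method.
    config_a = OVWeightQuantizationConfig(
        bits=4,
        dataset="wikitext2",  # AWQ requires a calibration dataset
        quant_method=QuantizationMethod.AWQ,
    )

    # Backward-compatible alias: awq=True is folded into quant_method in __init__.
    config_b = OVWeightQuantizationConfig(
        bits=4,
        dataset="wikitext2",
        awq=True,
    )

    assert config_a.quant_method == config_b.quant_method == QuantizationMethod.AWQ

The CLI path mirrors this: the `--awq` flag of `optimum-cli export openvino` now maps to `quant_method=QuantizationMethod.AWQ` in the config dict, so the algorithm choice has a single source of truth in the config object.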