diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index aae66c148b..07765920bc 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -21,6 +21,8 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import datasets
+from transformers.utils.quantization_config import QuantizationMethod
+
 import nncf
 import openvino
 import torch
@@ -677,10 +679,10 @@ def _weight_only_quantization(
         group_size=config.group_size,
         all_layers=config.all_layers,
         sensitivity_metric=sensitivity_metric,
-        # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0
+        awq=config.quant_method == QuantizationMethod.AWQ,
         ignored_scope=config.get_ignored_scope_instance(),
         dataset=dataset,
-        # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0
+        subset_size=config.num_samples if config.num_samples else 128,
     )
 
 
diff --git a/setup.py b/setup.py
index ea87e6ad59..bc91ba2472 100644
--- a/setup.py
+++ b/setup.py
@@ -60,8 +60,8 @@
 
 EXTRAS_REQUIRE = {
     "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
-    "openvino": ["openvino>=2023.3", "nncf>=2.8.1", "openvino-tokenizers[transformers]"],
-    "nncf": ["nncf>=2.8.1"],
+    "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"],
+    "nncf": ["nncf>=2.10.0"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index e269578c35..ca4d43d295 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -224,17 +224,17 @@ class OVWeightCompressionTest(unittest.TestCase):
         ),
         (
             OVModelForCausalLM,
-            "hf-internal-testing/tiny-random-OPTForCausalLM",
+            "HuggingFaceH4/tiny-random-LlamaForCausalLM",
             dict(
                 bits=4,
                 sym=True,
-                group_size=-1,
+                group_size=16,
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="ptb",
                 quant_method=QuantizationMethod.AWQ,
             ),
-            14,
+            16,
         ),
     )
 
@@ -455,7 +455,20 @@ def test_ovmodel_4bit_auto_compression_with_config(
     ):
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
-            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+
+            from nncf.common.logging.track_progress import track
+
+            with unittest.mock.patch("nncf.common.logging.track_progress.track", wraps=track) as track_patch:
+                model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+                if quantization_config.quant_method == QuantizationMethod.AWQ:
+                    # Called at least once with description="Applying AWQ"
+                    self.assertTrue(
+                        any(
+                            args.kwargs.get("description", None) == "Applying AWQ"
+                            for args in track_patch.call_args_list
+                        )
+                    )
+
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token