diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index ddd0cdec75..bb62ba312f 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -37,6 +37,7 @@
 from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
+from transformers.utils.quantization_config import QuantizationMethod
 
 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
@@ -670,10 +671,10 @@ def _weight_only_quantization(
         group_size=config.group_size,
         all_layers=config.all_layers,
         sensitivity_metric=sensitivity_metric,
-        # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0
+        awq=config.quant_method == QuantizationMethod.AWQ or None,
         ignored_scope=config.get_ignored_scope_instance(),
         dataset=dataset,
-        # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0
+        subset_size=config.num_samples if config.num_samples else 128,
     )
diff --git a/setup.py b/setup.py
index a7937bd1e4..0011cbe8ed 100644
--- a/setup.py
+++ b/setup.py
@@ -60,8 +60,8 @@ EXTRAS_REQUIRE = {
     "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
-    "openvino": ["openvino>=2023.3", "nncf>=2.8.1", "openvino-tokenizers[transformers]"],
-    "nncf": ["nncf>=2.8.1"],
+    "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"],
+    "nncf": ["nncf>=2.10.0"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 26dfc658a5..d873878abb 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -221,17 +221,17 @@ class OVWeightCompressionTest(unittest.TestCase):
         ),
         (
             OVModelForCausalLM,
-            "opt",
+            "llama_awq",
             dict(
                 bits=4,
                 sym=True,
-                group_size=-1,
+                group_size=16,
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="ptb",
                 quant_method=QuantizationMethod.AWQ,
             ),
-            14,
+            16,
         ),
     )
@@ -452,6 +452,10 @@ def test_ovmodel_4bit_auto_compression_with_config(
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+            if quantization_config.quant_method == QuantizationMethod.AWQ:
+                # TODO: Check that AWQ was actually applied
+                pass
+
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
@@ -548,6 +552,8 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self):
             "sensitivity_metric": None,
             "dataset": None,
             "ignored_scope": nncf.IgnoredScope(),
+            "awq": None,
+            "subset_size": 128,
         }
         compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 9cb94bbf34..5d34036c9d 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -57,6 +57,7 @@
     "levit": "hf-internal-testing/tiny-random-LevitModel",
     "longt5": "hf-internal-testing/tiny-random-longt5",
     "llama": "fxmarty/tiny-llama-fast-tokenizer",
+    "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM",
    "llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ",
     "m2m_100": "hf-internal-testing/tiny-random-m2m_100",
     "opt": "hf-internal-testing/tiny-random-OPTModel",