
Update NNCF to 2.10. Enable AWQ algorithm. #673

Merged: 12 commits, Apr 26, 2024
5 changes: 3 additions & 2 deletions optimum/intel/openvino/quantization.py
@@ -37,6 +37,7 @@
 from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
+from transformers.utils.quantization_config import QuantizationMethod

 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.tasks import TasksManager
@@ -670,10 +671,10 @@ def _weight_only_quantization(
         group_size=config.group_size,
         all_layers=config.all_layers,
         sensitivity_metric=sensitivity_metric,
-        # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0
+        awq=config.quant_method == QuantizationMethod.AWQ or None,
         ignored_scope=config.get_ignored_scope_instance(),
         dataset=dataset,
-        # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0
+        subset_size=config.num_samples if config.num_samples else 128,
     )


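For orientation, here is a minimal sketch of the `nncf.compress_weights` call this hunk produces, assuming `nncf>=2.10.0` (a release that accepts the `awq` and `subset_size` keyword arguments); the helper name and the `config`/`dataset` parameters are illustrative stand-ins for `_weight_only_quantization`'s real locals:

```python
import nncf
from transformers.utils.quantization_config import QuantizationMethod


def sketch_weight_only_quantization(model, config, dataset):
    # `== ... or None` evaluates to True when AWQ is requested and to None
    # otherwise, so nncf falls back to its own default rather than receiving
    # an explicit False.
    return nncf.compress_weights(
        model,
        ratio=config.ratio,
        group_size=config.group_size,
        all_layers=config.all_layers,
        sensitivity_metric=config.sensitivity_metric,
        awq=config.quant_method == QuantizationMethod.AWQ or None,
        ignored_scope=config.get_ignored_scope_instance(),
        dataset=dataset,
        # 128 calibration samples unless the user asked for a specific count.
        subset_size=config.num_samples if config.num_samples else 128,
    )
```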
4 changes: 2 additions & 2 deletions setup.py
@@ -60,8 +60,8 @@

 EXTRAS_REQUIRE = {
     "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
-    "openvino": ["openvino>=2023.3", "nncf>=2.8.1", "openvino-tokenizers[transformers]"],
-    "nncf": ["nncf>=2.8.1"],
+    "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"],
+    "nncf": ["nncf>=2.10.0"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
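Assuming the package is published as `optimum-intel`, users would pick up the new floor with, for example, `pip install --upgrade "optimum-intel[openvino]"`; both the `openvino` and `nncf` extras now require `nncf>=2.10.0`, the version that ships the `awq` and `subset_size` arguments enabled in `quantization.py` above.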
12 changes: 9 additions & 3 deletions tests/openvino/test_quantization.py
@@ -221,17 +221,17 @@ class OVWeightCompressionTest(unittest.TestCase):
         ),
         (
             OVModelForCausalLM,
-            "opt",
+            "llama_awq",
             dict(
                 bits=4,
                 sym=True,
-                group_size=-1,
+                group_size=16,
                 ratio=0.8,
                 sensitivity_metric="mean_activation_magnitude",
                 dataset="ptb",
                 quant_method=QuantizationMethod.AWQ,
             ),
-            14,
+            16,
         ),
     )
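The updated test case corresponds roughly to the following user-facing call (a sketch assuming `OVWeightQuantizationConfig` accepts these fields directly; the checkpoint id is the `llama_awq` entry added in `utils_tests.py` below):

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
from transformers.utils.quantization_config import QuantizationMethod

# AWQ with 4-bit symmetric quantization, 16-element groups, and 80% of
# layers compressed to int4, calibrated on the "ptb" dataset.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    sym=True,
    group_size=16,
    ratio=0.8,
    sensitivity_metric="mean_activation_magnitude",
    dataset="ptb",
    quant_method=QuantizationMethod.AWQ,
)
model = OVModelForCausalLM.from_pretrained(
    "HuggingFaceH4/tiny-random-LlamaForCausalLM",
    export=True,
    quantization_config=quantization_config,
)
```

The expected count moving from 14 to 16 tracks the switch of test model and group size; presumably it is the number of int4 weight nodes the test finds in the exported graph.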

@@ -452,6 +452,10 @@ def test_ovmodel_4bit_auto_compression_with_config(
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+            if quantization_config.quant_method == QuantizationMethod.AWQ:
+                # TODO: Check that AWQ was actually applied
+                pass
+
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
@@ -548,6 +552,8 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self):
             "sensitivity_metric": None,
             "dataset": None,
             "ignored_scope": nncf.IgnoredScope(),
+            "awq": None,
+            "subset_size": 128,
         }
         compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params)

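For reference, a self-contained sketch of the mock-based pattern this test uses to pin down the forwarded defaults; the `quantize` wrapper here is a hypothetical stand-in, not the repository's fixture:

```python
import unittest.mock

import nncf


def quantize(model):
    # Hypothetical stand-in that forwards the new defaults the same way
    # _weight_only_quantization now does.
    return nncf.compress_weights(model, awq=None, subset_size=128)


with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch:
    quantize(unittest.mock.sentinel.model)
    # The patched call must receive the new keyword arguments explicitly.
    compress_weights_patch.assert_called_with(
        unittest.mock.sentinel.model, awq=None, subset_size=128
    )
```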
1 change: 1 addition & 0 deletions tests/openvino/utils_tests.py
@@ -57,6 +57,7 @@
     "levit": "hf-internal-testing/tiny-random-LevitModel",
     "longt5": "hf-internal-testing/tiny-random-longt5",
     "llama": "fxmarty/tiny-llama-fast-tokenizer",
+    "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM",
     "llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ",
     "m2m_100": "hf-internal-testing/tiny-random-m2m_100",
     "opt": "hf-internal-testing/tiny-random-OPTModel",