diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 6965efcb5..08c0721e5 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -368,7 +368,9 @@ def run(self): model.save_pretrained(self.args.output) if not self.args.disable_convert_tokenizer: maybe_convert_tokenizers(library_name, self.args.output, model, task=task) - elif (task.startswith("text-generation") or task == "image-text-to-text") and quantize_with_dataset: + elif (task.startswith("text-generation") and quantize_with_dataset) or ( + task == "image-text-to-text" and quantization_config is not None + ): if task.startswith("text-generation"): from optimum.intel import OVModelForCausalLM @@ -378,7 +380,7 @@ def run(self): model_cls = OVModelForVisualCausalLM - # To quantize a model with a dataset, an instance of a model class is required + # In this case, to apply quantization an instance of a model class is required model = model_cls.from_pretrained( self.args.model, export=True, diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 8d6edea0f..a7c221008 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -598,7 +598,8 @@ def _from_transformers( if load_in_8bit is None and not quantization_config: ov_config = None else: - ov_config = OVConfig(dtype="fp32") + # Export in fp32 if compression won't be applied later + ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto") stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 97cbe8ef2..c8247989d 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -14,12 +14,14 @@ import subprocess import unittest from pathlib import Path +from typing import Dict, List from 
parameterized import parameterized from transformers import AutoModelForCausalLM from utils_tests import ( _ARCHITECTURES_TO_EXPECTED_INT8, MODEL_NAMES, + compare_num_quantized_nodes_per_model, get_num_quantized_nodes, ) @@ -108,37 +110,47 @@ class OVCLIExportTestCase(unittest.TestCase): SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("flux", 7, 56)) TEST_4BIT_CONFIGURATIONS = [ - ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}), - ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}), - ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}), - ("text-generation-with-past", "opt125m", "nf4", {"int8": 4, "nf4": 72}), - ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 8 --all-layers", {"int4": 16}), + ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", [{"int8": 4, "int4": 72}]), + ("text-generation-with-past", "opt125m", "int4 --group-size 64", [{"int8": 4, "int4": 144}]), + ("text-generation-with-past", "opt125m", "mxfp4", [{"int8": 4, "f4e2m1": 72, "f8e8m0": 72}]), + ("text-generation-with-past", "opt125m", "nf4", [{"int8": 4, "nf4": 72}]), + ( + "text-generation-with-past", + "llama_awq", + "int4 --ratio 1.0 --sym --group-size 8 --all-layers", + [{"int4": 16}], + ), ( "text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 " "--sensitivity-metric max_activation_variance", - {"int8": 4, "int4": 14}, + [{"int8": 4, "int4": 14}], ), ( "text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ", - {"int8": 4, "int4": 14}, + [{"int8": 4, "int4": 14}], ), ( "text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --gptq --dataset wikitext2 --num-samples 100 ", - {"int8": 4, "int4": 14}, + [{"int8": 4, "int4": 14}], ), ( 
"text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --lora-correction --dataset auto --num-samples 16", - {"int8": 60, "int4": 14}, + [{"int8": 60, "int4": 14}], + ), + ( + "text-generation-with-past", + "llama_awq", + "int4 --group-size 16 --backup-precision none --ratio 0.5", + [{"int4": 6}], ), - ("text-generation-with-past", "llama_awq", "int4 --group-size 16 --backup-precision none", {"int4": 28}), ] if is_transformers_version(">=", "4.40.0"): @@ -147,16 +159,28 @@ class OVCLIExportTestCase(unittest.TestCase): ( "image-text-to-text", "llava_next", - 'int4 --group-size 16 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" ' + "int4 --group-size 16 --ratio 0.8", + [{"int8": 14, "int4": 16}, {"int8": 9}, {"int8": 1}], + ), + ( + "image-text-to-text", + "llava_next", + 'int4 --group-size 16 --ratio 0.8 --sensitivity-metric "hessian_input_activation" ' "--dataset contextual --num-samples 1", - {"int8": 8, "int4": 22}, + [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}], + ), + ( + "image-text-to-text", + "nanollava", + "int4 --group-size 8 --ratio 0.8 --trust-remote-code", + [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}], ), ( "image-text-to-text", "nanollava", - 'int4 --group-size 8 --ratio 0.9 --sensitivity-metric "mean_activation_variance" ' + 'int4 --group-size 8 --ratio 0.8 --sensitivity-metric "mean_activation_variance" ' "--dataset contextual --num-samples 1 --trust-remote-code", - {"int8": 12, "int4": 18}, + [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}], ), ] ) @@ -164,19 +188,44 @@ class OVCLIExportTestCase(unittest.TestCase): if is_transformers_version(">=", "4.45.0"): TEST_4BIT_CONFIGURATIONS.extend( [ + ( + "image-text-to-text", + "minicpmv", + "int4 --group-size 4 --ratio 0.8 --trust-remote-code", + [{"int8": 10, "int4": 20}, {"int8": 26}, {"int8": 1}, {"int8": 6}], + ), + ( + "image-text-to-text", + "minicpmv", + 'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" ' 
+ "--dataset contextual --num-samples 1 --trust-remote-code", + [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}], + ), + ( + "image-text-to-text", + "internvl2", + "int4 --group-size 4 --ratio 0.8 --trust-remote-code", + [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}], + ), ( "image-text-to-text", "internvl2", - 'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "hessian_input_activation" ' + 'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" ' "--dataset contextual --num-samples 1 --trust-remote-code", - {"int8": 6, "int4": 24}, + [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}], + ), + ( + "image-text-to-text", + "phi3_v", + "int4 --group-size 4 --ratio 0.8 --trust-remote-code", + [{"int8": 8, "int4": 10}, {"int8": 7}, {"int8": 1}, {"int8": 2}], ), ( "image-text-to-text", "phi3_v", - 'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" ' + 'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" ' "--dataset contextual --num-samples 1 --trust-remote-code", - {"int8": 4, "int4": 14}, + [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}], ), ] ) @@ -300,7 +349,9 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in self.assertEqual(exp_num_fq, num_fq) @parameterized.expand(TEST_4BIT_CONFIGURATIONS) - def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict): + def test_exporters_cli_4bit( + self, task: str, model_type: str, option: str, expected_num_weight_nodes_per_model: List[Dict] + ): with TemporaryDirectory() as tmpdir: result = subprocess.run( f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", @@ -317,11 +368,15 @@ def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expec else _HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")] ).from_pretrained(tmpdir, 
**model_kwargs) - ov_model = model.lm_model if task == "image-text-to-text" else model.model + submodels = [] + if task == "text-generation-with-past": + submodels = [model] + elif task == "image-text-to-text": + submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model] + submodels += [getattr(model, part) for part in model.additional_parts] + + compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) - _, num_weight_nodes = get_num_quantized_nodes(ov_model) - expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) - self.assertEqual(expected_num_weight_nodes, num_weight_nodes) self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout) self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout) self.assertTrue("--gptq" not in option or b"Applying GPTQ" in result.stdout) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index d02dea3f1..e0535a899 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -77,7 +77,12 @@ from optimum.intel.openvino.quantization import InferRequestWrapper from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version -from utils_tests import MODEL_NAMES, get_num_quantized_nodes, _ARCHITECTURES_TO_EXPECTED_INT8 +from utils_tests import ( + MODEL_NAMES, + get_num_quantized_nodes, + _ARCHITECTURES_TO_EXPECTED_INT8, + compare_num_quantized_nodes_per_model, +) _TASK_TO_DATASET = { "text-generation": ("wikitext", "wikitext-2-raw-v1", "text"), @@ -239,21 +244,26 @@ class OVWeightCompressionTest(unittest.TestCase): "gpt2", # model name False, # trust remote code dict(bits=4, sym=False, group_size=-1, ratio=0.8), # quantization config - {"int4": 30, "int8": 14}, # reference number of low-precision nodes + [{"int8": 14, "int4": 30}], # reference number of low-precision nodes ), 
( OVModelForCausalLM, "gpt2", False, dict(bits=4, weight_format="mxfp4", group_size=32), - {"f4e2m1": 20, "f8e8m0": 20, "int8": 4}, + [{"int8": 4, "f4e2m1": 20, "f8e8m0": 20}], ), ( OVModelForCausalLM, "gpt2", False, dict(bits=4, weight_format="nf4", group_size=32), - {"nf4": 20, "int8": 4}, + [ + { + "int8": 4, + "nf4": 20, + } + ], ), ( OVModelForCausalLM, @@ -265,14 +275,14 @@ class OVWeightCompressionTest(unittest.TestCase): group_size=32, ignored_scope={"names": ["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"]}, ), - {"int4": 38, "int8": 4}, + [{"int8": 4, "int4": 38}], ), ( OVModelForCausalLM, "gpt2", False, dict(bits=4, sym=False, group_size=-1, ratio=0.8, all_layers=True), - {"int4": 26, "int8": 18}, + [{"int8": 18, "int4": 26}], ), ( OVModelForCausalLM, @@ -286,7 +296,7 @@ class OVWeightCompressionTest(unittest.TestCase): sensitivity_metric="mean_activation_magnitude", dataset="c4", ), - {"int4": 25, "int8": 14}, + [{"int8": 14, "int4": 25}], ), ( OVModelForCausalLM, @@ -300,7 +310,7 @@ class OVWeightCompressionTest(unittest.TestCase): sensitivity_metric="mean_activation_magnitude", dataset=["one two, " * i for i in range(10)], ), - {"int4": 24, "int8": 16}, + [{"int8": 16, "int4": 24}], ), ( OVModelForCausalLM, @@ -316,7 +326,7 @@ class OVWeightCompressionTest(unittest.TestCase): quant_method=QuantizationMethod.AWQ, scale_estimation=True, ), - {"int4": 12, "int8": 8}, + [{"int8": 8, "int4": 12}], ), ( OVModelForCausalLM, @@ -331,7 +341,7 @@ class OVWeightCompressionTest(unittest.TestCase): dataset="c4", quant_method="awq", ), - {"int4": 12, "int8": 8}, + [{"int8": 8, "int4": 12}], ), ( OVModelForCausalLM, @@ -346,7 +356,7 @@ class OVWeightCompressionTest(unittest.TestCase): dataset="c4", gptq=True, ), - {"int4": 12, "int8": 8}, + [{"int8": 8, "int4": 12}], ), ( OVModelForCausalLM, @@ -359,14 +369,35 @@ class OVWeightCompressionTest(unittest.TestCase): dataset="auto", lora_correction=True, ), - {"int4": 28, "int8": 60}, + [{"int8": 60, 
"int4": 28}], ), ( OVModelForCausalLM, "llama_awq", False, dict(bits=4, backup_precision="none", group_size=16), - {"int4": 28}, + [{"int4": 28}], + ), + ( + OVModelForCausalLM, + "llama_awq", + False, + dict(bits=4, backup_precision="none", group_size=16, ratio=0.5), + [{"int4": 6}], + ), + ( + OVModelForCausalLM, + "llama_awq", + False, + dict(bits=4, backup_precision="int8_sym", group_size=16, ratio=0.5), + [{"int4": 6, "int8": 13}], + ), + ( + OVModelForCausalLM, + "llama_awq", + False, + dict(bits=4, backup_precision="int8_asym", group_size=16, ratio=0.5), + [{"int4": 6, "int8": 26}], ), ] @@ -386,7 +417,7 @@ class OVWeightCompressionTest(unittest.TestCase): num_samples=1, processor=MODEL_NAMES["llava_next"], ), - {"int4": 24, "int8": 6}, + [{"int8": 6, "int4": 24}, {"int8": 9}, {"int8": 1}], ), ( OVModelForVisualCausalLM, @@ -397,13 +428,13 @@ class OVWeightCompressionTest(unittest.TestCase): group_size=8, dataset="contextual", ratio=0.8, - sensitivity_metric="mean_activation_magnitude", + sensitivity_metric="mean_activation_variance", num_samples=1, processor=MODEL_NAMES["nanollava_vision_tower"], tokenizer=MODEL_NAMES["nanollava"], trust_remote_code=True, ), - {"int4": 16, "int8": 14}, + [{"int8": 16, "int4": 14}, {"int8": 15}, {"int8": 1}], ), ] ) @@ -425,7 +456,7 @@ class OVWeightCompressionTest(unittest.TestCase): processor=MODEL_NAMES["minicpmv"], trust_remote_code=True, ), - {"int4": 22, "int8": 8}, + [{"int8": 8, "int4": 22}, {"int8": 26}, {"int8": 1}, {"int8": 6}], ), ( OVModelForVisualCausalLM, @@ -440,7 +471,7 @@ class OVWeightCompressionTest(unittest.TestCase): num_samples=1, trust_remote_code=True, ), - {"int4": 22, "int8": 8}, + [{"int8": 8, "int4": 22}, {"int8": 11}, {"int8": 1}], ), ( OVModelForVisualCausalLM, @@ -455,7 +486,7 @@ class OVWeightCompressionTest(unittest.TestCase): num_samples=1, trust_remote_code=True, ), - {"int4": 14, "int8": 4}, + [{"int8": 4, "int4": 14}, {"int8": 7}, {"int8": 1}, {"int8": 2}], ), ] ) @@ -723,7 +754,7 @@ def 
test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( - self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes + self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes_per_model ): model_id = MODEL_NAMES[model_name] with TemporaryDirectory() as tmp_dir: @@ -735,14 +766,17 @@ def test_ovmodel_4bit_auto_compression_with_config( # TODO: Check that AWQ was actually applied pass - ov_model = model.lm_model if model_cls == OVModelForVisualCausalLM else model.model + submodels = [] + if isinstance(model, OVModelForCausalLM): + submodels = [model.model] + elif isinstance(model, OVModelForVisualCausalLM): + submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model] + submodels += [getattr(model, part) for part in model.additional_parts] + compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) - _, num_weight_nodes = get_num_quantized_nodes(ov_model) - expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) - self.assertEqual(expected_num_weight_nodes, num_weight_nodes) model.save_pretrained(tmp_dir) - - wc_rt_info = ov_model.get_rt_info()["nncf"]["weight_compression"] + # At the moment the first model in the list is the only one we apply data-aware compression to + wc_rt_info = submodels[0].get_rt_info()["nncf"]["weight_compression"] self.assertEqual(quantization_config.quant_method.lower() == "awq", wc_rt_info["awq"].value == "True") self.assertEqual( quantization_config.scale_estimation or False, wc_rt_info["scale_estimation"].value == "True" @@ -870,9 +904,9 @@ def main_export_not_in_stacktrace(*args, **kwargs): } compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) - @parameterized.expand(LOAD_IN_4_BITS_SCOPE) + 
@parameterized.expand(LOAD_IN_4_BITS_SCOPE[::5]) def test_ovmodel_4bit_dynamic_with_config( - self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes + self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes_per_model ): model_id = MODEL_NAMES[model_name] with TemporaryDirectory() as tmp_dir: @@ -886,13 +920,15 @@ def test_ovmodel_4bit_dynamic_with_config( self.assertEqual(model.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"], str(group_size)) self.assertEqual(model.ov_config["KV_CACHE_PRECISION"], "u8") - ov_model = model.lm_model if model_cls == OVModelForVisualCausalLM else model.model + submodels = [] + if isinstance(model, OVModelForCausalLM): + submodels = [model.model] + elif isinstance(model, OVModelForVisualCausalLM): + submodels = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model] + submodels += [getattr(model, part) for part in model.additional_parts] + compare_num_quantized_nodes_per_model(self, submodels, expected_num_weight_nodes_per_model) - _, num_weight_nodes = get_num_quantized_nodes(ov_model) - expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) - self.assertEqual(expected_num_weight_nodes, num_weight_nodes) model.save_pretrained(tmp_dir) - openvino_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, quantization_config.weight_format) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index bf509a044..6130ee06c 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import unittest
+from typing import Dict, List, Union
 
 import numpy as np
 import openvino as ov
 import torch
 
+from optimum.intel.openvino.modeling_base import OVBaseModel
+
 
 MODEL_NAMES = {
     "albert": "hf-internal-testing/tiny-random-albert",
@@ -218,3 +222,29 @@ def get_num_quantized_nodes(model):
             if type_name == "nf4":
                 num_weight_nodes["nf4"] += 1
     return num_fake_quantize, num_weight_nodes
+
+
+def compare_num_quantized_nodes_per_model(
+    test_case: unittest.TestCase,
+    models: List[Union[ov.Model, OVBaseModel]],
+    expected_num_weight_nodes_per_model: List[Dict],
+):
+    """Assert that each model holds the expected number of low-precision weight nodes.
+
+    Args:
+        test_case: Test case whose ``assertEqual`` is used to report mismatches.
+        models: Models to inspect, in the same order as the expected counts.
+        expected_num_weight_nodes_per_model: One dict per model mapping a precision
+            name (e.g. ``"int8"``) to the expected node count. Precisions that are
+            omitted are expected to have zero nodes. The dicts are not modified.
+    """
+    test_case.assertEqual(len(models), len(expected_num_weight_nodes_per_model))
+    actual_num_weights_per_model = []
+    normalized_expected_per_model = []
+    for submodel, expected_num_weight_nodes in zip(models, expected_num_weight_nodes_per_model):
+        _, num_weight_nodes = get_num_quantized_nodes(submodel)
+        # Fill omitted precisions with 0 on a copy: callers typically pass class-level
+        # parameterized fixtures shared between tests, which must not be mutated here.
+        normalized_expected_per_model.append({**dict.fromkeys(num_weight_nodes, 0), **expected_num_weight_nodes})
+        actual_num_weights_per_model.append(num_weight_nodes)
+    test_case.assertEqual(normalized_expected_per_model, actual_num_weights_per_model)