From eac1f6c994e52d60fa68bd68da372d455b0a5fc2 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 8 Jul 2024 11:07:30 +0200 Subject: [PATCH] Increase default 4-bit compression ratio from 0.8 to 1.0 (#805) * Increase default 4-bit ratio from 0.8 to 1.0 * Style * Fix test --- optimum/commands/export/openvino.py | 10 ++-------- optimum/intel/openvino/configuration.py | 8 ++++++++ optimum/intel/openvino/modeling_decoder.py | 10 ++++++++-- tests/openvino/test_exporters_cli.py | 8 ++++---- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 5adcb36495..ee1f62388f 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -221,7 +221,7 @@ def parse_args(parser: "ArgumentParser"): def run(self): from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers - from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig + from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIG, _DEFAULT_4BIT_CONFIGS, OVConfig def _get_default_int4_config(model_id_or_path, library_name): if model_id_or_path in _DEFAULT_4BIT_CONFIGS: @@ -233,13 +233,7 @@ def _get_default_int4_config(model_id_or_path, library_name): if original_model_name in _DEFAULT_4BIT_CONFIGS: return _DEFAULT_4BIT_CONFIGS[original_model_name] - return { - "bits": 4, - "ratio": 0.8, - "sym": False, - "group_size": None, - "all_layers": None, - } + return _DEFAULT_4BIT_CONFIG library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library) if library_name == "sentence_transformers" and self.args.library is None: diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index aaa3aa663c..5abb518183 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -104,6 +104,14 @@ }, } +_DEFAULT_4BIT_CONFIG = { + "bits": 4, + "ratio": 1.0, + "sym": False, + "group_size": 128, + "all_layers": None, +} + class OVQuantizationMethod(str, Enum): DEFAULT = "default" diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 067b3e5d5d..4f8b26d934 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -40,7 +40,13 @@ from ...exporters.openvino.stateful import model_has_state from ..utils.import_utils import is_nncf_available, is_transformers_version from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS -from .configuration import _DEFAULT_4BIT_CONFIGS, OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs +from .configuration import ( + _DEFAULT_4BIT_CONFIG, + _DEFAULT_4BIT_CONFIGS, + OVConfig, + OVWeightQuantizationConfig, + _check_default_4bit_configs, +) from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, STR_TO_OV_TYPE @@ -775,7 +781,7 @@ def _from_pretrained( init_cls = cls if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}: - quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config) + quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, _DEFAULT_4BIT_CONFIG) quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) enable_compilation = kwargs.pop("compile", True) and not quantization_config diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 2df03f6b34..b1b6643186 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -86,10 +86,10 @@ class OVCLIExportTestCase(unittest.TestCase): ) TEST_4BIT_CONFIGURATONS = [ - ("text-generation-with-past", "opt125m", "int4_sym_g128", 62, 86), - ("text-generation-with-past", "opt125m", "int4_asym_g128", 62, 86), - ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86), - ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86), + ("text-generation-with-past", "opt125m", "int4_sym_g128", 4, 144), + ("text-generation-with-past", "opt125m", "int4_asym_g128", 4, 144), + ("text-generation-with-past", "opt125m", "int4_sym_g64", 4, 144), + ("text-generation-with-past", "opt125m", "int4_asym_g64", 4, 144), ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32), ( "text-generation-with-past",