diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml
index 9e9ae8cd88..c32b65b2b9 100644
--- a/.github/workflows/test_ipex.yml
+++ b/.github/workflows/test_ipex.yml
@@ -22,7 +22,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: [3.9]
-        transformers-version: [4.39.0, 4.41.2]
+        transformers-version: [4.39.0, 4.42.3]
         ipex-version: [2.2.0, 2.3.*]
         include:
           - python-version: 3.8
diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx
index eed980076d..c11016fde1 100644
--- a/docs/source/openvino/export.mdx
+++ b/docs/source/openvino/export.mdx
@@ -60,7 +60,7 @@ Optional arguments:
   --pad-token-id PAD_TOKEN_ID
                         This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it.
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 while
-                        20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8.
+                        20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for int4 quantization. Recommended value is 128 and -1 will results in per-column quantization.
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index ee1f62388f..2bdee32e17 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -102,7 +102,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
         ),
     )
     optional_group.add_argument(
@@ -277,7 +277,7 @@ def _get_default_int4_config(model_id_or_path, library_name):
             else:
                 quantization_config = {
                     "bits": 8 if is_int8 else 4,
-                    "ratio": 1 if is_int8 else (self.args.ratio or 0.8),
+                    "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
                     "sym": self.args.sym or False,
                     "group_size": -1 if is_int8 else self.args.group_size,
                     "all_layers": None if is_int8 else self.args.all_layers,
diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py
index e5299fb5c2..0d43152889 100644
--- a/optimum/exporters/ipex/model_patcher.py
+++ b/optimum/exporters/ipex/model_patcher.py
@@ -34,7 +34,7 @@

 # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
 _TRANSFORMERS_MIN_VERSION = "4.39.0"
-_TRANSFORMERS_MAX_VERSION = "4.41.2"
+_TRANSFORMERS_MAX_VERSION = "4.42.3"

 _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)

diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py
index bb1817122f..a42b217c30 100644
--- a/optimum/intel/ipex/modeling_base.py
+++ b/optimum/intel/ipex/modeling_base.py
@@ -136,7 +136,6 @@ class IPEXModel(OptimizedModel):
     base_model_prefix = "ipex_model"
     main_input_name = "input_ids"
     output_name = "last_hidden_state"
-    _supports_cache_class = False

     def __init__(
         self,
@@ -430,6 +429,8 @@ def forward(
 class IPEXModelForCausalLM(IPEXModel, GenerationMixin):
     auto_model_class = AutoModelForCausalLM
     export_feature = "text-generation"
+    _supports_cache_class = False
+    _is_stateful = False

     def __init__(
         self,
@@ -478,8 +479,8 @@ def __init__(
         else:
             self._reorder_cache = self.model_cls._reorder_cache.__get__(self)

-        if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon"}:
-            self.prepare_inputs_for_generation = _prepare_inputs_for_generation_for_llama
+        if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon", "mistral"}:
+            self.prepare_inputs_for_generation = _ipex_prepare_inputs_for_generation
         else:
             self.prepare_inputs_for_generation = self.model_cls.prepare_inputs_for_generation.__get__(self)

@@ -615,7 +616,7 @@ def generate(self, *args, **kwargs):
         return super().generate(*args, **kwargs)


-def _prepare_inputs_for_generation_for_llama(
+def _ipex_prepare_inputs_for_generation(
     input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
 ):
     from transformers.cache_utils import Cache
diff --git a/setup.py b/setup.py
index 5bb79b4169..47398130ad 100644
--- a/setup.py
+++ b/setup.py
@@ -62,7 +62,7 @@
     "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
     "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.11.0"],
-    "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"],
+    "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.42.3"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,