Merge branch 'main' into ipex_tests

echarlaix authored Jul 11, 2024
2 parents e71f067 + b25e845 commit 7f62152
Showing 6 changed files with 11 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_ipex.yml
@@ -22,7 +22,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: [3.9]
-        transformers-version: [4.39.0, 4.41.2]
+        transformers-version: [4.39.0, 4.42.3]
         ipex-version: [2.2.0, 2.3.*]
         include:
           - python-version: 3.8
2 changes: 1 addition & 1 deletion docs/source/openvino/export.mdx
@@ -60,7 +60,7 @@ Optional arguments:
   --pad-token-id PAD_TOKEN_ID
                         This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it.
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 while
-                        20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8.
+                        20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for int4 quantization. Recommended value is 128 and -1 will results in per-column quantization.
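This doc fix brings the stated default for --ratio in line with the actual default of 1.0. For orientation only, here is a sketch of the programmatic counterpart of these CLI options, using optimum-intel's OVWeightQuantizationConfig; the checkpoint id and output directory are illustrative and not part of this commit:

```python
# Programmatic counterpart to the CLI options documented above. A sketch,
# not part of the commit: the model id and output directory are illustrative.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    ratio=0.8,       # 80% of layers in int4, the remaining 20% in int8
    sym=False,       # asymmetric quantization, as with the CLI default
    group_size=128,  # recommended value; -1 gives per-column quantization
)
model = OVModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # illustrative checkpoint
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("llama2-7b-int4-ov")
```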
4 changes: 2 additions & 2 deletions optimum/commands/export/openvino.py
@@ -102,7 +102,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
         ),
     )
     optional_group.add_argument(

@@ -277,7 +277,7 @@ def _get_default_int4_config(model_id_or_path, library_name):
         else:
             quantization_config = {
                 "bits": 8 if is_int8 else 4,
-                "ratio": 1 if is_int8 else (self.args.ratio or 0.8),
+                "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
                 "sym": self.args.sym or False,
                 "group_size": -1 if is_int8 else self.args.group_size,
                 "all_layers": None if is_int8 else self.args.all_layers,
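The second hunk swaps the hard-coded 0.8 fallback for a lookup into the shared _DEFAULT_4BIT_CONFIG, so the CLI fallback and the documented default (1.0) can no longer drift apart. A minimal sketch of the pattern; the dict values and the helper function are illustrative, and the real _DEFAULT_4BIT_CONFIG in optimum-intel may carry more keys:

```python
# Sketch of the fallback pattern introduced here; values are illustrative.
_DEFAULT_4BIT_CONFIG = {"bits": 4, "ratio": 1.0, "sym": False, "group_size": 128}

def resolve_ratio(cli_ratio, is_int8):
    # int8 weight compression applies to all layers, so the ratio is pinned to 1;
    # for int4, an unset --ratio now falls back to the shared default config
    # instead of a hard-coded 0.8.
    return 1 if is_int8 else (cli_ratio or _DEFAULT_4BIT_CONFIG["ratio"])

assert resolve_ratio(None, is_int8=False) == 1.0
assert resolve_ratio(0.8, is_int8=False) == 0.8
assert resolve_ratio(None, is_int8=True) == 1
```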
2 changes: 1 addition & 1 deletion optimum/exporters/ipex/model_patcher.py
@@ -34,7 +34,7 @@

 # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
 _TRANSFORMERS_MIN_VERSION = "4.39.0"
-_TRANSFORMERS_MAX_VERSION = "4.41.2"
+_TRANSFORMERS_MAX_VERSION = "4.42.3"

 _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)
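These module-level bounds pin the transformers range the IPEX patching is validated against. A sketch of how such bounds are typically enforced; the actual guard in model_patcher.py may be worded differently:

```python
# Sketch of a min/max version guard (assumption: the real check may differ).
from optimum.intel.utils.import_utils import is_transformers_version

_TRANSFORMERS_MIN_VERSION = "4.39.0"
_TRANSFORMERS_MAX_VERSION = "4.42.3"

def _check_transformers_version():
    # Refuse to patch models against an untested transformers release.
    if is_transformers_version("<", _TRANSFORMERS_MIN_VERSION) or is_transformers_version(
        ">", _TRANSFORMERS_MAX_VERSION
    ):
        raise ImportError(
            "The IPEX export requires transformers>="
            f"{_TRANSFORMERS_MIN_VERSION},<={_TRANSFORMERS_MAX_VERSION}"
        )
```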
9 changes: 5 additions & 4 deletions optimum/intel/ipex/modeling_base.py
@@ -136,7 +136,6 @@ class IPEXModel(OptimizedModel):
     base_model_prefix = "ipex_model"
     main_input_name = "input_ids"
     output_name = "last_hidden_state"
-    _supports_cache_class = False

     def __init__(
         self,

@@ -430,6 +429,8 @@ def forward(
 class IPEXModelForCausalLM(IPEXModel, GenerationMixin):
     auto_model_class = AutoModelForCausalLM
     export_feature = "text-generation"
+    _supports_cache_class = False
+    _is_stateful = False

     def __init__(
         self,

@@ -478,8 +479,8 @@ def __init__(
         else:
             self._reorder_cache = self.model_cls._reorder_cache.__get__(self)

-        if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon"}:
-            self.prepare_inputs_for_generation = _prepare_inputs_for_generation_for_llama
+        if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon", "mistral"}:
+            self.prepare_inputs_for_generation = _ipex_prepare_inputs_for_generation
         else:
             self.prepare_inputs_for_generation = self.model_cls.prepare_inputs_for_generation.__get__(self)

@@ -615,7 +616,7 @@ def generate(self, *args, **kwargs):
         return super().generate(*args, **kwargs)


-def _prepare_inputs_for_generation_for_llama(
+def _ipex_prepare_inputs_for_generation(
     input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
 ):
     from transformers.cache_utils import Cache
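The renamed _ipex_prepare_inputs_for_generation helper is shared by all listed architectures, with mistral newly added, so generation with any of them routes through the same input-preparation hook. A usage sketch of the resulting class; the checkpoint id is illustrative:

```python
# Usage sketch for the patched causal-LM class; the checkpoint id is illustrative.
# generate() routes through the prepare_inputs_for_generation hook bound above.
from transformers import AutoTokenizer
from optimum.intel import IPEXModelForCausalLM

model_id = "mistralai/Mistral-7B-v0.1"  # any supported llama/phi/persimmon/mistral checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)

inputs = tokenizer("The weather today is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```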
2 changes: 1 addition & 1 deletion setup.py
@@ -62,7 +62,7 @@
     "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
     "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.11.0"],
-    "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"],
+    "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.42.3"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
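As the comment in model_patcher.py notes, this pin has to move in lockstep with _TRANSFORMERS_MAX_VERSION and the CI matrix in .github/workflows/test_ipex.yml; this commit updates all three to 4.42.3.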
